Ai
1 Star 0 Fork 3

aiobc/Algorithms

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
SubstringVerificationSuffixArray.java 4.05 KB
一键复制 编辑 原始数据 按行查看 历史
Rafael Chaves 提交于 2020-02-03 02:07 +08:00 . Adopt standard Java project layout (#120)
/**
* This file shows you how to use a suffix array to determine if a pattern exists within a text.
* This implementation has the advantage that once the suffix array is built queries can be very
* fast.
*
* <p>Time complexity: O(nlogn) for suffix array construction and O(mlogn) time for individual
* queries (where m is query string length). As noted below, depending on the length of the string
* (if it is very large) it may be faster to use KMP or if you're doing a lot of queries on small
* strings then Rabin-Karp in combination with a bloom filter.
*
* @author William Fiset, william.alexandre.fiset@gmail.com
*/
package com.williamfiset.algorithms.strings;
import java.util.*;
// Example usage
public class SubstringVerificationSuffixArray {
public static void main(String[] args) {
String pattern = "hello world";
String text = "hello lemon Lennon wallet world tree cabbage hello world teapot calculator";
SuffixArray sa = new SuffixArray(text);
System.out.println(sa.contains(pattern));
System.out.println(sa.contains("this pattern does not exist"));
}
public static class SuffixArray {
// ALPHABET_SZ is the default alphabet size, this may need to be much
// larger if you're using the LCS method with multiple sentinels
int ALPHABET_SZ = 256, N;
int[] T, sa, sa2, rank, tmp, c;
public SuffixArray(String str) {
this(toIntArray(str));
}
private static int[] toIntArray(String s) {
int[] text = new int[s.length()];
for (int i = 0; i < s.length(); i++) text[i] = s.charAt(i);
return text;
}
// Designated constructor
public SuffixArray(int[] text) {
T = text;
N = text.length;
sa = new int[N];
sa2 = new int[N];
rank = new int[N];
c = new int[Math.max(ALPHABET_SZ, N)];
construct();
}
private void construct() {
int i, p, r;
for (i = 0; i < N; ++i) c[rank[i] = T[i]]++;
for (i = 1; i < ALPHABET_SZ; ++i) c[i] += c[i - 1];
for (i = N - 1; i >= 0; --i) sa[--c[T[i]]] = i;
for (p = 1; p < N; p <<= 1) {
for (r = 0, i = N - p; i < N; ++i) sa2[r++] = i;
for (i = 0; i < N; ++i) if (sa[i] >= p) sa2[r++] = sa[i] - p;
Arrays.fill(c, 0, ALPHABET_SZ, 0);
for (i = 0; i < N; ++i) c[rank[i]]++;
for (i = 1; i < ALPHABET_SZ; ++i) c[i] += c[i - 1];
for (i = N - 1; i >= 0; --i) sa[--c[rank[sa2[i]]]] = sa2[i];
for (sa2[sa[0]] = r = 0, i = 1; i < N; ++i) {
if (!(rank[sa[i - 1]] == rank[sa[i]]
&& sa[i - 1] + p < N
&& sa[i] + p < N
&& rank[sa[i - 1] + p] == rank[sa[i] + p])) r++;
sa2[sa[i]] = r;
}
tmp = rank;
rank = sa2;
sa2 = tmp;
if (r == N - 1) break;
ALPHABET_SZ = r + 1;
}
}
// Runs on O(mlog(n)) where m is the length of the substring
// and n is the length of the text. Depending on the length
// of the string (if it is very large) it may be faster to
// use KMP or if you're doing a lot of queries then Rabin-Karp
// in combination with a bloom filter.
//
// NOTE: This is the naive implementation. There exists an
// implementation which runs in O(m + log(n)) time
public boolean contains(String substr) {
if (substr == null) return false;
if (substr.equals("")) return true;
String suffix_str;
int lo = 0, hi = N - 1;
int substr_len = substr.length();
// Do binary search
while (lo <= hi) {
int mid = (lo + hi) >>> 1;
int suffix_index = sa[mid];
int suffix_len = N - suffix_index;
// Extract part of the suffix we need to compare
if (suffix_len <= substr_len) suffix_str = new String(T, suffix_index, suffix_len);
else suffix_str = new String(T, suffix_index, substr_len);
int cmp = suffix_str.compareTo(substr);
// Found a match
if (cmp == 0) {
return true;
} else if (cmp < 0) {
lo = mid + 1;
} else {
hi = mid - 1;
}
}
return false;
}
}
}
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/aiobc/Algorithms.git
git@gitee.com:aiobc/Algorithms.git
aiobc
Algorithms
Algorithms
master

搜索帮助