Last active
June 29, 2017 11:13
-
-
Save dipu-bd/1df39b57b3b23957ba3abd432d97c36a to your computer and use it in GitHub Desktop.
Finding Longest Common Prefix (LCP) using Suffix Array. Complexity: SA=O(n.log(n)), LCP=O(n)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /*==================================================================== | |
| Title: Compute LCP from Suffix Array using Radix Sort | |
| Suffix Array: O(n.log(n)) | |
| Compute LCP: O(n) | |
| Author : Sudipto Chandra (Dipu) | |
| =====================================================================*/ | |
| #include <bits/stdc++.h> | |
| using namespace std; | |
| #define mem(a,b) memset(a, b, sizeof(a)) | |
| #define loop(i, x) for(__typeof((x).begin()) i=(x).begin(); i!=(x).end(); ++i) | |
| #define rloop(i, x) for(__typeof((x).rbegin()) i=(x).rbegin(); i!=(x).rend(); ++i) | |
| /*------------------------------------------------------------------------------------*/ | |
| int test, cas = 1; | |
| const int SIZ = 10000050; // maximum possible size | |
| int n; // text length | |
| char T[SIZ]; // text | |
| int SA[SIZ], tempSA[SIZ]; // the sorted suffixes | |
| int RA[SIZ], tempRA[SIZ]; // ranks of suffix array | |
| int L[SIZ]; // used in counting sort | |
| int Phi[SIZ]; // the jumping parameter | |
| int LCP[SIZ]; // longest common prefix | |
| int PLCP[SIZ]; // permuted longest common prefix | |
| inline int getRA(int i) | |
| { | |
| return (i < n) ? RA[i] : 0; | |
| } | |
| void radix_sort(int k) | |
| { | |
| mem(L, 0); | |
| // count frequencies | |
| for(int i = 0; i < n; ++i) | |
| { | |
| L[getRA(i + k)]++; | |
| } | |
| // save first index of every characters | |
| int mx = max(n, 130); | |
| for(int i = 0, s = 0; i < mx; ++i) | |
| { | |
| int x = L[i]; | |
| L[i] = s; | |
| s += x; | |
| } | |
| // build sorted tempSA | |
| for(int i = 0; i < n; ++i) | |
| { | |
| int& x = L[getRA(SA[i] + k)]; | |
| tempSA[x++] = SA[i]; | |
| } | |
| // copy tempSA to SA | |
| for(int i = 0; i < n; ++i) | |
| { | |
| SA[i] = tempSA[i]; | |
| } | |
| } | |
| // text must ends with a $ | |
| void buildSA() | |
| { | |
| // initialize | |
| n = strlen(T); | |
| T[n++] = '$', T[n] = 0; // append $ | |
| for(int i = 0; i < n; ++i) | |
| { | |
| SA[i] = i; | |
| RA[i] = T[i]; | |
| } | |
| // algorithm loop | |
| for(int k = 1; k < n; k <<= 1) | |
| { | |
| // sort by k-th ranks | |
| radix_sort(k); | |
| radix_sort(0); | |
| // compute new ranks | |
| tempRA[SA[0]] = 0; | |
| for(int i = 1, r = 0; i < n; ++i) | |
| { | |
| if(getRA(SA[i-1]) != getRA(SA[i])) { | |
| r++; | |
| } | |
| else if(getRA(SA[i-1]+k) != getRA(SA[i]+k)) { | |
| r++; | |
| } | |
| tempRA[SA[i]] = r; | |
| } | |
| for(int i = 0; i < n; ++i) | |
| { | |
| RA[i] = tempRA[i]; | |
| } | |
| if(RA[SA[n - 1]] == n - 1) break; | |
| } | |
| } | |
| // Finds the Longest Common Prefix (LCP) between two adjacent suffixes | |
| // excluding the first suffix using the Permuted LCP (PLCP) theorem. | |
| void computeLCP() | |
| { | |
| Phi[SA[0]] = -1; | |
| for(int i = 1; i < n; ++i) | |
| { | |
| Phi[SA[i]] = SA[i - 1]; | |
| } | |
| for(int i = 0, L = 0; i < n; ++i) | |
| { | |
| if(Phi[i] == -1) | |
| { | |
| PLCP[i] = 0; | |
| continue; | |
| } | |
| // longest common prefix length | |
| L = max(L - 1, 0); | |
| while(T[i + L] == T[Phi[i] + L]) | |
| { | |
| L++; | |
| } | |
| PLCP[i] = L; | |
| } | |
| for(int i = 0; i < n; ++i) | |
| { | |
| LCP[i] = PLCP[SA[i]]; | |
| } | |
| } | |
| void TEST() | |
| { | |
| int values[] = { | |
| 10, | |
| 100, | |
| 1000, | |
| 10000, | |
| 50000, | |
| 100000, | |
| 500000, | |
| 1000000, | |
| 2000000, | |
| 3000000, | |
| 4000000, | |
| 5000000, | |
| /*6000000, | |
| 7000000, | |
| 8000000*/ | |
| }; | |
| int siz = sizeof(values) / sizeof(int); | |
| double avg_cpi = 0; | |
| puts(""); | |
| puts("| n | Runtime(s) | TPI(ms) |"); | |
| puts("|----------:|:----------:|:------------:|"); | |
| for(int k = 0; k < siz; ++k) | |
| { | |
| int n = values[k]; | |
| for(int i = 0; i < n; ++i) | |
| { | |
| if(rand() & 1) | |
| { | |
| T[i] = 'A' + (rand() % 26); | |
| } | |
| else if(rand() & 1) | |
| { | |
| T[i] = 'a' + (rand() % 26); | |
| } | |
| else | |
| { | |
| T[i] = '0' + (rand() % 10); | |
| } | |
| } | |
| T[n] = 0; | |
| time_t start = clock(); | |
| buildSA(); // builds the suffix array | |
| computeLCP(); // calculate longest common prefix | |
| time_t stop = clock(); | |
| double time = (double)(stop - start) / CLOCKS_PER_SEC; | |
| double cpi = (double)(stop - start) / (n * log2(n)); | |
| printf("| `%7d` | `%5.3f` | `%0.8f` |\n", n, time, cpi); | |
| if(k) avg_cpi += (values[k] - values[k - 1]) * cpi; | |
| else avg_cpi += values[k] * cpi; | |
| } | |
| avg_cpi /= values[siz - 1]; | |
| printf("\n**Average *Time Per Instructions*** = `%.10f ms`\n", avg_cpi); | |
| } | |
| void RUN() | |
| { | |
| /* | |
| GATAGACA | |
| abcabxabcd | |
| */ | |
| gets(T); | |
| buildSA(); | |
| computeLCP(); | |
| puts("============================================="); | |
| printf("| i | SA | Phi | PLCP | suffix | LCP |\n"); | |
| printf("|----|----|-----|------|--------------|-----|\n"); | |
| for(int i = 0; i < n; ++i) | |
| { | |
| printf("| %2d | %2d | %3d | %2d | %-12s | %3d |\n", i, SA[i], Phi[i], PLCP[i], T + i, LCP[i]); | |
| } | |
| puts("============================================="); | |
| } | |
| int main() | |
| { | |
| //RUN(); | |
| TEST(); | |
| return 0; | |
| } |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Performance measures
1. Core i7-4790U @3.6GHz + 8GB RAM
100.0150.451544991000.0250.0376287510000.0300.00301030100000.0360.00027093500000.0400.000051251000000.0450.000027095000000.1300.0000137310000000.4200.0000210720000001.1050.0000264030000002.9460.0000456440000004.2180.0000480850000005.6220.00005053Average Time Per Instructions =
0.0000406254 ms