You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					353 lines
				
				7.8 KiB
			
		
		
			
		
	
	
					353 lines
				
				7.8 KiB
			| 
											3 years ago
										 | /** | ||
|  |  * Mnemonist Suffix Array | ||
|  |  * ======================= | ||
|  |  * | ||
|  |  * Linear time implementation of a suffix array using the recursive | ||
|  |  * method by Karkkainen and Sanders. | ||
|  |  * | ||
|  |  * [References]: | ||
|  |  * https://www.cs.helsinki.fi/u/tpkarkka/publications/jacm05-revised.pdf
 | ||
|  |  * http://people.mpi-inf.mpg.de/~sanders/programs/suffix/
 | ||
|  |  * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.184.442&rep=rep1&type=pdf
 | ||
|  |  * | ||
|  |  * [Article]: | ||
|  |  * "Simple Linear Work Suffix Array Construction", Karkkainen and Sanders. | ||
|  |  * | ||
|  |  * [Note]: | ||
|  |  * A paper by Simon J. Puglisi, William F. Smyth & Andrew Turpin named | ||
|  |  * "The Performance of Linear Time Suffix Sorting Algorithms" seems to | ||
|  |  * prove that supralinear algorithm are in fact better faring for | ||
|  |  * "real" world use cases. It would be nice to check this out in JavaScript | ||
|  |  * because the high level of the language could change a lot to the fact. | ||
|  |  * | ||
|  |  * The current code is largely inspired by the following: | ||
|  |  * https://github.com/tixxit/suffixarray/blob/master/suffixarray.js
 | ||
|  |  */ | ||
|  | 
 | ||
|  | /** | ||
|  |  * Constants. | ||
|  |  */ | ||
|  | var SEPARATOR = '\u0001'; | ||
|  | 
 | ||
|  | /** | ||
|  |  * Function used to sort the triples. | ||
|  |  * | ||
|  |  * @param {string|array} string - Padded sequence. | ||
|  |  * @param {array}        array  - Array to sort (will be mutated). | ||
|  |  * @param {number}       offset - Index offset. | ||
|  |  */ | ||
|  | function sort(string, array, offset) { | ||
|  |   var l = array.length, | ||
|  |       buckets = [], | ||
|  |       i = l, | ||
|  |       j = -1, | ||
|  |       b, | ||
|  |       d = 0, | ||
|  |       bits; | ||
|  | 
 | ||
|  |   while (i--) | ||
|  |     j = Math.max(string[array[i] + offset], j); | ||
|  | 
 | ||
|  |   bits = j >> 24 && 32 || j >> 16 && 24 || j >> 8 && 16 || 8; | ||
|  | 
 | ||
|  |   for (; d < bits; d += 4) { | ||
|  |     for (i = 16; i--;) | ||
|  |       buckets[i] = []; | ||
|  |     for (i = l; i--;) | ||
|  |       buckets[((string[array[i] + offset]) >> d) & 15].push(array[i]); | ||
|  |     for (b = 0; b < 16; b++) { | ||
|  |       for (j = buckets[b].length; j--;) | ||
|  |         array[++i] = buckets[b][j]; | ||
|  |     } | ||
|  |   } | ||
|  | } | ||
|  | 
 | ||
|  | /** | ||
|  |  * Comparison helper. | ||
|  |  */ | ||
|  | function compare(string, lookup, m, n) { | ||
|  |   return ( | ||
|  |     (string[m] - string[n]) || | ||
|  |     (m % 3 === 2 ? | ||
|  |       (string[m + 1] - string[n + 1]) || (lookup[m + 2] - lookup[n + 2]) : | ||
|  |       (lookup[m + 1] - lookup[n + 1])) | ||
|  |   ); | ||
|  | } | ||
|  | 
 | ||
|  | /** | ||
|  |  * Recursive function used to build the suffix tree in linear time. | ||
|  |  * | ||
|  |  * @param  {string|array} string - Padded sequence. | ||
|  |  * @param  {number}       l      - True length of sequence (unpadded). | ||
|  |  * @return {array} | ||
|  |  */ | ||
|  | function build(string, l) { | ||
|  |   var a = [], | ||
|  |       b = [], | ||
|  |       al = (2 * l / 3) | 0, | ||
|  |       bl = l - al, | ||
|  |       r = (al + 1) >> 1, | ||
|  |       i = al, | ||
|  |       j = 0, | ||
|  |       k, | ||
|  |       lookup = [], | ||
|  |       result = []; | ||
|  | 
 | ||
|  |   if (l === 1) | ||
|  |     return [0]; | ||
|  | 
 | ||
|  |   while (i--) | ||
|  |     a[i] = ((i * 3) >> 1) + 1; | ||
|  | 
 | ||
|  |   for (i = 3; i--;) | ||
|  |     sort(string, a, i); | ||
|  | 
 | ||
|  |   j = b[((a[0] / 3) | 0) + (a[0] % 3 === 1 ? 0 : r)] = 1; | ||
|  | 
 | ||
|  |   for (i = 1; i < al; i++) { | ||
|  |     if (string[a[i]] !== string[a[i - 1]] || | ||
|  |         string[a[i] + 1] !== string[a[i - 1] + 1] || | ||
|  |         string[a[i] + 2] !== string[a[i - 1] + 2]) | ||
|  |       j++; | ||
|  | 
 | ||
|  |     b[((a[i] / 3) | 0) + (a[i] % 3 === 1 ? 0 : r)] = j; | ||
|  |   } | ||
|  | 
 | ||
|  |   if (j < al) { | ||
|  |     b = build(b, al); | ||
|  | 
 | ||
|  |     for (i = al; i--;) | ||
|  |       a[i] = b[i] < r ? b[i] * 3 + 1 : ((b[i] - r) * 3 + 2); | ||
|  |   } | ||
|  | 
 | ||
|  |   for (i = al; i--;) | ||
|  |     lookup[a[i]] = i; | ||
|  |   lookup[l] = -1; | ||
|  |   lookup[l + 1] = -2; | ||
|  | 
 | ||
|  |   b = l % 3 === 1 ? [l - 1] : []; | ||
|  | 
 | ||
|  |   for (i = 0; i < al; i++) { | ||
|  |     if (a[i] % 3 === 1) | ||
|  |       b.push(a[i] - 1); | ||
|  |   } | ||
|  | 
 | ||
|  |   sort(string, b, 0); | ||
|  | 
 | ||
|  |   for (i = 0, j = 0, k = 0; i < al && j < bl;) | ||
|  |     result[k++] = ( | ||
|  |       compare(string, lookup, a[i], b[j]) < 0 ? | ||
|  |         a[i++] : | ||
|  |         b[j++] | ||
|  |     ); | ||
|  | 
 | ||
|  |   while (i < al) | ||
|  |     result[k++] = a[i++]; | ||
|  | 
 | ||
|  |   while (j < bl) | ||
|  |     result[k++] = b[j++]; | ||
|  | 
 | ||
|  |   return result; | ||
|  | } | ||
|  | 
 | ||
|  | /** | ||
|  |  * Function used to create the array we are going to work on. | ||
|  |  * | ||
|  |  * @param  {string|array} target - Target sequence. | ||
|  |  * @return {array} | ||
|  |  */ | ||
|  | function convert(target) { | ||
|  | 
 | ||
|  |   // Creating the alphabet array
 | ||
|  |   var length = target.length, | ||
|  |       paddingOffset = length % 3, | ||
|  |       array = new Array(length + paddingOffset), | ||
|  |       l, | ||
|  |       i; | ||
|  | 
 | ||
|  |   // If we have an arbitrary sequence, we need to transform it
 | ||
|  |   if (typeof target !== 'string') { | ||
|  |     var uniqueTokens = Object.create(null); | ||
|  | 
 | ||
|  |     for (i = 0; i < length; i++) { | ||
|  |       if (!uniqueTokens[target[i]]) | ||
|  |         uniqueTokens[target[i]] = true; | ||
|  |     } | ||
|  | 
 | ||
|  |     var alphabet = Object.create(null), | ||
|  |         sortedUniqueTokens = Object.keys(uniqueTokens).sort(); | ||
|  | 
 | ||
|  |     for (i = 0, l = sortedUniqueTokens.length; i < l; i++) | ||
|  |       alphabet[sortedUniqueTokens[i]] = i + 1; | ||
|  | 
 | ||
|  |     for (i = 0; i < length; i++) { | ||
|  |       array[i] = alphabet[target[i]]; | ||
|  |     } | ||
|  |   } | ||
|  |   else { | ||
|  |     for (i = 0; i < length; i++) | ||
|  |       array[i] = target.charCodeAt(i); | ||
|  |   } | ||
|  | 
 | ||
|  |   // Padding the array
 | ||
|  |   for (i = length; i < length + paddingOffset; i++) | ||
|  |     array[i] = 0; | ||
|  | 
 | ||
|  |   return array; | ||
|  | } | ||
|  | 
 | ||
|  | /** | ||
|  |  * Suffix Array. | ||
|  |  * | ||
|  |  * @constructor | ||
|  |  * @param {string|array} string - Sequence for which to build the suffix array. | ||
|  |  */ | ||
|  | function SuffixArray(string) { | ||
|  | 
 | ||
|  |   // Properties
 | ||
|  |   this.hasArbitrarySequence = typeof string !== 'string'; | ||
|  |   this.string = string; | ||
|  |   this.length = string.length; | ||
|  | 
 | ||
|  |   // Building the array
 | ||
|  |   this.array = build(convert(string), this.length); | ||
|  | } | ||
|  | 
 | ||
|  | /** | ||
|  |  * Convenience known methods. | ||
|  |  */ | ||
|  | SuffixArray.prototype.toString = function() { | ||
|  |   return this.array.join(','); | ||
|  | }; | ||
|  | 
 | ||
|  | SuffixArray.prototype.toJSON = function() { | ||
|  |   return this.array; | ||
|  | }; | ||
|  | 
 | ||
|  | SuffixArray.prototype.inspect = function() { | ||
|  |   var array = new Array(this.length); | ||
|  | 
 | ||
|  |   for (var i = 0; i < this.length; i++) | ||
|  |     array[i] = this.string.slice(this.array[i]); | ||
|  | 
 | ||
|  |   // Trick so that node displays the name of the constructor
 | ||
|  |   Object.defineProperty(array, 'constructor', { | ||
|  |     value: SuffixArray, | ||
|  |     enumerable: false | ||
|  |   }); | ||
|  | 
 | ||
|  |   return array; | ||
|  | }; | ||
|  | 
 | ||
|  | if (typeof Symbol !== 'undefined') | ||
|  |   SuffixArray.prototype[Symbol.for('nodejs.util.inspect.custom')] = SuffixArray.prototype.inspect; | ||
|  | 
 | ||
|  | /** | ||
|  |  * Generalized Suffix Array. | ||
|  |  * | ||
|  |  * @constructor | ||
|  |  */ | ||
|  | function GeneralizedSuffixArray(strings) { | ||
|  | 
 | ||
|  |   // Properties
 | ||
|  |   this.hasArbitrarySequence = typeof strings[0] !== 'string'; | ||
|  |   this.size = strings.length; | ||
|  | 
 | ||
|  |   if (this.hasArbitrarySequence) { | ||
|  |     this.text = []; | ||
|  | 
 | ||
|  |     for (var i = 0, l = this.size; i < l; i++) { | ||
|  |       this.text.push.apply(this.text, strings[i]); | ||
|  | 
 | ||
|  |       if (i < l - 1) | ||
|  |         this.text.push(SEPARATOR); | ||
|  |     } | ||
|  |   } | ||
|  |   else { | ||
|  |     this.text = strings.join(SEPARATOR); | ||
|  |   } | ||
|  | 
 | ||
|  |   this.firstLength = strings[0].length; | ||
|  |   this.length = this.text.length; | ||
|  | 
 | ||
|  |   // Building the array
 | ||
|  |   this.array = build(convert(this.text), this.length); | ||
|  | } | ||
|  | 
 | ||
|  | /** | ||
|  |  * Method used to retrieve the longest common subsequence of the generalized | ||
|  |  * suffix array. | ||
|  |  * | ||
|  |  * @return {string|array} | ||
|  |  */ | ||
|  | GeneralizedSuffixArray.prototype.longestCommonSubsequence = function() { | ||
|  |   var lcs = this.hasArbitrarySequence ? [] : '', | ||
|  |       lcp, | ||
|  |       i, | ||
|  |       j, | ||
|  |       s, | ||
|  |       t; | ||
|  | 
 | ||
|  |   for (i = 1; i < this.length; i++) { | ||
|  |     s = this.array[i]; | ||
|  |     t = this.array[i - 1]; | ||
|  | 
 | ||
|  |     if (s < this.firstLength && | ||
|  |         t < this.firstLength) | ||
|  |       continue; | ||
|  | 
 | ||
|  |     if (s > this.firstLength && | ||
|  |         t > this.firstLength) | ||
|  |       continue; | ||
|  | 
 | ||
|  |     lcp = Math.min(this.length - s, this.length - t); | ||
|  | 
 | ||
|  |     for (j = 0; j < lcp; j++) { | ||
|  |       if (this.text[s + j] !== this.text[t + j]) { | ||
|  |         lcp = j; | ||
|  |         break; | ||
|  |       } | ||
|  |     } | ||
|  | 
 | ||
|  |     if (lcp > lcs.length) | ||
|  |       lcs = this.text.slice(s, s + lcp); | ||
|  |   } | ||
|  | 
 | ||
|  |   return lcs; | ||
|  | }; | ||
|  | 
 | ||
|  | /** | ||
|  |  * Convenience known methods. | ||
|  |  */ | ||
|  | GeneralizedSuffixArray.prototype.toString = function() { | ||
|  |   return this.array.join(','); | ||
|  | }; | ||
|  | 
 | ||
|  | GeneralizedSuffixArray.prototype.toJSON = function() { | ||
|  |   return this.array; | ||
|  | }; | ||
|  | 
 | ||
|  | GeneralizedSuffixArray.prototype.inspect = function() { | ||
|  |   var array = new Array(this.length); | ||
|  | 
 | ||
|  |   for (var i = 0; i < this.length; i++) | ||
|  |     array[i] = this.text.slice(this.array[i]); | ||
|  | 
 | ||
|  |   // Trick so that node displays the name of the constructor
 | ||
|  |   Object.defineProperty(array, 'constructor', { | ||
|  |     value: GeneralizedSuffixArray, | ||
|  |     enumerable: false | ||
|  |   }); | ||
|  | 
 | ||
|  |   return array; | ||
|  | }; | ||
|  | 
 | ||
|  | if (typeof Symbol !== 'undefined') | ||
|  |   GeneralizedSuffixArray.prototype[Symbol.for('nodejs.util.inspect.custom')] = GeneralizedSuffixArray.prototype.inspect; | ||
|  | 
 | ||
|  | /** | ||
|  |  * Exporting. | ||
|  |  */ | ||
|  | SuffixArray.GeneralizedSuffixArray = GeneralizedSuffixArray; | ||
|  | module.exports = SuffixArray; |