You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
353 lines
7.8 KiB
353 lines
7.8 KiB
/**
|
|
* Mnemonist Suffix Array
|
|
* =======================
|
|
*
|
|
* Linear time implementation of a suffix array using the recursive
|
|
* method by Karkkainen and Sanders.
|
|
*
|
|
* [References]:
|
|
* https://www.cs.helsinki.fi/u/tpkarkka/publications/jacm05-revised.pdf
|
|
* http://people.mpi-inf.mpg.de/~sanders/programs/suffix/
|
|
* http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.184.442&rep=rep1&type=pdf
|
|
*
|
|
* [Article]:
|
|
* "Simple Linear Work Suffix Array Construction", Karkkainen and Sanders.
|
|
*
|
|
* [Note]:
|
|
* A paper by Simon J. Puglisi, William F. Smyth & Andrew Turpin named
|
|
* "The Performance of Linear Time Suffix Sorting Algorithms" seems to
|
|
* prove that supralinear algorithm are in fact better faring for
|
|
* "real" world use cases. It would be nice to check this out in JavaScript
|
|
* because the high level of the language could change a lot to the fact.
|
|
*
|
|
* The current code is largely inspired by the following:
|
|
* https://github.com/tixxit/suffixarray/blob/master/suffixarray.js
|
|
*/
|
|
|
|
/**
|
|
* Constants.
|
|
*/
|
|
var SEPARATOR = '\u0001';
|
|
|
|
/**
|
|
* Function used to sort the triples.
|
|
*
|
|
* @param {string|array} string - Padded sequence.
|
|
* @param {array} array - Array to sort (will be mutated).
|
|
* @param {number} offset - Index offset.
|
|
*/
|
|
function sort(string, array, offset) {
|
|
var l = array.length,
|
|
buckets = [],
|
|
i = l,
|
|
j = -1,
|
|
b,
|
|
d = 0,
|
|
bits;
|
|
|
|
while (i--)
|
|
j = Math.max(string[array[i] + offset], j);
|
|
|
|
bits = j >> 24 && 32 || j >> 16 && 24 || j >> 8 && 16 || 8;
|
|
|
|
for (; d < bits; d += 4) {
|
|
for (i = 16; i--;)
|
|
buckets[i] = [];
|
|
for (i = l; i--;)
|
|
buckets[((string[array[i] + offset]) >> d) & 15].push(array[i]);
|
|
for (b = 0; b < 16; b++) {
|
|
for (j = buckets[b].length; j--;)
|
|
array[++i] = buckets[b][j];
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Comparison helper.
|
|
*/
|
|
function compare(string, lookup, m, n) {
|
|
return (
|
|
(string[m] - string[n]) ||
|
|
(m % 3 === 2 ?
|
|
(string[m + 1] - string[n + 1]) || (lookup[m + 2] - lookup[n + 2]) :
|
|
(lookup[m + 1] - lookup[n + 1]))
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Recursive function used to build the suffix tree in linear time.
|
|
*
|
|
* @param {string|array} string - Padded sequence.
|
|
* @param {number} l - True length of sequence (unpadded).
|
|
* @return {array}
|
|
*/
|
|
function build(string, l) {
|
|
var a = [],
|
|
b = [],
|
|
al = (2 * l / 3) | 0,
|
|
bl = l - al,
|
|
r = (al + 1) >> 1,
|
|
i = al,
|
|
j = 0,
|
|
k,
|
|
lookup = [],
|
|
result = [];
|
|
|
|
if (l === 1)
|
|
return [0];
|
|
|
|
while (i--)
|
|
a[i] = ((i * 3) >> 1) + 1;
|
|
|
|
for (i = 3; i--;)
|
|
sort(string, a, i);
|
|
|
|
j = b[((a[0] / 3) | 0) + (a[0] % 3 === 1 ? 0 : r)] = 1;
|
|
|
|
for (i = 1; i < al; i++) {
|
|
if (string[a[i]] !== string[a[i - 1]] ||
|
|
string[a[i] + 1] !== string[a[i - 1] + 1] ||
|
|
string[a[i] + 2] !== string[a[i - 1] + 2])
|
|
j++;
|
|
|
|
b[((a[i] / 3) | 0) + (a[i] % 3 === 1 ? 0 : r)] = j;
|
|
}
|
|
|
|
if (j < al) {
|
|
b = build(b, al);
|
|
|
|
for (i = al; i--;)
|
|
a[i] = b[i] < r ? b[i] * 3 + 1 : ((b[i] - r) * 3 + 2);
|
|
}
|
|
|
|
for (i = al; i--;)
|
|
lookup[a[i]] = i;
|
|
lookup[l] = -1;
|
|
lookup[l + 1] = -2;
|
|
|
|
b = l % 3 === 1 ? [l - 1] : [];
|
|
|
|
for (i = 0; i < al; i++) {
|
|
if (a[i] % 3 === 1)
|
|
b.push(a[i] - 1);
|
|
}
|
|
|
|
sort(string, b, 0);
|
|
|
|
for (i = 0, j = 0, k = 0; i < al && j < bl;)
|
|
result[k++] = (
|
|
compare(string, lookup, a[i], b[j]) < 0 ?
|
|
a[i++] :
|
|
b[j++]
|
|
);
|
|
|
|
while (i < al)
|
|
result[k++] = a[i++];
|
|
|
|
while (j < bl)
|
|
result[k++] = b[j++];
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* Function used to create the array we are going to work on.
|
|
*
|
|
* @param {string|array} target - Target sequence.
|
|
* @return {array}
|
|
*/
|
|
function convert(target) {
|
|
|
|
// Creating the alphabet array
|
|
var length = target.length,
|
|
paddingOffset = length % 3,
|
|
array = new Array(length + paddingOffset),
|
|
l,
|
|
i;
|
|
|
|
// If we have an arbitrary sequence, we need to transform it
|
|
if (typeof target !== 'string') {
|
|
var uniqueTokens = Object.create(null);
|
|
|
|
for (i = 0; i < length; i++) {
|
|
if (!uniqueTokens[target[i]])
|
|
uniqueTokens[target[i]] = true;
|
|
}
|
|
|
|
var alphabet = Object.create(null),
|
|
sortedUniqueTokens = Object.keys(uniqueTokens).sort();
|
|
|
|
for (i = 0, l = sortedUniqueTokens.length; i < l; i++)
|
|
alphabet[sortedUniqueTokens[i]] = i + 1;
|
|
|
|
for (i = 0; i < length; i++) {
|
|
array[i] = alphabet[target[i]];
|
|
}
|
|
}
|
|
else {
|
|
for (i = 0; i < length; i++)
|
|
array[i] = target.charCodeAt(i);
|
|
}
|
|
|
|
// Padding the array
|
|
for (i = length; i < length + paddingOffset; i++)
|
|
array[i] = 0;
|
|
|
|
return array;
|
|
}
|
|
|
|
/**
|
|
* Suffix Array.
|
|
*
|
|
* @constructor
|
|
* @param {string|array} string - Sequence for which to build the suffix array.
|
|
*/
|
|
function SuffixArray(string) {
|
|
|
|
// Properties
|
|
this.hasArbitrarySequence = typeof string !== 'string';
|
|
this.string = string;
|
|
this.length = string.length;
|
|
|
|
// Building the array
|
|
this.array = build(convert(string), this.length);
|
|
}
|
|
|
|
/**
|
|
* Convenience known methods.
|
|
*/
|
|
SuffixArray.prototype.toString = function() {
|
|
return this.array.join(',');
|
|
};
|
|
|
|
SuffixArray.prototype.toJSON = function() {
|
|
return this.array;
|
|
};
|
|
|
|
SuffixArray.prototype.inspect = function() {
|
|
var array = new Array(this.length);
|
|
|
|
for (var i = 0; i < this.length; i++)
|
|
array[i] = this.string.slice(this.array[i]);
|
|
|
|
// Trick so that node displays the name of the constructor
|
|
Object.defineProperty(array, 'constructor', {
|
|
value: SuffixArray,
|
|
enumerable: false
|
|
});
|
|
|
|
return array;
|
|
};
|
|
|
|
if (typeof Symbol !== 'undefined')
|
|
SuffixArray.prototype[Symbol.for('nodejs.util.inspect.custom')] = SuffixArray.prototype.inspect;
|
|
|
|
/**
|
|
* Generalized Suffix Array.
|
|
*
|
|
* @constructor
|
|
*/
|
|
function GeneralizedSuffixArray(strings) {
|
|
|
|
// Properties
|
|
this.hasArbitrarySequence = typeof strings[0] !== 'string';
|
|
this.size = strings.length;
|
|
|
|
if (this.hasArbitrarySequence) {
|
|
this.text = [];
|
|
|
|
for (var i = 0, l = this.size; i < l; i++) {
|
|
this.text.push.apply(this.text, strings[i]);
|
|
|
|
if (i < l - 1)
|
|
this.text.push(SEPARATOR);
|
|
}
|
|
}
|
|
else {
|
|
this.text = strings.join(SEPARATOR);
|
|
}
|
|
|
|
this.firstLength = strings[0].length;
|
|
this.length = this.text.length;
|
|
|
|
// Building the array
|
|
this.array = build(convert(this.text), this.length);
|
|
}
|
|
|
|
/**
|
|
* Method used to retrieve the longest common subsequence of the generalized
|
|
* suffix array.
|
|
*
|
|
* @return {string|array}
|
|
*/
|
|
GeneralizedSuffixArray.prototype.longestCommonSubsequence = function() {
|
|
var lcs = this.hasArbitrarySequence ? [] : '',
|
|
lcp,
|
|
i,
|
|
j,
|
|
s,
|
|
t;
|
|
|
|
for (i = 1; i < this.length; i++) {
|
|
s = this.array[i];
|
|
t = this.array[i - 1];
|
|
|
|
if (s < this.firstLength &&
|
|
t < this.firstLength)
|
|
continue;
|
|
|
|
if (s > this.firstLength &&
|
|
t > this.firstLength)
|
|
continue;
|
|
|
|
lcp = Math.min(this.length - s, this.length - t);
|
|
|
|
for (j = 0; j < lcp; j++) {
|
|
if (this.text[s + j] !== this.text[t + j]) {
|
|
lcp = j;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (lcp > lcs.length)
|
|
lcs = this.text.slice(s, s + lcp);
|
|
}
|
|
|
|
return lcs;
|
|
};
|
|
|
|
/**
|
|
* Convenience known methods.
|
|
*/
|
|
GeneralizedSuffixArray.prototype.toString = function() {
|
|
return this.array.join(',');
|
|
};
|
|
|
|
GeneralizedSuffixArray.prototype.toJSON = function() {
|
|
return this.array;
|
|
};
|
|
|
|
GeneralizedSuffixArray.prototype.inspect = function() {
|
|
var array = new Array(this.length);
|
|
|
|
for (var i = 0; i < this.length; i++)
|
|
array[i] = this.text.slice(this.array[i]);
|
|
|
|
// Trick so that node displays the name of the constructor
|
|
Object.defineProperty(array, 'constructor', {
|
|
value: GeneralizedSuffixArray,
|
|
enumerable: false
|
|
});
|
|
|
|
return array;
|
|
};
|
|
|
|
if (typeof Symbol !== 'undefined')
|
|
GeneralizedSuffixArray.prototype[Symbol.for('nodejs.util.inspect.custom')] = GeneralizedSuffixArray.prototype.inspect;
|
|
|
|
/**
|
|
* Exporting.
|
|
*/
|
|
SuffixArray.GeneralizedSuffixArray = GeneralizedSuffixArray;
|
|
module.exports = SuffixArray;
|