You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
250 lines
5.3 KiB
250 lines
5.3 KiB
3 years ago
|
/**
|
||
|
* Mnemonist Inverted Index
|
||
|
* =========================
|
||
|
*
|
||
|
* JavaScript implementation of an inverted index.
|
||
|
*/
|
||
|
var Iterator = require('obliterator/iterator'),
|
||
|
forEach = require('obliterator/foreach'),
|
||
|
helpers = require('./utils/merge.js');
|
||
|
|
||
|
/**
 * Default tokenizer: hands back its argument untouched.
 *
 * @param  {any} x - Any value.
 * @return {any}   - The very same value.
 */
function identity(x) {
  return x;
}
|
||
|
|
||
|
/**
 * InvertedIndex.
 *
 * The descriptor is either a single tokenizer function, used for both
 * documents and queries, or an array whose first item is the document
 * tokenizer and whose second item is the query tokenizer. Any falsy
 * tokenizer is replaced by the identity function.
 *
 * @constructor
 * @param {function|Array} descriptor - Tokenizer function or
 *                                      [documentTokenizer, queryTokenizer].
 * @throws {Error} - If a resolved tokenizer is not a function.
 */
function InvertedIndex(descriptor) {
  this.clear();

  // Normalizing the descriptor to a [document, query] pair
  var pair = Array.isArray(descriptor) ? descriptor : [descriptor, descriptor];

  this.documentTokenizer = pair[0] || identity;
  this.queryTokenizer = pair[1] || identity;

  if (typeof this.documentTokenizer !== 'function') {
    throw new Error('mnemonist/InvertedIndex.constructor: document tokenizer is not a function.');
  }

  if (typeof this.queryTokenizer !== 'function') {
    throw new Error('mnemonist/InvertedIndex.constructor: query tokenizer is not a function.');
  }
}
|
||
|
|
||
|
/**
 * Method used to reset the index to its empty state.
 *
 * @return {undefined}
 */
InvertedIndex.prototype.clear = function() {

  // Stored documents; a document's position is its key in the mapping
  this.items = [];

  // token -> array of document keys
  this.mapping = new Map();

  // Number of stored documents
  this.size = 0;

  // Number of distinct tokens
  this.dimension = 0;
};
|
||
|
|
||
|
/**
 * Method used to add a document to the index.
 *
 * The document is tokenized with the document tokenizer and its key (its
 * position in `items`) is appended once to the container of each distinct
 * token.
 *
 * @param  {any} doc - Item to add.
 * @return {InvertedIndex}
 * @throws {Error} - If the document tokenizer does not return an array.
 */
InvertedIndex.prototype.add = function(doc) {

  // Tokenizing & validating BEFORE touching any state, so that a failing
  // tokenizer cannot leave a stored-but-unindexed document behind.
  var tokens = this.documentTokenizer(doc);

  if (!Array.isArray(tokens))
    throw new Error('mnemonist/InvertedIndex.add: tokenizer function should return an array of tokens.');

  // Storing document
  var key = this.items.length;
  this.items.push(doc);

  // Increasing size
  this.size++;

  // Indexing
  var done = new Set(),
      token,
      container;

  for (var i = 0, l = tokens.length; i < l; i++) {
    token = tokens[i];

    // A token repeated within the same document is only indexed once
    if (done.has(token))
      continue;

    done.add(token);

    container = this.mapping.get(token);

    if (!container) {
      container = [];
      this.mapping.set(token, container);
    }

    container.push(key);
  }

  this.dimension = this.mapping.size;

  return this;
};
|
||
|
|
||
|
/**
 * Method used to query the index in a AND fashion.
 *
 * The query is tokenized with the query tokenizer; the containers of every
 * token are then combined through `helpers.intersectionUniqueArrays` and the
 * remaining keys are resolved to the stored documents. An unknown token
 * yields an empty result.
 *
 * @param  {any} query - Query
 * @return {Array}     - Documents matching all the query's tokens.
 * @throws {Error}     - If the query tokenizer does not return an array.
 */
InvertedIndex.prototype.get = function(query) {

  // Nothing indexed, nothing to find
  if (!this.size)
    return [];

  var tokens = this.queryTokenizer(query);

  if (!Array.isArray(tokens))
    throw new Error('mnemonist/InvertedIndex.query: tokenizer function should return an array of tokens.');

  if (!tokens.length)
    return [];

  // Seeding the candidates with the first token's container
  var matches = this.mapping.get(tokens[0]);

  if (typeof matches === 'undefined' || matches.length === 0)
    return [];

  // Narrowing the candidates with each remaining token
  var postings;

  for (var t = 1; t < tokens.length; t++) {
    postings = this.mapping.get(tokens[t]);

    if (typeof postings === 'undefined' || postings.length === 0)
      return [];

    matches = helpers.intersectionUniqueArrays(matches, postings);
  }

  // Resolving document keys into actual documents
  var items = this.items,
      docs = [];

  for (var k = 0; k < matches.length; k++)
    docs.push(items[matches[k]]);

  return docs;
};
|
||
|
|
||
|
/**
 * Method used to iterate over each of the documents.
 *
 * The callback receives the document, its position and the index itself.
 *
 * @param  {function}  callback - Function to call for each item.
 * @param  {object}    scope    - Optional scope; defaults to the index when
 *                                the argument is not passed at all.
 * @return {undefined}
 */
InvertedIndex.prototype.forEach = function(callback, scope) {
  scope = arguments.length > 1 ? scope : this;

  // NOTE: documents live in `this.items`; `this.documents` is the iterator
  // method, whose `length` is its arity and not the number of documents.
  var items = this.items;

  for (var i = 0, l = items.length; i < l; i++)
    callback.call(scope, items[i], i, this);
};
|
||
|
|
||
|
/**
 * Method returning an iterator over the index's documents.
 *
 * The number of documents is read once, when the iterator is created.
 *
 * @return {Iterator}
 */
InvertedIndex.prototype.documents = function() {
  var stored = this.items,
      total = stored.length,
      cursor = 0;

  return new Iterator(function() {
    if (cursor < total) {
      return {
        value: stored[cursor++],
        done: false
      };
    }

    return {
      done: true
    };
  });
};
|
||
|
|
||
|
/**
 * Method returning an iterator over the index's tokens, i.e. the keys of
 * the internal token mapping.
 *
 * @return {Iterator}
 */
InvertedIndex.prototype.tokens = function() {
  var mapping = this.mapping;

  return mapping.keys();
};
|
||
|
|
||
|
/**
 * Attaching the #.documents method to Symbol.iterator if possible.
 */
if (typeof Symbol !== 'undefined') {
  InvertedIndex.prototype[Symbol.iterator] = InvertedIndex.prototype.documents;
}
|
||
|
|
||
|
/**
 * Convenience known methods.
 *
 * #.inspect returns a shallow copy of the stored documents carrying a
 * non-enumerable `constructor` property pointing to InvertedIndex.
 *
 * @return {Array}
 */
InvertedIndex.prototype.inspect = function() {

  // Trick so that node displays the name of the constructor
  return Object.defineProperty(this.items.slice(), 'constructor', {
    value: InvertedIndex,
    enumerable: false
  });
};

// Registering #.inspect under node's custom inspection symbol if possible
if (typeof Symbol !== 'undefined') {
  InvertedIndex.prototype[Symbol.for('nodejs.util.inspect.custom')] = InvertedIndex.prototype.inspect;
}
|
||
|
|
||
|
/**
 * Static @.from function taking an arbitrary iterable & converting it into
 * an InvertedIndex by adding each of its items as a document.
 *
 * @param  {Iterable}       iterable   - Target iterable.
 * @param  {function|Array} descriptor - Descriptor forwarded as-is to the
 *                                       InvertedIndex constructor.
 * @return {InvertedIndex}
 */
InvertedIndex.from = function(iterable, descriptor) {
  var built = new InvertedIndex(descriptor);

  forEach(iterable, function(item) {
    built.add(item);
  });

  return built;
};
|
||
|
|
||
|
/**
|
||
|
* Exporting.
|
||
|
*/
|
||
|
module.exports = InvertedIndex;
|