								/**
								 * Mnemonist Inverted Index
								 * =========================
								 *
								 * JavaScript implementation of an inverted index.
								 */
								// Iterator class wrapped around the `next` function built in #.documents
								var Iterator = require('obliterator/iterator');

								// Generic iteration helper used by the static @.from function
								var forEach = require('obliterator/foreach');

								// Array helpers; #.get relies on `intersectionUniqueArrays`
								var helpers = require('./utils/merge.js');

								/**
								 * Identity function, handing back its argument untouched.
								 * Used as the fallback tokenizer when none is given to the constructor.
								 *
								 * @param  {any} x - Any value.
								 * @return {any}   - The very same value.
								 */
								function identity(x) {
								  return x;
								}

								/**
								 * InvertedIndex.
								 *
								 * @constructor
								 * @param {function|Array} [descriptor] - Either a single tokenizer function
								 *   used for both documents & queries, or an array of the form
								 *   [documentTokenizer, queryTokenizer]. Any missing (falsy) tokenizer
								 *   falls back to the identity function.
								 * @throws {Error} If one of the resolved tokenizers is not a function.
								 */
								function InvertedIndex(descriptor) {

								  // Starting from an empty state
								  this.clear();

								  // Resolving both tokenizers from the descriptor
								  var isPair = Array.isArray(descriptor);

								  this.documentTokenizer = (isPair ? descriptor[0] : descriptor) || identity;
								  this.queryTokenizer = (isPair ? descriptor[1] : descriptor) || identity;

								  // Truthy values that are not callable are rejected
								  if (typeof this.documentTokenizer !== 'function')
								    throw new Error('mnemonist/InvertedIndex.constructor: document tokenizer is not a function.');

								  if (typeof this.queryTokenizer !== 'function')
								    throw new Error('mnemonist/InvertedIndex.constructor: query tokenizer is not a function.');
								}

								/**
								 * Method used to reset the index to an empty state.
								 *
								 * @return {undefined}
								 */
								InvertedIndex.prototype.clear = function() {

								  // Stored documents, addressed by their insertion position
								  this.items = [];

								  // token -> array of positions (in `items`) of the documents holding it
								  this.mapping = new Map();

								  // Number of documents added
								  this.size = 0;

								  // Number of distinct tokens (kept in sync with `mapping.size` by #.add)
								  this.dimension = 0;
								};

								/**
								 * Method used to add a document to the index.
								 *
								 * The document is tokenized with the document tokenizer and registered
								 * once under each of its distinct tokens.
								 *
								 * @param  {any} doc - Item to add.
								 * @return {InvertedIndex} - The index itself, for chaining.
								 * @throws {Error} If the document tokenizer does not return an array. In
								 *   that case the index is left untouched.
								 */
								InvertedIndex.prototype.add = function(doc) {

								  // Tokenizing & validating before touching any state, so that a failing
								  // call cannot leave a stored-but-unindexed document and a wrong size
								  var tokens = this.documentTokenizer(doc);

								  if (!Array.isArray(tokens))
								    throw new Error('mnemonist/InvertedIndex.add: tokenizer function should return an array of tokens.');

								  // Storing document: its position in `items` is its key in the mapping
								  var key = this.items.length;
								  this.items.push(doc);
								  this.size++;

								  // Indexing
								  var done = new Set(),
								      token,
								      container;

								  for (var i = 0, l = tokens.length; i < l; i++) {
								    token = tokens[i];

								    // A token repeated in the same document is only registered once
								    if (done.has(token))
								      continue;

								    done.add(token);

								    container = this.mapping.get(token);

								    if (!container) {
								      container = [];
								      this.mapping.set(token, container);
								    }

								    container.push(key);
								  }

								  this.dimension = this.mapping.size;

								  return this;
								};

								/**
								 * Method used to query the index in a AND fashion.
								 *
								 * @param  {any} query - Query, handed to the query tokenizer.
								 * @return {Array}     - Documents matching every token of the query. Empty
								 *   when the index is empty, the query has no token or some token is
								 *   unknown.
								 * @throws {Error} If the query tokenizer does not return an array (only
								 *   checked when the index is not empty).
								 */
								InvertedIndex.prototype.get = function(query) {

								  // Early termination
								  if (!this.size)
								    return [];

								  // First we need to tokenize the query
								  var tokens = this.queryTokenizer(query);

								  if (!Array.isArray(tokens))
								    throw new Error('mnemonist/InvertedIndex.get: tokenizer function should return an array of tokens.');

								  if (!tokens.length)
								    return [];

								  var results = this.mapping.get(tokens[0]),
								      c,
								      i,
								      l;

								  if (typeof results === 'undefined' || results.length === 0)
								    return [];

								  // Narrowing the candidates with each remaining token
								  for (i = 1, l = tokens.length; i < l; i++) {
								    c = this.mapping.get(tokens[i]);

								    // An unknown token means no document can match
								    if (typeof c === 'undefined' || c.length === 0)
								      return [];

								    results = helpers.intersectionUniqueArrays(results, c);
								  }

								  // Resolving positions into documents, in a fresh array so that the
								  // index's internal containers are never handed to the caller
								  var docs = new Array(results.length);

								  for (i = 0, l = docs.length; i < l; i++)
								    docs[i] = this.items[results[i]];

								  return docs;
								};

								/**
								 * Method used to iterate over each of the documents.
								 *
								 * @param  {function}  callback - Function to call for each item, invoked
								 *   with (document, position, index).
								 * @param  {object}    scope    - Optional scope. Defaults to the index
								 *   itself when the argument is not passed at all.
								 * @return {undefined}
								 */
								InvertedIndex.prototype.forEach = function(callback, scope) {
								  scope = arguments.length > 1 ? scope : this;

								  // Documents live in `items`; `documents` is the iterator-returning
								  // method and must not be mistaken for the storage array
								  var items = this.items;

								  for (var i = 0, l = items.length; i < l; i++)
								    callback.call(scope, items[i], i, this);
								};

								/**
								 * Method returning an iterator over the index's documents.
								 *
								 * The number of documents is captured when the iterator is created:
								 * documents added afterwards are not visited by it.
								 *
								 * @return {Iterator}
								 */
								InvertedIndex.prototype.documents = function() {
								  var items = this.items,
								      total = items.length,
								      cursor = 0;

								  return new Iterator(function() {

								    // Still something to yield
								    if (cursor < total) {
								      var current = items[cursor];
								      cursor++;

								      return {
								        value: current,
								        done: false
								      };
								    }

								    // Exhausted
								    return {
								      done: true
								    };
								  });
								};

								/**
								 * Method returning an iterator over the index's tokens, i.e. over the
								 * keys of the internal token mapping.
								 *
								 * @return {Iterator}
								 */
								InvertedIndex.prototype.tokens = function() {
								  var mapping = this.mapping;

								  return mapping.keys();
								};

								/**
								 * Attaching the #.documents method to Symbol.iterator if possible, so that
								 * an index can be consumed by `for...of` in environments supporting symbols.
								 */
								if (typeof Symbol !== 'undefined') {
								  InvertedIndex.prototype[Symbol.iterator] = InvertedIndex.prototype.documents;
								}

								/**
								 * Convenience known methods.
								 *
								 * Returns a shallow copy of the stored documents, meant for display.
								 *
								 * @return {Array}
								 */
								InvertedIndex.prototype.inspect = function() {
								  var proxy = this.items.slice();

								  // Trick so that node displays the name of the constructor
								  var descriptor = {
								    value: InvertedIndex,
								    enumerable: false
								  };

								  Object.defineProperty(proxy, 'constructor', descriptor);

								  return proxy;
								};

								// Registering #.inspect under node's custom inspection symbol if possible
								if (typeof Symbol !== 'undefined') {
								  InvertedIndex.prototype[Symbol.for('nodejs.util.inspect.custom')] = InvertedIndex.prototype.inspect;
								}

								/**
								 * Static @.from function taking an arbitrary iterable & converting it into
								 * a InvertedIndex.
								 *
								 * @param  {Iterable} iterable   - Target iterable.
								 * @param  {function|Array} descriptor - Tokenizer descriptor, passed as-is
								 *   to the constructor.
								 * @return {InvertedIndex}
								 */
								InvertedIndex.from = function(iterable, descriptor) {
								  var instance = new InvertedIndex(descriptor);

								  // Each item of the iterable becomes one document of the index
								  forEach(iterable, function(item) {
								    instance.add(item);
								  });

								  return instance;
								};

								/**
								 * Exporting the constructor as the module's sole export.
								 */
								module.exports = InvertedIndex;