You cannot select more than 25 topics.
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					250 lines
				
				5.3 KiB
			
		
		
			
		
	
	
					250 lines
				
				5.3 KiB
			| 
											3 years ago
										 | /** | ||
|  |  * Mnemonist Inverted Index | ||
|  |  * ========================= | ||
|  |  * | ||
|  |  * JavaScript implementation of an inverted index. | ||
|  |  */ | ||
|  | var Iterator = require('obliterator/iterator'), | ||
|  |     forEach = require('obliterator/foreach'), | ||
|  |     helpers = require('./utils/merge.js'); | ||
|  | 
 | ||
/**
 * Pass-through function used as the fallback tokenizer when none is given.
 *
 * @param  {any} value - Any value.
 * @return {any}       - The very same value, untouched.
 */
function identity(value) {
  return value;
}
|  | 
 | ||
/**
 * InvertedIndex.
 *
 * @constructor
 * @param {function|Array<function>} [descriptor] - Either a single tokenizer
 *   used for both documents and queries, or a
 *   `[documentTokenizer, queryTokenizer]` pair. Any missing (falsy) tokenizer
 *   falls back to the identity function.
 * @throws {Error} If a provided tokenizer is truthy but not a function.
 */
function InvertedIndex(descriptor) {
  this.clear();

  // Normalizing the descriptor into a [document, query] pair
  var pair = Array.isArray(descriptor) ? descriptor : [descriptor, descriptor];

  this.documentTokenizer = pair[0] || identity;
  this.queryTokenizer = pair[1] || identity;

  if (typeof this.documentTokenizer !== 'function') {
    throw new Error('mnemonist/InvertedIndex.constructor: document tokenizer is not a function.');
  }

  if (typeof this.queryTokenizer !== 'function') {
    throw new Error('mnemonist/InvertedIndex.constructor: query tokenizer is not a function.');
  }
}
|  | 
 | ||
/**
 * Method used to reset the index to its pristine, empty state.
 *
 * @return {undefined}
 */
InvertedIndex.prototype.clear = function() {
  var index = this;

  // Stored documents, in insertion order
  index.items = [];

  // token -> list of positions in `items`
  index.mapping = new Map();

  // Number of documents & number of distinct tokens
  index.size = 0;
  index.dimension = 0;
};
|  | 
 | ||
/**
 * Method used to add a document to the index.
 *
 * The document is tokenized with the document tokenizer and its position in
 * the internal item list is appended once to the posting list of each
 * distinct token.
 *
 * @param  {any} doc - Item to add.
 * @return {InvertedIndex} - The index itself, for chaining.
 * @throws {Error} If the document tokenizer does not return an array. In that
 *   case the index is left untouched.
 */
InvertedIndex.prototype.add = function(doc) {

  // Tokenizing & validating BEFORE mutating anything, so that a misbehaving
  // tokenizer cannot leave a counted & stored but un-indexed document behind
  var tokens = this.documentTokenizer(doc);

  if (!Array.isArray(tokens))
    throw new Error('mnemonist/InvertedIndex.add: tokenizer function should return an array of tokens.');

  // Storing document
  var key = this.items.length;
  this.items.push(doc);

  // Increasing size
  this.size++;

  // Indexing
  var done = new Set(),
      token,
      container;

  for (var i = 0, l = tokens.length; i < l; i++) {
    token = tokens[i];

    // A token repeated within the same document is only indexed once
    if (done.has(token))
      continue;

    done.add(token);

    container = this.mapping.get(token);

    if (!container) {
      container = [];
      this.mapping.set(token, container);
    }

    container.push(key);
  }

  this.dimension = this.mapping.size;

  return this;
};
|  | 
 | ||
/**
 * Method used to query the index in a AND fashion.
 *
 * The query is tokenized with the query tokenizer; a document matches only
 * if every query token points to it.
 *
 * @param  {any} query - Query
 * @return {Array}     - Documents matching every token of the query. Empty
 *                       when the index is empty, when the query yields no
 *                       token or when any token is unknown.
 * @throws {Error} If the query tokenizer does not return an array.
 */
InvertedIndex.prototype.get = function(query) {

  // Early termination
  if (!this.size)
    return [];

  // First we need to tokenize the query
  var tokens = this.queryTokenizer(query);

  if (!Array.isArray(tokens))
    throw new Error('mnemonist/InvertedIndex.get: tokenizer function should return an array of tokens.');

  if (!tokens.length)
    return [];

  var results = this.mapping.get(tokens[0]),
      c,
      i,
      l;

  if (typeof results === 'undefined' || results.length === 0)
    return [];

  // Narrowing the candidates with the posting list of each remaining token
  for (i = 1, l = tokens.length; i < l; i++) {
    c = this.mapping.get(tokens[i]);

    // An unknown token means no document can match the whole query
    if (typeof c === 'undefined' || c.length === 0)
      return [];

    results = helpers.intersectionUniqueArrays(results, c);
  }

  // Resolving document positions into the documents themselves
  var docs = new Array(results.length);

  for (i = 0, l = docs.length; i < l; i++)
    docs[i] = this.items[results[i]];

  return docs;
};
|  | 
 | ||
/**
 * Method used to iterate over each of the documents.
 *
 * @param  {function}  callback - Function to call for each item, invoked
 *                                with (document, position, index).
 * @param  {object}    scope    - Optional scope. Defaults to the index itself
 *                                when omitted.
 * @return {undefined}
 */
InvertedIndex.prototype.forEach = function(callback, scope) {
  scope = arguments.length > 1 ? scope : this;

  // NOTE: documents are stored in `this.items`; `this.documents` is the
  // iterator-returning method and must not be used as an array here.
  var documents = this.items;

  for (var i = 0, l = documents.length; i < l; i++)
    callback.call(scope, documents[i], i, this);
};
|  | 
 | ||
/**
 * Method returning an iterator over the index's documents.
 *
 * The number of documents is captured when the iterator is created, so
 * documents added afterwards are not visited.
 *
 * @return {Iterator}
 */
InvertedIndex.prototype.documents = function() {
  var items = this.items,
      total = items.length,
      cursor = 0;

  function next() {
    if (cursor < total) {
      return {
        value: items[cursor++],
        done: false
      };
    }

    return {
      done: true
    };
  }

  return new Iterator(next);
};
|  | 
 | ||
/**
 * Method returning an iterator over the distinct tokens known to the index,
 * i.e. the keys of the internal token mapping.
 *
 * @return {Iterator}
 */
InvertedIndex.prototype.tokens = function() {
  var mapping = this.mapping;

  return mapping.keys();
};
|  | 
 | ||
/**
 * Exposing the #.documents method as the default iterator (Symbol.iterator)
 * when the runtime supports symbols.
 */
if (typeof Symbol !== 'undefined') {
  InvertedIndex.prototype[Symbol.iterator] = InvertedIndex.prototype.documents;
}
|  | 
 | ||
/**
 * Convenience known methods.
 *
 * #.inspect returns a shallow copy of the stored documents, dressed up so
 * that it reports InvertedIndex as its constructor.
 *
 * @return {Array}
 */
InvertedIndex.prototype.inspect = function() {
  var snapshot = this.items.slice();

  // Trick so that node displays the name of the constructor
  var descriptor = {
    value: InvertedIndex,
    enumerable: false
  };

  Object.defineProperty(snapshot, 'constructor', descriptor);

  return snapshot;
};
|  | 
 | ||
/**
 * Registering #.inspect under the 'nodejs.util.inspect.custom' symbol when
 * the runtime supports symbols.
 */
if (typeof Symbol !== 'undefined') {
  InvertedIndex.prototype[Symbol.for('nodejs.util.inspect.custom')] = InvertedIndex.prototype.inspect;
}
|  | 
 | ||
/**
 * Static @.from function taking an arbitrary iterable & converting it into
 * a InvertedIndex.
 *
 * @param  {Iterable} iterable - Target iterable, whose items are added one
 *                               by one as documents.
 * @param  {function|Array<function>} descriptor - Tokenizer descriptor,
 *                               forwarded as is to the constructor.
 * @return {InvertedIndex}
 */
InvertedIndex.from = function(iterable, descriptor) {
  var instance = new InvertedIndex(descriptor);

  function insert(doc) {
    instance.add(doc);
  }

  forEach(iterable, insert);

  return instance;
};
|  | 
 | ||
|  | /** | ||
|  |  * Exporting. | ||
|  |  */ | ||
|  | module.exports = InvertedIndex; |