You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					227 lines
				
				7.0 KiB
			
		
		
			
		
	
	
					227 lines
				
				7.0 KiB
			| 
											3 years ago
										 | /** | ||
|  |  * Copyright Brian White. All rights reserved. | ||
|  |  * | ||
|  |  * @see https://github.com/mscdex/streamsearch
 | ||
|  |  * | ||
|  |  * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
|  |  * of this software and associated documentation files (the "Software"), to | ||
|  |  * deal in the Software without restriction, including without limitation the | ||
|  |  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or | ||
|  |  * sell copies of the Software, and to permit persons to whom the Software is | ||
|  |  * furnished to do so, subject to the following conditions: | ||
|  |  * | ||
|  |  * The above copyright notice and this permission notice shall be included in | ||
|  |  * all copies or substantial portions of the Software. | ||
|  |  * | ||
|  |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
|  |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
|  |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
|  |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
|  |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
|  |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
|  |  * IN THE SOFTWARE. | ||
|  |  * | ||
|  |  * Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation | ||
|  |  * by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool
 | ||
|  |  */ | ||
|  | const EventEmitter = require('events').EventEmitter | ||
|  | const inherits = require('util').inherits | ||
|  | 
 | ||
/**
 * Streaming Boyer-Moore-Horspool searcher.
 *
 * Feed the haystack chunk by chunk through `push()`. Results are reported
 * through `'info'` events with the signature `(isMatch, data, start, end)`:
 *   - `isMatch` is `true` when the needle was found, `false` otherwise;
 *   - when `data` is given, bytes `data[start..end)` are haystack bytes that
 *     are guaranteed not to be part of the needle (for `isMatch === true`
 *     they are the bytes preceding the match);
 *   - `data` may be the searcher's internal lookbehind buffer, which is
 *     reused between calls, so listeners must consume the range synchronously.
 *
 * Public state: `matches` (number of needles found so far) and `maxMatches`
 * (`push()` stops searching once `matches` reaches it; defaults to Infinity).
 *
 * @param {string|Buffer} needle - byte sequence to search for (1..256 bytes).
 * @throws {TypeError} if `needle` is neither a string nor a Buffer.
 * @throws {Error} if `needle` is empty or longer than 256 bytes.
 */
function SBMH (needle) {
  if (typeof needle === 'string') {
    needle = Buffer.from(needle)
  }

  if (!Buffer.isBuffer(needle)) {
    throw new TypeError('The needle has to be a String or a Buffer.')
  }

  const needleLength = needle.length

  if (needleLength === 0) {
    throw new Error('The needle cannot be an empty String/Buffer.')
  }

  if (needleLength > 256) {
    throw new Error('The needle cannot have a length bigger than 256.')
  }

  this.maxMatches = Infinity
  this.matches = 0

  // Occurrence (bad-character shift) table, one entry per byte value.
  // Bytes that do not occur in the needle shift by the full needle length.
  this._occ = new Array(256).fill(needleLength)
  this._lookbehind_size = 0
  this._needle = needle
  this._bufpos = 0

  // Holds the unmatched tail of previous chunks that may still turn out to
  // be the beginning of the needle.
  this._lookbehind = Buffer.alloc(needleLength)

  // Populate the occurrence table from the needle, ignoring its last byte.
  for (let i = 0; i < needleLength - 1; ++i) {
    this._occ[needle[i]] = needleLength - 1 - i
  }
}
inherits(SBMH, EventEmitter)

/**
 * Forget any buffered partial match and reset the match counter so the
 * instance can be reused for a new stream.
 */
SBMH.prototype.reset = function () {
  this._lookbehind_size = 0
  this.matches = 0
  this._bufpos = 0
}

/**
 * Search the next chunk of the haystack.
 *
 * @param {Buffer|string} chunk - next piece of the haystack; non-Buffer
 *   values are converted with `Buffer.from(chunk, 'binary')`.
 * @param {number} [pos=0] - offset in `chunk` at which to start searching.
 * @returns {number|undefined} offset in `chunk` up to which data has been
 *   processed: `chunk.length` unless `maxMatches` stopped the search early;
 *   `undefined` when `maxMatches` was already reached before the call.
 */
SBMH.prototype.push = function (chunk, pos) {
  if (!Buffer.isBuffer(chunk)) {
    chunk = Buffer.from(chunk, 'binary')
  }
  const chlen = chunk.length
  this._bufpos = pos || 0
  let r
  while (r !== chlen && this.matches < this.maxMatches) {
    r = this._sbmh_feed(chunk)
  }
  return r
}

/**
 * Run one search step over `data`, starting at `this._bufpos` (or inside the
 * lookbehind buffer when it is not empty). Stops at the first match or at the
 * end of `data`, emitting `'info'` events along the way.
 *
 * @param {Buffer} data - current chunk.
 * @returns {number} new value of `this._bufpos`: the offset right after the
 *   match, or `data.length` when the whole chunk has been consumed.
 */
SBMH.prototype._sbmh_feed = function (data) {
  const len = data.length
  const needle = this._needle
  const needleLength = needle.length
  const lastNeedleChar = needle[needleLength - 1]

  // Positive: points to a position in `data`
  //           pos == 3 points to data[3]
  // Negative: points to a position in the lookbehind buffer
  //           pos == -2 points to lookbehind[lookbehind_size - 2]
  let pos = -this._lookbehind_size
  let ch

  if (pos < 0) {
    // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool search
    // with character lookup code that considers both the lookbehind buffer
    // and the current chunk. Loop until there is a match, until we have
    // moved past the positions that need the lookbehind buffer, or until
    // the character to look at lies outside the haystack.
    while (pos < 0 && pos <= len - needleLength) {
      ch = this._sbmh_lookup_char(data, pos + needleLength - 1)

      if (
        ch === lastNeedleChar &&
        this._sbmh_memcmp(data, pos, needleLength - 1)
      ) {
        this._lookbehind_size = 0
        ++this.matches
        this.emit('info', true)

        return (this._bufpos = pos + needleLength)
      }
      pos += this._occ[ch]
    }

    // No match. If there is too little data for Boyer-Moore-Horspool to
    // run, forward pos until the trailing part of lookbehind + data looks
    // like the beginning of the needle, or until pos == 0.
    while (pos < 0 && !this._sbmh_memcmp(data, pos, len - pos)) {
      ++pos
    }

    if (pos < 0) {
      // Cut off the part of the lookbehind buffer that has been processed
      // and append the entire chunk to it.
      const bytesToCutOff = this._lookbehind_size + pos
      if (bytesToCutOff > 0) {
        // The cut off data is guaranteed not to contain the needle.
        this.emit('info', false, this._lookbehind, 0, bytesToCutOff)

        // Move the surviving bytes to the front. The 4th argument of
        // Buffer#copy is the source END offset, not a length.
        this._lookbehind.copy(this._lookbehind, 0, bytesToCutOff,
          this._lookbehind_size)
        this._lookbehind_size -= bytesToCutOff
      }

      data.copy(this._lookbehind, this._lookbehind_size)
      this._lookbehind_size += len

      this._bufpos = len
      return len
    }

    // Discard lookbehind buffer: none of it belongs to a needle.
    this.emit('info', false, this._lookbehind, 0, this._lookbehind_size)
    this._lookbehind_size = 0
  }

  // Lookbehind buffer is now empty and pos >= 0; continue from the caller's
  // offset inside the chunk.
  const start = pos + this._bufpos

  // We only need to check if the needle is in the remaining haystack.
  const found = data.indexOf(needle, start)
  if (found !== -1) {
    ++this.matches
    if (found > 0) {
      this.emit('info', true, data, this._bufpos, found)
    } else {
      this.emit('info', true)
    }

    return (this._bufpos = found + needleLength)
  }

  // There was no full match. The trailing bytes (fewer than the needle size)
  // may still be the beginning of a needle that continues in the next chunk,
  // so look for the longest chunk suffix that is a needle prefix. Never scan
  // before `start`: those bytes were already consumed (e.g. by a previous
  // match in this chunk) and must not take part in another match.
  pos = Math.max(start, len - needleLength)
  while (
    pos < len &&
    (
      data[pos] !== needle[0] ||
      Buffer.compare(
        data.subarray(pos, len),
        needle.subarray(0, len - pos)
      ) !== 0
    )
  ) {
    ++pos
  }

  // Whatever trailing data is left is kept in the lookbehind buffer.
  if (pos < len) {
    data.copy(this._lookbehind, 0, pos, len)
    this._lookbehind_size = len - pos
  }

  // Everything until pos is guaranteed not to contain needle data.
  if (pos > 0) {
    this.emit('info', false, data, this._bufpos, pos < len ? pos : len)
  }

  this._bufpos = len
  return len
}

/**
 * Read one haystack byte by virtual position.
 *
 * @param {Buffer} data - current chunk.
 * @param {number} pos - non-negative: index into `data`; negative: offset
 *   from the end of the valid part of the lookbehind buffer.
 * @returns {number|undefined} the byte value, or `undefined` if out of range.
 */
SBMH.prototype._sbmh_lookup_char = function (data, pos) {
  return (pos < 0)
    ? this._lookbehind[this._lookbehind_size + pos]
    : data[pos]
}

/**
 * Compare `len` haystack bytes starting at virtual position `pos` (see
 * `_sbmh_lookup_char`) with the first `len` bytes of the needle.
 *
 * @param {Buffer} data - current chunk.
 * @param {number} pos - virtual start position.
 * @param {number} len - number of bytes to compare.
 * @returns {boolean} `true` if all compared bytes are equal.
 */
SBMH.prototype._sbmh_memcmp = function (data, pos, len) {
  for (let i = 0; i < len; ++i) {
    if (this._sbmh_lookup_char(data, pos + i) !== this._needle[i]) { return false }
  }
  return true
}
|  | 
 | ||
|  | module.exports = SBMH |