You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							214 lines
						
					
					
						
							6.1 KiB
						
					
					
				
			
		
		
	
	
							214 lines
						
					
					
						
							6.1 KiB
						
					
					
				| /*
 | |
|   Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation
 | |
|   by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool
 | |
| */
 | |
| var EventEmitter = require('events').EventEmitter,
 | |
|     inherits = require('util').inherits;
 | |
| 
 | |
/**
 * Compares `num` consecutive elements of two indexable sequences.
 *
 * @param {Buffer|Array} buf1 First sequence.
 * @param {number} pos1 Index in `buf1` at which the comparison starts.
 * @param {Buffer|Array} buf2 Second sequence.
 * @param {number} pos2 Index in `buf2` at which the comparison starts.
 * @param {number} num Number of elements to compare.
 * @returns {boolean} `true` when all `num` element pairs are strictly equal
 *   (trivially `true` when `num <= 0`), `false` otherwise.
 */
function jsmemcmp(buf1, pos1, buf2, pos2, num) {
  for (var i = 0; i < num; ++i, ++pos1, ++pos2) {
    if (buf1[pos1] !== buf2[pos2])
      return false;
  }
  return true;
}

/**
 * Streaming Boyer-Moore-Horspool searcher.
 *
 * Data is supplied chunk by chunk through `push()`. Progress is reported
 * through `'info'` events emitted as
 * `emit('info', isMatch, data, start, end)`:
 *   - `isMatch` is `true` when the needle was found; `data[start..end)`
 *     (when present) is the non-matching data that preceded the needle.
 *   - `isMatch` is `false` when `data[start..end)` is data that does not
 *     contain the needle.
 * When a match is not preceded by any pending data the event is emitted
 * with `isMatch` only.
 *
 * Public properties:
 *   - `maxMatches`: `push()` stops searching once `matches` reaches it
 *     (defaults to `Infinity`).
 *   - `matches`: number of needle occurrences found so far.
 *
 * @param {string|Buffer} needle Byte sequence to search for. Strings are
 *   converted to a Buffer (default encoding).
 */
function SBMH(needle) {
  if (typeof needle === 'string')
    needle = Buffer.from(needle);
  var i, j, needle_len = needle.length;

  this.maxMatches = Infinity;
  this.matches = 0;

  this._occ = new Array(256);
  this._lookbehind_size = 0;
  this._needle = needle;
  this._bufpos = 0;

  // Holds the tail of previously pushed data that may be the start of a
  // needle spanning a chunk boundary.
  this._lookbehind = Buffer.alloc(needle_len);

  // Initialize occurrence table: by default skip a full needle length.
  for (j = 0; j < 256; ++j)
    this._occ[j] = needle_len;

  // Populate occurrence table with analysis of the needle,
  // ignoring last letter.
  if (needle_len >= 1) {
    for (i = 0; i < needle_len - 1; ++i)
      this._occ[needle[i]] = needle_len - 1 - i;
  }
}
inherits(SBMH, EventEmitter);

/**
 * Clears the search state (pending lookbehind data, match counter and
 * buffer position). `maxMatches` and the needle are left untouched.
 */
SBMH.prototype.reset = function() {
  this._lookbehind_size = 0;
  this.matches = 0;
  this._bufpos = 0;
};

/**
 * Feeds a chunk of haystack data to the searcher, emitting `'info'` events
 * until the chunk is consumed or `maxMatches` has been reached.
 *
 * @param {Buffer|string} chunk Data to search. Non-Buffer input is converted
 *   with the `'binary'` encoding.
 * @param {number} [pos=0] Offset in `chunk` at which processing starts.
 * @returns {number|undefined} Offset in `chunk` just past the processed
 *   data: the chunk length, or the offset following the last match when
 *   `maxMatches` stopped the search early. `undefined` when `maxMatches`
 *   had already been reached before the call.
 */
SBMH.prototype.push = function(chunk, pos) {
  var r, chlen;
  if (!Buffer.isBuffer(chunk))
    chunk = Buffer.from(chunk, 'binary');
  chlen = chunk.length;
  this._bufpos = pos || 0;
  while (r !== chlen && this.matches < this.maxMatches)
    r = this._sbmh_feed(chunk);
  return r;
};

/**
 * Runs one search step over `data`, starting from the pending lookbehind
 * data (if any) and then from `this._bufpos`. Stops at the first match or
 * at the end of `data`.
 *
 * @param {Buffer} data Current chunk.
 * @returns {number} Offset in `data` just past the match, or `data.length`
 *   when no match was found. The same value is stored in `this._bufpos`.
 */
SBMH.prototype._sbmh_feed = function(data) {
  var len = data.length, needle = this._needle, needle_len = needle.length;

  // Positive: points to a position in `data`
  //           pos == 3 points to data[3]
  // Negative: points to a position in the lookbehind buffer
  //           pos == -2 points to lookbehind[lookbehind_size - 2]
  var pos = -this._lookbehind_size,
      last_needle_char = needle[needle_len - 1],
      occ = this._occ,
      lookbehind = this._lookbehind,
      ch,
      prev_lookbehind_size,
      bytesToCutOff;

  if (pos < 0) {
    // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool
    // search with character lookup code that considers both the
    // lookbehind buffer and the current round's haystack data.
    //
    // Loop until
    //   there is a match.
    // or until
    //   we've moved past the position that requires the
    //   lookbehind buffer. In this case we switch to the
    //   optimized loop.
    // or until
    //   the character to look at lies outside the haystack.
    while (pos < 0 && pos <= len - needle_len) {
      ch = this._sbmh_lookup_char(data, pos + needle_len - 1);

      if (ch === last_needle_char
          && this._sbmh_memcmp(data, pos, needle_len - 1)) {
        // Remember the size before clearing it: it is needed to work out
        // how many lookbehind bytes precede the match.
        prev_lookbehind_size = this._lookbehind_size;
        this._lookbehind_size = 0;
        ++this.matches;
        if (pos > -prev_lookbehind_size)
          this.emit('info', true, lookbehind, 0, prev_lookbehind_size + pos);
        else
          this.emit('info', true);

        this._bufpos = pos + needle_len;
        return pos + needle_len;
      } else
        pos += occ[ch];
    }

    // No match.

    if (pos < 0) {
      // There's too few data for Boyer-Moore-Horspool to run,
      // so let's use a different algorithm to skip as much as
      // we can.
      // Forward pos until
      //   the trailing part of lookbehind + data
      //   looks like the beginning of the needle
      // or until
      //   pos == 0
      while (pos < 0 && !this._sbmh_memcmp(data, pos, len - pos))
        pos++;
    }

    if (pos >= 0) {
      // Discard lookbehind buffer.
      this.emit('info', false, lookbehind, 0, this._lookbehind_size);
      this._lookbehind_size = 0;
    } else {
      // Cut off part of the lookbehind buffer that has
      // been processed and append the entire haystack
      // into it.
      bytesToCutOff = this._lookbehind_size + pos;

      if (bytesToCutOff > 0) {
        // The cut off data is guaranteed not to contain the needle.
        this.emit('info', false, lookbehind, 0, bytesToCutOff);
      }

      // Shift the unprocessed tail to the front. `copy()` takes an END
      // index (not a length) as its last argument.
      lookbehind.copy(lookbehind, 0, bytesToCutOff, this._lookbehind_size);
      this._lookbehind_size -= bytesToCutOff;

      data.copy(lookbehind, this._lookbehind_size);
      this._lookbehind_size += len;

      this._bufpos = len;
      return len;
    }
  }

  // Resume scanning `data` after whatever was already processed.
  if (pos >= 0)
    pos += this._bufpos;

  // Lookbehind buffer is now empty. Perform Boyer-Moore-Horspool
  // search with optimized character lookup code that only considers
  // the current round's haystack data.
  while (pos <= len - needle_len) {
    ch = data[pos + needle_len - 1];

    if (ch === last_needle_char
        && data[pos] === needle[0]
        && jsmemcmp(needle, 0, data, pos, needle_len - 1)) {
      ++this.matches;
      if (pos > 0)
        this.emit('info', true, data, this._bufpos, pos);
      else
        this.emit('info', true);

      this._bufpos = pos + needle_len;
      return pos + needle_len;
    } else
      pos += occ[ch];
  }

  // There was no match. If there's trailing haystack data that we cannot
  // match yet using the Boyer-Moore-Horspool algorithm (because the trailing
  // data is less than the needle size) then match using a modified
  // algorithm that starts matching from the beginning instead of the end.
  // Whatever trailing data is left after running this algorithm is added to
  // the lookbehind buffer.
  if (pos < len) {
    while (pos < len && (data[pos] !== needle[0]
                         || !jsmemcmp(data, pos, needle, 0, len - pos))) {
      ++pos;
    }
    if (pos < len) {
      data.copy(lookbehind, 0, pos, len);
      this._lookbehind_size = len - pos;
    }
  }

  // Everything until pos is guaranteed not to contain needle data.
  if (pos > 0)
    this.emit('info', false, data, this._bufpos, pos < len ? pos : len);

  this._bufpos = len;
  return len;
};

/**
 * Reads one byte from the virtual sequence "lookbehind buffer + data".
 *
 * @param {Buffer} data Current chunk.
 * @param {number} pos Position; negative values index the lookbehind buffer
 *   counting back from its end, non-negative values index `data`.
 * @returns {number|undefined} The byte at `pos`.
 */
SBMH.prototype._sbmh_lookup_char = function(data, pos) {
  if (pos < 0)
    return this._lookbehind[this._lookbehind_size + pos];
  else
    return data[pos];
};

/**
 * Checks whether the first `len` bytes of the needle equal the `len` bytes
 * starting at `pos` in the virtual sequence "lookbehind buffer + data".
 *
 * @param {Buffer} data Current chunk.
 * @param {number} pos Start position (may be negative, see
 *   `_sbmh_lookup_char`).
 * @param {number} len Number of bytes to compare.
 * @returns {boolean} `true` when all compared bytes are equal.
 */
SBMH.prototype._sbmh_memcmp = function(data, pos, len) {
  var i = 0;

  while (i < len) {
    if (this._sbmh_lookup_char(data, pos + i) === this._needle[i])
      ++i;
    else
      return false;
  }
  return true;
};
 | |
| 
 | |
| module.exports = SBMH;
 |