You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					394 lines
				
				13 KiB
			
		
		
			
		
	
	
					394 lines
				
				13 KiB
			| 
											3 years ago
/**
 * A port of Google's `PhoneNumberMatcher.java`.
 * https://github.com/googlei18n/libphonenumber/blob/master/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberMatcher.java
 * Date: 08.03.2018.
 */
|  | 
 | ||
|  | import PhoneNumber from './PhoneNumber.js' | ||
|  | 
 | ||
|  | import { | ||
|  |   MAX_LENGTH_FOR_NSN, | ||
|  |   MAX_LENGTH_COUNTRY_CODE, | ||
|  |   VALID_PUNCTUATION | ||
|  | } from './constants.js' | ||
|  | 
 | ||
|  | import createExtensionPattern from './helpers/extension/createExtensionPattern.js' | ||
|  | 
 | ||
|  | import RegExpCache from './findNumbers/RegExpCache.js' | ||
|  | 
 | ||
|  | import { | ||
|  | 	limit, | ||
|  | 	trimAfterFirstMatch | ||
|  | } from './findNumbers/util.js' | ||
|  | 
 | ||
|  | import { | ||
|  | 	_pL, | ||
|  | 	_pN, | ||
|  | 	pZ, | ||
|  | 	PZ, | ||
|  | 	pNd | ||
|  | } from './findNumbers/utf-8.js' | ||
|  | 
 | ||
|  | import Leniency from './findNumbers/Leniency.js' | ||
|  | import parsePreCandidate from './findNumbers/parsePreCandidate.js' | ||
|  | import isValidPreCandidate from './findNumbers/isValidPreCandidate.js' | ||
|  | import isValidCandidate, { LEAD_CLASS } from './findNumbers/isValidCandidate.js' | ||
|  | 
 | ||
|  | import { isSupportedCountry } from './metadata.js' | ||
|  | 
 | ||
|  | import parseNumber from './parse_.js' | ||
|  | 
 | ||
// Extension pattern fragment, built once at module load by
// `createExtensionPattern('matching')`. It is appended to `PATTERN`
// as an optional trailing group.
const EXTN_PATTERNS_FOR_MATCHING = createExtensionPattern('matching')
|  | 
 | ||
/**
 * Patterns used to extract phone numbers from a larger phone-number-like pattern. These are
 * ordered according to specificity. For example, white-space is last since that is frequently
 * used in numbers, not just to separate two numbers. We have separate patterns since we don't
 * want to break up the phone-number-like text on more than one different kind of symbol at one
 * time, although symbols of the same type (e.g. space) can be safely grouped together.
 *
 * Note that if there is a match, we will always check any text found up to the first match as
 * well.
 *
 * Each entry is a regular expression source string (not a `RegExp`); capture group 1 is the
 * text following the separator.
 */
const INNER_MATCHES = [
  // Breaks on the slash - e.g. "651-234-2345/332-445-1234".
  // This is a pattern *source*, so it must not end with a `/` delimiter:
  // a trailing slash would require a second slash after the captured text
  // and the example above would never be split.
  '\\/+(.*)',

  // Note that the bracket here is inside the capturing group, since we consider it part of the
  // phone number. Will match a pattern like "(650) 223 3345 (754) 223 3321".
  '(\\([^(]*)',

  // Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number."
  // We require a space on either side of the hyphen for it to be considered a separator.
  `(?:${pZ}-|-${pZ})${pZ}*(.+)`,

  // Various types of wide hyphens. Note we have decided not to enforce a space here, since it's
  // possible that it's supposed to be used to break two numbers without spaces, and we haven't
  // seen many instances of it used within a number.
  `[\u2012-\u2015\uFF0D]${pZ}*(.+)`,

  // Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
  `\\.+${pZ}*([^.]+)`,

  // Breaks on space - e.g. "3324451234 8002341234"
  `${pZ}+(${PZ}+)`
]
|  | 
 | ||
// Limit on the number of leading (plus) characters.
const leadLimit = limit(0, 2)

// Limit on the number of consecutive punctuation characters.
const punctuationLimit = limit(0, 4)

// The maximum number of digits allowed in a digit-separated block. As we allow all digits in a
// single block, set high enough to accommodate the entire national number and the international
// country code.
const digitBlockLimit = MAX_LENGTH_FOR_NSN + MAX_LENGTH_COUNTRY_CODE

// Limit on the number of blocks separated by punctuation.
// Uses digitBlockLimit since some formats use spaces to separate each digit.
const blockLimit = limit(0, digitBlockLimit)

// A punctuation sequence allowing white space.
const punctuation = `[${VALID_PUNCTUATION}]` + punctuationLimit

// A digits block without punctuation.
const digitSequence = pNd + limit(1, digitBlockLimit)
|  | 
 | ||
/**
 * Phone number pattern allowing optional punctuation.
 * The phone number pattern used by `find()`, similar to
 * VALID_PHONE_NUMBER, but with the following differences:
 *
 * - All captures are limited in order to place an upper bound to the text matched by the
 *   pattern:
 *   - Leading punctuation / plus signs are limited.
 *   - Consecutive occurrences of punctuation are limited.
 *   - Number of digits is limited.
 * - No whitespace is allowed at the start or end.
 * - No alpha digits (vanity numbers such as 1-800-SIX-FLAGS) are currently supported.
 *
 * This is a regular expression source string; it is compiled per matcher instance.
 */
const PATTERN = '(?:' + LEAD_CLASS + punctuation + ')' + leadLimit
  + digitSequence + '(?:' + punctuation + digitSequence + ')' + blockLimit
  + '(?:' + EXTN_PATTERNS_FOR_MATCHING + ')?'
|  | 
 | ||
// Regular expression of trailing characters that we want to remove.
// We remove all characters that are not alpha or numerical characters.
// The hash character is retained here, as it may signify
// the previous block was an extension.
//
// The Java original is `[[\P{N}&&\P{L}]&&[^#]]+$`. In Java regular expressions
// `&&` inside a character class is set intersection, so that class means
// "not a number AND not a letter AND not '#'". The same set is expressed here
// as a single negated class.
const UNWANTED_END_CHAR_PATTERN = new RegExp(`[^${_pN}${_pL}#]+$`)

// Matches (and captures) a run of one or more non-digit characters.
const NON_DIGITS_PATTERN = /(\D+)/

// `Number.MAX_SAFE_INTEGER` with a fallback for environments that lack it.
const MAX_SAFE_INTEGER = Number.MAX_SAFE_INTEGER || Math.pow(2, 53) - 1
|  | 
 | ||
|  | /** | ||
|  |  * A stateful class that finds and extracts telephone numbers from {@linkplain CharSequence text}. | ||
|  |  * Instances can be created using the {@linkplain PhoneNumberUtil#findNumbers factory methods} in | ||
|  |  * {@link PhoneNumberUtil}. | ||
|  |  * | ||
|  |  * <p>Vanity numbers (phone numbers using alphabetic digits such as <tt>1-800-SIX-FLAGS</tt> are | ||
|  |  * not found. | ||
|  |  * | ||
|  |  * <p>This class is not thread-safe. | ||
|  |  */ | ||
/**
 * A stateful class that finds and extracts telephone numbers from text.
 * A port of Google's `PhoneNumberMatcher.java`.
 *
 * Vanity numbers (phone numbers using alphabetic digits such as `1-800-SIX-FLAGS`)
 * are not found.
 *
 * Usage is iterator-like: call `hasNext()`, then `next()`, until `hasNext()` returns `false`.
 * An instance keeps its own search position (in `this.PATTERN.lastIndex`) and its own
 * remaining `maxTries` budget, so it must not be shared between independent searches.
 */
export default class PhoneNumberMatcher {
  /**
   * Creates a new matcher.
   *
   * @param {string} [text=''] - The text to search.
   * @param {object} [options={}] - Search options. Unknown properties are kept on `this.options`.
   * @param {string} [options.defaultCountry] - The country to assume for phone numbers not
   *     written in international format. Replaced by `undefined` when `isSupportedCountry()`
   *     does not accept it for the given `metadata`.
   * @param {string} [options.defaultCallingCode] - Passed through to `parseNumber()`.
   * @param {string} [options.leniency] - A key of `Leniency`. When omitted it defaults to
   *     `'POSSIBLE'` if `options.extended` is truthy and to `'VALID'` otherwise.
   * @param {boolean} [options.extended] - Only used to pick the default leniency.
   * @param {number} [options.maxTries] - The maximum number of invalid candidates to try before
   *     giving up on the text. Must be `>= 0`. A falsy value (including `0`) is replaced by
   *     `MAX_SAFE_INTEGER`.
   * @param {boolean} [options.v2] - When truthy, `find()` returns
   *     `{ startsAt, endsAt, number }` where `number` is a `PhoneNumber` instance.
   * @param {object} metadata - Metadata passed to the parsing and validation helpers.
   * @throws {TypeError} If `maxTries` is negative or `leniency` is not a key of `Leniency`.
   */
  constructor(text = '', options = {}, metadata) {
    options = {
      ...options,
      defaultCallingCode: options.defaultCallingCode,
      defaultCountry: options.defaultCountry && isSupportedCountry(options.defaultCountry, metadata) ? options.defaultCountry : undefined,
      // The parentheses matter: `a || b ? x : y` parses as `(a || b) ? x : y`,
      // which would discard an explicitly supplied leniency.
      leniency: options.leniency || (options.extended ? 'POSSIBLE' : 'VALID'),
      maxTries: options.maxTries || MAX_SAFE_INTEGER
    }

    if (options.maxTries < 0) {
      throw new TypeError('`maxTries` must be `>= 0`')
    }

    this.text = text
    this.options = options
    this.metadata = metadata

    /** The degree of validation requested. */
    this.leniency = Leniency[options.leniency]

    if (!this.leniency) {
      throw new TypeError(`Unknown leniency: ${options.leniency}.`)
    }

    /** The maximum number of retries after matching an invalid number. */
    this.maxTries = options.maxTries

    // A per-instance global regular expression: its `lastIndex` is the search cursor
    // that `find()` advances on every call.
    this.PATTERN = new RegExp(PATTERN, 'ig')

    /** The iteration tristate: 'NOT_READY', 'READY' or 'DONE'. */
    this.state = 'NOT_READY'

    /** The next index to start searching at. */
    this.searchIndex = 0

    // A cache for frequently used country-specific regular expressions. Set to 32 to cover ~2-3
    // countries being used for the same doc with ~10 patterns for each country. Some pages will have
    // a lot more countries in use, but typically fewer numbers for each so expanding the cache for
    // that use-case won't have a lot of benefit.
    this.regExpCache = new RegExpCache(32)
  }

  /**
   * Attempts to find the next phone number in the text, continuing from where the
   * previous call stopped. Each rejected candidate consumes one unit of `this.maxTries`.
   *
   * @return {object|undefined} The match found (see `parseAndVerify()`, or the
   *     `{ startsAt, endsAt, number }` shape when `options.v2` is set),
   *     or `undefined` when there are no more matches or the tries budget is exhausted.
   */
  find() {
    let matches
    while ((this.maxTries > 0) && (matches = this.PATTERN.exec(this.text)) !== null) {
      let candidate = matches[0]
      const offset = matches.index

      candidate = parsePreCandidate(candidate)

      if (isValidPreCandidate(candidate, offset, this.text)) {
        const match =
          // Try to come up with a valid match given the entire candidate.
          this.parseAndVerify(candidate, offset, this.text)
          // If that failed, try to find an "inner match" -
          // there might be a phone number within this candidate.
          || this.extractInnerMatch(candidate, offset, this.text)

        if (match) {
          if (this.options.v2) {
            const phoneNumber = new PhoneNumber(
              match.country || match.countryCallingCode,
              match.phone,
              this.metadata
            )
            if (match.ext) {
              phoneNumber.ext = match.ext
            }
            return {
              startsAt: match.startsAt,
              endsAt: match.endsAt,
              number: phoneNumber
            }
          }
          return match
        }
      }

      this.maxTries--
    }
  }

  /**
   * Attempts to extract a match from `substring`
   * if the substring itself does not qualify as a match.
   * Tries each of `INNER_MATCHES` in order; every rejected candidate consumes
   * one unit of `this.maxTries`.
   *
   * @param {string} substring - The candidate that failed to verify as a whole.
   * @param {number} offset - The offset of `substring` within `text`.
   * @param {string} text - The full text being searched.
   * @return {object|undefined} The first verified match, if any.
   */
  extractInnerMatch(substring, offset, text) {
    for (const innerMatchPattern of INNER_MATCHES) {
      let isFirstMatch = true
      let candidateMatch
      const innerMatchRegExp = new RegExp(innerMatchPattern, 'g')
      while (this.maxTries > 0 && (candidateMatch = innerMatchRegExp.exec(substring)) !== null) {
        if (isFirstMatch) {
          // We should handle any group before this one too.
          const leadingCandidate = trimAfterFirstMatch(
            UNWANTED_END_CHAR_PATTERN,
            substring.slice(0, candidateMatch.index)
          )

          const leadingMatch = this.parseAndVerify(leadingCandidate, offset, text)

          if (leadingMatch) {
            return leadingMatch
          }

          this.maxTries--
          isFirstMatch = false
        }

        const candidate = trimAfterFirstMatch(UNWANTED_END_CHAR_PATTERN, candidateMatch[1])

        // Java code does `groupMatcher.start(1)` here,
        // but there's no way in javascript to get a `candidate` start index,
        // therefore resort to using this kind of an approximation.
        // (`groupMatcher` is called `candidateMatch` in this javascript port)
        // https://stackoverflow.com/questions/15934353/get-index-of-each-capture-in-a-javascript-regex
        const candidateIndexGuess = substring.indexOf(candidate, candidateMatch.index)

        const match = this.parseAndVerify(candidate, offset + candidateIndexGuess, text)
        if (match) {
          return match
        }

        this.maxTries--
      }
    }
  }

  /**
   * Parses a phone number from the `candidate` using `parseNumber` and
   * verifies it matches the requested `leniency`.
   *
   * @param {string} candidate - The candidate match.
   * @param {number} offset - The offset of `candidate` within `text`.
   * @param {string} text - The full text being searched.
   * @return {object|undefined} `{ startsAt, endsAt, phone }` plus either `country`
   *     or `countryCallingCode` (the latter when there is no country or it is `'001'`),
   *     plus `ext` when present; `undefined` when the candidate is rejected.
   */
  parseAndVerify(candidate, offset, text) {
    if (!isValidCandidate(candidate, offset, text, this.options.leniency)) {
      return
    }

    const number = parseNumber(
      candidate, {
        extended: true,
        defaultCountry: this.options.defaultCountry,
        defaultCallingCode: this.options.defaultCallingCode
      },
      this.metadata
    )

    if (!number.possible) {
      return
    }

    if (this.leniency(number, candidate, this.metadata, this.regExpCache)) {
      const result = {
        startsAt: offset,
        endsAt: offset + candidate.length,
        phone: number.phone
      }

      if (number.country && number.country !== '001') {
        result.country = number.country
      } else {
        result.countryCallingCode = number.countryCallingCode
      }

      if (number.ext) {
        result.ext = number.ext
      }

      return result
    }
  }

  /**
   * Tells whether another match is available, searching for it if necessary.
   * The found match is held in `this.lastMatch` until `next()` is called.
   *
   * @return {boolean}
   */
  hasNext() {
    if (this.state === 'NOT_READY') {
      this.lastMatch = this.find()

      if (this.lastMatch) {
        this.state = 'READY'
      } else {
        this.state = 'DONE'
      }
    }

    return this.state === 'READY'
  }

  /**
   * Returns the next match and advances the iteration.
   *
   * @return {object} The match found by the preceding (or implied) `hasNext()` call.
   * @throws {Error} If there are no more matches.
   */
  next() {
    // Check the state and find the next match as a side-effect if necessary.
    if (!this.hasNext()) {
      throw new Error('No next element')
    }

    // Don't retain that memory any longer than necessary.
    const result = this.lastMatch
    this.lastMatch = null
    this.state = 'NOT_READY'
    return result
  }
}