You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							258 lines
						
					
					
						
							5.8 KiB
						
					
					
				
			
		
		
	
	
							258 lines
						
					
					
						
							5.8 KiB
						
					
					
/**
 * Simple HTML Parser
 *
 * @author Zongmin Lei<leizongmin@gmail.com>
 */
 | 
						|
 | 
						|
var _ = require("./util");
 | 
						|
 | 
						|
/**
 * Extract the lower-cased tag name from the raw text of a single tag.
 *
 * The name is taken from the characters after the leading "<" up to the
 * first whitespace position reported by `_.spaceIndex`; when that helper
 * reports -1 the last character of the input is dropped as well (it is
 * assumed to be the closing ">" -- NOTE(review): an unterminated tag loses
 * its final character here). One leading and one trailing "/" are removed.
 *
 * @param {String} html e.g. '<a href="#">'
 * @return {String} tag name, e.g. 'a'
 */
function getTagName(html) {
  var spacePos = _.spaceIndex(html);
  // No whitespace: cut before the final character; otherwise cut just
  // after the whitespace (it is trimmed away below).
  var sliceEnd = spacePos === -1 ? -1 : spacePos + 1;
  var name = _.trim(html.slice(1, sliceEnd)).toLowerCase();

  // "</a>" -> "a"
  if (name.charAt(0) === "/") {
    name = name.slice(1);
  }
  // "<br/>" -> "br"
  if (name.charAt(name.length - 1) === "/") {
    name = name.slice(0, -1);
  }
  return name;
}
 | 
						|
 | 
						|
/**
 * Tell whether the raw tag text is a closing tag, i.e. begins with "</".
 *
 * @param {String} html e.g. '<a href="#">' or '</a>'
 * @return {Boolean} true when the first two characters are "</"
 */
function isClosing(html) {
  return html.charAt(0) === "<" && html.charAt(1) === "/";
}
 | 
						|
 | 
						|
/**
 * Scan `html`, hand every tag to `onTag` and every stretch of text between
 * tags to `escapeHtml`, and return the concatenation of their results.
 *
 * @param {String} html
 * @param {Function} onTag called as
 *   `onTag(sourcePosition, position, tag, html, isClosing)` where
 *   `sourcePosition` is the tag's offset in the input, `position` is the
 *   length of the output built so far, `tag` is the tag name, `html` is the
 *   raw tag text and `isClosing` tells whether it starts with "</".
 *   Its return value is appended to the output.
 * @param {Function} escapeHtml called with each piece of non-tag text; its
 *   return value is appended to the output
 * @return {String}
 */
function parseTag(html, onTag, escapeHtml) {
  "use strict";

  var output = "";
  var flushedUpTo = 0; // input before this offset is already in `output`
  var openAt = false; // offset of the pending "<", or false outside a tag
  var activeQuote = false; // quote character of the open attribute value
  var total = html.length;
  var pos;

  for (pos = 0; pos < total; pos++) {
    var ch = html.charAt(pos);

    // Outside a tag: only "<" is interesting.
    if (openAt === false) {
      if (ch === "<") {
        openAt = pos;
      }
      continue;
    }

    // Inside a quoted attribute value: wait for the matching quote.
    if (activeQuote !== false) {
      if (ch === activeQuote) {
        activeQuote = false;
      }
      continue;
    }

    // A second "<" before the tag was closed: everything up to here is
    // plain text, and the tag restarts at this position.
    if (ch === "<") {
      output += escapeHtml(html.slice(flushedUpTo, pos));
      openAt = pos;
      flushedUpTo = pos;
      continue;
    }

    // End of the tag. Reaching the last input character also ends it, so
    // an unterminated trailing tag is still reported to onTag.
    if (ch === ">" || pos === total - 1) {
      output += escapeHtml(html.slice(flushedUpTo, openAt));
      var tagHtml = html.slice(openAt, pos + 1);
      var tagName = getTagName(tagHtml);
      output += onTag(
        openAt,
        output.length,
        tagName,
        tagHtml,
        isClosing(tagHtml)
      );
      flushedUpTo = pos + 1;
      openAt = false;
      continue;
    }

    // A quote opens an attribute value only when the nearest preceding
    // non-whitespace character is "=". The backward walk always stops at
    // the tag's own "<" at the latest.
    if (ch === '"' || ch === "'") {
      var back = pos - 1;
      var prev = html.charAt(back);
      while (prev === "=" || prev.trim() === "") {
        if (prev === "=") {
          activeQuote = ch;
          break;
        }
        back -= 1;
        prev = html.charAt(back);
      }
    }
  }

  // Whatever is left (including an unfinished tag inside an open quote)
  // is treated as text.
  if (flushedUpTo < total) {
    output += escapeHtml(html.slice(flushedUpTo));
  }

  return output;
}
 | 
						|
 | 
						|
// Matches every character that is NOT an ASCII letter, a digit, or one of
// `\`, `_`, `:`, `.`, `-`. parseAttr removes all matches from attribute names.
var REGEXP_ILLEGAL_ATTR_NAME = /[^a-zA-Z0-9\\_:.-]/gim;
 | 
						|
 | 
						|
/**
 * Parse the attribute section of a tag, pass every attribute to `onAttr`
 * and return the attributes that `onAttr` kept.
 *
 * Attribute names are trimmed, stripped of characters matching
 * REGEXP_ILLEGAL_ATTR_NAME and lower-cased; attributes whose name ends up
 * empty are skipped. Values may be double-quoted, single-quoted or unquoted;
 * an attribute without a value is reported with the value "".
 *
 * @param {String} html e.g. `href="#" target="_blank"`
 * @param {Function} onAttr e.g. `function (name, value)`; a truthy return
 *   value is kept as that attribute's output, a falsy one drops it
 * @return {String} the kept outputs joined with a single space and trimmed
 */
function parseAttr(html, onAttr) {
  "use strict";

  var lastPos = 0; // start of the name / value currently being collected
  var lastMarkPos = 0; // position of the opening quote of the current value
  var retAttrs = [];
  var tmpName = false; // raw attribute name while its value is pending
  var len = html.length;
  var normalized = false; // whether whitespace was already turned into " "

  function addAttr(name, value) {
    name = _.trim(name);
    name = name.replace(REGEXP_ILLEGAL_ATTR_NAME, "").toLowerCase();
    if (name.length < 1) return;
    var ret = onAttr(name, value || "");
    if (ret) retAttrs.push(ret);
  }

  // Analyse the input character by character.
  for (var i = 0; i < len; i++) {
    var c = html.charAt(i);
    var v, j;

    // "=" ends the name. Remember where the opening quote of the value is:
    // either right after the "=" or after a run of spaces.
    if (tmpName === false && c === "=") {
      tmpName = html.slice(lastPos, i);
      lastPos = i + 1;
      lastMarkPos =
        html.charAt(lastPos) === '"' || html.charAt(lastPos) === "'"
          ? lastPos
          : findNextQuotationMark(html, i + 1);
      continue;
    }

    // Opening quote of a quoted value: jump to the matching closing quote.
    if (tmpName !== false && i === lastMarkPos) {
      j = html.indexOf(c, i + 1);
      if (j === -1) {
        // Unterminated quote: the remainder is handled after the loop.
        break;
      }
      v = _.trim(html.slice(lastMarkPos + 1, j));
      addAttr(tmpName, v);
      tmpName = false;
      i = j;
      lastPos = i + 1;
      continue;
    }

    if (!/\s|\n|\t/.test(c)) {
      continue;
    }

    // From the first top-level whitespace on, every whitespace character is
    // treated as a plain space (the helper functions only skip " "). The
    // replacement keeps the length and is a no-op when repeated, so it is
    // done once instead of once per whitespace character.
    if (!normalized) {
      html = html.replace(/\s|\n|\t/g, " ");
      normalized = true;
    }

    if (tmpName === false) {
      // Whitespace after a name: either `name   =value` or a bare name.
      j = findNextEqual(html, i);
      if (j === -1) {
        v = _.trim(html.slice(lastPos, i));
        addAttr(v);
        tmpName = false;
        lastPos = i + 1;
        continue;
      }
      if (typeof j !== "number") {
        // No index reported: only spaces remain, so the pending name is
        // flushed by the trailing handling below.
        break;
      }
      // Resume at the "=" so the next iteration picks it up.
      i = j - 1;
      continue;
    }

    // Whitespace while a value is pending: if only spaces separate it from
    // the "=", the value has not started yet; otherwise it ends an unquoted
    // value.
    j = findBeforeEqual(html, i - 1);
    if (j === -1) {
      v = _.trim(html.slice(lastPos, i));
      v = stripQuoteWrap(v);
      addAttr(tmpName, v);
      tmpName = false;
      lastPos = i + 1;
    }
  }

  // Flush whatever is left: a bare trailing name or a trailing value.
  if (lastPos < html.length) {
    if (tmpName === false) {
      addAttr(html.slice(lastPos));
    } else {
      addAttr(tmpName, stripQuoteWrap(_.trim(html.slice(lastPos))));
    }
  }

  return _.trim(retAttrs.join(" "));
}
 | 
						|
 | 
						|
/**
 * Look for "=" at or after index `i`, skipping space characters only.
 *
 * @param {String} str
 * @param {Number} i index to start scanning from
 * @return {Number} index of the "=", or -1 when another character comes
 *   first or the end of the string is reached
 */
function findNextEqual(str, i) {
  for (; i < str.length; i++) {
    var c = str[i];
    if (c === " ") continue;
    return c === "=" ? i : -1;
  }
  // Only spaces (or nothing) left: report "not found" with the same
  // sentinel as every other miss instead of falling through to undefined.
  return -1;
}
 | 
						|
 | 
						|
/**
 * Look for a quotation mark (' or ") at or after index `i`, skipping space
 * characters only.
 *
 * @param {String} str
 * @param {Number} i index to start scanning from
 * @return {Number} index of the quotation mark, or -1 when another
 *   character comes first or the end of the string is reached
 */
function findNextQuotationMark(str, i) {
  for (; i < str.length; i++) {
    var c = str[i];
    if (c === " ") continue;
    return c === "'" || c === '"' ? i : -1;
  }
  // Only spaces (or nothing) left: use the same "not found" sentinel as
  // every other miss instead of falling through to undefined.
  return -1;
}
 | 
						|
 | 
						|
/**
 * Look backwards for "=" starting at index `i`, skipping space characters
 * only.
 *
 * @param {String} str
 * @param {Number} i index to start scanning from (moving towards 0)
 * @return {Number} index of the "=", or -1 when another character comes
 *   first or the start of the string is passed
 */
function findBeforeEqual(str, i) {
  // `i >= 0` so that the very first character is inspected too.
  for (; i >= 0; i--) {
    var c = str[i];
    if (c === " ") continue;
    return c === "=" ? i : -1;
  }
  // Only spaces (or nothing) before the start position.
  return -1;
}
 | 
						|
 | 
						|
/**
 * Tell whether `text` starts and ends with the same quotation mark
 * (both `"` or both `'`). A string consisting of a single quotation mark
 * counts as wrapped, because its first and last character coincide.
 *
 * @param {String} text
 * @return {Boolean}
 */
function isQuoteWrapString(text) {
  var first = text[0];
  var last = text[text.length - 1];
  var startsWithQuote = first === '"' || first === "'";
  return startsWithQuote && last === first;
}
 | 
						|
 | 
						|
/**
 * Remove one pair of surrounding quotation marks from `text` when
 * isQuoteWrapString reports it as quote-wrapped; otherwise return `text`
 * unchanged.
 *
 * @param {String} text
 * @return {String}
 */
function stripQuoteWrap(text) {
  return isQuoteWrapString(text) ? text.slice(1, -1) : text;
}
 | 
						|
 | 
						|
// Public API of this module.
exports.parseTag = parseTag;
exports.parseAttr = parseAttr;
 |