Skip to content

Commit

Permalink
Merge pull request #220 from milahu/fix-parse-comments
Browse files Browse the repository at this point in the history
Fix parse comments
  • Loading branch information
taoqf committed Sep 24, 2022
2 parents 8f4cedf + b75810d commit b0db7b1
Show file tree
Hide file tree
Showing 12 changed files with 841 additions and 831 deletions.
1 change: 1 addition & 0 deletions .eslintrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
"consistent-return": "off",
"camelcase": "off",
"@typescript-eslint/camelcase": "off",
"@typescript-eslint/no-this-alias": "off",
"curly": [
"error",
"multi-line",
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:

strategy:
matrix:
node-version: [12.x, 14.x, 16.x, 17.x]
node-version: [14.x, 16.x, 17.x]

steps:
- name: Checkout
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ jobs:
- name: Checkout
uses: actions/checkout@v2

- name: Setup Node.js 12.x to publish to npmjs.org
- name: Setup Node.js 14.x to publish to npmjs.org
uses: actions/setup-node@v1
with:
node-version: '12.x'
node-version: '14.x'
registry-url: 'https://registry.npmjs.org'

- name: Install Packages
Expand Down
19 changes: 10 additions & 9 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"compile": "tsc",
"build": "npm run lint && npm run clean && npm run compile:cjs && npm run compile:amd",
"compile:cjs": "tsc -m commonjs",
"watch": "npx tsc -m commonjs --watch --preserveWatchOutput",
"compile:amd": "tsc -t es5 -m amd -d false --outFile ./dist/main.js",
"lint": "eslint ./src/*.ts ./src/**/*.ts",
"---------------": "",
Expand Down Expand Up @@ -47,7 +48,7 @@
"registry": "https://registry.npmjs.org"
},
"dependencies": {
"css-select": "^4.2.1",
"css-select": "^5.1.0",
"he": "1.2.0"
},
"devDependencies": {
Expand All @@ -58,31 +59,31 @@
"@typescript-eslint/eslint-plugin-tslint": "latest",
"@typescript-eslint/parser": "latest",
"blanket": "latest",
"cheerio": "^1.0.0-rc.5",
"cheerio": "^1.0.0-rc.12",
"cross-env": "^7.0.3",
"eslint": "^7.32.0",
"eslint": "^8.23.1",
"eslint-config-prettier": "latest",
"eslint-plugin-import": "latest",
"high5": "^1.0.0",
"html-dom-parser": "^1.0.4",
"html-dom-parser": "^3.1.2",
"html-parser": "^0.11.0",
"html5parser": "^2.0.2",
"htmljs-parser": "^2.11.1",
"htmljs-parser": "^5.1.4",
"htmlparser": "^1.7.7",
"htmlparser-benchmark": "^1.1.3",
"htmlparser2": "^6.0.0",
"htmlparser2": "^8.0.1",
"mocha": "latest",
"mocha-each": "^2.0.1",
"neutron-html5parser": "^0.2.0",
"np": "latest",
"parse5": "^6.0.1",
"parse5": "^7.1.1",
"rimraf": "^3.0.2",
"saxes": "^6.0.0",
"should": "latest",
"spec": "latest",
"standard-version": "^9.3.1",
"standard-version": "^9.5.0",
"travis-cov": "latest",
"ts-node": "^10.2.1",
"ts-node": "^10.9.1",
"typescript": "latest"
},
"config": {
Expand Down
5 changes: 1 addition & 4 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,7 @@ export {
NodeType
};

export default function parse(data: string, options = {
lowerCaseTagName: false,
comment: false
} as Partial<Options>) {
/**
 * Default-export entry point of the package.
 * Forwards `data` and `options` unchanged to `baseParse` and returns its result.
 *
 * @param data HTML source text to parse
 * @param options parse options; defaults to an empty object, so no option is preset here
 */
export default function parse(data: string, options = {} as Partial<Options>) {
return baseParse(data, options);
}

Expand Down
2 changes: 1 addition & 1 deletion src/matcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,4 @@ export default {
hasAttrib,
findOne,
findAll
} as Adapter<Node, HTMLElement>;
} as unknown as Adapter<Node, HTMLElement>;
30 changes: 16 additions & 14 deletions src/nodes/html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ class DOMTokenList {
export default class HTMLElement extends Node {
private _attrs: Attributes;
private _rawAttrs: RawAttributes;
private _parseOptions: Partial<Options>;
public rawTagName: string; // there is not friend funciton in es
public id: string;
public classList: DOMTokenList;
Expand Down Expand Up @@ -173,13 +174,15 @@ export default class HTMLElement extends Node {
public rawAttrs = '',
parentNode: HTMLElement | null,
range: [number, number],
private voidTag = new VoidTag()
private voidTag = new VoidTag(),
_parseOptions = {} as Partial<Options>
) {
super(parentNode, range);
this.rawTagName = tagName;
this.rawAttrs = rawAttrs || '';
this.id = keyAttrs.id || '';
this.childNodes = [];
this._parseOptions = _parseOptions;
this.classList = new DOMTokenList(
keyAttrs.class ? keyAttrs.class.split(/\s+/) : [],
(classList) => this.setAttribute('class', classList.toString()) // eslint-disable-line @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-call
Expand Down Expand Up @@ -327,8 +330,7 @@ export default class HTMLElement extends Node {
}

public set innerHTML(content: string) {
//const r = parse(content, global.options); // TODO global.options ?
const r = parse(content);
const r = parse(content, this._parseOptions);
const nodes = r.childNodes.length ? r.childNodes : [new TextNode(content, this)];
resetParent(nodes, this);
resetParent(this.childNodes, null);
Expand All @@ -339,8 +341,9 @@ export default class HTMLElement extends Node {
if (content instanceof Node) {
content = [content];
} else if (typeof content == 'string') {
options = { ...this._parseOptions, ...options };
const r = parse(content, options);
content = r.childNodes.length ? r.childNodes : [new TextNode(content, this)];
content = r.childNodes.length ? r.childNodes : [new TextNode(r.innerHTML, this)];
}
resetParent(this.childNodes, null);
resetParent(content, this);
Expand All @@ -355,8 +358,7 @@ export default class HTMLElement extends Node {
if (node instanceof Node) {
return [node];
} else if (typeof node == 'string') {
// const r = parse(content, global.options); // TODO global.options ?
const r = parse(node);
const r = parse(node, this._parseOptions);
return r.childNodes.length ? r.childNodes : [new TextNode(node, this)];
}
return [];
Expand Down Expand Up @@ -802,7 +804,7 @@ export default class HTMLElement extends Node {
if (arguments.length < 2) {
throw new Error('2 arguments required');
}
const p = parse(html);
const p = parse(html, this._parseOptions);
if (where === 'afterend') {
const idx = this.parentNode.childNodes.findIndex((child) => {
return child === this;
Expand Down Expand Up @@ -903,7 +905,7 @@ export default class HTMLElement extends Node {
* Clone this Node
*/
public clone() {
return parse(this.toString()).firstChild;
return parse(this.toString(), this._parseOptions).firstChild;
}
}

Expand Down Expand Up @@ -982,8 +984,8 @@ const kElementsClosedByClosing = {
};

export interface Options {
lowerCaseTagName: boolean;
comment: boolean;
lowerCaseTagName?: boolean;
comment?: boolean;
/**
* @see PR #215 for explanation
*/
Expand Down Expand Up @@ -1012,7 +1014,7 @@ const frameflag = 'documentfragmentcontainer';
* @param {string} data html
* @return {HTMLElement} root element
*/
export function base_parse(data: string, options = { lowerCaseTagName: false, comment: false } as Partial<Options>) {
export function base_parse(data: string, options = {} as Partial<Options>) {
const voidTag = new VoidTag(options?.voidTag?.closingSlash, options?.voidTag?.tags);
const elements = options.blockTextElements || {
script: true,
Expand All @@ -1033,7 +1035,7 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
}

const createRange = (startPos: number, endPos: number): [number, number] => [startPos - frameFlagOffset, endPos - frameFlagOffset];
const root = new HTMLElement(null, {}, '', null, [0, data.length], voidTag);
const root = new HTMLElement(null, {}, '', null, [0, data.length], voidTag, options);

let currentParent = root;
const stack = [root];
Expand Down Expand Up @@ -1116,7 +1118,7 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co

currentParent = currentParent.appendChild(
// Initialize range (end position updated later for closed tags)
new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos, tagEndPos), voidTag)
new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos, tagEndPos), voidTag, options)
);
stack.push(currentParent);

Expand Down Expand Up @@ -1178,7 +1180,7 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
* Parses HTML and returns a root element
* Parse a chunk of HTML source.
*/
export function parse(data: string, options = { lowerCaseTagName: false, comment: false } as Partial<Options>) {
export function parse(data: string, options = {} as Partial<Options>) {
const stack = base_parse(data, options);

const [root] = stack;
Expand Down
2 changes: 1 addition & 1 deletion src/valid.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { base_parse, Options } from './nodes/html';
* Parses a chunk of HTML source and reports whether it is valid.
* Returns true when the stack produced by `base_parse` holds exactly one element.
*/
export default function valid(data: string, options = { lowerCaseTagName: false, comment: false } as Partial<Options>) {
export default function valid(data: string, options = {} as Partial<Options>) {
// base_parse hands back its element stack rather than a single root node
const stack = base_parse(data, options);
// exactly one entry left on the stack is what this function treats as "valid"
return Boolean(stack.length === 1);
}
2 changes: 1 addition & 1 deletion test/benchmark/compare-htmlparser2.mjs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import benchmark from 'htmlparser-benchmark';
import htmlparser2 from "htmlparser2";
import * as htmlparser2 from "htmlparser2";

const { Parser } = htmlparser2;

Expand Down
2 changes: 1 addition & 1 deletion test/benchmark/compare-parse5.mjs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import benchmark from 'htmlparser-benchmark';
import parse5 from "parse5";
import * as parse5 from "parse5";

export default function parse() {
return new Promise((res) => {
Expand Down
100 changes: 95 additions & 5 deletions test/tests/html.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,41 @@ describe('HTML Parser', function () {
lowerCaseTagName: true
});

const u = undefined;
const div = new HTMLElement('div', {}, '', root, u, u, { lowerCaseTagName: true });

const a = div.appendChild(new HTMLElement('a', {}, '', u, u, u, { lowerCaseTagName: true }));
const img = a.appendChild(new HTMLElement('img', {}, '', u, u, u, { lowerCaseTagName: true }));
const p = div.appendChild(new HTMLElement('p', {}, '', u, u, u, { lowerCaseTagName: true }));

root.firstChild.should.eql(div);
});

// Skipped placeholder: describes the intended behaviour of a static
// `HTMLElement._defaultParseOptions`, which (per the FIXME below) is not used yet.
it.skip('TODO implement: should use HTMLElement._defaultParseOptions', function () {

// set _defaultParseOptions,
// so we don't have to pass parseOptions to every `new HTMLElement(...)`
// FIXME _defaultParseOptions is not used
should(HTMLElement._defaultParseOptions.lowerCaseTagName).be.undefined();
HTMLElement._defaultParseOptions.lowerCaseTagName = true;

// note: no options object is passed here; the test relies on the default set above
const root = parseHTML('<DIV><a><img/></A><p></P></div>');

// build the expected tree by hand, again without per-element parse options
const div = new HTMLElement('div', {}, '', root);

const a = div.appendChild(new HTMLElement('a', {}, ''));
const img = a.appendChild(new HTMLElement('img', {}, ''));
const p = div.appendChild(new HTMLElement('p', {}, ''));

root.firstChild.should.eql(div);

// cleanup? remove key from _defaultParseOptions object?
//HTMLElement._defaultParseOptions.lowerCaseTagName = undefined;
});

it('should deal uppercase', function () {
//should(HTMLElement._defaultParseOptions.comment).be.undefined();

const html = '<HTML xmlns="http://www.w3.org/1999/xhtml" lang="pt" xml:lang="pt-br"><HEAD><TITLE>SISREG III</TITLE><META http-equiv="Content-Type" content="text/html; charset=UTF-8" /><META http-equiv="Content-Language" content="pt-BR" /><LINK rel="stylesheet" href="/css/estilo.css" type="text/css"><SCRIPT type="text/javascript" src="/javascript/jquery.js" charset="utf-8"></SCRIPT><SCRIPT LANGUAGE=\'JavaScript\'></SCRIPT></HEAD><BODY link=\'#0000AA\' vlink=\'#0000AA\'><CENTER><h1>CONSULTA AO CADASTRO DE PACIENTES SUS</h1></CENTER><DIV id=\'progress_div\'><BR><BR><CENTER><IMG src=\'/imagens/loading.gif\' /></CENTER><CENTER><SPAN style=\'font-size: 80%\'>Processando...</SPAN></CENTER><BR><BR></DIV></BODY></HTML>';

const root = parseHTML(html, {
Expand Down Expand Up @@ -73,22 +98,86 @@ describe('HTML Parser', function () {
it('should parse "<div><a><!-- my comment --></a></div>" and return root element with comments', function () {
const root = parseHTML('<div><a><!-- my comment --></a></div>', { comment: true });

const div = new HTMLElement('div', {}, '', root);
const a = div.appendChild(new HTMLElement('a', {}, ''));
const comment = a.appendChild(new CommentNode(' my comment '));
const u = undefined;
const div = new HTMLElement('div', {}, '', root, u, u, { comment: true });
const a = div.appendChild(new HTMLElement('a', {}, '', u, u, u, { comment: true }));
const comment = a.appendChild(new CommentNode(' my comment ', u, u, u, u, u, { comment: true }));

root.firstChild.should.eql(div);
});

it('should not parse HTML inside comments', function () {
const root = parseHTML('<div><!--<a></a>--></div>', { comment: true });

const div = new HTMLElement('div', {}, '', root);
const comment = div.appendChild(new CommentNode('<a></a>'));
const u = undefined;
const div = new HTMLElement('div', {}, '', root, u, u, { comment: true });
const comment = div.appendChild(new CommentNode('<a></a>', u, u, u, u, u, { comment: true }));

root.firstChild.should.eql(div);
});

// yarn test:target -g 'should parse HTML comments in *'

// A tree parsed with `comment: true` must keep comments in HTML inserted
// later via insertAdjacentHTML, without the option being passed again.
it('should parse HTML comments in insertAdjacentHTML', function () {
const root = parseHTML('<div></div>', { comment: true });
const div = root.querySelector('div');
div.insertAdjacentHTML('afterend', '<!-- my comment -->');
// the comment must appear in the serialized root and as the div's next sibling
root.toString().should.eql('<div></div><!-- my comment -->');
div.nextSibling.toString().should.eql('<!-- my comment -->');
});

// Assigning to innerHTML on an element from a `comment: true` parse
// must keep the comment as a child node.
it('should parse HTML comments in set innerHTML', function () {
const root = parseHTML('<div></div>', { comment: true });
const div = root.querySelector('div');
div.innerHTML = '<!-- my comment -->';
root.toString().should.eql('<div><!-- my comment --></div>');
div.firstChild.toString().should.eql('<!-- my comment -->');
});

// set_content called without its own options is expected to keep a
// top-level comment when the tree was parsed with `comment: true`.
it('should parse HTML comments in set_content', function () {
const root = parseHTML('<div></div>', { comment: true });
const div = root.querySelector('div');
div.set_content('<!-- my comment -->');
root.toString().should.eql('<div><!-- my comment --></div>');
div.firstChild.toString().should.eql('<!-- my comment -->');
});

// Same as the previous set_content case, but the comment is nested
// inside an element in the new content.
it('should parse HTML comments in set_content - comment in div', function () {
const root = parseHTML('<div></div>', { comment: true });
const div = root.querySelector('div');
div.set_content('<div><!-- my comment --></div>');
root.toString().should.eql('<div><div><!-- my comment --></div></div>');
div.firstChild.toString().should.eql('<div><!-- my comment --></div>');
});

// An explicit `comment: false` passed to set_content must win over the
// `comment: true` the tree was parsed with: the comment is dropped.
it('should parse HTML comments NOT in set_content with option comment=false', function () {
const root = parseHTML('<div></div>', { comment: true });
const div = root.querySelector('div');
div.set_content('<!-- my comment -->', { comment: false });
// the raw comment text must not leak into the output either
root.toString().should.eql('<div></div>');
});

// Per-call `comment: false` must also strip a comment nested inside an
// element of the new content, while keeping that element itself.
it('should parse HTML comments NOT in set_content with option comment=false - comment in div', function () {
const root = parseHTML('<div></div>', { comment: true });
const div = root.querySelector('div');
div.set_content('<div><!-- my comment --></div>', { comment: false });
root.toString().should.eql('<div><div></div></div>');
});

// replaceWith given an HTML string is expected to keep the comment
// when the tree was parsed with `comment: true`.
it('should parse HTML comments in replaceWith', function () {
const root = parseHTML('<div></div>', { comment: true });
const div = root.querySelector('div');
div.replaceWith('<!-- my comment -->');
// the div is gone; only the comment remains under root
root.toString().should.eql('<!-- my comment -->');
});

// clone() must preserve comment children of an element that came
// from a `comment: true` parse.
it('should parse HTML comments in clone', function () {
const root = parseHTML('<div><!-- my comment --></div>', { comment: true });
const div = root.querySelector('div');
const clone = div.clone();
clone.toString().should.eql('<div><!-- my comment --></div>');
});

it('should parse picture element', function () {

const root = parseHTML('<picture><source srcset="/images/example-1.jpg 1200w, /images/example-2.jpg 1600w" sizes="100vw"><img src="/images/example.jpg" alt="Example"/></picture>');
Expand Down Expand Up @@ -164,6 +253,7 @@ describe('HTML Parser', function () {
});

it('should parse table currect', function () {
this.timeout(4000); // pass test on slow CPUs
const root = parseHTML(fs.readFileSync(__dirname + '/../assets/html/tables.html').toString(), {
script: true
});
Expand Down
Loading

0 comments on commit b0db7b1

Please sign in to comment.