Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parse comments #220

Merged
merged 18 commits into from
Sep 24, 2022
1 change: 1 addition & 0 deletions .eslintrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
"consistent-return": "off",
"camelcase": "off",
"@typescript-eslint/camelcase": "off",
"@typescript-eslint/no-this-alias": "off",
"curly": [
"error",
"multi-line",
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:

strategy:
matrix:
node-version: [12.x, 14.x, 16.x, 17.x]
node-version: [14.x, 16.x, 17.x]

steps:
- name: Checkout
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ jobs:
- name: Checkout
uses: actions/checkout@v2

- name: Setup Node.js 12.x to publish to npmjs.org
- name: Setup Node.js 14.x to publish to npmjs.org
uses: actions/setup-node@v1
with:
node-version: '12.x'
node-version: '14.x'
registry-url: 'https://registry.npmjs.org'

- name: Install Packages
Expand Down
19 changes: 10 additions & 9 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"compile": "tsc",
"build": "npm run lint && npm run clean && npm run compile:cjs && npm run compile:amd",
"compile:cjs": "tsc -m commonjs",
"watch": "npx tsc -m commonjs --watch --preserveWatchOutput",
"compile:amd": "tsc -t es5 -m amd -d false --outFile ./dist/main.js",
"lint": "eslint ./src/*.ts ./src/**/*.ts",
"---------------": "",
Expand Down Expand Up @@ -47,7 +48,7 @@
"registry": "https://registry.npmjs.org"
},
"dependencies": {
"css-select": "^4.2.1",
"css-select": "^5.1.0",
"he": "1.2.0"
},
"devDependencies": {
Expand All @@ -58,31 +59,31 @@
"@typescript-eslint/eslint-plugin-tslint": "latest",
"@typescript-eslint/parser": "latest",
"blanket": "latest",
"cheerio": "^1.0.0-rc.5",
"cheerio": "^1.0.0-rc.12",
"cross-env": "^7.0.3",
"eslint": "^7.32.0",
"eslint": "^8.23.1",
"eslint-config-prettier": "latest",
"eslint-plugin-import": "latest",
"high5": "^1.0.0",
"html-dom-parser": "^1.0.4",
"html-dom-parser": "^3.1.2",
"html-parser": "^0.11.0",
"html5parser": "^2.0.2",
"htmljs-parser": "^2.11.1",
"htmljs-parser": "^5.1.4",
"htmlparser": "^1.7.7",
"htmlparser-benchmark": "^1.1.3",
"htmlparser2": "^6.0.0",
"htmlparser2": "^8.0.1",
"mocha": "latest",
"mocha-each": "^2.0.1",
"neutron-html5parser": "^0.2.0",
"np": "latest",
"parse5": "^6.0.1",
"parse5": "^7.1.1",
"rimraf": "^3.0.2",
"saxes": "^6.0.0",
"should": "latest",
"spec": "latest",
"standard-version": "^9.3.1",
"standard-version": "^9.5.0",
"travis-cov": "latest",
"ts-node": "^10.2.1",
"ts-node": "^10.9.1",
"typescript": "latest"
},
"config": {
Expand Down
5 changes: 1 addition & 4 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,7 @@ export {
NodeType
};

export default function parse(data: string, options = {
lowerCaseTagName: false,
comment: false
} as Partial<Options>) {
export default function parse(data: string, options = {} as Partial<Options>) {
return baseParse(data, options);
}

Expand Down
2 changes: 1 addition & 1 deletion src/matcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,4 @@ export default {
hasAttrib,
findOne,
findAll
} as Adapter<Node, HTMLElement>;
} as unknown as Adapter<Node, HTMLElement>;
30 changes: 16 additions & 14 deletions src/nodes/html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ class DOMTokenList {
export default class HTMLElement extends Node {
private _attrs: Attributes;
private _rawAttrs: RawAttributes;
private _parseOptions: Partial<Options>;
public rawTagName: string; // there is not friend funciton in es
public id: string;
public classList: DOMTokenList;
Expand Down Expand Up @@ -173,13 +174,15 @@ export default class HTMLElement extends Node {
public rawAttrs = '',
parentNode: HTMLElement | null,
range: [number, number],
private voidTag = new VoidTag()
private voidTag = new VoidTag(),
_parseOptions = {} as Partial<Options>
) {
super(parentNode, range);
this.rawTagName = tagName;
this.rawAttrs = rawAttrs || '';
this.id = keyAttrs.id || '';
this.childNodes = [];
this._parseOptions = _parseOptions;
this.classList = new DOMTokenList(
keyAttrs.class ? keyAttrs.class.split(/\s+/) : [],
(classList) => this.setAttribute('class', classList.toString()) // eslint-disable-line @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-call
Expand Down Expand Up @@ -327,8 +330,7 @@ export default class HTMLElement extends Node {
}

public set innerHTML(content: string) {
//const r = parse(content, global.options); // TODO global.options ?
const r = parse(content);
const r = parse(content, this._parseOptions);
const nodes = r.childNodes.length ? r.childNodes : [new TextNode(content, this)];
resetParent(nodes, this);
resetParent(this.childNodes, null);
Expand All @@ -339,8 +341,9 @@ export default class HTMLElement extends Node {
if (content instanceof Node) {
content = [content];
} else if (typeof content == 'string') {
options = { ...this._parseOptions, ...options };
const r = parse(content, options);
content = r.childNodes.length ? r.childNodes : [new TextNode(content, this)];
content = r.childNodes.length ? r.childNodes : [new TextNode(r.innerHTML, this)];
}
resetParent(this.childNodes, null);
resetParent(content, this);
Expand All @@ -355,8 +358,7 @@ export default class HTMLElement extends Node {
if (node instanceof Node) {
return [node];
} else if (typeof node == 'string') {
// const r = parse(content, global.options); // TODO global.options ?
const r = parse(node);
const r = parse(node, this._parseOptions);
return r.childNodes.length ? r.childNodes : [new TextNode(node, this)];
}
return [];
Expand Down Expand Up @@ -802,7 +804,7 @@ export default class HTMLElement extends Node {
if (arguments.length < 2) {
throw new Error('2 arguments required');
}
const p = parse(html);
const p = parse(html, this._parseOptions);
if (where === 'afterend') {
const idx = this.parentNode.childNodes.findIndex((child) => {
return child === this;
Expand Down Expand Up @@ -903,7 +905,7 @@ export default class HTMLElement extends Node {
* Clone this Node
*/
public clone() {
return parse(this.toString()).firstChild;
return parse(this.toString(), this._parseOptions).firstChild;
}
}

Expand Down Expand Up @@ -982,8 +984,8 @@ const kElementsClosedByClosing = {
};

export interface Options {
lowerCaseTagName: boolean;
comment: boolean;
lowerCaseTagName?: boolean;
comment?: boolean;
/**
* @see PR #215 for explanation
*/
Expand Down Expand Up @@ -1012,7 +1014,7 @@ const frameflag = 'documentfragmentcontainer';
* @param {string} data html
* @return {HTMLElement} root element
*/
export function base_parse(data: string, options = { lowerCaseTagName: false, comment: false } as Partial<Options>) {
export function base_parse(data: string, options = {} as Partial<Options>) {
const voidTag = new VoidTag(options?.voidTag?.closingSlash, options?.voidTag?.tags);
const elements = options.blockTextElements || {
script: true,
Expand All @@ -1033,7 +1035,7 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
}

const createRange = (startPos: number, endPos: number): [number, number] => [startPos - frameFlagOffset, endPos - frameFlagOffset];
const root = new HTMLElement(null, {}, '', null, [0, data.length], voidTag);
const root = new HTMLElement(null, {}, '', null, [0, data.length], voidTag, options);

let currentParent = root;
const stack = [root];
Expand Down Expand Up @@ -1116,7 +1118,7 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co

currentParent = currentParent.appendChild(
// Initialize range (end position updated later for closed tags)
new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos, tagEndPos), voidTag)
new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos, tagEndPos), voidTag, options)
);
stack.push(currentParent);

Expand Down Expand Up @@ -1178,7 +1180,7 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
* Parses HTML and returns a root element
* Parse a chunk of HTML source.
*/
export function parse(data: string, options = { lowerCaseTagName: false, comment: false } as Partial<Options>) {
export function parse(data: string, options = {} as Partial<Options>) {
const stack = base_parse(data, options);

const [root] = stack;
Expand Down
2 changes: 1 addition & 1 deletion src/valid.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { base_parse, Options } from './nodes/html';
* Parses a chunk of HTML source and returns a boolean:
* true when the stack returned by base_parse holds exactly one entry.
*/
export default function valid(data: string, options = { lowerCaseTagName: false, comment: false } as Partial<Options>) {
export default function valid(data: string, options = {} as Partial<Options>) {
const stack = base_parse(data, options);
return Boolean(stack.length === 1);
}
2 changes: 1 addition & 1 deletion test/benchmark/compare-htmlparser2.mjs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import benchmark from 'htmlparser-benchmark';
import htmlparser2 from "htmlparser2";
import * as htmlparser2 from "htmlparser2";

const { Parser } = htmlparser2;

Expand Down
2 changes: 1 addition & 1 deletion test/benchmark/compare-parse5.mjs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import benchmark from 'htmlparser-benchmark';
import parse5 from "parse5";
import * as parse5 from "parse5";

export default function parse() {
return new Promise((res) => {
Expand Down
100 changes: 95 additions & 5 deletions test/tests/html.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,41 @@ describe('HTML Parser', function () {
lowerCaseTagName: true
});

const u = undefined;
const div = new HTMLElement('div', {}, '', root, u, u, { lowerCaseTagName: true });

const a = div.appendChild(new HTMLElement('a', {}, '', u, u, u, { lowerCaseTagName: true }));
const img = a.appendChild(new HTMLElement('img', {}, '', u, u, u, { lowerCaseTagName: true }));
const p = div.appendChild(new HTMLElement('p', {}, '', u, u, u, { lowerCaseTagName: true }));

root.firstChild.should.eql(div);
});

// Skipped placeholder for a planned feature: a shared `HTMLElement._defaultParseOptions`
// object that would supply parse options to elements constructed without an explicit
// options argument. Per the FIXME below, nothing reads it yet, hence `it.skip`.
it.skip('TODO implement: should use HTMLElement._defaultParseOptions', function () {

// Set _defaultParseOptions
// so we don't have to pass parseOptions to every `new HTMLElement(...)`.
// FIXME _defaultParseOptions is not used
should(HTMLElement._defaultParseOptions.lowerCaseTagName).be.undefined();
HTMLElement._defaultParseOptions.lowerCaseTagName = true;

// No options are passed here on purpose: the defaults set above are meant to apply.
const root = parseHTML('<DIV><a><img/></A><p></P></div>');

// Build the expected tree by hand, again without per-element options.
const div = new HTMLElement('div', {}, '', root);

const a = div.appendChild(new HTMLElement('a', {}, ''));
const img = a.appendChild(new HTMLElement('img', {}, ''));
const p = div.appendChild(new HTMLElement('p', {}, ''));

root.firstChild.should.eql(div);

// Open question: should the test undo its mutation of the shared defaults
// (e.g. by removing the key) so that later tests are not affected?
//HTMLElement._defaultParseOptions.lowerCaseTagName = undefined;
});

it('should deal uppercase', function () {
//should(HTMLElement._defaultParseOptions.comment).be.undefined();

const html = '<HTML xmlns="http://www.w3.org/1999/xhtml" lang="pt" xml:lang="pt-br"><HEAD><TITLE>SISREG III</TITLE><META http-equiv="Content-Type" content="text/html; charset=UTF-8" /><META http-equiv="Content-Language" content="pt-BR" /><LINK rel="stylesheet" href="/css/estilo.css" type="text/css"><SCRIPT type="text/javascript" src="/javascript/jquery.js" charset="utf-8"></SCRIPT><SCRIPT LANGUAGE=\'JavaScript\'></SCRIPT></HEAD><BODY link=\'#0000AA\' vlink=\'#0000AA\'><CENTER><h1>CONSULTA AO CADASTRO DE PACIENTES SUS</h1></CENTER><DIV id=\'progress_div\'><BR><BR><CENTER><IMG src=\'/imagens/loading.gif\' /></CENTER><CENTER><SPAN style=\'font-size: 80%\'>Processando...</SPAN></CENTER><BR><BR></DIV></BODY></HTML>';

const root = parseHTML(html, {
Expand Down Expand Up @@ -73,22 +98,86 @@ describe('HTML Parser', function () {
it('should parse "<div><a><!-- my comment --></a></div>" and return root element with comments', function () {
const root = parseHTML('<div><a><!-- my comment --></a></div>', { comment: true });

const div = new HTMLElement('div', {}, '', root);
const a = div.appendChild(new HTMLElement('a', {}, ''));
const comment = a.appendChild(new CommentNode(' my comment '));
const u = undefined;
const div = new HTMLElement('div', {}, '', root, u, u, { comment: true });
const a = div.appendChild(new HTMLElement('a', {}, '', u, u, u, { comment: true }));
const comment = a.appendChild(new CommentNode(' my comment ', u, u, u, u, u, { comment: true }));

root.firstChild.should.eql(div);
});

it('should not parse HTML inside comments', function () {
const root = parseHTML('<div><!--<a></a>--></div>', { comment: true });

const div = new HTMLElement('div', {}, '', root);
const comment = div.appendChild(new CommentNode('<a></a>'));
const u = undefined;
const div = new HTMLElement('div', {}, '', root, u, u, { comment: true });
const comment = div.appendChild(new CommentNode('<a></a>', u, u, u, u, u, { comment: true }));

root.firstChild.should.eql(div);
});

// yarn test:target -g 'should parse HTML comments in *'

// A tree parsed with `comment: true` must keep comments in HTML that is later
// inserted via insertAdjacentHTML (no options are passed to that call).
it('should parse HTML comments in insertAdjacentHTML', function () {
const root = parseHTML('<div></div>', { comment: true });
const div = root.querySelector('div');
// 'afterend' places the parsed nodes right after `div` among its parent's children.
div.insertAdjacentHTML('afterend', '<!-- my comment -->');
root.toString().should.eql('<div></div><!-- my comment -->');
div.nextSibling.toString().should.eql('<!-- my comment -->');
});

// Assigning to innerHTML on an element of a tree parsed with `comment: true`
// must keep the comment as a child node.
it('should parse HTML comments in set innerHTML', function () {
const root = parseHTML('<div></div>', { comment: true });
const div = root.querySelector('div');
div.innerHTML = '<!-- my comment -->';
root.toString().should.eql('<div><!-- my comment --></div>');
div.firstChild.toString().should.eql('<!-- my comment -->');
});

// set_content called without its own options must keep comments when the
// tree was parsed with `comment: true`.
it('should parse HTML comments in set_content', function () {
const root = parseHTML('<div></div>', { comment: true });
const div = root.querySelector('div');
div.set_content('<!-- my comment -->');
root.toString().should.eql('<div><!-- my comment --></div>');
div.firstChild.toString().should.eql('<!-- my comment -->');
});

// Same as the plain set_content case, but the comment sits inside a nested
// element of the new content rather than at its top level.
it('should parse HTML comments in set_content - comment in div', function () {
const root = parseHTML('<div></div>', { comment: true });
const div = root.querySelector('div');
div.set_content('<div><!-- my comment --></div>');
root.toString().should.eql('<div><div><!-- my comment --></div></div>');
div.firstChild.toString().should.eql('<div><!-- my comment --></div>');
});

// Options passed directly to set_content take precedence over the options the
// tree was parsed with: `comment: false` here drops the comment even though
// the root was parsed with `comment: true`.
it('should parse HTML comments NOT in set_content with option comment=false', function () {
const root = parseHTML('<div></div>', { comment: true });
const div = root.querySelector('div');
div.set_content('<!-- my comment -->', { comment: false });
// The content consisted only of the comment, so the div ends up empty.
root.toString().should.eql('<div></div>');
});

// Per-call `comment: false` also applies to comments nested inside elements
// of the new content: the inner div is kept, its comment is dropped.
it('should parse HTML comments NOT in set_content with option comment=false - comment in div', function () {
const root = parseHTML('<div></div>', { comment: true });
const div = root.querySelector('div');
div.set_content('<div><!-- my comment --></div>', { comment: false });
root.toString().should.eql('<div><div></div></div>');
});

// replaceWith given an HTML string must keep the comment when the tree was
// parsed with `comment: true`; the div is replaced entirely by the comment.
it('should parse HTML comments in replaceWith', function () {
const root = parseHTML('<div></div>', { comment: true });
const div = root.querySelector('div');
div.replaceWith('<!-- my comment -->');
root.toString().should.eql('<!-- my comment -->');
});

// clone() must not lose comments: cloning an element from a tree parsed with
// `comment: true` yields the same serialized HTML, comment included.
it('should parse HTML comments in clone', function () {
const root = parseHTML('<div><!-- my comment --></div>', { comment: true });
const div = root.querySelector('div');
const clone = div.clone();
clone.toString().should.eql('<div><!-- my comment --></div>');
});

it('should parse picture element', function () {

const root = parseHTML('<picture><source srcset="/images/example-1.jpg 1200w, /images/example-2.jpg 1600w" sizes="100vw"><img src="/images/example.jpg" alt="Example"/></picture>');
Expand Down Expand Up @@ -164,6 +253,7 @@ describe('HTML Parser', function () {
});

it('should parse table currect', function () {
this.timeout(4000); // pass test on slow CPUs
const root = parseHTML(fs.readFileSync(__dirname + '/../assets/html/tables.html').toString(), {
script: true
});
Expand Down
Loading