Skip to content

Commit

Permalink
Merge branch '5.4' into 6.4
Browse files Browse the repository at this point in the history
* 5.4:
  fix syntax for PHP 7.2
  [Security] Fix Danish translations
  [DomCrawler] Encode html entities only if necessary
  [Serializer] Ignore when using #[Ignore] on a non-accessor
  [Filesystem] Strengthen the check of file permissions in `dumpFile`
  [Serializer] Fix XML scalar to object denormalization
  [HttpClient][EventSourceHttpClient] Fix consuming SSEs with \r\n separator
  • Loading branch information
xabbuh committed Apr 5, 2024
2 parents f0e7ec3 + 000634e commit fd18b0f
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 2 deletions.
22 changes: 20 additions & 2 deletions Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -1090,12 +1090,30 @@ protected function sibling(\DOMNode $node, string $siblingDir = 'nextSibling'):

/**
 * Parses HTML content with the HTML5 parser.
 *
 * If supportsEncoding() accepts the given charset, the content is handed to
 * the parser untouched and the charset is passed through the "encoding"
 * option. Otherwise the content is first run through convertToHtmlEntities()
 * and then parsed as UTF-8.
 *
 * @param string $htmlContent The raw markup to parse
 * @param string $charset     The charset the markup is declared to use
 */
private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    // Only fall back to entity conversion when the charset cannot be handled
    // directly; converting unconditionally would also rewrite text the parser
    // must keep verbatim.
    if (!$this->supportsEncoding($charset)) {
        $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
        $charset = 'UTF-8';
    }

    return $this->html5Parser->parse($htmlContent, ['encoding' => $charset]);
}

/**
 * Tells whether mb_convert_encoding() accepts the given encoding as a target.
 *
 * An empty string is converted as a probe: a usable encoding yields an empty
 * string back, anything else (a non-string result or a thrown error) is
 * reported as unsupported.
 */
private function supportsEncoding(string $encoding): bool
{
    try {
        // "@" silences the warning raised for an unknown encoding name.
        $probe = @mb_convert_encoding('', $encoding, 'UTF-8');
    } catch (\Throwable $e) {
        return false;
    }

    return '' === $probe;
}

private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
if ('UTF-8' === $charset && preg_match('//u', $htmlContent)) {
$htmlContent = '<?xml encoding="UTF-8">'.$htmlContent;
} else {
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
}

$internalErrors = libxml_use_internal_errors(true);

Expand Down
4 changes: 4 additions & 0 deletions Tests/AbstractCrawlerTestCase.php
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,10 @@ public function testAddContent()
$crawler = $this->createCrawler();
$crawler->addContent($this->getDoctype().'<html><meta http-equiv="Content-Type" content="text/html; charset=unicode" /><div class="foo"></html></html>');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() ignores bad charset');

$crawler = $this->createCrawler();
$crawler->addContent($this->getDoctype().'<html><script>var foo = "bär";</script></html>', 'text/html; charset=UTF-8');
$this->assertEquals('var foo = "bär";', $crawler->filterXPath('//script')->text(), '->addContent() does not interfere with script content');
}

/**
Expand Down

0 comments on commit fd18b0f

Please sign in to comment.