This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Support rendering previews with data: URLs in them #11767

Merged
merged 16 commits on Jan 24, 2022
1 change: 1 addition & 0 deletions changelog.d/11767.bugfix
@@ -0,0 +1 @@
Fix a long-standing bug when previewing Reddit URLs which do not contain an image.
31 changes: 25 additions & 6 deletions synapse/rest/media/v1/preview_html.py
@@ -321,14 +321,33 @@ def _iterate_over_text(


def rebase_url(url: str, base: str) -> str:
clokep marked this conversation as resolved.
base_parts = list(urlparse.urlparse(base))
"""
Resolves a potentially relative `url` against an absolute `base` URL.
Contributor: Do you know how this differs from urljoin in the standard lib? Not necessarily opposed to rolling our own, but it'd be nice to know if we had specific motivation for doing so.

Member Author: I didn't know of it, but I'll look into it!

Member Author: I think this will work OK, but I'd rather punt this to a separate PR since it is pretty unrelated to this work. Would that be OK?

Contributor: yeah OK

For example:

>>> rebase_url("subpage", "https://example.com/foo/")
'https://example.com/foo/subpage'
>>> rebase_url("sibling", "https://example.com/foo")
'https://example.com/sibling'
>>> rebase_url("/bar", "https://example.com/foo/")
'https://example.com/bar'
>>> rebase_url("https://alice.com/a/", "https://example.com/foo/")
'https://alice.com/a/'
"""
base_parts = urlparse.urlparse(base)
# Convert the parsed URL to a list for (potential) modification.
url_parts = list(urlparse.urlparse(url))
if not url_parts[0]: # fix up schema
url_parts[0] = base_parts[0] or "http"
if not url_parts[1]: # fix up hostname
url_parts[1] = base_parts[1]
# Add a scheme, if one does not exist.
if not url_parts[0]:
url_parts[0] = base_parts.scheme or "http"
# Fix up the hostname, if this is not a data URL.
if url_parts[0] != "data" and not url_parts[1]:
url_parts[1] = base_parts.netloc
# If the path does not start with a /, nest it under the base path's last
# directory.
if not url_parts[2].startswith("/"):
url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts[2]) + url_parts[2]
url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts.path) + url_parts[2]
return urlparse.urlunparse(url_parts)
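
The review thread above asks how rebase_url differs from urljoin in the standard library. As a rough comparison (not part of this PR), urljoin appears to agree with the doctest above on these inputs, and also passes data: URLs through untouched because "data" is not a relative-capable scheme; one visible difference is that rebase_url falls back to an "http" scheme when both URLs are scheme-less:

from urllib.parse import urljoin

# These mirror the rebase_url doctest above; each assertion holds for urljoin.
assert urljoin("https://example.com/foo/", "subpage") == "https://example.com/foo/subpage"
assert urljoin("https://example.com/foo", "sibling") == "https://example.com/sibling"
assert urljoin("https://example.com/foo/", "/bar") == "https://example.com/bar"
assert urljoin("https://example.com/foo/", "https://alice.com/a/") == "https://alice.com/a/"
# "data" is not in urllib.parse.uses_relative, so urljoin returns the URL unchanged.
assert urljoin("https://example.com/foo/", "data:,Hello%2C%20World%21") == "data:,Hello%2C%20World%21"
# Unlike rebase_url, urljoin does not default a missing scheme to "http" when
# the base URL is also scheme-less.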


224 changes: 166 additions & 58 deletions synapse/rest/media/v1/preview_url_resource.py
@@ -21,8 +21,9 @@
import shutil
import sys
import traceback
from typing import TYPE_CHECKING, Iterable, Optional, Tuple
from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
from urllib import parse as urlparse
from urllib.request import urlopen

import attr

@@ -70,6 +71,17 @@
IMAGE_CACHE_EXPIRY_MS = 2 * ONE_DAY


@attr.s(slots=True, frozen=True, auto_attribs=True)
class DownloadResult:
length: int
uri: str
response_code: int
media_type: str
download_name: Optional[str]
expires: int
etag: Optional[str]


@attr.s(slots=True, frozen=True, auto_attribs=True)
class MediaInfo:
"""
@@ -256,7 +268,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
if oembed_url:
url_to_download = oembed_url

media_info = await self._download_url(url_to_download, user)
media_info = await self._handle_url(url_to_download, user)

logger.debug("got media_info of '%s'", media_info)

@@ -297,7 +309,9 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
oembed_url = self._oembed.autodiscover_from_html(tree)
og_from_oembed: JsonDict = {}
if oembed_url:
oembed_info = await self._download_url(oembed_url, user)
oembed_info = await self._handle_url(
oembed_url, user, allow_data_urls=True
)
(
og_from_oembed,
author_name,
@@ -367,7 +381,135 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:

return jsonog.encode("utf8")

async def _download_url(self, url: str, user: UserID) -> MediaInfo:
async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult:
"""
Fetches a remote URL and parses the headers.

Args:
url: The URL to fetch.
output_stream: The stream to write the content to.

Returns:
A DownloadResult containing:
Media length, URL downloaded, the HTTP response code,
the media type, the downloaded file name, the number of
milliseconds the result is valid for, the etag header.
"""

try:
logger.debug("Trying to get preview for url '%s'", url)
length, headers, uri, code = await self.client.get_file(
url,
output_stream=output_stream,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
# handler will return a SynapseError to the client instead of
# blank data or a 500.
raise
except DNSLookupError:
# DNS lookup returned no results
# Note: This will also be the case if one of the resolved IP
# addresses is blacklisted
raise SynapseError(
502,
"DNS resolution failure during URL preview generation",
Codes.UNKNOWN,
)
except Exception as e:
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading %s: %r", url, e)

raise SynapseError(
500,
"Failed to download content: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)

if b"Content-Type" in headers:
media_type = headers[b"Content-Type"][0].decode("ascii")
else:
media_type = "application/octet-stream"

download_name = get_filename_from_headers(headers)

# FIXME: we should calculate a proper expiration based on the
# Cache-Control and Expire headers. But for now, assume 1 hour.
expires = ONE_HOUR
etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None

return DownloadResult(
length, uri, code, media_type, download_name, expires, etag
)
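
The FIXME above leaves expiry calculation as future work. A minimal sketch of what honouring Cache-Control: max-age might look like, assuming the same dict-of-byte-string-lists header shape used above (_cache_expiry_ms is a hypothetical helper, not something this PR adds; ONE_HOUR is the module constant already used here):

import re

def _cache_expiry_ms(headers) -> int:
    # Prefer an explicit max-age directive from Cache-Control, in milliseconds.
    if b"Cache-Control" in headers:
        cache_control = headers[b"Cache-Control"][0].decode("ascii")
        match = re.search(r"max-age=(\d+)", cache_control)
        if match:
            return int(match.group(1)) * 1000
    # Otherwise fall back to the fixed one-hour validity used in this PR.
    return ONE_HOUR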

async def _parse_data_url(
self, url: str, output_stream: BinaryIO
) -> DownloadResult:
"""
Parses a data: URL.

Args:
url: The URL to parse.
output_stream: The stream to write the content to.

Returns:
A DownloadResult containing:
Media length, URL downloaded, the HTTP response code,
the media type, the downloaded file name, the number of
milliseconds the result is valid for, the etag header.
"""

try:
logger.debug("Trying to parse data url '%s'", url)
with urlopen(url) as url_info:
# TODO Can this be more efficient?
output_stream.write(url_info.read())
except Exception as e:
logger.warning("Error parsing data: URL %s: %r", url, e)

raise SynapseError(
500,
"Failed to parse data URL: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)

return DownloadResult(
# Read back the length that has been written.
length=output_stream.tell(),
uri=url,
# If it was parsed, consider this a 200 OK.
response_code=200,
# urlopen shoves the media-type from the data URL into the content type
# header object.
media_type=url_info.headers.get_content_type(),
# Some features are not supported by data: URLs.
download_name=None,
expires=ONE_HOUR,
etag=None,
)
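
As the comments above note, urlopen understands data: URLs directly and exposes the embedded media type through the response's header object. A small standalone illustration (not part of the PR):

from urllib.request import urlopen

with urlopen("data:text/plain;charset=utf-8,Hello%2C%20World%21") as url_info:
    body = url_info.read()                            # b'Hello, World!'
    media_type = url_info.headers.get_content_type()  # 'text/plain'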

async def _handle_url(
self, url: str, user: UserID, allow_data_urls: bool = False
) -> MediaInfo:
"""
Fetches content from a URL and parses the result to generate a MediaInfo.

It uses the media storage provider to persist the fetched content and
stores the mapping into the database.

Args:
url: The URL to fetch.
user: The user who has requested this URL.
allow_data_urls: True if data URLs should be allowed.

Returns:
A MediaInfo object describing the fetched content.
"""

# TODO: we should probably honour robots.txt... except in practice
# we're most likely being explicitly triggered by a human rather than a
# bot, so are we really a robot?
@@ -377,61 +519,27 @@ async def _download_url(self, url: str, user: UserID) -> MediaInfo:
file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)

with self.media_storage.store_into_file(file_info) as (f, fname, finish):
try:
logger.debug("Trying to get preview for url '%s'", url)
length, headers, uri, code = await self.client.get_file(
url,
output_stream=f,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
# handler will return a SynapseError to the client instead of
# blank data or a 500.
raise
except DNSLookupError:
# DNS lookup returned no results
# Note: This will also be the case if one of the resolved IP
# addresses is blacklisted
raise SynapseError(
502,
"DNS resolution failure during URL preview generation",
Codes.UNKNOWN,
)
except Exception as e:
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading %s: %r", url, e)

raise SynapseError(
500,
"Failed to download content: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)
await finish()
if url.startswith("data:"):
if not allow_data_urls:
raise SynapseError(
500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN
)

if b"Content-Type" in headers:
media_type = headers[b"Content-Type"][0].decode("ascii")
download_result = await self._parse_data_url(url, f)
else:
Member Author: We might want to assert HTTP(S) schemes here as additional error checking.

Contributor: Yeah, I feel like I'd be happier having a few select allowed protocols rather than singling out data:; not sure it really makes sense to ask us to preview an ftp: URI either, for example.

Member Author: Yep, note that those will fail right now (we won't try them), but it would be better to be explicit, I think!

Member Author: I think this might actually be tougher to get correct than I initially thought, since we seem to support previewing scheme-less URLs. I'm not sure it makes sense to refactor that as part of this.

Contributor: Yeah, fair enough.
media_type = "application/octet-stream"
download_result = await self._download_url(url, f)

download_name = get_filename_from_headers(headers)

# FIXME: we should calculate a proper expiration based on the
# Cache-Control and Expire headers. But for now, assume 1 hour.
expires = ONE_HOUR
etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
await finish()

try:
time_now_ms = self.clock.time_msec()

await self.store.store_local_media(
media_id=file_id,
media_type=media_type,
media_type=download_result.media_type,
time_now_ms=time_now_ms,
upload_name=download_name,
media_length=length,
upload_name=download_result.download_name,
media_length=download_result.length,
user_id=user,
url_cache=url,
)
@@ -444,16 +552,16 @@ async def _download_url(self, url: str, user: UserID) -> MediaInfo:
raise

return MediaInfo(
media_type=media_type,
media_length=length,
download_name=download_name,
media_type=download_result.media_type,
media_length=download_result.length,
download_name=download_result.download_name,
created_ts_ms=time_now_ms,
filesystem_id=file_id,
filename=fname,
uri=uri,
response_code=code,
expires=expires,
etag=etag,
uri=download_result.uri,
response_code=download_result.response_code,
expires=download_result.expires,
etag=download_result.etag,
)
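
The review thread above discusses restricting previews to an explicit set of schemes rather than special-casing data:. A hypothetical sketch of such a check (the helper name, the allow-list, and the error codes are assumptions, not something this PR adds; the empty scheme stays allowed because scheme-less URLs are still previewable):

from urllib.parse import urlparse

from synapse.api.errors import Codes, SynapseError

# Hypothetical allow-list; "" is kept because scheme-less URLs are accepted today.
_ALLOWED_PREVIEW_SCHEMES = {"", "http", "https"}

def _check_preview_scheme(url: str, allow_data_urls: bool) -> None:
    scheme = urlparse(url).scheme
    if scheme == "data":
        if not allow_data_urls:
            raise SynapseError(500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN)
    elif scheme not in _ALLOWED_PREVIEW_SCHEMES:
        raise SynapseError(400, "Unsupported URL scheme for previews", Codes.UNKNOWN)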

async def _precache_image_url(
@@ -474,8 +582,8 @@ async def _precache_image_url(
# FIXME: it might be cleaner to use the same flow as the main /preview_url
# request itself and benefit from the same caching etc. But for now we
# just rely on the caching on the master request to speed things up.
image_info = await self._download_url(
rebase_url(og["og:image"], media_info.uri), user
image_info = await self._handle_url(
rebase_url(og["og:image"], media_info.uri), user, allow_data_urls=True
)

if _is_media(image_info.media_type):
@@ -16,10 +16,11 @@
_get_html_media_encodings,
decode_body,
parse_html_to_open_graph,
rebase_url,
summarize_paragraphs,
)

from . import unittest
from tests import unittest

try:
import lxml
@@ -447,3 +448,34 @@ def test_unknown_invalid(self):
'text/html; charset="invalid"',
)
self.assertEqual(list(encodings), ["utf-8", "cp1252"])


class RebaseUrlTestCase(unittest.TestCase):
def test_relative(self):
"""Relative URLs should be resolved based on the context of the base URL."""
self.assertEqual(
rebase_url("subpage", "https://example.com/foo/"),
"https://example.com/foo/subpage",
)
self.assertEqual(
rebase_url("sibling", "https://example.com/foo"),
"https://example.com/sibling",
)
self.assertEqual(
rebase_url("/bar", "https://example.com/foo/"),
"https://example.com/bar",
)

def test_absolute(self):
"""Absolute URLs should not be modified."""
self.assertEqual(
rebase_url("https://alice.com/a/", "https://example.com/foo/"),
"https://alice.com/a/",
)

def test_data(self):
"""Data URLs should not be modified."""
self.assertEqual(
rebase_url("data:,Hello%2C%20World%21", "https://example.com/foo/"),
"data:,Hello%2C%20World%21",
)