Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(ingest/bigquery): escape special characters for table descriptions #9932

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@
BigQueryTableRef,
)
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
from datahub.ingestion.source.bigquery_v2.bigquery_helper import (
unquote_and_decode_unicode_escape_seq,
)
from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
from datahub.ingestion.source.bigquery_v2.bigquery_schema import (
BigqueryColumn,
Expand Down Expand Up @@ -1073,7 +1076,9 @@ def gen_dataset_workunits(

dataset_properties = DatasetProperties(
name=datahub_dataset_name.get_table_display_name(),
description=table.comment,
description=unquote_and_decode_unicode_escape_seq(table.comment)
if table.comment
else "",
qualifiedName=str(datahub_dataset_name),
created=(
TimeStamp(time=int(table.created.timestamp() * 1000))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import re
from typing import Optional


# Backslash escape sequences understood by Python's "unicode-escape" codec.
# Only well-formed sequences match, so malformed input (e.g. a truncated
# "\u12" or a trailing backslash) is left alone instead of raising.
_ESCAPE_SEQUENCE = re.compile(
    r"\\(?:"
    r"U[0-9a-fA-F]{8}"
    r"|u[0-9a-fA-F]{4}"
    r"|x[0-9a-fA-F]{2}"
    r"|[0-7]{1,3}"
    r"|N\{[A-Za-z0-9 -]+\}"
    r"|[\\'\"abfnrtv\n]"
    r")"
)


def _decode_escape_sequence(match: "re.Match[str]") -> str:
    """Return the character denoted by one matched escape sequence."""
    sequence = match.group(0)
    try:
        # Every pattern alternative is pure ASCII, so encoding cannot fail.
        return sequence.encode("ascii").decode("unicode-escape")
    except UnicodeDecodeError:
        # E.g. an unknown \N{...} name or an out-of-range \U code point:
        # keep the text as written rather than failing the whole string.
        return sequence


def unquote_and_decode_unicode_escape_seq(
    string: str,
    leading_quote: str = '"',
    trailing_quote: Optional[str] = None,
) -> str:
    """
    Strip one pair of surrounding quotes (if present) and decode escape sequences.

    Args:
        string: Text to clean, e.g. '"Hello \\u003cWorld\\u003e"'.
        leading_quote: Opening quote to strip. May be longer than one character.
        trailing_quote: Closing quote to strip; defaults to ``leading_quote``.

    Returns:
        The text without the surrounding quotes and with well-formed backslash
        escape sequences (``\\uXXXX``, ``\\n``, ``\\xhh``, ...) replaced by the
        characters they denote. Characters that are not part of an escape
        sequence, including non-ASCII ones, are returned unchanged, and
        malformed escape sequences are kept verbatim.
    """
    trailing_quote = trailing_quote if trailing_quote else leading_quote

    # The length check stops a lone quote character from being treated as
    # both the opening and the closing quote.
    if (
        len(string) >= len(leading_quote) + len(trailing_quote)
        and string.startswith(leading_quote)
        and string.endswith(trailing_quote)
    ):
        # Slice by the quotes' real lengths so multi-character quotes work.
        string = string[len(leading_quote) : len(string) - len(trailing_quote)]

    # Decode each escape sequence on its own. Round-tripping the whole string
    # through UTF-8 bytes and the Latin-1-based "unicode-escape" codec would
    # corrupt every non-ASCII character.
    return _ESCAPE_SEQUENCE.sub(_decode_escape_sequence, string)
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -31,7 +32,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -46,7 +48,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -63,7 +66,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -78,7 +82,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -100,7 +105,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -115,7 +121,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -130,7 +137,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -147,7 +155,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -162,7 +171,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -182,7 +192,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -197,7 +208,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand Down Expand Up @@ -229,7 +241,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -243,12 +256,14 @@
"externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3stable-1",
"name": "table-1",
"qualifiedName": "project-id-1.bigquery-dataset-1.table-1",
"description": "",
"tags": []
}
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -263,7 +278,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -279,7 +295,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -296,7 +313,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -320,7 +338,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
}
]
36 changes: 36 additions & 0 deletions metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
BigQueryTableRef,
)
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
from datahub.ingestion.source.bigquery_v2.bigquery_helper import (
unquote_and_decode_unicode_escape_seq,
)
from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
from datahub.sql_parsing.schema_resolver import SchemaResolver
Expand Down Expand Up @@ -176,3 +179,36 @@ def test_bigquery_table_sanitasitation():
assert table_identifier.dataset == "dataset-4567"
assert table_identifier.table == "foo_2016*"
assert table_identifier.get_table_display_name() == "foo"


def test_unquote_and_decode_unicode_escape_seq():
    """Check unquoting and escape decoding over a table of (input, expected) pairs."""
    scenarios = [
        # Quoted, with Unicode escape sequences
        ('"Hello \\u003cWorld\\u003e"', "Hello <World>"),
        # Unquoted, with Unicode escape sequences
        ("Hello \\u003cWorld\\u003e", "Hello <World>"),
        # Empty string
        ("", ""),
        # Unquoted, without escape sequences
        ("No escape sequences here", "No escape sequences here"),
        # Quoted, without escape sequences
        ('"No escape sequences here"', "No escape sequences here"),
    ]

    for raw_value, cleaned_value in scenarios:
        assert unquote_and_decode_unicode_escape_seq(raw_value) == cleaned_value
Loading