-
Notifications
You must be signed in to change notification settings - Fork 13.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(bigquery): add support for query cost estimate #18694
Changes from all commits
6ae4638
6e676f6
4e63843
6c06b05
dba135c
1fd1488
f5908e5
9e36f0f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -185,6 +185,47 @@ class BigQueryEngineSpec(BaseEngineSpec): | |
), | ||
} | ||
|
||
@classmethod | ||
def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool: | ||
return True | ||
|
||
@classmethod | ||
def estimate_statement_cost( | ||
cls, statement: str, cursor: Any, engine: Engine | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The only way to estimate a query's cost in advance in BigQuery is to run the query as a dry run (`dry_run=True`). Since that is not possible with only a cursor, I added `engine` as an argument. An alternative to handling `bigquery.Client` directly would be to configure SQLAlchemy to pass the dry-run parameter when creating the connection, but that seems more complicated: https://github.com/googleapis/python-bigquery-sqlalchemy#connection-string-parameters |
||
) -> Dict[str, Any]: | ||
# pylint: disable=import-outside-toplevel | ||
from google.cloud import bigquery | ||
from google.oauth2 import service_account | ||
|
||
creds = engine.dialect.credentials_info | ||
credentials = service_account.Credentials.from_service_account_info(creds) | ||
client = bigquery.Client(credentials=credentials) | ||
dry_run_result = client.query( | ||
statement, bigquery.job.QueryJobConfig(dry_run=True) | ||
) | ||
|
||
return { | ||
"Total bytes processed": dry_run_result.total_bytes_processed, | ||
} | ||
|
||
@classmethod | ||
def query_cost_formatter( | ||
cls, raw_cost: List[Dict[str, Any]] | ||
) -> List[Dict[str, str]]: | ||
cost = [] | ||
columns = [ | ||
("Total bytes processed", "", "bytes"), | ||
] | ||
|
||
for row in raw_cost: | ||
statement_cost = {} | ||
for key, suffix, category in columns: | ||
if key in row: | ||
statement_cost[key] = cls._humanize(row[key], suffix, category) | ||
cost.append(statement_cost) | ||
|
||
return cost | ||
|
||
@classmethod | ||
def convert_dttm( | ||
cls, target_type: str, dttm: datetime, db_extra: Optional[Dict[str, Any]] = None | ||
|
@@ -316,16 +357,9 @@ def df_to_sql( | |
:param to_sql_kwargs: The kwargs to be passed to pandas.DataFrame.to_sql` method | ||
""" | ||
|
||
try: | ||
# pylint: disable=import-outside-toplevel | ||
import pandas_gbq | ||
from google.oauth2 import service_account | ||
except ImportError as ex: | ||
raise Exception( | ||
"Could not import libraries `pandas_gbq` or `google.oauth2`, which are " | ||
"required to be installed in your environment in order " | ||
"to upload data to BigQuery" | ||
) from ex | ||
# pylint: disable=import-outside-toplevel | ||
import pandas_gbq | ||
from google.oauth2 import service_account | ||
|
||
if not table.schema: | ||
raise Exception("The table schema must be defined") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -524,7 +524,7 @@ def test_query_cost_formatter(self): | |
expected = [ | ||
{ | ||
"Output count": "904 M rows", | ||
"Output size": "354 GB", | ||
"Output size": "329 GiB", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The old and new expected values ("354 GB" and "329 GiB") represent the same quantity; only the unit differs. I took care not to change any of the other outputs. |
||
"CPU cost": "354 G", | ||
"Max memory": "0 B", | ||
"Network cost": "354 G", | ||
|
@@ -795,17 +795,19 @@ def test_estimate_statement_cost(self): | |
mock_cursor.fetchone.return_value = [ | ||
'{"a": "b"}', | ||
] | ||
mock_engine = mock.Mock() | ||
result = PrestoEngineSpec.estimate_statement_cost( | ||
"SELECT * FROM brth_names", mock_cursor | ||
"SELECT * FROM brth_names", mock_cursor, mock_engine | ||
) | ||
assert result == estimate_json | ||
|
||
def test_estimate_statement_cost_invalid_syntax(self): | ||
mock_cursor = mock.MagicMock() | ||
mock_cursor.execute.side_effect = Exception() | ||
mock_engine = mock.Mock() | ||
with self.assertRaises(Exception): | ||
PrestoEngineSpec.estimate_statement_cost( | ||
"DROP TABLE brth_names", mock_cursor | ||
"DROP TABLE brth_names", mock_cursor, mock_engine | ||
) | ||
|
||
def test_get_all_datasource_names(self): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If an input like (1000, "", "dollars") with an output like "$1,000" were to be supported, there would be more categories.