-
Notifications
You must be signed in to change notification settings - Fork 13.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix(presto/trino): Ensure get_table_names only returns real tables #21794
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,13 +19,13 @@ | |
|
||
import logging | ||
import re | ||
import textwrap | ||
import time | ||
from abc import ABCMeta | ||
from collections import defaultdict, deque | ||
from contextlib import closing | ||
from datetime import datetime | ||
from distutils.version import StrictVersion | ||
from textwrap import dedent | ||
from typing import Any, cast, Dict, List, Optional, Pattern, Tuple, TYPE_CHECKING, Union | ||
from urllib import parse | ||
|
||
|
@@ -392,46 +392,84 @@ def update_impersonation_config( | |
|
||
@classmethod | ||
def get_table_names( | ||
cls, database: Database, inspector: Inspector, schema: Optional[str] | ||
cls, | ||
database: Database, | ||
inspector: Inspector, | ||
schema: Optional[str], | ||
) -> List[str]: | ||
tables = super().get_table_names(database, inspector, schema) | ||
if not is_feature_enabled("PRESTO_SPLIT_VIEWS_FROM_TABLES"): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not quite sure why this undocumented feature flag existed. Why wouldn't one want to split views from tables as is the case with the other engine specifications. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When I added the feature at Lyft it would take 90 seconds to read all the views, so I thought it would be better to put it behind a feature flag. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yikes. @betodealmeida I'm somewhat surprised that is the case as per Trino's definition of their I wonder if the poor performance was related to when the schema was undefined? This functionally, i.e., fetching all tables/views across all schemas, was deprecated in #21284 given that the SQL Lab frontend no longer supports searching across all schemas, i.e., a schema must be defined. |
||
return tables | ||
""" | ||
Get all the real table names within the specified schema. | ||
|
||
Per the SQLAlchemy definition if the schema is omitted the database’s default | ||
schema is used, however some dialects infer the request as schema agnostic. | ||
|
||
views = set(cls.get_view_names(database, inspector, schema)) | ||
actual_tables = set(tables) - views | ||
return list(actual_tables) | ||
Note that PyHive's Hive and Presto SQLAlchemy dialects do not adhere to the | ||
specification where the `get_table_names` method returns both real tables and | ||
views. Futhermore the dialects wrongfully infer the request as schema agnostic | ||
when the schema is omitted. | ||
|
||
:param database: The database to inspect | ||
:param inspector: The SQLAlchemy inspector | ||
:param schema: The schema to inspect | ||
:returns: The physical table names | ||
""" | ||
|
||
return sorted( | ||
list( | ||
set(super().get_table_names(database, inspector, schema)) | ||
- set(cls.get_view_names(database, inspector, schema)) | ||
) | ||
) | ||
|
||
@classmethod | ||
def get_view_names( | ||
cls, database: Database, inspector: Inspector, schema: Optional[str] | ||
cls, | ||
database: Database, | ||
inspector: Inspector, | ||
schema: Optional[str], | ||
) -> List[str]: | ||
"""Returns an empty list | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Incorrect comment as it depended on state of the feature flag. |
||
""" | ||
Get all the view names within the specified schema. | ||
|
||
get_table_names() function returns all table names and view names, | ||
and get_view_names() is not implemented in sqlalchemy_presto.py | ||
https://github.com/dropbox/PyHive/blob/e25fc8440a0686bbb7a5db5de7cb1a77bdb4167a/pyhive/sqlalchemy_presto.py | ||
Per the SQLAlchemy definition if the schema is omitted the database’s default | ||
schema is used, however some dialects infer the request as schema agnostic. | ||
|
||
Note that PyHive's Hive and Presto SQLAlchemy dialects do not implement the | ||
`get_view_names` method. To ensure consistency with the `get_table_names` method | ||
the request is deemed schema agnostic when the schema is omitted. | ||
|
||
:param database: The database to inspect | ||
:param inspector: The SQLAlchemy inspector | ||
:param schema: The schema to inspect | ||
:returns: The view names | ||
""" | ||
if not is_feature_enabled("PRESTO_SPLIT_VIEWS_FROM_TABLES"): | ||
return [] | ||
|
||
if schema: | ||
sql = ( | ||
"SELECT table_name FROM information_schema.views " | ||
"WHERE table_schema=%(schema)s" | ||
) | ||
sql = dedent( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See trinodb/trino-python-client#267 to why these queries were reworked. |
||
""" | ||
SELECT table_name FROM information_schema.tables | ||
WHERE table_schema = %(schema)s | ||
AND table_type = 'VIEW' | ||
""" | ||
).strip() | ||
params = {"schema": schema} | ||
else: | ||
sql = "SELECT table_name FROM information_schema.views" | ||
sql = dedent( | ||
""" | ||
SELECT table_name FROM information_schema.tables | ||
WHERE table_type = 'VIEW' | ||
""" | ||
).strip() | ||
params = {} | ||
|
||
engine = cls.get_engine(database, schema=schema) | ||
|
||
with closing(engine.raw_connection()) as conn: | ||
cursor = conn.cursor() | ||
cursor.execute(sql, params) | ||
results = cursor.fetchall() | ||
|
||
return [row[0] for row in results] | ||
return sorted([row[0] for row in results]) | ||
|
||
@classmethod | ||
def _create_column_info( | ||
|
@@ -1087,7 +1125,7 @@ def _partition_query( # pylint: disable=too-many-arguments,too-many-locals | |
else f"SHOW PARTITIONS FROM {table_name}" | ||
) | ||
|
||
sql = textwrap.dedent( | ||
sql = dedent( | ||
f"""\ | ||
{partition_select_clause} | ||
{where_clause} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ | |
# specific language governing permissions and limitations | ||
# under the License. | ||
from collections import namedtuple | ||
from textwrap import dedent | ||
from unittest import mock, skipUnless | ||
|
||
import pandas as pd | ||
|
@@ -33,49 +34,47 @@ class TestPrestoDbEngineSpec(TestDbEngineSpec): | |
def test_get_datatype_presto(self): | ||
self.assertEqual("STRING", PrestoEngineSpec.get_datatype("string")) | ||
|
||
def test_presto_get_view_names_return_empty_list( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure what the purpose of this function was, i.e., the logic can be inferred. |
||
self, | ||
): # pylint: disable=invalid-name | ||
self.assertEqual( | ||
[], PrestoEngineSpec.get_view_names(mock.ANY, mock.ANY, mock.ANY) | ||
) | ||
|
||
@mock.patch("superset.db_engine_specs.presto.is_feature_enabled") | ||
def test_get_view_names(self, mock_is_feature_enabled): | ||
mock_is_feature_enabled.return_value = True | ||
mock_execute = mock.MagicMock() | ||
mock_fetchall = mock.MagicMock(return_value=[["a", "b,", "c"], ["d", "e"]]) | ||
def test_get_view_names_with_schema(self): | ||
database = mock.MagicMock() | ||
mock_execute = mock.MagicMock() | ||
database.get_sqla_engine.return_value.raw_connection.return_value.cursor.return_value.execute = ( | ||
mock_execute | ||
) | ||
database.get_sqla_engine.return_value.raw_connection.return_value.cursor.return_value.fetchall = ( | ||
mock_fetchall | ||
database.get_sqla_engine.return_value.raw_connection.return_value.cursor.return_value.fetchall = mock.MagicMock( | ||
return_value=[["a", "b,", "c"], ["d", "e"]] | ||
) | ||
result = PrestoEngineSpec.get_view_names(database, mock.Mock(), None) | ||
schema = "schema" | ||
result = PrestoEngineSpec.get_view_names(database, mock.Mock(), schema) | ||
mock_execute.assert_called_once_with( | ||
"SELECT table_name FROM information_schema.views", {} | ||
dedent( | ||
""" | ||
SELECT table_name FROM information_schema.tables | ||
WHERE table_schema = %(schema)s | ||
AND table_type = 'VIEW' | ||
""" | ||
).strip(), | ||
{"schema": schema}, | ||
) | ||
assert result == ["a", "d"] | ||
|
||
@mock.patch("superset.db_engine_specs.presto.is_feature_enabled") | ||
def test_get_view_names_with_schema(self, mock_is_feature_enabled): | ||
mock_is_feature_enabled.return_value = True | ||
mock_execute = mock.MagicMock() | ||
mock_fetchall = mock.MagicMock(return_value=[["a", "b,", "c"], ["d", "e"]]) | ||
def test_get_view_names_without_schema(self): | ||
database = mock.MagicMock() | ||
mock_execute = mock.MagicMock() | ||
database.get_sqla_engine.return_value.raw_connection.return_value.cursor.return_value.execute = ( | ||
mock_execute | ||
) | ||
database.get_sqla_engine.return_value.raw_connection.return_value.cursor.return_value.fetchall = ( | ||
mock_fetchall | ||
database.get_sqla_engine.return_value.raw_connection.return_value.cursor.return_value.fetchall = mock.MagicMock( | ||
return_value=[["a", "b,", "c"], ["d", "e"]] | ||
) | ||
schema = "schema" | ||
result = PrestoEngineSpec.get_view_names(database, mock.Mock(), schema) | ||
result = PrestoEngineSpec.get_view_names(database, mock.Mock(), None) | ||
mock_execute.assert_called_once_with( | ||
"SELECT table_name FROM information_schema.views " | ||
"WHERE table_schema=%(schema)s", | ||
{"schema": schema}, | ||
dedent( | ||
""" | ||
SELECT table_name FROM information_schema.tables | ||
WHERE table_type = 'VIEW' | ||
""" | ||
).strip(), | ||
{}, | ||
) | ||
assert result == ["a", "d"] | ||
|
||
|
@@ -663,50 +662,17 @@ def test_get_sqla_column_type(self): | |
sqla_type = PrestoEngineSpec.get_sqla_column_type(None) | ||
assert sqla_type is None | ||
|
||
@mock.patch( | ||
"superset.utils.feature_flag_manager.FeatureFlagManager.is_feature_enabled" | ||
) | ||
@mock.patch("superset.db_engine_specs.base.BaseEngineSpec.get_table_names") | ||
@mock.patch("superset.db_engine_specs.presto.PrestoEngineSpec.get_view_names") | ||
def test_get_table_names_no_split_views_from_tables( | ||
self, mock_get_view_names, mock_get_table_names, mock_is_feature_enabled | ||
): | ||
mock_get_view_names.return_value = ["view1", "view2"] | ||
table_names = ["table1", "table2", "view1", "view2"] | ||
mock_get_table_names.return_value = table_names | ||
mock_is_feature_enabled.return_value = False | ||
tables = PrestoEngineSpec.get_table_names(mock.Mock(), mock.Mock(), None) | ||
assert tables == table_names | ||
|
||
@mock.patch( | ||
"superset.utils.feature_flag_manager.FeatureFlagManager.is_feature_enabled" | ||
) | ||
@mock.patch("superset.db_engine_specs.base.BaseEngineSpec.get_table_names") | ||
@mock.patch("superset.db_engine_specs.presto.PrestoEngineSpec.get_view_names") | ||
def test_get_table_names_split_views_from_tables( | ||
self, mock_get_view_names, mock_get_table_names, mock_is_feature_enabled | ||
def test_get_table_names( | ||
self, | ||
mock_get_view_names, | ||
mock_get_table_names, | ||
): | ||
mock_get_view_names.return_value = ["view1", "view2"] | ||
table_names = ["table1", "table2", "view1", "view2"] | ||
mock_get_table_names.return_value = table_names | ||
mock_is_feature_enabled.return_value = True | ||
tables = PrestoEngineSpec.get_table_names(mock.Mock(), mock.Mock(), None) | ||
assert sorted(tables) == sorted(table_names) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is where I'm puzzled. The result— |
||
|
||
@mock.patch( | ||
"superset.utils.feature_flag_manager.FeatureFlagManager.is_feature_enabled" | ||
) | ||
@mock.patch("superset.db_engine_specs.base.BaseEngineSpec.get_table_names") | ||
@mock.patch("superset.db_engine_specs.presto.PrestoEngineSpec.get_view_names") | ||
def test_get_table_names_split_views_from_tables_no_tables( | ||
self, mock_get_view_names, mock_get_table_names, mock_is_feature_enabled | ||
): | ||
mock_get_view_names.return_value = [] | ||
table_names = [] | ||
mock_get_table_names.return_value = table_names | ||
mock_is_feature_enabled.return_value = True | ||
mock_get_table_names.return_value = ["table1", "table2", "view1", "view2"] | ||
tables = PrestoEngineSpec.get_table_names(mock.Mock(), mock.Mock(), None) | ||
assert tables == [] | ||
assert tables == ["table1", "table2"] | ||
|
||
def test_get_full_name(self): | ||
names = [ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The result of running
pip-compile-multi --no-upgrade
.