From e87b4def21f9a0bc00ccd67eec57ed33556f09bf Mon Sep 17 00:00:00 2001 From: David Katz <41651296+DavidKatz-il@users.noreply.github.com> Date: Mon, 6 May 2024 14:14:56 +0000 Subject: [PATCH 1/2] Support json type in athena2pandas --- awswrangler/_data_types.py | 2 +- tests/unit/test_athena.py | 41 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/awswrangler/_data_types.py b/awswrangler/_data_types.py index 54949d109..b8cf9ec18 100644 --- a/awswrangler/_data_types.py +++ b/awswrangler/_data_types.py @@ -376,7 +376,7 @@ def athena2pandas(dtype: str, dtype_backend: str | None = None) -> str: # noqa: return "decimal" if dtype_backend != "pyarrow" else "double[pyarrow]" if dtype in ("binary", "varbinary"): return "bytes" if dtype_backend != "pyarrow" else "binary[pyarrow]" - if any(dtype.startswith(t) for t in ["array", "row", "map", "struct"]): + if any(dtype.startswith(t) for t in ["array", "row", "map", "struct", "json"]): return "object" if dtype == "geometry": return "string" diff --git a/tests/unit/test_athena.py b/tests/unit/test_athena.py index eda53c0ab..d8e9503d0 100644 --- a/tests/unit/test_athena.py +++ b/tests/unit/test_athena.py @@ -560,6 +560,47 @@ def test_athena_read_list(glue_database): assert df["col0"].iloc[0] == "[1, 2, 3]" +def test_athena_read_json(glue_database): + sql = """ + WITH dataset AS ( + SELECT + CAST('HELLO ATHENA' AS JSON) AS some_str, + CAST(12345 AS JSON) AS some_int, + CAST(MAP(ARRAY['a', 'b'], ARRAY[1,2]) AS JSON) AS some_map + ) + SELECT * FROM dataset + """ + df = wr.athena.read_sql_query(sql=sql, database=glue_database, ctas_approach=False) + assert len(df) == 1 + assert len(df.index) == 1 + assert len(df.columns) == 3 + assert df["some_str"].iloc[0] == '"HELLO ATHENA"' + assert df["some_int"].iloc[0] == '12345' + assert df["some_map"].iloc[0] == '{"a":1,"b":2}' + + +def test_athena_read_json_extract(glue_database): + sql = """ + WITH dataset AS ( + SELECT '{"name": "Susan Smith", + "org": "engineering", + "projects": [{"name":"project1", "completed":false}, + {"name":"project2", "completed":true}]}' + AS myblob + ) + SELECT + json_extract(myblob, '$.name') AS name, + json_extract(myblob, '$.projects') AS projects + FROM dataset + """ + df = wr.athena.read_sql_query(sql=sql, database=glue_database, ctas_approach=False) + assert len(df) == 1 + assert len(df.index) == 1 + assert len(df.columns) == 2 + assert df["name"].iloc[0] == '"Susan Smith"' + assert df["projects"].iloc[0] == '[{"name":"project1","completed":false},{"name":"project2","completed":true}]' + + def test_sanitize_dataframe_column_names(): with pytest.warns(UserWarning, match=r"Duplicate*"): test_df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) From 0803d85ab9f8750756cf889a0252b9fa4d8c1a6c Mon Sep 17 00:00:00 2001 From: David Katz <41651296+DavidKatz-il@users.noreply.github.com> Date: Mon, 6 May 2024 14:30:21 +0000 Subject: [PATCH 2/2] fix formatting --- tests/unit/test_athena.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_athena.py b/tests/unit/test_athena.py index d8e9503d0..ba8d458be 100644 --- a/tests/unit/test_athena.py +++ b/tests/unit/test_athena.py @@ -575,7 +575,7 @@ def test_athena_read_json(glue_database): assert len(df.index) == 1 assert len(df.columns) == 3 assert df["some_str"].iloc[0] == '"HELLO ATHENA"' - assert df["some_int"].iloc[0] == '12345' + assert df["some_int"].iloc[0] == "12345" assert df["some_map"].iloc[0] == '{"a":1,"b":2}'