From d891924e1ee241784d2234c30fcc82e1e71534d0 Mon Sep 17 00:00:00 2001 From: John Bodley <4567245+john-bodley@users.noreply.github.com> Date: Fri, 28 Apr 2023 07:38:44 +1200 Subject: [PATCH 1/6] fix(tests): Ensure deterministic zip file ordering for CSV uploads --- tests/integration_tests/csv_upload_tests.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/integration_tests/csv_upload_tests.py b/tests/integration_tests/csv_upload_tests.py index 97b83bb8fc9e5..72a8d9e57ef6c 100644 --- a/tests/integration_tests/csv_upload_tests.py +++ b/tests/integration_tests/csv_upload_tests.py @@ -20,6 +20,7 @@ import logging import os import shutil +import zipfile from typing import Dict, Optional, Union from unittest import mock @@ -119,7 +120,10 @@ def create_columnar_files(): os.mkdir(ZIP_DIRNAME) pd.DataFrame({"a": ["john", "paul"], "b": [1, 2]}).to_parquet(PARQUET_FILENAME1) pd.DataFrame({"a": ["max", "bob"], "b": [3, 4]}).to_parquet(PARQUET_FILENAME2) - shutil.make_archive(ZIP_DIRNAME, "zip", ZIP_DIRNAME) + + with zipfile.ZipFile(ZIP_FILENAME, "w") as archive: + for filename in [PARQUET_FILENAME1, PARQUET_FILENAME2]: + archive.write(filename, filename) yield os.remove(ZIP_FILENAME) shutil.rmtree(ZIP_DIRNAME) @@ -556,4 +560,4 @@ def test_import_parquet(mock_event_logger): with test_db.get_sqla_engine_with_context() as engine: data = engine.execute(f"SELECT * from {PARQUET_UPLOAD_TABLE}").fetchall() - assert data == [("max", 3), ("bob", 4), ("john", 1), ("paul", 2)] + assert data == [("john", 1), ("paul", 2), ("max", 3), ("bob", 4)] From cf70389240660ff2e0c95724ba2f9019d563b76d Mon Sep 17 00:00:00 2001 From: John Bodley <4567245+john-bodley@users.noreply.github.com> Date: Fri, 28 Apr 2023 07:50:12 +1200 Subject: [PATCH 2/6] Update csv_upload_tests.py --- tests/integration_tests/csv_upload_tests.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration_tests/csv_upload_tests.py b/tests/integration_tests/csv_upload_tests.py index 72a8d9e57ef6c..51035c1db5f0d 100644 --- a/tests/integration_tests/csv_upload_tests.py +++ b/tests/integration_tests/csv_upload_tests.py @@ -122,8 +122,9 @@ def create_columnar_files(): pd.DataFrame({"a": ["max", "bob"], "b": [3, 4]}).to_parquet(PARQUET_FILENAME2) with zipfile.ZipFile(ZIP_FILENAME, "w") as archive: - for filename in [PARQUET_FILENAME1, PARQUET_FILENAME2]: - archive.write(filename, filename) + archive.write(PARQUET_FILENAME1) + archive.write(PARQUET_FILENAME2) + yield os.remove(ZIP_FILENAME) shutil.rmtree(ZIP_DIRNAME) From 1b4125f3b446e932d526f2afd8bf165d462c29c0 Mon Sep 17 00:00:00 2001 From: John Bodley <4567245+john-bodley@users.noreply.github.com> Date: Fri, 28 Apr 2023 08:02:58 +1200 Subject: [PATCH 3/6] Update csv_upload_tests.py --- tests/integration_tests/csv_upload_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/csv_upload_tests.py b/tests/integration_tests/csv_upload_tests.py index 51035c1db5f0d..be1660780b932 100644 --- a/tests/integration_tests/csv_upload_tests.py +++ b/tests/integration_tests/csv_upload_tests.py @@ -120,7 +120,7 @@ def create_columnar_files(): os.mkdir(ZIP_DIRNAME) pd.DataFrame({"a": ["john", "paul"], "b": [1, 2]}).to_parquet(PARQUET_FILENAME1) pd.DataFrame({"a": ["max", "bob"], "b": [3, 4]}).to_parquet(PARQUET_FILENAME2) - + with zipfile.ZipFile(ZIP_FILENAME, "w") as archive: archive.write(PARQUET_FILENAME1) archive.write(PARQUET_FILENAME2) From 9e9b1c6b2ec83aa92cd65fac9af94701a02f0d61 Mon Sep 17 00:00:00 2001 From: John Bodley <4567245+john-bodley@users.noreply.github.com> Date: Fri, 28 Apr 2023 08:17:52 +1200 Subject: [PATCH 4/6] Update csv_upload_tests.py --- tests/integration_tests/csv_upload_tests.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/tests/integration_tests/csv_upload_tests.py b/tests/integration_tests/csv_upload_tests.py index be1660780b932..0809f4007a82b 100644 --- a/tests/integration_tests/csv_upload_tests.py +++ b/tests/integration_tests/csv_upload_tests.py @@ -20,7 +20,6 @@ import logging import os import shutil -import zipfile from typing import Dict, Optional, Union from unittest import mock @@ -120,11 +119,7 @@ def create_columnar_files(): os.mkdir(ZIP_DIRNAME) pd.DataFrame({"a": ["john", "paul"], "b": [1, 2]}).to_parquet(PARQUET_FILENAME1) pd.DataFrame({"a": ["max", "bob"], "b": [3, 4]}).to_parquet(PARQUET_FILENAME2) - - with zipfile.ZipFile(ZIP_FILENAME, "w") as archive: - archive.write(PARQUET_FILENAME1) - archive.write(PARQUET_FILENAME2) - + shutil.make_archive(ZIP_DIRNAME, "zip", ZIP_DIRNAME) yield os.remove(ZIP_FILENAME) shutil.rmtree(ZIP_DIRNAME) @@ -279,7 +274,7 @@ def test_import_csv_enforced_schema(mock_event_logger): with get_upload_db().get_sqla_engine_with_context() as engine: data = engine.execute( - f"SELECT * from {ADMIN_SCHEMA_NAME}.{CSV_UPLOAD_TABLE_W_SCHEMA}" + f"SELECT * from {ADMIN_SCHEMA_NAME}.{CSV_UPLOAD_TABLE_W_SCHEMA} ORDER BY b" ).fetchall() assert data == [("john", 1), ("paul", 2)] @@ -390,12 +385,12 @@ def test_import_csv(mock_event_logger): ) # make sure that john and empty string are replaced with None with test_db.get_sqla_engine_with_context() as engine: - data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE}").fetchall() + data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE} ORDER BY b").fetchall() assert data == [(None, 1, "x"), ("paul", 2, None)] # default null values upload_csv(CSV_FILENAME2, CSV_UPLOAD_TABLE, extra={"if_exists": "replace"}) # make sure that john and empty string are replaced with None - data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE}").fetchall() + data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE} ORDER BY b").fetchall() assert data == [("john", 1, "x"), ("paul", 2, None)] # cleanup @@ -413,7 +408,7 @@ def test_import_csv(mock_event_logger): # or an int to a float # file upload should work as normal with test_db.get_sqla_engine_with_context() as engine: - data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE}").fetchall() + data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE} ORDER BY b").fetchall() assert data == [("john", 1), ("paul", 2)] # cleanup @@ -485,7 +480,7 @@ def test_import_excel(mock_event_logger): ) with test_db.get_sqla_engine_with_context() as engine: - data = engine.execute(f"SELECT * from {EXCEL_UPLOAD_TABLE}").fetchall() + data = engine.execute(f"SELECT * from {EXCEL_UPLOAD_TABLE} ORDER BY b").fetchall() assert data == [(0, "john", 1), (1, "paul", 2)] @@ -549,7 +544,7 @@ def test_import_parquet(mock_event_logger): assert success_msg_f1 in resp with test_db.get_sqla_engine_with_context() as engine: - data = engine.execute(f"SELECT * from {PARQUET_UPLOAD_TABLE}").fetchall() + data = engine.execute(f"SELECT * from {PARQUET_UPLOAD_TABLE} ORDER BY b").fetchall() assert data == [("john", 1), ("paul", 2)] # replace table with zip file @@ -560,5 +555,5 @@ def test_import_parquet(mock_event_logger): assert success_msg_f2 in resp with test_db.get_sqla_engine_with_context() as engine: - data = engine.execute(f"SELECT * from {PARQUET_UPLOAD_TABLE}").fetchall() + data = engine.execute(f"SELECT * from {PARQUET_UPLOAD_TABLE} ORDER BY b").fetchall() assert data == [("john", 1), ("paul", 2), ("max", 3), ("bob", 4)] From 2bb6407b812472e91f1e1b4fbffec17a309070cc Mon Sep 17 00:00:00 2001 From: John Bodley Date: Fri, 28 Apr 2023 08:33:44 +1200 Subject: [PATCH 5/6] pre-commit --- tests/integration_tests/csv_upload_tests.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/integration_tests/csv_upload_tests.py b/tests/integration_tests/csv_upload_tests.py index 0809f4007a82b..4d77f6113b179 100644 --- a/tests/integration_tests/csv_upload_tests.py +++ b/tests/integration_tests/csv_upload_tests.py @@ -480,7 +480,9 @@ def test_import_excel(mock_event_logger): ) with test_db.get_sqla_engine_with_context() as engine: - data = engine.execute(f"SELECT * from {EXCEL_UPLOAD_TABLE} ORDER BY b").fetchall() + data = engine.execute( + f"SELECT * from {EXCEL_UPLOAD_TABLE} ORDER BY b" + ).fetchall() assert data == [(0, "john", 1), (1, "paul", 2)] @@ -544,7 +546,9 @@ def test_import_parquet(mock_event_logger): assert success_msg_f1 in resp with test_db.get_sqla_engine_with_context() as engine: - data = engine.execute(f"SELECT * from {PARQUET_UPLOAD_TABLE} ORDER BY b").fetchall() + data = engine.execute( + f"SELECT * from {PARQUET_UPLOAD_TABLE} ORDER BY b" + ).fetchall() assert data == [("john", 1), ("paul", 2)] # replace table with zip file @@ -555,5 +559,7 @@ def test_import_parquet(mock_event_logger): assert success_msg_f2 in resp with test_db.get_sqla_engine_with_context() as engine: - data = engine.execute(f"SELECT * from {PARQUET_UPLOAD_TABLE} ORDER BY b").fetchall() + data = engine.execute( + f"SELECT * from {PARQUET_UPLOAD_TABLE} ORDER BY b" + ).fetchall() assert data == [("john", 1), ("paul", 2), ("max", 3), ("bob", 4)] From 50db0de87ffd7f13690129fe42b2dd0ea210d09b Mon Sep 17 00:00:00 2001 From: John Bodley Date: Fri, 28 Apr 2023 08:45:28 +1200 Subject: [PATCH 6/6] Fix ORDER BY --- tests/integration_tests/csv_upload_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration_tests/csv_upload_tests.py b/tests/integration_tests/csv_upload_tests.py index 4d77f6113b179..70f984775db59 100644 --- a/tests/integration_tests/csv_upload_tests.py +++ b/tests/integration_tests/csv_upload_tests.py @@ -385,12 +385,12 @@ def test_import_csv(mock_event_logger): ) # make sure that john and empty string are replaced with None with test_db.get_sqla_engine_with_context() as engine: - data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE} ORDER BY b").fetchall() + data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE} ORDER BY c").fetchall() assert data == [(None, 1, "x"), ("paul", 2, None)] # default null values upload_csv(CSV_FILENAME2, CSV_UPLOAD_TABLE, extra={"if_exists": "replace"}) # make sure that john and empty string are replaced with None - data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE} ORDER BY b").fetchall() + data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE} ORDER BY c").fetchall() assert data == [("john", 1, "x"), ("paul", 2, None)] # cleanup