From 2de26447965303ab0db31879fa517180ae766d5d Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Tue, 21 Mar 2023 20:35:40 -0500 Subject: [PATCH 1/3] Fix `PropertyGraph.renumber_*_by_type` with only default types Fixes #3058 --- .../dask/structure/mg_property_graph.py | 16 ++++++---- .../cugraph/structure/property_graph.py | 2 ++ .../data_store/test_property_graph_mg.py | 31 +++++++++++++++++++ 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/python/cugraph/cugraph/dask/structure/mg_property_graph.py b/python/cugraph/cugraph/dask/structure/mg_property_graph.py index dbf0bc90ec2..ce881727490 100644 --- a/python/cugraph/cugraph/dask/structure/mg_property_graph.py +++ b/python/cugraph/cugraph/dask/structure/mg_property_graph.py @@ -1431,7 +1431,10 @@ def renumber_vertices_by_type(self, prev_id_column=None): # Include self.vertex_col_name when sorting by values to ensure we can # evenly distribute the data across workers. df = df.reset_index().persist() - df = df.sort_values(by=[TCN, self.vertex_col_name], ignore_index=True).persist() + if len(self.vertex_types) > 1: + df = df.sort_values( + by=[TCN, self.vertex_col_name], ignore_index=True + ).persist() if self.__edge_prop_dataframe is not None: new_name = f"new_{self.vertex_col_name}" df[new_name] = 1 @@ -1522,9 +1525,10 @@ def renumber_edges_by_type(self, prev_id_column=None): # Include self.edge_id_col_name when sorting by values to ensure we can # evenly distribute the data across workers. df = df.reset_index().persist() - df = df.sort_values( - by=[self.type_col_name, self.edge_id_col_name], ignore_index=True - ).persist() + if len(self.edge_types) > 1: + df = df.sort_values( + by=[self.type_col_name, self.edge_id_col_name], ignore_index=True + ).persist() if prev_id_column is not None: df[prev_id_column] = df[self.edge_id_col_name] @@ -1540,8 +1544,8 @@ def renumber_edges_by_type(self, prev_id_column=None): # FIXME DASK_CUDF: https://github.com/rapidsai/cudf/issues/11795 df = self._edge_type_value_counts - assert df.index.dtype == cat_dtype - df.index = df.index.astype(str) + if df.index.dtype == cat_dtype: + df.index = df.index.astype(str) # self._edge_type_value_counts rv = df.sort_index().cumsum().to_frame("stop") diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py index 7baadac57bf..b04025d1c57 100644 --- a/python/cugraph/cugraph/structure/property_graph.py +++ b/python/cugraph/cugraph/structure/property_graph.py @@ -1982,6 +1982,7 @@ def renumber_vertices_by_type(self, prev_id_column=None): ].astype(cat_dtype) index_dtype = self.__vertex_prop_dataframe.index.dtype + # Should we avoid `sort_values` if we know there is only one type? df = self.__vertex_prop_dataframe.reset_index().sort_values(by=TCN) df.index = df.index.astype(index_dtype) if self.__edge_prop_dataframe is not None: @@ -2071,6 +2072,7 @@ def renumber_edges_by_type(self, prev_id_column=None): df = self.__edge_prop_dataframe index_dtype = df.index.dtype + # Should we avoid `sort_values` if we know there is only one type? if prev_id_column is None: df = df.sort_values(by=TCN, ignore_index=True) else: diff --git a/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py b/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py index d2d7b82d9d8..e2f9d03f445 100644 --- a/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py +++ b/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py @@ -1476,6 +1476,37 @@ def test_types_from_numerals(dask_client): ] +@pytest.mark.mg +def test_renumber_by_type_only_default_type(dask_client): + from cugraph.experimental import MGPropertyGraph + + pG = MGPropertyGraph() + df = cudf.DataFrame( + { + "src": cp.array([0, 0, 1, 2, 2, 3], dtype="int32"), + "dst": cp.array([1, 2, 4, 3, 4, 1], dtype="int32"), + } + ) + ddf = dask_cudf.from_cudf(df, npartitions=2) + pG.add_edge_data(ddf, vertex_col_names=["src", "dst"]) + + df2 = cudf.DataFrame( + { + "prop1": [100, 200, 300, 400, 500], + "prop2": [5, 4, 3, 2, 1], + "id": cp.array([0, 1, 2, 3, 4], dtype="int32"), + } + ) + ddf2 = dask_cudf.from_cudf(df2, npartitions=2) + pG.add_vertex_data(ddf2, vertex_col_name="id") + pG.renumber_vertices_by_type() + got = pG.get_vertex_data().compute() + assert got[pG.vertex_col_name].to_arrow().to_pylist() == list(range(len(got))) + pG.renumber_edges_by_type() + got = pG.get_edge_data().compute() + assert got[pG.edge_id_col_name].to_arrow().to_pylist() == list(range(len(got))) + + # ============================================================================= # Benchmarks # ============================================================================= From bd89f98e1561643183c339e10a901e7aa3856ffe Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Tue, 4 Apr 2023 18:02:19 -0500 Subject: [PATCH 2/3] Only do `sort_values` when we know there is more than one type --- python/cugraph/cugraph/dask/common/mg_utils.py | 2 +- .../dask/structure/mg_property_graph.py | 5 +++-- .../cugraph/structure/property_graph.py | 18 +++++++++++++----- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/python/cugraph/cugraph/dask/common/mg_utils.py b/python/cugraph/cugraph/dask/common/mg_utils.py index 5ab884a5b34..a8979ba15f8 100644 --- a/python/cugraph/cugraph/dask/common/mg_utils.py +++ b/python/cugraph/cugraph/dask/common/mg_utils.py @@ -72,7 +72,7 @@ def get_visible_devices(): _visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES") if _visible_devices is None: # FIXME: We assume that if the variable is unset there is only one GPU - visible_devices = ["0"] + visible_devices = ["1"] else: visible_devices = _visible_devices.strip().split(",") return visible_devices diff --git a/python/cugraph/cugraph/dask/structure/mg_property_graph.py b/python/cugraph/cugraph/dask/structure/mg_property_graph.py index e0879819008..ce15948b6db 100644 --- a/python/cugraph/cugraph/dask/structure/mg_property_graph.py +++ b/python/cugraph/cugraph/dask/structure/mg_property_graph.py @@ -1530,7 +1530,8 @@ def renumber_vertices_by_type(self, prev_id_column=None): # Include self.vertex_col_name when sorting by values to ensure we can # evenly distribute the data across workers. df = df.reset_index().persist() - if len(self.vertex_types) > 1: + if len(cat_dtype.categories) > 1 and len(self.vertex_types) > 1: + # `self.vertex_types` is currently not cheap, b/c it looks at edge df df = df.sort_values( by=[TCN, self.vertex_col_name], ignore_index=True ).persist() @@ -1624,7 +1625,7 @@ def renumber_edges_by_type(self, prev_id_column=None): # Include self.edge_id_col_name when sorting by values to ensure we can # evenly distribute the data across workers. df = df.reset_index().persist() - if len(self.edge_types) > 1: + if len(cat_dtype.categories) > 1 and len(self.edge_types) > 1: df = df.sort_values( by=[self.type_col_name, self.edge_id_col_name], ignore_index=True ).persist() diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py index 68050dbdc7e..3bf7faea6cc 100644 --- a/python/cugraph/cugraph/structure/property_graph.py +++ b/python/cugraph/cugraph/structure/property_graph.py @@ -2074,8 +2074,11 @@ def renumber_vertices_by_type(self, prev_id_column=None): ].astype(cat_dtype) index_dtype = self.__vertex_prop_dataframe.index.dtype - # Should we avoid `sort_values` if we know there is only one type? - df = self.__vertex_prop_dataframe.reset_index().sort_values(by=TCN) + df = self.__vertex_prop_dataframe.reset_index() + if len(df.dtypes[TCN].categories) > 1 and len(self.vertex_types) > 1: + # Avoid `sort_values` if we know there is only one type + # `self.vertex_types` is currently not cheap, b/c it looks at edge df + df = df.sort_values(by=TCN, ignore_index=True) df.index = df.index.astype(index_dtype) if self.__edge_prop_dataframe is not None: mapper = self.__series_type(df.index, index=df[self.vertex_col_name]) @@ -2164,11 +2167,16 @@ def renumber_edges_by_type(self, prev_id_column=None): df = self.__edge_prop_dataframe index_dtype = df.index.dtype - # Should we avoid `sort_values` if we know there is only one type? if prev_id_column is None: - df = df.sort_values(by=TCN, ignore_index=True) + if len(df.dtypes[TCN].categories) > 1 and len(self.edge_types) > 1: + # Avoid `sort_values` if we know there is only one type + df = df.sort_values(by=TCN, ignore_index=True) + else: + df.reset_index(drop=True, inplace=True) else: - df = df.sort_values(by=TCN) + if len(df.dtypes[TCN].categories) > 1 and len(self.edge_types) > 1: + # Avoid `sort_values` if we know there is only one type + df = df.sort_values(by=TCN) df.index.name = prev_id_column df.reset_index(inplace=True) df.index = df.index.astype(index_dtype) From ab3d1db73c8eec6809f08df0a5e8819a35f10418 Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Tue, 4 Apr 2023 18:03:22 -0500 Subject: [PATCH 3/3] oops. I should use an environment variable --- python/cugraph/cugraph/dask/common/mg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/dask/common/mg_utils.py b/python/cugraph/cugraph/dask/common/mg_utils.py index a8979ba15f8..5ab884a5b34 100644 --- a/python/cugraph/cugraph/dask/common/mg_utils.py +++ b/python/cugraph/cugraph/dask/common/mg_utils.py @@ -72,7 +72,7 @@ def get_visible_devices(): _visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES") if _visible_devices is None: # FIXME: We assume that if the variable is unset there is only one GPU - visible_devices = ["1"] + visible_devices = ["0"] else: visible_devices = _visible_devices.strip().split(",") return visible_devices