Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MG wrapper for HITS #2088

Merged
merged 39 commits into from
Mar 30, 2022
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
43d2f72
mg hits pylibcugraph implementation with tests
jnke2016 Feb 20, 2022
26d21fc
Merge remote-tracking branch 'upstream/branch-22.04' into fea-mg_hits
jnke2016 Feb 20, 2022
4ea2a88
uncomment part of the test
jnke2016 Feb 20, 2022
ebece58
add experimental hit definition to __init__.py
jnke2016 Feb 20, 2022
eb4adf7
fetch latest updates
jnke2016 Mar 16, 2022
77312e9
Merge remote-tracking branch 'upstream/branch-22.04' into fea-mg_hits
jnke2016 Mar 16, 2022
081ef4a
add cugraph implementation of MG HITS calling pylibcugraph
jnke2016 Mar 18, 2022
57b18ec
Merge remote-tracking branch 'upstream/branch-22.04' into fea-mg_hits
jnke2016 Mar 18, 2022
6387b93
update pylibcugraph implementation of MG HITS
jnke2016 Mar 20, 2022
ff6a195
Merge remote-tracking branch 'upstream/branch-22.04' into fea-mg_hits
jnke2016 Mar 20, 2022
8f395d6
add function to move content from device array view to a cudf series
Mar 20, 2022
73445c7
add MGGraph implementation to pylibcugraph
Mar 20, 2022
4b86078
fix typo, handle None value of initial hubs guess vertices and values
Mar 21, 2022
5976a2b
set is_multigraph=False
Mar 21, 2022
9c75b48
fix style
Mar 21, 2022
9f4be5d
fix copyright check
Mar 21, 2022
204d2ef
fix typo
Mar 21, 2022
f16a549
pass a handle as argument when creating the resource handle for MG Graph
Mar 22, 2022
812f92c
Added FIXME to test for unweighted, added (temporarily) RAFT code to …
rlratzel Mar 23, 2022
ac81610
Fixed flake8 error, changed FIXME in ResourceHandle and removed unuse…
rlratzel Mar 23, 2022
259d140
Added dunder methods to support pickling and set graph props in HITS.
rlratzel Mar 23, 2022
34c23c2
remove cudf dependency in pylibcugraph
Mar 25, 2022
5399e4b
Merge remote-tracking branch 'upstream/branch-22.04' into fea-mg_hits
Mar 25, 2022
acaeea6
fix style check
Mar 25, 2022
09481bf
Merge remote-tracking branch 'upstream/branch-22.04' into fea-mg_hits
Mar 28, 2022
38b52ff
ensure each partition is processed by the worker who holds it
Mar 28, 2022
346ca3b
drop Gunrock reference
Mar 28, 2022
1bd9bb1
fix flake8 errors
Mar 28, 2022
4546a14
fix bug in MG HITS test where the tol and max_iter were swapped
Mar 28, 2022
ab042f6
drop duplicated edges
Mar 29, 2022
a531c06
remove comment since MG Graph is now supported
Mar 29, 2022
138b49b
remove outdated comments, update docstring
Mar 29, 2022
2774bdb
remove unused import
Mar 29, 2022
14e1caa
add new line at the end of file
Mar 29, 2022
8494100
Merge remote-tracking branch 'upstream/branch-22.04' into fea-mg_hits
Mar 29, 2022
4a26602
add fixture to create the graph, update pylibcugraph import, update d…
Mar 29, 2022
1ad30ca
update docstring for the HITS version which is L1 norm
Mar 29, 2022
3e47bf1
fix typo
Mar 29, 2022
b6ec8c3
remove outdated comments
Mar 29, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion python/cugraph/cugraph/dask/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -12,6 +12,7 @@
# limitations under the License.

from .link_analysis.pagerank import pagerank
from .link_analysis.hits import hits
from .traversal.bfs import bfs
from .traversal.sssp import sssp
from .common.read_utils import get_chunksize
Expand Down
186 changes: 186 additions & 0 deletions python/cugraph/cugraph/dask/link_analysis/hits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from dask.distributed import wait, default_client
from cugraph.dask.common.input_utils import get_distributed_data

import cugraph.comms.comms as Comms
import dask_cudf

import pylibcugraph


def call_hits(sID,
              data,
              src_col_name,
              dst_col_name,
              graph_properties,
              store_transposed,
              num_edges,
              do_expensive_check,
              tolerance,
              max_iter,
              initial_hubs_guess_vertices,
              initial_hubs_guess_value,
              normalized):
    """
    Build a pylibcugraph MG graph from one edge-list partition and run HITS.

    Parameters
    ----------
    sID
        Comms session id; used to look up the handle through
        ``Comms.get_handle``.
    data
        Only ``data[0]`` is read. It must expose the source/destination
        columns by name and a ``columns`` attribute (presumably a cudf
        DataFrame partition -- confirm against the caller).
    src_col_name, dst_col_name
        Names of the source and destination vertex columns in ``data[0]``.
    graph_properties, store_transposed, num_edges
        Forwarded unchanged to ``pylibcugraph.experimental.MGGraph``.
    do_expensive_check
        Forwarded to both ``MGGraph`` and ``hits``.
    tolerance, max_iter, initial_hubs_guess_vertices,
    initial_hubs_guess_value, normalized
        Forwarded unchanged to ``pylibcugraph.experimental.hits``.

    Returns
    -------
    Whatever ``pylibcugraph.experimental.hits`` returns.
    """
    resource_handle = pylibcugraph.experimental.ResourceHandle(
        Comms.get_handle(sID).getHandle())

    partition = data[0]
    sources = partition[src_col_name]
    destinations = partition[dst_col_name]
    # Edge weights are optional: only picked up when a 'value' column exists.
    edge_weights = (
        partition['value'] if "value" in partition.columns else None)

    graph = pylibcugraph.experimental.MGGraph(resource_handle,
                                              graph_properties,
                                              sources,
                                              destinations,
                                              edge_weights,
                                              store_transposed,
                                              num_edges,
                                              do_expensive_check)

    return pylibcugraph.experimental.hits(resource_handle,
                                          graph,
                                          tolerance,
                                          max_iter,
                                          initial_hubs_guess_vertices,
                                          initial_hubs_guess_value,
                                          normalized,
                                          do_expensive_check)


def hits(input_graph, tol=1.0e-5, max_iter=100, nstart=None, normalized=True):
    """
    Compute HITS hubs and authorities values for each vertex

    The HITS algorithm computes two numbers for a node.  Authorities
    estimates the node value based on the incoming links.  Hubs estimates
    the node value based on outgoing links.

    This implementation uses a 1-norm (L1), as networkx does.

    Parameters
    ----------

    input_graph : cugraph.Graph
        cuGraph graph descriptor, should contain the connectivity information
        as an edge list (edge weights are not used for this algorithm).
        The adjacency list will be computed if not already present.

    tol : float, optional (default=1.0e-5)
        Set the tolerance of the approximation, this parameter should be a
        small magnitude value.

    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned.

    nstart : cudf.Dataframe, optional (default=None)
        The initial hubs guess vertices along with their initial hubs guess
        value

        nstart['vertex'] : cudf.Series
            Initial hubs guess vertices
        nstart['values'] : cudf.Series
            Initial hubs guess values

    normalized : bool, optional (default=True)
        A flag to normalize the results

    Returns
    -------
    HubsAndAuthorities : dask_cudf.DataFrame
        GPU data frame containing three cudf.Series of size V: the vertex
        identifiers and the corresponding hubs values and the corresponding
        authorities values.

        df['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        df['hubs'] : dask_cudf.Series
            Contains the hubs score
        df['authorities'] : dask_cudf.Series
            Contains the authorities score

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/..
    >>> # chunksize = dcg.get_chunksize(datasets_path / "karate.csv")
    >>> # ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize)
    >>> # dg = cugraph.Graph(directed=True)
    >>> # dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    >>> #                            edge_attr='value')
    >>> # hits = dcg.hits(dg, max_iter=50)

    """

    client = default_client()

    # FIXME Still compute renumbering at this layer in case str
    # vertex ID are passed
    input_graph.compute_renumber_edge_list(transposed=False)
    ddf = input_graph.edgelist.edgelist_df

    graph_properties = pylibcugraph.experimental.GraphProperties(
        is_multigraph=False)

    store_transposed = False
    do_expensive_check = False
    initial_hubs_guess_vertices = None
    initial_hubs_guess_values = None
    num_edges = len(ddf)

    data = get_distributed_data(ddf)
    src_col_name = input_graph.renumber_map.renumbered_src_col_name
    dst_col_name = input_graph.renumber_map.renumbered_dst_col_name

    if nstart is not None:
        initial_hubs_guess_vertices = nstart['vertex']
        initial_hubs_guess_values = nstart['values']

    # Submit one task per worker, pinned to that worker through `workers=`,
    # so each set of partitions is processed by the worker listed for it in
    # `worker_to_parts`.
    result = [client.submit(call_hits,
                            Comms.get_session_id(),
                            parts,
                            src_col_name,
                            dst_col_name,
                            graph_properties,
                            store_transposed,
                            num_edges,
                            do_expensive_check,
                            tol,
                            max_iter,
                            initial_hubs_guess_vertices,
                            initial_hubs_guess_values,
                            normalized,
                            workers=[worker])
              for worker, parts in data.worker_to_parts.items()]

    wait(result)

    ddf = dask_cudf.from_delayed(result)
    # Map internal (renumbered) vertex ids back to the caller's ids.
    if input_graph.renumbered:
        return input_graph.unrenumber(ddf, 'vertex')

    return ddf
149 changes: 149 additions & 0 deletions python/cugraph/cugraph/tests/dask/test_mg_hits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import cugraph.dask as dcg
import gc
import pytest
import cugraph
import dask_cudf
from cugraph.dask.common.mg_utils import is_single_gpu
from cugraph.tests import utils

# =============================================================================
# Pytest Setup / Teardown - called for each test function
# =============================================================================


def setup_function():
    """Force a garbage-collection pass; pytest calls this per test function."""
    gc.collect()


# =============================================================================
# Pytest fixtures
# =============================================================================
# The undirected datasets plus the email-Eu-core graph.
datasets = (
    utils.DATASETS_UNDIRECTED
    + [utils.RAPIDS_DATASET_ROOT_DIR_PATH/"email-Eu-core.csv"]
)
# Each tuple pairs a list of candidate values with the name it is reported
# under; max_iter and tol each have a single candidate value.
fixture_params = utils.genFixtureParamsProduct(
    (datasets, "graph_file"),
    ([50], "max_iter"),
    ([1.0e-6], "tol"),
)


@pytest.fixture(scope="module", params=fixture_params)
def input_combo(request):
    """
    Expose the current parameter combination as a dictionary keyed by
    "graph_file", "max_iter" and "tol", for use by tests and by other
    parameterized fixtures.
    """
    print("parameters are \n", request.param, flush=True)

    param_names = ("graph_file", "max_iter", "tol")
    return dict(zip(param_names, request.param))


@pytest.fixture(scope="module")
def input_expected_output(input_combo):
    """
    Run single-GPU cuGraph HITS for the current parameter combination and
    attach the result, sorted by vertex, to the combination dictionary so it
    can serve as the expected output during validation.
    """
    graph = utils.generate_cugraph_graph_from_file(input_combo["graph_file"])

    expected = cugraph.hits(graph,
                            input_combo["max_iter"],
                            input_combo["tol"])
    expected = expected.sort_values("vertex").reset_index(drop=True)

    # Cache the result on the shared dictionary: tests using input_combo look
    # for it there and would otherwise have to repeat the same cuGraph run.
    input_combo["sg_cugraph_results"] = expected
    return input_combo


# =============================================================================
# Tests
# =============================================================================


@pytest.mark.skipif(
    is_single_gpu(), reason="skipping MG testing on Single GPU system"
)
def test_cugraph_hits(benchmark, input_combo):
    """
    Benchmark single-GPU cuGraph HITS over the same input combinations used
    by the other HITS tests. It exists only to produce comparison performance
    numbers; the result is stored on the combination dictionary.
    """
    graph = utils.generate_cugraph_graph_from_file(input_combo["graph_file"])

    max_iter = input_combo["max_iter"]
    tol = input_combo["tol"]
    input_combo["sg_cugraph_results"] = benchmark(
        cugraph.hits, graph, max_iter, tol)

@pytest.mark.skipif(
    is_single_gpu(), reason="skipping MG testing on Single GPU system"
)
def test_dask_hits(dask_client, benchmark, input_expected_output):
    """
    Run multi-GPU (dask) HITS on the dataset of the current parameter
    combination and check its hubs/authorities against the single-GPU
    cuGraph results stored in ``input_expected_output``.
    """
    input_data_path = input_expected_output["graph_file"]
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg = cugraph.Graph(directed=True)
    # FIXME: also test with no weights
    dg.from_dask_cudf_edgelist(ddf, "src", "dst", "value")

    # dcg.hits takes (input_graph, tol, max_iter, ...). Pass both by keyword
    # so they cannot be swapped: positionally, max_iter would land in tol.
    result_hits = benchmark(dcg.hits,
                            dg,
                            tol=input_expected_output["tol"],
                            max_iter=input_expected_output["max_iter"])
    result_hits = result_hits.compute().sort_values(
        "vertex").reset_index(drop=True)

    expected_output = input_expected_output["sg_cugraph_results"].sort_values(
        "vertex").reset_index(drop=True)

    # Update the dask cugraph HITS results with sg cugraph results for easy
    # comparison using cuDF DataFrame methods.
    result_hits["cugraph_hubs"] = expected_output['hubs']
    result_hits["cugraph_authorities"] = expected_output["authorities"]

    # FIXME: Check this is working
    # Rows whose MG value differs from the SG value by more than the
    # threshold, in either direction.
    hubs_diffs1 = result_hits.query('hubs - cugraph_hubs > 0.00001')
    hubs_diffs2 = result_hits.query('hubs - cugraph_hubs < -0.00001')
    authorities_diffs1 = result_hits.query(
        'authorities - cugraph_authorities > 0.0001')
    authorities_diffs2 = result_hits.query(
        'authorities - cugraph_authorities < -0.0001')

    assert len(hubs_diffs1) == 0
    assert len(hubs_diffs2) == 0
    assert len(authorities_diffs1) == 0
    assert len(authorities_diffs2) == 0
38 changes: 38 additions & 0 deletions python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -203,3 +203,41 @@ cdef extern from "cugraph_c/algorithms.h":
cugraph_random_walk_result_t** result,
cugraph_error_t** error
)
###########################################################################
# hits
# Opaque HITS result type: no fields are exposed to Cython here, so it is
# only reachable through the accessor functions declared below.
ctypedef struct cugraph_hits_result_t:
pass

# Accessor returning a type-erased device array view taken from the result
# (presumably the vertex ids -- confirm against cugraph_c/algorithms.h).
cdef cugraph_type_erased_device_array_view_t* \
cugraph_hits_result_get_vertices(
cugraph_hits_result_t* result
)

# Accessor returning a type-erased device array view taken from the result
# (presumably the hubs scores -- confirm against cugraph_c/algorithms.h).
cdef cugraph_type_erased_device_array_view_t* \
cugraph_hits_result_get_hubs(
cugraph_hits_result_t* result
)

# Accessor returning a type-erased device array view taken from the result
# (presumably the authorities scores -- confirm against
# cugraph_c/algorithms.h).
cdef cugraph_type_erased_device_array_view_t* \
cugraph_hits_result_get_authorities(
cugraph_hits_result_t* result
)

# Takes a result pointer and returns nothing (presumably releases the
# result object -- confirm ownership rules in cugraph_c/algorithms.h).
cdef void \
cugraph_hits_result_free(
cugraph_hits_result_t* result
)

# HITS entry point. Returns an error code; `result` and `error` are
# pointer-to-pointer parameters (presumably outputs filled in by the call
# -- confirm against cugraph_c/algorithms.h).
cdef cugraph_error_code_t \
cugraph_hits(
const cugraph_resource_handle_t* handle,
cugraph_graph_t* graph,
double tol,
size_t max_iter,
const cugraph_type_erased_device_array_view_t* initial_hubs_guess_vertices,
const cugraph_type_erased_device_array_view_t* initial_hubs_guess_values,
bool_t normalized,
bool_t do_expensive_check,
cugraph_hits_result_t** result,
cugraph_error_t** error
)
Loading