rapidsai · rapids-bot · Mar 30, 2022 · Feb 20, 2022 · Feb 20, 2022 · Feb 20, 2022
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,6 +12,7 @@
 # limitations under the License.
 
 from .link_analysis.pagerank import pagerank
+from .link_analysis.hits import hits
 from .traversal.bfs import bfs
 from .traversal.sssp import sssp
 from .common.read_utils import get_chunksize

@@ -0,0 +1,186 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from dask.distributed import wait, default_client
+from cugraph.dask.common.input_utils import get_distributed_data
+
+import cugraph.comms.comms as Comms
+import dask_cudf
+
+import pylibcugraph
+
+
+def call_hits(sID,
+              data,
+              src_col_name,
+              dst_col_name,
+              graph_properties,
+              store_transposed,
+              num_edges,
+              do_expensive_check,
+              tolerance,
+              max_iter,
+              initial_hubs_guess_vertices,
+              initial_hubs_guess_value,
+              normalized):
+
+    handle = Comms.get_handle(sID)
+    h = pylibcugraph.experimental.ResourceHandle(handle.getHandle())
+    srcs = data[0][src_col_name]
+    dsts = data[0][dst_col_name]
+    weights = None
+    if "value" in data[0].columns:
+        weights = data[0]['value']
+
+    mg = pylibcugraph.experimental.MGGraph(h,
+                                           graph_properties,
+                                           srcs,
+                                           dsts,
+                                           weights,
+                                           store_transposed,
+                                           num_edges,
+                                           do_expensive_check)
+
+    result = pylibcugraph.experimental.hits(h,
+                                            mg,
+                                            tolerance,
+                                            max_iter,
+                                            initial_hubs_guess_vertices,
+                                            initial_hubs_guess_value,
+                                            normalized,
+                                            do_expensive_check)
+
+    return result
+
+
+def hits(input_graph, tol=1.0e-5, max_iter=100,  nstart=None, normalized=True):
+    """
+    Compute HITS hubs and authorities values for each vertex
+
+    The HITS algorithm computes two numbers for a node.  Authorities
+    estimates the node value based on the incoming links.  Hubs estimates
+    the node value based on outgoing links.
+
+    The cuGraph implementation of HITS is a wrapper around the gunrock
+    implementation of HITS.
+
+    Note that the gunrock implementation uses a 2-norm, while networkx
+    uses a 1-norm.  The raw scores will be different, but the rank ordering
+    should be comparable with networkx.
+
+    Parameters
+    ----------
+
+    input_graph : cugraph.Graph
+        cuGraph graph descriptor, should contain the connectivity information
+        as an edge list (edge weights are not used for this algorithm).
+        The adjacency list will be computed if not already present.
+
+    tol : float, optional (default=1.0e-5)
+        Set the tolerance the approximation, this parameter should be a small
+        magnitude value.
+
+    max_iter : int, optional (default=100)
+        The maximum number of iterations before an answer is returned.
+
+    nstart : cudf.Dataframe, optional (default=None)
+        The initial hubs guess vertices along with their initial hubs guess
+        value
+
+        nstart['vertex'] : cudf.Series
+            Intial hubs guess vertices
+        nstart['values'] : cudf.Series
+            Intial hubs guess values
+
+    normalized : bool, optional (default=True)
+        A flag to normalize the results
+
+    Returns
+    -------
+    HubsAndAuthorities : dask_cudf.DataFrame
+        GPU data frame containing three cudf.Series of size V: the vertex
+        identifiers and the corresponding hubs values and the corresponding
+        authorities values.
+
+        df['vertex'] : dask_cudf.Series
+            Contains the vertex identifiers
+        df['hubs'] : dask_cudf.Series
+            Contains the hubs score
+        df['authorities'] : dask_cudf.Series
+            Contains the authorities score
+
+    Examples
+    --------
+    >>> # import cugraph.dask as dcg
+    >>> # ... Init a DASK Cluster
+    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
+    >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/..
+    >>> # chunksize = dcg.get_chunksize(datasets_path / "karate.csv")
+    >>> # ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize)
+    >>> # dg = cugraph.Graph(directed=True)
+    >>> # dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
+    >>> #                            edge_attr='value')
+    >>> # hits = dcg.hits(dg, max_iter = 50)
+
+    """
+
+    client = default_client()
+
+    # FIXME Still compute renumbering at this layer in case str
+    # vertex ID are passed
+    input_graph.compute_renumber_edge_list(transposed=False)
+    ddf = input_graph.edgelist.edgelist_df
+
+    graph_properties = pylibcugraph.experimental.GraphProperties(
+        is_multigraph=False)
+
+    store_transposed = False
+    do_expensive_check = False
+    initial_hubs_guess_vertices = None
+    initial_hubs_guess_values = None
+    num_edges = len(ddf)
+
+    data = get_distributed_data(ddf)
+    src_col_name = input_graph.renumber_map.renumbered_src_col_name
+    dst_col_name = input_graph.renumber_map.renumbered_dst_col_name
+
+    if nstart is not None:
+        initial_hubs_guess_vertices = nstart['vertex']
+        initial_hubs_guess_values = nstart['values']
+
+    result = [client.submit(call_hits,
+                            Comms.get_session_id(),
+                            wf[1],
+                            src_col_name,
+                            dst_col_name,
+                            graph_properties,
+                            store_transposed,
+                            num_edges,
+                            do_expensive_check,
+                            tol,
+                            max_iter,
+                            initial_hubs_guess_vertices,
+                            initial_hubs_guess_values,
+                            normalized,
+                            workers=[wf[0]])
+              for idx, wf in enumerate(data.worker_to_parts.items())]
+
+    wait(result)
+
+    ddf = dask_cudf.from_delayed(result)
+    if input_graph.renumbered:
+        return input_graph.unrenumber(ddf, 'vertex')
+
+    return ddf
@@ -0,0 +1,149 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cugraph.dask as dcg
+import gc
+import pytest
+import cugraph
+import dask_cudf
+from cugraph.dask.common.mg_utils import is_single_gpu
+from cugraph.tests import utils
+
+# =============================================================================
+# Pytest Setup / Teardown - called for each test function
+# =============================================================================
+
+
+def setup_function():
+    """Run a garbage-collection pass before each test function."""
+    gc.collect()
+
+
+# =============================================================================
+# Pytest fixtures
+# =============================================================================
+# Graph files exercised by this module: the undirected datasets from the
+# shared test utilities plus email-Eu-core.
+datasets = utils.DATASETS_UNDIRECTED + \
+           [utils.RAPIDS_DATASET_ROOT_DIR_PATH/"email-Eu-core.csv"]
+# Parameter sets for the input_combo fixture, in (graph_file, max_iter, tol)
+# order.  Presumably the cartesian product of the lists below - confirm
+# against utils.genFixtureParamsProduct.
+fixture_params = utils.genFixtureParamsProduct((datasets, "graph_file"),
+                                               ([50], "max_iter"),
+                                               ([1.0e-6], "tol"),
+                                               )
+
+
+@pytest.fixture(scope="module", params=fixture_params)
+def input_combo(request):
+    """
+    Simply return the current combination of params as a dictionary for use in
+    tests or other parameterized fixtures.
+    """
+    print("parameters are \n", request.param, flush=True)
+    parameters = dict(zip(("graph_file", "max_iter", "tol"), request.param))
+
+    return parameters
+
+
+@pytest.fixture(scope="module")
+def input_expected_output(input_combo):
+    """
+    This fixture returns the expected results from the HITS algo.(based on
+    cuGraph HITS) which can be used for validation.
+    """
+
+    G = utils.generate_cugraph_graph_from_file(
+        input_combo["graph_file"])
+    sg_cugraph_hits = cugraph.hits(
+                            G,
+                            input_combo["max_iter"],
+                            input_combo["tol"])
+    # Save the results back to the input_combo dictionary to prevent redundant
+    # cuGraph runs. Other tests using the input_combo fixture will look for
+    # them, and if not present they will have to re-run the same cuGraph call.
+    sg_cugraph_hits = sg_cugraph_hits.sort_values(
+        "vertex").reset_index(drop=True)
+
+    input_combo["sg_cugraph_results"] = sg_cugraph_hits
+    return input_combo
+
+
+# =============================================================================
+# Tests
+# =============================================================================
+
+
+@pytest.mark.skipif(
+    is_single_gpu(), reason="skipping MG testing on Single GPU system"
+)
+def test_cugraph_hits(benchmark, input_combo):
+    """
+    Simply run cuGraph HITS on the same set of input combinations used
+    for the
+    cuGraph HITS tests.
+    This is only in place for generating comparison performance numbers.
+    """
+    G = utils.generate_cugraph_graph_from_file(
+        input_combo["graph_file"])
+
+    sg_cugraph_hits = benchmark(cugraph.hits,
+                                G,
+                                input_combo["max_iter"],
+                                input_combo["tol"])
+    input_combo["sg_cugraph_results"] = sg_cugraph_hits
+
+
+@pytest.mark.skipif(
+    is_single_gpu(), reason="skipping MG testing on Single GPU system"
+)
+def test_dask_hits(dask_client, benchmark, input_expected_output):
+
+    input_data_path = input_expected_output["graph_file"]
+    print(f"dataset={input_data_path}")
+    chunksize = dcg.get_chunksize(input_expected_output["graph_file"])
+
+    ddf = dask_cudf.read_csv(
+        input_data_path,
+        chunksize=chunksize,
+        delimiter=" ",
+        names=["src", "dst", "value"],
+        dtype=["int32", "int32", "float32"],
+    )
+
+    dg = cugraph.Graph(directed=True)
+    # FIXME: also test with no weights
+    dg.from_dask_cudf_edgelist(ddf, "src", "dst", "value")
+
+    result_hits = benchmark(dcg.hits,
+                            dg,
+                            input_expected_output["max_iter"],
+                            input_expected_output["tol"])
+    result_hits = result_hits.compute().sort_values(
+        "vertex").reset_index(drop=True)
+
+    expected_output = input_expected_output["sg_cugraph_results"].sort_values(
+        "vertex").reset_index(drop=True)
+
+    # Update the dask cugraph HITS results with sg cugraph results for easy
+    # comparison using cuDF DataFrame methods.
+    result_hits["cugraph_hubs"] = expected_output['hubs']
+    result_hits["cugraph_authorities"] = expected_output["authorities"]
+
+    # FIXME: Check this is working
+    hubs_diffs1 = result_hits.query('hubs - cugraph_hubs > 0.00001')
+    hubs_diffs2 = result_hits.query('hubs - cugraph_hubs < -0.00001')
+    authorities_diffs1 = result_hits.query(
+        'authorities - cugraph_authorities > 0.0001')
+    authorities_diffs2 = result_hits.query(
+        'authorities - cugraph_authorities < -0.0001')
+
+    assert len(hubs_diffs1) == 0
+    assert len(hubs_diffs2) == 0
+    assert len(authorities_diffs1) == 0
+    assert len(authorities_diffs2) == 0
@@ -203,3 +203,41 @@ cdef extern from "cugraph_c/algorithms.h":
             cugraph_random_walk_result_t** result,
             cugraph_error_t** error
         )
+    ###########################################################################
+    # hits
+    # Result type for cugraph_hits; declared without fields, so it is only
+    # ever handled through pointers on the Cython side.
+    ctypedef struct cugraph_hits_result_t:
+        pass
+
+    # Accessor returning a type-erased device array view taken from a hits
+    # result (presumably the vertex ids - confirm in cugraph_c/algorithms.h).
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_hits_result_get_vertices(
+            cugraph_hits_result_t* result
+        )
+
+    # Accessor returning a type-erased device array view taken from a hits
+    # result (presumably the hubs scores - confirm in the C header).
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_hits_result_get_hubs(
+            cugraph_hits_result_t* result
+        )
+
+    # Accessor returning a type-erased device array view taken from a hits
+    # result (presumably the authorities scores - confirm in the C header).
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_hits_result_get_authorities(
+            cugraph_hits_result_t* result
+        )
+
+    # Presumably releases a result produced by cugraph_hits - confirm
+    # ownership rules in the C header.
+    cdef void \
+        cugraph_hits_result_free(
+            cugraph_hits_result_t* result
+        )
+
+    # HITS entry point.  Returns a cugraph_error_code_t; `result` and `error`
+    # are pointer-to-pointer arguments (presumably out-parameters filled in
+    # by the call - confirm in the C header).
+    cdef cugraph_error_code_t \
+        cugraph_hits(
+            const cugraph_resource_handle_t* handle,
+            cugraph_graph_t* graph,
+            double tol,
+            size_t max_iter,
+            const cugraph_type_erased_device_array_view_t* initial_hubs_guess_vertices,
+            const cugraph_type_erased_device_array_view_t* initial_hubs_guess_values,
+            bool_t normalized,
+            bool_t do_expensive_check,
+            cugraph_hits_result_t** result,
+            cugraph_error_t** error
+        )