THUDM · cenyk1230 · Apr 7, 2021 · Mar 23, 2021 · Mar 24, 2021 · Mar 24, 2021
diff --git a/cogdl/data/data.py b/cogdl/data/data.py
@@ -12,12 +12,12 @@
     remove_self_loops,
     symmetric_normalization,
     row_normalization,
-    fast_spmm,
     get_degrees,
 )
-from cogdl.operators.operators import sample_adj_c, subgraph_c
+from cogdl.operators.sample import sample_adj_c
 
-indicator = fast_spmm is None
+indicator = False  # hard-coded now that fast_spmm is no longer imported into this module
+subgraph_c = None  # NOTE(review): the subgraph_c import was dropped above; presumably disables the C subgraph path - confirm
 
 
 class BaseGraph(object):

diff --git a/cogdl/data/sampler.py b/cogdl/data/sampler.py
@@ -168,10 +168,7 @@ def one_batch(self, phase, require_norm=True):
         return data
 
     def exists_train_nodes(self, node_idx):
-        for idx in node_idx:
-            if self.train_mask[idx]:
-                return True
-        return False
+        return self.train_mask[node_idx].any().item()  # True iff any train_mask entry selected by node_idx is set
 
     def node_induction(self, node_idx):
         node_idx = np.unique(node_idx)
@@ -201,6 +198,53 @@ def sample(self):
         pass
 
 
+class SAINTDataset(torch.utils.data.Dataset):
+    """Map-style dataset over the subgraphs held by a SAINT sampler.
+
+    ``args_sampler["sampler"]`` picks the sampler: "node", "edge", "rw" or "mrw".
+    """
+
+    partition_tool = None
+
+    def __init__(self, dataset, args_sampler, require_norm=True, log=False):
+        # Fix: one-argument ``super(SAINTDataset)`` never ran the parent initializer.
+        super().__init__()
+        self.data = dataset.data
+        self.dataset_name = dataset.__class__.__name__
+        self.args_sampler = args_sampler
+        self.require_norm = require_norm
+        self.log = log
+
+        # Built at call time: the sampler classes are defined later in the module.
+        samplers = {"node": NodeSampler, "edge": EdgeSampler, "rw": RWSampler, "mrw": MRWSampler}
+        sampler_cls = samplers.get(self.args_sampler["sampler"])
+        if sampler_cls is None:
+            raise NotImplementedError
+        self.sampler = sampler_cls(self.data, self.args_sampler)
+        self.batch_idx = np.arange(len(self.sampler.subgraph_data))
+
+    def shuffle(self):
+        """Permute, in place, the order in which ``__getitem__`` visits subgraphs."""
+        random.shuffle(self.batch_idx)
+
+    def __len__(self):
+        """Return the number of subgraphs held by the sampler."""
+        return len(self.sampler.subgraph_data)
+
+    def __getitem__(self, idx):
+        """Return subgraph ``idx`` (shuffled order), updated in place with ``edge_weight``."""
+        new_idx = self.batch_idx[idx]
+        data = self.sampler.subgraph_data[new_idx]
+        node_idx = self.sampler.subgraph_node_idx[new_idx]
+        edge_idx = self.sampler.subgraph_edge_idx[new_idx]
+        if self.require_norm:
+            # Attach the sampler's per-edge and per-node normalization values.
+            data.norm_aggr = torch.FloatTensor(self.sampler.norm_aggr_train[edge_idx][:])
+            data.norm_loss = self.sampler.norm_loss_train[node_idx]
+        data.edge_weight = row_normalization(data.x.shape[0], data.edge_index)
+        return data
+
+
 class NodeSampler(SAINTSampler):
     r"""
     randomly select nodes, then adding edges connecting these nodes
@@ -369,6 +413,66 @@ def sample(self, batch):
             return batch, node_id, adj_list[::-1]
 
 
+class ClusteredDataset(torch.utils.data.Dataset):
+    """Dataset whose items are subgraphs induced by batches of METIS graph clusters."""
+
+    partition_tool = None
+
+    def __init__(self, dataset, n_cluster: int, batch_size: int, log=False):
+        # Fix: one-argument ``super(ClusteredDataset)`` never ran the parent initializer.
+        super().__init__()
+        try:
+            import metis
+        except Exception as e:
+            print(e)
+            raise SystemExit(1)
+        ClusteredDataset.partition_tool = metis
+        self.data = dataset.data
+        self.dataset_name = dataset.__class__.__name__
+        self.batch_size = batch_size
+        self.log = log
+        self.clusters = self.preprocess(n_cluster)
+        self.batch_idx = np.arange(n_cluster)
+
+    def shuffle(self):
+        """Permute, in place, the order in which clusters are grouped into batches."""
+        random.shuffle(self.batch_idx)
+
+    def __len__(self):
+        """Return the number of batches: ceil(number of clusters / batch_size)."""
+        return (len(self.clusters) - 1) // self.batch_size + 1
+
+    def __getitem__(self, idx):
+        """Return the subgraph induced by the nodes of the ``idx``-th batch of clusters."""
+        batch = self.batch_idx[idx * self.batch_size : (idx + 1) * self.batch_size]
+        nodes = np.concatenate([self.clusters[i] for i in batch])
+        return self.data.subgraph(nodes)
+
+    def preprocess(self, n_cluster):
+        """Partition the graph into ``n_cluster`` node-index arrays, cached on disk."""
+        save_name = f"{self.dataset_name}-{n_cluster}.cluster"
+        if os.path.exists(save_name):
+            return torch.load(save_name)
+        if self.log:
+            print("Preprocessing...")
+        edges, _ = remove_self_loops(self.data.edge_index)
+        edges = edges.cpu().numpy()
+        # NOTE(review): nodes numbered above the largest edge endpoint get no cluster.
+        num_nodes = np.max(edges) + 1
+        adj = sp.csr_matrix((np.ones(edges.shape[1]), (edges[0], edges[1])), shape=(num_nodes, num_nodes))
+        adj_lists = np.split(adj.indices, adj.indptr[1:])[:-1]
+        _, parts = ClusteredDataset.partition_tool.part_graph(adj_lists, n_cluster, seed=1)
+        division = [[] for _ in range(n_cluster)]
+        for node, part in enumerate(parts):
+            division[part].append(node)
+        # Fix: ``np.int`` was removed in NumPy 1.24; builtin ``int`` is the same dtype.
+        division = [np.array(members, dtype=int) for members in division]
+        torch.save(division, save_name)
+        if self.log:
+            print("Graph clustering done")
+        return division
+
+
 class ClusteredLoader(torch.utils.data.DataLoader):
     partition_tool = None
 

diff --git a/cogdl/operators/sample.py b/cogdl/operators/sample.py
@@ -0,0 +1,18 @@
+import os
+from torch.utils.cpp_extension import load
+
+path = os.path.dirname(__file__)
+
+# subgraph and sample_adj
+# Build the C++ sampling extension with torch's JIT loader and re-export its
+# kernels under module-level names. The attribute lookups stay inside the
+# ``try`` so that a missing symbol is handled like a failed build.
+try:
+    sample = load(name="sampler", sources=[os.path.join(path, "sample/sample.cpp")], verbose=False)
+    subgraph_c, sample_adj_c = sample.subgraph, sample.sample_adj
+    coo2csr_cpu, coo2csr_cpu_index = sample.coo2csr_cpu, sample.coo2csr_cpu_index
+except Exception as e:
+    # Any failure prints the error and exports every kernel name as None.
+    print(e)
+    subgraph_c = sample_adj_c = None
+    coo2csr_cpu = coo2csr_cpu_index = None
diff --git a/cogdl/operators/operators.py → cogdl/operators/spmm.py b/cogdl/operators/operators.py → cogdl/operators/spmm.py
@@ -2,24 +2,8 @@
 import torch
 from torch.utils.cpp_extension import load
 
-path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "operators")
+path = os.path.dirname(__file__)  # directory that contains this module
 
-# subgraph and sample_adj
-try:
-    sample = load(name="sampler", sources=[os.path.join(path, "sample/sample.cpp")], verbose=False)
-    subgraph_c = sample.subgraph
-    sample_adj_c = sample.sample_adj
-    coo2csr_cpu = sample.coo2csr_cpu
-    coo2csr_cpu_index = sample.coo2csr_cpu_index
-except Exception as e:
-    print(e)
-    subgraph_c = None
-    sample_adj_c = None
-    coo2csr_cpu_index = None
-    coo2csr_cpu = None
-
-
-# SPMM
 if not torch.cuda.is_available():
     spmm = None
 else:

diff --git a/cogdl/trainers/__init__.py b/cogdl/trainers/__init__.py
@@ -55,4 +55,5 @@ def build_trainer(args):
     "clustergcn": "cogdl.trainers.sampled_trainer",
     "random_partition": "cogdl.trainers.sampled_trainer",
     "self_auxiliary_task": "cogdl.trainers.self_auxiliary_task_trainer",
+    "distributed_trainer": "cogdl.trainers.distributed_trainer",
 }