[Dataset] Update rd2cd datasets (#323)
cenyk1230 authored Dec 24, 2021
1 parent dc61b6e commit 29170ae
Showing 3 changed files with 35 additions and 28 deletions.
2 changes: 2 additions & 0 deletions cogdl/datasets/geom_data.py
@@ -99,6 +99,8 @@ def __init__(self, root, name, split=0):
self.data.val_mask = self.data.all_masks[split]["val"]
self.data.test_mask = self.data.all_masks[split]["test"]

self.data.set_asymmetric()

@property
def raw_file_names(self):
names = ["out1_graph_edges.txt", "out1_node_feature_label.txt"] + [
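The only behavioural change in this hunk is the new set_asymmetric() call on the loaded graph. As a minimal sketch, the same zero-argument call on a hand-built Graph would look like this (assuming set_asymmetric() marks the edge list as directed rather than symmetrizing it, which this diff does not show):

import torch
from cogdl.data import Graph

# toy directed graph; edge_index uses the (2, num_edges) layout seen in the loaders below
edge_index = torch.tensor([[0, 1, 2], [1, 2, 0]])
g = Graph(x=torch.randn(3, 4), edge_index=edge_index)
g.set_asymmetric()  # the same zero-argument call the commit adds after the masks are set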
59 changes: 32 additions & 27 deletions cogdl/datasets/rd2cd_data.py
@@ -1,14 +1,12 @@
import os
import os.path as osp
import random
import tarfile

import numpy as np
import requests
import torch
from torch import Tensor

from cogdl.data import Graph
from cogdl.datasets import NodeDataset
from cogdl.data import Dataset, Graph
from cogdl.utils import download_url, untar

base_url = "https://cloud.tsinghua.edu.cn/d/65d7c53dd8474d7091a9/files/?p=%2F"

@@ -67,44 +65,51 @@ def check_train_containing(train_mask, y):
return True


class RD2CD(NodeDataset):
class RD2CD(Dataset):
def __init__(self, root, name):
self.root = root
self.name = name
self.source_path = root + "/" + name + "/raw"
if not os.path.exists(self.source_path):
os.makedirs(self.source_path)
dst_path = root + "/" + name + "/processed"
if not os.path.exists(dst_path):
os.makedirs(dst_path)
self.data_path = dst_path + "/data.pt"
if not os.path.exists(self.data_path):
self.download()
super(RD2CD, self).__init__(path=self.data_path, scale_feat=False)
path = osp.join(root, name)

super(RD2CD, self).__init__(path)
self.data = torch.load(self.processed_paths[0])

@property
def raw_file_names(self):
names = ["x.npy", "y.npy", "edge_index.npy"]
return names

@property
def processed_file_names(self):
return "data.pt"

@property
def num_nodes(self):
assert hasattr(self.data, "y")
return self.data.y.shape[0]

def download(self):
r = requests.get(base_url + self.name + ".tgz&dl=1")
tarfile_path = self.source_path + "/" + self.name + ".tgz"
with open(tarfile_path, "wb") as f:
f.write(r.content)
with tarfile.open(tarfile_path, "r") as f:
f.extractall(self.source_path)
fname = "{}.tgz".format(self.name.lower())
download_url("{}{}.tgz&dl=1".format(base_url, self.name), self.raw_dir, fname)
untar(self.raw_dir, fname)

def process(self):
numpy_x = np.load(self.source_path + "/x.npy")
numpy_x = np.load(self.raw_dir + "/x.npy")
x = torch.from_numpy(numpy_x).to(torch.float)
numpy_y = np.load(self.source_path + "/y.npy")
numpy_y = np.load(self.raw_dir + "/y.npy")
y = torch.from_numpy(numpy_y).to(torch.long)
numpy_edge_index = np.load(self.source_path + "/edge_index.npy")
numpy_edge_index = np.load(self.raw_dir + "/edge_index.npy")
edge_index = torch.from_numpy(numpy_edge_index).to(torch.long)

# set train/val/test mask in node_classification task
random_seed = 14530529 # a fixed seed
(train_mask, val_mask, test_mask) = get_whole_mask(y, "6-2-2", random_seed)
data = Graph(x=x, edge_index=edge_index, y=y, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
torch.save(data, self.data_path)
torch.save(data, self.processed_paths[0])
return data

def get(self, idx):
return self.data


class Github(RD2CD):
def __init__(self, root="data"):
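As a usage sketch of the refactored class via its Github subclass (assuming the collapsed Github.__init__ simply forwards root and the dataset name to RD2CD, and that the base Dataset places raw/ and processed/ under the dataset path, as the raw_dir and processed_paths usage above suggests):

from cogdl.datasets.rd2cd_data import Github

# first run downloads the archive into data/Github/raw and caches the processed
# Graph in data/Github/processed/data.pt; later runs load the cache directly
dataset = Github(root="data")
graph = dataset.data  # get() returns this same Graph for any index
print(graph.x.shape, graph.y.shape, int(graph.train_mask.sum()))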
2 changes: 1 addition & 1 deletion docs/source/tutorial/node_classification.rst
@@ -48,7 +48,7 @@ CogDL supports saving the trained model with ``checkpoint_path`` in command line
experiment(model="gcn", dataset="cora", checkpoint_path="gcn_cora.pt")
When the training stops, the model will be saved in `gcn_cora.py`. If you want to continue the training from previous checkpoint
When the training stops, the model will be saved in `gcn_cora.pt`. If you want to continue the training from previous checkpoint
with different parameters(such as learning rate, weight decay and etc.), keep the same model parameters (such as hidden size, model layers)
and do it as follows:

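The resumption call itself is collapsed in this view; a hedged sketch of what it might look like, where resume_training is an assumed flag that this diff does not confirm:

from cogdl import experiment

# hypothetical resume: keep the same architecture, reload weights from the saved checkpoint
experiment(model="gcn", dataset="cora", checkpoint_path="gcn_cora.pt", resume_training=True)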