Source code for torch_geometric.datasets.word_net

from itertools import chain
from typing import Callable, List, Optional

import torch

from torch_geometric.data import Data, InMemoryDataset, download_url
from torch_geometric.utils import index_sort


[docs]class WordNet18(InMemoryDataset): r"""The WordNet18 dataset from the `"Translating Embeddings for Modeling Multi-Relational Data" <https://papers.nips.cc/paper/5071-translating-embeddings-for-modeling -multi-relational-data>`_ paper, containing 40,943 entities, 18 relations and 151,442 fact triplets, *e.g.*, furniture includes bed. .. note:: The original :obj:`WordNet18` dataset suffers from test leakage, *i.e.* more than 80% of test triplets can be found in the training set with another relation type. Therefore, it should not be used for research evaluation anymore. We recommend to use its cleaned version :class:`~torch_geometric.datasets.WordNet18RR` instead. Args: root (str): Root directory where the dataset should be saved. transform (callable, optional): A function/transform that takes in an :obj:`torch_geometric.data.Data` object and returns a transformed version. The data object will be transformed before every access. (default: :obj:`None`) pre_transform (callable, optional): A function/transform that takes in an :obj:`torch_geometric.data.Data` object and returns a transformed version. The data object will be transformed before being saved to disk. (default: :obj:`None`) force_reload (bool, optional): Whether to re-process the dataset. (default: :obj:`False`) """ url = ('https://raw.githubusercontent.com/villmow/' 'datasets_knowledge_embedding/master/WN18/original') def __init__( self, root: str, transform: Optional[Callable] = None, pre_transform: Optional[Callable] = None, force_reload: bool = False, ) -> None: super().__init__(root, transform, pre_transform, force_reload=force_reload) self.load(self.processed_paths[0]) @property def raw_file_names(self) -> List[str]: return ['train.txt', 'valid.txt', 'test.txt'] @property def processed_file_names(self) -> str: return 'data.pt' def download(self) -> None: for filename in self.raw_file_names: download_url(f'{self.url}/{filename}', self.raw_dir) def process(self) -> None: srcs, dsts, edge_types = [], [], [] for path in self.raw_paths: with open(path) as f: edges = [int(x) for x in f.read().split()[1:]] edge = torch.tensor(edges, dtype=torch.long) srcs.append(edge[::3]) dsts.append(edge[1::3]) edge_types.append(edge[2::3]) src = torch.cat(srcs, dim=0) dst = torch.cat(dsts, dim=0) edge_type = torch.cat(edge_types, dim=0) train_mask = torch.zeros(src.size(0), dtype=torch.bool) train_mask[:srcs[0].size(0)] = True val_mask = torch.zeros(src.size(0), dtype=torch.bool) val_mask[srcs[0].size(0):srcs[0].size(0) + srcs[1].size(0)] = True test_mask = torch.zeros(src.size(0), dtype=torch.bool) test_mask[srcs[0].size(0) + srcs[1].size(0):] = True num_nodes = max(int(src.max()), int(dst.max())) + 1 _, perm = index_sort(num_nodes * src + dst) edge_index = torch.stack([src[perm], dst[perm]], dim=0) edge_type = edge_type[perm] train_mask = train_mask[perm] val_mask = val_mask[perm] test_mask = test_mask[perm] data = Data( edge_index=edge_index, edge_type=edge_type, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask, num_nodes=num_nodes, ) if self.pre_transform is not None: data = self.pre_transform(data) self.save([data], self.processed_paths[0])
[docs]class WordNet18RR(InMemoryDataset): r"""The WordNet18RR dataset from the `"Convolutional 2D Knowledge Graph Embeddings" <https://arxiv.org/abs/1707.01476>`_ paper, containing 40,943 entities, 11 relations and 93,003 fact triplets. Args: root (str): Root directory where the dataset should be saved. transform (callable, optional): A function/transform that takes in an :obj:`torch_geometric.data.Data` object and returns a transformed version. The data object will be transformed before every access. (default: :obj:`None`) pre_transform (callable, optional): A function/transform that takes in an :obj:`torch_geometric.data.Data` object and returns a transformed version. The data object will be transformed before being saved to disk. (default: :obj:`None`) force_reload (bool, optional): Whether to re-process the dataset. (default: :obj:`False`) """ url = ('https://raw.githubusercontent.com/villmow/' 'datasets_knowledge_embedding/master/WN18RR/original') edge2id = { '_also_see': 0, '_derivationally_related_form': 1, '_has_part': 2, '_hypernym': 3, '_instance_hypernym': 4, '_member_meronym': 5, '_member_of_domain_region': 6, '_member_of_domain_usage': 7, '_similar_to': 8, '_synset_domain_topic_of': 9, '_verb_group': 10, } def __init__( self, root: str, transform: Optional[Callable] = None, pre_transform: Optional[Callable] = None, force_reload: bool = False, ) -> None: super().__init__(root, transform, pre_transform, force_reload=force_reload) self.load(self.processed_paths[0]) @property def raw_file_names(self) -> List[str]: return ['train.txt', 'valid.txt', 'test.txt'] @property def processed_file_names(self) -> str: return 'data.pt' def download(self) -> None: for filename in self.raw_file_names: download_url(f'{self.url}/{filename}', self.raw_dir) def process(self) -> None: node2id, idx = {}, 0 srcs, dsts, edge_types = [], [], [] for path in self.raw_paths: with open(path) as f: edges = f.read().split() _src = edges[::3] _dst = edges[2::3] _edge_type = edges[1::3] for i in chain(_src, _dst): if i not in node2id: node2id[i] = idx idx += 1 srcs.append(torch.tensor([node2id[i] for i in _src])) dsts.append(torch.tensor([node2id[i] for i in _dst])) edge_types.append( torch.tensor([self.edge2id[i] for i in _edge_type])) src = torch.cat(srcs, dim=0) dst = torch.cat(dsts, dim=0) edge_type = torch.cat(edge_types, dim=0) train_mask = torch.zeros(src.size(0), dtype=torch.bool) train_mask[:srcs[0].size(0)] = True val_mask = torch.zeros(src.size(0), dtype=torch.bool) val_mask[srcs[0].size(0):srcs[0].size(0) + srcs[1].size(0)] = True test_mask = torch.zeros(src.size(0), dtype=torch.bool) test_mask[srcs[0].size(0) + srcs[1].size(0):] = True num_nodes = max(int(src.max()), int(dst.max())) + 1 _, perm = index_sort(num_nodes * src + dst) edge_index = torch.stack([src[perm], dst[perm]], dim=0) edge_type = edge_type[perm] train_mask = train_mask[perm] val_mask = val_mask[perm] test_mask = test_mask[perm] data = Data(edge_index=edge_index, edge_type=edge_type, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask, num_nodes=num_nodes) if self.pre_transform is not None: data = self.pre_transform(data) self.save([data], self.processed_paths[0])