import os
import os.path as osp
from functools import partial
from typing import Callable, Optional
import numpy as np
import torch
from torch_geometric.data import (
Data,
InMemoryDataset,
download_url,
extract_zip,
)
from torch_geometric.transforms import ToUndirected
from torch_geometric.utils import subgraph
def _load_yaml(path: str) -> dict:
    """Parse the YAML file at ``path`` and return its content.

    ``yaml`` is imported inside the function, so the dependency is only
    required when this helper is actually called.

    Args:
        path (str): Location of the YAML file.

    Returns:
        Whatever :func:`yaml.safe_load` produces for the file.
    """
    import yaml

    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default text encoding, which would garble non-ASCII content on
    # systems whose locale encoding is not UTF-8.
    with open(path, encoding='utf-8') as f:
        return yaml.safe_load(f)
class GraphLandDataset(InMemoryDataset):
r"""The graph datasets from the `"GraphLand: Evaluating
Graph Machine Learning Models on Diverse Industrial Data"
<https://arxiv.org/abs/2409.14500>`_ paper.
Args:
root (str): Root directory where the dataset should be saved.
name (str): The name of the dataset (:obj:`"hm-categories"`,
:obj:`"pokec-regions"`, :obj:`"web-topics"`, :obj:`"tolokers-2"`,
:obj:`"city-reviews"`, :obj:`"artnet-exp"`, :obj:`"web-fraud"`,
:obj:`"hm-prices"`, :obj:`"avazu-ctr"`, :obj:`"city-roads-M"`,
:obj:`"city-roads-L"`, :obj:`"twitch-views"`,
:obj:`"artnet-views"`, :obj:`"web-traffic"`).
split (str): The type of dataset split/setting (:obj:`"RL"`,
:obj:`"RH"`, :obj:`"TH"`, :obj:`"THI"`).
:obj:`"RL"` is for "random low" split — a 10%/10%/80% random
stratified train/val/test split.
:obj:`"RH"` is for "random high" split — a 50%/25%/25% random
stratified train/val/test split.
:obj:`"TH"` is for "temporal high" split — a 50%/25%/25% temporal
train/val/test split.
:obj:`"THI"` is for "temporal high" split with the inductive
setting, which means that the graph is evolving over time, thus
val and test nodes are not seen at train time, and test nodes
are not seen at val time.
The :obj:`"RL"`, :obj:`"RH"`, and :obj:`"TH"` splits correspond
to the transductive setting and thus will return a dataset with
a single graph and three masks (for train, val, and test nodes).
In contrast, the :obj:`"THI"` split corresponds to the inductive
setting, and thus will return a dataset with three graphs
(a train graph, a val graph, and a test graph), which are three
snapshots of an evolving network captured at different timestamps.
Each of the three graphs has a mask specifying which of the nodes
should be used for training (in the train graph) and evaluation
(in the val and test graphs).
:obj:`"TH"` and :obj:`"THI"` splits are not available for the
following datasets: :obj:`"city-reviews"`, :obj:`"city-roads-M"`,
:obj:`"city-roads-L"`, :obj:`"web-traffic"`.
numerical_features_transform (str, optional): A transform applied to
numerical features (:obj:`None`, :obj:`"standard_scaler"`,
:obj:`"min_max_scaler"`, :obj:`"quantile_transform_normal"`,
:obj:`"quantile_transform_uniform"`, :obj:`"default"`).
Since numerical features can have widely different scales and
distributions, it is typically useful to apply some transform
to them before passing them to a neural model. This transform
is applied to all numerical features except for those that are
also categorized as fraction features. The :obj:`"default"` value
selects a dataset-specific transform from the other options that
was determined to be a safe and likely optimal choice for this
dataset based on experiments with various GNNs.
(default: :obj:`"default"`)
fraction_features_transform (str, optional): A transform applied to
fraction features (:obj:`None`, :obj:`"standard_scaler"`,
:obj:`"min_max_scaler"`, :obj:`"quantile_transform_normal"`,
:obj:`"quantile_transform_uniform"`, :obj:`"default"`). Fraction
features are a subset of numerical features that have the meaning
of fractions and are thus always in :obj:`[0, 1]` range. Since
their range is bounded, it is not necessary but may still be
useful to apply some transform to them before passing them to a
neural model. The :obj:`"default"` value selects a dataset-specific
transform from the other options that was determined to be a safe
and likely optimal choice for this dataset based on experiments
with various GNNs. (default: :obj:`"default"`)
categorical_features_transform (str, optional): A transform applied to
categorical features (:obj:`None`, :obj:`"one_hot_encoding"`).
It is most often useful to apply one-hot encoding to categorical
features before passing them to a neural model.
(default: :obj:`"one_hot_encoding"`)
regression_targets_transform (str, optional): A transform applied to
regression targets (:obj:`None`, :obj:`"standard_scaler"`,
:obj:`"min_max_scaler"`, :obj:`"default"`). Depending on their
range, it may or may not be useful to apply a transform to
regression targets before fitting a neural model to them.
The :obj:`"default"` value selects a dataset-specific transform
from the other options that was determined to be a safe and likely
optimal choice for this dataset based on experiments with various
GNNs. This argument does not affect classification datasets.
(default: :obj:`"default"`)
numerical_features_nan_imputation_strategy (str, optional): Defines
which value to fill NaNs in numerical features with
(:obj:`None`, :obj:`"mean"`, :obj:`"median"`,
:obj:`"most_frequent"`). This imputation strategy is applied to
all numerical features except for those that are also categorized
as fraction features. (default: :obj:`"most_frequent"`)
fraction_features_nan_imputation_strategy (str, optional): Defines
which value to fill NaNs in fraction features with (:obj:`None`,
:obj:`"mean"`, :obj:`"median"`, :obj:`"most_frequent"`).
(default: :obj:`"most_frequent"`)
to_undirected (bool, optional): Whether to convert a directed graph
to an undirected one. Does not affect undirected graphs.
(default: :obj:`True`)
transform (callable, optional): A function/transform that takes in an
:obj:`torch_geometric.data.Data` object and returns a transformed
version. The data object will be transformed before every access.
(default: :obj:`None`)
pre_transform (callable, optional): A function/transform that takes in
an :obj:`torch_geometric.data.Data` object and returns a
transformed version. The data object will be transformed before
being saved to disk. (default: :obj:`None`)
force_reload (bool, optional): Whether to re-process the dataset.
(default: :obj:`False`)
**STATS:**
.. list-table::
:widths: 14 10 10 10 14
:header-rows: 1
* - Name
- #nodes
- #edges
- is directed
- task
* - :obj:`hm-categories`
- 46,563
- 21,461,990
- False
- multiclass
* - :obj:`pokec-regions`
- 1,632,803
- 30,622,564
- True
- multiclass
* - :obj:`web-topics`
- 2,890,331
- 12,895,369
- True
- multiclass
* - :obj:`tolokers-2`
- 11,758
- 1,038,000
- False
- binclass
* - :obj:`city-reviews`
- 148,801
- 2,330,830
- False
- binclass
* - :obj:`artnet-exp`
- 50,405
- 560,696
- False
- binclass
* - :obj:`web-fraud`
- 2,890,331
- 12,895,369
- True
- binclass
* - :obj:`hm-prices`
- 46,563
- 21,461,990
- False
- regression
* - :obj:`avazu-ctr`
- 76,269
- 21,968,154
- False
- regression
* - :obj:`city-roads-M`
- 57,073
- 132,571
- True
- regression
* - :obj:`city-roads-L`
- 142,257
- 279,062
- True
- regression
* - :obj:`twitch-views`
- 168,114
- 13,595,114
- False
- regression
* - :obj:`artnet-views`
- 50,405
- 560,696
- False
- regression
* - :obj:`web-traffic`
- 2,890,331
- 12,895,369
- True
- regression
"""
# Base URL of the record the dataset archives are downloaded from.
_url = 'https://zenodo.org/records/16895532'
# Maps each supported dataset name to its task type. The names are grouped
# by task; the resulting key order matches the order of the groups below.
GRAPHLAND_DATASETS = {
    **dict.fromkeys(
        (
            'hm-categories',
            'pokec-regions',
            'web-topics',
        ),
        'multiclass_classification',
    ),
    **dict.fromkeys(
        (
            'tolokers-2',
            'city-reviews',
            'artnet-exp',
            'web-fraud',
        ),
        'binary_classification',
    ),
    **dict.fromkeys(
        (
            'hm-prices',
            'avazu-ctr',
            'city-roads-M',
            'city-roads-L',
            'twitch-views',
            'artnet-views',
            'web-traffic',
        ),
        'regression',
    ),
}
def __init__(
    self,
    root: str,
    name: str,
    split: str,
    numerical_features_transform: Optional[str] = 'default',
    fraction_features_transform: Optional[str] = 'default',
    categorical_features_transform: Optional[str] = 'one_hot_encoding',
    regression_targets_transform: Optional[str] = 'default',
    numerical_features_nan_imputation_strategy: Optional[
        str] = 'most_frequent',
    fraction_features_nan_imputation_strategy: Optional[
        str] = 'most_frequent',
    to_undirected: bool = True,
    transform: Optional[Callable] = None,
    pre_transform: Optional[Callable] = None,
    force_reload: bool = False,
) -> None:
    """Validate the configuration, resolve defaults and load the data.

    The arguments are described in the class docstring. The steps are:

    1. check ``name`` and ``split`` (temporal splits are rejected for the
       datasets that do not provide them);
    2. replace every ``'default'`` transform by its concrete choice;
    3. check that the resolved transforms are supported;
    4. store the configuration and the preprocessing factories;
    5. run the base-class initialisation and load the processed file.

    Raises:
        AssertionError: If the dataset name, the split or one of the
            transforms is not supported.
    """
    assert name in self.GRAPHLAND_DATASETS, (
        f'Unsupported dataset name: {name}')
    assert split in ['RL', 'RH', 'TH', 'THI'], \
        f'Unsupported split name: {split}'

    if split in ['TH', 'THI']:
        # The names must match the keys of GRAPHLAND_DATASETS exactly,
        # otherwise the corresponding dataset slips through this guard.
        assert name not in [
            'city-reviews',
            'city-roads-M',
            'city-roads-L',
            'web-traffic',
        ], ('Temporal split is not available for city-reviews, '
            'city-roads-M, city-roads-L, web-traffic.')

    # >>> resolve the 'default' placeholders
    if numerical_features_transform == 'default':
        numerical_features_transform = 'quantile_transform_normal'

    if fraction_features_transform == 'default':
        # Only these two datasets get a fraction-feature transform by
        # default; for all others the fractions are left untouched.
        fraction_features_transform = (
            'quantile_transform_normal'
            if name in ['artnet-views', 'avazu-ctr'] else None)

    if regression_targets_transform == 'default':
        regression_targets_transform = 'standard_scaler'

    # >>> validate the resolved transforms (None always means "disabled")
    if numerical_features_transform is not None:
        assert numerical_features_transform in [
            'standard_scaler',
            'min_max_scaler',
            'quantile_transform_normal',
            'quantile_transform_uniform',
        ], ('Unsupported numerical features transform: '
            f'{numerical_features_transform}')

    if fraction_features_transform is not None:
        assert fraction_features_transform in [
            'standard_scaler',
            'min_max_scaler',
            'quantile_transform_normal',
            'quantile_transform_uniform',
        ], ('Unsupported fraction features transform: '
            f'{fraction_features_transform}')

    if categorical_features_transform is not None:
        assert categorical_features_transform == 'one_hot_encoding', (
            'Unsupported categorical features transform: '
            f'{categorical_features_transform}')

    if regression_targets_transform is not None:
        assert regression_targets_transform in [
            'standard_scaler', 'min_max_scaler'
        ], ('Unsupported regression targets transform: '
            f'{regression_targets_transform}')

    # >>> store the configuration
    # These attributes are read by `raw_dir` / `processed_dir` and by the
    # processing methods. Presumably the base-class initialiser relies on
    # them, so they are set before `super().__init__` is called.
    self.name = name
    self.split = split
    self.task = self.GRAPHLAND_DATASETS[name]
    self._num_transform = numerical_features_transform
    self._frac_transform = fraction_features_transform
    self._cat_transform = categorical_features_transform
    self._reg_transform = regression_targets_transform
    self._num_imputation = numerical_features_nan_imputation_strategy
    self._frac_imputation = fraction_features_nan_imputation_strategy
    self._to_undirected = to_undirected

    # scikit-learn is imported lazily, i.e. only when a dataset object is
    # actually created.
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import (
        MinMaxScaler,
        OneHotEncoder,
        QuantileTransformer,
        StandardScaler,
    )

    # Factories (not instances): every use creates a fresh estimator.
    self._transforms = {
        'standard_scaler':
        partial(StandardScaler, copy=False),
        'min_max_scaler':
        partial(MinMaxScaler, clip=False, copy=False),
        'quantile_transform_normal':
        partial(
            QuantileTransformer,
            output_distribution='normal',
            subsample=None,
            random_state=0,
            copy=False,
        ),
        'quantile_transform_uniform':
        partial(
            QuantileTransformer,
            output_distribution='uniform',
            subsample=None,
            random_state=0,
            copy=False,
        ),
        'one_hot_encoding':
        partial(
            OneHotEncoder,
            drop='if_binary',
            sparse_output=False,
            handle_unknown='ignore',
            dtype=np.float32,
        ),
    }
    self._imputer = SimpleImputer

    super().__init__(root, transform, pre_transform,
                     force_reload=force_reload)
    self.load(self.processed_paths[0])
@property
def raw_dir(self) -> str:
    """Directory holding the raw files: ``<root>/<name>/raw``."""
    path_parts = (self.root, self.name, 'raw')
    return osp.join(*path_parts)
@property
def processed_dir(self) -> str:
    """Directory holding the processed file for the current settings.

    The last path component encodes every preprocessing option (each one
    lower-cased and prefixed with ``__``), so differently configured
    variants of the same dataset end up in different directories.
    """
    settings = (
        self.split,
        self._num_transform,
        self._frac_transform,
        self._cat_transform,
        self._reg_transform,
        self._num_imputation,
        self._frac_imputation,
        self._to_undirected,
    )
    suffix = ''
    for setting in settings:
        suffix += '__' + str(setting).lower()
    return osp.join(self.root, self.name, 'processed', suffix)
@property
def raw_file_names(self) -> str:
    """Name of the raw entry in ``raw_dir``: the dataset name itself."""
    dataset_name = self.name
    return dataset_name
@property
def processed_file_names(self) -> str:
    """Name of the single processed file stored in ``processed_dir``."""
    file_name = 'data.pt'
    return file_name
def download(self) -> None:
    """Download the dataset archive, unpack it and delete the archive.

    The archive ``<name>.zip`` is fetched from the ``files`` section of
    ``self._url`` into ``self.raw_dir`` and extracted there.
    """
    # Build the URL with explicit forward slashes: `osp.join` uses the
    # platform's path separator and would produce backslashes on Windows.
    zip_url = f'{self._url}/files/{self.name}.zip'
    path = download_url(zip_url, self.raw_dir)
    extract_zip(path, self.raw_dir)
    # The archive is no longer needed once its content is extracted.
    os.unlink(path)
def _get_raw_data(self) -> dict:
    """Read the raw dataset files into NumPy arrays.

    Files are read from ``<raw_dir>/<name>``: ``info.yaml``,
    ``features.csv``, ``targets.csv``, ``split_masks_<XX>.csv`` (``<XX>``
    being the first two characters of the split name) and
    ``edgelist.csv``.

    Returns:
        A dict with the keys ``'info'`` (parsed ``info.yaml``),
        ``'num_features'`` (float32, numerical features that are not
        fraction features), ``'cat_features'`` (int32),
        ``'frac_features'`` (float32), ``'targets'`` (float32),
        ``'masks'`` (one boolean array per column of the masks file) and
        ``'edges'`` (the values of the edge list file).
    """
    import pandas as pd

    data_dir = osp.join(self.raw_dir, self.name)

    def read_indexed_csv(file_name):
        # The first column of these files is used as the row index.
        return pd.read_csv(osp.join(data_dir, file_name), index_col=0)

    info = _load_yaml(osp.join(data_dir, 'info.yaml'))

    features_df = read_indexed_csv('features.csv')

    # Fraction features are also listed as numerical features; they are
    # handled separately, so drop them from the numerical columns.
    numerical_cols = []
    for col in info['numerical_features_names']:
        if col not in info['fraction_features_names']:
            numerical_cols.append(col)
    num_features = features_df[numerical_cols].values.astype(np.float32)

    categorical_cols = info['categorical_features_names']
    cat_features = features_df[categorical_cols].values.astype(np.int32)

    fraction_cols = info['fraction_features_names']
    frac_features = features_df[fraction_cols].values.astype(np.float32)

    targets_df = read_indexed_csv('targets.csv')
    targets = targets_df[info['target_name']].values.astype(np.float32)

    # 'TH' and 'THI' share the same masks file.
    masks_df = read_indexed_csv(f'split_masks_{self.split[:2]}.csv')
    masks = {}
    for column, values in masks_df.to_dict('list').items():
        masks[column] = np.array(values, dtype=bool)

    edges = pd.read_csv(osp.join(data_dir, 'edgelist.csv')).values

    return dict(
        info=info,
        num_features=num_features,
        cat_features=cat_features,
        frac_features=frac_features,
        targets=targets,
        masks=masks,
        edges=edges,
    )
def _get_transductive_data(self) -> list[Data]:
    """Build the single graph used by the transductive splits.

    Feature transforms and imputers are fitted on all nodes; the
    regression target transform is fitted on the train nodes only.

    Returns:
        A one-element list with a ``Data`` object holding ``edge_index``,
        ``x``, ``y``, the ``train_mask`` / ``val_mask`` / ``test_mask``
        node masks (restricted to labeled nodes) and three boolean masks
        over the feature columns (``x_numerical_mask``,
        ``x_fraction_mask``, ``x_categorical_mask``).
    """
    raw_data = self._get_raw_data()
    masks = raw_data['masks']

    def scale_and_impute(values, transform_name, imputation):
        # The transform is fitted first, i.e. on the raw values before
        # NaNs are filled in, and applied after imputation.
        transform = None
        if transform_name is not None:
            transform = self._transforms[transform_name]()
            transform.fit(values)
        # A `None` strategy means "do not impute". It must not be
        # forwarded to the imputer, which does not accept it.
        if imputation is not None:
            values = self._imputer(
                missing_values=np.nan, strategy=imputation,
                copy=False).fit_transform(values)
        if transform is not None:
            values = transform.transform(values)
        return values

    # >>> process targets
    targets = raw_data['targets']
    # Nodes with a NaN target are unlabeled and excluded from all splits.
    labeled_mask = ~np.isnan(targets)
    if (raw_data['info']['task'] == 'regression'
            and self._reg_transform is not None):
        targets = targets.reshape(-1, 1)
        transform = self._transforms[self._reg_transform]()
        transform.fit(targets[masks['train']])
        targets = transform.transform(targets).reshape(-1)
    targets = torch.from_numpy(targets).float()

    # >>> process numerical features
    num_features = raw_data['num_features']
    if num_features.size > 0:
        num_features = scale_and_impute(
            num_features, self._num_transform, self._num_imputation)

    # >>> process fraction features
    frac_features = raw_data['frac_features']
    if frac_features.size > 0:
        frac_features = scale_and_impute(
            frac_features, self._frac_transform, self._frac_imputation)

    # >>> process categorical features
    cat_features = raw_data['cat_features']
    if cat_features.size > 0 and self._cat_transform is not None:
        cat_features = (self._transforms[self._cat_transform]
                        ().fit_transform(cat_features))

    # >>> concatenate features and make features mask
    features = np.concatenate(
        [num_features, frac_features, cat_features],
        axis=1,
    )
    features = torch.from_numpy(features).float()

    # Columns are laid out as [numerical | fraction | categorical].
    num_end = num_features.shape[1]
    frac_end = num_end + frac_features.shape[1]
    num_mask = torch.zeros(features.shape[1], dtype=torch.bool)
    num_mask[:num_end] = True
    frac_mask = torch.zeros(features.shape[1], dtype=torch.bool)
    frac_mask[num_end:frac_end] = True
    cat_mask = torch.zeros(features.shape[1], dtype=torch.bool)
    cat_mask[frac_end:] = True

    # >>> update split masks
    train_mask = torch.from_numpy(masks['train'] & labeled_mask).bool()
    val_mask = torch.from_numpy(masks['val'] & labeled_mask).bool()
    test_mask = torch.from_numpy(masks['test'] & labeled_mask).bool()

    # >>> make edge index
    edge_index = torch.from_numpy(raw_data['edges'].T).long()

    # >>> construct Data object
    data = Data(
        edge_index=edge_index,
        x=features,
        y=targets,
        train_mask=train_mask,
        val_mask=val_mask,
        test_mask=test_mask,
        x_numerical_mask=num_mask,
        x_fraction_mask=frac_mask,
        x_categorical_mask=cat_mask,
    )

    return [data]
def _get_inductive_data(self) -> list[Data]:
    """Build the train/val/test snapshot graphs of the inductive split.

    All transforms and imputers are fitted on the train nodes only and
    then applied to every node. The three snapshots are nested: the train
    graph contains the train nodes, the val graph the train and val
    nodes, and the test graph the train, val and test nodes.

    Returns:
        ``[train_data, val_data, test_data]``. Each ``Data`` object holds
        ``snapshot`` (its name), the relabeled ``edge_index`` of the
        induced subgraph, ``x``, ``y``, ``mask`` (labeled nodes of the
        corresponding split), the three feature-column masks and
        ``cross_snapshot_node_id`` (the node indices in the full graph).
    """
    raw_data = self._get_raw_data()
    masks = raw_data['masks']
    # Rows used to fit every transform and imputer.
    fit_mask = masks['train']

    def scale_and_impute(values, transform_name, imputation):
        # The transform is fitted first, i.e. on the raw train values
        # before NaNs are filled in, and applied after imputation.
        transform = None
        if transform_name is not None:
            transform = self._transforms[transform_name]()
            transform.fit(values[fit_mask])
        # A `None` strategy means "do not impute". It must not be
        # forwarded to the imputer, which does not accept it.
        if imputation is not None:
            imputer = self._imputer(missing_values=np.nan,
                                    strategy=imputation, copy=False)
            imputer.fit(values[fit_mask])
            values = imputer.transform(values)
        if transform is not None:
            values = transform.transform(values)
        return values

    # >>> process targets
    targets = raw_data['targets']
    # Nodes with a NaN target are unlabeled and never evaluated on.
    labeled_mask = ~np.isnan(targets)
    if (raw_data['info']['task'] == 'regression'
            and self._reg_transform is not None):
        targets = targets.reshape(-1, 1)
        transform = self._transforms[self._reg_transform]()
        transform.fit(targets[fit_mask])
        targets = transform.transform(targets).reshape(-1)
    targets = torch.from_numpy(targets).float()

    # >>> process numerical features
    num_features = raw_data['num_features']
    if num_features.size > 0:
        num_features = scale_and_impute(
            num_features, self._num_transform, self._num_imputation)

    # >>> process fraction features
    frac_features = raw_data['frac_features']
    if frac_features.size > 0:
        frac_features = scale_and_impute(
            frac_features, self._frac_transform, self._frac_imputation)

    # >>> process categorical features
    cat_features = raw_data['cat_features']
    if cat_features.size > 0 and self._cat_transform is not None:
        transform = self._transforms[self._cat_transform]()
        transform.fit(cat_features[fit_mask])
        cat_features = transform.transform(cat_features)

    # >>> concatenate features and make features mask
    features = np.concatenate(
        [num_features, frac_features, cat_features],
        axis=1,
    )
    features = torch.from_numpy(features).float()

    # Columns are laid out as [numerical | fraction | categorical].
    num_end = num_features.shape[1]
    frac_end = num_end + frac_features.shape[1]
    num_mask = torch.zeros(features.shape[1], dtype=torch.bool)
    num_mask[:num_end] = True
    frac_mask = torch.zeros(features.shape[1], dtype=torch.bool)
    frac_mask[num_end:frac_end] = True
    cat_mask = torch.zeros(features.shape[1], dtype=torch.bool)
    cat_mask[frac_end:] = True

    # >>> construct Data objects
    edge_index = torch.from_numpy(raw_data['edges'].T).long()

    def make_snapshot(snapshot, graph_nodes, split_nodes):
        # `graph_nodes` selects the nodes present in this snapshot,
        # `split_nodes` the nodes of the split it is used for.
        graph_mask = torch.from_numpy(graph_nodes).bool()
        label_mask = torch.from_numpy(split_nodes & labeled_mask).bool()
        node_id = torch.from_numpy(np.where(graph_nodes)[0]).long()
        snapshot_edge_index, _ = subgraph(
            graph_mask,
            edge_index,
            relabel_nodes=True,
        )
        return Data(
            snapshot=snapshot,
            edge_index=snapshot_edge_index,
            x=features[graph_mask],
            y=targets[graph_mask],
            mask=label_mask[graph_mask],
            x_numerical_mask=num_mask,
            x_fraction_mask=frac_mask,
            x_categorical_mask=cat_mask,
            cross_snapshot_node_id=node_id,
        )

    train_nodes = masks['train']
    val_nodes = masks['train'] | masks['val']
    test_nodes = masks['train'] | masks['val'] | masks['test']

    train_data = make_snapshot('train', train_nodes, masks['train'])
    val_data = make_snapshot('val', val_nodes, masks['val'])
    test_data = make_snapshot('test', test_nodes, masks['test'])

    return [train_data, val_data, test_data]
def process(self) -> None:
    """Build the graph(s) for the configured split and save them.

    The ``'RL'``, ``'RH'`` and ``'TH'`` splits use the transductive
    builder, any other split the inductive one. If ``_to_undirected`` is
    set, every graph is passed through ``ToUndirected`` before saving.
    """
    if self.split in ['RL', 'RH', 'TH']:
        graphs = self._get_transductive_data()
    else:
        graphs = self._get_inductive_data()

    if self._to_undirected:
        symmetrize = ToUndirected()
        graphs = [symmetrize(graph) for graph in graphs]

    self.save(graphs, self.processed_paths[0])
def __repr__(self) -> str:
    """Return ``<ClassName>(name=<dataset name>)``."""
    class_name = self.__class__.__name__
    return '{}(name={})'.format(class_name, self.name)