import os
from typing import Any, Callable, List, Optional, Tuple
import torch
from torch_geometric.data import (
Data,
InMemoryDataset,
download_url,
extract_zip,
)
class EllipticBitcoinDataset(InMemoryDataset):
    r"""The Elliptic Bitcoin dataset of Bitcoin transactions from the
    `"Anti-Money Laundering in Bitcoin: Experimenting with Graph Convolutional
    Networks for Financial Forensics" <https://arxiv.org/abs/1908.02591>`_
    paper.

    :class:`EllipticBitcoinDataset` maps Bitcoin transactions to real entities
    belonging to licit categories (exchanges, wallet providers, miners,
    licit services, etc.) versus illicit ones (scams, malware, terrorist
    organizations, ransomware, Ponzi schemes, etc.)

    There exist 203,769 node transactions and 234,355 directed edge payment
    flows, with two percent of nodes (4,545) labelled as illicit, and
    twenty-one percent of nodes (42,019) labelled as licit.
    The remaining transactions are unknown.

    The processed graph stores labels in :obj:`y` encoded as
    :obj:`0` (licit), :obj:`1` (illicit) and :obj:`2` (unknown).
    :obj:`train_mask` selects labelled nodes with a time step below 35 and
    :obj:`test_mask` selects labelled nodes with a time step of 35 or above.

    Args:
        root (str): Root directory where the dataset should be saved.
        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.Data` object and returns a transformed
            version. The data object will be transformed before every access.
            (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)

    **STATS:**

    .. list-table::
        :widths: 10 10 10 10
        :header-rows: 1

        * - #nodes
          - #edges
          - #features
          - #classes
        * - 203,769
          - 234,355
          - 165
          - 2
    """
    # Base URL; each raw file is fetched from `<url>/<file_name>.zip`.
    url = 'https://data.pyg.org/datasets/elliptic'

    def __init__(self, root: str, transform: Optional[Callable] = None,
                 pre_transform: Optional[Callable] = None) -> None:
        super().__init__(root, transform, pre_transform)
        # Load the collated graph that `process` wrote to disk.
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self) -> List[str]:
        """Names of the raw CSV files: features, edge list, class labels.

        The order matters: `process` reads them positionally through
        `self.raw_paths`.
        """
        return [
            'elliptic_txs_features.csv',
            'elliptic_txs_edgelist.csv',
            'elliptic_txs_classes.csv',
        ]

    @property
    def processed_file_names(self) -> str:
        """Name of the single file holding the processed graph."""
        return 'data.pt'

    def download(self) -> None:
        """Download and unpack every raw file, deleting each archive."""
        for file_name in self.raw_file_names:
            path = download_url(f'{self.url}/{file_name}.zip', self.raw_dir)
            extract_zip(path, self.raw_dir)
            os.remove(path)

    def _process_df(self, feat_df: Any, edge_df: Any,
                    class_df: Any) -> Tuple[Any, Any, Any]:
        """Hook applied to the raw frames before tensors are built.

        This implementation returns the three frames unchanged; a subclass
        may override it to filter or re-align them.

        Args:
            feat_df: Feature frame with columns 0 and 1 already renamed to
                ``'txId'`` and ``'time_step'``.
            edge_df: Edge-list frame as read from disk.
            class_df: Class-label frame as read from disk.

        Returns:
            The ``(feat_df, edge_df, class_df)`` triple to process further.
        """
        return feat_df, edge_df, class_df

    def process(self) -> None:
        """Build the graph from the raw CSV files and save it to disk."""
        # Imported here rather than at module level, so pandas is only
        # required when processing actually runs.
        import pandas as pd

        feat_df = pd.read_csv(self.raw_paths[0], header=None)
        edge_df = pd.read_csv(self.raw_paths[1])
        class_df = pd.read_csv(self.raw_paths[2])

        # The feature file is read without a header, so its columns carry
        # integer labels; name the first two.
        feat_df = feat_df.rename(columns={0: 'txId', 1: 'time_step'})

        feat_df, edge_df, class_df = self._process_df(
            feat_df,
            edge_df,
            class_df,
        )

        # Every column from integer label 2 onwards is a node feature.
        x = torch.from_numpy(feat_df.loc[:, 2:].values).to(torch.float)

        # There exist 3 different classes in the dataset:
        # 0=licit, 1=illicit, 2=unknown
        # NOTE(review): labels are taken in row order, so `class_df` rows
        # are assumed to line up with `feat_df` rows; confirm this for any
        # `_process_df` override.
        label_map = {'unknown': 2, '1': 1, '2': 0}
        # Map into a fresh Series instead of writing back into `class_df`,
        # so the frame returned by the hook is not mutated.
        y = torch.from_numpy(class_df['class'].map(label_map).values)

        # Translate transaction ids into consecutive node indices that
        # follow the row order of `feat_df`.
        node_index = {
            tx_id: i for i, tx_id in enumerate(feat_df['txId'].values)
        }
        # `assign` overwrites both columns on a new frame (column order is
        # preserved), leaving the hook's `edge_df` untouched.
        indexed_edges = edge_df.assign(
            txId1=edge_df['txId1'].map(node_index),
            txId2=edge_df['txId2'].map(node_index),
        )
        edge_index = torch.from_numpy(indexed_edges.values).t().contiguous()

        # Timestamp based split:
        # train_mask: 1 - 34 time_step, test_mask: 35-49 time_step
        # Nodes labelled as unknown (2) are excluded from both masks.
        time_step = torch.from_numpy(feat_df['time_step'].values)
        is_labelled = y != 2
        train_mask = (time_step < 35) & is_labelled
        test_mask = (time_step >= 35) & is_labelled

        data = Data(x=x, edge_index=edge_index, y=y, train_mask=train_mask,
                    test_mask=test_mask)

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        torch.save(self.collate([data]), self.processed_paths[0])

    @property
    def num_classes(self) -> int:
        """Number of labelled classes (licit and illicit).

        The label tensor also contains the value 2 for unlabelled
        transactions, which the train/test masks exclude; only the two
        labelled classes are counted here.
        """
        return 2