-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdataset_utils.py
49 lines (38 loc) · 1.81 KB
/
dataset_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import numpy as np
from sklearn.datasets import load_svmlight_file, load_svmlight_files
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, add_dummy_feature
from sklearn.random_projection import GaussianRandomProjection
def _as_dense_array(X):
    """Return ``X`` as a dense ``numpy.ndarray``, densifying sparse input."""
    # scipy sparse matrices expose ``toarray``; anything without it is
    # treated as already array-like.
    if hasattr(X, "toarray"):
        return X.toarray()
    return np.asarray(X)


def load_dataset(
    dataset_name,
    test_size=.25,
    seed=0,
    add_intercept=True,
    scale=False,
    reduce_dim: int = None,
):
    """Load a multilabel svmlight dataset, pool it, and re-split it.

    Reads ``<dataset_name>_train.svm`` and ``<dataset_name>_test.svm``,
    optionally reduces the feature dimension with a Gaussian random
    projection, one-hot encodes the label sets, stacks both parts together
    and draws a fresh train/test split from the pooled data.

    Args:
        dataset_name: Prefix of the two svmlight files; ``'_train.svm'`` and
            ``'_test.svm'`` are appended to it verbatim.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        seed: Random state used for the train/test split and for the random
            projection, so that repeated calls give the same result.
        add_intercept: If true, prepend a constant dummy feature column to
            both feature matrices.
        scale: If true, standardize the features with a ``StandardScaler``
            fitted on the new training split only.
        reduce_dim: Target dimension of the random projection. ``None``
            means no projection, except for the ``'tmc2007'`` and
            ``'rcv1_topics'`` datasets, which default to 300. The name is
            compared verbatim, so a ``dataset_name`` carrying a directory
            prefix does not trigger this default.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test, labels)``. Note that
        this order differs from the one ``train_test_split`` returns.
        ``labels`` holds the binarizer's classes cast to ``int``, one per
        column of the ``y`` matrices.
    """
    # Load both files in one call so they share the same number of columns;
    # loading them separately lets each file infer its own feature count.
    X_train, y_train_, X_test, y_test_ = load_svmlight_files(
        [dataset_name + '_train.svm', dataset_name + '_test.svm'],
        multilabel=True,
    )

    if reduce_dim is None and dataset_name in ('tmc2007', 'rcv1_topics'):
        reduce_dim = 300
    if reduce_dim:
        print("reducing dimension for %s dataset" % dataset_name)
        # Seed the projection so the whole pipeline is reproducible.
        fh = GaussianRandomProjection(n_components=reduce_dim, random_state=seed)
        X_train = fh.fit_transform(X_train)
        X_test = fh.transform(X_test)

    X_train = _as_dense_array(X_train)
    X_test = _as_dense_array(X_test)

    # NOTE(review): the binarizer is fitted on the train file only, so labels
    # that occur solely in the test file are not part of ``classes_``.
    onehot_labeller = MultiLabelBinarizer()
    y_train = onehot_labeller.fit_transform(y_train_).astype(int)
    y_test = onehot_labeller.transform(y_test_).astype(int)

    # Pool the original parts and draw a new split.
    X_all = np.vstack([X_train, X_test])
    y_all = np.vstack([y_train, y_test])
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=test_size, random_state=seed
    )

    if scale:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # The intercept column is added after scaling: standardizing a constant
    # column would centre it to all zeros and erase the intercept.
    if add_intercept:
        X_train = add_dummy_feature(X_train)
        X_test = add_dummy_feature(X_test)

    labels = onehot_labeller.classes_.astype(int)
    print("X_train:", X_train.shape, "y_train:", y_train.shape)
    return X_train, y_train, X_test, y_test, labels