forked from erdembocugoz/Recommender-System
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathblend.py
130 lines (111 loc) · 5.18 KB
/
blend.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""
This file contains functions for blending models.
**REMARK** It is not recommended to run the pySpark ALS algorithm with a maxIter value greater than 24, because it may raise an error after 24 iterations.
"""
import numpy as np
import pandas as pd
import os
import random
from sklearn.model_selection import train_test_split
from surprise import *
from surprise.model_selection import PredefinedKFold
from surprise.model_selection import GridSearchCV
from models.model_pyfm import *
from models.model_surprise import *
from models.model_pyspark import *
from models.model_means import *
from models.model_matrixfactorization import *
from matrix_fact_helpers import *
from sklearn.linear_model import Ridge
def make_dataframe(filepath):
    """
    Read a csv file of (Id, Prediction) rows and turn it into a
    3-column (User, Movie, Rating) data frame.

    Ids look like 'r<user>_c<movie>'; the leading letter of each half is
    stripped and the remainder parsed as an integer.

    Args:
        filepath (string): path to csv file
    Returns:
        pandas.DataFrame: df with columns User, Movie, Rating
    """
    df = pd.read_csv(filepath)
    # Split 'rX_cY' into its two halves once, then strip the type letter.
    id_parts = df['Id'].str.split('_', expand=True)
    df['User'] = id_parts[0].str[1:].astype(int)
    df['Movie'] = id_parts[1].str[1:].astype(int)
    df['Rating'] = df['Prediction']
    return df.drop(['Id', 'Prediction'], axis=1)
def make_datasets(train_path="data/data_train.csv",predict_path="data/data_test.csv"):
    """
    Read the training csv file and split it 50/50 into train and test
    dataframes for blending (voting) models; also read the csv file that
    is to be predicted. The two splits are additionally dumped to
    tmp_train.csv / tmp_test.csv for models that load data from disk.

    Args:
        train_path (string): path to training csv file
        predict_path (string): path to csv file to be predicted
    Returns:
        pandas.DataFrame: data_train : train dataframe from train-test split
        pandas.DataFrame: data_test : test dataframe from train-test split
        pandas.DataFrame: data_actual_train : full training data for final predictions
        pandas.DataFrame: data_actual_predict : final data to be predicted
    """
    df_train = make_dataframe(train_path)
    df_predict = make_dataframe(predict_path)
    # Fixed random_state keeps the split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        df_train[['User', 'Movie']], df_train['Rating'],
        test_size=0.5, random_state=56)
    data_train = X_train.join(y_train)
    data_test = X_test.join(y_test)
    data_actual_train = df_train
    data_actual_predict = df_predict
    train_file = 'tmp_train.csv'
    test_file = 'tmp_test.csv'
    data_train.to_csv(train_file, index=False, header=False)
    # BUG FIX: the test split was previously written to train_file as well,
    # clobbering the training dump; it must go to test_file.
    data_test.to_csv(test_file, index=False, header=False)
    return data_train, data_test, data_actual_train, data_actual_predict
def get_predicts(data_train,data_test,spark_context):
    """
    Fit every base model on the train split and predict the test split.

    The train/test splits are first dumped to csv because the
    surprise-based models load their data from files.

    Args:
        data_train (pandas.DataFrame): train set
        data_test (pandas.DataFrame): test set to be predicted
        spark_context: Spark context handed to the pyspark ALS model
    Returns:
        numpy.array: one row of predictions per model, stacked in the order
        baseline, slopeOne, knn user-based, knn item-based, SVD, SVD++, ALS, pyFM
    """
    train_file = 'tmp_train.csv'
    test_file = 'tmp_test.csv'
    data_train.to_csv(train_file, index=False, header=False)
    data_test.to_csv(test_file, index=False, header=False)
    # NOTE: training calls are kept in the original order; some backends
    # share global state (RNG, spark), so the sequence is preserved.
    pred_als = pyspark_ALS(data_train, data_test, spark_context)
    pred_svdpp = surprise_SVDpp(train_file, test_file)
    pred_pyfm = pyfm_predict(data_train, data_test)
    pred_baseline = surprise_baseline(train_file, test_file)
    pred_slopeone = surprise_slopeOne(train_file, test_file)
    pred_knn_ub = surprise_knn_ub(train_file, test_file)
    pred_knn_ib = surprise_knn_ib(train_file, test_file)
    pred_svd = surprise_SVD(train_file, test_file)
    return np.array([pred_baseline, pred_slopeone, pred_knn_ub, pred_knn_ib,
                     pred_svd, pred_svdpp, pred_als, pred_pyfm])
def calculate_rmse(real_labels, predictions):
    """Return the root-mean-square error between labels and predictions."""
    residuals = real_labels - predictions
    # sqrt(sum of squared residuals / n) == ||residuals|| / sqrt(n)
    return np.sqrt(np.square(residuals).sum() / len(real_labels))
def get_weights(all_preds,target):
    """
    Ensemble models by voting/blending: learn one weight per model with
    Ridge Regression (no intercept) and report the blended RMSE.

    Args:
        all_preds (numpy.array): array of prediction arrays, one row per model
        target (numpy.array): true ratings of the test split, for the rmse
    Returns:
        weights: dict : model name -> learned blend weight
        linreg: sklearn.linear_model.Ridge : fitted Ridge Regression model
        rmse: float : final rmse after blending models
    """
    # Names for every model this pipeline can produce; the trailing five are
    # currently disabled in get_predicts, so only the first len(coef_) names
    # are actually used.
    model_names = ["baseline_pred", "slopeone_pred", "knn_ub_pred", "knn_ib_pred",
                   "svd_pred", "svdpp_pred", "als_pred", "pyfm_pred",
                   "pred_globalmean", "pred_usermean", "pred_itemmean",
                   "pred_sgd", "pred_als"]
    linreg = Ridge(alpha=0.1, fit_intercept=False)
    # fit expects samples as rows, so transpose the (models x samples) array.
    linreg.fit(all_preds.T, target)
    # BUG FIX (clarity): zip silently truncated the 13 hard-coded names to the
    # number of coefficients; make that pairing explicit and length-safe.
    weights = dict(zip(model_names[:len(linreg.coef_)], linreg.coef_))
    rmse = calculate_rmse(target, linreg.predict(all_preds.T))
    print(rmse)
    return weights, linreg, rmse