-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctions_exploratory_data_analysis.py
131 lines (124 loc) · 5.43 KB
/
functions_exploratory_data_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, f_oneway, lognorm, levy, skew, chisquare
# Functions for exploratory data analysis
def visualize_continuous(df,label,method={'type':'histogram','bins':20},outlier='on'):
    """
    Quickly visualize a continuous variable and how the tip percentage varies with it.

    Two panels are drawn side by side: on the left the distribution of df[label]
    (histogram or box plot), on the right a scatter of df.Tip_percentage against df[label].

    df: pandas.DataFrame, must contain the columns <label> and 'Tip_percentage'
    label: str, name of the variable to be plotted. It should be present in df.columns
    method: dict, describes the left-hand plot. method['type'] is 'histogram' or 'boxplot';
            method['bins'] is the number of histogram bins (20 when omitted).
            The default dict is shared between calls and is only read, never modified.
    outlier: {'on','off'}, set it to 'off' to cut off outliers. Outliers are all the points
             located more than 3 standard deviations away from the mean, on either side.
    """
    # vector of the variable of interest
    values = df[label]
    # mean and standard deviation are computed on the full sample, before any trimming
    mean = values.mean()
    std = values.std()
    # prep the figure
    fig, ax = plt.subplots(1, 2, figsize=(14, 4))
    title_suffix = ''
    if outlier == 'off':
        # use the absolute deviation so that both tails are trimmed, not only the upper one
        values = values[(values - mean).abs() <= 3 * std]
        title_suffix = '(no outliers)'
    ax[0].set_title('Distribution of ' + label + title_suffix)
    ax[1].set_title('Tip % by ' + label + title_suffix)

    plot_type = method['type']
    if plot_type == 'histogram':
        # fall back to the documented default when the caller leaves 'bins' out
        values.hist(bins=method.get('bins', 20), ax=ax[0])
    if plot_type == 'boxplot':
        df.loc[values.index].boxplot(label, ax=ax[0])

    # tip percentage of the rows that survived the outlier filter
    ax[1].plot(values, df.loc[values.index].Tip_percentage, '.', alpha=0.4)
    ax[0].set_xlabel(label)
    ax[1].set_xlabel(label)
    ax[0].set_ylabel('Count')
    ax[1].set_ylabel('Tip (%)')
def visualize_categories(df,catName,chart_type='histogram',ylimit=[None,None]):
    """
    Quickly visualize a categorical variable against the tip percentage.

    The requested chart is drawn by generate_boxplot or generate_histogram, then a test
    comparing the mean Tip_percentage of the categories and the frequency of each
    category are printed.

    df: pandas.DataFrame, must contain the columns <catName> and 'Tip_percentage'
    catName: str, variable name, it must be present in df
    chart_type: {'histogram','boxplot'}, type of chart to plot. Any other value skips
                the chart and only prints the statistics.
    ylimit: tuple or list, y limits forwarded to generate_boxplot (used only when
            chart_type is 'boxplot'). The default list is only read, never modified.
    """
    print(catName)
    cats = sorted(pd.unique(df[catName]))
    if chart_type == 'boxplot':
        generate_boxplot(df, catName, ylimit)
    elif chart_type == 'histogram':
        generate_histogram(df, catName)

    #=> calculate test statistics
    tips = df.Tip_percentage
    # one sample of tip percentages per category, in the same order as cats
    samples = [tips[df[catName] == cat] for cat in cats]
    if len(samples) < 2:
        # a comparison needs at least two groups
        print("Fewer than two categories: no test statistic computed")
    elif len(samples) == 2:
        # only two groups: t-test
        print(ttest_ind(samples[0], samples[1]))
    else:
        # more than two groups: one-way anova, called directly so that any category
        # label (strings included) works
        print("one way anova test: %s" % (f_oneway(*samples),))
    print("Frequency of categories (%%):\n%s" % (df[catName].value_counts(normalize=True) * 100,))
def test_classification(df,label,yl=[0,50]):
    """
    Test whether a variable differs between transactions with and without a tip.

    The test statistic and p-value are printed; the caller decides on the significance
    level (e.g. p < 0.05 for a 95% confidence level).
    - variable with exactly two distinct values: chi-square test of independence between
      <label> and With_tip, computed on their contingency table
    - variable with 10 or more distinct values: box plot of <label> by With_tip followed
      by a two-sample t-test that does not assume equal variances
    - anything else: nothing is tested and a short note is printed

    df: pandas.DataFrame, must contain the columns <label> and 'With_tip' (0 or 1)
    label: str, label to test
    yl: tuple or list (default = [0,50]), y limits of the box plot. The default list is
        only read, never modified.
    Example: run <test_classification(data,'Fare_amount',[0,25])>
    """
    n_values = len(pd.unique(df[label]))
    if n_values == 2:
        # imported here because the module-level scipy import does not provide it
        from scipy.stats import chi2_contingency
        # rows: With_tip, columns: the two categories of <label>; counts are aligned
        # even when a category is absent from one of the two groups
        table = pd.crosstab(df.With_tip, df[label])
        chi2, pvalue, dof, _expected = chi2_contingency(table)
        print("chi-square test if %s can be used to distinguish transaction with tip and without tip" % (label,))
        print("results: chi2=%s, pvalue=%s, dof=%s" % (chi2, pvalue, dof))
    elif n_values >= 10:
        df.boxplot(label, by='With_tip')
        plt.ylim(yl)
        plt.show()
        without_tip = df[df.With_tip == 0][label].values
        with_tip = df[df.With_tip == 1][label].values
        print("t-test if %s can be used to distinguish transaction with tip and without tip" % (label,))
        print("results: %s" % (ttest_ind(without_tip, with_tip, equal_var=False),))
    else:
        # neither binary nor treated as continuous: say so instead of returning silently
        print("%s has %d distinct values: no test performed" % (label, n_values))


# Despite its name this is not a unit test: keep pytest from collecting it when the
# module is imported into a test module.
test_classification.__test__ = False
def generate_boxplot(df,catName,ylimit):
    """
    Draw and show a box plot of the tip percentage grouped by the variable "catName".

    df: pandas.DataFrame, must contain the columns <catName> and 'Tip_percentage'
    catName: str, name of the grouping variable
    ylimit: tuple or list, limits of the y axis; [None,None] leaves the automatic limits
    """
    use_custom_limits = ylimit != [None, None]
    df.boxplot('Tip_percentage', by=catName)
    # blank out the axes title
    plt.title('')
    plt.ylabel('Tip (%)')
    if use_custom_limits:
        plt.ylim(ylimit)
    plt.show()
def generate_histogram(df,catName):
    """
    Draw a grouped histogram of the tip percentage, one bar series per category of "catName".

    The bins are shared by all the categories and the counts are shown on a log scale.
    The figure is created but not shown; the caller decides when to display it.

    df: pandas.DataFrame, must contain the columns <catName> and 'Tip_percentage'
    catName: str, name of the categorical variable
    """
    cats = sorted(pd.unique(df[catName]))
    colors = plt.cm.jet(np.linspace(0, 1, len(cats)))

    # common bin edges, rounded to one decimal for readable tick values
    raw_edges = np.histogram(df.Tip_percentage, bins=20)[1]
    hx = np.round(raw_edges, 1)
    if np.any(np.diff(hx) <= 0):
        # the data range is too narrow for one-decimal edges: keep the exact ones
        hx = raw_edges.astype(float)
    else:
        # rounding must not push the outer edges inside the data range, otherwise
        # the smallest / largest tips would be silently dropped from the counts
        hx[0] = min(hx[0], raw_edges[0])
        hx[-1] = max(hx[-1], raw_edges[-1])

    # each bin is shared by the categories; 10% of the narrowest bin is left as a gap
    w = 0.9 * np.diff(hx).min() / float(max(len(cats), 1))

    fig, ax = plt.subplots(1, 1, figsize=(15, 4))
    for i, cat in enumerate(cats):
        vals = df[df[catName] == cat].Tip_percentage
        counts = np.histogram(vals, bins=hx)[0]
        # align='edge' keeps every bar inside its bin whatever the matplotlib default is
        ax.bar(hx[:-1] + w * i, counts, color=colors[i], width=w, align='edge')
    ax.legend(cats)
    ax.set_yscale('log')
    ax.set_title('Distribution of Tip by ' + catName)
    ax.set_xlabel('Tip (%)')