-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbayes_utils.py
141 lines (108 loc) · 4.29 KB
/
bayes_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# A set of tools for a from-scratch naive Bayes classifier:
# CSV loading, per-class Gaussian / discrete summaries, and
# Gaussian probability calculation.
import math
import csv
import string  # NOTE(review): not referenced in the visible code -- confirm before removing
import scipy.stats as stats
from collections import Counter
import multiprocessing
# NOTE(review): this Pool is created as an import-time side effect and is
# never used in the visible code; on spawn-based platforms (Windows/macOS)
# creating a Pool at module import can re-import the module recursively --
# confirm whether any caller actually uses `pool`.
pool = multiprocessing.Pool()
def loadCsv(filename):
    """Load a CSV file and return its rows as a list of lists of strings.

    The original opened the file and never closed it, leaking the handle;
    a `with` block releases it even if parsing raises.
    """
    # NOTE(review): "rb" is the Python 2 csv idiom (the rest of this file
    # is Python 2); under Python 3 csv requires text mode -- confirm the
    # target interpreter before changing the mode.
    with open(filename, "rb") as handle:
        dataset = [list(row) for row in csv.reader(handle)]
    return dataset
def uniform_data(dataset):
    """Normalize every field of every row: strip surrounding whitespace
    and remove all '.' characters.

    Some labels carry stray spaces or a trailing dot (e.g. ">50K."),
    which breaks exact label comparison; this cleans them up.
    """
    cleaned = []
    for row in dataset:
        cleaned.append([field.strip().replace(".", "") for field in row])
    return cleaned
def mean(numbers):
    """Return the arithmetic mean of a non-empty sequence of numbers.

    Raises ZeroDivisionError on an empty sequence (same as the original).
    """
    total = sum(numbers)
    count = len(numbers)
    return total / float(count)
def stdev(numbers):
    """Return the sample standard deviation (Bessel's n-1 denominator).

    Needs at least two numbers; a single value divides by zero, matching
    the original behaviour.
    """
    count = len(numbers)
    avg = sum(numbers) / float(count)
    squared_deviations = [(value - avg) ** 2 for value in numbers]
    return math.sqrt(sum(squared_deviations) / float(count - 1))
def separateByClass(dataset):
    """Group sample vectors by their label.

    Each vector is [feature1, ..., featureN, label]; the label is the
    last element.  Returns {label: [vector, ...]} preserving row order.
    """
    separated = {}
    for vector in dataset:
        separated.setdefault(vector[-1], []).append(vector)
    return separated
def summarize(dataset):
    """Compute (mean, sample stdev) for each feature column.

    zip(*dataset) transposes rows into per-feature columns; the final
    column holds the (numeric) labels, so its summary is discarded.
    """
    summaries = []
    for column in zip(*dataset):
        count = len(column)
        avg = sum(column) / float(count)
        variance = sum((v - avg) ** 2 for v in column) / float(count - 1)
        summaries.append((avg, math.sqrt(variance)))
    # The last entry summarizes the label column -- drop it.
    summaries.pop()
    return summaries
def discretize_variable(dataset, index):
    """Discretize one continuous column in place into percentile buckets.

    Each value is replaced by the string of its percentile-rank bucket:
    rank/n scaled to 0-100, divided by 25 and rounded.  Mutates and
    returns `dataset`.

    NOTE(review): the rounding yields up to 5 distinct buckets
    ('0'..'4'), not the 4 quartiles the original comment claims --
    confirm whether that is intended.
    """
    # Single-argument print() prints identically under Python 2 and 3;
    # the original `print "..."` statement was a SyntaxError under Py3.
    print("Discretizing... variable %s" % index)
    column = [sample[index] for sample in dataset]
    # Percentile rank in (0, 1]; tied values share their average rank.
    percentiles = stats.rankdata(column, "average") / len(column)
    for i, sample in enumerate(dataset):
        sample[index] = str(int(round((percentiles[i] * 100) / 25)))
    return dataset
def discrete_summarize(dataset):
    """Count occurrences of each discrete value per feature column.

    zip(*dataset) transposes rows into columns; each column becomes a
    {value: count} dict.  The last column is the label, so its tally is
    dropped.  Returns (per_feature_counts, number_of_rows).
    """
    tallies = []
    for column in zip(*dataset):
        counts = {}
        for value in column:
            counts[value] = counts.get(value, 0) + 1
        tallies.append(counts)
    # Remove the label column's tally (raises IndexError on an empty
    # dataset, same as the original `del summaries[-1]`).
    tallies.pop()
    return (tallies, len(dataset))
def summarizeByClass(dataset):
    """Build per-class Gaussian feature summaries.

    Returns {label: [(mean, stdev), ...]} with one tuple per feature,
    computed from only the rows carrying that label.
    """
    separated = separateByClass(dataset)
    summaries = {}
    # .items() replaces the Python-2-only .iteritems(): identical
    # behaviour under Py2, and the function now also runs under Py3.
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries
def discrete_summarize_by_class(dataset):
    """Build per-class discrete feature summaries.

    Returns {label: ([{value: count}, ...], n_rows_for_label)} -- the
    discrete counterpart of summarizeByClass.
    """
    separated = separateByClass(dataset)
    summaries = {}
    # .items() replaces the Python-2-only .iteritems(): identical
    # behaviour under Py2, and the function now also runs under Py3.
    for classValue, instances in separated.items():
        summaries[classValue] = discrete_summarize(instances)
    return summaries
def discrete_summarize_total(dataset):
    """Summarize the whole dataset with all classes pooled together.

    Thin wrapper kept for API symmetry with discrete_summarize_by_class.
    """
    return discrete_summarize(dataset)
def calculateProbability(x, mean, stdev):
    """Gaussian probability density of x under N(mean, stdev**2).

    Zero stdev divides by zero, matching the original behaviour.
    """
    coefficient = 1 / (math.sqrt(2 * math.pi) * stdev)
    exponent = -((x - mean) ** 2) / (2 * stdev ** 2)
    return coefficient * math.exp(exponent)
def calculateDiscreteProbability(x, mean, stdev):
    """Gaussian probability density of x under N(mean, stdev**2).

    NOTE(review): despite the name, the original body was byte-for-byte
    identical to calculateProbability (a continuous Gaussian density);
    delegating keeps the formula in one place without changing behaviour.
    A genuinely discrete estimate (e.g. relative frequency from
    discrete_summarize) may have been intended -- confirm with callers.
    """
    return calculateProbability(x, mean, stdev)
def calculateClassProbabilities(summaries, inputVector):
    """Likelihood of inputVector under each class's Gaussian summaries.

    summaries: {label: [(mean, stdev), ...]} as built by summarizeByClass.
    Returns {label: product of per-feature Gaussian densities} -- the
    naive-Bayes independence assumption; no class prior is applied here.
    """
    probabilities = {}
    # .items() replaces the Python-2-only .iteritems(): identical
    # behaviour under Py2, and the function now also runs under Py3.
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            # Local names mu/sigma avoid shadowing the module-level
            # mean()/stdev() helper functions.
            mu, sigma = classSummaries[i]
            probabilities[classValue] *= calculateProbability(
                inputVector[i], mu, sigma)
    return probabilities