-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathinternal_clustering_evaluation.go
158 lines (124 loc) · 4.09 KB
/
internal_clustering_evaluation.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
package main
import (
"fmt"
"github.com/kniren/gota/dataframe"
"gonum.org/v1/gonum/floats"
"log"
"os"
)
// centroid is the center of a cluster, stored as one float64 coordinate per
// feature dimension.
type centroid []float64
// main treats the three Iris species in iris.csv as clusters, prints the
// centroid of each one, and then prints the average silhouette coefficient
// over every row of the dataset.
//
// For each row, a is the mean Euclidean distance to the rows of its own
// cluster and b is the mean Euclidean distance to the rows of the nearest
// other cluster; the row's score is (b - a) / max(a, b).
//
// The program exits through log.Fatal if iris.csv cannot be opened or if a
// row carries a species that is not one of the three expected names.
func main() {
	// Pull in the CSV file.
	irisFile, err := os.Open("iris.csv")
	if err != nil {
		log.Fatal(err)
	}
	defer irisFile.Close()

	// Create a dataframe from the CSV file.
	// NOTE(review): the result is not checked for a parse error; confirm how
	// the pinned gota version exposes it and add a check.
	irisDF := dataframe.ReadCSV(irisFile)

	// Define the names of the three separate species contained in the CSV file.
	speciesNames := []string{
		"Iris-setosa",
		"Iris-versicolor",
		"Iris-virginica",
	}

	// centroids holds the centroid of each species and clusters holds the
	// rows belonging to each species.
	centroids := make(map[string]centroid, len(speciesNames))
	clusters := make(map[string]dataframe.DataFrame, len(speciesNames))

	// Filter the dataset into three separate dataframes,
	// each corresponding to one of the Iris species.
	for _, species := range speciesNames {
		filtered := irisDF.Filter(dataframe.F{
			Colname:    "species",
			Comparator: "==",
			Comparando: species,
		})

		// Summarize the features; row 0 of each summary column is used as
		// that feature's mean.
		summaryDF := filtered.Describe()

		// Put each dimension's mean into the corresponding centroid.
		var c centroid
		for _, feature := range summaryDF.Names() {
			// Skip the irrelevant columns.
			if feature == "column" || feature == "species" {
				continue
			}
			c = append(c, summaryDF.Col(feature).Float()[0])
		}

		centroids[species] = c
		clusters[species] = filtered
	}

	// As a sanity check, output our centroids.
	for _, species := range speciesNames {
		fmt.Printf("%s centroid: %v\n", species, centroids[species])
	}

	// Convert our labels into a slice of strings and create a slice
	// of float column names for convenience.
	labels := irisDF.Col("species").Records()
	floatColumns := []string{
		"sepal_length",
		"sepal_width",
		"petal_length",
		"petal_width",
	}

	// The nearest other cluster is chosen by centroid distance, so it depends
	// only on the species and can be resolved once instead of once per row.
	nearest := make(map[string]string, len(speciesNames))
	for _, species := range speciesNames {
		var closest string
		var closestDistance float64
		for _, candidate := range speciesNames {
			// Skip the cluster itself.
			if candidate == species {
				continue
			}
			d := floats.Distance(centroids[species], centroids[candidate], 2)
			// An empty name means nothing has been chosen yet; this avoids
			// using a distance of 0 as the "unset" marker.
			if closest == "" || d < closestDistance {
				closest, closestDistance = candidate, d
			}
		}
		nearest[species] = closest
	}

	// meanDistance returns the mean Euclidean distance from point to every
	// row of cluster, or 0 when the cluster has no rows.
	meanDistance := func(point []float64, cluster dataframe.DataFrame) float64 {
		n := cluster.Nrow()
		var mean float64
		for i := 0; i < n; i++ {
			other := dfFloatRow(cluster, floatColumns, i)
			mean += floats.Distance(point, other, 2) / float64(n)
		}
		return mean
	}

	// Loop over the records accumulating the average silhouette coefficient.
	var silhouette float64
	for idx, label := range labels {
		own, ok := clusters[label]
		if !ok {
			log.Fatalf("row %d has unexpected species %q", idx, label)
		}

		// The row under evaluation does not change while it is compared
		// against other rows, so extract it once.
		current := dfFloatRow(irisDF, floatColumns, idx)

		// a is the mean distance to the rows of the point's own cluster. The
		// point itself is one of those rows and contributes a distance of 0.
		a := meanDistance(current, own)

		// b is the mean distance to the rows of the nearest other cluster.
		b := meanDistance(current, clusters[nearest[label]])

		// The score is (b - a) / max(a, b). Exactly one denominator applies
		// per row, so each row contributes to the average once.
		denominator := b
		if a > b {
			denominator = a
		}
		if denominator == 0 {
			// a and b are both 0: count the score as 0 rather than 0/0.
			continue
		}
		silhouette += ((b - a) / denominator) / float64(len(labels))
	}

	// Output the final average silhouette coefficient to stdout.
	fmt.Printf("\nAverage Silhouette Coefficient: %0.2f\n\n", silhouette)
}
// dfFloatRow builds one row of float values from df: for every column name in
// names, in order, it takes the value at position idx of that column's float
// representation. The result is nil when names is empty.
func dfFloatRow(df dataframe.DataFrame, names []string, idx int) []float64 {
	var values []float64
	for i := 0; i < len(names); i++ {
		column := df.Col(names[i]).Float()
		values = append(values, column[idx])
	}
	return values
}