sparse_bench.py
#!/usr/bin/python
# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
import sys
import os
import time
import numpy as np
from scipy.sparse import csr_matrix
from scipy.spatial import distance
import nmslib
import pysparnn as snn
from common import *
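
# `TimeIt` is imported from the repo-local `common` module above. As a purely
# illustrative sketch (an assumption, not the repo's actual helper), such a
# timing context manager could be as simple as:
#
#   class TimeIt(object):
#       def __init__(self, label):
#           self.label = label
#       def __enter__(self):
#           self.start = time.time()
#       def __exit__(self, exc_type, exc_value, traceback):
#           print '%s took %.3f sec' % (self.label, time.time() - self.start)
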
def bench_sparse_vector(batch=True):
    # Generate a random sparse binary dataset and a slightly sparser query set.
    dim = 20000
    dataset = np.random.binomial(1, 0.01, size=(40000, dim))
    queryset = np.random.binomial(1, 0.009, size=(1000, dim))

    print 'dataset[0]:', [[i, v] for i, v in enumerate(dataset[0]) if v > 0]

    k = 3

    # Brute-force cosine-distance ground truth for the first query.
    q0 = queryset[0]
    res = []
    for i in range(dataset.shape[0]):
        res.append([i, distance.cosine(q0, dataset[i, :])])
    res.sort(key=lambda x: x[1])
    print 'q0 res', res[:k]

    data_matrix = csr_matrix(dataset, dtype=np.float32)
    query_matrix = csr_matrix(queryset, dtype=np.float32)

    # Baseline: pysparnn cluster index over the same sparse matrix.
    data_to_return = range(dataset.shape[0])
    with TimeIt('building MultiClusterIndex'):
        cp = snn.MultiClusterIndex(data_matrix, data_to_return)

    with TimeIt('knn search'):
        res = cp.search(query_matrix, k=k, return_distance=False)

    print res[:5]
    for i in res[0]:
        print int(i), distance.cosine(q0, dataset[int(i), :])

    # nmslib index over the same data, using the fast sparse cosine space.
    #space_type = 'cosinesimil_sparse'
    space_type = 'cosinesimil_sparse_fast'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.SPARSE_VECTOR,
                        nmslib.DistType.FLOAT)

    if batch:
        # Add the whole CSR matrix in one call.
        with TimeIt('batch add'):
            positions = nmslib.addDataPointBatch(index,
                                                 np.arange(len(dataset), dtype=np.int32),
                                                 data_matrix)
        print 'positions', positions
    else:
        # Add points one by one as lists of (index, value) pairs.
        d = []
        q = []
        with TimeIt('preparing'):
            for data in dataset:
                d.append([[i, v] for i, v in enumerate(data) if v > 0])
            for data in queryset:
                q.append([[i, v] for i, v in enumerate(data) if v > 0])
        with TimeIt('adding points'):
            for id, data in enumerate(d):
                nmslib.addDataPoint(index, id, data)

    print 'Let\'s invoke the index-build process'

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    with TimeIt('building index'):
        nmslib.createIndex(index, index_param)
    print 'The index is created'

    nmslib.setQueryTimeParams(index, query_time_param)
    print 'Query time parameters are set'

    print "Results for the freshly created index:"

    with TimeIt('knn query'):
        if batch:
            num_threads = 10
            res = nmslib.knnQueryBatch(index, num_threads, k, query_matrix)
            for idx, v in enumerate(res):
                if idx < 5:
                    print idx, v
                if idx == 0:
                    # Sanity check against the brute-force result for q0.
                    for i in v:
                        print 'q0', i, distance.cosine(q0, dataset[i, :])
        else:
            for idx, data in enumerate(q):
                res = nmslib.knnQuery(index, k, data)
                if idx < 5:
                    print idx, res

    nmslib.saveIndex(index, index_name)
    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)


if __name__ == '__main__':
    # Fix the random seed for reproducible debugging runs.
    np.random.seed(17)
    bench_sparse_vector()
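
# Note: the calls above use the legacy procedural nmslib Python bindings, where
# the index handle is passed explicitly to module-level functions. Newer nmslib
# releases (2.x) ship an object-style API instead; a rough, hedged sketch of the
# batch variant of this benchmark under those bindings (parameter values are
# illustrative assumptions, not tuned settings) would look like:
#
#   index = nmslib.init(method='hnsw', space='cosinesimil_sparse',
#                       data_type=nmslib.DataType.SPARSE_VECTOR)
#   index.addDataPointBatch(data_matrix)
#   index.createIndex({'M': 16, 'efConstruction': 100, 'indexThreadQty': 4})
#   index.setQueryTimeParams({'efSearch': 100})
#   res = index.knnQueryBatch(query_matrix, k=k, num_threads=10)  # list of (ids, distances)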