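"""Compute pairwise news-article similarity.

Pulls news articles from a Kafka topic, tokenizes them with jieba,
builds a gensim TF-IDF model, and writes each article's top similarity
scores to a CSV file.
"""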
from gensim import corpora, models, similarities
from pykafka import KafkaClient
import pandas as pd
import heapq
import jieba
import sys

# Kafka topics holding the two news categories.
TOPIC_P = 'politics3'
TOPIC_E = 'entertainment4'

# Use the traditional-Chinese dictionary for jieba word segmentation.
jieba.set_dictionary('dict.txt.big')

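# Punctuation, full-width symbols, and markup fragments to strip out
# during tokenization.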
stop_flag = ['.', '.', '〔', '〕', '〝', '〞', '『', '』', '〈', '〉', '\\', '(', ')', '~', '\u3000', '"', '>', '<', ';', ',', '。', '、', ';', ':', '?', '「', '」', '%', '.', ',', '?', '-', '~', '!', '!', ' ', '<BR>', '“', '”', '【', '】', '《', '》', ':', '(', ')', ';']
stop_flag += ['和', '的', '是', '了', '也']  # common Chinese function words
stop_flag += ['▲', '★', '●', '2018', '04', '05', '06', '07', '08', '09']  # bullet marks and date fragments


def analysis(subject):
    data = getNews(subject)  # pull the news articles stored in Kafka
    corpus = [tokenization(each) for each in data]
    # Build the bag-of-words dictionary and vectors.
    dictionary = corpora.Dictionary(corpus)
    doc_vectors = [dictionary.doc2bow(text) for text in corpus]
    # Build the TF-IDF model and index the weighted vectors once,
    # rather than rebuilding the index for every query.
    tfidf = models.TfidfModel(doc_vectors)
    tfidf_vectors = tfidf[doc_vectors]
    index = similarities.MatrixSimilarity(tfidf_vectors)
    print(dictionary)
    def articleSim(article):
        query = tokenization(article)  # tokenize the query article
        query_bow = dictionary.doc2bow(query)
        # Weight the query with TF-IDF before scoring it against the index.
        sims = index[tfidf[query_bow]]
        return sims

    result = []
    for idx, val in enumerate(data):
        row = ['Id_' + str(idx + 1)]
        for i in range(1, 6):
            row.append('sid_' + str(idx + 1) + str(i))
        scores = [float(s) for s in articleSim(val)]
        # Keep the six highest scores (the self-match plus the five most
        # similar articles) so the data row lines up with the six header
        # labels. Compare as floats: nlargest over stringified scores
        # would sort lexicographically.
        data_row = [str(s) for s in heapq.nlargest(6, scores)]
        result.append(row)
        result.append(data_row)
    return result


def tokenization(news):
    # Segment the text with jieba and drop stopwords.
    result = []
    for word in jieba.cut(news, cut_all=False):
        if word not in stop_flag:
            result.append(word)
    return result


def getNews(topicsname):
    # Consume every message currently in the Kafka topic: rewind to the
    # earliest offset on start and stop after 5 s with no new messages.
    client = KafkaClient(hosts="127.0.0.1:9092")
    topic = client.topics[topicsname]
    consumer = topic.get_simple_consumer(reset_offset_on_start=True,
                                         consumer_timeout_ms=5000)
    data = []
    for message in consumer:
        if message is not None:
            print(message.offset, message.value.decode('utf-8'))
            data.append(message.value.decode('utf-8'))
    return data


if __name__ == '__main__':
    if len(sys.argv) < 2 or sys.argv[1] not in ('p', 'e'):
        print("usage: python analytics.py [p|e]")
        sys.exit(1)
    topic = TOPIC_P if sys.argv[1] == 'p' else TOPIC_E
    result = analysis(bytes(topic, 'utf-8'))  # pykafka topic names are bytes
    pd.DataFrame(result).to_csv(topic + '_top5.csv', index=False, header=False)
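
# Example invocation (assumes a local Kafka broker on 127.0.0.1:9092 with
# the 'politics3' topic already populated):
#   $ python analytics.py p
# Output: politics3_top5.csv, where each article contributes a label row
# ('Id_N', 'sid_N1'..'sid_N5') followed by a row of similarity scores.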