Skip to content

Instantly share code, notes, and snippets.

@2efPer
Created November 21, 2017 09:40
Show Gist options
  • Select an option

  • Save 2efPer/a26a9273507eefa5a7bc88105c25ae0e to your computer and use it in GitHub Desktop.

Select an option

Save 2efPer/a26a9273507eefa5a7bc88105c25ae0e to your computer and use it in GitHub Desktop.
NLP测试
# Demo script: build a TF-IDF similarity index over a pre-tokenized corpus
# and query it with one pre-tokenized document.
#
# Each corpus line is expected to look like "<prefix>__SOB__<tok1>|<tok2>|...".

INPUT_PATH = "../input"
RECORD_SEPARATOR = "__SOB__"
TOKEN_SEPARATOR = "|"
QUERY = "北京市|天气预报|今日|晴|温度|13|风向|偏东|风力|1-2|级|明日|多云|转|晴|温度|15|最低|10|风向|西北|风力|1-2|级"


def load_documents(path, encoding="utf-8"):
    """Read the raw corpus and return the text after the record separator.

    Every line is stripped and split on ``RECORD_SEPARATOR``. Reading stops at
    the first line that does not split into exactly two parts; that line's
    zero-based index and raw content are printed so it can be inspected, and
    the documents collected up to that point are returned.

    Args:
        path: Location of the corpus file.
        encoding: Text encoding of the corpus file. Defaults to UTF-8; change
            it if the corpus was written with a different encoding.

    Returns:
        A list with one token string (still ``|``-joined) per valid line.
    """
    documents = []
    with open(path, encoding=encoding) as corpus_file:
        for line_no, line in enumerate(corpus_file):
            fields = line.strip().split(RECORD_SEPARATOR)
            if len(fields) != 2:
                # Malformed record: report it and stop reading.
                print(line_no)
                print(line)
                break
            documents.append(fields[1])
    return documents


def tokenize(documents):
    """Split each ``|``-separated document string into a list of tokens."""
    return [document.split(TOKEN_SEPARATOR) for document in documents]


def main():
    """Index the corpus with TF-IDF and print the size of the query result."""
    # Imported here so the stdlib-only helpers above stay importable on
    # machines where gensim is not installed.
    from gensim import corpora, models, similarities

    texts = tokenize(load_documents(INPUT_PATH))
    if not texts:
        raise SystemExit("no documents loaded from %s; nothing to index" % INPUT_PATH)

    # Dictionary: mapping between tokens and integer ids.
    dictionary = corpora.Dictionary(texts)
    # Bag-of-words vectors [(token_id, count), ...]; usable for LDA or TF-IDF.
    corpus = [dictionary.doc2bow(text) for text in texts]

    # The TF-IDF model must exist before the index and the query can use it
    # (these two lines were commented out, which caused a NameError below).
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # Similarity index over the TF-IDF vectors. The vector width is the
    # dictionary size, so pass it explicitly instead of scanning the corpus.
    index = similarities.MatrixSimilarity(corpus_tfidf, num_features=len(dictionary))
    # To persist the index for later runs: index.save("../sim_index")

    # Similarity of the query against every indexed document.
    vec_bow = dictionary.doc2bow(QUERY.split(TOKEN_SEPARATOR))
    vec_tfidf = tfidf[vec_bow]
    sim = index[vec_tfidf]
    print(len(sim))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment