Created
November 21, 2017 09:40
-
-
Save 2efPer/a26a9273507eefa5a7bc88105c25ae0e to your computer and use it in GitHub Desktop.
NLP测试
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build a TF-IDF similarity index over a pre-segmented corpus and score one
# query against every document in it.
#
# Each input line is expected to look like ``<prefix>__SOB__<tok>|<tok>|...``:
# whatever precedes ``__SOB__`` is ignored, and the part after it is a
# "|"-separated list of already-segmented tokens.
import sys

FIELD_SEPARATOR = "__SOB__"
TOKEN_SEPARATOR = "|"
INPUT_PATH = "../input"
QUERY = "北京市|天气预报|今日|晴|温度|13|风向|偏东|风力|1-2|级|明日|多云|转|晴|温度|15|最低|10|风向|西北|风力|1-2|级"


def parse_documents(lines):
    """Return the token string of every ``<prefix>__SOB__<tokens>`` line.

    Args:
        lines: Any iterable of text lines (an open file, a list, ...).

    Returns:
        A list with the part after ``__SOB__`` of each well-formed line, in
        input order.

    Parsing stops at the first line that does not split into exactly two
    fields: that line's zero-based index and raw text are printed, and the
    documents collected so far are returned.
    """
    documents = []
    for num, value in enumerate(lines):
        content = value.strip().split(FIELD_SEPARATOR)
        if len(content) != 2:
            # Report the offending line and stop; earlier lines are kept.
            print(num)
            print(value)
            break
        documents.append(content[1])
    return documents


def load_documents(path, encoding="utf-8"):
    """Read *path* and return its documents (see ``parse_documents``).

    The file is decoded with an explicit encoding instead of the platform
    default so the result does not depend on the machine's locale, and it is
    closed even if parsing raises.
    """
    with open(path, "r", encoding=encoding) as handle:
        return parse_documents(handle)


def tokenize(document):
    """Split a "|"-separated, pre-segmented document into its tokens."""
    return document.split(TOKEN_SEPARATOR)


def main(input_path=INPUT_PATH, query=QUERY):
    """Index the corpus at *input_path* and print the number of similarity scores.

    Args:
        input_path: Path of the corpus file.
        query: "|"-separated tokens to compare against every document.
    """
    # Imported here so the parsing helpers above remain usable (and testable)
    # on machines where gensim is not installed.
    from gensim import corpora, models, similarities

    # Raw corpus.
    documents = load_documents(input_path)
    if not documents:
        print("no documents loaded from %s" % input_path, file=sys.stderr)
        return

    # Corpus as lists of tokens.
    texts = [tokenize(document) for document in documents]

    # Dictionary: mapping between tokens and integer ids.
    dictionary = corpora.Dictionary(texts)

    # Bag-of-words vectors [(token_id, count), ...]; usable for LDA or TF-IDF.
    corpus = [dictionary.doc2bow(text) for text in texts]

    # The model and the transformed corpus must exist before they are used
    # below; with these two lines commented out the script raised NameError.
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # Similarity index. num_features is given explicitly so the index width
    # always matches the dictionary, even when the highest token ids carry no
    # TF-IDF weight and therefore never appear in corpus_tfidf.
    index = similarities.MatrixSimilarity(
        corpus_tfidf, num_features=len(dictionary)
    )
    # The index can be persisted with index.save(path) if it should be reused.

    # Score the query against every indexed document.
    vec_bow = dictionary.doc2bow(tokenize(query))
    vec_tfidf = tfidf[vec_bow]
    sim = index[vec_tfidf]
    print(len(sim))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment