# Data cleaning: remove every character that is not a Chinese character.
import re

# Matches any single character outside the U+4E00..U+9FA5 range. Compiled once
# at module level so the pattern is not rebuilt for every row passed to apply().
_NON_CHINESE = re.compile('[^\u4e00-\u9fa5]')


def worddrop(a):
    """Return ``a`` with every character outside U+4E00-U+9FA5 removed.

    Args:
        a: The text to clean. Must be a ``str``; ``re`` raises ``TypeError``
            for other types.

    Returns:
        A new string containing only the characters of ``a`` that fall inside
        the U+4E00-U+9FA5 range, in their original order.
    """
    return _NON_CHINESE.sub('', a)


# NOTE(review): train_dataset is not defined in this cell; it is presumably a
# pandas DataFrame with a `keywords` column created earlier -- confirm.
train_dataset.keywords = train_dataset.keywords.apply(worddrop)
train_dataset.head()
import jieba


# Load the stopword list.
def stopwordslist(filepath):
    """Read a UTF-8 text file and return its lines as a list of stopwords.

    Args:
        filepath: Path of the stopword file, one entry per line.

    Returns:
        A list with one string per line of the file, each stripped of leading
        and trailing whitespace (blank lines become empty strings).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle even if decoding fails part-way.
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]


stopwords = stopwordslist('stopword.txt')
stopwords[:10]
# Remove stopwords.
import time

# time.clock() was removed in Python 3.8; perf_counter() is the documented
# replacement for measuring elapsed time.
start = time.perf_counter()

# Snapshot of the module-level `stopwords` list as a set so that each token is
# checked in O(1) instead of scanning the whole list. Rebuild this if
# `stopwords` is modified afterwards.
_stopword_set = frozenset(stopwords)


def del_stopword(a):
    """Segment ``a`` with jieba and drop every token that is a stopword.

    Args:
        a: The text to segment; passed straight to ``jieba.cut``.

    Returns:
        The remaining tokens joined into one string, each token preceded by a
        single space (so a non-empty result starts with a space). Returns an
        empty string when no token survives.
    """
    tokens = jieba.cut(a)
    return "".join(" " + word for word in tokens if word not in _stopword_set)


# NOTE(review): train_dataset is not defined in this cell; it is presumably a
# pandas DataFrame with a `keywords` column created earlier -- confirm.
train_dataset.keywords = train_dataset.keywords.apply(del_stopword)
end = time.perf_counter()
train_dataset.head()