def tokenize_cn(text):
    """Tokenize Chinese *text*: strip punctuation, segment with jieba,
    and drop stopwords and single-character tokens.

    Args:
        text: Raw Chinese text string.

    Returns:
        list[str]: Segmented words not in the NLTK Chinese stopword list
        and longer than one character.
    """
    # Remove everything that is not a word character (punctuation, spaces).
    # Raw string so the regex escape \w is explicit.
    text = re.sub(r'[^\w]', '', text)
    # Segment the cleaned text into words.
    cutwords = jieba.cut(text)
    # Filter: keep words that are not stopwords and have length > 1.
    stops = set(stopwords.words("chinese"))
    cutwords2 = [word for word in cutwords if word not in stops and len(word) > 1]
    # The garbled original ended at the comprehension; a return is required
    # for the function to be useful.
    return cutwords2
def tokenize_cn(text):
    """Tokenize Chinese *text* and keep only content words.

    Strips punctuation, runs jieba POS-tagged segmentation, then keeps
    tokens that are nouns / adjectives / verbal nouns (by POS tag), are
    not stopwords, and are longer than one character.

    Args:
        text: Raw Chinese text string.

    Returns:
        list[str]: Filtered content-word tokens.
    """
    # Remove everything that is not a word character.
    # Raw string makes the \w escape explicit (avoids invalid-escape warnings).
    text = re.sub(r'[^\w]', '', text)
    # POS-tagged segmentation: yields (word, flag) pairs.
    cutwords = jieba.posseg.cut(text)
    # Stopword list from the NLTK Chinese corpus.
    stops = set(stopwords.words("chinese"))
    # POS tags to keep: proper/common nouns, place/person/org names,
    # adjectives, and verbal nouns. A set gives O(1) membership tests.
    tags = {"nz", "n", "nt", "ns", "nr", "a", "an", "vn"}
    tokens = [
        word
        for word, flag in cutwords
        if word not in stops and flag in tags and len(word) > 1
    ]
    return tokens
原文地址: https://www.cveoy.top/t/topic/fHIv 著作权归作者所有。请勿转载和采集!