import codecs
import os

# Source corpus directories, resolved against the current working directory.
POS = os.path.join(os.getcwd(), 'pos')
NEG = os.path.join(os.getcwd(), 'neg')

# Destination directories that receive the UTF-8 re-encoded copies.
FIX_POS = os.path.join(os.getcwd(), 'fix_pos')
FIX_NEG = os.path.join(os.getcwd(), 'fix_neg')

def fix_corpus(dir_s, dir_t):
    """Re-encode every file in ``dir_s`` as UTF-8 and write it into ``dir_t``.

    Each file is read as raw bytes and decoded by trying, in order:
    UTF-8, GBK, and finally GB2312 with undecodable bytes dropped.
    The decoded text is written under the same file name in ``dir_t``.

    Args:
        dir_s: Directory containing the source files.
        dir_t: Existing directory that receives the UTF-8 copies.

    Raises:
        OSError: If ``dir_s`` cannot be listed or a file cannot be
            read or written (for example when ``dir_t`` does not exist).
    """
    for item in os.listdir(dir_s):
        src_path = os.path.join(dir_s, item)
        # Sub-directories and other non-file entries cannot be re-encoded.
        if not os.path.isfile(src_path):
            continue

        # Read bytes, not text: decoding must happen explicitly so that a
        # failed attempt still leaves the raw data available for the next one.
        with open(src_path, 'rb') as f:
            raw = f.read()

        try:
            fix_s = raw.decode('utf8')
        except UnicodeDecodeError:
            try:
                fix_s = raw.decode('gbk')
            except UnicodeDecodeError:
                # Last resort: keep whatever decodes and drop the rest.
                fix_s = raw.decode('gb2312', errors='ignore')

        with codecs.open(os.path.join(dir_t, item), 'w', encoding='utf8') as ff:
            ff.write(fix_s)

# Script entry point: make sure both output directories exist, then convert
# the positive and negative corpora.
if __name__ == "__main__":
    if not os.path.isdir(FIX_POS):
        os.mkdir(FIX_POS)
    if not os.path.isdir(FIX_NEG):
        os.mkdir(FIX_NEG)
    fix_corpus(POS, FIX_POS)
    fix_corpus(NEG, FIX_NEG)

请调整格式:import os; import codecs; POS = os.path.join(os.getcwd(), 'pos'); NEG = os.path.join(os.getcwd(), 'neg'); FIX_POS = os.path.join(os.getcwd(), 'fix_pos'); FIX_NEG = os.path.join(os.getcwd(), 'fix_neg'); def fix_corpus(dir_s, dir_t): for item in ……

原文地址: http://www.cveoy.top/t/topic/bBEU 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录