import codecs
import os

# Source corpus directories, resolved against the current working directory.
POS = os.path.join(os.getcwd(), 'pos')
NEG = os.path.join(os.getcwd(), 'neg')

# Output directories that receive the UTF-8 re-encoded copies.
FIX_POS = os.path.join(os.getcwd(), 'fix_pos')
FIX_NEG = os.path.join(os.getcwd(), 'fix_neg')

def _decode_corpus_bytes(raw):
    """Decode raw file bytes, trying UTF-8, then GBK, then lossy GB2312."""
    for encoding in ('utf-8', 'gbk'):
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    # Last resort: drop undecodable bytes instead of aborting the whole run.
    return raw.decode('gb2312', errors='ignore')


def fix_corpus(dir_s, dir_t):
    """Re-encode every file in ``dir_s`` as UTF-8 and write it to ``dir_t``.

    Each file is read as raw bytes and decoded by trying UTF-8 first, then
    GBK, and finally GB2312 with undecodable bytes ignored. The decoded text
    is written under the same file name in ``dir_t``.

    Args:
        dir_s: Directory containing the source files.
        dir_t: Existing directory that receives the UTF-8 copies.

    Raises:
        OSError: If ``dir_s`` cannot be listed or a file cannot be read or
            written (for example when ``dir_t`` does not exist).
    """
    for item in os.listdir(dir_s):
        src_path = os.path.join(dir_s, item)
        # os.listdir also yields sub-directories; only regular files are
        # converted.
        if not os.path.isfile(src_path):
            continue
        # Read bytes, not text: a text-mode read would raise before any data
        # is available to retry with another codec, and str has no .decode().
        with open(src_path, 'rb') as f:
            raw = f.read()
        fix_s = _decode_corpus_bytes(raw)
        with codecs.open(os.path.join(dir_t, item), 'w', encoding='utf8') as ff:
            ff.write(fix_s)

# Script entry point: convert both corpora when the file is run directly.
if __name__ == "__main__":
    # The output directories must exist before fix_corpus writes into them.
    if not os.path.isdir(FIX_POS):
        os.mkdir(FIX_POS)
    if not os.path.isdir(FIX_NEG):
        os.mkdir(FIX_NEG)
    fix_corpus(POS, FIX_POS)
    fix_corpus(NEG, FIX_NEG)

修改说明:

1. 不应再对 s 调用 s.decode('gb2312'):在 Python 3 中,以文本模式读取得到的 s 已经是 str(字符串)类型,没有 decode 方法,不需要也无法再进行解码操作;如需按指定编码解码,应以二进制模式('rb')读取字节后再调用 decode

2. 修改了 fix_s = s.decode('gbk') 这行代码:出错的文件一般是 gb2312 编码,而 gbk 是 gb2312 的超集,能解码所有 gb2312 文本,因此先尝试使用 gbk 解码

3. 在写入文件时,使用 codecs.open() 来指定编码为 utf8

Python 编码错误:'str' object has no attribute 'decode' 的解决方法

原文地址: https://www.cveoy.top/t/topic/m55b 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录