# Extracting Chinese date/time expressions in Python: jieba and dateutil in practice
import jieba.posseg as psg
import re
from datetime import datetime, timedelta
from dateutil.parser import parse
import os
# Lookup tables for Chinese numerals.
# UTIL_CN_NUM maps a single digit character to its value; both Chinese numerals
# (including the alternative form '两' for 2) and ASCII digits are accepted.
UTIL_CN_NUM = {
    '零': 0,
    '一': 1,
    '二': 2,
    '两': 2,
    '三': 3,
    '四': 4,
    '五': 5,
    '六': 6,
    '七': 7,
    '八': 8,
    '九': 9,
}
# ASCII digits '0'..'9' are appended after the Chinese numerals.
UTIL_CN_NUM.update({str(digit): digit for digit in range(10)})

# UTIL_CN_UTIL maps a Chinese unit character to the multiplier it denotes.
UTIL_CN_UTIL = {
    '十': 10,
    '百': 100,
    '千': 1000,
    '万': 10000,
}
def check_time_valid(word):
    """Validate and normalise a candidate time string.

    Args:
        word: Candidate time expression collected from the text.

    Returns:
        None when ``word`` consists only of digits and is at most six
        characters long (too short to be treated as a date on its own).
        Otherwise the word, with a trailing "号<digits>" or "日<digits>"
        collapsed to "日" so that stray digits glued onto the day marker
        are dropped.
    """
    # A bare run of up to six digits is rejected outright.
    if re.match(r'\d+$', word) and len(word) <= 6:
        return None
    # The character class is [号日]; a '|' inside a class is a literal
    # character, not alternation, so it must not be listed here.
    # One substitution is enough: the pattern is anchored at the end and the
    # replacement ends in '日', so the result can never match a second time.
    return re.sub(r'[号日]\d+$', '日', word)
def year2dig(year):
    """Convert a year written with Chinese or ASCII digits to an integer.

    Every character found in ``UTIL_CN_NUM`` is replaced by its digit; other
    characters are kept as they are. The leading run of digits in the result
    is then read as the year.

    Args:
        year: Year text without the trailing unit character.

    Returns:
        The year as an int, or None when the converted text does not start
        with a digit. A two-digit year is placed in the century of today's
        date.
    """
    converted = ''.join(
        str(UTIL_CN_NUM[ch]) if ch in UTIL_CN_NUM else ch
        for ch in year
    )
    leading = re.match(r'\d+', converted)
    if leading is None:
        return None
    digits = leading.group(0)
    if len(digits) == 2:
        # Two-digit shorthand: prefix with the current century.
        return int(datetime.today().year / 100) * 100 + int(digits)
    return int(digits)
def cn2dig(src):
    """Convert a Chinese (or ASCII) number string to an integer.

    Args:
        src: Number text, e.g. ASCII digits or Chinese numerals with units.

    Returns:
        The integer value, or None when ``src`` is empty or contains a
        character that is neither a known digit nor a known unit. When
        ``src`` starts with ASCII digits, only that leading run is used.
    """
    if src == '':
        return None
    leading = re.match(r'\d+', src)
    if leading:
        return int(leading.group(0))

    total = 0
    multiplier = 1
    # Walk from the least significant character; a unit character sets the
    # multiplier applied to the digits that precede it in the text.
    for ch in reversed(src):
        if ch in UTIL_CN_UTIL:
            multiplier = UTIL_CN_UTIL[ch]
            continue
        if ch not in UTIL_CN_NUM:
            return None
        total += UTIL_CN_NUM[ch] * multiplier
    # A leading unit with no digit in front of it (e.g. '十' or '十五')
    # implies a count of one for that unit.
    if total < multiplier:
        total += multiplier
    return total
# Fallback pattern for Chinese date/time expressions that dateutil rejects.
# Every component is optional, and whitespace between a number, its unit and
# the next component is tolerated, so both "2018年1月5日下午3点" and the
# spaced form "2018 年 01 月 05 日 " are recognised. Named groups capture only
# the numeric part, so no unit character has to be sliced off afterwards.
_CN_DATETIME_RE = re.compile(
    r'\s*'
    r'(?:(?P<year>[0-9零一二两三四五六七八九十]+)\s*年\s*)?'
    r'(?:(?P<month>[0-9一二两三四五六七八九十]+)\s*月\s*)?'
    r'(?:(?P<day>[0-9一二两三四五六七八九十]+)\s*[号日]\s*)?'
    r'(?:(?P<period>[上中下午晚早]+)\s*)?'
    r'(?:(?P<hour>[0-9零一二两三四五六七八九十百]+)\s*[点:.时]\s*)?'
    r'(?:(?P<minute>[0-9零一二三四五六七八九十百]+)\s*分?\s*)?'
    r'(?:(?P<second>[0-9零一二三四五六七八九十百]+)\s*秒)?'
)


def parse_datetime(msg):
    """Parse a time expression into a 'YYYY-MM-DD HH:MM:SS' string.

    ``dateutil.parser.parse`` is tried first. When it rejects the input, the
    text is matched against a pattern for Chinese date/time expressions
    (year/month/day, an optional period such as 下午, hour/minute/second).
    Date components missing from the text are taken from today's date; time
    components missing from the text default to zero.

    Args:
        msg: The time expression, or None.

    Returns:
        The formatted timestamp, or None when ``msg`` is None or empty, when
        no date/time component can be recognised, or when the recognised
        components do not form a valid date (for example month 13).
    """
    if msg is None or len(msg) == 0:
        return None
    try:
        return parse(msg).strftime('%Y-%m-%d %H:%M:%S')
    except (ValueError, OverflowError, TypeError):
        # dateutil could not handle the text; fall through to the Chinese
        # pattern below.
        pass

    m = _CN_DATETIME_RE.match(msg)
    # Every part of the pattern is optional, so an empty match is always
    # possible; require at least one recognised component.
    if m is None or not any(m.groupdict().values()):
        return None

    # Unspecified time-of-day components default to midnight.
    params = {'hour': 0, 'minute': 0, 'second': 0}
    for name in ('year', 'month', 'day', 'hour', 'minute', 'second'):
        raw = m.group(name)
        if not raw:
            continue
        value = year2dig(raw) if name == 'year' else cn2dig(raw)
        if value is not None:
            params[name] = int(value)

    try:
        target_date = datetime.today().replace(**params)
    except ValueError:
        # Out-of-range component, or a combination that does not exist on
        # the calendar once merged with today's date.
        return None

    # Afternoon / evening / noon markers shift a 12-hour reading to 24-hour.
    # NOTE(review): '中午' followed by an hour of 11 is also shifted to 23;
    # confirm whether that is intended.
    if m.group('period') in (u'下午', u'晚上', u'中午'):
        hour = target_date.hour
        if hour < 12:
            target_date = target_date.replace(hour=hour + 12)
    return target_date.strftime('%Y-%m-%d %H:%M:%S')
def time_extract(text):
    """Extract time expressions from ``text`` as formatted timestamps.

    The text is segmented with ``psg.cut``. Consecutive tokens tagged 'm' or
    't' (presumably jieba's numeral and time tags -- confirm against the
    jieba tag set) are concatenated into one candidate. The words 今天, 明天
    and 后天 start a new candidate holding the corresponding calendar date.
    Each candidate is then passed through ``check_time_valid`` and
    ``parse_datetime``.

    Args:
        text: The text to scan.

    Returns:
        A list of the strings returned by ``parse_datetime`` for the
        candidates it could parse, in order of appearance.
    """
    day_offsets = {'今天': 0, '明天': 1, '后天': 2}
    candidates = []
    current = ''
    for token, flag in psg.cut(text):
        is_time_like = flag in ['m', 't']
        if token in day_offsets:
            # A relative-day word closes any open candidate and starts a
            # new one with the resolved date.
            if current:
                candidates.append(current)
            target_day = datetime.today() + timedelta(days=day_offsets.get(token, 0))
            current = target_day.strftime('%Y {0} %m {1} %d {2} ').format('年', '月', '日')
        elif current:
            if is_time_like:
                current = current + token
            else:
                # Any other token ends the running candidate.
                candidates.append(current)
                current = ''
        elif is_time_like:
            current = token
    if current:
        candidates.append(current)

    validated = [check_time_valid(candidate) for candidate in candidates]
    parsed = [parse_datetime(item) for item in validated if item is not None]
    return [stamp for stamp in parsed if stamp is not None]
# Read the corpus text. The context manager closes the file on exit, so no
# explicit close() call is needed afterwards.
with open('d:/Users/Administrator/Desktop/data/corpus.txt', encoding='utf-8') as f:
    text = f.read()
# Extract the time expressions and print one per line.
result = time_extract(text)
for r in result:
    print(r)
# Original article: https://www.cveoy.top/t/topic/ex4O -- copyright belongs to the author. Do not repost or scrape.