# 地址解析:基于CRF模型的地址要素提取
import os

import jieba
import pycrfsuite
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer
# Read the training data: every non-blank line of 'train.conll' becomes one
# row holding that line's whitespace-separated tokens. Blank lines are skipped.
with open('train.conll', 'r', encoding='utf-8') as f:
    stripped_lines = (raw.strip() for raw in f)
    train_data = [text.split() for text in stripped_lines if text]
# Load the four LTP models (word segmentation, POS tagging, dependency
# parsing, named-entity recognition) from the 'ltp_data_v3.4.0' directory
# located under the current working directory.
ltp_model_dir = os.path.join(os.getcwd(), 'ltp_data_v3.4.0')

segmentor = Segmentor()
segmentor.load(os.path.join(ltp_model_dir, 'cws.model'))

postagger = Postagger()
postagger.load(os.path.join(ltp_model_dir, 'pos.model'))

parser = Parser()
parser.load(os.path.join(ltp_model_dir, 'parser.model'))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(ltp_model_dir, 'ner.model'))
# Label inventory: the second column of every training row, de-duplicated
# (tag_set) and in sorted order (tag_list).
tag_set = {row[1] for row in train_data}
tag_list = sorted(tag_set)
# Feature template.
def get_features(words, i):
    """Return the feature strings describing the token at position ``i``.

    Two features are produced, in this order:

    * ``'w=<word>'``   -- the current word.
    * ``'w-1=<word>'`` -- the previous word, or ``'w-1=<s>'`` when ``i`` is
      the first position.

    NOTE(review): the original definition was cut off inside the
    ``'w-1=`` literal and never returned; the sentence-start marker
    ``<s>`` and the ``return`` are a minimal completion. Any further
    features the original emitted (e.g. the following word) are unknown.

    Args:
        words: Sequence of word strings.
        i: Index into ``words`` of the token to describe.

    Returns:
        A list of feature strings.

    Raises:
        IndexError: If ``i`` is out of range for ``words``.
    """
    feature_list = ['w=' + words[i]]
    if i > 0:
        feature_list.append('w-1=' + words[i - 1])
    else:
        # No left neighbour: emit an explicit start-of-sequence marker.
        feature_list.append('w-1=<s>')
    return feature_list
# Feature inventory: every distinct feature string get_features() emits over
# the training words (feature_set), plus a sorted copy (feature_list).
#
# Each train_data row is one whitespace-split line, so the word is column 0.
# The previous code called item.split() on a single token and unpacked the
# result into (word, tag), which raises ValueError.
# NOTE(review): the loader skips blank lines, so the rows are treated here as
# one continuous token sequence -- confirm whether per-sentence grouping is
# wanted.
words = [row[0] for row in train_data]

feature_set = set()
for i in range(len(words)):
    feature_set.update(get_features(words, i))

feature_list = sorted(feature_set)
# Feature function.
def feature_func(words, postags, i, tag):
    """Build the feature dict for the token at position ``i``.

    Only features present in the module-level ``feature_set`` are kept;
    each kept feature maps to the value ``1``.

    Args:
        words: Sequence of word strings.
        postags: Accepted for interface compatibility; not used.
        i: Index into ``words`` of the token to describe.
        tag: Label passed through unchanged to the caller.

    Returns:
        A ``(feature_dict, tag)`` tuple.
    """
    # Test membership against the set rather than the sorted list: same
    # contents, but constant-time per lookup instead of a linear scan.
    feature_dict = {
        feature: 1
        for feature in get_features(words, i)
        if feature in feature_set
    }
    return feature_dict, tag
# Train the CRF model and write it to 'addr_parser.crfsuite'.
#
# Fixes relative to the previous version:
#   * rows of train_data are [word, tag] lists, so they are read by column
#     instead of calling item.split() on a single token (ValueError);
#   * the undefined name `postags` is no longer passed (feature_func ignores
#     that argument, so None is supplied);
#   * Trainer.append receives one item sequence and one label sequence of
#     equal length, rather than a single dict and a single label string.
# NOTE(review): the loader skips blank lines, so the whole file is appended
# as one sequence -- confirm whether per-sentence sequences are wanted.
trainer = pycrfsuite.Trainer(verbose=False)

words = [row[0] for row in train_data]
tags = [row[1] for row in train_data]

if words:
    xseq = [feature_func(words, None, i, tag)[0] for i, tag in enumerate(tags)]
    trainer.append(xseq, tags)

trainer.set_params({
    'c1': 1.0,
    'c2': 1e-3,
    'max_iterations': 50,
    'feature.possible_transitions': True,
})
trainer.train('addr_parser.crfsuite')
# Open the trained CRF model file for tagging.
model_file = 'addr_parser.crfsuite'
tagger = pycrfsuite.Tagger()
tagger.open(model_file)
# Parse an address.
def _att_head_word(words, arc):
    """Return the word ``arc`` attaches to through an 'ATT' relation, else None.

    ``arc.head - 1`` is used as the index into ``words``; presumably head
    indices are 1-based with 0 meaning the root -- confirm against pyltp.
    """
    if arc.relation == 'ATT' and arc.head - 1 >= 0:
        return words[arc.head - 1]
    return None


def parse_address(address):
    """Split ``address`` into address elements and a tag string.

    The address is segmented, POS-tagged, NER-tagged and dependency-parsed
    with the module-level LTP models, then every word is labelled by the
    module-level CRF ``tagger``. Each word is assigned to at most one
    element according to the first matching rule below; when several words
    match the same element, the last one wins.

    Args:
        address: The raw address string.

    Returns:
        A 10-tuple ``(province, city, district, street, community, devzone,
        houseno, cellno, floorno, tags)``. Elements that were not found are
        empty strings; ``tags`` is the per-word CRF labels joined by single
        spaces (empty when the address yields no words).
    """
    words = list(segmentor.segment(address))
    postags = list(postagger.postag(words))
    netags = list(recognizer.recognize(words, postags))
    arcs = list(parser.parse(words, postags))

    province = city = district = street = community = ''
    devzone = houseno = cellno = floorno = ''

    # Label the whole word sequence in one call: the tagger works on an item
    # sequence and returns one label per item. Previously it was called once
    # per word with a bare dict, and only the last call's result was returned.
    xseq = [feature_func(words, postags, i, '')[0] for i in range(len(words))]
    tags = list(tagger.tag(xseq)) if xseq else []

    for word, tag, netag, arc in zip(words, tags, netags, arcs):
        if 'B-prov' in tag:
            province = word
        elif word in ('北京', '上海', '天津', '重庆'):
            # These four names fill both the province and the city slot.
            province = word + '市'
            city = word + '市'
        elif 'B-city' in tag:
            city = word
        elif 'B-district' in tag or netag == 'S-Ns':
            # Prefix the ATT head word when there is one.
            head_word = _att_head_word(words, arc)
            district = word if head_word is None else head_word + word
        elif 'B-town' in tag:
            street = word
        elif 'B-community' in tag:
            community = word
        elif 'B-devzone' in tag:
            devzone = word
        elif 'B-roadno' in tag:
            # NOTE(review): road numbers and house numbers share `houseno`.
            houseno = word
        elif 'B-houseno' in tag:
            houseno = word
        elif 'B-cellno' in tag:
            cellno = word
        elif 'B-floorno' in tag:
            floorno = word
        elif 'B-poi' in tag:
            # A POI is stored in the street slot, replaced by its ATT head
            # word when there is one.
            head_word = _att_head_word(words, arc)
            street = word if head_word is None else head_word

    return (province, city, district, street, community, devzone,
            houseno, cellno, floorno, ' '.join(tags))
# Read "<id><sep><address>" lines from '1.txt', parse each address and write
# "<id><sep><address><sep><tags>" lines to the result file.
#
# The separator and the line terminator had been reduced to '' and ' ', so
# split('') raised ValueError (empty separator) and all output ran together
# on one line.
# NOTE(review): '\x01' is assumed as the field separator, inferred from the
# '<team>_addr_parsing_runid.txt' naming -- confirm against the input file.
FIELD_SEP = '\x01'

with open('1.txt', 'r', encoding='utf-8') as f, \
        open('对对对队_addr_parsing_runid.txt', 'w', encoding='utf-8') as f_out:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # Only the first two fields are used; a line without a separator
        # raises ValueError here.
        id_, info = line.split(FIELD_SEP)[:2]
        # Only the tag string (last element of the result) is written out.
        *_, tags = parse_address(info)
        f_out.write(id_ + FIELD_SEP + info + FIELD_SEP + tags + '\n')
# 原文地址: https://www.cveoy.top/t/topic/joLD 著作权归作者所有。请勿转载和采集!