Source code for pypinyin.contrib.mmseg

# -*- coding: utf-8 -*-
"""最大正向匹配分词"""
from pypinyin.constants import PHRASES_DICT


class Seg(object):
    """Forward maximum matching segmenter.

    :type prefix_set: PrefixSet
    """
    def __init__(self, prefix_set):
        self._prefix_set = prefix_set

    def cut(self, text):
        """Segment the text.

        :param text: the text to be segmented
        :yield: one word at a time
        """
        remain = text
        while remain:
            matched = ''
            # Extend the match by one character at a time
            for index in range(len(remain)):
                word = remain[:index + 1]
                if word in self._prefix_set:
                    matched = word
                else:
                    # The previously matched string is a word
                    if matched:
                        yield matched
                        matched = ''
                        remain = remain[index:]
                    else:
                        # Nothing matched so far: emit the single character
                        yield word
                        remain = remain[index + 1:]
                    # Got a result; restart matching on the remaining text
                    break
            else:
                # The whole remaining text is a single word
                yield remain
                break

    def train(self, words):
        """Train the segmenter.

        :param words: a list of words
        """
        self._prefix_set.train(words)
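
cut() grows the candidate word one character at a time and only emits a word once the next character breaks the prefix match; characters that never start a known prefix are emitted on their own. A minimal sketch of that behaviour, not part of the module; the sample words and sentence are assumptions for illustration:

# Illustrative example (assumed words): forward maximum matching against
# a hand-built PrefixSet instead of the built-in dictionary.
from pypinyin.contrib.mmseg import PrefixSet, Seg

_ps = PrefixSet()
_ps.train(['中国', '中国人'])
_demo = Seg(_ps)
# '我' and '是' are not prefixes of any trained word, so they come out as
# single characters; '中国人' is matched greedily as the longest word.
print(list(_demo.cut('我是中国人')))  # -> ['我', '是', '中国人']
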
class PrefixSet(object):
    def __init__(self):
        self._set = set()

    def train(self, word_s):
        """Update the prefix set.

        :param word_s: an iterable of words
        :type word_s: iterable
        :return: None
        """
        for word in word_s:
            # Add every prefix of the word to the prefix set
            for index in range(len(word)):
                self._set.add(word[:index + 1])

    def __contains__(self, key):
        return key in self._set


p_set = PrefixSet()
p_set.train(PHRASES_DICT.keys())

#: Forward maximum matching segmenter based on the built-in phrase dictionary. Usage::
#:
#:     >>> from pypinyin.contrib.mmseg import seg
#:     >>> text = '你好,我是中国人,我爱我的祖国'
#:     >>> seg.cut(text)
#:     <generator object Seg.cut at 0x10b2df2b0>
#:     >>> list(seg.cut(text))
#:     ['你好', ',', '我', '是', '中国人', ',', '我', '爱',
#:      '我的', '祖', '国']
#:     >>> seg.train(['祖国', '我是'])
#:     >>> list(seg.cut(text))
#:     ['你好', ',', '我是', '中国人', ',', '我', '爱',
#:      '我的', '祖国']
seg = Seg(p_set)
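
PrefixSet.train() stores every prefix of every word, which is what lets cut() above extend a match one character at a time and detect where a word ends. A small sketch, illustrative only; the sample word is an assumption:

# Illustrative example (assumed word): every prefix of a trained word is
# a member of the set, but other substrings are not.
from pypinyin.contrib.mmseg import PrefixSet

_ps = PrefixSet()
_ps.train(['中国人'])
print('中' in _ps, '中国' in _ps, '中国人' in _ps)  # -> True True True
print('国人' in _ps)                                # -> False: not a prefix
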
def retrain(seg_instance):
    """Retrain ``seg_instance`` with the built-in phrase dictionary.

    For example, call this after adding custom phrase data so that the
    segmenter is retrained to recognize the new words.

    :type seg_instance: Seg
    """
    seg_instance.train(PHRASES_DICT.keys())
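
A typical reason to call retrain() is after registering custom phrases with pypinyin.load_phrases_dict(), so that the module-level seg instance can match the new words. A hedged sketch; the sample phrase and its pinyin are only an illustration:

# Illustrative example: add a custom phrase to the built-in phrase
# dictionary, then rebuild the segmenter's prefix set so it can match it.
from pypinyin import load_phrases_dict
from pypinyin.contrib.mmseg import seg, retrain

load_phrases_dict({'步履蹒跚': [['bù'], ['lǚ'], ['pán'], ['shān']]})
retrain(seg)
# ``seg`` should now treat '步履蹒跚' as a single word when segmenting.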