From e8855b92edfdab687be939fd24076617e031b146 Mon Sep 17 00:00:00 2001
From: bab2min
Date: Sat, 13 Apr 2024 21:33:27 +0900
Subject: [PATCH] added khaiii to disambiguate benchmark

---
Note: KhaiiiModel._convert unpacks the (lex, tag) tuples returned by
KhaiiiModel._tokenize instead of reading .form/.tag attributes, which
plain tuples do not have.

 benchmark/disambiguate/disambiguate.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/benchmark/disambiguate/disambiguate.py b/benchmark/disambiguate/disambiguate.py
index 00f4b32..103b508 100644
--- a/benchmark/disambiguate/disambiguate.py
+++ b/benchmark/disambiguate/disambiguate.py
@@ -12,6 +12,7 @@ def from_name(name, kiwi_model_path=None, kiwi_model_type='knlm'):
         if name == 'hannanum': return HannanumModel()
         if name == 'mecab': return MecabModel()
         if name == 'okt': return OktModel()
+        if name == 'khaiii': return KhaiiiModel()
 
     def _convert(self, morph):
         raise NotImplementedError()
@@ -108,6 +109,28 @@ def _convert(self, morph):
     def _tokenize(self, text):
         return self._mdl.pos(text, stem=True)
 
+class KhaiiiModel(Model):
+    """Benchmark adapter for the khaiii morphological analyzer."""
+
+    def __init__(self):
+        # Imported here so khaiii is only needed when this model is instantiated.
+        from khaiii import KhaiiiApi
+        self._mdl = KhaiiiApi()
+        print("Initialize khaiii ({})".format(self._mdl.version()), file=sys.stderr)
+
+    def _convert(self, morph):
+        """Map a (form, tag) pair from _tokenize to (form, coarse tag).
+
+        Tags starting with 'V' keep two characters; all others keep one.
+        """
+        # _tokenize yields plain tuples, which have no .form/.tag attributes.
+        form, tag = morph
+        return form, (tag[:2] if tag.startswith('V') else tag[:1])
+
+    def _tokenize(self, text):
+        """Return every morpheme of every analyzed word as a (lex, tag) tuple."""
+        return [(morph.lex, morph.tag) for word in self._mdl.analyze(text) for morph in word.morphs]
+
 def load_dataset(path):
     ret = []
     for line in open(path, encoding='utf-8'):
@@ -165,7 +188,7 @@ def main(args):
 
     parser = argparse.ArgumentParser()
     parser.add_argument('datasets', nargs='+')
-    parser.add_argument('--target', default='kiwi', help='kiwi,komoran,mecab,kkma,hannanum,okt')
+    parser.add_argument('--target', default='kiwi', help='kiwi,komoran,mecab,kkma,hannanum,okt,khaiii')
     parser.add_argument('--error_output_dir')
     parser.add_argument('--print_all_results', default=False, action='store_true')
     parser.add_argument('--kiwi_model_path')