From 2beebb53d05ead75526829884063098040e571b7 Mon Sep 17 00:00:00 2001 From: bab2min Date: Tue, 29 Oct 2024 00:23:32 +0900 Subject: [PATCH 1/7] Add the argument `saisiot` to `Kiwi.tokenize()` --- kiwipiepy/_wrap.py | 30 +++++++++++++++++++++++++++++- kiwipiepy/const.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/kiwipiepy/_wrap.py b/kiwipiepy/_wrap.py index 9db576c..03bec4c 100644 --- a/kiwipiepy/_wrap.py +++ b/kiwipiepy/_wrap.py @@ -929,6 +929,7 @@ def analyze(self, z_coda:bool = True, split_complex:bool = False, compatible_jamo:bool = False, + saisiot:Optional[bool] = None, blocklist:Optional[Union[MorphemeSet, Iterable[str]]] = None, pretokenized:Optional[Union[Callable[[str], PretokenizedTokenList], PretokenizedTokenList]] = None, ) -> List[Tuple[List[Token], float]]: @@ -955,6 +956,8 @@ def analyze(self, 이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다. compatible_jamo: bool 이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다. +saisiot: bool + 이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다. blocklist: Union[Iterable[str], MorphemeSet] 이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다. 
pretokenized: Union[Callable[[str], PretokenizedTokenList], PretokenizedTokenList] @@ -1004,6 +1007,11 @@ def analyze(self, match_options |= Match.SPLIT_COMPLEX if compatible_jamo: match_options |= Match.COMPATIBLE_JAMO + if saisiot is True: + match_options = (match_options & ~Match.MERGE_SAISIOT) | Match.SPLIT_SAISIOT + elif saisiot is False: + match_options = (match_options & ~Match.SPLIT_SAISIOT) | Match.MERGE_SAISIOT + if isinstance(blocklist, MorphemeSet): if blocklist.kiwi != self: warnings.warn("This `MorphemeSet` isn't based on current Kiwi object.") @@ -1155,6 +1163,7 @@ def _tokenize(self, z_coda:bool = True, split_complex:bool = False, compatible_jamo:bool = False, + saisiot:Optional[bool] = None, split_sents:bool = False, stopwords:Optional[Stopwords] = None, echo:bool = False, @@ -1181,6 +1190,11 @@ def _refine_result_with_echo(arg): match_options |= Match.SPLIT_COMPLEX if compatible_jamo: match_options |= Match.COMPATIBLE_JAMO + + if saisiot is True: + match_options = (match_options & ~Match.MERGE_SAISIOT) | Match.SPLIT_SAISIOT + elif saisiot is False: + match_options = (match_options & ~Match.SPLIT_SAISIOT) | Match.MERGE_SAISIOT if isinstance(blocklist, MorphemeSet): if blocklist.kiwi != self: @@ -1209,6 +1223,7 @@ def tokenize(self, z_coda:bool = True, split_complex:bool = False, compatible_jamo:bool = False, + saisiot:Optional[bool] = None, split_sents:bool = False, stopwords:Optional[Stopwords] = None, echo:bool = False, @@ -1249,6 +1264,13 @@ def tokenize(self, True인 경우 분석 결과의 첫가끝 자모를 호환용 자모로 변환하여 출력합니다. 예를 들어 "ᆫ다/EF"는 "ㄴ다/EF"로, "ᆯ/ETM"은 "ㄹ/ETM"으로 변환됩니다. +saisiot: bool + + .. versionadded:: 0.20.0 + + True인 경우 합성명사의 사이시옷을 분리하여 출력하고, False인 경우 사이시옷이 포함된 것으로 추정되는 합성명사를 결합하여 출력합니다. + None인 경우 별도의 사이시옷 처리 없이 Kiwi 기본 사전에 등재된 명사 사전에 기반해 분석합니다. + split_sents: bool .. 
versionadded:: 0.10.3 @@ -1387,7 +1409,8 @@ def tokenize(self, Token(form='.', tag='SF', start=25, len=1)] ``` ''' - return self._tokenize(text, match_options, normalize_coda, z_coda, split_complex, compatible_jamo, + return self._tokenize(text, match_options, normalize_coda, + z_coda, split_complex, compatible_jamo, saisiot, split_sents, stopwords, echo, blocklist=blocklist, pretokenized=pretokenized @@ -1400,6 +1423,7 @@ def split_into_sents(self, z_coda:bool = True, split_complex:bool = False, compatible_jamo:bool = False, + saisiot:Optional[bool] = None, stopwords:Optional[Stopwords] = None, blocklist:Optional[Union[Iterable[str], MorphemeSet]] = None, return_tokens:bool = False, @@ -1426,6 +1450,8 @@ def split_into_sents(self, 이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다. compatible_jamo: bool 이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다. +saisiot: Optional[bool] + 이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다. stopwords: Stopwords .. versionadded:: 0.16.0 @@ -1532,6 +1558,7 @@ def _make_result(arg): z_coda=z_coda, split_complex=split_complex, compatible_jamo=compatible_jamo, + saisiot=saisiot, blocklist=blocklist, split_sents=True), text)) @@ -1541,6 +1568,7 @@ def _make_result(arg): z_coda=z_coda, split_complex=split_complex, compatible_jamo=compatible_jamo, + saisiot=saisiot, blocklist=blocklist, split_sents=True, echo=True)) diff --git a/kiwipiepy/const.py b/kiwipiepy/const.py index 1943b27..60fbd9b 100644 --- a/kiwipiepy/const.py +++ b/kiwipiepy/const.py @@ -11,88 +11,120 @@ class Match(IntEnum): 분석 시 특수한 문자열 패턴 중 어떤 것들을 추출할 지 선택할 수 있습니다. bitwise OR 연산으로 여러 개 선택하여 사용가능합니다. """ + URL = 1 << 0 """ 인터넷 주소 형태의 텍스트를 W_URL이라는 태그로 추출합니다. """ + EMAIL = 1 << 1 """ 이메일 주소 형태의 텍스트를 W_EMAIL이라는 태그로 추출합니다. """ + HASHTAG = 1 << 2 """ 해시태그(#해시태그) 형태의 텍스트를 W_HASHTAG라는 태그로 추출합니다. """ + MENTION = 1 << 3 """ 멘션(@멘션) 형태의 텍스트를 W_MENTION이라는 태그로 추출합니다. .. versionadded:: 0.8.2 """ + SERIAL = 1 << 4 """ 일련번호 형태의 텍스트를 W_SERIAL이라는 태그로 추출합니다. .. 
versionadded:: 0.14.0 """ + EMOJI = 1 << 5 """ 이모지 형태의 텍스트를 W_EMOJI라는 태그로 추출합니다. .. versionadded:: 0.18.0 """ + ALL = URL | EMAIL | HASHTAG | MENTION | SERIAL | EMOJI """ URL, EMAIL, HASHTAG, MENTION, SERIAL, EMOJI을 모두 사용합니다. """ + NORMALIZING_CODA = 1 << 16 """ '먹었엌ㅋㅋ'처럼 받침이 덧붙어서 분석에 실패하는 경우, 받침을 분리하여 정규화합니다. """ + JOIN_NOUN_PREFIX = 1 << 17 """ 명사의 접두사를 분리하지 않고 결합합니다. 풋/XPN 사과/NNG -> 풋사과/NNG .. versionadded:: 0.11.0 """ + JOIN_NOUN_SUFFIX = 1 << 18 """ 명사의 접미사를 분리하지 않고 결합합니다. 사과/NNG 들/XSN -> 사과들/NNG .. versionadded:: 0.11.0 """ + JOIN_VERB_SUFFIX = 1 << 19 """ 동사 파생접미사를 분리하지 않고 결합합니다. 사랑/NNG 하/XSV 다/EF -> 사랑하/VV 다/EF .. versionadded:: 0.11.0 """ + JOIN_ADJ_SUFFIX = 1 << 20 """ 형용사 파생접미사를 분리하지 않고 결합합니다. 매콤/XR 하/XSA 다/EF -> 매콤하/VA 다/EF .. versionadded:: 0.11.0 """ + JOIN_ADV_SUFFIX = 1 << 21 """ 부사 파생접미사를 분리하지 않고 결합합니다. 요란/XR 히/XSM -> 요란히/MAG .. versionadded:: 0.15.0 """ + SPLIT_COMPLEX = 1 << 22 """ 더 잘게 분할 가능한 형태소를 모두 분할합니다. 고마움/NNG -> 고맙/VA-I 음/ETN .. versionadded:: 0.15.0 """ + Z_CODA = 1 << 23 """ 조사/어미에 덧붙은 받침을 Z_CODA 태그로 분리합니다. 했어욗 -> 하/VV 었/EP 어요/EF ㄳ/Z_CODA .. versionadded:: 0.15.0 """ + COMPATIBLE_JAMO = 1 << 24 """ 형태소 분석 결과 출력 시 첫가끝 자모를 호환용 자모로 변환합니다. .. versionadded:: 0.18.1 """ + + SPLIT_SAISIOT = 1 << 25 + """ + 사이시옷이 포함된 합성명사를 분리합니다. 만둣국 -> 만두/NNG ᆺ/Z_SIOT 국/NNG + + .. versionadded:: 0.20.0 + """ + + MERGE_SAISIOT = 1 << 26 + """ + 사이시옷이 포함된 것으로 추정되는 명사를 결합합니다. 만둣국 -> 만둣국/NNG + + .. versionadded:: 0.20.0 + """ + JOIN_V_SUFFIX = JOIN_VERB_SUFFIX | JOIN_ADJ_SUFFIX """ 동사/형용사형 파생접미사를 분리하지 않고 결합합니다. .. versionadded:: 0.11.0 """ + JOIN_AFFIX = JOIN_NOUN_PREFIX | JOIN_NOUN_SUFFIX | JOIN_V_SUFFIX | JOIN_ADV_SUFFIX """ 모든 접두사/접미사를 분리하지 않고 결합합니다. 
From c4a8b2d3ff9dce9de12ff2984108502d6514679e Mon Sep 17 00:00:00 2001 From: bab2min Date: Tue, 29 Oct 2024 00:23:56 +0900 Subject: [PATCH 2/7] Update Kiwi submodule to v0.20.0 --- Kiwi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Kiwi b/Kiwi index e371e0d..a9ee8c3 160000 --- a/Kiwi +++ b/Kiwi @@ -1 +1 @@ -Subproject commit e371e0d1a3c2c6fdfc4d30ef9f735c139c16d433 +Subproject commit a9ee8c3cc47211776db29b0f55995ad7d28313d2 From 8e93fefd2ececb49486662b289d53ae0f458de82 Mon Sep 17 00:00:00 2001 From: bab2min Date: Tue, 29 Oct 2024 00:38:43 +0900 Subject: [PATCH 3/7] Add test cases for `saisiot` arg --- test/test_kiwipiepy.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/test_kiwipiepy.py b/test/test_kiwipiepy.py index 470010f..861bf25 100644 --- a/test/test_kiwipiepy.py +++ b/test/test_kiwipiepy.py @@ -895,3 +895,16 @@ def test_issue_176(): text = "접사를 결합해 출력합니다." tokens = kiwi.tokenize(text, match_options=Match.JOIN_AFFIX) assert kiwi.join(tokens) == text + +def test_saisiot(): + kiwi = Kiwi() + for s in ["하굣길", "만둣국", "나뭇잎", "세숫물", "고춧가루", "시곗바늘", "사글셋방"]: + tokens = kiwi.tokenize(s, saisiot=True) + assert len(tokens) == 3 + assert tokens[0].tag == "NNG" + assert tokens[1].tag == "Z_SIOT" + assert tokens[2].tag == "NNG" + + tokens = kiwi.tokenize(s, saisiot=False) + assert len(tokens) == 1 + assert tokens[0].tag == "NNG" From 57d341edbb2da6b10ca1222a1377cea93f894bb4 Mon Sep 17 00:00:00 2001 From: bab2min Date: Tue, 29 Oct 2024 00:45:29 +0900 Subject: [PATCH 4/7] Add `saisiot` & `no-saisiot` arg to __main__.py --- kiwipiepy/__main__.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/kiwipiepy/__main__.py b/kiwipiepy/__main__.py index 2a00bf2..22eb6e1 100644 --- a/kiwipiepy/__main__.py +++ b/kiwipiepy/__main__.py @@ -11,7 +11,7 @@ def tokenize(args, kiwi:Kiwi): try: while True: txt = input('>>> ') - for res in kiwi.analyze(txt, args.top_n, normalize_coda=args.normalize_coda): + for res 
in kiwi.analyze(txt, args.top_n, normalize_coda=args.normalize_coda, saisiot=args.saisiot): pprint(res) except (EOFError, KeyboardInterrupt): print() @@ -45,7 +45,7 @@ def split(args, kiwi:Kiwi): try: while True: txt = input('>>> ') - for res in kiwi.split_into_sents(txt, normalize_coda=args.normalize_coda): + for res in kiwi.split_into_sents(txt, normalize_coda=args.normalize_coda, saisiot=args.saisiot): pprint(res) except (EOFError, KeyboardInterrupt): print() @@ -66,13 +66,15 @@ def main(args): if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() - parser.add_argument('--model_path') - parser.add_argument('--model_type', default='knlm', choices=['knlm', 'sbg']) - parser.add_argument('--top_n', default=1, type=int) - parser.add_argument('--normalize_coda', default=False, action='store_true') - parser.add_argument('--reset_whitespace', default=False, action='store_true') + parser.add_argument('--model-path') + parser.add_argument('--model-type', default='knlm', choices=['knlm', 'sbg']) + parser.add_argument('--top-n', default=1, type=int) + parser.add_argument('--normalize-coda', default=False, action='store_true') + parser.add_argument('--reset-whitespace', default=False, action='store_true') parser.add_argument('--task', default='tokenize', choices=['tokenize', 'space', 'join', 'split']) parser.add_argument('--typos') - parser.add_argument('--typo_cost_threshold', default=2.5, type=float) + parser.add_argument('--typo-cost-threshold', default=2.5, type=float) + parser.add_argument('--saisiot', default=None, action='store_true') + parser.add_argument('--no-saisiot', action='store_false', dest='saisiot') main(parser.parse_args()) From 551bb055ff7e806418ef1bd4ebf18775a6f93d3f Mon Sep 17 00:00:00 2001 From: bab2min Date: Tue, 29 Oct 2024 00:48:17 +0900 Subject: [PATCH 5/7] bump to v0.20.0 --- kiwipiepy/_version.py | 2 +- model/kiwipiepy_model/_version.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/kiwipiepy/_version.py b/kiwipiepy/_version.py index db7a416..2f15b8c 100644 --- a/kiwipiepy/_version.py +++ b/kiwipiepy/_version.py @@ -1 +1 @@ -__version__ = '0.19.1' +__version__ = '0.20.0' diff --git a/model/kiwipiepy_model/_version.py b/model/kiwipiepy_model/_version.py index 24e55fc..2df6e67 100644 --- a/model/kiwipiepy_model/_version.py +++ b/model/kiwipiepy_model/_version.py @@ -1 +1 @@ -__version__ = '0.19.0' \ No newline at end of file +__version__ = '0.20.0' \ No newline at end of file diff --git a/setup.py b/setup.py index 37184fd..9fc31d5 100644 --- a/setup.py +++ b/setup.py @@ -209,7 +209,7 @@ def build_extension(self, ext): keywords='Korean morphological analysis', install_requires=[ 'dataclasses; python_version < "3.7"', - 'kiwipiepy_model>=0.19,<0.20', + 'kiwipiepy_model>=0.20,<0.21', 'numpy<2; python_version < "3.9"', 'numpy; python_version >= "3.9"', 'tqdm', From aaffffd34c73f6a21589d44e4667ea7a878c292c Mon Sep 17 00:00:00 2001 From: bab2min Date: Tue, 29 Oct 2024 01:51:51 +0900 Subject: [PATCH 6/7] Fix workflows --- .github/workflows/pull_request_test.yml | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/.github/workflows/pull_request_test.yml b/.github/workflows/pull_request_test.yml index 67a779b..1986f81 100644 --- a/.github/workflows/pull_request_test.yml +++ b/.github/workflows/pull_request_test.yml @@ -42,7 +42,7 @@ jobs: - name: Test kiwipiepy run: | /opt/python/${{ matrix.cp }}/bin/python -m pip install pytest - /opt/python/${{ matrix.cp }}/bin/python -m pytest --verbose test/test_kiwipiepy.py + /opt/python/${{ matrix.cp }}/bin/python -m pytest -svv test/test_kiwipiepy.py - name: Test transformers_addon run: | /opt/python/${{ matrix.cp }}/bin/python -m pip install -U pip @@ -92,13 +92,12 @@ jobs: cd model /opt/python/${{ matrix.cp }}/bin/python setup.py build install cd .. 
- /opt/python/${{ matrix.cp }}/bin/python setup.py build + /opt/python/${{ matrix.cp }}/bin/python setup.py build install /opt/python/${{ matrix.cp }}/bin/python -m pip install numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py v1` || true - /opt/python/${{ matrix.cp }}/bin/python setup.py install - name: Test kiwipiepy run: | /opt/python/${{ matrix.cp }}/bin/python -m pip install pytest - /opt/python/${{ matrix.cp }}/bin/python -m pytest --verbose test/test_kiwipiepy.py + /opt/python/${{ matrix.cp }}/bin/python -m pytest -svv test/test_kiwipiepy.py - name: Test transformers_addon run: | for v in {12..46} @@ -147,9 +146,8 @@ jobs: cd .. MACOSX_DEPLOYMENT_TARGET=10.14 KIWI_CPU_ARCH=arm64 USE_MIMALLOC=1 python setup.py build - MACOSX_DEPLOYMENT_TARGET=10.14 KIWI_CPU_ARCH=x86_64 USE_MIMALLOC=1 python setup.py build + MACOSX_DEPLOYMENT_TARGET=10.14 KIWI_CPU_ARCH=x86_64 USE_MIMALLOC=1 python setup.py build install python -m pip install numpy==`python .github/workflows/numpy_version.py v1` || true - MACOSX_DEPLOYMENT_TARGET=10.14 KIWI_CPU_ARCH=x86_64 USE_MIMALLOC=1 python setup.py install - name: Archive binary uses: actions/upload-artifact@v3 with: @@ -159,7 +157,7 @@ jobs: - name: Test kiwipiepy run: | python -m pip install pytest - python -m pytest -s --verbose test/test_kiwipiepy.py + python -m pytest -svv test/test_kiwipiepy.py build_windows: name: Build for Windows @@ -194,9 +192,8 @@ jobs: cd model python setup.py build install cd .. 
- $env:USE_MIMALLOC = 1; python setup.py build + $env:USE_MIMALLOC = 1; python setup.py build install Try { python -m pip install numpy==$(python .github/workflows/numpy_version.py v1) } Catch {} - $env:USE_MIMALLOC = 1; python setup.py install - name: Archive binary uses: actions/upload-artifact@v3 with: @@ -206,7 +203,7 @@ jobs: - name: Test kiwipiepy run: | python -m pip install pytest - python -m pytest --verbose test/test_kiwipiepy.py + python -m pytest -svv test/test_kiwipiepy.py build_other_arch: name: Build for manylinux (other arch) @@ -243,15 +240,14 @@ jobs: cd model /opt/python/${{ matrix.cp }}/bin/python setup.py build install cd .. - /opt/python/${{ matrix.cp }}/bin/python setup.py build + /opt/python/${{ matrix.cp }}/bin/python setup.py build install bdist_wheel /opt/python/${{ matrix.cp }}/bin/python -m pip install numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py v1` || true - /opt/python/${{ matrix.cp }}/bin/python setup.py install bdist_wheel tar -zcvf /artifacts/build.tgz build/* cp -r dist /artifacts/ /opt/python/${{ matrix.cp }}/bin/python -m pip install pytest - /opt/python/${{ matrix.cp }}/bin/python -m pytest -s --verbose test/test_kiwipiepy.py + /opt/python/${{ matrix.cp }}/bin/python -m pytest -svv test/test_kiwipiepy.py - name: Archive binary uses: actions/upload-artifact@v3 From c87880b8c711aa3bb6b92cd398cc9f46440c1268 Mon Sep 17 00:00:00 2001 From: bab2min Date: Tue, 29 Oct 2024 09:06:54 +0900 Subject: [PATCH 7/7] Fix workflows --- .github/workflows/pull_request_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pull_request_test.yml b/.github/workflows/pull_request_test.yml index 1986f81..259cf3b 100644 --- a/.github/workflows/pull_request_test.yml +++ b/.github/workflows/pull_request_test.yml @@ -203,7 +203,7 @@ jobs: - name: Test kiwipiepy run: | python -m pip install pytest - python -m pytest -svv test/test_kiwipiepy.py + python -m pytest -vv 
test/test_kiwipiepy.py build_other_arch: name: Build for manylinux (other arch)