Skip to content

Commit

Permalink
Merge pull request #186 from bab2min/dev/0200
Browse files Browse the repository at this point in the history
Prepare 0.20.0
  • Loading branch information
bab2min authored Oct 29, 2024
2 parents 38a64d4 + c87880b commit 2a46cdc
Show file tree
Hide file tree
Showing 9 changed files with 97 additions and 26 deletions.
22 changes: 9 additions & 13 deletions .github/workflows/pull_request_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:
- name: Test kiwipiepy
run: |
/opt/python/${{ matrix.cp }}/bin/python -m pip install pytest
/opt/python/${{ matrix.cp }}/bin/python -m pytest --verbose test/test_kiwipiepy.py
/opt/python/${{ matrix.cp }}/bin/python -m pytest -svv test/test_kiwipiepy.py
- name: Test transformers_addon
run: |
/opt/python/${{ matrix.cp }}/bin/python -m pip install -U pip
Expand Down Expand Up @@ -92,13 +92,12 @@ jobs:
cd model
/opt/python/${{ matrix.cp }}/bin/python setup.py build install
cd ..
/opt/python/${{ matrix.cp }}/bin/python setup.py build
/opt/python/${{ matrix.cp }}/bin/python setup.py build install
/opt/python/${{ matrix.cp }}/bin/python -m pip install numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py v1` || true
/opt/python/${{ matrix.cp }}/bin/python setup.py install
- name: Test kiwipiepy
run: |
/opt/python/${{ matrix.cp }}/bin/python -m pip install pytest
/opt/python/${{ matrix.cp }}/bin/python -m pytest --verbose test/test_kiwipiepy.py
/opt/python/${{ matrix.cp }}/bin/python -m pytest -svv test/test_kiwipiepy.py
- name: Test transformers_addon
run: |
for v in {12..46}
Expand Down Expand Up @@ -147,9 +146,8 @@ jobs:
cd ..
MACOSX_DEPLOYMENT_TARGET=10.14 KIWI_CPU_ARCH=arm64 USE_MIMALLOC=1 python setup.py build
MACOSX_DEPLOYMENT_TARGET=10.14 KIWI_CPU_ARCH=x86_64 USE_MIMALLOC=1 python setup.py build
MACOSX_DEPLOYMENT_TARGET=10.14 KIWI_CPU_ARCH=x86_64 USE_MIMALLOC=1 python setup.py build install
python -m pip install numpy==`python .github/workflows/numpy_version.py v1` || true
MACOSX_DEPLOYMENT_TARGET=10.14 KIWI_CPU_ARCH=x86_64 USE_MIMALLOC=1 python setup.py install
- name: Archive binary
uses: actions/upload-artifact@v3
with:
Expand All @@ -159,7 +157,7 @@ jobs:
- name: Test kiwipiepy
run: |
python -m pip install pytest
python -m pytest -s --verbose test/test_kiwipiepy.py
python -m pytest -svv test/test_kiwipiepy.py
build_windows:
name: Build for Windows
Expand Down Expand Up @@ -194,9 +192,8 @@ jobs:
cd model
python setup.py build install
cd ..
$env:USE_MIMALLOC = 1; python setup.py build
$env:USE_MIMALLOC = 1; python setup.py build install
Try { python -m pip install numpy==$(python .github/workflows/numpy_version.py v1) } Catch {}
$env:USE_MIMALLOC = 1; python setup.py install
- name: Archive binary
uses: actions/upload-artifact@v3
with:
Expand All @@ -206,7 +203,7 @@ jobs:
- name: Test kiwipiepy
run: |
python -m pip install pytest
python -m pytest --verbose test/test_kiwipiepy.py
python -m pytest -vv test/test_kiwipiepy.py
build_other_arch:
name: Build for manylinux (other arch)
Expand Down Expand Up @@ -243,15 +240,14 @@ jobs:
cd model
/opt/python/${{ matrix.cp }}/bin/python setup.py build install
cd ..
/opt/python/${{ matrix.cp }}/bin/python setup.py build
/opt/python/${{ matrix.cp }}/bin/python setup.py build install bdist_wheel
/opt/python/${{ matrix.cp }}/bin/python -m pip install numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py v1` || true
/opt/python/${{ matrix.cp }}/bin/python setup.py install bdist_wheel
tar -zcvf /artifacts/build.tgz build/*
cp -r dist /artifacts/
/opt/python/${{ matrix.cp }}/bin/python -m pip install pytest
/opt/python/${{ matrix.cp }}/bin/python -m pytest -s --verbose test/test_kiwipiepy.py
/opt/python/${{ matrix.cp }}/bin/python -m pytest -svv test/test_kiwipiepy.py
- name: Archive binary
uses: actions/upload-artifact@v3
Expand Down
18 changes: 10 additions & 8 deletions kiwipiepy/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def tokenize(args, kiwi:Kiwi):
try:
while True:
txt = input('>>> ')
for res in kiwi.analyze(txt, args.top_n, normalize_coda=args.normalize_coda):
for res in kiwi.analyze(txt, args.top_n, normalize_coda=args.normalize_coda, saisiot=args.saisiot):
pprint(res)
except (EOFError, KeyboardInterrupt):
print()
Expand Down Expand Up @@ -45,7 +45,7 @@ def split(args, kiwi:Kiwi):
try:
while True:
txt = input('>>> ')
for res in kiwi.split_into_sents(txt, normalize_coda=args.normalize_coda):
for res in kiwi.split_into_sents(txt, normalize_coda=args.normalize_coda, saisiot=args.saisiot):
pprint(res)
except (EOFError, KeyboardInterrupt):
print()
Expand All @@ -66,13 +66,15 @@ def main(args):
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--model_path')
parser.add_argument('--model_type', default='knlm', choices=['knlm', 'sbg'])
parser.add_argument('--top_n', default=1, type=int)
parser.add_argument('--normalize_coda', default=False, action='store_true')
parser.add_argument('--reset_whitespace', default=False, action='store_true')
parser.add_argument('--model-path')
parser.add_argument('--model-type', default='knlm', choices=['knlm', 'sbg'])
parser.add_argument('--top-n', default=1, type=int)
parser.add_argument('--normalize-coda', default=False, action='store_true')
parser.add_argument('--reset-whitespace', default=False, action='store_true')
parser.add_argument('--task', default='tokenize', choices=['tokenize', 'space', 'join', 'split'])
parser.add_argument('--typos')
parser.add_argument('--typo_cost_threshold', default=2.5, type=float)
parser.add_argument('--typo-cost-threshold', default=2.5, type=float)
parser.add_argument('--saisiot', default=None, action='store_true')
parser.add_argument('--no-saisiot', action='store_false', dest='saisiot')

main(parser.parse_args())
2 changes: 1 addition & 1 deletion kiwipiepy/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.19.1'
__version__ = '0.20.0'
30 changes: 29 additions & 1 deletion kiwipiepy/_wrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -929,6 +929,7 @@ def analyze(self,
z_coda:bool = True,
split_complex:bool = False,
compatible_jamo:bool = False,
saisiot:Optional[bool] = None,
blocklist:Optional[Union[MorphemeSet, Iterable[str]]] = None,
pretokenized:Optional[Union[Callable[[str], PretokenizedTokenList], PretokenizedTokenList]] = None,
) -> List[Tuple[List[Token], float]]:
Expand All @@ -955,6 +956,8 @@ def analyze(self,
이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다.
compatible_jamo: bool
이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다.
saisiot: Optional[bool]
이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다.
blocklist: Union[Iterable[str], MorphemeSet]
이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다.
pretokenized: Union[Callable[[str], PretokenizedTokenList], PretokenizedTokenList]
Expand Down Expand Up @@ -1004,6 +1007,11 @@ def analyze(self,
match_options |= Match.SPLIT_COMPLEX
if compatible_jamo:
match_options |= Match.COMPATIBLE_JAMO
if saisiot is True:
match_options = (match_options & ~Match.MERGE_SAISIOT) | Match.SPLIT_SAISIOT
elif saisiot is False:
match_options = (match_options & ~Match.SPLIT_SAISIOT) | Match.MERGE_SAISIOT

if isinstance(blocklist, MorphemeSet):
if blocklist.kiwi != self:
warnings.warn("This `MorphemeSet` isn't based on current Kiwi object.")
Expand Down Expand Up @@ -1155,6 +1163,7 @@ def _tokenize(self,
z_coda:bool = True,
split_complex:bool = False,
compatible_jamo:bool = False,
saisiot:Optional[bool] = None,
split_sents:bool = False,
stopwords:Optional[Stopwords] = None,
echo:bool = False,
Expand All @@ -1181,6 +1190,11 @@ def _refine_result_with_echo(arg):
match_options |= Match.SPLIT_COMPLEX
if compatible_jamo:
match_options |= Match.COMPATIBLE_JAMO

if saisiot is True:
match_options = (match_options & ~Match.MERGE_SAISIOT) | Match.SPLIT_SAISIOT
elif saisiot is False:
match_options = (match_options & ~Match.SPLIT_SAISIOT) | Match.MERGE_SAISIOT

if isinstance(blocklist, MorphemeSet):
if blocklist.kiwi != self:
Expand Down Expand Up @@ -1209,6 +1223,7 @@ def tokenize(self,
z_coda:bool = True,
split_complex:bool = False,
compatible_jamo:bool = False,
saisiot:Optional[bool] = None,
split_sents:bool = False,
stopwords:Optional[Stopwords] = None,
echo:bool = False,
Expand Down Expand Up @@ -1249,6 +1264,13 @@ def tokenize(self,
True인 경우 분석 결과의 첫가끝 자모를 호환용 자모로 변환하여 출력합니다.
예를 들어 "ᆫ다/EF"는 "ㄴ다/EF"로, "ᆯ/ETM"은 "ㄹ/ETM"으로 변환됩니다.
saisiot: Optional[bool]
.. versionadded:: 0.20.0
True인 경우 합성명사의 사이시옷을 분리하여 출력하고, False인 경우 사이시옷이 포함된 것으로 추정되는 합성명사를 결합하여 출력합니다.
None인 경우 별도의 사이시옷 처리 없이 Kiwi 기본 사전에 등재된 명사 사전에 기반해 분석합니다.
split_sents: bool
.. versionadded:: 0.10.3
Expand Down Expand Up @@ -1387,7 +1409,8 @@ def tokenize(self,
Token(form='.', tag='SF', start=25, len=1)]
```
'''
return self._tokenize(text, match_options, normalize_coda, z_coda, split_complex, compatible_jamo,
return self._tokenize(text, match_options, normalize_coda,
z_coda, split_complex, compatible_jamo, saisiot,
split_sents, stopwords, echo,
blocklist=blocklist,
pretokenized=pretokenized
Expand All @@ -1400,6 +1423,7 @@ def split_into_sents(self,
z_coda:bool = True,
split_complex:bool = False,
compatible_jamo:bool = False,
saisiot:Optional[bool] = None,
stopwords:Optional[Stopwords] = None,
blocklist:Optional[Union[Iterable[str], MorphemeSet]] = None,
return_tokens:bool = False,
Expand All @@ -1426,6 +1450,8 @@ def split_into_sents(self,
이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다.
compatible_jamo: bool
이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다.
saisiot: Optional[bool]
이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다.
stopwords: Stopwords
.. versionadded:: 0.16.0
Expand Down Expand Up @@ -1532,6 +1558,7 @@ def _make_result(arg):
z_coda=z_coda,
split_complex=split_complex,
compatible_jamo=compatible_jamo,
saisiot=saisiot,
blocklist=blocklist,
split_sents=True), text))

Expand All @@ -1541,6 +1568,7 @@ def _make_result(arg):
z_coda=z_coda,
split_complex=split_complex,
compatible_jamo=compatible_jamo,
saisiot=saisiot,
blocklist=blocklist,
split_sents=True,
echo=True))
Expand Down
32 changes: 32 additions & 0 deletions kiwipiepy/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,88 +11,120 @@ class Match(IntEnum):
분석 시 특수한 문자열 패턴 중 어떤 것들을 추출할 지 선택할 수 있습니다.
bitwise OR 연산으로 여러 개 선택하여 사용가능합니다.
"""

URL = 1 << 0
""" 인터넷 주소 형태의 텍스트를 W_URL이라는 태그로 추출합니다. """

EMAIL = 1 << 1
""" 이메일 주소 형태의 텍스트를 W_EMAIL이라는 태그로 추출합니다. """

HASHTAG = 1 << 2
""" 해시태그(#해시태그) 형태의 텍스트를 W_HASHTAG라는 태그로 추출합니다. """

MENTION = 1 << 3
"""
멘션(@멘션) 형태의 텍스트를 W_MENTION이라는 태그로 추출합니다.
.. versionadded:: 0.8.2
"""

SERIAL = 1 << 4
"""
일련번호 형태의 텍스트를 W_SERIAL이라는 태그로 추출합니다.
.. versionadded:: 0.14.0
"""

EMOJI = 1 << 5
"""
이모지 형태의 텍스트를 W_EMOJI라는 태그로 추출합니다.
.. versionadded:: 0.18.0
"""

ALL = URL | EMAIL | HASHTAG | MENTION | SERIAL | EMOJI
""" URL, EMAIL, HASHTAG, MENTION, SERIAL, EMOJI을 모두 사용합니다. """

NORMALIZING_CODA = 1 << 16
""" '먹었엌ㅋㅋ'처럼 받침이 덧붙어서 분석에 실패하는 경우, 받침을 분리하여 정규화합니다. """

JOIN_NOUN_PREFIX = 1 << 17
"""
명사의 접두사를 분리하지 않고 결합합니다. 풋/XPN 사과/NNG -> 풋사과/NNG
.. versionadded:: 0.11.0
"""

JOIN_NOUN_SUFFIX = 1 << 18
"""
명사의 접미사를 분리하지 않고 결합합니다. 사과/NNG 들/XSN -> 사과들/NNG
.. versionadded:: 0.11.0
"""

JOIN_VERB_SUFFIX = 1 << 19
"""
동사 파생접미사를 분리하지 않고 결합합니다. 사랑/NNG 하/XSV 다/EF -> 사랑하/VV 다/EF
.. versionadded:: 0.11.0
"""

JOIN_ADJ_SUFFIX = 1 << 20
"""
형용사 파생접미사를 분리하지 않고 결합합니다. 매콤/XR 하/XSA 다/EF -> 매콤하/VA 다/EF
.. versionadded:: 0.11.0
"""

JOIN_ADV_SUFFIX = 1 << 21
"""
부사 파생접미사를 분리하지 않고 결합합니다. 요란/XR 히/XSM -> 요란히/MAG
.. versionadded:: 0.15.0
"""

SPLIT_COMPLEX = 1 << 22
"""
더 잘게 분할 가능한 형태소를 모두 분할합니다. 고마움/NNG -> 고맙/VA-I 음/ETN
.. versionadded:: 0.15.0
"""

Z_CODA = 1 << 23
"""
조사/어미에 덧붙은 받침을 Z_CODA 태그로 분리합니다. 했어욗 -> 하/VV 었/EP 어요/EF ㄳ/Z_CODA
.. versionadded:: 0.15.0
"""

COMPATIBLE_JAMO = 1 << 24
"""
형태소 분석 결과 출력 시 첫가끝 자모를 호환용 자모로 변환합니다.
.. versionadded:: 0.18.1
"""

SPLIT_SAISIOT = 1 << 25
"""
사이시옷이 포함된 합성명사를 분리합니다. 만둣국 -> 만두/NNG ᆺ/Z_SIOT 국/NNG
.. versionadded:: 0.20.0
"""

MERGE_SAISIOT = 1 << 26
"""
사이시옷이 포함된 것으로 추정되는 명사를 결합합니다. 만둣국 -> 만둣국/NNG
.. versionadded:: 0.20.0
"""

JOIN_V_SUFFIX = JOIN_VERB_SUFFIX | JOIN_ADJ_SUFFIX
"""
동사/형용사형 파생접미사를 분리하지 않고 결합합니다.
.. versionadded:: 0.11.0
"""

JOIN_AFFIX = JOIN_NOUN_PREFIX | JOIN_NOUN_SUFFIX | JOIN_V_SUFFIX | JOIN_ADV_SUFFIX
"""
모든 접두사/접미사를 분리하지 않고 결합합니다.
Expand Down
2 changes: 1 addition & 1 deletion model/kiwipiepy_model/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.19.0'
__version__ = '0.20.0'
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def build_extension(self, ext):
keywords='Korean morphological analysis',
install_requires=[
'dataclasses; python_version < "3.7"',
'kiwipiepy_model>=0.19,<0.20',
'kiwipiepy_model>=0.20,<0.21',
'numpy<2; python_version < "3.9"',
'numpy; python_version >= "3.9"',
'tqdm',
Expand Down
13 changes: 13 additions & 0 deletions test/test_kiwipiepy.py
Original file line number Diff line number Diff line change
Expand Up @@ -895,3 +895,16 @@ def test_issue_176():
text = "접사를 결합해 출력합니다."
tokens = kiwi.tokenize(text, match_options=Match.JOIN_AFFIX)
assert kiwi.join(tokens) == text

def test_saisiot():
    """Check both explicit settings of the ``saisiot`` option of ``Kiwi.tokenize``.

    For every sample compound noun:
    * ``saisiot=True`` must yield exactly three tokens tagged
      NNG, Z_SIOT, NNG;
    * ``saisiot=False`` must yield exactly one token tagged NNG.
    """
    kiwi = Kiwi()
    samples = ["하굣길", "만둣국", "나뭇잎", "세숫물", "고춧가루", "시곗바늘", "사글셋방"]
    for word in samples:
        # Split mode: comparing the whole tag sequence checks both the
        # token count and each individual tag.
        split_tags = [tok.tag for tok in kiwi.tokenize(word, saisiot=True)]
        assert split_tags == ["NNG", "Z_SIOT", "NNG"]

        # Merge mode: the word must come back as a single noun token.
        merged_tags = [tok.tag for tok in kiwi.tokenize(word, saisiot=False)]
        assert merged_tags == ["NNG"]

0 comments on commit 2a46cdc

Please sign in to comment.