Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prepare 0.20.0 #186

Merged
merged 7 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 9 additions & 13 deletions .github/workflows/pull_request_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:
- name: Test kiwipiepy
run: |
/opt/python/${{ matrix.cp }}/bin/python -m pip install pytest
/opt/python/${{ matrix.cp }}/bin/python -m pytest --verbose test/test_kiwipiepy.py
/opt/python/${{ matrix.cp }}/bin/python -m pytest -svv test/test_kiwipiepy.py
- name: Test transformers_addon
run: |
/opt/python/${{ matrix.cp }}/bin/python -m pip install -U pip
Expand Down Expand Up @@ -92,13 +92,12 @@ jobs:
cd model
/opt/python/${{ matrix.cp }}/bin/python setup.py build install
cd ..
/opt/python/${{ matrix.cp }}/bin/python setup.py build
/opt/python/${{ matrix.cp }}/bin/python setup.py build install
/opt/python/${{ matrix.cp }}/bin/python -m pip install numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py v1` || true
/opt/python/${{ matrix.cp }}/bin/python setup.py install
- name: Test kiwipiepy
run: |
/opt/python/${{ matrix.cp }}/bin/python -m pip install pytest
/opt/python/${{ matrix.cp }}/bin/python -m pytest --verbose test/test_kiwipiepy.py
/opt/python/${{ matrix.cp }}/bin/python -m pytest -svv test/test_kiwipiepy.py
- name: Test transformers_addon
run: |
for v in {12..46}
Expand Down Expand Up @@ -147,9 +146,8 @@ jobs:
cd ..

MACOSX_DEPLOYMENT_TARGET=10.14 KIWI_CPU_ARCH=arm64 USE_MIMALLOC=1 python setup.py build
MACOSX_DEPLOYMENT_TARGET=10.14 KIWI_CPU_ARCH=x86_64 USE_MIMALLOC=1 python setup.py build
MACOSX_DEPLOYMENT_TARGET=10.14 KIWI_CPU_ARCH=x86_64 USE_MIMALLOC=1 python setup.py build install
python -m pip install numpy==`python .github/workflows/numpy_version.py v1` || true
MACOSX_DEPLOYMENT_TARGET=10.14 KIWI_CPU_ARCH=x86_64 USE_MIMALLOC=1 python setup.py install
- name: Archive binary
uses: actions/upload-artifact@v3
with:
Expand All @@ -159,7 +157,7 @@ jobs:
- name: Test kiwipiepy
run: |
python -m pip install pytest
python -m pytest -s --verbose test/test_kiwipiepy.py
python -m pytest -svv test/test_kiwipiepy.py

build_windows:
name: Build for Windows
Expand Down Expand Up @@ -194,9 +192,8 @@ jobs:
cd model
python setup.py build install
cd ..
$env:USE_MIMALLOC = 1; python setup.py build
$env:USE_MIMALLOC = 1; python setup.py build install
Try { python -m pip install numpy==$(python .github/workflows/numpy_version.py v1) } Catch {}
$env:USE_MIMALLOC = 1; python setup.py install
- name: Archive binary
uses: actions/upload-artifact@v3
with:
Expand All @@ -206,7 +203,7 @@ jobs:
- name: Test kiwipiepy
run: |
python -m pip install pytest
python -m pytest --verbose test/test_kiwipiepy.py
python -m pytest -vv test/test_kiwipiepy.py

build_other_arch:
name: Build for manylinux (other arch)
Expand Down Expand Up @@ -243,15 +240,14 @@ jobs:
cd model
/opt/python/${{ matrix.cp }}/bin/python setup.py build install
cd ..
/opt/python/${{ matrix.cp }}/bin/python setup.py build
/opt/python/${{ matrix.cp }}/bin/python setup.py build install bdist_wheel
/opt/python/${{ matrix.cp }}/bin/python -m pip install numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py v1` || true
/opt/python/${{ matrix.cp }}/bin/python setup.py install bdist_wheel

tar -zcvf /artifacts/build.tgz build/*
cp -r dist /artifacts/

/opt/python/${{ matrix.cp }}/bin/python -m pip install pytest
/opt/python/${{ matrix.cp }}/bin/python -m pytest -s --verbose test/test_kiwipiepy.py
/opt/python/${{ matrix.cp }}/bin/python -m pytest -svv test/test_kiwipiepy.py

- name: Archive binary
uses: actions/upload-artifact@v3
Expand Down
18 changes: 10 additions & 8 deletions kiwipiepy/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def tokenize(args, kiwi:Kiwi):
try:
while True:
txt = input('>>> ')
for res in kiwi.analyze(txt, args.top_n, normalize_coda=args.normalize_coda):
for res in kiwi.analyze(txt, args.top_n, normalize_coda=args.normalize_coda, saisiot=args.saisiot):
pprint(res)
except (EOFError, KeyboardInterrupt):
print()
Expand Down Expand Up @@ -45,7 +45,7 @@ def split(args, kiwi:Kiwi):
try:
while True:
txt = input('>>> ')
for res in kiwi.split_into_sents(txt, normalize_coda=args.normalize_coda):
for res in kiwi.split_into_sents(txt, normalize_coda=args.normalize_coda, saisiot=args.saisiot):
pprint(res)
except (EOFError, KeyboardInterrupt):
print()
Expand All @@ -66,13 +66,15 @@ def main(args):
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--model_path')
parser.add_argument('--model_type', default='knlm', choices=['knlm', 'sbg'])
parser.add_argument('--top_n', default=1, type=int)
parser.add_argument('--normalize_coda', default=False, action='store_true')
parser.add_argument('--reset_whitespace', default=False, action='store_true')
parser.add_argument('--model-path')
parser.add_argument('--model-type', default='knlm', choices=['knlm', 'sbg'])
parser.add_argument('--top-n', default=1, type=int)
parser.add_argument('--normalize-coda', default=False, action='store_true')
parser.add_argument('--reset-whitespace', default=False, action='store_true')
parser.add_argument('--task', default='tokenize', choices=['tokenize', 'space', 'join', 'split'])
parser.add_argument('--typos')
parser.add_argument('--typo_cost_threshold', default=2.5, type=float)
parser.add_argument('--typo-cost-threshold', default=2.5, type=float)
parser.add_argument('--saisiot', default=None, action='store_true')
parser.add_argument('--no-saisiot', action='store_false', dest='saisiot')

main(parser.parse_args())
2 changes: 1 addition & 1 deletion kiwipiepy/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.19.1'
__version__ = '0.20.0'
30 changes: 29 additions & 1 deletion kiwipiepy/_wrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -929,6 +929,7 @@ def analyze(self,
z_coda:bool = True,
split_complex:bool = False,
compatible_jamo:bool = False,
saisiot:Optional[bool] = None,
blocklist:Optional[Union[MorphemeSet, Iterable[str]]] = None,
pretokenized:Optional[Union[Callable[[str], PretokenizedTokenList], PretokenizedTokenList]] = None,
) -> List[Tuple[List[Token], float]]:
Expand All @@ -955,6 +956,8 @@ def analyze(self,
이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다.
compatible_jamo: bool
이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다.
saisiot: Optional[bool]
이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다.
blocklist: Union[Iterable[str], MorphemeSet]
이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다.
pretokenized: Union[Callable[[str], PretokenizedTokenList], PretokenizedTokenList]
Expand Down Expand Up @@ -1004,6 +1007,11 @@ def analyze(self,
match_options |= Match.SPLIT_COMPLEX
if compatible_jamo:
match_options |= Match.COMPATIBLE_JAMO
if saisiot is True:
match_options = (match_options & ~Match.MERGE_SAISIOT) | Match.SPLIT_SAISIOT
elif saisiot is False:
match_options = (match_options & ~Match.SPLIT_SAISIOT) | Match.MERGE_SAISIOT

if isinstance(blocklist, MorphemeSet):
if blocklist.kiwi != self:
warnings.warn("This `MorphemeSet` isn't based on current Kiwi object.")
Expand Down Expand Up @@ -1155,6 +1163,7 @@ def _tokenize(self,
z_coda:bool = True,
split_complex:bool = False,
compatible_jamo:bool = False,
saisiot:Optional[bool] = None,
split_sents:bool = False,
stopwords:Optional[Stopwords] = None,
echo:bool = False,
Expand All @@ -1181,6 +1190,11 @@ def _refine_result_with_echo(arg):
match_options |= Match.SPLIT_COMPLEX
if compatible_jamo:
match_options |= Match.COMPATIBLE_JAMO

if saisiot is True:
match_options = (match_options & ~Match.MERGE_SAISIOT) | Match.SPLIT_SAISIOT
elif saisiot is False:
match_options = (match_options & ~Match.SPLIT_SAISIOT) | Match.MERGE_SAISIOT

if isinstance(blocklist, MorphemeSet):
if blocklist.kiwi != self:
Expand Down Expand Up @@ -1209,6 +1223,7 @@ def tokenize(self,
z_coda:bool = True,
split_complex:bool = False,
compatible_jamo:bool = False,
saisiot:Optional[bool] = None,
split_sents:bool = False,
stopwords:Optional[Stopwords] = None,
echo:bool = False,
Expand Down Expand Up @@ -1249,6 +1264,13 @@ def tokenize(self,

True인 경우 분석 결과의 첫가끝 자모를 호환용 자모로 변환하여 출력합니다.
예를 들어 "ᆫ다/EF"는 "ㄴ다/EF"로, "ᆯ/ETM"은 "ㄹ/ETM"으로 변환됩니다.
saisiot: Optional[bool]

.. versionadded:: 0.20.0

True인 경우 합성명사의 사이시옷을 분리하여 출력하고, False인 경우 사이시옷이 포함된 것으로 추정되는 합성명사를 결합하여 출력합니다.
None인 경우 별도의 사이시옷 처리 없이 Kiwi 기본 사전에 등재된 명사 사전에 기반해 분석합니다.

split_sents: bool
.. versionadded:: 0.10.3

Expand Down Expand Up @@ -1387,7 +1409,8 @@ def tokenize(self,
Token(form='.', tag='SF', start=25, len=1)]
```
'''
return self._tokenize(text, match_options, normalize_coda, z_coda, split_complex, compatible_jamo,
return self._tokenize(text, match_options, normalize_coda,
z_coda, split_complex, compatible_jamo, saisiot,
split_sents, stopwords, echo,
blocklist=blocklist,
pretokenized=pretokenized
Expand All @@ -1400,6 +1423,7 @@ def split_into_sents(self,
z_coda:bool = True,
split_complex:bool = False,
compatible_jamo:bool = False,
saisiot:Optional[bool] = None,
stopwords:Optional[Stopwords] = None,
blocklist:Optional[Union[Iterable[str], MorphemeSet]] = None,
return_tokens:bool = False,
Expand All @@ -1426,6 +1450,8 @@ def split_into_sents(self,
이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다.
compatible_jamo: bool
이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다.
saisiot: Optional[bool]
이 인자는 `Kiwi.tokenize`에서와 동일한 역할을 수행합니다.
stopwords: Stopwords

.. versionadded:: 0.16.0
Expand Down Expand Up @@ -1532,6 +1558,7 @@ def _make_result(arg):
z_coda=z_coda,
split_complex=split_complex,
compatible_jamo=compatible_jamo,
saisiot=saisiot,
blocklist=blocklist,
split_sents=True), text))

Expand All @@ -1541,6 +1568,7 @@ def _make_result(arg):
z_coda=z_coda,
split_complex=split_complex,
compatible_jamo=compatible_jamo,
saisiot=saisiot,
blocklist=blocklist,
split_sents=True,
echo=True))
Expand Down
32 changes: 32 additions & 0 deletions kiwipiepy/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,88 +11,120 @@ class Match(IntEnum):
분석 시 특수한 문자열 패턴 중 어떤 것들을 추출할 지 선택할 수 있습니다.
bitwise OR 연산으로 여러 개 선택하여 사용가능합니다.
"""

URL = 1 << 0
""" 인터넷 주소 형태의 텍스트를 W_URL이라는 태그로 추출합니다. """

EMAIL = 1 << 1
""" 이메일 주소 형태의 텍스트를 W_EMAIL이라는 태그로 추출합니다. """

HASHTAG = 1 << 2
""" 해시태그(#해시태그) 형태의 텍스트를 W_HASHTAG라는 태그로 추출합니다. """

MENTION = 1 << 3
"""
멘션(@멘션) 형태의 텍스트를 W_MENTION이라는 태그로 추출합니다.

.. versionadded:: 0.8.2
"""

SERIAL = 1 << 4
"""
일련번호 형태의 텍스트를 W_SERIAL이라는 태그로 추출합니다.

.. versionadded:: 0.14.0
"""

EMOJI = 1 << 5
"""
이모지 형태의 텍스트를 W_EMOJI라는 태그로 추출합니다.

.. versionadded:: 0.18.0
"""

ALL = URL | EMAIL | HASHTAG | MENTION | SERIAL | EMOJI
""" URL, EMAIL, HASHTAG, MENTION, SERIAL, EMOJI을 모두 사용합니다. """

NORMALIZING_CODA = 1 << 16
""" '먹었엌ㅋㅋ'처럼 받침이 덧붙어서 분석에 실패하는 경우, 받침을 분리하여 정규화합니다. """

JOIN_NOUN_PREFIX = 1 << 17
"""
명사의 접두사를 분리하지 않고 결합합니다. 풋/XPN 사과/NNG -> 풋사과/NNG

.. versionadded:: 0.11.0
"""

JOIN_NOUN_SUFFIX = 1 << 18
"""
명사의 접미사를 분리하지 않고 결합합니다. 사과/NNG 들/XSN -> 사과들/NNG

.. versionadded:: 0.11.0
"""

JOIN_VERB_SUFFIX = 1 << 19
"""
동사 파생접미사를 분리하지 않고 결합합니다. 사랑/NNG 하/XSV 다/EF -> 사랑하/VV 다/EF

.. versionadded:: 0.11.0
"""

JOIN_ADJ_SUFFIX = 1 << 20
"""
형용사 파생접미사를 분리하지 않고 결합합니다. 매콤/XR 하/XSA 다/EF -> 매콤하/VA 다/EF

.. versionadded:: 0.11.0
"""

JOIN_ADV_SUFFIX = 1 << 21
"""
부사 파생접미사를 분리하지 않고 결합합니다. 요란/XR 히/XSM -> 요란히/MAG

.. versionadded:: 0.15.0
"""

SPLIT_COMPLEX = 1 << 22
"""
더 잘게 분할 가능한 형태소를 모두 분할합니다. 고마움/NNG -> 고맙/VA-I 음/ETN

.. versionadded:: 0.15.0
"""

Z_CODA = 1 << 23
"""
조사/어미에 덧붙은 받침을 Z_CODA 태그로 분리합니다. 했어욗 -> 하/VV 었/EP 어요/EF ㄳ/Z_CODA

.. versionadded:: 0.15.0
"""

COMPATIBLE_JAMO = 1 << 24
"""
형태소 분석 결과 출력 시 첫가끝 자모를 호환용 자모로 변환합니다.

.. versionadded:: 0.18.1
"""

SPLIT_SAISIOT = 1 << 25
"""
사이시옷이 포함된 합성명사를 분리합니다. 만둣국 -> 만두/NNG ᆺ/Z_SIOT 국/NNG

.. versionadded:: 0.20.0
"""

MERGE_SAISIOT = 1 << 26
"""
사이시옷이 포함된 것으로 추정되는 명사를 결합합니다. 만둣국 -> 만둣국/NNG

.. versionadded:: 0.20.0
"""

JOIN_V_SUFFIX = JOIN_VERB_SUFFIX | JOIN_ADJ_SUFFIX
"""
동사/형용사형 파생접미사를 분리하지 않고 결합합니다.

.. versionadded:: 0.11.0
"""

JOIN_AFFIX = JOIN_NOUN_PREFIX | JOIN_NOUN_SUFFIX | JOIN_V_SUFFIX | JOIN_ADV_SUFFIX
"""
모든 접두사/접미사를 분리하지 않고 결합합니다.
Expand Down
2 changes: 1 addition & 1 deletion model/kiwipiepy_model/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.19.0'
__version__ = '0.20.0'
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def build_extension(self, ext):
keywords='Korean morphological analysis',
install_requires=[
'dataclasses; python_version < "3.7"',
'kiwipiepy_model>=0.19,<0.20',
'kiwipiepy_model>=0.20,<0.21',
'numpy<2; python_version < "3.9"',
'numpy; python_version >= "3.9"',
'tqdm',
Expand Down
13 changes: 13 additions & 0 deletions test/test_kiwipiepy.py
Original file line number Diff line number Diff line change
Expand Up @@ -895,3 +895,16 @@ def test_issue_176():
text = "접사를 결합해 출력합니다."
tokens = kiwi.tokenize(text, match_options=Match.JOIN_AFFIX)
assert kiwi.join(tokens) == text

def test_saisiot():
    """Check both modes of the ``saisiot`` option on saisiot compounds.

    With ``saisiot=True`` each compound noun must be analysed as exactly
    three tokens tagged NNG / Z_SIOT / NNG; with ``saisiot=False`` it must
    come back as a single NNG token.
    """
    kiwi = Kiwi()
    compounds = ("하굣길", "만둣국", "나뭇잎", "세숫물", "고춧가루", "시곗바늘", "사글셋방")
    for word in compounds:
        # Split mode: the interfix siot is emitted as its own Z_SIOT token.
        split_tags = [token.tag for token in kiwi.tokenize(word, saisiot=True)]
        assert split_tags == ["NNG", "Z_SIOT", "NNG"]

        # Merge mode: the whole compound stays a single common noun.
        merged_tags = [token.tag for token in kiwi.tokenize(word, saisiot=False)]
        assert merged_tags == ["NNG"]
Loading