Korean Language Support #98

Open · wants to merge 1 commit into base: master
contextualized_topic_models/utils/preprocessing.py (76 additions, 4 deletions)
@@ -3,6 +3,9 @@
from nltk.corpus import stopwords as stop_words
from gensim.utils import deaccent
import warnings
from konlpy.tag import Okt # for Korean natural language processing.
okt = Okt()
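
For context, KoNLPy's Okt tagger exposes a nouns() method that extracts only the noun tokens from a phrase; the new preprocessing class below relies on it. A minimal sketch of what it returns, assuming konlpy (and the Java runtime it depends on) is installed; the example sentence and printed output are illustrative:

from konlpy.tag import Okt

okt = Okt()
# nouns() drops particles, verb stems and endings, returning noun tokens only.
tokens = okt.nouns("한국어 토픽 모델링 예시 문장입니다")
print(tokens)  # e.g. something like ['한국어', '토픽', '모델링', '예시', '문장']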


class WhiteSpacePreprocessing():
"""
@@ -11,7 +14,6 @@ class WhiteSpacePreprocessing():

def __init__(self, documents, stopwords_language="english", vocabulary_size=2000):
"""

:param documents: list of strings
:param stopwords_language: string of the language of the stopwords (see nltk stopwords)
:param vocabulary_size: the number of most frequent words to include in the documents. Infrequent words will be discarded from the list of preprocessed documents
@@ -28,7 +30,6 @@ def preprocess(self):
"""
Note that documents that contain no words after filtering are removed. That is why the list of
unpreprocessed documents is also returned.

:return: preprocessed documents, unpreprocessed documents and the vocabulary list
"""
preprocessed_docs_tmp = self.documents
@@ -64,7 +65,6 @@ class WhiteSpacePreprocessingStopwords():
def __init__(self, documents, stopwords_list=None, vocabulary_size=2000, max_df=1.0, min_words=1,
remove_numbers=True):
"""

:param documents: list of strings
:param stopwords_list: list of the stopwords to remove
:param vocabulary_size: the number of most frequent words to include in the documents. Infrequent words will be discarded from the list of preprocessed documents
@@ -94,13 +94,13 @@ def preprocess(self):
"""
Note that documents that contain no words after filtering are removed. That is why the list of
unpreprocessed documents is also returned.

:return: preprocessed documents, unpreprocessed documents and the vocabulary list
"""
preprocessed_docs_tmp = self.documents
preprocessed_docs_tmp = [deaccent(doc.lower()) for doc in preprocessed_docs_tmp]
preprocessed_docs_tmp = [doc.translate(
str.maketrans(string.punctuation, ' ' * len(string.punctuation))) for doc in preprocessed_docs_tmp]

if self.remove_numbers:
preprocessed_docs_tmp = [doc.translate(str.maketrans("0123456789", ' ' * len("0123456789")))
for doc in preprocessed_docs_tmp]
@@ -124,4 +124,76 @@ def preprocess(self):

return preprocessed_docs, unpreprocessed_docs, vocabulary

class WhiteSpacePreprocessingStopwordsKorean():
"""
Provides a very simple preprocessing script that filters infrequent tokens from text
"""

def __init__(self, documents, stopwords_list=None, vocabulary_size=2000, max_df=1.0, min_words=1,
remove_numbers=True):
"""
:param documents: list of strings
:param stopwords_list: list of the stopwords to remove
:param vocabulary_size: the number of most frequent words to include in the documents. Infrequent words will be discarded from the list of preprocessed documents
:param max_df: float or int, default=1.0
When building the vocabulary, ignore terms that have a document
frequency strictly higher than the given threshold (corpus-specific
stop words).
If a float in the range [0.0, 1.0], the parameter represents a proportion of
documents; if an integer, absolute counts.
:param min_words: int, default=1. Documents with fewer words than this value
will be removed
:param remove_numbers: bool, default=True. If True, numbers are removed from docs
"""
self.documents = documents
if stopwords_list is not None:
self.stopwords = set(stopwords_list)
else:
self.stopwords = set()  # empty set, so membership checks behave consistently in both branches

self.vocabulary_size = vocabulary_size
self.max_df = max_df
self.min_words = min_words
self.remove_numbers = remove_numbers

def preprocess(self):
"""
Note that documents that contain no words after filtering are removed. That is why the list of
unpreprocessed documents is also returned.
:return: preprocessed documents, unpreprocessed documents and the vocabulary list

For Korean, this method relies on KoNLPy's Okt tokenizer to extract nouns.
"""
preprocessed_docs_tmp = self.documents
preprocessed_docs_tmp = [deaccent(doc.lower()) for doc in preprocessed_docs_tmp]

# Tokenize each document with Okt, keep only the nouns, and rejoin them
# into a whitespace-separated string.
preprocessed_docs_tmp = [' '.join(okt.nouns(doc)) for doc in preprocessed_docs_tmp]

preprocessed_docs_tmp = [' '.join([w for w in doc.split() if len(w) > 0 and w not in self.stopwords])
for doc in preprocessed_docs_tmp]

vectorizer = CountVectorizer(max_features=self.vocabulary_size, max_df=self.max_df)
vectorizer.fit_transform(preprocessed_docs_tmp)
temp_vocabulary = set(vectorizer.get_feature_names())

preprocessed_docs_tmp = [' '.join([w for w in doc.split() if w in temp_vocabulary])
for doc in preprocessed_docs_tmp]

preprocessed_docs, unpreprocessed_docs = [], []
for i, doc in enumerate(preprocessed_docs_tmp):
if len(doc) > 0 and len(doc.split()) >= self.min_words:  # count words, not characters
preprocessed_docs.append(doc)
unpreprocessed_docs.append(self.documents[i])

vocabulary = list(set([item for doc in preprocessed_docs for item in doc.split()]))

return preprocessed_docs, unpreprocessed_docs, vocabulary
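
For anyone trying the new class end to end, a minimal usage sketch; the sample documents and the stopword below are hypothetical, and konlpy must be installed for the module-level Okt() call to succeed:

from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwordsKorean

# Hypothetical Korean sample documents.
documents = [
    "한국어 토픽 모델링은 재미있다",
    "자연어 처리 연구는 어렵지만 보람이 있다",
]

sp = WhiteSpacePreprocessingStopwordsKorean(documents, stopwords_list=["것"], vocabulary_size=2000)
preprocessed, unpreprocessed, vocabulary = sp.preprocess()
print(vocabulary)

Because the tokenizer keeps only nouns, the returned vocabulary contains noun tokens; the stopword list is matched against those nouns after tokenization.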