Extract bag of words from Wiki dump and save to .txt.gz files
We use gensim to extract the text from the Wiki dump. The first time we run the following lines, gensim goes through the entire bz2 file and compiles the dictionary, which can take hours.
import gensim.corpora.wikicorpus
wiki = gensim.corpora.WikiCorpus('enwiki-20170420-pages-articles.xml.bz2', processes=8)
wiki.dictionary.save_as_text('wiki_dictionary.txt', sort_by_word=False)
Once the dictionary is saved, constructing the WikiCorpus object a second time is nearly instantaneous, since we load the saved dictionary instead of recompiling it:
import gensim.corpora.dictionary
import gensim.corpora.wikicorpus
wiki_dictionary = gensim.corpora.Dictionary.load_from_text('wiki_dictionary.txt')
wiki = gensim.corpora.WikiCorpus('enwiki-20170420-pages-articles.xml.bz2', processes=2, dictionary=wiki_dictionary)
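As a quick sanity check (a minimal sketch reusing the wiki object from the snippet above), we can pull the first document out of the corpus. In gensim 2.0, get_texts() yields each document as a list of UTF-8 byte strings, which is why the dump script below decodes them before writing:

first_doc = next(iter(wiki.get_texts()))   # tokens of the first article, as byte strings
print(len(first_doc))                      # number of tokens in that article
print([token.decode('utf-8') for token in first_doc][:10])   # its first ten lower-case words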
We can use the following dump_bow.py to dump the Bag-of-Words of each document in a series of .txt.gz files. Each line of the uncompressed text files represents one document, containing only lower-case words separated by spaces; punctuation is removed.
usage: dump_bow.py [-h] [-j JOBS] [-p PARTITION_SIZE] [-l LIMIT]
                   [-o OUTPUT_PREFIX]
                   wikidump dictionary

Dump bag-of-words in .txt.gz files

positional arguments:
  wikidump              xxx-pages-articles.xml.bz2 wiki dump file
  dictionary            gensim dictionary .txt file

optional arguments:
  -h, --help            show this help message and exit
  -j JOBS, --jobs JOBS  Number of parallel jobs, default: 2
  -p PARTITION_SIZE, --partition-size PARTITION_SIZE
                        Number of documents in each .txt.gz file
  -l LIMIT, --limit LIMIT
                        Total number of documents to dump, or all documents
                        when not specified
  -o OUTPUT_PREFIX, --output-prefix OUTPUT_PREFIX
                        Prefix of dump .txt.gz files, default: dump
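For example, to dump all articles with 8 parallel jobs, writing 100,000 documents per .txt.gz file (the partition size here is only illustrative), we could run:

python dump_bow.py -j 8 -p 100000 enwiki-20170420-pages-articles.xml.bz2 wiki_dictionary.txt

The full dump_bow.py: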
''' Dump Bag-of-Words from gensim WikiCorpus '''
import gzip
from argparse import ArgumentParser

from gensim.corpora.dictionary import Dictionary
from gensim.corpora.wikicorpus import WikiCorpus


def dump_bow(corpus, partition_size=50, limit=200, output_prefix='dump'):
    ''' Dump Bag-of-Words from gensim WikiCorpus

    Iterate through the documents in the wiki dump and dump the
    Bag-of-Words of the documents in a series of .txt.gz files.
    Each line in the uncompressed file represents one document, with
    only lower-case words separated by spaces.

    PARAMETERS
    -----------
    corpus: gensim.corpora.WikiCorpus
        The Wikidump corpus.
    partition_size: int
        Number of documents in each .txt.gz dump file.
    limit: int or None
        The total number of documents to dump, or None for all
        the documents in the corpus.
    output_prefix: str
        Prefix of the dump files.
    '''
    def write_buffer(buf, output_prefix, partition_id):
        ''' Dump current buffer of Bag-of-Words '''
        fname = '{}-{:06d}.txt.gz'.format(output_prefix, partition_id)
        with gzip.open(fname, 'wt') as partition_file:
            partition_file.write(buf)

    if limit is not None:
        print('Processing {} documents in the corpus...'.format(limit))
    else:
        print('Processing all the documents in the corpus...')

    assert partition_size >= 1
    assert limit is None or limit >= 1
    # gensim 2.0 requires this otherwise the multi-processing locks up
    assert limit is None or partition_size <= limit

    count_documents = 0
    partition_id = 0
    buf = ''
    for bow in corpus.get_texts():
        # get_texts() yields lists of UTF-8 byte strings; decode and join into one line
        text = ' '.join([byte_array.decode('utf-8') for byte_array in bow])
        buf += text + '\n'
        count_documents += 1

        if count_documents % 200 == 0:
            print('Processed {} documents.'.format(count_documents))

        # Write out a full partition and start a new one
        if count_documents % partition_size == 0:
            write_buffer(buf, output_prefix, partition_id)
            buf = ''
            partition_id += 1

        if limit is not None and count_documents >= limit:
            break

    # Flush the last, possibly partial, partition
    if buf:
        write_buffer(buf, output_prefix, partition_id)

    print('Dumped {} documents.'.format(count_documents))


def main():
    ''' Parse arguments and run '''
    parser = ArgumentParser(description='Dump bag-of-words in .txt.gz files')
    parser.add_argument('wikidump', type=str,
                        help='xxx-pages-articles.xml.bz2 wiki dump file')
    parser.add_argument('dictionary', type=str,
                        help='gensim dictionary .txt file')
    parser.add_argument('-j', '--jobs', type=int, default=2,
                        help='Number of parallel jobs, default: 2')
    # default mirrors dump_bow()'s partition_size default, so omitting -p does not crash
    parser.add_argument('-p', '--partition-size', type=int, default=50,
                        help='Number of documents in each .txt.gz file')
    parser.add_argument('-l', '--limit', type=int,
                        help=('Total number of documents to dump, '
                              'or all documents when not specified'))
    parser.add_argument('-o', '--output-prefix', type=str, default='dump',
                        help='Prefix of dump .txt.gz files, default: dump')
    args = parser.parse_args()

    wiki_dictionary = Dictionary.load_from_text(args.dictionary)
    wiki = WikiCorpus(args.wikidump, processes=args.jobs,
                      dictionary=wiki_dictionary)
    dump_bow(wiki, args.partition_size, args.limit,
             output_prefix=args.output_prefix)


if __name__ == '__main__':
    main()
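To consume the dumps later, each .txt.gz file can be read back line by line and split on spaces to recover the tokens. A minimal sketch, assuming the default output prefix so that the first partition is named dump-000000.txt.gz:

import gzip

with gzip.open('dump-000000.txt.gz', 'rt') as dump_file:
    for line in dump_file:
        tokens = line.split()   # one document per line, lower-case words separated by spaces
        # ... pass tokens to downstream processing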