Extract bag of words from Wiki dump and save to .txt.gz files

We use gensim to extract the texts from the Wiki dump. The first time we run the following lines, gensim goes through the entire bz2 file and compiles the dictionary, which can take hours.

import gensim.corpora.dictionary
import gensim.corpora.wikicorpus

# First pass: no dictionary is passed, so WikiCorpus scans the whole dump
# and compiles the vocabulary itself (this is the slow step).
wiki = gensim.corpora.WikiCorpus('enwiki-20170420-pages-articles.xml.bz2', processes=8)
wiki.dictionary.save_as_text('wiki_dictionary.txt', sort_by_word=False)

Once the dictionary is saved, constructing the WikiCorpus object a second time is nearly instantaneous, since passing the pre-built dictionary skips the scan of the dump.

import gensim.corpora.dictionary
import gensim.corpora.wikicorpus

# Second pass: load the saved dictionary and hand it to WikiCorpus,
# so the dump does not need to be re-scanned.
wiki_dictionary = gensim.corpora.Dictionary.load_from_text('wiki_dictionary.txt')
wiki = gensim.corpora.WikiCorpus('enwiki-20170420-pages-articles.xml.bz2', processes=2, dictionary=wiki_dictionary)
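As a quick sanity check, a minimal sketch of inspecting the loaded objects (the exact vocabulary size depends on your dump):

print(len(wiki_dictionary))                # number of distinct tokens in the vocabulary
print(wiki.dictionary is wiki_dictionary)  # True: the pre-built dictionary is reused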

We can use the following dump_bow.py to dump the Bag-of-Words of each document in a series of .txt.gz files. Each line of the uncompressed text files represents one document, containing only lower-case words separated by spaces. Punctuation is removed.

usage: dump_bow.py [-h] [-j JOBS] [-p PARTITION_SIZE] [-l LIMIT]
                   [-o OUTPUT_PREFIX]
                   wikidump dictionary

Dump bag-of-words in .txt.gz files

positional arguments:
  wikidump              xxx-pages-articles.xml.bz2 wiki dump file
  dictionary            gensim dictionary .txt file

optional arguments:
  -h, --help            show this help message and exit
  -j JOBS, --jobs JOBS  Number of parallel jobs, default: 2
  -p PARTITION_SIZE, --partition-size PARTITION_SIZE
                        Number of documents in each .txt.gz file, default: 50
  -l LIMIT, --limit LIMIT
                        Total number of documents to dump, or all documents
                        when not specified
  -o OUTPUT_PREFIX, --output-prefix OUTPUT_PREFIX
                        Prefix of dump .txt.gz files, default: dump
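For example, a run that dumps the first 10,000 articles in partitions of 1,000 documents (the file names are the ones used above; adjust paths and sizes to your setup) could look like:

python dump_bow.py -j 8 -p 1000 -l 10000 enwiki-20170420-pages-articles.xml.bz2 wiki_dictionary.txt

The full dump_bow.py follows.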
''' Dump Bag-of-Word from gensim WikiCorpus '''
import gzip
from argparse import ArgumentParser
from gensim.corpora.dictionary import Dictionary
from gensim.corpora.wikicorpus import WikiCorpus

def dump_bow(corpus, partition_size=50, limit=200, output_prefix='dump'):
    ''' Dump Bag-of-Word from gensim WikiCorpus

    Iterate through the documents in the wiki dump and dump the
    Bag-of-Words of the documents in a series of .txt.gz files.
    Each line in the uncompressed file represents one document, with
    only lower-case words separated by spaces.

    PARAMETERS
    -----------
    corpus: gensim.corpora.WikiCorpus
        The Wikidump corpus.
    partition_size: int
        Number of documents in each .txt.gz dump file.
    limit: int or None
        The total number of documents to dump, or None for all
        the documents in the corpus.
    output_prefix: str
        Prefix of the dump files.
    '''
    def write_buffer(buf, output_prefix, partition_id):
        ''' Dump current buffer of Bag-of-Words '''
        fname = '{}-{:06d}.txt.gz'.format(output_prefix, partition_id)
        with gzip.open(fname, 'wt') as partition_file:
            partition_file.write(buf)

    if limit is not None:
        print('Processing {} documents in the corpus...'.format(limit))
    else:
        print('Processing all the documents in the corpus...')

    assert partition_size >= 1
    assert limit is None or limit >= 1
    # gensim 2.0 requires this otherwise the multi-processing locks up
    assert limit is None or partition_size <= limit

    count_documents = 0
    partition_id = 0
    buf = ''

    for bow in corpus.get_texts():
        # In gensim 2.x, get_texts() yields each document as a list of
        # utf-8 encoded byte strings, hence the decode below.
        text = ' '.join([byte_array.decode('utf-8') for byte_array in bow])
        buf += text + '\n'
        count_documents += 1

        if count_documents % 200 == 0:
            print('Processed {} documents.'.format(count_documents))

        if count_documents % partition_size == 0:
            write_buffer(buf, output_prefix, partition_id)
            buf = ''
            partition_id += 1

        if limit is not None and count_documents >= limit:
            break

    if buf:
        write_buffer(buf, output_prefix, partition_id)
    print('Dumped {} documents.'.format(count_documents))

def main():
    ''' Parse arguments and run '''
    parser = ArgumentParser(description='Dump bag-of-words in .txt.gz files')

    parser.add_argument('wikidump', type=str,
                        help='xxx-pages-articles.xml.bz2 wiki dump file')
    parser.add_argument('dictionary', type=str,
                        help='gensim dictionary .txt file')

    parser.add_argument('-j', '--jobs', type=int, default=2,
                        help='Number of parallel jobs, default: 2')
    parser.add_argument('-p', '--partition-size', type=int, default=50,
                        help='Number of documents in each .txt.gz file, default: 50')
    parser.add_argument('-l', '--limit', type=int,
                        help=('Total number of documents to dump, '
                              'or all documents when not specified'))
    parser.add_argument('-o', '--output-prefix', type=str, default='dump',
                        help='Prefix of dump .txt.gz files, default: dump')

    args = parser.parse_args()

    wiki_dictionary = Dictionary.load_from_text(args.dictionary)
    wiki = WikiCorpus(args.wikidump, processes=args.jobs,
                      dictionary=wiki_dictionary)

    dump_bow(wiki, args.partition_size, args.limit,
             output_prefix=args.output_prefix)

if __name__ == '__main__':
    main()
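
The dumped partitions are plain gzipped text, so they can be read back without gensim. A minimal sketch, assuming the default prefix so that the first partition is named dump-000000.txt.gz:

import gzip

with gzip.open('dump-000000.txt.gz', 'rt') as dump_file:
    for line in dump_file:
        tokens = line.split()  # one document per line, lower-case words separated by spaces
        # ... pass tokens to downstream processing ...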