Skip to content

Commit

Permalink
Release v1
Browse files Browse the repository at this point in the history
  • Loading branch information
FedericoCinus committed May 7, 2020
1 parent f3b7564 commit 102c013
Show file tree
Hide file tree
Showing 15 changed files with 459 additions and 448 deletions.
95 changes: 50 additions & 45 deletions src/womg/womg/__main__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
'''
Main of the extended version of womg
'''
import click
from womg_core import womgc
from womg.topic.lda_extended import LDA_extended
from womg_core.__main__ import save
from womg_core.utils.distributions import set_seed
#from womg.utils.saver import TxtSaver
from womg.topic.lda_extended import LDAExtended



def womg_main(graph=None,
def womg_main(graph=None, #pylint: disable=too-many-arguments, too-many-locals
docs_path=None,
items_descr=None,
numb_topics=15,
Expand Down Expand Up @@ -98,11 +101,11 @@ def womg_main(graph=None,
'''

set_seed(seed)
topic_model = LDA_extended(numb_topics=numb_topics,
numb_docs=numb_docs,
docs_path=docs_path,
items_descr=items_descr,
progress_bar=progress_bar)
topic_model = LDAExtended(numb_topics=numb_topics,
numb_docs=numb_docs,
docs_path=docs_path,
items_descr=items_descr,
progress_bar=progress_bar)
topic_model.fit()
prop = womgc(graph=graph,
numb_topics=numb_topics,
Expand All @@ -113,6 +116,7 @@ def womg_main(graph=None,
interests=interests,
gn_strength=gn_strength,
infl_strength=infl_strength,
int_mode=int_mode,
seed=seed,
items_descr=topic_model.items_descript,
virality_exp=virality_exp,
Expand All @@ -122,75 +126,76 @@ def womg_main(graph=None,
progress_bar=progress_bar,
path_out=path_out)
prop.topic_model.items_keyw = topic_model.items_keyw

return prop

@click.command()
@click.option('--topics', metavar='K', default=15,
help='Number of topics in the topic model. Default 15. K<d ',
type=int)
help='Number of topics in the topic model. Default 15. K<d ',
type=int)
@click.option('--docs', metavar='D', default=None,
help='Number of docs to be generated. Default 100',
type=int)
help='Number of docs to be generated. Default 100',
type=int)
@click.option('--steps', metavar='T', default=6,
help='Number of time steps for diffusion',
type=int)
help='Number of time steps for diffusion',
type=int)
@click.option('--homophily', metavar='H', default=0.5,
help='0<=H<=1 :degree of homophily decoded from the given network. Default 0.5',
type=click.FloatRange(0, 1, clamp=True))
help='0<=H<=1 :degree of homophily decoded from the given network. Default 0.5',
type=click.FloatRange(0, 1, clamp=True))
@click.option('--gn_strength', default=13.,
help='Influence strength of the god node for initial configuration. Default 13.',
type=float)
help='Influence strength of the god node for initial configuration. Default 13.',
type=float)
@click.option('--infl_strength', type=float, default=None,
help='Influence strength of nodes with respect to interests vecs. Default 12.')
help='Influence strength of nodes with respect to interests vecs. Default 12.')
@click.option('--virality_exp', metavar='V', default=8.,
help='Exponent of the pareto distribution for documents viralities. Default 8.',
type=float)
help='Exponent of the pareto distribution for documents viralities. Default 8.',
type=float)
@click.option('--virality_resistance', metavar='V', default=13.,
help='Virality resistance factor r. Default 13.',
type=float)
help='Virality resistance factor r. Default 13.',
type=float)

@click.option('--graph', default=None,
help='Input path of the graph edgelist or nx object', type=str)
help='Input path of the graph edgelist or nx object', type=str)
@click.option('--interests', default=None,
help='Input path of the ginterests table', type=str)
help='Input path of the ginterests table', type=str)

@click.option('--int_mode', type=str,
help="defines the method for generating nodes' interests. 2 choices: 'rand', 'nmf'. Default 'nmf' ",
default='nmf')
help="defines the method for generating nodes' interests. 2 choices: 'rand', 'nmf'. Default 'nmf' ",
default='nmf')

@click.option('--weighted', is_flag=True,
help='boolean specifying (un)weighted. Default unweighted', default=False)
help='boolean specifying (un)weighted. Default unweighted', default=False)

@click.option('--directed', is_flag=True,
help='graph is (un)directed. Default undirected',
default=False)
help='graph is (un)directed. Default undirected',
default=False)


@click.option('--docs_folder', metavar='DOCS', default=None,
help='Input path of the documents folder', type=str)
help='Input path of the documents folder', type=str)
@click.option('--items_descr', default=None,
help='Input path items description file representing each item in the topics space. Format: topic_index [topic-dim vec]', type=str)
help='Input path items description file representing each item in the topics space. Format: topic_index [topic-dim vec]', type=str)
@click.option('--output', default=None, help='Outputs path')
@click.option('--seed', help='Seed (int) for random distribution extractions',
type=int, required=False)
type=int, required=False)


@click.option('--progress_bar', is_flag=True,
help='boolean for specifying the progress bar related to the environment if True progress_bar=tqdm_notebook -> Jupyter progress_bar; if False progress_bar=tqdm. Default False ',
default=True)
help='boolean for specifying the progress bar related to the environment if True progress_bar=tqdm_notebook -> Jupyter progress_bar; if False progress_bar=tqdm. Default False ',
default=True)

@click.option('--save_int', is_flag=True,
help='if True WoMG saves the interests vector for each node',
default=False)
@click.option('--save_infl', is_flag=True,
help='if True WoMG saves the influence vector for each node',
default=False)
help='if True WoMG saves the interests vector for each node',
default=False)
@click.option('--save_infl', is_flag=True, #pylint: disable=too-many-arguments, too-many-locals
help='if True WoMG saves the influence vector for each node',
default=False)
@click.option('--save_keyw', is_flag=True,
help='if True WoMG saves the keywords in a bow format for each document',
default=False)
help='if True WoMG saves the keywords in a bow format for each document',
default=False)
@click.option('--single_activator', is_flag=True,
help='if True we have at most one activator per item, else god node will activate all nodes beyond threshold',
default=False)
help='if True we have at most one activator per item, else god node will activate all nodes beyond threshold',
default=False)

def main_cli(graph,
items_descr,
Expand Down Expand Up @@ -245,4 +250,4 @@ def main_cli(graph,
save_keyw=save_keyw)

if __name__ == '__main__':
main_cli()
main_cli() # pylint: disable=no-value-for-parameter
34 changes: 12 additions & 22 deletions src/womg/womg/topic/lda_extended.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
# /Topic/lda.py
# Implementation of LDA topic-model
'''
/Topic/lda.py
Implementation of LDA topic-model
'''
import re
import os
import pathlib
import gensim
import pickle
from tqdm import tqdm, tqdm_notebook
import numpy as np
import womg_core
from womg_core.topic.lda import LDA
from womg_core.utils.utility_functions import count_files, read_docs, TopicsError, DocsError
from womg_core.utils.utility_functions import count_files, read_docs
from womg_core.utils.distributions import random_powerlaw_vec

class LDA_extended(LDA):
class LDAExtended(LDA):
'''
Class implementing Latent Dirichlet Allocation as topic model
for topic distribution involved in tlt class model
Expand Down Expand Up @@ -53,7 +52,6 @@ def fit(self):
3. get the items descriptions (topic distribution for each item)
4. get the items keywords (bow list for each item)
'''
#print('\n In fit method there are ', self.numb_docs, 'docs, in', self._docs_path, ' with description ', self._items_descr)
mode = self.set_lda_mode()
if mode == 'load':
super().fit()
Expand Down Expand Up @@ -87,10 +85,7 @@ def set_lda_mode(self):
reading : bool
if True: it will read docs inside the given folder path or input folder
if False: it will use lda for generating docs
'''


if self.numb_docs is None and self._docs_path is not None and self._items_descr is None:
mode = 'get'
path = pathlib.Path(self._docs_path)
Expand Down Expand Up @@ -131,7 +126,6 @@ def get_items_descript(self, path, model):
gammas[item] = np.array([i[1] for i in item_descript])
item += 1
if self.numb_docs == len(gammas.keys()):
#print("Items' distribution over topics is stored")
pass
self.items_descript = gammas

Expand All @@ -142,11 +136,8 @@ def get_items_keyw(self, path):
Get the items keyword in a bow format
'''
docs = read_docs(path)
#print(docs)
preproc_docs = [gensim.parsing.preprocessing.remove_stopwords((sent[0].lower())) for sent in docs if len(sent) !=0]
#print(preproc_docs)
preproc_docs = [gensim.parsing.preprocessing.remove_stopwords((sent[0].lower())) for sent in docs if len(sent) != 0]
data_words = self.sent_to_words(preproc_docs, verbose=False)
#print(data_words)
for item in range(self.numb_docs):
self.items_keyw[item] = self.to_bow(data_words[item])

Expand All @@ -168,7 +159,8 @@ def gen_items_keyw(self, model):
item_keyw.append(self.dictionary[word_index[0]])
self.items_keyw[item] = self.to_bow(item_keyw)

def get_topics_descript(self, model, mtrx_form=False):
@staticmethod
def get_topics_descript(model, mtrx_form=False):
'''
Getting the word distribution for each topic
Expand All @@ -183,8 +175,7 @@ def get_topics_descript(self, model, mtrx_form=False):
'''
if mtrx_form:
return model.get_topics()
else:
return model.print_topics()
return model.print_topics()


def to_bow(self, text):
Expand All @@ -201,7 +192,7 @@ def preprocess_texts(self, docs):
'''
Preprocessing input texts: divides docs into words, bow format
'''
data = [gensim.parsing.preprocessing.remove_stopwords((sent[0].lower())) for sent in docs if len(sent) !=0]
data = [gensim.parsing.preprocessing.remove_stopwords((sent[0].lower())) for sent in docs if len(sent) != 0]

# Remove new line characters
data = [re.sub(r'\s+', ' ', str(sent)) for sent in data]
Expand Down Expand Up @@ -259,7 +250,7 @@ def sent_to_words(self, docs, verbose=True):
return list(data_words)


def train_lda(self, path=None):
def train_lda(self):
'''
Pre-train lda model on a saved corpus for inferring the prior weights of
the distributions given a number of topics, which corresponds to the
Expand All @@ -278,7 +269,6 @@ def train_lda(self, path=None):
'''
print('Training LDA model..')
docs = read_docs(self._training_path)
#print('len docs ', len(docs))
data_words = self.sent_to_words(docs, verbose=False)

self.dictionary = gensim.corpora.Dictionary(data_words)
Expand Down
Loading

0 comments on commit 102c013

Please sign in to comment.