Release v1

FedericoCinus · May 7, 2020 · 102c013 · 102c013
1 parent f3b7564
commit 102c013
Show file tree

Hide file tree

Showing 15 changed files with 459 additions and 448 deletions.
diff --git a/src/womg/womg/__main__.py b/src/womg/womg/__main__.py
@@ -1,12 +1,15 @@
+'''
+Main of the extended version of womg
+'''
 import click
 from womg_core import womgc
-from womg.topic.lda_extended import LDA_extended
+from womg_core.__main__ import save
 from womg_core.utils.distributions import set_seed
-#from womg.utils.saver import TxtSaver
+from womg.topic.lda_extended import LDAExtended
 
 
 
-def womg_main(graph=None,
+def womg_main(graph=None, #pylint: disable=too-many-arguments, too-many-locals
               docs_path=None,
               items_descr=None,
               numb_topics=15,
@@ -98,11 +101,11 @@ def womg_main(graph=None,
     '''
 
     set_seed(seed)
-    topic_model = LDA_extended(numb_topics=numb_topics,
-                      numb_docs=numb_docs,
-                      docs_path=docs_path,
-                      items_descr=items_descr,
-                      progress_bar=progress_bar)
+    topic_model = LDAExtended(numb_topics=numb_topics,
+                              numb_docs=numb_docs,
+                              docs_path=docs_path,
+                              items_descr=items_descr,
+                              progress_bar=progress_bar)
     topic_model.fit()
     prop = womgc(graph=graph,
                  numb_topics=numb_topics,
@@ -113,6 +116,7 @@ def womg_main(graph=None,
                  interests=interests,
                  gn_strength=gn_strength,
                  infl_strength=infl_strength,
+                 int_mode=int_mode,
                  seed=seed,
                  items_descr=topic_model.items_descript,
                  virality_exp=virality_exp,
@@ -122,75 +126,76 @@ def womg_main(graph=None,
                  progress_bar=progress_bar,
                  path_out=path_out)
     prop.topic_model.items_keyw = topic_model.items_keyw
+
     return prop
 
 @click.command()
 @click.option('--topics', metavar='K', default=15,
-                    help='Number of topics in the topic model. Default 15. K<d ',
-                    type=int)
+              help='Number of topics in the topic model. Default 15. K<d ',
+              type=int)
 @click.option('--docs', metavar='D', default=None,
-                    help='Number of docs to be generated. Default 100',
-                    type=int)
+              help='Number of docs to be generated. Default 100',
+              type=int)
 @click.option('--steps', metavar='T', default=6,
-                    help='Number of time steps for diffusion',
-                    type=int)
+              help='Number of time steps for diffusion',
+              type=int)
 @click.option('--homophily', metavar='H', default=0.5,
-                    help='0<=H<=1 :degree of homophily decoded from the given network. Default 0.5',
-                    type=click.FloatRange(0, 1, clamp=True))
+              help='0<=H<=1 :degree of homophily decoded from the given network. Default 0.5',
+              type=click.FloatRange(0, 1, clamp=True))
 @click.option('--gn_strength', default=13.,
-                    help='Influence strength of the god node for initial configuration. Default 13.',
-                    type=float)
+              help='Influence strength of the god node for initial configuration. Default 13.',
+              type=float)
 @click.option('--infl_strength', type=float, default=None,
-                    help='Influence strength of nodes with respect to interests vecs. Default 12.')
+              help='Influence strength of nodes with respect to interests vecs. Default 12.')
 @click.option('--virality_exp', metavar='V', default=8.,
-                    help='Exponent of the pareto distribution for documents viralities. Default 8.',
-                    type=float)
+              help='Exponent of the pareto distribution for documents viralities. Default 8.',
+              type=float)
 @click.option('--virality_resistance', metavar='V', default=13.,
-                    help='Virality resistance factor r. Default 13.',
-                    type=float)
+              help='Virality resistance factor r. Default 13.',
+              type=float)
 
 @click.option('--graph', default=None,
-                    help='Input path of the graph edgelist or nx object', type=str)
+              help='Input path of the graph edgelist or nx object', type=str)
 @click.option('--interests', default=None,
-                    help='Input path of the ginterests table', type=str)
+              help='Input path of the ginterests table', type=str)
 
 @click.option('--int_mode', type=str,
-                    help="defines the method for generating nodes' interests. 2 choices: 'rand', 'nmf'. Default 'nmf' ",
-                    default='nmf')
+              help="defines the method for generating nodes' interests. 2 choices: 'rand', 'nmf'. Default 'nmf' ",
+              default='nmf')
 
 @click.option('--weighted', is_flag=True,
-                    help='boolean specifying (un)weighted. Default  unweighted', default=False)
+              help='boolean specifying (un)weighted. Default  unweighted', default=False)
 
 @click.option('--directed', is_flag=True,
-                    help='graph is (un)directed. Default  undirected',
-                    default=False)
+              help='graph is (un)directed. Default  undirected',
+              default=False)
 
 
 @click.option('--docs_folder', metavar='DOCS', default=None,
-                    help='Input  path of the documents folder', type=str)
+              help='Input  path of the documents folder', type=str)
 @click.option('--items_descr', default=None,
-                    help='Input  path items description file representing each item in the topics space. Format: topic_index [topic-dim vec]', type=str)
+              help='Input  path items description file representing each item in the topics space. Format: topic_index [topic-dim vec]', type=str)
 @click.option('--output', default=None, help='Outputs path')
 @click.option('--seed', help='Seed (int) for random distribution extractions',
-                    type=int, required=False)
+              type=int, required=False)
 
 
 @click.option('--progress_bar', is_flag=True,
-                    help='boolean for specifying the progress bar related to the environment if True progress_bar=tqdm_notebook -> Jupyter progress_bar; if False progress_bar=tqdm. Default False ',
-                    default=True)
+              help='boolean for specifying the progress bar related to the environment if True progress_bar=tqdm_notebook -> Jupyter progress_bar; if False progress_bar=tqdm. Default False ',
+              default=True)
 
 @click.option('--save_int', is_flag=True,
-                    help='if True WoMG saves the interests vector for each node',
-                    default=False)
-@click.option('--save_infl', is_flag=True,
-                    help='if True WoMG saves the influence vector for each node',
-                    default=False)
+              help='if True WoMG saves the interests vector for each node',
+              default=False)
+@click.option('--save_infl', is_flag=True, #pylint: disable=too-many-arguments, too-many-locals
+              help='if True WoMG saves the influence vector for each node',
+              default=False)
 @click.option('--save_keyw', is_flag=True,
-                    help='if True WoMG saves the keywords in a bow format for each document',
-                    default=False)
+              help='if True WoMG saves the keywords in a bow format for each document',
+              default=False)
 @click.option('--single_activator', is_flag=True,
-                    help='if True we have at most one activator per item, else god node will activate all nodes beyond threshold',
-                    default=False)
+              help='if True we have at most one activator per item, else god node will activate all nodes beyond threshold',
+              default=False)
 
 def main_cli(graph,
              items_descr,
@@ -245,4 +250,4 @@ def main_cli(graph,
          save_keyw=save_keyw)
 
 if __name__ == '__main__':
-    main_cli()
+    main_cli() # pylint: disable=no-value-for-parameter
diff --git a/src/womg/womg/topic/lda_extended.py b/src/womg/womg/topic/lda_extended.py
@@ -1,18 +1,17 @@
-# /Topic/lda.py
-# Implementation of LDA topic-model
+'''
+/Topic/lda.py
+Implementation of LDA topic-model
+'''
 import re
 import os
 import pathlib
 import gensim
-import pickle
-from tqdm import tqdm, tqdm_notebook
 import numpy as np
-import womg_core
 from womg_core.topic.lda import LDA
-from womg_core.utils.utility_functions import count_files, read_docs, TopicsError, DocsError
+from womg_core.utils.utility_functions import count_files, read_docs
 from womg_core.utils.distributions import random_powerlaw_vec
 
-class LDA_extended(LDA):
+class LDAExtended(LDA):
     '''
     Class implementing Latent Dirichlet Allocation as topic model
     for topic distribution involved in tlt class model
@@ -53,7 +52,6 @@ def fit(self):
         3. get the items descriptions (topic distribution for each item)
         4. get the items keywords (bow list for each item)
         '''
-        #print('\n In fit method there are ', self.numb_docs, 'docs, in', self._docs_path, ' with description ', self._items_descr)
         mode = self.set_lda_mode()
         if mode == 'load':
             super().fit()
@@ -87,10 +85,7 @@ def set_lda_mode(self):
         reading : bool
             if True: it will read docs inside the given folder path or input folder
             if False: it will use lda for generating docs
-
         '''
-
-
         if self.numb_docs is None and self._docs_path is not None and self._items_descr is None:
             mode = 'get'
             path = pathlib.Path(self._docs_path)
@@ -131,7 +126,6 @@ def get_items_descript(self, path, model):
             gammas[item] = np.array([i[1] for i in item_descript])
             item += 1
         if self.numb_docs == len(gammas.keys()):
-            #print("Items' distribution over topics is stored")
             pass
         self.items_descript = gammas
 
@@ -142,11 +136,8 @@ def get_items_keyw(self, path):
         Get the items keyword in a bow format
         '''
         docs = read_docs(path)
-        #print(docs)
-        preproc_docs = [gensim.parsing.preprocessing.remove_stopwords((sent[0].lower())) for sent in docs if len(sent) !=0]
-        #print(preproc_docs)
+        preproc_docs = [gensim.parsing.preprocessing.remove_stopwords((sent[0].lower())) for sent in docs if len(sent) != 0]
         data_words = self.sent_to_words(preproc_docs, verbose=False)
-        #print(data_words)
         for item in range(self.numb_docs):
             self.items_keyw[item] = self.to_bow(data_words[item])
 
@@ -168,7 +159,8 @@ def gen_items_keyw(self, model):
                 item_keyw.append(self.dictionary[word_index[0]])
             self.items_keyw[item] = self.to_bow(item_keyw)
 
-    def get_topics_descript(self, model, mtrx_form=False):
+    @staticmethod
+    def get_topics_descript(model, mtrx_form=False):
         '''
         Getting the word distribution for each topic
 
@@ -183,8 +175,7 @@ def get_topics_descript(self, model, mtrx_form=False):
         '''
         if mtrx_form:
             return model.get_topics()
-        else:
-            return model.print_topics()
+        return model.print_topics()
 
 
     def to_bow(self, text):
@@ -201,7 +192,7 @@ def preprocess_texts(self, docs):
         '''
         Preprocessing input texts: divides docs into words, bow format
         '''
-        data = [gensim.parsing.preprocessing.remove_stopwords((sent[0].lower())) for sent in docs if len(sent) !=0]
+        data = [gensim.parsing.preprocessing.remove_stopwords((sent[0].lower())) for sent in docs if len(sent) != 0]
 
         # Remove new line characters
         data = [re.sub(r'\s+', ' ', str(sent)) for sent in data]
@@ -259,7 +250,7 @@ def sent_to_words(self, docs, verbose=True):
         return list(data_words)
 
 
-    def train_lda(self, path=None):
+    def train_lda(self):
         '''
         Pre-train lda model on a saved corpus for inferring the prior weights of
         the distributions given a number of topics, which corresponds to the
@@ -278,7 +269,6 @@ def train_lda(self, path=None):
         '''
         print('Training LDA model..')
         docs = read_docs(self._training_path)
-        #print('len docs ', len(docs))
         data_words = self.sent_to_words(docs, verbose=False)
 
         self.dictionary = gensim.corpora.Dictionary(data_words)