diff --git a/src/womg/womg/__main__.py b/src/womg/womg/__main__.py index 657db3f..473d4df 100644 --- a/src/womg/womg/__main__.py +++ b/src/womg/womg/__main__.py @@ -1,12 +1,15 @@ +''' +Main of the extended version of womg +''' import click from womg_core import womgc -from womg.topic.lda_extended import LDA_extended +from womg_core.__main__ import save from womg_core.utils.distributions import set_seed -#from womg.utils.saver import TxtSaver +from womg.topic.lda_extended import LDAExtended -def womg_main(graph=None, +def womg_main(graph=None, #pylint: disable=too-many-arguments, too-many-locals docs_path=None, items_descr=None, numb_topics=15, @@ -98,11 +101,11 @@ def womg_main(graph=None, ''' set_seed(seed) - topic_model = LDA_extended(numb_topics=numb_topics, - numb_docs=numb_docs, - docs_path=docs_path, - items_descr=items_descr, - progress_bar=progress_bar) + topic_model = LDAExtended(numb_topics=numb_topics, + numb_docs=numb_docs, + docs_path=docs_path, + items_descr=items_descr, + progress_bar=progress_bar) topic_model.fit() prop = womgc(graph=graph, numb_topics=numb_topics, @@ -113,6 +116,7 @@ def womg_main(graph=None, interests=interests, gn_strength=gn_strength, infl_strength=infl_strength, + int_mode=int_mode, seed=seed, items_descr=topic_model.items_descript, virality_exp=virality_exp, @@ -122,75 +126,76 @@ def womg_main(graph=None, progress_bar=progress_bar, path_out=path_out) prop.topic_model.items_keyw = topic_model.items_keyw + return prop @click.command() @click.option('--topics', metavar='K', default=15, - help='Number of topics in the topic model. Default 15. K self._virality_resistance * self.topic_model.viralities[item] - else: - return False + return False @@ -206,7 +206,6 @@ def update_sets(self, item, new_active_nodes): self.new_active_nodes[item] = new_active_nodes if self.new_active_nodes[item] == set(): - #print('item ', item) self._stall_count[item] += 1 removing_list = new_active_nodes.union(self.active_nodes[item]) ### needs improvement @@ -226,11 +225,11 @@ def set_sets(self): current new activated nodes set is equal to the active one ''' - for item in range(self._numb_docs): - self.active_nodes[item] = self.godNode_influence_config(item) + for item in range(self.numb_docs): + self.active_nodes[item] = self.godnode_influence_config(item) if self.active_nodes[item] == set(): self._stall_count[item] += 1 - self.inactive_nodes[item] = set(self.network_model._nx_obj.nodes()) + self.inactive_nodes[item] = set(self.network_model.nx_obj.nodes()) self.new_active_nodes[item] = self.active_nodes[item] @@ -240,12 +239,12 @@ def stop_criterior(self): Stops the run if there are not new active nodes for given time step seq ''' stall_factor = True - for item in self._stall_count.keys(): + for item, _value in self._stall_count.items(): stall_factor *= (self._stall_count[item] >= 1) return stall_factor - def godNode_influence_config(self, item): + def godnode_influence_config(self, item): ''' Returns the activated nodes for the initial configuration for a given item; the god node (connected with all nodes) influences all the others for the @@ -261,11 +260,14 @@ def godNode_influence_config(self, item): actives_config = [] max_interested = -np.inf max_v = None - for u, v in self.network_model.godNode_links: - curr_weight = self.network_model.graph[(u, v)] + if self.network_model.nx_obj.is_directed(): + god_node_edges = list(self.network_model.nx_obj.out_edges(-1)) + else: + god_node_edges = list(self.network_model.nx_obj.edges(-1)) + for u, v in god_node_edges: + curr_weight = 
self.network_model.nx_obj.get_edge_data(u, v)['weight'] z_sum = np.dot(self.topic_model.items_descript[item], curr_weight) self._thresholds_values.append((z_sum, self.topic_model.viralities[item])) - #print('z_sum: ', z_sum, ' virality: ', self._virality_resistance *self.topic_model.viralities[item]) if z_sum > self._virality_resistance * self.topic_model.viralities[item]: if self._single_activator: if max_interested < z_sum: @@ -273,7 +275,8 @@ def godNode_influence_config(self, item): max_v = v else: if all(v == 0 for v in curr_weight): - print('ATTENTION nodes: ', u, v, ' curr_weight:', curr_weight, ' z_sum:', z_sum) + print('ATTENTION nodes: ', u, v, ' curr_weight:', + curr_weight, ' z_sum:', z_sum) actives_config.append(v) if self._single_activator: @@ -309,8 +312,8 @@ def propagations(self): def propagations(self, value): self._propagations = value - - def read_class_pickle(self, model): + @staticmethod + def read_class_pickle(model): ''' Read the pickle file containing a class model instance Parameters @@ -321,7 +324,7 @@ def read_class_pickle(self, model): ------- Loaded object of the correspondent class: TLTNetworkModel or TLTTopicModel ''' - file = pathlib.Path.cwd() / str('__'+model+'_model') - with open(file, 'rb') as f: - rfile = pickle.load(f) + filename = pathlib.Path.cwd() / str('__'+model+'_model') + with open(filename, 'rb') as file: + rfile = pickle.load(file) return rfile diff --git a/src/womg_core/womg_core/network/network_model.py b/src/womg_core/womg_core/network/network_model.py index fc9ba68..a4cd486 100644 --- a/src/womg_core/womg_core/network/network_model.py +++ b/src/womg_core/womg_core/network/network_model.py @@ -1,13 +1,11 @@ -# /network/network_model.py -# Abstract class defining the Network models -import pathlib -import json -import pickle -from abc import ABC -import networkx as nx +''' +/network/network_model.py +Abstract class defining the Network models +''' +import abc -class NetworkModel(ABC): +class NetworkModel(abc.ABC): ''' Abstract class for network models @@ -51,16 +49,21 @@ def gformat(nx_obj, directed=False): key <- tuple which describes the link (node_1, node_2) value <- int weight of the link ''' - G = {} + graph = {} print('Formatting graph..') if directed: - for edge in (nx_obj.edges()): - G[(edge[0],edge[1])] = 1 + for edge in nx_obj.edges(): + graph[(edge[0], edge[1])] = 1 else: - #print(nx_obj.edges()) - for edge in (nx_obj.edges()): - #print(edge) - G[(edge[0],edge[1])] = 1 - G[(edge[1],edge[0])] = 1 + for edge in nx_obj.edges(): + graph[(edge[0], edge[1])] = 1 + graph[(edge[1], edge[0])] = 1 - return G + return graph + + + @abc.abstractmethod + def network_setup(self, int_mode): + '''Setting up the network environment for diffusion + ''' + pass diff --git a/src/womg_core/womg_core/network/tlt_network_model.py b/src/womg_core/womg_core/network/tlt_network_model.py index b6b8712..92bfb0a 100644 --- a/src/womg_core/womg_core/network/tlt_network_model.py +++ b/src/womg_core/womg_core/network/tlt_network_model.py @@ -1,8 +1,11 @@ -# /network/tlt_network_model.py -# Abstract class defining the Network models +'''/network/tlt_network_model.py +Abstract class defining the Network models +''' + import abc from womg_core.network.network_model import NetworkModel + class TLTNetworkModel(NetworkModel): ''' Abstract class for network models involved in the tlt diffusion model class @@ -19,8 +22,13 @@ class TLTNetworkModel(NetworkModel): @abc.abstractmethod def graph_weights_vecs_generation(self): - ''' - Generates numb_topics dim vectors for each 
link and put them as + '''Generates numb_topics dim vectors for each link and put them as value of the graph dict attribute of the class ''' pass + + @abc.abstractmethod + def set_godnode_links(self, weight, nodes): + '''Creates the god node links for initial configurations + ''' + pass diff --git a/src/womg_core/womg_core/network/tn.py b/src/womg_core/womg_core/network/tn.py index 8f48204..83069a4 100644 --- a/src/womg_core/womg_core/network/tn.py +++ b/src/womg_core/womg_core/network/tn.py @@ -1,21 +1,24 @@ -# /network/tn.py -# Implementation of the Topic-aware Network model +'''/network/tn.py +Implementation of the Topic-aware Network model +''' + import os -import womg_core import random import pathlib import collections import networkx as nx import numpy as np from scipy import sparse -from tqdm import tqdm, tqdm_notebook from scipy.spatial import distance +from tqdm import tqdm, tqdm_notebook from sklearn.decomposition import NMF +import womg_core from womg_core.network.tlt_network_model import TLTNetworkModel from womg_core.utils.utility_functions import read_edgelist -from womg_core.utils.distributions import random_powerlaw_vec +DEFAULT_GRAPH = pathlib.Path(os.path.abspath(womg_core.__file__).replace('/womg/__init__.py', + ''))/ "womgdata" / "graph" / "lesmiserables" / "lesmiserables_edgelist.txt" class TN(TLTNetworkModel): ''' @@ -33,15 +36,8 @@ class TN(TLTNetworkModel): (of _numb_topics dimension) in the format: key <- [node id] value <- _numb_topics dimension array in numpy format - - _nx_obj : NetworkX object + - nx_obj : NetworkX object networkx instance of the input network - - godNode_links : dict - dictionary containing all the links of the god node; - god node is out connected - to all the nodes but does not have in connections; - god node index (id) is -1; format will be: - key <- (-1, node id) [all int] - value <- link weight [int] - _numb_topics : int dimension of the interests and influence vectors - _fast : bool @@ -50,8 +46,6 @@ class TN(TLTNetworkModel): Methods ------- - - set_graph() - - set_godNode_links() - set_interests() - set_influence() - node2interests(): generates realistic latent interests of the nodes starting @@ -68,43 +62,43 @@ class TN(TLTNetworkModel): ''' - def __init__(self, numb_topics, homophily, + def __init__(self, numb_topics, #pylint: disable=too-many-arguments, + homophily, weighted, directed, graph, interests, gn_strength, infl_strength, - progress_bar, - seed, + progress_bar, + seed, ): super().__init__() self._graph = graph self._weighted = weighted - self._directed = directed + self.directed = directed self.users_interests = {} self.users_influence = {} self._numb_topics = numb_topics self._homophily = homophily self._interests = interests - self.godNode_links = {} assert gn_strength is None or np.isfinite(gn_strength) - self._godNode_strength = gn_strength + self._godnode_strength = gn_strength self._infl_strength = infl_strength self._rand = 16 - 15.875 * self._homophily if progress_bar: self._progress_bar = tqdm_notebook else: self._progress_bar = tqdm + self._verbose = False self._seed = seed + self.nx_obj = None + self.mapping = None def network_setup(self, int_mode): ''' - - Sets the graph atribute using set_graph() method - - Sets the info attribute using set_info() method - - Sets the godNode_links attribute using set_godNode_links() method - Sets the interests vectors using set_interests() method - Sets the influence vectors using set_influence() mehtod - Sets the new graph weights using update_weights() method @@ -113,22 
+107,26 @@ def network_setup(self, int_mode): ----- See each method docstring for details ''' - #print('int', int_mode) if isinstance(self._graph, nx.classes.graph.Graph): - self._nx_obj = self._graph - self._directed = self._nx_obj.is_directed() - self._weighted = nx.is_weighted(self._nx_obj) - self.mapping = None - elif self._graph == None: - print('No graph path provided \n DEMO Mode: generating cascades in les miserables network') - self._graph = pathlib.Path(os.path.abspath(womg_core.__file__).replace('/womg/__init__.py', '')) / "womgdata" / "graph" / "lesmiserables" / "lesmiserables_edgelist.txt" - self._nx_obj, self.mapping = read_edgelist(self,path=self._graph, weighted=False, directed=False) + self.nx_obj = self._graph.copy() + self.directed = self.nx_obj.is_directed() + if not self.directed: + self.nx_obj = self.nx_obj.to_directed() + self._weighted = nx.is_weighted(self.nx_obj) + elif self._graph is None: + print('No graph path provided \n', + 'DEMO Mode: generating cascades in les miserables network') + self._graph = DEFAULT_GRAPH + self.nx_obj, self.mapping = read_edgelist(path=self._graph, + weighted=False, + directed=False) else: self._graph = pathlib.Path(self._graph) - self._nx_obj, self.mapping = read_edgelist(self, path=self._graph, weighted=self._weighted, directed=self._directed) - self.set_graph() - if self._godNode_strength is not None: - self.set_godNode_links() + self.nx_obj, self.mapping = read_edgelist(path=self._graph, + weighted=self._weighted, + directed=self.directed) + if self._godnode_strength is not None: + self.set_godnode_links() self.set_interests(int_mode) ##### check inf ########################################## for node, interests in self.users_interests.items(): @@ -136,44 +134,15 @@ def network_setup(self, int_mode): print('ATTENTION node: ', node, ' interests: ', interests) ########################################################### self.set_influence() - #print('updating weights') self.graph_weights_vecs_generation() - #print('Macroscopic homophily level: ', self.homophily(), ' with H=', self._homophily) - - - def set_graph(self): - ''' - Sets the graph attribute formatting the networkx instance with gformat() - method of the superclass - ''' - if isinstance(self._nx_obj, nx.classes.graph.Graph): - self.graph = self.gformat(self._nx_obj, directed=self._directed) - self.set_info() - else: - print('Not a networkx readable object') - - - def set_info(self): - ''' - Sets graph info dictionary attribute using networkx graph-instance description - ''' - infos = nx.info(self._nx_obj) + '\nDirected: '+str(nx.is_directed(self._nx_obj)) - infos = infos.split() - self.info['type'] = infos[2] - self.info['numb_nodes'] = infos[6] - self._numb_nodes = int(infos[6]) - self.info['numb_edges'] = infos[10] - if infos[2] == 'MultiDiGraph': - self.info['aver_in_degree'] = infos[14] - self.info['aver_out_degree'] = infos[18] - self.info['directed'] = infos[20] - else: - self.info['aver_degree'] = infos[13] - self.info['directed'] = infos[15] + if self._verbose: + print('Macroscopic homophily level: ', + self.homophily(), + ' with H=', self._homophily) - def set_godNode_links(self, weight=1, nodes=None): + def set_godnode_links(self, weight=1, nodes=None): ''' Sets the godNode's links weights in the graph @@ -189,21 +158,22 @@ def set_godNode_links(self, weight=1, nodes=None): (Defalut None) Example: - tn_instance.set_godNode_links() : + tn_instance.set_godnode_links() : all the links weights (from godNode to each node) are set to 1 ''' if nodes is None: 
#print('Setting god node') - for node in self._progress_bar(self._nx_obj.nodes()): - self.godNode_links[(-1, node)] = np.abs(np.random.randn()) - self.graph.update(self.godNode_links) + god_node_links = [] + for node in self._progress_bar(self.nx_obj.nodes()): + rand_num = np.abs(np.random.randn()) + god_node_links.append((-1, node, rand_num)) + self.nx_obj.add_weighted_edges_from(god_node_links) if isinstance(nodes, int): - self.godNode_links[(-1, nodes)] = weight - self.graph.update(self.godNode_links) + self.nx_obj.add_weighted_edges_from([(-1, nodes, weight)]) if isinstance(nodes, collections.Iterable): for node in self._progress_bar(nodes): - self.godNode_links[(-1, node)] = weight - self.graph.update(self.godNode_links) + self.nx_obj.add_weighted_edges_from([(-1, node, weight)]) + def set_interests(self, int_mode): ''' @@ -223,7 +193,7 @@ def set_interests(self, int_mode): if int_mode == 'nmf': self.nmf_interests() - if int_mode == 'load' or self._interests != None: + if int_mode == 'load' or self._interests is not None: if isinstance(self._interests, dict): print('Loading interests') else: @@ -242,34 +212,24 @@ def set_influence(self): ''' if self._infl_strength is None: - for node in self._nx_obj.nodes(): + for node in self.nx_obj.nodes(): self.users_influence[node] = [0. for _ in range(self._numb_topics)] else: - fitness = 1 + np.random.pareto(a=self._infl_strength, size=self._nx_obj.number_of_nodes()) + fitness = 1 + np.random.pareto(a=self._infl_strength, + size=self.nx_obj.number_of_nodes()) if np.any(np.isinf(fitness)): print('Wrong parameter infl strenght') fitness = np.minimum(fitness, 10E20) - for node in self._nx_obj.nodes(): + for node in self.nx_obj.nodes(): self.users_influence[node] = fitness[node] * self.users_interests[node] - if self._godNode_strength is not None: + if self._godnode_strength is not None: self.users_influence[-1] = np.ones(self._numb_topics) for node, value in self.users_influence.items(): if not all(np.isfinite(value)): print('ATTENTION node: ', node, ' influences: ', value) - ''' - random_powerlaw_vec(gamma=self._rho, self._numb_topics) - # rescaling infleunce importance - norm_avg = 0. - for node in self._nx_obj.nodes(): - norm_avg += np.linalg.norm(self.users_interests[node])/self._numb_nodes - scale_fact = self._infl_strength*norm_avg - # setting influence vec - for node in self._nx_obj.nodes(): - influence_vec = self.node2influence(scale_fact) - self.users_influence[node] = influence_vec - ''' + def random_interests(self, norm=True): ''' @@ -283,38 +243,40 @@ def random_interests(self, norm=True): ''' if norm: - for node in self._nx_obj.nodes(): + for node in self.nx_obj.nodes(): self.users_interests[node] = np.random.rand(self._numb_topics) - - def overlap_generator(self, A): + @staticmethod + def overlap_generator(A): """ Generate the second order overlap from a sparse adjacency matrix A. 
""" aat = A.dot(A.T) d = aat.diagonal() ndiag = sparse.diags(d, 0) - n = np.sqrt(ndiag.dot(aat>0).dot(ndiag)) + n = np.sqrt(ndiag.dot(aat > 0).dot(ndiag)) n.data[:] = 1./n.data[:] return aat.multiply(n) #- sparse.identity(aat.shape[0]) def nmf_interests(self, eta=64.): + ''' + Generates nodes interests using NMF + ''' #beta = self._homophily - A = nx.adjacency_matrix(self._nx_obj) + A = nx.adjacency_matrix(self.nx_obj) S_0 = self.overlap_generator(A) - R = np.random.rand(self._nx_obj.number_of_nodes(), self._nx_obj.number_of_nodes()) + R = np.random.rand(self.nx_obj.number_of_nodes(), self.nx_obj.number_of_nodes()) #S = beta*(S_0 + A + sparse.identity(A.shape[0])) + (1-beta)*R eta = 64. S = eta*S_0 + A + self._rand*R model = NMF(n_components=self._numb_topics, init='nndsvd', random_state=self._seed) W = model.fit_transform(S) - #print('Doing nmf with random coeff ', self._rand, ' and homophily ', self._homophily) if not np.all(np.isfinite(W)): print('ATTENTION W contains infinites') - for node in self._nx_obj.nodes(): + for node in self.nx_obj.nodes(): self.users_interests[node] = W[node] def node2influence(self, scale_fact, alpha_max=10): @@ -345,16 +307,17 @@ def graph_weights_vecs_generation(self): - god_node_weight : int scalar value for each entry of the weight vector involving the godNode ''' - for link in self.graph.keys(): + for u, v in list(self.nx_obj.edges()): # god node - if link[0] == -1: - out_influence_vec = self.users_influence[-1] - in_interest_vec = self.users_interests[link[1]] - self.set_link_weight(link, in_interest_vec + self._godNode_strength * out_influence_vec) + if u == -1: + out_influence_vec = self.users_influence[u] + in_interest_vec = self.users_interests[v] + self.set_link_weight((u, v), + in_interest_vec + self._godnode_strength * out_influence_vec) else: - out_influence_vec = self.users_influence[link[0]] - in_interest_vec = self.users_interests[link[1]] - self.set_link_weight(link, out_influence_vec + in_interest_vec) + out_influence_vec = self.users_influence[u] + in_interest_vec = self.users_interests[v] + self.set_link_weight((u, v), out_influence_vec + in_interest_vec) def set_link_weight(self, link, new_weight): @@ -369,50 +332,60 @@ def set_link_weight(self, link, new_weight): numb_topic dimension array that is going to be the new attribute of the link ''' + u, v = link if not all(np.isfinite(new_weight)): print('ATTENTION link ', link, ' weight: ', new_weight) - self.graph[link] = new_weight + self.nx_obj[u][v]['weight'] = new_weight - def load_interests(self): + def load_interests(self, sep=','): ''' - Loads interests vector from path + Loads interests vector from path. + Format: "node int1,int2,int3.." 
''' if isinstance(self._interests, dict): self.users_interests = self._interests else: - with open(self._interests, 'r') as f: - for _ in f.readlines(): - node, interests = _.split(' ', 1)[0], _.split(' ', 1)[1] - self.users_interests[int(node)] = np.array(eval(interests[:-1])) - - + with open(self._interests, 'r') as file: + for line in file.readlines(): + node, interests = line.split(' ')[0], line.split(' ')[1] + node_interests = [float(entry) for entry in interests[:-1].split(sep)] + self.users_interests[int(node)] = np.array(node_interests) ################# Analysis def sim_in(self): + '''Returns average interests similarity between connected nodes + ''' sims = [] - for i in self._nx_obj.nodes: - for j in list(self._nx_obj.neighbors(i)): + for i in self.nx_obj.nodes: + for j in list(self.nx_obj.neighbors(i)): sims.append(1 - distance.cosine(self.users_interests[i], self.users_interests[j])) return np.mean(sims) def select_notedge(self): - v1 = np.random.choice(self._nx_obj.nodes()) - v2 = np.random.choice(self._nx_obj.nodes()) + '''Returns tuple of not connected nodes + ''' + node1 = np.random.choice(self.nx_obj.nodes()) + node2 = np.random.choice(self.nx_obj.nodes()) - while (v1,v2) in self._nx_obj.edges or v1==v2: - v1 = np.random.choice(self._nx_obj.nodes()) - v2 = np.random.choice(self._nx_obj.nodes()) - return v1, v2 + while (node1, node2) in self.nx_obj.edges or node1 == node2: + node1 = np.random.choice(self.nx_obj.nodes()) + node2 = np.random.choice(self.nx_obj.nodes()) + return node1, node2 def sim_out(self, samples): + '''Returns average interests similarity between of not connected nodes + samples times + ''' sims_out = [] - for c in range(samples): + for _ in range(samples): i, j = self.select_notedge() sims_out.append(1 - distance.cosine(self.users_interests[i], self.users_interests[j])) return np.mean(sims_out) def homophily(self, numb_not_edges_tested=10000): + '''Returns the cosine similarity homophily ratio measure + ''' return self.sim_in() / self.sim_out(numb_not_edges_tested) diff --git a/src/womg_core/womg_core/propagation.py b/src/womg_core/womg_core/propagation.py index 4975089..064da46 100644 --- a/src/womg_core/womg_core/propagation.py +++ b/src/womg_core/womg_core/propagation.py @@ -1,3 +1,6 @@ +''' +Class containing all womg outputs +''' import numpy as np class Propagation: @@ -10,6 +13,7 @@ def __init__(self, network_model, topic_model, diffusion_model): self._docs = None self._topic_distributions = None self._interests = None + self._topics_descriptions = None @property @@ -26,6 +30,7 @@ def propagations(self, value): @property def docs(self): + print(self.topic_model.items_keyw) self.docs = [v for k, v in self.topic_model.items_keyw.items()] return self._docs @@ -39,6 +44,7 @@ def topic_distributions(self): ''' KxM matrix ''' + print(self.topic_model.items_descript) vectors = [v for v in self.topic_model.items_descript.values()] self.topic_distributions = np.column_stack(vectors) return self._topic_distributions @@ -59,3 +65,16 @@ def interests(self): @interests.setter def interests(self, value): self._interests = value + + + @property + def topics_descriptions(self): + ''' + list of linear combination of words + ''' + self._topics_descriptions = self.topic_model.topics_descript + return self._topics_descriptions + + @topics_descriptions.setter + def topics_descriptions(self, value): + self._topics_descriptions = value diff --git a/src/womg_core/womg_core/topic/lda.py b/src/womg_core/womg_core/topic/lda.py index 5be16ed..5089659 100644 --- 
a/src/womg_core/womg_core/topic/lda.py +++ b/src/womg_core/womg_core/topic/lda.py @@ -1,15 +1,15 @@ -# /Topic/lda.py -# Implementation of LDA topic-model -import re +'''/Topic/lda.py +Implementation of LDA topic-model +''' import os -import womg_core import pathlib -from tqdm import tqdm, tqdm_notebook import numpy as np +from tqdm import tqdm, tqdm_notebook +import womg_core from womg_core.topic.tlt_topic_model import TLTTopicModel -from womg_core.utils.utility_functions import count_files, read_docs, TopicsError, DocsError from womg_core.utils.distributions import random_powerlaw_vec + class LDA(TLTTopicModel): ''' Class implementing Latent Dirichlet Allocation as topic model @@ -28,9 +28,9 @@ class LDA(TLTTopicModel): sets the lda mode: reading or generating mode ''' def __init__(self, numb_topics, - numb_docs, - items_descr, - progress_bar): + numb_docs, + items_descr, + progress_bar): super().__init__() self.numb_topics = numb_topics self.numb_docs = numb_docs @@ -39,10 +39,10 @@ def __init__(self, numb_topics, self._items_descr = items_descr self.items_keyw = {} self.dictionary = [] - #self.main_data_path = pathlib.Path(os.path.abspath(womg.__file__).replace('/womg/__init__.py', ''))/'womgdata' self.main_data_path = pathlib.Path(os.path.abspath(womg_core.__file__).replace('/womg_core/__init__.py', ''))/'womgdata' self._training_path = self.main_data_path /'docs'/'training_corpus_ap' - #print(self._training_path) + self._topics_descr_path = self.main_data_path / 'topic_model' / 'Topics_descript.txt' + self.topics_descript = self.load_topics_descr(self._topics_descr_path) if progress_bar: self._progress_bar = tqdm_notebook else: @@ -68,7 +68,14 @@ def fit(self): if mode == 'gen': self.gen_items_descript() - + @staticmethod + def load_topics_descr(path): + ''' + Loads a topic description file (for each topic word distribution) + ''' + with open(path, 'r') as file: + topics_descript = file.readlines() + return str(topics_descript[0]) def set_lda_mode(self): ''' @@ -86,32 +93,34 @@ def set_lda_mode(self): reading : bool if True: it will read docs inside the given folder path or input folder if False: it will use lda for generating docs - ''' # setting mode - if self.numb_docs == None and self._docs_path == None: + if self.numb_docs is None and self._docs_path is None: mode = 'load' - if self._items_descr == None: + if self._items_descr is None: # pre-trained topic model with 15 topics and 50 docs self._items_descr = self.main_data_path / 'topic_model' / 'Items_descript.txt' - self._topics_descr_path = self.main_data_path / 'topic_model' / 'Topics_descript.txt' - self.topics_descript = self.load_topics_descr(self._topics_descr_path) else: pass if isinstance(self._items_descr, dict): print('Loading items descriptions (topic distrib for each doc)') else: - print('Loading items descriptions (topic distrib for each doc) in: ', self._items_descr) + print('Loading items descriptions (topic distrib for each doc) in: ', + self._items_descr) + if self.numb_docs is not None and self._docs_path is None and isinstance(self._items_descr, dict): + mode = 'load' # when womg extended has been executed in gen mode - elif self.numb_docs == None and self._docs_path != None and self._items_descr == None: + elif self.numb_docs is None and self._docs_path is not None and self._items_descr is None: print('Please install the womg extended version') - elif self.numb_docs != None and self._docs_path != None and self._items_descr == None: + elif self.numb_docs is not None and self._docs_path is not None and 
self._items_descr is None: print('Please install the womg extended version') - elif self.numb_docs != None and self._docs_path == None and self._items_descr == None: + elif self.numb_docs is not None and self._docs_path is None and self._items_descr is None: mode = 'gen' - print('Setting LDA in generative mode: ', self.numb_docs, ' documents, with ', self.numb_topics, ' topics.') + print('Setting LDA in generative mode: ', + self.numb_docs, ' documents, with ', + self.numb_topics, ' topics.') print('Training the LDA model ..') return mode @@ -141,7 +150,8 @@ def gen_items_descript(self): Generates the topic distribution for each item and stores it in the items_descript attribute ''' - alpha = [1.0 / self.numb_topics for i in range(self.numb_topics)] + print('generating items descript') + alpha = [1.0 / self.numb_topics for i in range(self.numb_topics)] gammas = {} for item in range(self.numb_docs): gammas[item] = np.random.dirichlet(alpha) @@ -162,24 +172,17 @@ def load_items_descr(self, path): tuple of items_descript loaded from path and numb_docs ''' items_descr_dict = {} - with open(path, 'r') as f: + with open(path, 'r') as file: numb_docs = 0 - for line in f: - line = line.replace(',','').replace(']','').replace('[','') + for line in file: + line = line.replace(',', '').replace(']', '').replace('[', '') values = line.split() node = int(values[0]) interests_vec = [float(i) for i in values[1:]] if self.numb_topics != len(interests_vec): - raise TopicsError("Please write the correct number of topics as input or in case you give the items_descr_path you can omit it") + print("Please write the correct number of topics", + "as input or in case you give the items_descr_path", + " you can omit it") items_descr_dict[node] = interests_vec numb_docs += 1 return items_descr_dict, numb_docs - - def load_topics_descr(self, path): - ''' - Loads a topic description file (for each topic word distribution) - ''' - with open(path, 'r') as f: - topics_descript = f.readlines() - #topics_descript[0].replace("\\",'').replace(']','') - return str(topics_descript[0]) diff --git a/src/womg_core/womg_core/topic/tlt_topic_model.py b/src/womg_core/womg_core/topic/tlt_topic_model.py index bf0998f..ef6ef11 100644 --- a/src/womg_core/womg_core/topic/tlt_topic_model.py +++ b/src/womg_core/womg_core/topic/tlt_topic_model.py @@ -1,5 +1,6 @@ -# /Topic/TopicModel.py -# Abstract class defining Topic model +'''/Topic/TopicModel.py +Abstract class defining Topic model +''' import abc from womg_core.topic.topic_model import TopicModel @@ -15,8 +16,11 @@ class TLTTopicModel(TopicModel): Methods ------- - topic_distrib_extraction : absract - inferes topic distribution for each document + fit : abstract + inferes/generates topic distribution for each document + + set_docs_viralities : abstract + sets the documents viralities ''' def __init__(self): @@ -25,7 +29,11 @@ def __init__(self): @abc.abstractmethod def fit(self): - ''' - Methods for infering topic distributions of the given documents + '''Methods for infering/generating topic distributions of the given documents ''' pass + + @abc.abstractmethod + def set_docs_viralities(self, virality_exp): + '''Method for setting the documents viralities + ''' diff --git a/src/womg_core/womg_core/topic/topic_model.py b/src/womg_core/womg_core/topic/topic_model.py index a560982..01fe100 100644 --- a/src/womg_core/womg_core/topic/topic_model.py +++ b/src/womg_core/womg_core/topic/topic_model.py @@ -1,5 +1,6 @@ -# /topic/topic_model.py -# Abstract class defining Topic model 
+'''/topic/topic_model.py +Abstract class defining Topic model +''' import abc @@ -30,4 +31,16 @@ class TopicModel(abc.ABC): def __init__(self): self.items_descript = {} - self.topics_descript = {} + self.dictionary = [] + + @abc.abstractmethod + def load_items_descr(self, path): + '''Method for loading the items topic distributions + ''' + pass + + @abc.abstractmethod + def gen_items_descript(self): + '''Method for generating the items topic distributions + ''' + pass diff --git a/src/womg_core/womg_core/utils/distributions.py b/src/womg_core/womg_core/utils/distributions.py index c13ffb1..0f18b7e 100644 --- a/src/womg_core/womg_core/utils/distributions.py +++ b/src/womg_core/womg_core/utils/distributions.py @@ -1,4 +1,6 @@ -# Class for random distributions definition +''' +Class for random distributions definition +''' import random import numpy as np @@ -6,7 +8,7 @@ def set_seed(seed): ''' Sets the given seed for each distribution extraction ''' - if seed != None: + if seed is not None: random.seed(seed) np.random.seed(seed) diff --git a/src/womg_core/womg_core/utils/saver.py b/src/womg_core/womg_core/utils/saver.py index fe80ad4..ce9c786 100644 --- a/src/womg_core/womg_core/utils/saver.py +++ b/src/womg_core/womg_core/utils/saver.py @@ -1,3 +1,6 @@ +''' +Class for saving attributes of the network, topic and propagation models' classes +''' import abc import pathlib @@ -9,19 +12,12 @@ def __init__(self): self._path = '' @abc.abstractmethod - def save_diffusion(self, diffusion_model): + def save_propagation(self, propagation, step): ''' Abstract method for saving a diffusion model at each step of simulation ''' return - @abc.abstractmethod - def save_model(self, model): - ''' - Abstract method for saving a network model or topic model - ''' - return - @staticmethod def make_output_directory(path): ''' @@ -59,14 +55,14 @@ def make_filename(name, output_dir, new_file=True): sim_numb = 0 while pathlib.Path(filename).exists(): - sim_numb+=1 + sim_numb += 1 filename = output_dir / str(name + str(sim_numb) + ".txt") if new_file: return filename - else: - sim_numb-=1 - filename = output_dir / str(name + str(sim_numb) + ".txt") - return filename + + sim_numb -= 1 + filename = output_dir / str(name + str(sim_numb) + ".txt") + return filename class TxtSaver(Saver): @@ -85,13 +81,13 @@ def save_users_interests(self, network_model): output_dir = self.make_output_directory(self._path) filename = self.make_filename("Users_interests", output_dir) - with open(filename, "w") as f: + with open(filename, "w") as file: for node in network_model.users_influence.keys(): if node != -1: #god node vec = '' for entry in list(network_model.users_interests[node]): vec += (str(entry)+', ') - f.write(str(node) + ';' +vec+ '\n') + file.write(str(node) + ';' +vec+ '\n') def save_users_influence(self, network_model): @@ -102,10 +98,11 @@ def save_users_influence(self, network_model): output_dir = self.make_output_directory(self._path) filename = self.make_filename("Users_influence", output_dir) - with open(filename, "w") as f: + with open(filename, "w") as file: for node in network_model.users_influence.keys(): if node != -1: #god node - f.write(str(node) + ';' +str(list(network_model.users_influence[node])) + '\n') + line = str(node) + ';' +str(list(network_model.users_influence[node])) + '\n' + file.write(line) def save_mapping(self, network_model): @@ -116,9 +113,9 @@ def save_mapping(self, network_model): output_dir = self.make_output_directory(self._path) filename = self.make_filename("nodes_mapping", output_dir) - 
if network_model.mapping != None: - with open(filename, "w") as f: - f.write(str(network_model.mapping)) + if network_model.mapping is not None: + with open(filename, "w") as file: + file.write(str(network_model.mapping)) else: pass @@ -131,9 +128,10 @@ def save_items_descript(self, topic_model): output_dir = self.make_output_directory(self._path) filename = self.make_filename("Items_descript", output_dir) - with open(filename, "w") as f: + with open(filename, "w") as file: for item in topic_model.items_descript.keys(): - f.write(str(item) + ' ' +str(list(topic_model.items_descript[item])) + '\n') + line = str(item) + ' ' +str(list(topic_model.items_descript[item])) + '\n' + file.write(line) def save_topics_descript(self, topic_model): ''' @@ -144,8 +142,8 @@ def save_topics_descript(self, topic_model): filename = self.make_filename("Topics_descript", output_dir) if topic_model.topics_descript != {}: - with open(filename, "w") as f: - f.write(str(topic_model.topics_descript)) + with open(filename, "w") as file: + file.write(str(topic_model.topics_descript)) else: pass @@ -158,9 +156,9 @@ def save_items_keyw(self, topic_model): output_dir = self.make_output_directory(self._path) filename = self.make_filename("Items_keyw", output_dir) - with open(filename, "w") as f: + with open(filename, "w") as file: for item in topic_model.items_keyw.keys(): - f.write(str(item) + ' ' + str(topic_model.items_keyw[item]) + '\n') + file.write(str(item) + ' ' + str(topic_model.items_keyw[item]) + '\n') def save_propagation(self, propagation, step=0): ''' @@ -172,6 +170,18 @@ def save_propagation(self, propagation, step=0): else: filename = self.make_filename("Propagations", output_dir, False) - with open(filename, 'a') as f: - for node in range(len(propagation)): - f.write(str(step) +' '+ str(propagation[node])) + with open(filename, 'a') as file: + for _node, prop in enumerate(propagation): + file.write(str(step) +' '+ str(prop)) + + def save_propagations(self, diffusion_model): + ''' + Concrete method for saving the cascades files in a txt format + ''' + output_dir = self.make_output_directory(self._path) + filename = self.make_filename("Propagations", output_dir, True) + + with open(filename, 'w') as file: + for item, item_activations in enumerate(diffusion_model.all_propagations): + for time, node in item_activations: + file.write(str(item) + ' ' + str(time) +' '+ str(node) + '\n') diff --git a/src/womg_core/womg_core/utils/utility_functions.py b/src/womg_core/womg_core/utils/utility_functions.py index 025a142..c528848 100644 --- a/src/womg_core/womg_core/utils/utility_functions.py +++ b/src/womg_core/womg_core/utils/utility_functions.py @@ -1,67 +1,35 @@ -# Utility functions +'''Utility functions +''' import pathlib import pickle import networkx as nx - -class TopicsError(Exception): - pass - -class DocsError(Exception): - pass - -def read_edgelist(self, path, weighted, directed): - ''' - Reference implementation of node2vec. - - Author: Aditya Grover - - For more details, refer to the paper: - node2vec: Scalable Feature Learning for Networks - Aditya Grover and Jure Leskovec - Knowledge Discovery and Data Mining (KDD), 2016 - - Reads the input network in networkx. [node2vec implementation] +def read_edgelist(path, weighted, directed): + '''Reads the input network in networkx. 
''' if weighted: - #G = nx.read_edgelist(path, nodetype=int, data=(('weight',float),), create_using=nx.DiGraph()) - G = nx.read_edgelist(path, nodetype=int, data=(('weight',float),), create_using=nx.DiGraph()) + graph = nx.read_edgelist(path, nodetype=int, + data=(('weight', float),), + create_using=nx.DiGraph()) else: - G = nx.read_edgelist(path, nodetype=int, create_using=nx.DiGraph()) - for edge in G.edges(): - G[edge[0]][edge[1]]['weight'] = 1 + graph = nx.read_edgelist(path, nodetype=int, create_using=nx.DiGraph()) + for edge in graph.edges(): + graph[edge[0]][edge[1]]['weight'] = 1 if not directed: - G = G.to_undirected() + graph = graph.to_directed() # mapping labels mapping = {} identity_map = 0 - for new_label, old_label in enumerate(sorted(G.nodes())): + for new_label, old_label in enumerate(sorted(graph.nodes())): if new_label == old_label: identity_map += 1 mapping[old_label] = new_label - if identity_map == G.number_of_nodes(): - return G, None - else: - return nx.relabel_nodes(G, mapping), mapping - -''' -def def_numb_topics(numb_topics, numb_docs, docs_path): - # - Setting the numb_topics equal to the one given. - In case no documents are given, lda will be set in generative mode - and this class has to generate interets vectors of same dimension as - the topic distributions of docs - # - if ((numb_docs == None) and (docs_path == None)): - return 15 - elif numb_docs: - return 15 - else: - return numb_topics -''' + if identity_map == graph.number_of_nodes(): + return graph, None + return nx.relabel_nodes(graph, mapping), mapping def cleaning(): @@ -101,8 +69,8 @@ def read_graph(file_path): ''' Reads graph from a path ''' - with open(file_path, 'rb') as f: - rfile = pickle.load(f) + with open(file_path, 'rb') as file: + rfile = pickle.load(file) return rfile def read_docs(path, verbose=False): @@ -125,8 +93,8 @@ def read_docs(path, verbose=False): print(onlyfiles) for file in onlyfiles: f_path = pathlib.Path(path) / str(file) - with open(f_path, 'rb') as f: - doc_list = [j for j in f] + with open(f_path, 'rb') as file: + doc_list = [j for j in file] if verbose: print(doc_list) docs.append(doc_list) @@ -140,6 +108,6 @@ def find_numb_nodes(graph): maxx = 0 for key in graph.keys(): for i in range(2): - if key[i]>maxx: + if key[i] > maxx: maxx = key[i] return maxx
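
The edge weights and the activation test used in the diffusion step reduce to a simple threshold rule: every link (u, v) carries the vector users_influence[u] + users_interests[v] (god-node links use interests[v] + gn_strength * influence[-1]), and v activates on an item when the dot product of the item's topic distribution with that vector exceeds virality_resistance times the item's virality. A minimal sketch of that rule with made-up three-topic numbers (none of the values below come from the package):

import numpy as np

numb_topics = 3
influence_u = np.array([0.2, 0.0, 0.5])   # hypothetical influence vector of node u
interests_v = np.array([0.1, 0.7, 0.3])   # hypothetical interests vector of node v
edge_weight = influence_u + interests_v   # as in graph_weights_vecs_generation

item_topics = np.array([0.6, 0.3, 0.1])   # topic distribution of one item
virality = 0.4
virality_resistance = 1.0

z_sum = np.dot(item_topics, edge_weight)  # same score computed in godnode_influence_config
print(z_sum, z_sum > virality_resistance * virality)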
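
When interests are generated (int_mode 'nmf'), TN.nmf_interests factorises a similarity matrix S = eta*S_0 + A + rand*R, where S_0 is the second-order neighbourhood overlap of the adjacency matrix A, R is uniform noise and rand = 16 - 15.875*homophily; the rows of the NMF factor W become the per-node interest vectors. A rough sketch of the same idea on a toy graph (the karate-club graph and the topic count are only illustrative, eta = 64 and the rand formula follow nmf_interests, and the sparse overlap_generator is replaced by a dense equivalent):

import networkx as nx
import numpy as np
from sklearn.decomposition import NMF

numb_topics = 5
homophily = 0.8
graph = nx.karate_club_graph()                  # stand-in for the input network

A = nx.to_numpy_array(graph)                    # adjacency matrix
aat = A @ A.T
deg = np.sqrt(np.outer(aat.diagonal(), aat.diagonal()))
S0 = np.where(aat > 0, aat / np.maximum(deg, 1e-12), 0.0)   # second-order overlap

rand_coeff = 16 - 15.875 * homophily
R = np.random.rand(len(graph), len(graph))
S = 64.0 * S0 + A + rand_coeff * R              # eta = 64 as in nmf_interests

W = NMF(n_components=numb_topics, init='nndsvd').fit_transform(S)
users_interests = {node: W[node] for node in graph.nodes()}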
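
TN.load_interests reads pre-computed interests from a text file with one node per line in the form "node int1,int2,int3..". A self-contained sketch of writing and reading that layout (file name and numbers are made up):

import numpy as np

with open('users_interests_example.txt', 'w') as file:
    file.write('0 0.1,0.2,0.7\n')
    file.write('1 0.3,0.3,0.4\n')

users_interests = {}
with open('users_interests_example.txt', 'r') as file:
    for line in file:
        node, interests = line.split(' ')[0], line.split(' ')[1]
        values = [float(entry) for entry in interests[:-1].split(',')]
        users_interests[int(node)] = np.array(values)
print(users_interests)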
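
TxtSaver.save_propagations writes every activation on its own line as "item time node". A small sketch that parses that layout back into per-item cascades (the file written here is hypothetical):

from collections import defaultdict

with open('Propagations_example.txt', 'w') as file:
    file.write('0 0 3\n0 1 7\n1 0 3\n')         # item time node

cascades = defaultdict(list)                    # item id -> [(time, node), ...]
with open('Propagations_example.txt', 'r') as file:
    for line in file:
        item, time, node = (int(value) for value in line.split())
        cascades[item].append((time, node))
print(dict(cascades))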