Polarisation update; added greedy algo; removed modularity/EI

leifeld · Jan 11, 2025 · f0bda2f · f0bda2f
1 parent d9ef035
commit f0bda2f
Show file tree

Hide file tree

Showing 13 changed files with 694 additions and 492 deletions.
diff --git a/dna/src/main/java/dna/Dna.java b/dna/src/main/java/dna/Dna.java
@@ -17,7 +17,7 @@ public class Dna {
 	public static Dna dna;
 	public static Logger logger;
 	public static Sql sql;
-	public static final String date = "2025-01-10";
+	public static final String date = "2025-01-11";
 	public static final String version = "3.0.11.4";
 	public static final String operatingSystem = System.getProperty("os.name");
 	public static File workingDirectory = null;

diff --git a/dna/src/main/java/dna/HeadlessDna.java b/dna/src/main/java/dna/HeadlessDna.java
@@ -623,23 +623,23 @@ public void rTimeWindow(String networkType, String statementType, String variabl
 	 * @param invertSources      boolean indicating whether the document-level source values should be included (= {@code true}) rather than excluded.
 	 * @param invertSections     boolean indicating whether the document-level section values should be included (= {@code true}) rather than excluded.
 	 * @param invertTypes        boolean indicating whether the document-level type values should be included (= {@code true}) rather than excluded.
-	 * @param k					 The number of clusters or factions, for example 2 for bi-polarisation.
-	 * @param numParents         The number of cluster solutions (i.e., parents) to generate in each iteration, for example 30 or 50.
-	 * @param iterations         For how many generations should the genetic algorithm run at most? This is the maximal number of generations through which optimisation should be attempted. Will be lower if early convergence is detected. A suggested starting value is 1000.
-	 * @param elitePercentage    The share of cluster solutions in each parent generation that is copied into the children generation without changes, between 0.0 and 1.0, usually around 0.1.
-	 * @param mutationPercentage The probability with which each bit in any cluster solution is selected for mutation after the cross-over step. For example 0.1 to select 10% of the nodes to swap their memberships.
-	 * @param qualityFunction    The quality function to evaluate cluster solutions. Supported values are "modularity", "eiIndex", and "absdiff".
-	 * @param normaliseMatrices  boolean indicating whether the network matrices should be normalised before computing the polarisation score.
-	 * @param randomSeed         The random seed to use for the random number generator. Pass 0 for random behaviour.
-	 * @return                   A PolarisationResultTimeSeries object containing the results of the genetic algorithm for each time step and iteration.
+	 * @param algorithm          The algorithm to maximise polarisation at each time step. Can be "greedy" (for a greedy algorithm) or "genetic" (for a genetic algorithm).
+	 * @param normaliseScores    boolean indicating whether the polarisation scores should be normalised by dividing them by their theoretical maximum within a given network. This removes the effect of greater activity (for example due to more participating actors or more statements per actor) on the polarisation scores and focuses solely on structure given the edge mass in the network. Without normalisation, time periods with more actors and activity will yield higher polarisation scores even if the degree of division over concepts stays constant.
+	 * @param numClusters        The number of clusters or factions k, for example 2 for bi-polarisation.
+	 * @param numParents         Only for the genetic algorithm: The number of cluster solutions (i.e., parents) to generate in each iteration, for example 30 or 50.
+	 * @param numIterations      Only for the genetic algorithm: For how many generations should the genetic algorithm run at most? This is the maximal number of generations through which optimisation should be attempted. Will be lower if early convergence is detected. A suggested starting value is 1000.
+	 * @param elitePercentage    Only for the genetic algorithm: The share of cluster solutions in each parent generation that is copied into the children generation without changes, between 0.0 and 1.0, usually around 0.1.
+	 * @param mutationPercentage Only for the genetic algorithm: The probability with which each bit in any cluster solution is selected for mutation after the cross-over step. For example 0.1 to select 10% of the nodes to swap their memberships.
+	 * @param randomSeed         Only for the genetic algorithm: The random seed to use for the random number generator. Pass 0 for random behaviour.
+	 * @return                   A PolarisationResultTimeSeries object containing the results of the algorithm for each time step and iteration.
 	 */
 	public PolarisationResultTimeSeries rPolarisation(String statementType, String variable1, boolean variable1Document,
 			String variable2, boolean variable2Document, String qualifier, String duplicates, String startDate,
 			String stopDate, String timeWindow, int windowSize, String kernel, boolean indentTime,
 			String[] excludeVariables, String[] excludeValues, String[] excludeAuthors, String[] excludeSources,
 			String[] excludeSections, String[] excludeTypes, boolean invertValues, boolean invertAuthors,
-			boolean invertSources, boolean invertSections, boolean invertTypes, int k, int numParents, int iterations,
-			double elitePercentage, double mutationPercentage, String qualityFunction, boolean normaliseMatrices,
+			boolean invertSources, boolean invertSections, boolean invertTypes, String algorithm, boolean normaliseScores,
+			int numClusters, int numParents, int numIterations, double elitePercentage, double mutationPercentage,
 			long randomSeed) {
 
 		// step 1: preprocess arguments
@@ -667,13 +667,11 @@ public PolarisationResultTimeSeries rPolarisation(String statementType, String v
 			}
 		}
 
-		Polarisation polarisation = new Polarisation(st, variable1, variable1Document, variable2,
-				variable2Document, qualifier, false, duplicates,
-				ldtStart, ldtStop, timeWindow, windowSize, map, excludeAuthors, excludeSources,
-				excludeSections, excludeTypes, invertValues, invertAuthors, invertSources,
-				invertSections, invertTypes, kernel, indentTime, k, numParents,
-				iterations, elitePercentage, mutationPercentage, qualityFunction, normaliseMatrices,
-				randomSeed);
+		Polarisation polarisation = new Polarisation(st, variable1, variable1Document, variable2, variable2Document,
+				qualifier, false, duplicates, ldtStart, ldtStop, timeWindow, windowSize, kernel, indentTime,
+				map, excludeAuthors, excludeSources,	excludeSections, excludeTypes, invertValues, invertAuthors,
+				invertSources, invertSections, invertTypes, algorithm, normaliseScores, numClusters, numParents,
+				numIterations, elitePercentage, mutationPercentage, randomSeed);
 
 		return polarisation.getResults();
 	}

diff --git a/dna/src/main/java/dna/export/Exporter.java b/dna/src/main/java/dna/export/Exporter.java
@@ -12,7 +12,6 @@
 import model.*;
 import org.apache.commons.math3.linear.EigenDecomposition;
 import org.apache.commons.math3.linear.RealMatrix;
-import org.apache.commons.math3.analysis.function.Log;
 import org.apache.commons.math3.linear.Array2DRowRealMatrix;
 import org.jdom.Attribute;
 import org.jdom.Comment;
@@ -27,7 +26,6 @@
 import org.ojalgo.matrix.decomposition.Eigenvalue;
 
 import java.io.*;
-import java.lang.reflect.Array;
 import java.time.Duration;
 import java.time.LocalDateTime;
 import java.time.Period;