diff --git a/dna/src/main/java/dna/Dna.java b/dna/src/main/java/dna/Dna.java index df6b3771..f3b6a4b1 100644 --- a/dna/src/main/java/dna/Dna.java +++ b/dna/src/main/java/dna/Dna.java @@ -17,7 +17,7 @@ public class Dna { public static Dna dna; public static Logger logger; public static Sql sql; - public static final String date = "2025-01-10"; + public static final String date = "2025-01-11"; public static final String version = "3.0.11.4"; public static final String operatingSystem = System.getProperty("os.name"); public static File workingDirectory = null; diff --git a/dna/src/main/java/dna/HeadlessDna.java b/dna/src/main/java/dna/HeadlessDna.java index 134c0b83..1f3ec660 100644 --- a/dna/src/main/java/dna/HeadlessDna.java +++ b/dna/src/main/java/dna/HeadlessDna.java @@ -623,23 +623,23 @@ public void rTimeWindow(String networkType, String statementType, String variabl * @param invertSources boolean indicating whether the document-level source values should be included (= {@code true}) rather than excluded. * @param invertSections boolean indicating whether the document-level section values should be included (= {@code true}) rather than excluded. * @param invertTypes boolean indicating whether the document-level type values should be included (= {@code true}) rather than excluded. - * @param k The number of clusters or factions, for example 2 for bi-polarisation. - * @param numParents The number of cluster solutions (i.e., parents) to generate in each iteration, for example 30 or 50. - * @param iterations For how many generations should the genetic algorithm run at most? This is the maximal number of generations through which optimisation should be attempted. Will be lower if early convergence is detected. A suggested starting value is 1000. - * @param elitePercentage The share of cluster solutions in each parent generation that is copied into the children generation without changes, between 0.0 and 1.0, usually around 0.1. - * @param mutationPercentage The probability with which each bit in any cluster solution is selected for mutation after the cross-over step. For example 0.1 to select 10% of the nodes to swap their memberships. - * @param qualityFunction The quality function to evaluate cluster solutions. Supported values are "modularity", "eiIndex", and "absdiff". - * @param normaliseMatrices boolean indicating whether the network matrices should be normalised before computing the polarisation score. - * @param randomSeed The random seed to use for the random number generator. Pass 0 for random behaviour. - * @return A PolarisationResultTimeSeries object containing the results of the genetic algorithm for each time step and iteration. + * @param algorithm The algorithm to maximise polarisation at each time step. Can be "greedy" (for a greedy algorithm) or "genetic" (for a genetic algorithm). + * @param normaliseScores boolean indicating whether the polarisation scores should be normalised by dividing them by their theoretical maximum within a given network. This takes away the effect of more activity (possibly due to participation by more actors or more statements per actor) contributing to polarisation scores and focuses solely on structure given the edge mass in the network. Without normalisation, time periods with more actors and activity will elevate the polarisation of the network (at constant levels of being divided over concepts). + * @param numClusters The number of clusters or factions k, for example 2 for bi-polarisation. + * @param numParents Only for the genetic algorithm: The number of cluster solutions (i.e., parents) to generate in each iteration, for example 30 or 50. + * @param numterations Only for the genetic algorithm: For how many generations should the genetic algorithm run at most? This is the maximal number of generations through which optimisation should be attempted. Will be lower if early convergence is detected. A suggested starting value is 1000. + * @param elitePercentage Only for the genetic algorithm: The share of cluster solutions in each parent generation that is copied into the children generation without changes, between 0.0 and 1.0, usually around 0.1. + * @param mutationPercentage Only for the genetic algorithm: The probability with which each bit in any cluster solution is selected for mutation after the cross-over step. For example 0.1 to select 10% of the nodes to swap their memberships. + * @param randomSeed Only for the genetic algorithm: The random seed to use for the random number generator. Pass 0 for random behaviour. + * @return A PolarisationResultTimeSeries object containing the results of the algorithm for each time step and iteration. */ public PolarisationResultTimeSeries rPolarisation(String statementType, String variable1, boolean variable1Document, String variable2, boolean variable2Document, String qualifier, String duplicates, String startDate, String stopDate, String timeWindow, int windowSize, String kernel, boolean indentTime, String[] excludeVariables, String[] excludeValues, String[] excludeAuthors, String[] excludeSources, String[] excludeSections, String[] excludeTypes, boolean invertValues, boolean invertAuthors, - boolean invertSources, boolean invertSections, boolean invertTypes, int k, int numParents, int iterations, - double elitePercentage, double mutationPercentage, String qualityFunction, boolean normaliseMatrices, + boolean invertSources, boolean invertSections, boolean invertTypes, String algorithm, boolean normaliseScores, + int numClusters, int numParents, int numIterations, double elitePercentage, double mutationPercentage, long randomSeed) { // step 1: preprocess arguments @@ -667,13 +667,11 @@ public PolarisationResultTimeSeries rPolarisation(String statementType, String v } } - Polarisation polarisation = new Polarisation(st, variable1, variable1Document, variable2, - variable2Document, qualifier, false, duplicates, - ldtStart, ldtStop, timeWindow, windowSize, map, excludeAuthors, excludeSources, - excludeSections, excludeTypes, invertValues, invertAuthors, invertSources, - invertSections, invertTypes, kernel, indentTime, k, numParents, - iterations, elitePercentage, mutationPercentage, qualityFunction, normaliseMatrices, - randomSeed); + Polarisation polarisation = new Polarisation(st, variable1, variable1Document, variable2, variable2Document, + qualifier, false, duplicates, ldtStart, ldtStop, timeWindow, windowSize, kernel, indentTime, + map, excludeAuthors, excludeSources, excludeSections, excludeTypes, invertValues, invertAuthors, + invertSources, invertSections, invertTypes, algorithm, normaliseScores, numClusters, numParents, + numIterations, elitePercentage, mutationPercentage, randomSeed); return polarisation.getResults(); } diff --git a/dna/src/main/java/dna/export/Exporter.java b/dna/src/main/java/dna/export/Exporter.java index 0f7c3ea0..9c7b4c0b 100644 --- a/dna/src/main/java/dna/export/Exporter.java +++ b/dna/src/main/java/dna/export/Exporter.java @@ -12,7 +12,6 @@ import model.*; import org.apache.commons.math3.linear.EigenDecomposition; import org.apache.commons.math3.linear.RealMatrix; -import org.apache.commons.math3.analysis.function.Log; import org.apache.commons.math3.linear.Array2DRowRealMatrix; import org.jdom.Attribute; import org.jdom.Comment; @@ -27,7 +26,6 @@ import org.ojalgo.matrix.decomposition.Eigenvalue; import java.io.*; -import java.lang.reflect.Array; import java.time.Duration; import java.time.LocalDateTime; import java.time.Period; diff --git a/dna/src/main/java/dna/export/Polarisation.java b/dna/src/main/java/dna/export/Polarisation.java index 06b21890..6a6d8e63 100644 --- a/dna/src/main/java/dna/export/Polarisation.java +++ b/dna/src/main/java/dna/export/Polarisation.java @@ -26,8 +26,8 @@ public class Polarisation { Exporter exporter; final StatementType statementType; - final String variable1, variable2, qualifier, normalization, duplicates, timeWindow, kernel, qualityFunction; - final boolean variable1Document, variable2Document, qualifierDocument, invertValues, invertAuthors, invertSources, invertSections, invertTypes, indentTime, normaliseMatrices; + final String variable1, variable2, qualifier, duplicates, timeWindow, kernel, algorithm; + final boolean variable1Document, variable2Document, qualifierDocument, invertValues, invertAuthors, invertSources, invertSections, invertTypes, indentTime, normaliseScores; final LocalDateTime ldtStart, ldtStop; int windowSize; final HashMap> excludeValueMap; @@ -36,19 +36,26 @@ public class Polarisation { final int numParents, numClusters, numIterations; final double elitePercentage, mutationPercentage; final long randomSeed; - Random rng; PolarisationResultTimeSeries results; public Polarisation(StatementType statementType, String variable1, boolean variable1Document, String variable2, boolean variable2Document, String qualifier, boolean qualifierDocument, String duplicates, - LocalDateTime ldtStart, LocalDateTime ldtStop, String timeWindow, int windowSize, - HashMap> excludeValueMap, String[] excludeAuthors, String[] excludeSources, - String[] excludeSections, String[] excludeTypes, boolean invertValues, boolean invertAuthors, - boolean invertSources, boolean invertSections, boolean invertTypes, String kernel, boolean indentTime, - int numClusters, int numParents, int numIterations, double elitePercentage, double mutationPercentage, - String qualityFunction, boolean normaliseMatrices, long randomSeed) { + LocalDateTime ldtStart, LocalDateTime ldtStop, String timeWindow, int windowSize, String kernel, + boolean indentTime, HashMap> excludeValueMap, String[] excludeAuthors, + String[] excludeSources, String[] excludeSections, String[] excludeTypes, boolean invertValues, + boolean invertAuthors, boolean invertSources, boolean invertSections, boolean invertTypes, + String algorithm, boolean normaliseScores, int numClusters, int numParents, int numIterations, + double elitePercentage, double mutationPercentage, long randomSeed) { // Validate input parameters + if (!algorithm.equals("genetic") && !algorithm.equals("greedy")) { + this.algorithm = "greedy"; + LogEvent log = new LogEvent(Logger.WARNING, "Invalid algorithm.", + "Algorithm must be 'genetic' or 'greedy'. Using 'greedy' instead."); + Dna.logger.log(log); + } else { + this.algorithm = algorithm; + } if (numParents <= 0) { this.numParents = 50; LogEvent log = new LogEvent(Logger.WARNING, "Invalid number of cluster solutions.", @@ -89,14 +96,6 @@ public Polarisation(StatementType statementType, String variable1, boolean varia } else { this.mutationPercentage = mutationPercentage; } - if (!qualityFunction.equals("modularity") && !qualityFunction.equals("eiIndex") && !qualityFunction.equals("absdiff")) { - this.qualityFunction = "absdiff"; - LogEvent log = new LogEvent(Logger.WARNING, "Invalid quality function.", - "Quality function must be 'modularity', 'eiIndex', or 'absdiff'. Using 'absdiff' instead."); - Dna.logger.log(log); - } else { - this.qualityFunction = qualityFunction; - } this.statementType = statementType; this.variable1 = variable1; @@ -122,10 +121,8 @@ public Polarisation(StatementType statementType, String variable1, boolean varia this.invertTypes = invertTypes; this.kernel = kernel; this.indentTime = indentTime; - this.normaliseMatrices = normaliseMatrices; + this.normaliseScores = normaliseScores; this.randomSeed = randomSeed; - this.rng = (randomSeed == 0) ? new Random() : new Random(randomSeed); // Initialize random number generator - this.normalization = "average"; this.congruence = new ArrayList(); this.conflict = new ArrayList(); @@ -140,7 +137,7 @@ public Polarisation(StatementType statementType, String variable1, boolean varia this.qualifier, false, "subtract", - this.normalization, + "average", false, this.duplicates, this.ldtStart, @@ -168,11 +165,11 @@ public Polarisation(StatementType statementType, String variable1, boolean varia this.computeKernelSmoothedTimeSlices(); - if (normaliseMatrices) { - this.normaliseMatrices(); + if (this.algorithm.equals("genetic")) { + this.results = this.geneticAlgorithm(); + } else if (this.algorithm.equals("greedy")) { + this.results = this.greedyAlgorithm(); } - - this.results = this.geneticAlgorithm(); } public PolarisationResultTimeSeries getResults() { @@ -180,117 +177,16 @@ public PolarisationResultTimeSeries getResults() { } /** - * Compute Newman's modularity score for a given binary or weighted network - * matrix. It works with signed networks and takes loops into account if they - * exist. This implementation is based on the formulation presented in Newman - * (2004), "Finding and evaluating community structure in networks," Physical - * Review E, Equation 6. Values of 1 indicate perfect community structure, 0 - * indicates no community structure, and values of -1 indicate complete ties - * between groups but no ties within groups, which is inversely related to the - * interpretation of the EI index. - * - * @param mem The community membership array (one value per node). - * @param mat The network matrix (binary or weighted adjacency matrix). - * @param K The number of communities. - * @return The modularity score. - */ - private double qualityModularity(int[] mem, double[][] mat, int K) { - if (mat == null || mat.length == 0 || mat.length != mat[0].length) { - throw new IllegalArgumentException("Matrix must be square and non-empty."); - } - if (mem == null || mem.length != mat.length) { - throw new IllegalArgumentException( - "Community membership array must match the number of nodes in the matrix."); - } - if (K <= 0) { - throw new IllegalArgumentException("Number of communities (K) must be greater than 0."); - } - - int n = mat.length; // Number of nodes - double[] degrees = new double[n]; - double m = 0.0; // Total weight of all edges - - // Precompute degrees (k_i) and total edge weight (m) - // This corresponds to the summation \sum_j A_{ij} for k_i in the modularity formula - for (int i = 0; i < n; i++) { - for (int j = 0; j < n; j++) { - degrees[i] += mat[i][j]; - m += mat[i][j]; - } - } - m /= 2.0; // Divide total edge weight by 2 to account for double-counting i-j and j-i - - // Compute modularity Q using the edge-based approach - double Q = 0.0; - for (int i = 0; i < n; i++) { - for (int j = 0; j < n; j++) { - if (mem[i] == mem[j]) { // Check if nodes i and j are in the same community (\delta(c_i, c_j)) - // Add the contribution of this pair to modularity - // The first term (A_{ij}) and the second term (-k_i*k_j/2m) of Equation 6 are combined here - Q += mat[i][j] - (degrees[i] * degrees[j]) / (2.0 * m); - } - } - } - - // Normalize the modularity score by dividing by 2m (as per the modularity - // formula) - return Q / (2.0 * m); - } - - /** - * Compute the E-I index for binary or weighted networks. - * - * This implementation calculates the E-I index as introduced by Krackhardt - * (1987): "Cognitive Social Structures," Social Networks, 9(2), 109–134 (DOI: - * 10.1016/0378-8733(87)90009-8). The function works for binary or weighted - * networks and directed or undirected networks, while self-loops are ignored. - * Note that -1 indicates complete segregation, 0 indicates no segregation, and - * 1 indicates ties only between groups (i.e. the inverse interpretation of - * modularity). - * - * @param memberships The community membership array (one value per node). - * @param mat The network matrix (weighted adjacency matrix). - * @return The E-I index. - */ - private double qualityEi(int[] memberships, double[][] mat) { - double external = 0.0, internal = 0.0; - int n = mat.length; - - // Iterate over all pairs (i, j) - for (int i = 0; i < n; i++) { - for (int j = 0; j < n; j++) { - if (i != j) { // Exclude self-loops - if (memberships[i] == memberships[j]) { - internal += mat[i][j]; - } else { - external += mat[i][j]; - } - } - } - } - - // Compute total weight of ties - double total = external + internal; - - // Handle division by zero - if (total == 0) { - return 0.0; // No ties in the network - } - - // Compute the combined E-I index - return (external - internal) / total; - } - - /** - * Calculates the quality of polarization based on the absolute differences + * Calculates the quality of polarisation based on the absolute differences * between observed and expected congruence and conflict within and between clusters. * * @param memberships An array where each element represents the cluster membership of a node. * @param congruenceNetwork A 2D array representing the congruence network. * @param conflictNetwork A 2D array representing the conflict network. + * @param normaliseScores Should the result be divided by its theoretical maximum (the sum of the two matrix norms)? * @return The quality of polarization as a double value. */ - private double qualityAbsdiff(int[] memberships, double[][] congruenceNetwork, double[][] conflictNetwork) { + private double qualityAbsdiff(int[] memberships, double[][] congruenceNetwork, double[][] conflictNetwork, boolean normalise) { double congruenceNorm = calculateMatrixNorm(congruenceNetwork); double conflictNorm = calculateMatrixNorm(conflictNetwork); @@ -314,10 +210,10 @@ private double qualityAbsdiff(int[] memberships, double[][] congruenceNetwork, d for (int j = 0; j < congruenceNetwork[0].length; j++) { if (i != j) { if (memberships[i] == memberships[j]) { - absdiff += Math.abs(congruenceNetwork[i][j] - expectedWithinClusterCongruence[memberships[i]]); // Within-cluster congruenc - absdiff += Math.abs(conflictNetwork[i][j]); // Conflict within clusters + absdiff += Math.abs(congruenceNetwork[i][j] - expectedWithinClusterCongruence[memberships[i]]); // Within-cluster congruence + // absdiff += Math.abs(conflictNetwork[i][j]); // Conflict within clusters // not necessary, would count deviation twice } else { - absdiff += Math.abs(congruenceNetwork[i][j]); // Between-cluster congruence + // absdiff += Math.abs(congruenceNetwork[i][j]); // Between-cluster congruence // not necessary, would count deviation twice double betweenFactor = (double) clusterMembers[memberships[i]] * clusterMembers[memberships[j]] / numBetweenClusterDyads; double expectedBetweenClusterConflict = betweenFactor * (conflictNorm / numBetweenClusterDyads); absdiff += Math.abs(conflictNetwork[i][j] - expectedBetweenClusterConflict); // Between-cluster conflict @@ -325,62 +221,11 @@ private double qualityAbsdiff(int[] memberships, double[][] congruenceNetwork, d } } } - return 0.5 * absdiff; - } - - /** - * Normalises the congruence and conflict matrices by dividing each element - * by the joint norm of the corresponding matrices. The joint norm is - * calculated as the sum of the norms of the congruence and conflict matrices - * at each time step. A progress bar is displayed to indicate the progress - * of the normalisation process. - */ - private void normaliseMatrices() { - try (ProgressBar pb = new ProgressBar("Normalisation", this.congruence.size())) { - for (int t = 0; t < congruence.size(); t++) { - double jointNorm = calculateMatrixNorm(congruence.get(t).getMatrix()) + calculateMatrixNorm(conflict.get(t).getMatrix()); - for (int i = 0; i < congruence.get(t).getMatrix().length; i++) { - for (int j = 0; j < congruence.get(t).getMatrix()[0].length; j++) { - congruence.get(t).getMatrix()[i][j] /= jointNorm; - conflict.get(t).getMatrix()[i][j] /= jointNorm; - } - } - pb.step(); - } - } - } - - /** - * For a given int array, rank its values in descending order, starting at 0. - * - * @param arr An int array. - * @return An array of ranks, starting with 0. - */ - private int[] calculateRanks(int... arr) { - class Pair { - final int value; - final int index; - - Pair(int value, int index) { - this.value = value; - this.index = index; - } - } - - Pair[] pairs = new Pair[arr.length]; - for (int index = 0; index < arr.length; ++index) { - pairs[index] = new Pair(arr[index], index); - } - - // Sort pairs by value in descending order - Arrays.sort(pairs, (pair1, pair2) -> -Integer.compare(pair1.value, pair2.value)); - - int[] ranks = new int[arr.length]; - for (int i = 0; i < pairs.length; ++i) { - ranks[pairs[i].index] = i; + if (normalise) { + return (absdiff / (congruenceNorm + conflictNorm)); + } else { + return absdiff; } - - return ranks; } /** @@ -502,7 +347,7 @@ private class ClusterSolution implements Cloneable { * @param rng The random number generator. * @throws IllegalArgumentException If any parameter is invalid. */ - ClusterSolution(int n, int k) { + ClusterSolution(int n, int k, Random rng) { if (n <= 0) { throw new IllegalArgumentException("N must be positive."); } @@ -514,16 +359,7 @@ private class ClusterSolution implements Cloneable { } this.N = n; this.K = k; - this.memberships = createRandomMemberships(n, k, Polarisation.this.rng); - } - - /** - * Returns the number of clusters. - * - * @return The number of clusters. - */ - public int getK() { - return K; + this.memberships = createRandomMemberships(n, k, rng); } /** @@ -596,9 +432,10 @@ private int[] createRandomMemberships(int N, int K, Random rng) { * a foreign solution to produce an offspring with balanced cluster distribution. * * @param foreignMemberships A membership vector of a foreign cluster solution. + * @param rng The random number generator to use. * @throws IllegalArgumentException If the input vector is invalid or incompatible. */ - int[] crossover(int[] foreignMemberships) { + int[] crossover(int[] foreignMemberships, Random rng) { // Validate input if (foreignMemberships == null || foreignMemberships.length != this.memberships.length) { throw new IllegalArgumentException("Incompatible membership vector lengths."); @@ -610,7 +447,7 @@ int[] crossover(int[] foreignMemberships) { int[] newMemberships = performRelabeling(this.memberships, foreignMemberships, overlapMatrix); // Step 2: Perform random crossover between relabeled membership vectors - newMemberships = performCrossover(newMemberships, foreignMemberships); + newMemberships = performCrossover(newMemberships, foreignMemberships, rng); // Step 3: Adjust cluster distribution to achieve balance newMemberships = balanceClusterDistribution(newMemberships, K); @@ -670,12 +507,16 @@ private int[] performRelabeling(int[] memberships1, int[] memberships2, int[][] /** * Performs crossover by randomly combining bits from two membership vectors. + * + * @param memberships1 The first membership vector. + * @param memberships2 The second membership vector. + * @param rng The random number generator to use. + * @return recombined membership vector */ - private int[] performCrossover(int[] memberships1, int[] memberships2) { - Random rand = new Random(); // Optionally pass a seed here for reproducibility + private int[] performCrossover(int[] memberships1, int[] memberships2, Random rng) { int[] result = new int[memberships1.length]; for (int i = 0; i < memberships1.length; i++) { - result[i] = (rand.nextBoolean()) ? memberships1[i] : memberships2[i]; + result[i] = (rng.nextBoolean()) ? memberships1[i] : memberships2[i]; } return result; } @@ -733,17 +574,24 @@ private class GeneticIteration { final int numElites; final int numMutations; final ArrayList clusterSolutions; - final double[] q; // quality scores for each cluster solution + final boolean normalise; + double[] q; // quality scores for each cluster solution ArrayList children; // children cluster solutions /** * Performs a single iteration of the genetic algorithm, including quality * evaluation, elite retention, crossover, and mutation. * + * @param clusterSolutions The cluster solutions (= parents). + * @param congruenceNetwork The congruence matrix. + * @param conflictNetwork The conflict matrix. + * @param normalise Should the quality/fitness scores be normalised? + * @param rng The random number generator to use. * @return A list of children cluster solutions. */ - GeneticIteration(ArrayList clusterSolutions, double[][] congruenceNetwork, double[][] conflictNetwork) { + GeneticIteration(ArrayList clusterSolutions, double[][] congruenceNetwork, double[][] conflictNetwork, boolean normalise, Random rng) { this.clusterSolutions = new ArrayList<>(clusterSolutions); + this.normalise = normalise; this.congruenceNetwork = congruenceNetwork.clone(); this.conflictNetwork = conflictNetwork.clone(); this.n = this.congruenceNetwork.length; @@ -760,31 +608,27 @@ private class GeneticIteration { "Number of mutations based on the mutation percentage."); Dna.logger.log(log); - this.q = evaluateQuality(congruenceNetwork, conflictNetwork); + this.q = evaluateQuality(this.congruenceNetwork, this.conflictNetwork, this.normalise); this.children = eliteRetentionStep(this.clusterSolutions, this.q, this.numElites); - this.children = crossoverStep(this.clusterSolutions, this.q, this.children); - this.children = mutationStep(this.children, this.numMutations, this.n); + this.children = crossoverStep(this.clusterSolutions, this.q, this.children, rng); + this.children = mutationStep(this.children, this.numMutations, this.n, rng); } /** * Evaluates the quality of cluster solutions using the specified quality function. - * The quality scores are transformed to the range [0, 1] where 1 is high fitness. + * The quality scores are transformed to the range [-Inf, 0] where 0 is high fitness + * or [-1, 0] if normalisation is used. * * @param congruenceNetwork The congruence network matrix. - * @param conflictNetwork The conflict network matrix. + * @param conflictNetwork The conflict network matrix. + * @param normalise Normalise the results? * @return An array of quality scores for each cluster solution. */ - private double[] evaluateQuality(double[][] congruenceNetwork, double[][] conflictNetwork) { + private double[] evaluateQuality(double[][] congruenceNetwork, double[][] conflictNetwork, boolean normalise) { double[] q = new double[clusterSolutions.size()]; for (int i = 0; i < clusterSolutions.size(); i++) { int[] mem = clusterSolutions.get(i).getMemberships(); - if (qualityFunction.equals("modularity")) { - q[i] = 2.0 * (qualityModularity(mem, congruenceNetwork, numClusters) - qualityModularity(mem, congruenceNetwork, numClusters)); - } else if (qualityFunction.equals("eiIndex")) { - q[i] = 2.0 * (qualityEi(mem, conflictNetwork) - qualityEi(mem, congruenceNetwork)); - } else if (qualityFunction.equals("absdiff")) { - q[i] = qualityAbsdiff(mem, congruenceNetwork, conflictNetwork); - } + q[i] = qualityAbsdiff(mem, congruenceNetwork, conflictNetwork, normalise); } return q; } @@ -823,94 +667,96 @@ private ArrayList eliteRetentionStep (ArrayList crossoverStep(ArrayList clusterSolutions, double[] q, ArrayList children) { + private ArrayList crossoverStep(ArrayList clusterSolutions, double[] q, ArrayList children, Random rng) { - // Replace negative quality values with zero to ensure roulette sampling works + // adjust fitness scores to ensure that they are all non-negative and the sum is positive to make roulette wheel selection work + double qMinimum = 0.0, qMaximum = 0.0, qTotal = 0.0; for (int i = 0; i < q.length; i++) { - if (q[i] < 0) { - q[i] = 0; + if (i == 0) { + qMinimum = q[i]; + qMaximum = q[i]; + } else { + if (q[i] < qMinimum) { + qMinimum = q[i]; + } + if (q[i] > qMaximum) { + qMaximum = q[i]; + } } + qTotal += q[i]; } - - // Compute the total quality - double qTotal = 0.0; - for (double quality : q) { - qTotal += quality; - } - - // Handle case where total quality is zero - if (qTotal == 0) { - // Replace all q values with equal probabilities + if (qMinimum < 0) { // either completely in the [-Inf, 0] range or in [-x, x] with unknown x (e.g., modularity, where x = 1) -> shift to [0, 2x] by subtracting lowest (negative) value + qTotal = 0.0; for (int i = 0; i < q.length; i++) { - q[i] = 1.0; // Assign uniform score + q[i] = q[i] - qMinimum; + qTotal += q[i]; } - qTotal = q.length; // New total becomes the number of items - LogEvent log = new LogEvent(Logger.MESSAGE, "Total quality is zero. Using uniform probabilities.", "Roulette wheel sampling fallback."); - Dna.logger.log(log); } - - // Generate additional children until the desired total size is reached - int numClusterSolutions = clusterSolutions.size(); - while (children.size() < numClusterSolutions) { - // Perform weighted sampling for the first parent - double r1 = rng.nextDouble() * qTotal; - int index1 = selectIndexByRoulette(q, r1); - ClusterSolution parent1 = clusterSolutions.get(index1); - - // Perform weighted sampling for the second parent, ensuring it's different from the first - int index2; - // next two lines: perhaps worth trying to ensure that the second parent is different from the first, but there are apparently cases where this is not possible, so we don't do it for now - double r2 = rng.nextDouble() * qTotal; - index2 = selectIndexByRoulette(q, r2); - ClusterSolution parent2 = clusterSolutions.get(index2); - - // Clone and perform crossover - try { - ClusterSolution child = (ClusterSolution) parent1.clone(); - child.crossover(parent2.getMemberships()); - children.add(child); - } catch (CloneNotSupportedException e) { - LogEvent log = new LogEvent(Logger.ERROR, - "Cluster solution could not be cloned.", - "A child was not added to the generation."); - Dna.logger.log(log); + if (qTotal == 0.0) { // all values are 0 -> replace by uniform probabilities + for (int i = 0; i < q.length; i++) { + q[i] = 1.0; + qTotal += 1.0; } } - return children; - } + // hybrid roulette wheel sampling for fitness-proportional sampling with uniform random sampling element to create more diversity in the gene pool + while (children.size() < numParents) { + int firstParentIndex = -1, secondParentIndex = -1; - /** - * Selects the index of a cluster solution using roulette wheel sampling based on a random value. - * - * @param q The quality scores of the cluster solutions. - * @param r The random value for selection. - * @return The index of the selected cluster solution. - */ - private int selectIndexByRoulette(double[] q, double r) { - double cumulative = 0.0; - for (int i = 0; i < q.length; i++) { - cumulative += q[i]; - if (r <= cumulative) { - return i; + // select first parent with roulette wheel sampling (= probability proportional to fitness) + double r = rng.nextDouble() * qTotal; + double cumulative = 0.0; + for (int i = 0; i < q.length; i++) { + cumulative += q[i]; + if (r <= cumulative) { + firstParentIndex = i; + secondParentIndex = i; // provisional value to avoid breeding with oneself + break; + } } + + // select second parent with roulette wheel sampling or uniform random sampling + while (secondParentIndex == firstParentIndex) { // avoid breeding with oneself + // flip a coin to decide whether the second parent is selected via roulette wheel sampling or uniform random sampling + if (rng.nextDouble() <= 0.5) { + // select second parent with roulette wheel sampling (= probability proportional to fitness) + r = rng.nextDouble() * qTotal; + cumulative = 0.0; + for (int i = 0; i < q.length; i++) { + cumulative += q[i]; + if (r <= cumulative) { + secondParentIndex = i; + break; + } + } + } else { + // select second parent with uniform random sampling (to create more diversity in the gene pool) + secondParentIndex = rng.nextInt(q.length); + } + } + + // create child by crossover of the two selected parents + ClusterSolution c = clusterSolutions.get(firstParentIndex); + int[] child = c.crossover(clusterSolutions.get(secondParentIndex).getMemberships(), rng); + children.add(new ClusterSolution(n, numClusters, child)); } - LogEvent log = new LogEvent(Logger.WARNING, "Roulette wheel selection failed.", "Returning the last index."); - Dna.logger.log(log); - return q.length - 1; // Fallback in case of rounding issues + + return children; } /** * Mutation step: Randomly select some pairs of cluster memberships ("chromosomes") in non-elite solutions and swap around their cluster membership. * - * @param children The children generation of cluster solutions as an array list. - * @param numMutations The number of mutations to perform. - * @param n The number of nodes in the network. + * @param children The children generation of cluster solutions as an array list. + * @param numMutations The number of mutations to perform. + * @param n The number of nodes in the network. + * @param rng The random number generator to use. * @return An array list with the mutated children generation of cluster solutions. */ - private ArrayList mutationStep(ArrayList children, int numMutations, int n) { + private ArrayList mutationStep(ArrayList children, int numMutations, int n, Random rng) { if (numMutations <= 0) { return children; // No mutations to perform } @@ -957,33 +803,6 @@ public ArrayList getChildren() { return this.children; } - /** - * Returns the number of nodes in the network. - * - * @return The number of nodes in the network. - */ - public int getN() { - return n; - } - - /** - * Returns the number of elite cluster solutions. - * - * @return The number of elite cluster solutions. - */ - public int getNumElites() { - return numElites; - } - - /** - * Returns the number of mutations performed. - * - * @return The number of mutations performed. - */ - public int getNumMutations() { - return numMutations; - } - /** * Returns the quality scores for each cluster solution. * @@ -1002,6 +821,7 @@ public double[] getQ() { * @return A PolarisationResultTimeSeries object containing the results of the genetic algorithm for each time step and iteration. */ public PolarisationResultTimeSeries geneticAlgorithm () { + Random rng = (this.randomSeed == 0) ? new Random() : new Random(this.randomSeed); // Initialize random number generator ArrayList polarisationResults = new ArrayList<>(); try (ProgressBar pb = new ProgressBar("Genetic algorithm", this.congruence.size())) { for (int t = 0; t < this.congruence.size(); t++) { @@ -1020,13 +840,13 @@ public PolarisationResultTimeSeries geneticAlgorithm () { // Create initially random cluster solutions; supply the number of nodes and clusters ArrayList cs = new ArrayList(); for (int i = 0; i < numParents; i++) { - cs.add(new ClusterSolution(this.congruence.get(t).getMatrix().length, numClusters)); + cs.add(new ClusterSolution(this.congruence.get(t).getMatrix().length, numClusters, rng)); } // Run through iterations and do the breeding, then collect results and stats lastIndex = numIterations - 1; // choose last possible value here as a default if early convergence does not happen for (int i = 0; i < numIterations; i++) { - GeneticIteration geneticIteration = new GeneticIteration(cs, this.congruence.get(t).getMatrix(), this.conflict.get(t).getMatrix()); + GeneticIteration geneticIteration = new GeneticIteration(cs, this.congruence.get(t).getMatrix(), this.conflict.get(t).getMatrix(), this.normaliseScores, rng); cs = geneticIteration.getChildren(); // compute summary statistics based on iteration step and retain them @@ -1071,7 +891,6 @@ public PolarisationResultTimeSeries geneticAlgorithm () { // correct for early convergence in results vectors int finalIndex = lastIndex; - /* for (int i = lastIndex; i >= 0; i--) { if (maxQArray[i] == maxQArray[lastIndex]) { finalIndex = i; @@ -1079,7 +898,6 @@ public PolarisationResultTimeSeries geneticAlgorithm () { break; } } - */ double[] maxQArrayTemp = new double[finalIndex + 1]; double[] avgQArrayTemp = new double[finalIndex + 1]; @@ -1456,4 +1274,110 @@ private ArrayList[][][] create3dArray(String[] var1Values, Stri return X; } + + + /** + * Prepare the greedy membership swapping algorithm and run all the iterations. + * Take out the maximum quality measure at the last step and create an object + * that stores the polarisation results. + */ + private PolarisationResultTimeSeries greedyAlgorithm () { + Random rng = (this.randomSeed == 0) ? new Random() : new Random(this.randomSeed); // Initialize random number generator + ArrayList polarisationResults = new ArrayList(); + + // for each time step, run the algorithm over the cluster solutions; retain quality and memberships + double[][] congruenceMatrix, conflictMatrix; + int t, oldI, oldJ; + ArrayList maxQArray = new ArrayList(); + int[] bestMemberships, mem, mem2; + double maxQ, q1, q2; + boolean noChanges; + + try (ProgressBar pb = new ProgressBar("Greedy algorithm", this.congruence.size())) { + for (t = 0; t < congruence.size(); t++) { // go through all time steps of the time window networks + maxQArray.clear(); + congruenceMatrix = congruence.get(t).getMatrix(); + conflictMatrix = conflict.get(t).getMatrix(); + double combinedNorm = calculateMatrixNorm(congruenceMatrix) + calculateMatrixNorm(congruenceMatrix); + + if (congruenceMatrix.length > 0 || combinedNorm == 0.0) { // if the network has no nodes or edges, skip this step and return 0 directly + + // Create initially random cluster solution to update + ClusterSolution cs = new ClusterSolution(congruence.get(t).getMatrix().length, numClusters, rng); + mem = cs.getMemberships(); + + // evaluate quality of initial solution + maxQArray.add(qualityAbsdiff(mem, congruenceMatrix, conflictMatrix, this.normaliseScores)); + bestMemberships = mem.clone(); + maxQ = maxQArray.get(0); + + boolean convergence = false; + while (!convergence) { // run the two nested for-loops repeatedly until there are no more swaps + noChanges = true; + for (int i = 0; i < mem.length; i++) { + for (int j = 1; j < mem.length; j++) { // swap positions i and j in the membership vector and see if leads to higher fitness + if (i < j && mem[i] != mem[j]) { + mem2 = mem.clone(); + oldI = mem2[i]; + oldJ = mem2[j]; + mem2[i] = oldJ; + mem2[j] = oldI; + q1 = qualityAbsdiff(mem, congruenceMatrix, conflictMatrix, this.normaliseScores); + q2 = qualityAbsdiff(mem2, congruenceMatrix, conflictMatrix, this.normaliseScores); + if (q2 > q1) { // candidate solution has higher fitness -> keep it + mem = mem2.clone(); // accept the new solution if it was better than the previous + maxQArray.add(q2); + maxQ = q2; + bestMemberships = mem.clone(); + noChanges = false; + } + } + } + } + if (noChanges) { + convergence = true; + } + } + + double[] maxQArray2 = new double[maxQArray.size()]; + for (int i = 0; i < maxQArray.size(); i++) { + maxQArray2[i] = maxQArray.get(i); + } + + // save results in array as a complex object + double[] avgQArray = maxQArray2; + double[] sdQArray = new double[maxQArray.size()]; + PolarisationResult pr = new PolarisationResult( + maxQArray2, + avgQArray, + sdQArray, + maxQ, + bestMemberships, + congruence.get(t).getRowNames(), + true, + congruence.get(t).getStart(), + congruence.get(t).getStop(), + congruence.get(t).getDateTime()); + polarisationResults.add(pr); + } else { // zero result because network is empty + PolarisationResult pr = new PolarisationResult( + new double[] { 0 }, + new double[] { 0 }, + new double[] { 0 }, + 0.0, + new int[0], + new String[0], + true, + congruence.get(t).getStart(), + congruence.get(t).getStop(), + congruence.get(t).getDateTime()); + polarisationResults.add(pr); + } + pb.step(); + } + } + + PolarisationResultTimeSeries polarisationResultTimeSeries = new PolarisationResultTimeSeries(polarisationResults); + return polarisationResultTimeSeries; + } } \ No newline at end of file diff --git a/rDNA/rDNA/DESCRIPTION b/rDNA/rDNA/DESCRIPTION index 7f99ba63..a0949e4e 100755 --- a/rDNA/rDNA/DESCRIPTION +++ b/rDNA/rDNA/DESCRIPTION @@ -1,6 +1,6 @@ Package: rDNA Version: 3.0.11.4 -Date: 2025-01-10 +Date: 2025-01-11 Title: Discourse Network Analysis in R Authors@R: c(person(given = "Philip", diff --git a/rDNA/rDNA/R/dna_backbone.R b/rDNA/rDNA/R/dna_backbone.R index e5db7878..4c3d6913 100644 --- a/rDNA/rDNA/R/dna_backbone.R +++ b/rDNA/rDNA/R/dna_backbone.R @@ -216,13 +216,13 @@ dna_backbone <- function(method = "nested", invertTypes = FALSE, fileFormat = NULL, outfile = NULL) { - + # wrap the vectors of exclude values for document variables into Java arrays excludeAuthors <- .jarray(excludeAuthors) excludeSources <- .jarray(excludeSources) excludeSections <- .jarray(excludeSections) excludeTypes <- .jarray(excludeTypes) - + # compile exclude variables and values vectors dat <- matrix("", nrow = length(unlist(excludeValues)), ncol = 2) count <- 0 @@ -244,7 +244,7 @@ dna_backbone <- function(method = "nested", } var <- .jarray(var) # array of variable names of each excluded value val <- .jarray(val) # array of values to be excluded - + # encode R NULL as Java null value if necessary if (is.null(qualifier) || is.na(qualifier)) { qualifier <- .jnull(class = "java/lang/String") @@ -255,7 +255,7 @@ dna_backbone <- function(method = "nested", if (is.null(outfile)) { outfile <- .jnull(class = "java/lang/String") } - + # call rBackbone function to compute results .jcall(dnaEnvironment[["dna"]]$headlessDna, "V", @@ -292,12 +292,12 @@ dna_backbone <- function(method = "nested", outfile, fileFormat ) - - exporter <- .jcall(dnaEnvironment[["dna"]]$headlessDna, "Lexport/Exporter;", "getExporter") # get a reference to the Exporter object, in which results are stored + + exporter <- .jcall(dnaEnvironment[["dna"]]$headlessDna, "Ldna/export/Exporter;", "getExporter") # get a reference to the Exporter object, in which results are stored if (!is.null(outfile) && !is.null(fileFormat) && is.character(outfile) && is.character(fileFormat) && fileFormat %in% c("json", "xml")) { message("File exported.") } else if (method[1] %in% c("penalty", "fixed")) { - result <- .jcall(exporter, "Lexport/SimulatedAnnealingBackboneResult;", "getSimulatedAnnealingBackboneResult", simplify = TRUE) + result <- .jcall(exporter, "Ldna/export/SimulatedAnnealingBackboneResult;", "getSimulatedAnnealingBackboneResult", simplify = TRUE) # create a list with various results l <- list() l$penalty <- .jcall(result, "D", "getPenalty") @@ -312,7 +312,7 @@ dna_backbone <- function(method = "nested", l$unpenalized_backbone_loss <- .jcall(result, "D", "getUnpenalizedBackboneLoss") l$unpenalized_redundant_loss <- .jcall(result, "D", "getUnpenalizedRedundantLoss") rn <- .jcall(result, "[S", "getLabels") - + # store the three matrices in the result list fullmat <- .jcall(result, "[[D", "getFullNetwork", simplify = TRUE) rownames(fullmat) <- rn @@ -326,7 +326,7 @@ dna_backbone <- function(method = "nested", rownames(redundantmat) <- rn colnames(redundantmat) <- rn l$redundant_network <- redundantmat - + # store diagnostics per iteration as a data frame d <- data.frame(iteration = 1:.jcall(result, "I", "getIterations"), temperature = .jcall(result, "[D", "getTemperature"), @@ -337,9 +337,9 @@ dna_backbone <- function(method = "nested", current_backbone_size = .jcall(result, "[I", "getCurrentBackboneSize"), optimal_backbone_size = .jcall(result, "[I", "getOptimalBackboneSize"), acceptance_ratio_ma = .jcall(result, "[D", "getAcceptanceRatioMovingAverage")) - + l$diagnostics <- d - + # store start date/time, end date/time, number of statements, call, and class label in each network matrix start <- as.POSIXct(.jcall(result, "J", "getStart"), origin = "1970-01-01") # add the start date/time of the result as an attribute to the matrices attributes(l$full_network)$start <- start @@ -360,7 +360,7 @@ dna_backbone <- function(method = "nested", class(l) <- c("dna_backbone", class(l)) return(l) } else if (method[1] == "nested") { - result <- .jcall(exporter, "Lexport/NestedBackboneResult;", "getNestedBackboneResult", simplify = TRUE) + result <- .jcall(exporter, "Ldna/export/NestedBackboneResult;", "getNestedBackboneResult", simplify = TRUE) d <- data.frame(i = .jcall(result, "[I", "getIteration"), entity = .jcall(result, "[S", "getEntities"), backboneLoss = .jcall(result, "[D", "getBackboneLoss"), @@ -409,7 +409,7 @@ print.dna_backbone <- function(x, trim = 50, ...) { #' @importFrom rlang .data #' @export plot.dna_backbone <- function(x, ma = 500, ...) { - + if (attr(x, "method") != "nested") { # temperature and acceptance probability plot(x = x$diagnostics$iteration, @@ -423,7 +423,7 @@ plot.dna_backbone <- function(x, ma = 500, ...) { # note that better solutions are coded as -1 and need to be skipped: lines(x = x$diagnostics$iteration[x$diagnostics$acceptance_prob >= 0], y = x$diagnostics$acceptance_prob[x$diagnostics$acceptance_prob >= 0]) - + # spectral distance between full network and backbone network per iteration bb_loss <- stats::filter(x$diagnostics$penalized_backbone_loss, rep(1 / ma, ma), @@ -441,7 +441,7 @@ plot.dna_backbone <- function(x, ma = 500, ...) { xlab = "Iteration", ylab = yl, main = ti) - + # number of concepts in the backbone solution per iteration current_size_ma <- stats::filter(x$diagnostics$current_backbone_size, rep(1 / ma, ma), @@ -458,7 +458,7 @@ plot.dna_backbone <- function(x, ma = 500, ...) { ylab = paste0("Number of elements (MA, last ", ma, ")"), main = "Backbone size (red = best)") lines(x = x$diagnostics$iteration, y = optimal_size_ma, col = "red") - + # ratio of recent acceptances accept_ratio <- stats::filter(x$diagnostics$acceptance, rep(1 / ma, ma), @@ -472,33 +472,33 @@ plot.dna_backbone <- function(x, ma = 500, ...) { } else { # create hclust object # define merging pattern: negative numbers are leaves, positive are merged clusters merges_clust <- matrix(nrow = nrow(x) - 1, ncol = 2) - + merges_clust[1,1] <- -nrow(x) merges_clust[1,2] <- -(nrow(x) - 1) - + for (i in 2:(nrow(x) - 1)) { merges_clust[i, 1] <- -(nrow(x) - i) merges_clust[i, 2] <- i - 1 } - + # Initialize empty object a <- list() - + # Add merges a$merge <- merges_clust - + # Define merge heights a$height <- x$backboneLoss[1:nrow(x) - 1] - + # Order of leaves a$order <- 1:nrow(x) - + # Labels of leaves a$labels <- rev(x$entity) - + # Define hclust class class(a) <- "hclust" - + plot(a, ylab = "") } } @@ -527,7 +527,7 @@ autoplot.dna_backbone <- function(object, ..., ma = 500) { bd$current_size_ma <- stats::filter(bd$current_backbone_size, rep(1 / ma, ma), sides = 1) bd$optimal_size_ma <- stats::filter(bd$optimal_backbone_size, rep(1 / ma, ma), sides = 1) bd$accept_ratio <- stats::filter(bd$acceptance, rep(1 / ma, ma), sides = 1) - + # temperature and acceptance probability g_accept <- ggplot2::ggplot(bd, ggplot2::aes(y = .data[["temperature"]], x = .data[["iteration"]])) + ggplot2::geom_line(color = "#a50f15") + @@ -537,7 +537,7 @@ autoplot.dna_backbone <- function(object, ..., ma = 500) { ggplot2::xlab("Iteration") + ggplot2::ggtitle("Temperature and acceptance probability") + ggplot2::theme_bw() - + # spectral distance between full network and backbone network per iteration if (attributes(object)$method == "penalty") { yl <- "Penalized backbone loss" @@ -552,7 +552,7 @@ autoplot.dna_backbone <- function(object, ..., ma = 500) { ggplot2::xlab("Iteration") + ggplot2::ggtitle(ti) + ggplot2::theme_bw() - + # number of concepts in the backbone solution per iteration d <- data.frame(iteration = rep(bd$iteration, 2), size = c(bd$current_size_ma, bd$optimal_size_ma), @@ -565,7 +565,7 @@ autoplot.dna_backbone <- function(object, ..., ma = 500) { ggplot2::ggtitle("Backbone size") + ggplot2::theme_bw() + ggplot2::theme(legend.position = "bottom") - + # ratio of recent acceptances g_ar <- ggplot2::ggplot(bd, ggplot2::aes(y = .data[["accept_ratio"]], x = .data[["iteration"]])) + ggplot2::geom_line() + @@ -573,46 +573,46 @@ autoplot.dna_backbone <- function(object, ..., ma = 500) { ggplot2::xlab("Iteration") + ggplot2::ggtitle("Acceptance ratio") + ggplot2::theme_bw() - + # wrap in list plots <- list(g_accept, g_loss, g_size, g_ar) return(plots) } else { # create hclust object # define merging pattern: negative numbers are leaves, positive are merged clusters merges_clust <- matrix(nrow = nrow(object) - 1, ncol = 2) - + merges_clust[1,1] <- -nrow(object) merges_clust[1,2] <- -(nrow(object) - 1) - + for (i in 2:(nrow(object) - 1)) { merges_clust[i, 1] <- -(nrow(object) - i) merges_clust[i, 2] <- i - 1 } - + # Initialize empty object a <- list() - + # Add merges a$merge <- merges_clust - + # Define merge heights a$height <- object$backboneLoss[1:nrow(object) - 1] height <- a$height - + # Order of leaves a$order <- 1:nrow(object) - + # Labels of leaves a$labels <- rev(object$entity) - + # Define hclust class class(a) <- "hclust" - + # ensure ggraph is installed, otherwise throw error (better than importing it to avoid hard dependency) if (!requireNamespace("ggraph", quietly = TRUE)) { stop("The 'ggraph' package is required for plotting nested backbone dendrograms with 'ggplot2' but was not found. Consider installing it.") } - + g_clust <- ggraph::ggraph(graph = a, layout = "dendrogram", circular = FALSE, @@ -631,7 +631,7 @@ autoplot.dna_backbone <- function(object, ..., ma = 500) { labels = rev(object$entity)) + ggplot2::scale_y_continuous(expand = c(0, 0.01)) + ggplot2::coord_flip() - + return(g_clust) } } @@ -688,13 +688,13 @@ dna_evaluateBackboneSolution <- function(backboneEntities, invertSources = FALSE, invertSections = FALSE, invertTypes = FALSE) { - + # wrap the vectors of exclude values for document variables into Java arrays excludeAuthors <- .jarray(excludeAuthors) excludeSources <- .jarray(excludeSources) excludeSections <- .jarray(excludeSections) excludeTypes <- .jarray(excludeTypes) - + # compile exclude variables and values vectors dat <- matrix("", nrow = length(unlist(excludeValues)), ncol = 2) count <- 0 @@ -716,12 +716,12 @@ dna_evaluateBackboneSolution <- function(backboneEntities, } var <- .jarray(var) # array of variable names of each excluded value val <- .jarray(val) # array of values to be excluded - + # encode R NULL as Java null value if necessary if (is.null(qualifier) || is.na(qualifier)) { qualifier <- .jnull(class = "java/lang/String") } - + # call rBackbone function to compute results result <- .jcall(dnaEnvironment[["dna"]]$headlessDna, "[D", diff --git a/rDNA/rDNA/R/dna_barplot.R b/rDNA/rDNA/R/dna_barplot.R index ecc24ee2..17ccb7a8 100644 --- a/rDNA/rDNA/R/dna_barplot.R +++ b/rDNA/rDNA/R/dna_barplot.R @@ -52,13 +52,13 @@ dna_barplot <- function(statementType = "DNA Statement", invertSources = FALSE, invertSections = FALSE, invertTypes = FALSE) { - + # wrap the vectors of exclude values for document variables into Java arrays excludeAuthors <- .jarray(excludeAuthors) excludeSources <- .jarray(excludeSources) excludeSections <- .jarray(excludeSections) excludeTypes <- .jarray(excludeTypes) - + # compile exclude variables and values vectors dat <- matrix("", nrow = length(unlist(excludeValues)), ncol = 2) count <- 0 @@ -80,15 +80,15 @@ dna_barplot <- function(statementType = "DNA Statement", } var <- .jarray(var) # array of variable names of each excluded value val <- .jarray(val) # array of values to be excluded - + # encode R NULL as Java null value if necessary if (is.null(qualifier) || is.na(qualifier)) { qualifier <- .jnull(class = "java/lang/String") } - + # call rBarplotData function to compute results b <- .jcall(dnaEnvironment[["dna"]]$headlessDna, - "Lexport/BarplotResult;", + "Ldna/export/BarplotResult;", "rBarplotData", statementType, variable, @@ -110,37 +110,37 @@ dna_barplot <- function(statementType = "DNA Statement", invertSections, invertTypes, simplify = TRUE) - + at <- .jcall(b, "[[Ljava/lang/String;", "getAttributes") at <- t(sapply(at, FUN = .jevalArray)) - + counts <- .jcall(b, "[[I", "getCounts") counts <- t(sapply(counts, FUN = .jevalArray)) if (nrow(counts) < nrow(at)) { counts <- t(counts) } - + results <- data.frame(.jcall(b, "[S", "getValues"), counts, at) - + intValues <- .jcall(b, "[I", "getIntValues") intColNames <- intValues if (is.jnull(qualifier)) { intValues <- integer(0) intColNames <- "Frequency" } - + atVar <- .jcall(b, "[S", "getAttributeVariables") - + colnames(results) <- c("Entity", intColNames, atVar) - + attributes(results)$variable <- .jcall(b, "S", "getVariable") attributes(results)$intValues <- intValues attributes(results)$attributeVariables <- atVar - + class(results) <- c("dna_barplot", class(results)) - + return(results) } @@ -281,47 +281,47 @@ autoplot.dna_barplot <- function(object, axisWidth = 1.5, truncate = 40, exclude.min = NULL) { - - + + if (!("dna_barplot" %in% class(object))) { stop("Invalid data object. Please compute a dna_barplot object via the ", "dna_barplot function before plotting.") } - + if (!("Entity" %in% colnames(object))) { stop("dna_barplot object does not have a \'Entity\' variable. Please ", "compute a new dna_barplot object via the dna_barplot function before", " plotting.") } - + if (isTRUE(colors) & !("Color" %in% colnames(object)) | is.character(colors) & !(colors %in% colnames(object))) { colors <- FALSE warning("No color variable found in dna_barplot object. Colors will be", " ignored.") } - + if (!is.numeric(truncate)) { truncate <- Inf warning("No numeric value provided for trimming of entities. Truncation ", "will be ignored.") } - + # Get qualifier values w <- attr(object, "intValues") - + if (!all(w %in% colnames(object))) { stop("dna_barplot object does not include all qualifier values of the ", "statement type. Please compute a new dna_barplot object via the ", "dna_barplot function.") } - + # Check if qualifier is binary binary <- all(w %in% c(0, 1)) - + # Compute total values per entity object$sum <- rowSums(object[, colnames(object) %in% w]) - + # Exclude minimum number of statements per entity if (is.numeric(exclude.min)) { if (exclude.min > max(object$sum)) { @@ -332,15 +332,15 @@ autoplot.dna_barplot <- function(object, object <- object[object$sum >= exclude.min, ] } } - + # Stack agreement and disagreement object2 <- cbind(object$Entity, utils::stack(object, select = colnames(object) %in% w)) colnames(object2) <- c("entity", "frequency", "agreement") - + object <- object[order(object$sum, decreasing = TRUE), ] - + object2$entity <- factor(object2$entity, levels = rev(object$Entity)) - + # Get colors if (isTRUE(colors)) { object2$color <- object$Color[match(object2$entity, object$Entity)] @@ -354,14 +354,14 @@ autoplot.dna_barplot <- function(object, object2$color <- "white" object2$text_color <- "black" } - - + + if (binary) { # setting disagreement as -1 instead 0 object2$agreement <- ifelse(object2$agreement == 0, -1, 1) # recode frequency in positive and negative object2$frequency <- object2$frequency * as.integer(object2$agreement) - + # generate position of bar labels offset <- (max(object2$frequency) + abs(min(object2$frequency))) * 0.05 offset <- ifelse(offset < 0.5, 0.5, offset) # offset should be at least 0.5 @@ -374,7 +374,7 @@ autoplot.dna_barplot <- function(object, object2$pos <- ifelse(object2$frequency > 0, object2$frequency + offset, object2$frequency - offset) - + # move 0 labels where necessary object2$pos[object2$frequency == 0] <- ifelse(object2$agreement[object2$frequency == 0] == 1, object2$pos[object2$frequency == 0] * -1, @@ -394,17 +394,17 @@ autoplot.dna_barplot <- function(object, # Add labels object2$label <- paste(object2$count, object2$agreement, sep = " x ") } - + offset <- (max(object2$frequency) + abs(min(object2$frequency))) * 0.05 offset <- ifelse(offset < 0.5, 0.5, offset) yintercepts <- data.frame(x = c(0.5, length(unique(object2$entity)) + 0.5), y = c(0, 0)) high <- yintercepts$x[2] + 0.25 - + object2 <- object2[order(as.numeric(as.character(object2$agreement)), decreasing = FALSE), ] object2$agreement <- factor(object2$agreement, levels = w) - + # Plot g <- ggplot2::ggplot(object2, ggplot2::aes(x = .data[["entity"]], diff --git a/rDNA/rDNA/R/dna_network.R b/rDNA/rDNA/R/dna_network.R index b0faab57..f4759e6c 100644 --- a/rDNA/rDNA/R/dna_network.R +++ b/rDNA/rDNA/R/dna_network.R @@ -332,7 +332,7 @@ dna_network <- function(networkType = "twomode", fileFormat ) - exporter <- .jcall(dna_getHeadlessDna(), "Lexport/Exporter;", "getExporter") # get a reference to the Exporter object, in which results are stored + exporter <- .jcall(dna_getHeadlessDna(), "Ldna/export/Exporter;", "getExporter") # get a reference to the Exporter object, in which results are stored if (networkType == "eventlist") { # assemble an event list in the form of a data frame of filtered statements f <- J(exporter, "getFilteredStatements", simplify = TRUE) # array list of filtered export statements; use J because array list return type not recognized using .jcall @@ -370,7 +370,7 @@ dna_network <- function(networkType = "twomode", d$time <- as.POSIXct(d$time, origin = "1970-01-01 00:00:00") # convert long date/time to POSIXct return(d) } else { # assemble a one-mode or two-mode matrix with attributes or a list of matrices (if time window) - m <- .jcall(exporter, "[Lexport/Matrix;", "getMatrixResultsArray") # get list of Matrix objects from Exporter object + m <- .jcall(exporter, "[Ldna/export/Matrix;", "getMatrixResultsArray") # get list of Matrix objects from Exporter object l <- list() # create a list in which each result is stored; can be of length 1 if no time window is used for (t in 1:length(m)) { # loop through the matrices mat <- .jcall(m[[t]], "[[D", "getMatrix", simplify = TRUE) # get the resulting matrix at step t as a double[][] object and save as matrix diff --git a/rDNA/rDNA/R/dna_phaseTransitions.R b/rDNA/rDNA/R/dna_phaseTransitions.R index 6984bc5d..8b62771c 100644 --- a/rDNA/rDNA/R/dna_phaseTransitions.R +++ b/rDNA/rDNA/R/dna_phaseTransitions.R @@ -222,7 +222,7 @@ dna_phaseTransitions <- function(distanceMethod = "absdiff", invertSources = FALSE, invertSections = FALSE, invertTypes = FALSE) { - + # check arguments and packages if (distanceMethod == "spectral" && networkType == "twomode") { distanceMethod <- "absdiff" @@ -247,19 +247,19 @@ dna_phaseTransitions <- function(distanceMethod = "absdiff", } clusterMethods <- rev(clusterMethods) # reverse order to save time during parallel computation by starting the computationally intensive methods first mcall <- match.call() # save the arguments for storing them in the results later - + # generate the time window networks if (is.null(timeWindow) || is.na(timeWindow) || !is.character(timeWindow) || length(timeWindow) != 1 || !timeWindow %in% c("events", "seconds", "minutes", "hours", "days", "weeks", "months", "years")) { timeWindow <- "events" warning("The 'timeWindow' argument was invalid. Proceeding with 'timeWindow = \"events\" instead.") } - + # wrap the vectors of exclude values for document variables into Java arrays excludeAuthors <- .jarray(excludeAuthors) excludeSources <- .jarray(excludeSources) excludeSections <- .jarray(excludeSections) excludeTypes <- .jarray(excludeTypes) - + # compile exclude variables and values vectors dat <- matrix("", nrow = length(unlist(excludeValues)), ncol = 2) count <- 0 @@ -281,12 +281,12 @@ dna_phaseTransitions <- function(distanceMethod = "absdiff", } var <- .jarray(var) # array of variable names of each excluded value val <- .jarray(val) # array of values to be excluded - + # encode R NULL as Java null value if necessary if (is.null(qualifier) || is.na(qualifier)) { qualifier <- .jnull(class = "java/lang/String") } - + # call rNetwork function to compute results .jcall(dna_getHeadlessDna(), "V", @@ -325,7 +325,7 @@ dna_phaseTransitions <- function(distanceMethod = "absdiff", invertTypes ) exporter <- dna_getHeadlessDna()$getExporter() # save Java object reference to exporter class - + # compute distance matrix if (distanceMethod == "modularity") { stop("Differences in modularity have not been implemented yet. Please use absolute differences or spectral Euclidean distance as a distance method.") @@ -338,11 +338,11 @@ dna_phaseTransitions <- function(distanceMethod = "absdiff", distanceMethod, simplify = TRUE) distance_mat <- distance_mat / max(distance_mat) # rescale between 0 and 1 - + # retrieve mid-point dates (gamma) - m <- .jcall(exporter, "[Lexport/Matrix;", "getMatrixResultsArray") # get list of Matrix objects from Exporter object + m <- .jcall(exporter, "[Ldna/export/Matrix;", "getMatrixResultsArray") # get list of Matrix objects from Exporter object dates <- sapply(m, function(x) .jcall(x, "J", "getDateTimeLong")) # long integers, still needs conversion to date - + # define clustering function hclustMethods <- c("single", "average", "complete", "ward") cl <- function(method, distmat) { @@ -444,7 +444,7 @@ dna_phaseTransitions <- function(distanceMethod = "absdiff", warning("Cluster method '", method, "' threw a warning: ", w) }) } - + # apply all clustering methods to distance matrix if (cores > 1) { cat(paste("Clustering distance matrix on", cores, "cores.\n")) @@ -470,7 +470,7 @@ dna_phaseTransitions <- function(distanceMethod = "absdiff", best <- which(mod == max(mod))[1] results$modularity <- mod[best] results$clusterMethod <- clusterMethods[best] - + # temporal embedding via MDS if (!requireNamespace("MASS", quietly = TRUE)) { mem <- data.frame("date" = as.POSIXct(dates, tz = "UTC", origin = "1970-01-01"), @@ -491,7 +491,7 @@ dna_phaseTransitions <- function(distanceMethod = "absdiff", b <- Sys.time() print(b - a) } - + results$distmat <- distance_mat class(results) <- "dna_phaseTransitions" attributes(results)$stress <- ifelse(ncol(results$states) == 2, NA, mds$stress) @@ -538,7 +538,7 @@ autoplot.dna_phaseTransitions <- function(object, ..., plots = c("heatmap", "sil k <- max(object$states$state) shapes <- c(21:25, 0:14)[1:k] l <- list() - + # heatmap if ("heatmap" %in% plots) { try({ @@ -553,7 +553,7 @@ autoplot.dna_phaseTransitions <- function(object, ..., plots = c("heatmap", "sil } }) } - + # silhouette plot if ("silhouette" %in% plots) { try({ @@ -572,7 +572,7 @@ autoplot.dna_phaseTransitions <- function(object, ..., plots = c("heatmap", "sil } }) } - + # temporal embedding if ("mds" %in% plots) { try({ @@ -586,10 +586,10 @@ autoplot.dna_phaseTransitions <- function(object, ..., plots = c("heatmap", "sil nodes <- object$states nodes$date <- as.character(nodes$date) nodes$State <- as.factor(nodes$state) - + # Extract state values state_values <- nodes$State - + edges <- data.frame(sender = as.character(object$states$date), receiver = c(as.character(object$states$date[2:(nrow(object$states))]), "NA")) edges <- edges[-nrow(edges), ] @@ -609,7 +609,7 @@ autoplot.dna_phaseTransitions <- function(object, ..., plots = c("heatmap", "sil } }) } - + # state dynamics if ("states" %in% plots) { try({ @@ -619,12 +619,12 @@ autoplot.dna_phaseTransitions <- function(object, ..., plots = c("heatmap", "sil State = factor(object$states$state, levels = 1:k, labels = paste("State", 1:k)), time1 = as.Date(object$states$date) ) - + # Extracting values time_values <- d$time state_values <- d$State id_values <- d$id - + l[[length(l) + 1]] <- ggplot2::ggplot(d, ggplot2::aes(x = time_values, y = state_values, colour = state_values)) + ggplot2::geom_line(aes(group = 1), linewidth = 2, color = "black", lineend = "square") + ggplot2::geom_line(aes(group = id_values), linewidth = 2, lineend = "square") + @@ -638,6 +638,6 @@ autoplot.dna_phaseTransitions <- function(object, ..., plots = c("heatmap", "sil ggplot2::labs(color = "State") }) } - + return(l) } \ No newline at end of file diff --git a/rDNA/rDNA/R/dna_polarisation.R b/rDNA/rDNA/R/dna_polarisation.R index f667db6a..888a8aed 100644 --- a/rDNA/rDNA/R/dna_polarisation.R +++ b/rDNA/rDNA/R/dna_polarisation.R @@ -3,21 +3,55 @@ #' This function calculates the polarisation scores for a series of time windows. #' #' @inheritParams dna_network -#' @param statementType A character string specifying the type of DNA statement. Default is "DNA Statement". -#' @param k An integer specifying the number of clusters. Default is 2. -#' @param numParents An integer specifying the number of cluster solutions ("parents"). For example, 30 or 50. Default is 50. -#' @param iterations Number of iterations of the genetic algorithm. Often, 50 or 100 is enough, but since there is a built-in convergence check, it is recommended to keep this number large. Default is 1000. -#' @param elitePercentage A double specifying the percentage of the best solutions that are kept for the next generation. Useful values range between 0.05 and 0.2. Default is 0.1. -#' @param mutationPercentage A double specifying the percentage of the solutions that are mutated. Useful values range between 0.05 and 0.2. Default is 0.1. -#' @param qualityFunction A character string specifying the quality function to be used. Can be "modularity", "eiIndex", or "absdiff". Default is "absdiff". -#' @param normaliseMatrices A logical specifying whether the matrices should be normalised. Default is FALSE. -#' @param randomSeed An integer specifying the random seed for reproducibility of exact findings. Default is 0, which means the algorithm generates the random seed (= no reproducibility). +#' @param indentTime If \code{TRUE}, the sequence of time slices under the time +#' window algorithm starts with the first network and ends with the last +#' network that are entirely covered within the timeline defined by the start +#' and stop dates and times. For example, if the start date is 1 February, the +#' stop date is 31 December, and the time window duration is 21 days, the +#' mid-point of the first time window will be 11 February (to ensure the first +#' network entirely fits into the timeline), and the last network will be +#' centered around 20 December (to ensure the last network entirely fits into +#' the timeline). If \code{FALSE}, the start and stop dates and times are used +#' as the first and last mid-points. In that case, the first and last few +#' networks may contain fewer statements than other time slices and may, +#' therefore, be more similar to each other. +#' @param algorithm The algorithm to compute polarisation. Can be "greedy" (for +#' a greedy algorithm) or "genetic" (for a genetic algorithm). +#' @param normaliseScores A logical specifying whether the polarisation scores +#' should be normalised by edge mass per network to take away the effect of +#' networks over time having different activity levels. +#' @param numClusters An integer specifying the number of clusters k. Default is +#' \code{2}. +#' @param numParents Only for the genetic algorithm: An integer specifying the +#' number of cluster solutions ("parents"). For example, \code{30} or +#' \code{50}. +#' @param numIterations Only for the genetic algorithm: Number of iterations of +#' the genetic algorithm. Often, \code{50} or \code{100} is enough, but since +#' there is a built-in convergence check, it is recommended to keep this +#' number large. The default is \code{1000}. +#' @param elitePercentage Only for the genetic algorithm: A double specifying +#' the percentage of the best solutions that are kept for the next generation. +#' Useful values range between 0.05 and 0.2. +#' @param mutationPercentage Only for the genetic algorithm: A double specifying +#' the percentage of the solutions that are mutated. Useful values range +#' between 0.05 and 0.2. +#' @param randomSeed Only for the genetic algorithm: An integer specifying the +#' random seed for reproducibility of exact findings. The default is \code{0}, +#' which means the algorithm generates the random seed (= no reproducibility). #' -#' @return An object representing the polarisation of actors and the results of the genetic algorithm for all time steps and iterations. +#' @return An object representing the polarisation of actors and the results of +#' the algorithm for all time steps and iterations. #' #' @examples #' \dontrun{ -#' p <- dna_polarisation() +#' dna_init() +#' dna_sample() +#' dna_openDatabase("sample.dna", coderId = 1, coderPassword = "sample") +#' +#' p <- dna_polarisation(timeWindow = "days", windowSize = 40, normaliseScores = TRUE) +#' str(p) +#' +#' library("ggplot2") #' autoplot(p) #' } #' @@ -49,26 +83,21 @@ dna_polarisation <- function(statementType = "DNA Statement", excludeSources = character(), excludeSections = character(), excludeTypes = character(), - invertValues = FALSE, - invertAuthors = FALSE, - invertSources = FALSE, - invertSections = FALSE, - invertTypes = FALSE, - k = 2, + algorithm = "greedy", + normaliseScores = FALSE, + numClusters = 2, numParents = 50, - iterations = 1000, + numIterations = 1000, elitePercentage = 0.1, mutationPercentage = 0.1, - qualityFunction = "absdiff", - normaliseMatrices = FALSE, randomSeed = 0) { - + # wrap the vectors of exclude values for document variables into Java arrays excludeAuthors <- .jarray(excludeAuthors) excludeSources <- .jarray(excludeSources) excludeSections <- .jarray(excludeSections) excludeTypes <- .jarray(excludeTypes) - + # compile exclude variables and values vectors dat <- matrix("", nrow = length(unlist(excludeValues)), ncol = 2) count <- 0 @@ -90,12 +119,12 @@ dna_polarisation <- function(statementType = "DNA Statement", } var <- .jarray(var) # array of variable names of each excluded value val <- .jarray(val) # array of values to be excluded - + # encode R NULL as Java null value if necessary if (is.null(qualifier) || is.na(qualifier)) { - qualifier <- .jnull(class = "java/lang/String;") + qualifier <- .jnull(class = "Ljava/lang/String;") } - + # call rNetwork function to compute results polarisationObject <- .jcall(dna_getHeadlessDna(), "Ldna/export/PolarisationResultTimeSeries;", @@ -119,21 +148,21 @@ dna_polarisation <- function(statementType = "DNA Statement", excludeSources, excludeSections, excludeTypes, - invertValues, - invertAuthors, - invertSources, - invertSections, - invertTypes, - as.integer(k), + FALSE, + FALSE, + FALSE, + FALSE, + FALSE, + algorithm, + normaliseScores, + as.integer(numClusters), as.integer(numParents), - as.integer(iterations), + as.integer(numIterations), as.double(elitePercentage), as.double(mutationPercentage), - qualityFunction, - normaliseMatrices, .jlong(randomSeed) ) - + l <- list() l$finalMaxQs <- .jcall(polarisationObject, "[D", "getFinalMaxQs") l$earlyConvergence <- .jcall(polarisationObject, "[Z", "getEarlyConvergence") @@ -171,13 +200,6 @@ dna_polarisation <- function(statementType = "DNA Statement", #' polarization over time. #' #' @return A list of ggplot objects corresponding to the specified plots. -#' @export -#' -#' @examples -#' \dontrun{ -#' p <- dna_polarisation() -#' autoplot(p) -#' } #' #' @author Philip Leifeld #' @@ -191,22 +213,21 @@ dna_polarisation <- function(statementType = "DNA Statement", #' @importFrom ggplot2 scale_x_datetime #' @importFrom ggplot2 theme #' @importFrom ggplot2 element_text -#' #' #' @export autoplot.dna_polarisation <- function(object, ..., plots = c("hair", "hist", "time_series")) { l <- list() - + # hair diagnostic convergence plot hairData <- data.frame("Polarization" = do.call("c", object$maxQs), - "Iteration" = do.call("c", sapply(object$maxQs, function(x) 1:length(x))), + "Iteration" = do.call("c", lapply(object$maxQs, function(x) 1:length(x))), "Time" = rep(object$middleDates, times = sapply(object$maxQs, length))) gg_ha <- ggplot2::ggplot(hairData, ggplot2::aes(x = .data[["Iteration"]], y = .data[["Polarization"]], group = .data[["Time"]])) + ggplot2::geom_line(alpha = 0.3) + ggplot2::ylab("Maximal polarization") + ggplot2::theme_minimal() l$hair <- gg_ha - + # histogram diagnostic convergence plot histData <- data.frame("Iterations" = sapply(object$maxQs, length), "Time" = object$middleDates) @@ -215,7 +236,7 @@ autoplot.dna_polarisation <- function(object, ..., plots = c("hair", "hist", "ti ggplot2::labs(y = "Number of time windows", x = "Iterations until convergence") + ggplot2::theme_minimal() l$histogram <- gg_hi - + # time series plot timeSeriesData <- data.frame("Time" = object$middleDates, "Polarization" = object$finalMaxQs) gg_ts <- ggplot2::ggplot(timeSeriesData, ggplot2::aes(x = .data[["Time"]], y = .data[["Polarization"]])) + @@ -225,6 +246,6 @@ autoplot.dna_polarisation <- function(object, ..., plots = c("hair", "hist", "ti ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, vjust = 1, hjust=1)) l$polarisation <- gg_ts - + return(l) } \ No newline at end of file diff --git a/rDNA/rDNA/R/rDNA.R b/rDNA/rDNA/R/rDNA.R index 288124d4..655a0a8f 100644 --- a/rDNA/rDNA/R/rDNA.R +++ b/rDNA/rDNA/R/rDNA.R @@ -759,18 +759,18 @@ dna_getAttributes <- function(statementType = NULL, # get the data from the DNA database using rJava if (variableIdValid) { a <- .jcall(dnaEnvironment[["dna"]]$headlessDna, - "Lexport/DataFrame;", + "Ldna/export/DataFrame;", "getAttributes", as.integer(variableId)) } else if (variableValid && statementTypeIdValid) { a <- .jcall(dnaEnvironment[["dna"]]$headlessDna, - "Lexport/DataFrame;", + "Ldna/export/DataFrame;", "getAttributes", as.integer(statementTypeId), variable) } else if (variableValid && statementTypeValid) { a <- .jcall(dnaEnvironment[["dna"]]$headlessDna, - "Lexport/DataFrame;", + "Ldna/export/DataFrame;", "getAttributes", statementType, variable) diff --git a/rDNA/rDNA/man/dna_polarisation.Rd b/rDNA/rDNA/man/dna_polarisation.Rd new file mode 100644 index 00000000..d324b6da --- /dev/null +++ b/rDNA/rDNA/man/dna_polarisation.Rd @@ -0,0 +1,260 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dna_polarisation.R +\name{dna_polarisation} +\alias{dna_polarisation} +\alias{autoplot.dna_polarisation} +\title{Calculate DNA Polarisation} +\usage{ +dna_polarisation( + statementType = "DNA Statement", + variable1 = "organization", + variable1Document = FALSE, + variable2 = "concept", + variable2Document = FALSE, + qualifier = "agreement", + duplicates = "include", + start.date = "01.01.1900", + stop.date = "31.12.2099", + timeWindow = "days", + windowSize = 100, + kernel = "uniform", + indentTime = FALSE, + excludeValues = list(), + excludeAuthors = character(), + excludeSources = character(), + excludeSections = character(), + excludeTypes = character(), + algorithm = "greedy", + normaliseScores = FALSE, + numClusters = 2, + numParents = 50, + numIterations = 1000, + elitePercentage = 0.1, + mutationPercentage = 0.1, + randomSeed = 0 +) + +\method{autoplot}{dna_polarisation}(object, ..., plots = c("hair", "hist", "time_series")) +} +\arguments{ +\item{statementType}{The name of the statement type in which the variable +of interest is nested. For example, \code{"DNA Statement"}.} + +\item{variable1}{The first variable for network construction. In a one-mode +network, this is the variable for both the rows and columns. In a +two-mode network, this is the variable for the rows only. In an event +list, this variable is only used to check for duplicates (depending on +the setting of the \code{duplicates} argument).} + +\item{variable1Document}{A boolean value indicating whether the first +variable is at the document level (i.e., \code{"author"}, +\code{"source"}, \code{"section"}, \code{"type"}, \code{"id"}, or +\code{"title"}).} + +\item{variable2}{The second variable for network construction. In a one-mode +network, this is the variable over which the ties are created. For +example, if an organization x organization network is created, and ties +in this network indicate co-reference to a concept, then the second +variable is the \code{"concept"}. In a two-mode network, this is the +variable used for the columns of the network matrix. In an event list, +this variable is only used to check for duplicates (depending on the +setting of the \code{duplicates} argument).} + +\item{variable2Document}{A boolean value indicating whether the second +variable is at the document level (i.e., \code{"author"}, +\code{"source"}, \code{"section"}, \code{"type"}, \code{"id"}, or +\code{"title"}} + +\item{qualifier}{The qualifier variable. In a one-mode network, this + variable can be used to count only congruence or conflict ties. For + example, in an organization x organization network via common concepts, + a binary \code{"agreement"} qualifier could be used to record only ties + where both organizations have a positive stance on the concept or where + both organizations have a negative stance on the concept. With an + integer qualifier, the tie weight between the organizations would be + proportional to the similarity or distance between the two organizations + on the scale of the integer variable. With a short text variable as a + qualifier, agreement on common categorical values of the qualifier is + required, for example a tie is established (or a tie weight increased) if + two actors both refer to the same value on the second variable AND match on + the categorical qualifier, for example the type of referral. + + In a two-mode network, the qualifier variable can be used to retain only + positive or only negative statements or subtract negative from positive + mentions. All of this depends on the setting of the + \code{qualifierAggregation} argument. For event lists, the qualifier + variable is only used for filtering out duplicates (depending on the + setting of the \code{duplicates} argument. + + The qualifier can also be \code{NULL}, in which case it is ignored, meaning + that values in \code{variable1} and \code{variable2} are unconditionally + associated with each other in the network when they co-occur. This is + identical to selecting a qualifier variable and setting + \code{qualifierAggregation = "ignore"}.} + +\item{duplicates}{Setting for excluding duplicate statements before network +construction. Valid settings are \code{"include"} (for including all +statements in network construction), \code{"document"} (for counting +only one identical statement per document), \code{"week"} (for counting +only one identical statement per calendar week), \code{"month"} (for +counting only one identical statement per calendar month), \code{"year"} +(for counting only one identical statement per calendar year), and +\code{"acrossrange"} (for counting only one identical statement across +the whole time range).} + +\item{start.date}{The start date for network construction in the format +\code{"dd.mm.yyyy"}. All statements before this date will be excluded.} + +\item{stop.date}{The stop date for network construction in the format +\code{"dd.mm.yyyy"}. All statements after this date will be excluded.} + +\item{timeWindow}{Possible values are \code{"no"}, \code{"events"}, +\code{"seconds"}, \code{"minutes"}, \code{"hours"}, \code{"days"}, +\code{"weeks"}, \code{"months"}, and \code{"years"}. If \code{"no"} is +selected (= the default setting), no time window will be used. If any of +the time units is selected, a moving time window will be imposed, and +only the statements falling within the time period defined by the window +will be used to create the network. The time window will then be moved +forward by one time unit at a time, and a new network with the new time +boundaries will be created. This is repeated until the end of the overall +time span is reached. All time windows will be saved as separate +networks in a list. The duration of each time window is defined by the +\code{windowSize} argument. For example, this could be used to create a +time window of 6 months which moves forward by one month each time, thus +creating time windows that overlap by five months. If \code{"events"} is +used instead of a natural time unit, the time window will comprise +exactly as many statements as defined in the \code{windowSize} argument. +However, if the start or end statement falls on a date and time where +multiple events happen, those additional events that occur simultaneously +are included because there is no other way to decide which of the +statements should be selected. Therefore the window size is sometimes +extended when the start or end point of a time window is ambiguous in +event time.} + +\item{windowSize}{The number of time units of which a moving time window is +comprised. This can be the number of statement events, the number of days +etc., as defined in the \code{"timeWindow"} argument.} + +\item{kernel}{Use kernel smoothing for computing time windows? This option +only matters if the \code{timeWindow} argument has a value other than +\code{"no"} or \code{"event"}. The default value \code{kernel = "no"} +switches off kernel smoothing, which means all statements within a time +window are weighted equally. Other values down-weight statements the +farther they are temporally away from the mid-point of the time window. +Several kernel smoothing functions are available, similar to kernel density +estimation: \code{"uniform"} is similar to \code{"no"} and weights all +statements with a value of \code{0.5}. \code{"gaussian"} uses a standard +normal distribution as a kernel smoother. \code{"epanechnikov"} uses an +Epanechnikov kernel smoother. \code{"triangular"} uses a triangular kernel +function. If in doubt, do not use kernel smoothing.} + +\item{indentTime}{If \code{TRUE}, the sequence of time slices under the time +window algorithm starts with the first network and ends with the last +network that are entirely covered within the timeline defined by the start +and stop dates and times. For example, if the start date is 1 February, the +stop date is 31 December, and the time window duration is 21 days, the +mid-point of the first time window will be 11 February (to ensure the first +network entirely fits into the timeline), and the last network will be +centered around 20 December (to ensure the last network entirely fits into +the timeline). If \code{FALSE}, the start and stop dates and times are used +as the first and last mid-points. In that case, the first and last few +networks may contain fewer statements than other time slices and may, +therefore, be more similar to each other.} + +\item{excludeValues}{A list of named character vectors that contains entries +which should be excluded during network construction. For example, +\code{list(concept = c("A", "B"), organization = c("org A", "org B"))} +would exclude all statements containing concepts "A" or "B" or +organizations "org A" or "org B" when the network is constructed. This +is irrespective of whether these values appear in \code{variable1}, +\code{variable2}, or the \code{qualifier}. Note that only variables at +the statement level can be used here. There are separate arguments for +excluding statements nested in documents with certain meta-data.} + +\item{excludeAuthors}{A character vector of authors. If a statement is +nested in a document where one of these authors is set in the "Author" +meta-data field, the statement is excluded from network construction.} + +\item{excludeSources}{A character vector of sources. If a statement is +nested in a document where one of these sources is set in the "Source" +meta-data field, the statement is excluded from network construction.} + +\item{excludeSections}{A character vector of sections. If a statement is +nested in a document where one of these sections is set in the "Section" +meta-data field, the statement is excluded from network construction.} + +\item{excludeTypes}{A character vector of types. If a statement is +nested in a document where one of these types is set in the "Type" +meta-data field, the statement is excluded from network construction.} + +\item{algorithm}{The algorithm to compute polarisation. Can be "greedy" (for +a greedy algorithm) or "genetic" (for a genetic algorithm).} + +\item{normaliseScores}{A logical specifying whether the polarisation scores +should be normalised by edge mass per network to take away the effect of +networks over time having different activity levels.} + +\item{numClusters}{An integer specifying the number of clusters k. Default is +\code{2}.} + +\item{numParents}{Only for the genetic algorithm: An integer specifying the +number of cluster solutions ("parents"). For example, \code{30} or +\code{50}.} + +\item{numIterations}{Only for the genetic algorithm: Number of iterations of +the genetic algorithm. Often, \code{50} or \code{100} is enough, but since +there is a built-in convergence check, it is recommended to keep this +number large. The default is \code{1000}.} + +\item{elitePercentage}{Only for the genetic algorithm: A double specifying +the percentage of the best solutions that are kept for the next generation. +Useful values range between 0.05 and 0.2.} + +\item{mutationPercentage}{Only for the genetic algorithm: A double specifying +the percentage of the solutions that are mutated. Useful values range +between 0.05 and 0.2.} + +\item{randomSeed}{Only for the genetic algorithm: An integer specifying the +random seed for reproducibility of exact findings. The default is \code{0}, +which means the algorithm generates the random seed (= no reproducibility).} + +\item{object}{An object of class `dna_polarisation`.} + +\item{...}{Additional arguments passed to the plotting functions. +Currently not used.} + +\item{plots}{A character vector specifying the types of plots to generate. +Options are "hair", "hist", and "time_series". The hair plot +shows the convergence of the maximal polarization over time. +The histogram plot shows the distribution of the number of +iterations until convergence. The time series plot shows the +polarization over time.} +} +\value{ +An object representing the polarisation of actors and the results of + the algorithm for all time steps and iterations. + +A list of ggplot objects corresponding to the specified plots. +} +\description{ +This function calculates the polarisation scores for a series of time windows. + +This function generates various plots for a DNA polarisation object. +} +\examples{ +\dontrun{ +dna_init() +dna_sample() +dna_openDatabase("sample.dna", coderId = 1, coderPassword = "sample") + +p <- dna_polarisation(timeWindow = "days", windowSize = 40, normaliseScores = TRUE) +str(p) + +library("ggplot2") +autoplot(p) +} + +} +\author{ +Philip Leifeld +} diff --git a/rDNA/rDNA/rDNA.Rproj b/rDNA/rDNA/rDNA.Rproj index 902f7b15..4c8f1ebc 100644 --- a/rDNA/rDNA/rDNA.Rproj +++ b/rDNA/rDNA/rDNA.Rproj @@ -1,4 +1,5 @@ Version: 1.0 +ProjectId: b4cef02f-130e-4874-9715-852a050cdc18 RestoreWorkspace: Default SaveWorkspace: Default