Merge pull request #10 from javacatknight/main

Add high-level dev notes, scaffold comments
sirixdb · Aug 31, 2023 · deabddd · deabddd
2 parents 656a7cd + 0562111
commit deabddd
Show file tree

Hide file tree

Showing 10 changed files with 855 additions and 27 deletions.
diff --git a/lib/src/main/java/Developer Notes.md b/lib/src/main/java/Developer Notes.md
@@ -0,0 +1,9 @@
+# Original Repo
+https://github.com/cwida/fsst/tree/master 
+* "...12..." files seem to be older files.
+
+# Codebase
+* Sanity folder - minimal code to convert from. More at the original repo.
+
+# Technical Java Notes
+1. A C/C++ `char` is 1 byte; a Java `char` is 2 bytes.
diff --git a/lib/src/main/java/fsst/Counters.java b/lib/src/main/java/fsst/Counters.java
@@ -58,6 +58,9 @@ private static boolean longToBoolean(long l) {
         return l != 0 ? true : false;
     }
 
+/** Advance pos1 to the next nonzero counter in register range.
+ * Reads the 16-bit single-symbol counter, which is split into two 8-bit numbers (count1Low, count1High), while skipping over zeros.
+*/
     int count1GetNext(int pos1, boolean noOpt) {
         if (noOpt) {
             return count1[pos1];

diff --git a/lib/src/main/java/fsst/Encoder.java b/lib/src/main/java/fsst/Encoder.java
@@ -7,4 +7,4 @@ public class Encoder {
     Counters counters;
     int simdbuf[] = new int[FSST_BUFSZ];
 
-}
+}
diff --git a/lib/src/main/java/fsst/FSSTEncoder.java b/lib/src/main/java/fsst/FSSTEncoder.java
@@ -4,17 +4,42 @@
 import java.util.Arrays;
 
 public class FSSTEncoder {
-    static final long FSST_ENDIAN_MARKER = (long) 1;
-    static final long FSST_VERSION_20190218 = 20190218;
-    static final long FSST_VERSION = ((long) FSST_VERSION_20190218);
+    static final long FSST_ENDIAN_MARKER = 1L;
+    static final long FSST_VERSION_20190218 = 20190218L;
+    // static final long FSST_VERSION;
 
     SymbolTable symbolTable;
     Counters counters;
     int[] simdBuffer = new int[3 << 19];
 
+    /** Creates an encoder without calibrating it: the body is empty, so {@code symbolTable} and {@code counters} keep their default (null) values. */
     FSSTEncoder() {
     }
 
+    // TODO: Ask about string arrays instead of this char arrays.
+    /**
+     * Calibrate a FSST symbol table from a batch of strings (it is best to provide at least 16KB of data).
+     *
+     * @param n              number of input strings; 0 is treated as 1 when sampling, as in the C++ original
+     * @param inputLength    lengths handed to the sampler and to the symbol-table builder
+     * @param inputString    input characters (NOTE(review): currently not forwarded to makeSample - confirm)
+     * @param zeroTerminated forwarded unchanged to {@code buildSymbolTable}
+     */
+    FSSTEncoder(int n, int[] inputLength, char[] inputString, int zeroTerminated) {
+        int[] sampleBuffer = new int[(int) Symbol.FSST_SAMPLEMAXSZ];
+        int[] sampleLen = inputLength;
+        // NOTE(review): the argument order is kept from the original port; the C++ call is
+        // makeSample(sampleBuf, strIn, &sampleLen, n ? n : 1) - verify against makeSample's signature.
+        // toArray(new Integer[0]) is required: a plain toArray() yields Object[], and casting that to
+        // Integer[] throws ClassCastException at runtime.
+        Integer[] sample = makeSample(simdBuffer, sampleBuffer, sampleLen, n != 0 ? n : 1)
+                .toArray(new Integer[0]);
+        // Store the result on this instance; the port previously assigned it to a throwaway encoder,
+        // leaving the constructed object without a symbol table.
+        this.symbolTable = new SymbolTable().buildSymbolTable(this.counters, sample, sampleLen, zeroTerminated);
+        // No explicit cleanup: the C++ delete[] calls have no Java equivalent, the local buffers are
+        // reclaimed by the garbage collector, and sampleLen always aliases inputLength here.
+    }
+
+    /** Create another FSSTEncoder instance, necessary to do multi-threaded encoding using the same symbol table.
+     * The table is not copied: the new encoder stores the given reference, so both encoders use one SymbolTable object.
+     * 
+     * @param symbolTable table to share with the new encoder.
+    */
     FSSTEncoder(SymbolTable symbolTable) {
         this.symbolTable = symbolTable;
     }
@@ -71,23 +96,7 @@ static ArrayList<Integer> makeSample(int[] sampleBuffer, int[] inputString, int[
         return samples;
     }
 
-    FSSTEncoder(int n, int[] inputLength, char[] inputString, int zeroTerminated) {
-        int[] sampleBuffer = new int[(int) Symbol.FSST_SAMPLEMAXSZ];
-        int[] sampleLen = inputLength;
-        Object[] sample = makeSample(simdBuffer, sampleBuffer, sampleLen, n == 0 ? n : 1).toArray();
-        FSSTEncoder encoder = new FSSTEncoder();
-        SymbolTable symbolTable = new SymbolTable();
-        encoder.symbolTable = symbolTable.buildSymbolTable(encoder.counters, (Integer[]) sample, sampleLen,
-                zeroTerminated);
-        if (sampleLen != inputLength) {
-            // TODO: There might be a better way of doing the delete operator as the c++
-            // code is doing but this is my current closest guess
-            Arrays.fill(sampleLen, 0);
-        }
-        // TODO: There might be a better way of doing the delete operator as the c++
-        // code is doing but this is my current closest guess
-        Arrays.fill(sampleBuffer, 0);
-    }
+
 
     FSSTEncoder duplicate() {
         FSSTEncoder duplicate = new FSSTEncoder(this.symbolTable);

diff --git a/lib/src/main/java/fsst/Symbol.java b/lib/src/main/java/fsst/Symbol.java
@@ -20,7 +20,7 @@ public class Symbol {
 
     static final int maxLength = 8;
     long value = 0;
-    long icl;
+    long icl; //ignoredBits:code:length
     int gcl;
     int gain;
     byte[] symbol = new byte[maxLength];
@@ -75,6 +75,15 @@ int first2() {
         return (int) (0xFFFF & this.value);
     }
 
+    /**
+     * Builds a new symbol holding the bytes of {@code a} followed by the bytes of {@code b},
+     * truncated to {@link #maxLength} bytes.
+     *
+     * @param a symbol supplying the low-order (first) bytes
+     * @param b symbol supplying the bytes that follow {@code a}
+     * @return a fresh symbol; neither argument is modified
+     */
+    Symbol concat(Symbol a, Symbol b) {
+        // NOTE(review): assumes Symbol has a no-arg constructor - confirm.
+        Symbol s = new Symbol();
+        int aLength = (int) a.length();
+        int length = Math.min(aLength + (int) b.length(), maxLength);
+        // Inlined equivalent of the C++ set_code_len(FSST_CODE_MASK, length):
+        // icl = (length << 28) | (code << 16) | ignoredBits, with ignoredBits = (8 - length) * 8.
+        // NOTE(review): if this class already has a code/length setter, call it here instead.
+        s.icl = ((long) length << 28) | ((long) FSST_CODE_MASK << 16) | ((long) (maxLength - length) * 8);
+        // Java masks a long shift distance to 6 bits, so "<< 64" would act as "<< 0"; when a is
+        // already full, b contributes nothing.
+        s.value = aLength >= maxLength ? a.value : (b.value << (8 * aLength)) | a.value;
+        return s;
+    }
+
     static long FSST_HASH(long w) {
         // Multiply by the hash prime once, then fold the high bits down with an unsigned shift.
         final long scaled = w * FSST_HASH_PRIME;
         final long folded = scaled >>> 13;
         return scaled ^ folded;
     }

diff --git a/lib/src/main/java/fsst/SymbolTable.java b/lib/src/main/java/fsst/SymbolTable.java
@@ -47,6 +47,7 @@ public class SymbolTable {
         }
     }
 
+
     public void clear() {
         for (int i = Symbol.FSST_CODE_BASE; i < Symbol.FSST_CODE_BASE + nSymbols; i++) {
             if (symbols[i].length() == 1) {
@@ -312,7 +313,7 @@ int compressCount(SymbolTable symbolTable, Counters counters, Integer[] line, in
 
     SymbolTable buildSymbolTable(Counters counters, Integer[] line, int[] len, int zeroTerminated) {
         SymbolTable symbolTable = new SymbolTable();
-        SymbolTable best = new SymbolTable();
+        SymbolTable bestTable = new SymbolTable();
         int bestGain = (int) -Symbol.FSST_SAMPLEMAXSZ;
         int sampleFrac = -128;
         // XXX: HACK
@@ -337,12 +338,71 @@ SymbolTable buildSymbolTable(Counters counters, Integer[] line, int[] len, int z
             }
         }
         assert (symbolTable.terminator != 256);
-        Random rand = new Random();
+        Random rand = new Random(); //todo: check random seed
         int rand128 = rand.nextInt(129);
         int compressCountRet = this.compressCount(this, counters, line, len, sampleFrac);
-        // TODO: Implement this method
+        // TODO: Implement this method @javacatknight
         // SymbolTable table = this.makeTable(SymbolTable st, Counters counters);
         // https://github.com/cwida/fsst/blob/42850e13ba220dbba5fd721a4c54f969e2a45ac5/libfsst.cpp#L160
         return best;
     }
+
+    // // TODO: @javacatknight
+    // void makeTable(SymbolTable symbolTable, Counters counters, int sampleFrac) {
+    //     //Hashmap (needed because we can generate duplicate candidates)
+    //     //Not using HashSet due to lack of find() method, see: https://stackoverflow.com/questions/7283338/getting-an-element-from-a-set
+    //     HashMap <QSymbol, boolean> candidates;
+
+    // //TODO: @javacatknight Google shift optimization over division. Caveats: auto done by compiler, dealing with negative
+    // }
+
+    //   // Add candidate symbols based on counted frequency
+    //   // TODO: u32pos1, u32cnt1, (size_t)st.nSymbols
+    //   for (int pos1=0; pos1<FSST_CODE_BASE+(int)st.nSymbols; pos1++) { 
+    //      int cnt1 = counters.count1GetNext(pos1); // may advance pos1!!
+    //      if (!cnt1) continue;
+
+    //      // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed
+    //      Symbol s1 = st.symbols[pos1];
+    //      //
+    //      addOrInc(candidates, s1, ((s1.length()==1)?(BigInteger)8:(BigInteger)1)*cnt1, sampleFrac);
+    //      if (sampleFrac >= 128 || // last round we do not create new (combined) symbols
+    //          s1.length() == Symbol.maxLength || // symbol cannot be extended
+    //          s1.val.str[0] == st.terminator) { // multi-byte symbols cannot contain the terminator byte
+    //         continue;
+    //      }
+    //      for (u32 pos2=0; pos2<FSST_CODE_BASE+(size_t)st->nSymbols; pos2++) { 
+    //         u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!!
+    //         if (!cnt2) continue;
+
+    //         // create a new symbol
+    //         Symbol s2 = st->symbols[pos2];
+    //         Symbol s3 = concat(s1, s2);
+    //         if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte
+    //            addOrInc(cands, s3, cnt2);
+    //      }
+    //   }
+
+    // /**
+    //  * Helper function.
+    //  * 
+    //  * @param count unsigned 64-bit value in the C++ original; must be carried in a Java long
+    //  */
+
+    // // TODO: @javacatknight auto type inference
+    // void addOrInc(HashMap <QSymbol> candidates, Symbol s, Long count, int sampleFrac){
+    //     if (count < (5*sampleFrac)/128) return; // Improves both compression speed (less candidates), but also quality!!
+
+    //     QSymbol q;
+    //     q.symbol = s;
+    //     q.gain = count * s.length();
+
+    //     // Iterator  //look for the symbol. If not found, just insert.
+    //      var it = candidates.get(q);
+    //      if (it != candidates.end()) { // if found, add gain first and then insert
+    //         q.gain += (*it).gain;
+    //         candidates.erase(*it);
+    //      }
+    //      candidates.insert(q);
+    }
 }
diff --git a/lib/src/main/java/fsst/Utils.java b/lib/src/main/java/fsst/Utils.java
@@ -4,7 +4,8 @@
 import java.nio.ByteOrder;
 
 public class Utils {
-    static int boolToInt(boolean value) {
+
+    static int booleanToInt(boolean value) {
+        // Maps true to 1 and false to 0.
+        if (value) {
+            return 1;
+        }
+        return 0;
     }
 

diff --git a/notes.md b/notes.md
@@ -0,0 +1,95 @@
+# FOREWORD:
+If you're only interested in the developer-facing code notes, skip the Background section. These notes are a work in progress.
+
+# TABLE OF CONTENTS
+1. [Background](#background)
+2. [Summary](#summary)
+3. [Overview](#overview)
+
+# BACKGROUND <a name="background"></a>
+Dictionary compression: uniquely maps strings to fixed-size integers.
+	- Effective only if strings repeat exactly, i.e. similar (but not identical) words lose the benefit
+	- Also ineffective if applied to only a fraction of a whole relation
+	- Most strings stored are generally less than 200 bytes and often less than 30 bytes per string
+
+LZ4 (dictionary compression example)
+	- Not efficient for compressing individual strings - requires kB input size for efficient compression
+	- So it's used to compress columnar blocks (many string values together)
+	- Therefore prevents random access;
+	- Example: decompressing large blocks for these values, some of which goes unused.
+
+Potential:
+- Use in conjunction with dictionary compression - i.e. after data is compressed, FSST can compress the strings in the dictionary
+- Can apply on existing database systems
+- Compressed Query Processing - Can complete equality comparisons on the compressed, without needing decompression
+
+
+# SUMMARY <a name="summary"></a>
+FSST - Fast Static Symbol Table
+## Compression
+* Replace frequently occurring substrings of 1-8 bytes with 1-byte codes. 
+* Remaining symbols (those that don't occur frequently) are escaped, to indicate they should be copied as is. This is a result of the symbol table being limited to 256 entries; the last code (255) is reserved as the escape byte.
+
+### Algorithm
+* Ties are resolved randomly
+
+## Decompression:
+* Translate each 1-byte code into its symbolic substring, using an immutable array table (256 entries)
+
+
+# OVERVIEW <a name="overview"></a>
+## Decompression Algorithm:
+	- Decompress into symbols and store as 8-byte word in array. 
+	/** */
+	void decodeBasic (int[] in, int[] out, symbolTable, actualLengthOfSymbols){
+		int code = *in++; //Dereference to get (*in) before the in pointer is moved forwards.
+		*out = sym[code]; //Translate the symbol, cast to 8 byte word and put it into the output buffer
+		out+= len[code]; //Moves the pointer head forwards to the new out[0]/next place to write.
+	}
+
+	void decodeWithEscape (...) {
+		if (code == 255)
+			*out++ = *in++; //Copy the escape character.
+	}
+	/***/ 
+
+## Compression:
+* findLongestSymbol() finds the longest matching symbol at the current input position. If no matching symbol is found, the input byte is escaped.
+
+## Symbol Table Construction
+- Choosing the 256 symbols
+- Naive greedy single-pass: count and pick the most frequently occurring substrings. Con: does not consider overlapping symbols, e.g. ("http://w", "ttp://www"), and with a sequential read-in, shorter symbols will be consumed long before the better/longer symbols (e.g. "h" before "ttp://w")
+- Actual iterative algorithm - Linear time, multiple (ex. 5) iterations, and on-the-fly compression, bottom-up
+- Concatenate short symbols to longer symbols
+- Multiple iterations update the table, add new symbols, remove bad symbols
+- Base case: empty symbol table
+- Each iteration:
+	1. Iterate over the uncompressed input and compress with existing symbol table, count frequency
+	2. Select the highest-gain symbols to construct a new symbol table. Choose from:
+		* Old table
+		* New symbols generated by concatenating pairs of (2) symbols
+		* Reconsider all symbols that consist of a single byte
+		* Each existing symbol concatenated with the next occurring byte (even if that single byte is not currently a symbol)
+- Ties for gain are resolved randomly for symbols
+
+<!-- 
+Variables:
+- SymbolTable st == current table
+- count1[], count2[][] == frequencies of the codes
+
+
+buildSymbolTable(SymbolTable st)
+- 5 iterations
+	- Initialize st.nSymbols = 0
+	- Initialize new symboltable(). Field st.symbols[] starts with 256 pseudo symbols == escaped bytes.
+	- In the array, the next st.nSymbols (number of symbols), up to 255, contain the real symbols.
+- ???
+
+compressCount(SymbolTable st, count1, count2, text)
+- Initial symboltable is empty, uses all escaped bytes, input size doubled.
+- Does not produce compressed text, just records the frequency of the codes or bytes it encounters
+* count1[] 
+
+  -->
+
+
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,4 +7,4 @@ public class Encoder { @@
         Counters counters;
         int simdbuf[] = new int[FSST_BUFSZ];
-    }
+    }