Skip to content
This repository has been archived by the owner on Nov 6, 2023. It is now read-only.

Commit

Permalink
Merge pull request #10 from javacatknight/main
Browse files Browse the repository at this point in the history
Add high-level dev notes, scaffold comments
  • Loading branch information
AlvinKuruvilla authored Aug 31, 2023
2 parents 656a7cd + 0562111 commit deabddd
Show file tree
Hide file tree
Showing 10 changed files with 855 additions and 27 deletions.
9 changes: 9 additions & 0 deletions lib/src/main/java/Developer Notes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Original Repo
https://github.com/cwida/fsst/tree/master
* "...12..." files seem to be older files.

# Codebase
* Sanity folder - minimal code to convert from. More at the original repo

# Technical Java Notes
1. A C/C++ `char` is 1 byte; a Java `char` is 2 bytes.
3 changes: 3 additions & 0 deletions lib/src/main/java/fsst/Counters.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ private static boolean longToBoolean(long l) {
return l != 0 ? true : false;
}

/** Advance pos1 to the next nonzero counter in register range.
* Read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros
*/
int count1GetNext(int pos1, boolean noOpt) {
if (noOpt) {
return count1[pos1];
Expand Down
2 changes: 1 addition & 1 deletion lib/src/main/java/fsst/Encoder.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ public class Encoder {
Counters counters;
int simdbuf[] = new int[FSST_BUFSZ];

}
}
49 changes: 29 additions & 20 deletions lib/src/main/java/fsst/FSSTEncoder.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,42 @@
import java.util.Arrays;

public class FSSTEncoder {
static final long FSST_ENDIAN_MARKER = (long) 1;
static final long FSST_VERSION_20190218 = 20190218;
static final long FSST_VERSION = ((long) FSST_VERSION_20190218);
static final long FSST_ENDIAN_MARKER = 1L;
static final long FSST_VERSION_20190218 = 20190218L;
// static final long FSST_VERSION;

SymbolTable symbolTable;
Counters counters;
int[] simdBuffer = new int[3 << 19];

/**
 * Creates an encoder without calibrating anything: this constructor assigns no fields, so
 * {@code symbolTable} and {@code counters} keep their default values.
 */
FSSTEncoder() {
}

// TODO: Ask about string arrays instead of this char arrays.
/**
 * Calibrate a FSST symbol table from a batch of strings (it is best to provide at least 16KB of
 * data) and store the resulting table on this encoder.
 *
 * @param n value forwarded to {@code makeSample}; 0 is replaced by 1 (presumably the number of
 *     input strings -- TODO confirm)
 * @param inputLength lengths array handed to {@code makeSample} and {@code buildSymbolTable}
 * @param inputString input data; NOTE(review): currently not passed on to {@code makeSample}
 * @param zeroTerminated forwarded unchanged to {@code buildSymbolTable}
 */
FSSTEncoder(int n, int[] inputLength, char[] inputString, int zeroTerminated) {
    int[] sampleBuffer = new int[(int) Symbol.FSST_SAMPLEMAXSZ];
    int[] sampleLen = inputLength;
    // toArray(new Integer[0]) returns a genuine Integer[]; the no-arg toArray() returns an
    // Object[] whose cast to Integer[] fails with ClassCastException at runtime.
    // The C++ original passes "n ? n : 1", i.e. n unless it is zero.
    Integer[] sample = makeSample(simdBuffer, sampleBuffer, sampleLen, n != 0 ? n : 1)
            .toArray(new Integer[0]);
    // Store the table on this instance; assigning it to a separate local encoder discarded it.
    // NOTE(review): counters is not initialized anywhere in the visible code -- confirm that
    // buildSymbolTable tolerates that or initialize it before this call.
    this.symbolTable = new SymbolTable().buildSymbolTable(this.counters, sample, sampleLen,
            zeroTerminated);
    // The C++ code ends with delete[] on its temporary buffers. Java's garbage collector
    // reclaims sampleBuffer, so no manual clearing is needed here.
}

/**
 * Create another FSSTEncoder instance, necessary to do multi-threaded encoding using the same
 * symbol table. The given table is stored by reference; it is not copied.
 *
 * @param symbolTable table the new encoder will use.
 */
FSSTEncoder(SymbolTable symbolTable) {
this.symbolTable = symbolTable;
}
Expand Down Expand Up @@ -71,23 +96,7 @@ static ArrayList<Integer> makeSample(int[] sampleBuffer, int[] inputString, int[
return samples;
}

/**
 * Calibrate a FSST symbol table from a batch of strings and store the resulting table on this
 * encoder.
 *
 * @param n value forwarded to {@code makeSample}; 0 is replaced by 1 (presumably the number of
 *     input strings -- TODO confirm)
 * @param inputLength lengths array handed to {@code makeSample} and {@code buildSymbolTable}
 * @param inputString input data; NOTE(review): currently not passed on to {@code makeSample}
 * @param zeroTerminated forwarded unchanged to {@code buildSymbolTable}
 */
FSSTEncoder(int n, int[] inputLength, char[] inputString, int zeroTerminated) {
    int[] sampleBuffer = new int[(int) Symbol.FSST_SAMPLEMAXSZ];
    int[] sampleLen = inputLength;
    // toArray(new Integer[0]) returns a genuine Integer[]; the no-arg toArray() returns an
    // Object[] whose cast to Integer[] fails with ClassCastException at runtime.
    // The C++ original passes "n ? n : 1", i.e. n unless it is zero.
    Integer[] sample = makeSample(simdBuffer, sampleBuffer, sampleLen, n != 0 ? n : 1)
            .toArray(new Integer[0]);
    // Store the table on this instance; assigning it to a separate local encoder discarded it.
    // NOTE(review): counters is not initialized anywhere in the visible code -- confirm that
    // buildSymbolTable tolerates that or initialize it before this call.
    this.symbolTable = new SymbolTable().buildSymbolTable(this.counters, sample, sampleLen,
            zeroTerminated);
    // The C++ code ends with delete[] on its temporary buffers. Java's garbage collector
    // reclaims sampleBuffer, so no manual clearing is needed here.
}


FSSTEncoder duplicate() {
FSSTEncoder duplicate = new FSSTEncoder(this.symbolTable);
Expand Down
11 changes: 10 additions & 1 deletion lib/src/main/java/fsst/Symbol.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public class Symbol {

static final int maxLength = 8;
long value = 0;
long icl;
long icl; //ignoredBits:code:length
int gcl;
int gain;
byte[] symbol = new byte[maxLength];
Expand Down Expand Up @@ -75,6 +75,15 @@ int first2() {
return (int) (0xFFFF & this.value);
}

/**
 * Builds the symbol obtained by appending {@code b} after {@code a}.
 *
 * <p>The combined length is capped at {@code maxLength}. The packed value keeps {@code a} in the
 * low-order bytes and places {@code b} directly above it.
 *
 * @param a leading symbol
 * @param b symbol appended after {@code a}
 * @return a new symbol with code {@code FSST_CODE_MASK} and the combined (capped) length
 */
Symbol concat(Symbol a, Symbol b) {
    // NOTE(review): assumes Symbol has an accessible no-arg constructor -- confirm.
    Symbol s = new Symbol();
    int aLength = (int) a.length();
    int length = (int) Math.min(aLength + b.length(), maxLength);
    s.set_code_len(FSST_CODE_MASK, length);
    // Java masks a long shift distance to its low 6 bits, so shifting by 64 would shift by 0
    // and OR all of b into a. When a already fills the word there is no room for b.
    if (aLength >= maxLength) {
        s.value = a.value;
    } else {
        s.value = (b.value << (8 * aLength)) | a.value;
    }
    return s;
}

/**
 * Hashes a 64-bit word: multiplies it by {@code FSST_HASH_PRIME}, then XORs the product with
 * itself shifted right (unsigned) by 13 bits.
 *
 * @param w word to hash
 * @return the mixed hash value
 */
static long FSST_HASH(long w) {
    final long product = w * FSST_HASH_PRIME;
    final long folded = product >>> 13;
    return product ^ folded;
}
Expand Down
66 changes: 63 additions & 3 deletions lib/src/main/java/fsst/SymbolTable.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ public class SymbolTable {
}
}


public void clear() {
for (int i = Symbol.FSST_CODE_BASE; i < Symbol.FSST_CODE_BASE + nSymbols; i++) {
if (symbols[i].length() == 1) {
Expand Down Expand Up @@ -312,7 +313,7 @@ int compressCount(SymbolTable symbolTable, Counters counters, Integer[] line, in

SymbolTable buildSymbolTable(Counters counters, Integer[] line, int[] len, int zeroTerminated) {
SymbolTable symbolTable = new SymbolTable();
SymbolTable best = new SymbolTable();
SymbolTable bestTable = new SymbolTable();
int bestGain = (int) -Symbol.FSST_SAMPLEMAXSZ;
int sampleFrac = -128;
// XXX: HACK
Expand All @@ -337,12 +338,71 @@ SymbolTable buildSymbolTable(Counters counters, Integer[] line, int[] len, int z
}
}
assert (symbolTable.terminator != 256);
Random rand = new Random();
Random rand = new Random(); //todo: check random seed
int rand128 = rand.nextInt(129);
int compressCountRet = this.compressCount(this, counters, line, len, sampleFrac);
// TODO: Implement this method
// TODO: Implement this method @javacatknight
// SymbolTable table = this.makeTable(SymbolTable st, Counters counters);
// https://github.com/cwida/fsst/blob/42850e13ba220dbba5fd721a4c54f969e2a45ac5/libfsst.cpp#L160
return best;
}

// // TODO: @javacatknight
// void makeTable(SymbolTable symbolTable, Counters counters, int sampleFrac) {
// //Hashmap (needed because we can generate duplicate candidates)
// //Not using HashSet due to lack of find() method, see: https://stackoverflow.com/questions/7283338/getting-an-element-from-a-set
// HashMap <QSymbol, boolean> candidates;

// //TODO: @javacatknight Google shift optimization over division. Caveats: auto done by compiler, dealing with negative
// }

// // Add candidate symbols based on counted frequency
// // TODO: u32pos1, u32cnt1, (size_t)st.nSymbols
// for (int pos1=0; pos1<FSST_CODE_BASE+(int)st.nSymbols; pos1++) {
// int cnt1 = counters.count1GetNext(pos1); // may advance pos1!!
// if (!cnt1) continue;

// // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed
// Symbol s1 = st.symbols[pos1];
// //
// addOrInc(candidates, s1, ((s1.length()==1)?(BigInteger)8:(BigInteger)1)*cnt1, sampleFrac);
// if (sampleFrac >= 128 || // last round we do not create new (combined) symbols
// s1.length() == Symbol.maxLength || // symbol cannot be extended
// s1.val.str[0] == st.terminator) { // multi-byte symbols cannot contain the terminator byte
// continue;
// }
// for (u32 pos2=0; pos2<FSST_CODE_BASE+(size_t)st->nSymbols; pos2++) {
// u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!!
// if (!cnt2) continue;

// // create a new symbol
// Symbol s2 = st->symbols[pos2];
// Symbol s3 = concat(s1, s2);
// if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte
// addOrInc(cands, s3, cnt2);
// }
// }

// /**
// * Helper function.
// *
// * @param count unsigned 64-bit count in the C++ code; represented as a long here
// */

// // TODO: @javacatknight auto type inference
// void addOrInc(HashMap <QSymbol> candidates, Symbol s, Long count, int sampleFrac){
// if (count < (5*sampleFrac)/128) return; // Improves both compression speed (less candidates), but also quality!!

// QSymbol q;
// q.symbol = s;
// q.gain = count * s.length();

// // Iterator //look for the symbol. If not found, just insert.
// var it = candidates.get(q);
// if (it != candidates.end()) { // if found, add gain first and then insert
// q.gain += (*it).gain;
// candidates.erase(*it);
// }
// candidates.insert(q);
}
}
3 changes: 2 additions & 1 deletion lib/src/main/java/fsst/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import java.nio.ByteOrder;

public class Utils {
static int boolToInt(boolean value) {

/**
 * Converts a boolean to an int.
 *
 * @param value flag to convert
 * @return 1 when {@code value} is true, otherwise 0
 */
static int booleanToInt(boolean value) {
    if (value) {
        return 1;
    }
    return 0;
}

Expand Down
95 changes: 95 additions & 0 deletions notes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# FOREWORD:
If you are only interested in the developer-facing code, skip the Background section. This document is a work in progress.

# TABLE OF CONTENTS
1. [Background](#background)
2. [Summary](#summary)
3. [Overview](#overview)

# BACKGROUND <a name="background"></a>
Dictionary compression : uniquely matches strings to fixed-size integers.
- Effective only when strings repeat; words that are merely similar lose the benefit
- Also ineffective if applied to only a fraction of a whole relation
- Most strings stored are generally less than 200 bytes and often less than 30 bytes per string

LZ4 (dictionary compression example)
- Not efficient for compressing individual strings - requires kB input size for efficient compression
- So it's used to compress columnar blocks (many string values together)
- Therefore prevents random access;
- Example: decompressing large blocks for these values, some of which goes unused.

Potential:
- Use in conjunction with dictionary compression - i.e. after data is compressed, FSST can compress the strings in the dictionary
- Can apply on existing database systems
- Compressed Query Processing - Can complete equality comparisons on the compressed, without needing decompression


# SUMMARY <a name="summary"></a>
FSST - Fast Static Symbol Table
## Compression
* Replace frequently occurring substrings of 1-8 bytes with 1-byte codes.
* Remaining bytes (those not covered by a frequently occurring symbol) are escaped, to indicate they should be copied as is. This is a result of the symbol table being limited to 256 one-byte codes; the last code (255) is reserved as the escape byte.

### Algorithm
* Ties are resolved randomly

## Decompression:
* Translate each 1-byte code into its symbolic substring, using an immutable array table (256 entries)


# OVERVIEW <a name="overview"></a>
## Decompression Algorithm:
- Decompress into symbols and store as 8-byte word in array.
/** */
void decodeBasic (int[] in, int[] out, symbolTable, actualLengthOfSymbols){
int code = *in++; //Dereference to get (*in) before the in pointer is moved forwards.
*out = sym[code]; //Translate the symbol, cast to 8 byte word and put it into output buffer
out+= len[code]; //Moves the pointer head forwards to the new out[0]/next place to write.
}

void decodeWithEscape (...) {
if (code == 255)
*out++ = *in++; //Copy the escape character.
}
/***/

## Compression:
* findLongestSymbol() finds the longest matching symbol at the current input position. If no matching symbol is found, the input byte is escaped.

## Symbol Table Construction
- Choosing the 256 symbols
- Naive greedy single-pass: count and pick the most frequently occurring symbols. Con: does not consider overlapping symbols (e.g. "http://w" and "ttp://www"), and if the input is read sequentially, shorter symbols will be consumed long before the better/longer symbols ("h" before "ttp://w")
- Actual iterative algorithm - Linear time, multiple (ex. 5) iterations, and on-the-fly compression, bottom-up
- Concatenate short symbols to longer symbols
- Multiple iterations update the table, add new symbols, remove bad symbols
- Base case: empty symbol table
- Each iteration:
1. Iterate over the uncompressed input and compress with existing symbol table, count frequency
2. Select the highest-gain symbols to construct a new symbol table. Choose from:
* Old table
* New symbols generated by concatenating pairs (2) symbols
* Reconsider all symbols that consist of a single byte
* Each existing symbol concatenated with the next occurring byte (even if that single byte is not currently a symbol)
- Ties for gain are resolved randomly for symbols

<!--
Variables:
- SymbolTable st == current table
- count1[], count2[][] == frequencies of the codes
buildSymbolTable(SymbolTable st)
- 5 iterations
- Initialize st.nSymbols = 0
- Initialize new symboltable(). Field st.symbols[] starts with 256 pseudo symbols == escaped bytes.
- In the array, the next st.nSymbols (number of symbols), up to 255, contain the real symbols.
- ???
compressCount(SymbolTable st, count1, count2, text)
- Initial symboltable is empty, uses all escaped bytes, input size doubled.
- Does not produce compressed text, just records the frequency of the codes or bytes it encounters
* count1[]
-->


Loading

0 comments on commit deabddd

Please sign in to comment.