diff --git a/lib/src/main/java/Developer Notes.md b/lib/src/main/java/Developer Notes.md new file mode 100644 index 0000000..d4776ed --- /dev/null +++ b/lib/src/main/java/Developer Notes.md @@ -0,0 +1,9 @@ +# Original Repo +https://github.com/cwida/fsst/tree/master +* "...12..." files seem to be older files. + +# Codebase +* Sanity folder - minimal code to convert from. More at the original repo. + +# Technical Java Notes +1. A C/C++ char is 1 byte; a Java char is 2 bytes. diff --git a/lib/src/main/java/fsst/Counters.java b/lib/src/main/java/fsst/Counters.java index 880ff61..0408255 100644 --- a/lib/src/main/java/fsst/Counters.java +++ b/lib/src/main/java/fsst/Counters.java @@ -58,6 +58,9 @@ private static boolean longToBoolean(long l) { return l != 0 ? true : false; } +/** Advance pos1 to the next nonzero counter in register range. + * Reads the 16-bit single-symbol counter, which is split into two 8-bit numbers (count1Low, count1High), while skipping over zeros. +*/ int count1GetNext(int pos1, boolean noOpt) { if (noOpt) { return count1[pos1]; diff --git a/lib/src/main/java/fsst/Encoder.java b/lib/src/main/java/fsst/Encoder.java index ab7aa82..843fe2d 100644 --- a/lib/src/main/java/fsst/Encoder.java +++ b/lib/src/main/java/fsst/Encoder.java @@ -7,4 +7,4 @@ public class Encoder { Counters counters; int simdbuf[] = new int[FSST_BUFSZ]; -} +} \ No newline at end of file diff --git a/lib/src/main/java/fsst/FSSTEncoder.java b/lib/src/main/java/fsst/FSSTEncoder.java index f1a53e4..8eb1f0e 100644 --- a/lib/src/main/java/fsst/FSSTEncoder.java +++ b/lib/src/main/java/fsst/FSSTEncoder.java @@ -4,17 +4,42 @@ import java.util.Arrays; public class FSSTEncoder { - static final long FSST_ENDIAN_MARKER = (long) 1; - static final long FSST_VERSION_20190218 = 20190218; - static final long FSST_VERSION = ((long) FSST_VERSION_20190218); + static final long FSST_ENDIAN_MARKER = 1L; + static final long FSST_VERSION_20190218 = 20190218L; + // static final long FSST_VERSION; SymbolTable symbolTable; Counters 
counters; int[] simdBuffer = new int[3 << 19]; +/** Creates an encoder without assigning a symbol table or counters. */ FSSTEncoder() { } +// TODO: Ask about string arrays instead of this char arrays. NOTE(review): the built table is stored on a local encoder rather than on this instance, and casting the Object[] returned by toArray() to Integer[] looks like it would throw ClassCastException at runtime -- confirm the intended design. + /** Calibrate a FSST symbol table from a batch of strings (it is best to provide at least 16KB of data). As in the C++ fsst_create (n?n:1), an empty batch (n == 0) is sampled as a single line; otherwise all n lines are offered to makeSample. */ + FSSTEncoder(int n, int[] inputLength, char[] inputString, int zeroTerminated) { + int[] sampleBuffer = new int[(int) Symbol.FSST_SAMPLEMAXSZ]; + int[] sampleLen = inputLength; + Object[] sample = makeSample(simdBuffer, sampleBuffer, sampleLen, n != 0 ? n : 1).toArray(); + FSSTEncoder encoder = new FSSTEncoder(); + SymbolTable symbolTable = new SymbolTable(); + encoder.symbolTable = symbolTable.buildSymbolTable(encoder.counters, (Integer[]) sample, sampleLen, + zeroTerminated); + if (sampleLen != inputLength) { + // TODO: There might be a better way of doing the delete operator as the c++ + // code is doing but this is my current closest guess + Arrays.fill(sampleLen, 0); + } + // TODO: There might be a better way of doing the delete operator as the c++ + // code is doing but this is my current closest guess + Arrays.fill(sampleBuffer, 0); + } + + /** Create another FSSTEncoder instance, necessary to do multi-threaded encoding using the same symbol table. + * + * @param symbolTable table to duplicate. + */ FSSTEncoder(SymbolTable symbolTable) { this.symbolTable = symbolTable; } @@ -71,23 +96,7 @@ static ArrayList makeSample(int[] sampleBuffer, int[] inputString, int[ return samples; } - FSSTEncoder(int n, int[] inputLength, char[] inputString, int zeroTerminated) { - int[] sampleBuffer = new int[(int) Symbol.FSST_SAMPLEMAXSZ]; - int[] sampleLen = inputLength; - Object[] sample = makeSample(simdBuffer, sampleBuffer, sampleLen, n == 0 ? 
n : 1).toArray(); - FSSTEncoder encoder = new FSSTEncoder(); - SymbolTable symbolTable = new SymbolTable(); - encoder.symbolTable = symbolTable.buildSymbolTable(encoder.counters, (Integer[]) sample, sampleLen, - zeroTerminated); - if (sampleLen != inputLength) { - // TODO: There might be a better way of doing the delete operator as the c++ - // code is doing but this is my current closest guess - Arrays.fill(sampleLen, 0); - } - // TODO: There might be a better way of doing the delete operator as the c++ - // code is doing but this is my current closest guess - Arrays.fill(sampleBuffer, 0); - } + FSSTEncoder duplicate() { FSSTEncoder duplicate = new FSSTEncoder(this.symbolTable); diff --git a/lib/src/main/java/fsst/Symbol.java b/lib/src/main/java/fsst/Symbol.java index 1b33b17..dfd0e9a 100644 --- a/lib/src/main/java/fsst/Symbol.java +++ b/lib/src/main/java/fsst/Symbol.java @@ -20,7 +20,7 @@ public class Symbol { static final int maxLength = 8; long value = 0; - long icl; + long icl; //ignoredBits:code:length int gcl; int gain; byte[] symbol = new byte[maxLength]; @@ -75,6 +75,15 @@ int first2() { return (int) (0xFFFF & this.value); } + Symbol concat(Symbol a, Symbol b) { + Symbol s; + length = a.length()+b.length(); + if (length > Symbol::maxLength) length = Symbol::maxLength; + s.set_code_len(FSST_CODE_MASK, length); + s.val.num = (b.val.num << (8*a.length())) | a.val.num; + return s; +} + static long FSST_HASH(long w) { return ((w * FSST_HASH_PRIME) ^ ((w * FSST_HASH_PRIME) >>> 13)); } diff --git a/lib/src/main/java/fsst/SymbolTable.java b/lib/src/main/java/fsst/SymbolTable.java index e3826b0..946d0e5 100644 --- a/lib/src/main/java/fsst/SymbolTable.java +++ b/lib/src/main/java/fsst/SymbolTable.java @@ -47,6 +47,7 @@ public class SymbolTable { } } + public void clear() { for (int i = Symbol.FSST_CODE_BASE; i < Symbol.FSST_CODE_BASE + nSymbols; i++) { if (symbols[i].length() == 1) { @@ -312,7 +313,7 @@ int compressCount(SymbolTable symbolTable, Counters counters, 
Integer[] line, in SymbolTable buildSymbolTable(Counters counters, Integer[] line, int[] len, int zeroTerminated) { SymbolTable symbolTable = new SymbolTable(); - SymbolTable best = new SymbolTable(); + SymbolTable bestTable = new SymbolTable(); int bestGain = (int) -Symbol.FSST_SAMPLEMAXSZ; int sampleFrac = -128; // XXX: HACK @@ -337,12 +338,71 @@ SymbolTable buildSymbolTable(Counters counters, Integer[] line, int[] len, int z } } assert (symbolTable.terminator != 256); - Random rand = new Random(); + Random rand = new Random(); //todo: check random seed int rand128 = rand.nextInt(129); int compressCountRet = this.compressCount(this, counters, line, len, sampleFrac); - // TODO: Implement this method + // TODO: Implement this method @javacatknight // SymbolTable table = this.makeTable(SymbolTable st, Counters counters); // https://github.com/cwida/fsst/blob/42850e13ba220dbba5fd721a4c54f969e2a45ac5/libfsst.cpp#L160 return best; } + + // // TODO: @javacatknight + // void makeTable(SymbolTable symbolTable, Counters counters, int sampleFrac) { + // //Hashmap (needed because we can generate duplicate candidates) + // //Not using HashSet due to lack of find() method, see: https://stackoverflow.com/questions/7283338/getting-an-element-from-a-set + // HashMap candidates; + + // //TODO: @javacatknight Google shift optimization over division. Caveats: auto done by compiler, dealing with negative + // } + + // // Add candidate symbols based on counted frequency + // // TODO: u32pos1, u32cnt1, (size_t)st.nSymbols + // for (int pos1=0; pos1= 128 || // last round we do not create new (combined) symbols + // s1.length() == Symbol.maxLength || // symbol cannot be extended + // s1.val.str[0] == st.terminator) { // multi-byte symbols cannot contain the terminator byte + // continue; + // } + // for (u32 pos2=0; pos2nSymbols; pos2++) { + // u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!! 
+ // if (!cnt2) continue; + + // // create a new symbol + // Symbol s2 = st->symbols[pos2]; + // Symbol s3 = concat(s1, s2); + // if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte + // addOrInc(cands, s3, cnt2); + // } + // } + + // /** + // * Helper function. + // * + // * @param count unsigned 64byte must be parsed to long + // */ + + // // TODO: @javacatknight auto type inference + // void addOrInc(HashMap candidates, Symbol s, Long count, int sampleFrac){ + // if (count < (5*sampleFrac)/128) return; // Improves both compression speed (less candidates), but also quality!! + + // QSymbol q; + // q.symbol = s; + // q.gain = count * s.length(); + + // // Iterator //look for the symbol. If not found, just insert. + // var it = candidates.get(q); + // if (it != candidates.end()) { // if found, add gain first and then insert + // q.gain += (*it).gain; + // candidates.erase(*it); + // } + // candidates.insert(q); + } } diff --git a/lib/src/main/java/fsst/Utils.java b/lib/src/main/java/fsst/Utils.java index cc576ea..fe39f13 100644 --- a/lib/src/main/java/fsst/Utils.java +++ b/lib/src/main/java/fsst/Utils.java @@ -4,7 +4,8 @@ import java.nio.ByteOrder; public class Utils { - static int boolToInt(boolean value) { + + static int booleanToInt(boolean value) { return value ? 1 : 0; } diff --git a/notes.md b/notes.md new file mode 100644 index 0000000..3461ba6 --- /dev/null +++ b/notes.md @@ -0,0 +1,95 @@ +# FOREWORD: +If you're only interested in developer code, skip background. Ongoing. + +# TABLE OF CONTENTS +1. [Background](#background) +2. [Summary](#summary) +3. [Overview](#overview) + +# BACKGROUND +Dictionary compression : uniquely matches strings to fixed-size integers. + - Effective only if repeating strings, i.e. 
similar words lose benefit + - Also if applied to fraction of a whole relation, ineffective + - Most strings stored are generally less than 200 bytes and often less than 30 bytes per string + +LZ4 (dictionary compression example) + - Not efficient for compressing individual strings - requires kB input size for efficient compression + - So it's used to compress columnar blocks (many string values together) + - Therefore prevents random access; + - Example: decompressing large blocks for these values, some of which go unused. + +Potential: +- Use in conjunction with dictionary compression - i.e. after data is compressed, FSST can compress the strings in the dictionary +- Can apply on existing database systems +- Compressed Query Processing - Can complete equality comparisons on the compressed data, without needing decompression + + +# SUMMARY +FSST - Fast Static Symbol Table +## Compression +* Replace frequently occurring substrings of 1-8 bytes with 1-byte codes. +* Remaining symbols/symbols that don't frequently occur are escaped, to indicate they should be copied as is. This is a consequence of the symbol table being limited to 256 one-byte codes; the last code (255) is reserved as the escape byte. + +### Algorithm +* Ties are resolved randomly + +## Decompression: +* Translate each 1-byte code into its symbolic substring, using an immutable array table (256 entries) + + +# OVERVIEW +## Decompression Algorithm: + - Decompress into symbols and store as 8-byte word in array. + /** */ + void decodeBasic (int[] in, int[] out, symbolTable, actualLengthOfSymbols){ + int code = *in++; //Dereference to get (*in) before the in pointer is moved forwards. + *out = sym[code]; //Translate the symbol, cast to 8 byte word and put it into output buffer + out+= len[code]; //Moves the pointer head forwards to the new out[0]/next place to write. + } + + void decodeWithEscape (...) { + if (code == 255) + *out++ = *in++; //Copy the escape character. 
+ } + /***/ + +## Compression: +* findLongestSymbol() finds the longest matching symbol at the current input position. If no matching symbol is found, the input byte is escaped. + +## Symbol Table Construction +- Choosing the 256 symbols +- Naive greedy single-pass: count and pick the most frequently occurring symbols. Con: does not consider overlapping symbols ex. ("http://w", "ttp://www") and if sequential read-in, shorter symbols will be consumed long before the better/longer symbols ("h" before "ttp://w") +- Actual iterative algorithm - Linear time, multiple (ex. 5) iterations, and on-the-fly compression, bottom-up +- Concatenate short symbols to longer symbols +- Multiple iterations update the table, add new symbols, remove bad symbols +- Base case: empty symbol table +- Each iteration: + 1. Iterate over the uncompressed input and compress with existing symbol table, count frequency + 2. Select the highest-gain symbols to construct a new symbol table. Choose from: + * Old table + * New symbols generated by concatenating pairs (2) of symbols + * Reconsider all symbols that consist of a single byte + * Each existing symbol concatenated with the next occurring byte (even if that single byte is not currently a symbol) +- Ties for gain are resolved randomly for symbols + + + + diff --git a/sanity/libfsst.cpp b/sanity/libfsst.cpp new file mode 100644 index 0000000..ca80231 --- /dev/null +++ b/sanity/libfsst.cpp @@ -0,0 +1,642 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, 
subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include "libfsst.hpp" + +Symbol concat(Symbol a, Symbol b) { + Symbol s; + u32 length = a.length()+b.length(); + if (length > Symbol::maxLength) length = Symbol::maxLength; + s.set_code_len(FSST_CODE_MASK, length); + s.val.num = (b.val.num << (8*a.length())) | a.val.num; + return s; +} + +namespace std { +template <> +class hash { + public: + size_t operator()(const QSymbol& q) const { + uint64_t k = q.symbol.val.num; + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + uint64_t h = 0x8445d61a4e774912 ^ (8*m); + k *= m; + k ^= k >> r; + k *= m; + h ^= k; + h *= m; + h ^= h >> r; + h *= m; + h ^= h >> r; + return h; + } +}; +} + +bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; } + +std::ostream& operator<<(std::ostream& out, const Symbol& s) { + for (u32 i=0; i line, size_t len[], bool zeroTerminated=false) { + SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable(); + int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception) + size_t sampleFrac = 128; + + // start by determining the terminator. 
We use the (lowest) most infrequent byte as terminator + st->zeroTerminated = zeroTerminated; + if (zeroTerminated) { + st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency + } else { + u16 byteHisto[256]; + memset(byteHisto, 0, sizeof(byteHisto)); + for(size_t i=0; iterminator = 256; + while(i-- > 0) { + if (byteHisto[i] > minSize) continue; + st->terminator = i; + minSize = byteHisto[i]; + } + } + assert(st->terminator != 256); + + // a random number between 0 and 128 + auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); }; + + // compress sample, and compute (pair-)frequencies + auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain + int gain = 0; + + for(size_t i=0; i sampleFrac) continue; + } + if (cur < end) { + u8* start = cur; + u16 code2 = 255, code1 = st->findLongestSymbol(cur, end); + cur += st->symbols[code1].length(); + gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1))); + while (true) { + // count single symbol (i.e. an option is not extending it) + counters.count1Inc(code1); + + // as an alternative, consider just using the next byte.. + if (st->symbols[code1].length() != 1) // .. 
but do not count single byte symbols doubly + counters.count1Inc(*start); + + if (cur==end) { + break; + } + + // now match a new symbol + start = cur; + if (curhashTabSize-1); + Symbol s = st->hashTab[idx]; + code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK; + word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) & (s.val.num == word)) { + code2 = s.code(); + cur += s.length(); + } else if (code2 >= FSST_CODE_BASE) { + cur += 2; + } else { + code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK; + cur += 1; + } + } else { + code2 = st->findLongestSymbol(cur, end); + cur += st->symbols[code2].length(); + } + + // compute compressed output size + gain += ((int) (cur-start))-(1+isEscapeCode(code2)); + + // now count the subsequent two symbols we encode as an extension codesibility + if (sampleFrac < 128) { // no need to count pairs in final round + // consider the symbol that is the concatenation of the two last symbols + counters.count2Inc(code1, code2); + + // as an alternative, consider just extending with the next byte.. + if ((cur-start) > 1) // ..but do not count single byte extensions doubly + counters.count2Inc(code1, *start); + } + code1 = code2; + } + } + } + return gain; + }; + + auto makeTable = [&](SymbolTable *st, Counters &counters) { + // hashmap of c (needed because we can generate duplicate candidates) + unordered_set cands; + + // artificially make terminater the most frequent symbol so it gets included + u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator; + counters.count1Set(terminator,65535); + + auto addOrInc = [&](unordered_set &cands, Symbol s, u64 count) { + if (count < (5*sampleFrac)/128) return; // improves both compression speed (less candidates), but also quality!! + QSymbol q; + q.symbol = s; + q.gain = count * s.length(); + auto it = cands.find(q); //look for the symbol. If not found, just insert. 
+ if (it != cands.end()) { // if found, add gain first and then insert + q.gain += (*it).gain; + cands.erase(*it); + } + cands.insert(q); + }; + + // add candidate symbols based on counted frequency + for (u32 pos1=0; pos1nSymbols; pos1++) { + u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!! + if (!cnt1) continue; + + // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed + Symbol s1 = st->symbols[pos1]; + addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1); + + if (sampleFrac >= 128 || // last round we do not create new (combined) symbols + s1.length() == Symbol::maxLength || // symbol cannot be extended + s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte + continue; + } + for (u32 pos2=0; pos2nSymbols; pos2++) { + u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!! + if (!cnt2) continue; + + // create a new symbol + Symbol s2 = st->symbols[pos2]; + Symbol s3 = concat(s1, s2); + if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte + addOrInc(cands, s3, cnt2); + } + } + + // insert candidates into priority queue (by gain) + auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.val.num > q2.symbol.val.num); }; + priority_queue,decltype(cmpGn)> pq(cmpGn); + for (auto& q : cands) + pq.push(q); + + // Create new symbol map using best candidates + st->clear(); + while (st->nSymbols < 255 && !pq.empty()) { + QSymbol q = pq.top(); + pq.pop(); + st->add(q.symbol); + } + }; + + u8 bestCounters[512*sizeof(u16)]; +#ifdef NONOPT_FSST + for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) { + sampleFrac = frac; +#else + for(sampleFrac=8; true; sampleFrac += 30) { +#endif + memset(&counters, 0, sizeof(Counters)); + long gain = compressCount(st, counters); + if (gain >= bestGain) { // a new best solution! 
+ counters.backup1(bestCounters); + *bestTable = *st; bestGain = gain; + } + if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128) + makeTable(st, counters); + } + delete st; + counters.restore1(bestCounters); + makeTable(bestTable, counters); + bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression + return bestTable; +} + +static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, size_t len[], u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) { + size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size; + u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings + SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer + SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this) + size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs) + + while (curLine < nlines && outOff <= (1<<19)) { + size_t prevLine = curLine, chunk, curOff = 0; + + // bail out if the output buffer cannot hold the compressed next string fully + if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7 + else budget -= (len[curLine]-curOff)*2; + + strOut[curLine] = (u8*) 0; + lenOut[curLine] = 0; + + do { + do { + chunk = len[curLine] - curOff; + if (chunk > 511) { + chunk = 511; // large strings need to be chopped up into segments of 511 bytes + } + // create a job in this batch + SIMDjob job; + job.cur = inOff; + job.end = job.cur + chunk; + job.pos = batchPos; + job.out = outOff; + + // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros) + outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) bytes. 
+ if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk + + // register job in this batch + input[batchPos] = job; + jobLine[batchPos] = curLine; + + if (chunk == 0) { + empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out + } else { + // copy string chunk into temp buffer + memcpy(symbolBase + inOff, line[curLine] + curOff, chunk); + inOff += chunk; + curOff += chunk; + symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded + } + if (++batchPos == 512) break; + } while(curOff < len[curLine]); + + if ((batchPos == 512) || (outOff > (1<<19)) || (++curLine >= nlines)) { // cannot accumulate more? + if (batchPos-empty >= 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling) + // radix-sort jobs on length (longest string first) + // -- this provides best load balancing and allows to skip empty jobs at the end + u16 sortpos[513]; + memset(sortpos, 0, sizeof(sortpos)); + + // calculate length histo + for(size_t i=0; i> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.val.num == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } + } + job.out = out - codeBase; + } + // postprocess job info + job.cur = 0; + job.end = job.out - input[job.pos].out; // misuse .end field as compressed size + job.out = input[job.pos].out; // reset offset to start of encoded string + input[job.pos] = job; + } + + // copy out the result data + for(size_t i=0; i> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.val.num == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else if (avoidBranch) { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + *out = (u8) code; + out += 
1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } else if ((u8) code < byteLim) { + // 2 byte code after checking there is no longer pattern + *out++ = (u8) code; cur += 2; + } else { + // 1 byte code or miss. + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse + cur++; + } + } + } + }; + + for(curLine=0; curLine 511) { + chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST + } + if ((2*chunk+7) > (size_t) (lim-out)) { + return curLine; // out of memory + } + // copy the string to the 511-byte buffer + memcpy(buf, cur, chunk); + buf[chunk] = (u8) symbolTable.terminator; + cur = buf; + end = cur + chunk; + + // based on symboltable stats, choose a variant that is nice to the branch predictor + if (noSuffixOpt) { + compressVariant(true,false); + } else if (avoidBranch) { + compressVariant(false,true); + } else { + compressVariant(false, false); + } + } while((curOff += chunk) < lenIn[curLine]); + lenOut[curLine] = (size_t) (out - strOut[curLine]); + } + return curLine; +} + +#define FSST_SAMPLELINE ((size_t) 512) + +// quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes +vector makeSample(u8* sampleBuf, u8* strIn[], size_t **lenRef, size_t nlines) { + size_t totSize = 0, *lenIn = *lenRef; + vector sample; + + for(size_t i=0; i sample = makeSample(sampleBuf, strIn, &sampleLen, n?n:1); // careful handling of input to get a right-size and representative sample + Encoder *encoder = new Encoder(); + encoder->symbolTable = shared_ptr(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated)); + if (sampleLen != lenIn) delete[] sampleLen; + delete[] sampleBuf; + return (fsst_encoder_t*) encoder; +} + +/* create another encoder instance, necessary to do multi-threaded encoding using the same symbol table */ +extern "C" fsst_encoder_t* 
fsst_duplicate(fsst_encoder_t *encoder) { + Encoder *e = new Encoder(); + e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr + return (fsst_encoder_t*) e; +} + +// export a symbol table in compact format. +extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) { + Encoder *e = (Encoder*) encoder; + // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there. + // This is sufficient in principle to *reconstruct* a fsst_encoder_t from a fsst_decoder_t + // (such functionality could be useful to append compressed data to an existing block). + // + // However, the hash function in the encoder hash table is endian-sensitive, and given its + // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables. + // Doing a endian-conversion during hashing will be slow and self-defeating. + // + // Overall, we could support reconstructing an encoder for incremental compression, but + // should enforce equal-endianness. Bit of a bummer. Not going there now. 
+ // + // The version field is now there just for future-proofness, but not used yet + + // version allows keeping track of fsst versions, track endianness, and encoder reconstruction + u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0 + (((u64) e->symbolTable->suffixLim) << 24) | + (((u64) e->symbolTable->terminator) << 16) | + (((u64) e->symbolTable->nSymbols) << 8) | + FSST_ENDIAN_MARKER; // least significant byte is nonzero + + /* do not assume unaligned reads here */ + memcpy(buf, &version, 8); + buf[8] = e->symbolTable->zeroTerminated; + for(u32 i=0; i<8; i++) + buf[9+i] = (u8) e->symbolTable->lenHisto[i]; + u32 pos = 17; + + // emit only the used bytes of the symbols + for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++) + for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++) + buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes + + return pos; // length of what was serialized +} + +#define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */ + +extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 *buf) { + u64 version = 0; + u32 code, pos = 17; + u8 lenHisto[8]; + + // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) + memcpy(&version, buf, 8); + if ((version>>32) != FSST_VERSION) return 0; + decoder->zeroTerminated = buf[8]&1; + memcpy(lenHisto, buf+9, 8); + + // in case of zero-terminated, first symbol is "" (zero always, may be overwritten) + decoder->len[0] = 1; + decoder->symbol[0] = 0; + + // we use lenHisto[0] as 1-byte symbol run length (at the end) + code = decoder->zeroTerminated; + if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end + + // now get all symbols from the buffer + for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */ + for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, 
code++) { + decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */ + decoder->symbol[code] = 0; + for(u32 j=0; jlen[code]; j++) + ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols + } + } + if (decoder->zeroTerminated) lenHisto[0]++; + + // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols). + while(code<255) { + decoder->symbol[code] = FSST_CORRUPT; + decoder->len[code++] = 8; + } + return pos; +} + +// runtime check for simd +inline size_t _compressImpl(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { +#ifndef NONOPT_FSST + if (simd && fsst_hasAVX512()) + return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +#endif + (void) simd; + return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch); +} +size_t compressImpl(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} + +// adaptive choosing of scalar compression method based on symbol length histogram +inline size_t _compressAuto(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + bool avoidBranch = false, noSuffixOpt = false; + if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) { + noSuffixOpt = true; + } else if ((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) && + (e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) && + 
(e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) { + avoidBranch = true; + } + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} +size_t compressAuto(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +} + +// the main compression function (everything automatic) +extern "C" size_t fsst_compress(fsst_encoder_t *encoder, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[]) { + // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB) + size_t totLen = accumulate(lenIn, lenIn+nlines, 0); + int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15); + return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd); +} + +/* deallocate encoder */ +extern "C" void fsst_destroy(fsst_encoder_t* encoder) { + Encoder *e = (Encoder*) encoder; + delete e; +} + +/* very lazy implementation relying on export and import */ +extern "C" fsst_decoder_t fsst_decoder(fsst_encoder_t *encoder) { + u8 buf[sizeof(fsst_decoder_t)]; + u32 cnt1 = fsst_export(encoder, buf); + fsst_decoder_t decoder; + u32 cnt2 = fsst_import(&decoder, buf); + assert(cnt1 == cnt2); (void) cnt1; (void) cnt2; + return decoder; +} \ No newline at end of file diff --git a/sanity/libfsst.hpp b/sanity/libfsst.hpp index e5999c1..5b03b60 100644 --- a/sanity/libfsst.hpp +++ b/sanity/libfsst.hpp @@ -129,7 +129,7 @@ struct QSymbol { Symbol symbol; mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols - bool operator==(const QSymbol &other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } + bool operator==(const QSymbol &other) 
const{ return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } }; // we construct FSST symbol tables using a random sample of about 16KB (1<<14)