From cfa93984c31be632fe401dd79326154d9e0d385f Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Mon, 14 Jun 2021 14:31:46 -0400 Subject: [PATCH 1/3] Move codec implementations to cpp and make codec dependencies private --- CMakeLists.txt | 25 +- benchmarks/perftest_interpolative.cpp | 1 + include/pisa/codec/VarIntG8IU.h | 458 +++++++++++++------------- include/pisa/codec/block_codecs.hpp | 78 +---- include/pisa/codec/maskedvbyte.hpp | 34 +- include/pisa/codec/qmx.hpp | 44 +-- include/pisa/codec/simdbp.hpp | 39 +-- include/pisa/codec/simple16.hpp | 33 +- include/pisa/codec/simple8b.hpp | 27 +- include/pisa/codec/streamvbyte.hpp | 24 +- include/pisa/codec/varintgb.hpp | 2 - src/codec/maskedvbyte.cpp | 39 +++ src/codec/optpfor.cpp | 86 +++++ src/codec/qmx.cpp | 46 +++ src/codec/simdbp.cpp | 42 +++ src/codec/simple16.cpp | 36 ++ src/codec/simple8b.cpp | 30 ++ src/codec/streamvbyte.cpp | 30 ++ 18 files changed, 605 insertions(+), 469 deletions(-) create mode 100644 src/codec/maskedvbyte.cpp create mode 100644 src/codec/optpfor.cpp create mode 100644 src/codec/qmx.cpp create mode 100644 src/codec/simdbp.cpp create mode 100644 src/codec/simple16.cpp create mode 100644 src/codec/simple8b.cpp create mode 100644 src/codec/streamvbyte.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 69b252d28..fcf480e7b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,21 @@ option(PISA_ENABLE_CLANG_TIDY "Enable static analysis with clang-tidy" OFF) option(PISA_CLANG_TIDY_EXECUTABLE "clang-tidy executable path" "clang-tidy") option(PISA_USE_PIC "Enable Position-Independent code globally" ON) option(PISA_CI_BUILD "Remove debug information from Debug build" ON) +option(PISA_ENABLE_IPO "Enable Interprocedural Optimization, aka Link Time Optimization (LTO)" OFF) + +if(ENABLE_IPO) + include(CheckIPOSupported) + check_ipo_supported( + RESULT + result + OUTPUT + output) + if(result) + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) + else() + message(SEND_ERROR "IPO is not supported: ${output}") + endif() +endif() if(PISA_USE_PIC) set(CMAKE_POSITION_INDEPENDENT_CODE ON) @@ -112,11 +127,6 @@ target_link_libraries(pisa range-v3 taily # These should be made private in the future: - FastPFor - streamvbyte - MaskedVByte - simdcomp - QMX PRIVATE gumbo::gumbo warcpp @@ -124,6 +134,11 @@ target_link_libraries(pisa trecpp Porter2 KrovetzStemmer + QMX + streamvbyte + MaskedVByte + simdcomp + FastPFor ) target_include_directories(pisa PUBLIC external) diff --git a/benchmarks/perftest_interpolative.cpp b/benchmarks/perftest_interpolative.cpp index 416adacd1..9bbe4fa22 100644 --- a/benchmarks/perftest_interpolative.cpp +++ b/benchmarks/perftest_interpolative.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "spdlog/spdlog.h" diff --git a/include/pisa/codec/VarIntG8IU.h b/include/pisa/codec/VarIntG8IU.h index 0ea3bedcd..b2aed704c 100644 --- a/include/pisa/codec/VarIntG8IU.h +++ b/include/pisa/codec/VarIntG8IU.h @@ -4,106 +4,109 @@ * Apache License Version 2.0 http://www.apache.org/licenses/. */ #if defined(_MSC_VER) -#include + #include #else -#include + #include #endif #ifdef __GNUC__ -#define PREDICT_FALSE(x) (__builtin_expect(x, 0)) + #define PREDICT_FALSE(x) (__builtin_expect(x, 0)) #else -#define PREDICT_FALSE(x) x + #define PREDICT_FALSE(x) x #endif +#include +#include +#include +#include + namespace pisa { -class NotEnoughStorage : public std::runtime_error { -public: - size_t required; // number of 32-bit symbols required - NotEnoughStorage(const size_t req) - : runtime_error(""), required(req){ +class NotEnoughStorage: public std::runtime_error { + public: + size_t required; // number of 32-bit symbols required + explicit NotEnoughStorage(const size_t req) + : runtime_error(""), + required(req){ - }; + }; }; class IntegerCODEC { -public: - /** - * You specify input and input length, as well as - * output and output length. nvalue gets modified to - * reflect how much was used. If the new value of - * nvalue is more than the original value, we can - * consider this a buffer overrun. - * - * You are responsible for allocating the memory (length - * for *in and nvalue for *out). - */ - virtual void encodeArray(const uint32_t *in, const size_t length, - uint32_t *out, size_t &nvalue) = 0; + public: + /** + * You specify input and input length, as well as + * output and output length. nvalue gets modified to + * reflect how much was used. If the new value of + * nvalue is more than the original value, we can + * consider this a buffer overrun. + * + * You are responsible for allocating the memory (length + * for *in and nvalue for *out). + */ + virtual void + encodeArray(const uint32_t* in, const size_t length, uint32_t* out, size_t& nvalue) = 0; - /** - * Usage is similar to decodeArray except that it returns a pointer - * incremented from in. In theory it should be in+length. If the - * returned pointer is less than in+length, then this generally means - * that the decompression is not finished (some scheme compress - * the bulk of the data one way, and they then they compress remaining - * integers using another scheme). - * - * As with encodeArray, you need to have length element allocated - * for *in and at least nvalue elements allocated for out. The value - * of the variable nvalue gets updated with the number actually use - * (if nvalue exceeds the original value, there might be a buffer - * overrun). - */ - virtual const uint32_t *decodeArray(const uint32_t *in, const size_t length, - uint32_t *out, size_t &nvalue) = 0; - virtual ~IntegerCODEC() {} + /** + * Usage is similar to decodeArray except that it returns a pointer + * incremented from in. In theory it should be in+length. If the + * returned pointer is less than in+length, then this generally means + * that the decompression is not finished (some scheme compress + * the bulk of the data one way, and they then they compress remaining + * integers using another scheme). + * + * As with encodeArray, you need to have length element allocated + * for *in and at least nvalue elements allocated for out. The value + * of the variable nvalue gets updated with the number actually use + * (if nvalue exceeds the original value, there might be a buffer + * overrun). + */ + virtual const uint32_t* + decodeArray(const uint32_t* in, const size_t length, uint32_t* out, size_t& nvalue) = 0; + virtual ~IntegerCODEC() {} - /** - * Will compress the content of a vector into - * another vector. - * - * This is offered for convenience. It might be slow. - */ - virtual std::vector compress(const std::vector &data) { - std::vector compresseddata(data.size() * 2 + - 1024); // allocate plenty of memory - size_t memavailable = compresseddata.size(); - encodeArray(&data[0], data.size(), &compresseddata[0], memavailable); - compresseddata.resize(memavailable); - return compresseddata; - } + /** + * Will compress the content of a vector into + * another vector. + * + * This is offered for convenience. It might be slow. + */ + virtual std::vector compress(const std::vector& data) + { + std::vector compresseddata(data.size() * 2 + 1024); // allocate plenty of memory + size_t memavailable = compresseddata.size(); + encodeArray(&data[0], data.size(), &compresseddata[0], memavailable); + compresseddata.resize(memavailable); + return compresseddata; + } - /** - * Will uncompress the content of a vector into - * another vector. Some CODECs know exactly how much data to uncompress, - * others need to uncompress it all to know how data there is to uncompress... - * So it useful to have a hint (expected_uncompressed_size) that tells how - * much data there will be to uncompress. Otherwise, the code will - * try to guess, but the result is uncertain and inefficient. You really - * ought to keep track of how many symbols you had compressed. - * - * For convenience. Might be slow. - */ - virtual std::vector - uncompress(const std::vector &compresseddata, - size_t expected_uncompressed_size = 0) { - std::vector data( - expected_uncompressed_size); // allocate plenty of memory - size_t memavailable = data.size(); - try { - decodeArray(&compresseddata[0], compresseddata.size(), &data[0], - memavailable); - } catch (NotEnoughStorage &nes) { - data.resize(nes.required + 1024); - decodeArray(&compresseddata[0], compresseddata.size(), &data[0], - memavailable); + /** + * Will uncompress the content of a vector into + * another vector. Some CODECs know exactly how much data to uncompress, + * others need to uncompress it all to know how data there is to uncompress... + * So it useful to have a hint (expected_uncompressed_size) that tells how + * much data there will be to uncompress. Otherwise, the code will + * try to guess, but the result is uncertain and inefficient. You really + * ought to keep track of how many symbols you had compressed. + * + * For convenience. Might be slow. + */ + virtual std::vector + uncompress(const std::vector& compresseddata, size_t expected_uncompressed_size = 0) + { + std::vector data(expected_uncompressed_size); // allocate plenty of memory + size_t memavailable = data.size(); + try { + decodeArray(&compresseddata[0], compresseddata.size(), &data[0], memavailable); + } catch (NotEnoughStorage& nes) { + data.resize(nes.required + 1024); + decodeArray(&compresseddata[0], compresseddata.size(), &data[0], memavailable); + } + data.resize(memavailable); + return data; } - data.resize(memavailable); - return data; - } - virtual std::string name() const = 0; + virtual std::string name() const = 0; }; /** @@ -123,177 +126,170 @@ class IntegerCODEC { * * */ -class VarIntG8IU : public IntegerCODEC { +class VarIntG8IU: public IntegerCODEC { + public: + // For all possible values of the + // descriptor we build a table of any shuffle sequence + // that might be needed at decode time. + VarIntG8IU() + { + char mask[256][32]; + for (int desc = 0; desc <= 255; desc++) { + int bitmask = 0x00000001; + int bitindex = 0; + // count number of 0 in the char + int complete = 0; + int ithSize[8]; + int lastpos = -1; + while (bitindex < 8) { + if ((desc & bitmask) == 0) { + ithSize[complete] = bitindex - lastpos; + lastpos = bitindex; + complete++; + } + bitindex++; + bitmask = bitmask << 1; + } + maskOutputSize[desc] = complete; -public: - // For all possible values of the - // descriptor we build a table of any shuffle sequence - // that might be needed at decode time. - VarIntG8IU() { - char mask[256][32]; - for (int desc = 0; desc <= 255; desc++) { - int bitmask = 0x00000001; - int bitindex = 0; - // count number of 0 in the char - int complete = 0; - int ithSize[8]; - int lastpos = -1; - while (bitindex < 8) { - if ((desc & bitmask) == 0) { - ithSize[complete] = bitindex - lastpos; - lastpos = bitindex; - complete++; + int j = 0; + int k = 0; + for (int i = 0; i < complete; i++) { + for (int n = 0; n < 4; n++) { + if (n < ithSize[i]) { + mask[desc][k] = static_cast(j); + j = j + 1; + } else { + mask[desc][k] = -1; + } + k = k + 1; + } + } } - bitindex++; - bitmask = bitmask << 1; - } - maskOutputSize[desc] = complete; - - int j = 0; - int k = 0; - for (int i = 0; i < complete; i++) { - for (int n = 0; n < 4; n++) { - if (n < ithSize[i]) { - mask[desc][k] = static_cast(j); - j = j + 1; - } else { - mask[desc][k] = -1; - } - k = k + 1; + for (int desc = 0; desc <= 255; desc++) { + vecmask[desc][0] = _mm_lddqu_si128(reinterpret_cast<__m128i const*>(mask[desc])); + vecmask[desc][1] = _mm_lddqu_si128(reinterpret_cast<__m128i const*>(mask[desc] + 16)); } - } } - for (int desc = 0; desc <= 255; desc++) { - vecmask[desc][0] = - _mm_lddqu_si128(reinterpret_cast<__m128i const *>(mask[desc])); - vecmask[desc][1] = - _mm_lddqu_si128(reinterpret_cast<__m128i const *>(mask[desc] + 16)); - } - } - void encodeArray(const uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue) { - const uint32_t *src = in; - size_t srclength = length * 4; + void encodeArray(const uint32_t* in, const size_t length, uint32_t* out, size_t& nvalue) + { + const uint32_t* src = in; + size_t srclength = length * 4; - unsigned char *dst = reinterpret_cast(out); - nvalue = nvalue * 4; + unsigned char* dst = reinterpret_cast(out); + nvalue = nvalue * 4; - size_t compressed_size = 0; - while (srclength > 0 && nvalue >= 9) { - compressed_size += encodeBlock(src, srclength, dst, nvalue); + size_t compressed_size = 0; + while (srclength > 0 && nvalue >= 9) { + compressed_size += encodeBlock(src, srclength, dst, nvalue); + } + // Ouput might not be a multiple of 4 so we make it so + nvalue = ((compressed_size + 3) / 4); } - // Ouput might not be a multiple of 4 so we make it so - nvalue = ((compressed_size + 3) / 4); - } - const uint32_t *decodeArray(const uint32_t *in, const size_t length, - uint32_t *out, size_t &nvalue) { + const uint32_t* decodeArray(const uint32_t* in, const size_t length, uint32_t* out, size_t& nvalue) + { + const unsigned char* src = reinterpret_cast(in); + const uint32_t* const initdst = out; - const unsigned char *src = reinterpret_cast(in); - const uint32_t *const initdst = out; + uint32_t* dst = out; + size_t srclength = length * 4; + for (; srclength >= 22; srclength -= 8, src += 8) { + unsigned char desc = *src; + src += 1; + srclength -= 1; + const __m128i data = _mm_lddqu_si128(reinterpret_cast<__m128i const*>(src)); + const __m128i result = _mm_shuffle_epi8(data, vecmask[desc][0]); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), result); + int readSize = maskOutputSize[desc]; - uint32_t *dst = out; - size_t srclength = length * 4; - for (; srclength >= 22; srclength -= 8, src += 8) { - unsigned char desc = *src; - src += 1; - srclength -= 1; - const __m128i data = - _mm_lddqu_si128(reinterpret_cast<__m128i const *>(src)); - const __m128i result = _mm_shuffle_epi8(data, vecmask[desc][0]); - _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), result); - int readSize = maskOutputSize[desc]; + if (readSize > 4) { + const __m128i result2 = + _mm_shuffle_epi8(data, vecmask[desc][1]); //__builtin_ia32_pshufb128(data, + // shf2); + _mm_storeu_si128( + reinterpret_cast<__m128i*>(dst + 4), + result2); //__builtin_ia32_storedqu(dst + (16), result2); + } + dst += readSize; + } + while (srclength >= 9) { + unsigned char desc = *src; + src += 1; + srclength -= 1; + char buff[32]; + memcpy(buff, src, 8); + const __m128i data = _mm_lddqu_si128(reinterpret_cast<__m128i const*>(buff)); + const __m128i result = _mm_shuffle_epi8(data, vecmask[desc][0]); + _mm_storeu_si128(reinterpret_cast<__m128i*>(buff), result); + int readSize = maskOutputSize[desc]; + if (readSize > 4) { + const __m128i result2 = _mm_shuffle_epi8(data, vecmask[desc][1]); + _mm_storeu_si128(reinterpret_cast<__m128i*>(buff + 16), result2); + } + memcpy(dst, buff, 4 * readSize); + dst += readSize; + srclength -= 8; + src += 8; + } - if (readSize > 4) { - const __m128i result2 = _mm_shuffle_epi8( - data, vecmask[desc][1]); //__builtin_ia32_pshufb128(data, shf2); - _mm_storeu_si128( - reinterpret_cast<__m128i *>(dst + 4), - result2); //__builtin_ia32_storedqu(dst + (16), result2); - } - dst += readSize; - } - while (srclength >= 9) { - unsigned char desc = *src; - src += 1; - srclength -= 1; - char buff[32]; - memcpy(buff, src, 8); - const __m128i data = - _mm_lddqu_si128(reinterpret_cast<__m128i const *>(buff)); - const __m128i result = _mm_shuffle_epi8(data, vecmask[desc][0]); - _mm_storeu_si128(reinterpret_cast<__m128i *>(buff), result); - int readSize = maskOutputSize[desc]; - if (readSize > 4) { - const __m128i result2 = _mm_shuffle_epi8(data, vecmask[desc][1]); - _mm_storeu_si128(reinterpret_cast<__m128i *>(buff + 16), result2); - } - memcpy(dst, buff, 4 * readSize); - dst += readSize; - srclength -= 8; - src += 8; + nvalue = (dst - initdst); + return reinterpret_cast((reinterpret_cast(src) + 3) & ~3); } - nvalue = (dst - initdst); - return reinterpret_cast((reinterpret_cast(src) + 3) & - ~3); - } - - virtual std::string name() const { return std::string("VarIntG8IU"); } + virtual std::string name() const { return std::string("VarIntG8IU"); } - int encodeBlock(const uint32_t *&src, size_t &srclength, unsigned char *&dest, - size_t &dstlength) { - unsigned char desc = 0xFF; - unsigned char bitmask = 0x01; - uint32_t buffer[8]; - int ithSize[8]; - int length = 0; - int numInt = 0; + int encodeBlock(const uint32_t*& src, size_t& srclength, unsigned char*& dest, size_t& dstlength) + { + unsigned char desc = 0xFF; + unsigned char bitmask = 0x01; + uint32_t buffer[8]; + int ithSize[8]; + int length = 0; + int numInt = 0; - while (srclength > 0) { - const uint32_t *temp = src; - int byteNeeded = getNumByteNeeded(*temp); + while (srclength > 0) { + const uint32_t* temp = src; + int byteNeeded = getNumByteNeeded(*temp); - if (PREDICT_FALSE(length + byteNeeded > 8)) { - break; - } + if (PREDICT_FALSE(length + byteNeeded > 8)) { + break; + } - // flip the correct bit in desc - bitmask = static_cast(bitmask << (byteNeeded - 1)); - desc = desc ^ bitmask; - bitmask = static_cast(bitmask << 1); + // flip the correct bit in desc + bitmask = static_cast(bitmask << (byteNeeded - 1)); + desc = desc ^ bitmask; + bitmask = static_cast(bitmask << 1); - ithSize[numInt] = byteNeeded; - length += byteNeeded; - buffer[numInt] = *temp; - src = src + 1; - srclength -= 4; - numInt++; - } + ithSize[numInt] = byteNeeded; + length += byteNeeded; + buffer[numInt] = *temp; + src = src + 1; + srclength -= 4; + numInt++; + } - dest[0] = desc; - int written = 1; - for (int i = 0; i < numInt; i++) { - int size = ithSize[i]; - uint32_t value = buffer[i]; - for (int j = 0; j < size; j++) { - dest[written] = static_cast(value >> (j * 8)); - written++; - } + dest[0] = desc; + int written = 1; + for (int i = 0; i < numInt; i++) { + int size = ithSize[i]; + uint32_t value = buffer[i]; + for (int j = 0; j < size; j++) { + dest[written] = static_cast(value >> (j * 8)); + written++; + } + } + dest += 9; + dstlength -= 9; + return 9; } - dest += 9; - dstlength -= 9; - return 9; - } -protected: - int maskOutputSize[256]; - __m128i vecmask[256][2]; + protected: + int maskOutputSize[256]; + __m128i vecmask[256][2]; - int getNumByteNeeded(const uint32_t val) { - return ((__builtin_clz(val | 255) ^ 31) >> 3) + 1; - } + int getNumByteNeeded(const uint32_t val) { return ((__builtin_clz(val | 255) ^ 31) >> 3) + 1; } }; -} // namespace FastPFor +} // namespace pisa diff --git a/include/pisa/codec/block_codecs.hpp b/include/pisa/codec/block_codecs.hpp index feeb9fc7b..d4bd71234 100644 --- a/include/pisa/codec/block_codecs.hpp +++ b/include/pisa/codec/block_codecs.hpp @@ -1,8 +1,5 @@ #pragma once -#include "FastPFor/headers/optpfor.h" -#include "FastPFor/headers/variablebyte.h" - #include "VarIntG8IU.h" #include "interpolative_coding.hpp" #include "util/compiler_attribute.hpp" @@ -169,85 +166,16 @@ struct interpolative_block { }; struct optpfor_block { - struct codec_type: FastPForLib::OPTPFor<4, FastPForLib::Simple16> { - uint8_t const* force_b{nullptr}; - - uint32_t findBestB(const uint32_t* in, uint32_t len) - { - // trick to force the choice of b from a parameter - if (force_b != nullptr) { - return *force_b; - } - - // this is mostly a cut&paste from FastPFor, but we stop the - // optimization early as the b to test becomes larger than maxb - uint32_t b = 0; - uint32_t bsize = std::numeric_limits::max(); - const uint32_t mb = FastPForLib::maxbits(in, in + len); - uint32_t i = 0; - while (mb > 28 + possLogs[i]) { - ++i; // some schemes such as Simple16 don't code numbers greater than 28 - } - - for (; i < possLogs.size(); i++) { - if (possLogs[i] > mb && possLogs[i] >= mb) { - break; - } - const uint32_t csize = tryB(possLogs[i], in, len); - - if (csize <= bsize) { - b = possLogs[i]; - bsize = csize; - } - } - return b; - } - }; - - static const uint64_t block_size = codec_type::BlockSize; + static const uint64_t block_size; static void encode( uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out, - uint8_t const* b = nullptr) // if non-null forces b - { - thread_local codec_type optpfor_codec; - thread_local std::vector buf(2 * 4 * block_size); - assert(n <= block_size); - - if (n < block_size) { - interpolative_block::encode(in, sum_of_values, n, out); - return; - } - - size_t out_len = buf.size(); - - optpfor_codec.force_b = b; - optpfor_codec.encodeBlock(in, reinterpret_cast(buf.data()), out_len); - out_len *= 4; - out.insert(out.end(), buf.data(), buf.data() + out_len); - } - - static uint8_t const* PISA_NOINLINE - decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) - { - thread_local codec_type optpfor_codec; // pfor decoding is *not* thread-safe - assert(n <= block_size); - - if (PISA_UNLIKELY(n < block_size)) { - return interpolative_block::decode(in, out, sum_of_values, n); - } - - size_t out_len = block_size; - uint8_t const* ret; + uint8_t const* b = nullptr); - ret = reinterpret_cast( - optpfor_codec.decodeBlock(reinterpret_cast(in), out, out_len)); - assert(out_len == n); - return ret; - } + static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n); }; struct varint_G8IU_block { diff --git a/include/pisa/codec/maskedvbyte.hpp b/include/pisa/codec/maskedvbyte.hpp index 9512e01e9..b12a59e62 100644 --- a/include/pisa/codec/maskedvbyte.hpp +++ b/include/pisa/codec/maskedvbyte.hpp @@ -1,35 +1,15 @@ #pragma once +#include +#include #include -#include "MaskedVByte/include/varintdecode.h" -#include "MaskedVByte/include/varintencode.h" -#include "codec/block_codecs.hpp" -#include "util/util.hpp" - namespace pisa { + struct maskedvbyte_block { - static const uint64_t block_size = 128; - static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) - { - assert(n <= block_size); - auto* src = const_cast(in); - if (n < block_size) { - interpolative_block::encode(src, sum_of_values, n, out); - return; - } - thread_local std::vector buf(2 * block_size * sizeof(uint32_t)); - size_t out_len = vbyte_encode(src, n, buf.data()); - out.insert(out.end(), buf.data(), buf.data() + out_len); - } - static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) - { - assert(n <= block_size); - if (PISA_UNLIKELY(n < block_size)) { - return interpolative_block::decode(in, out, sum_of_values, n); - } - auto read = masked_vbyte_decode(in, out, n); - return in + read; - } + static const uint64_t block_size; + static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out); + static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n); }; + } // namespace pisa diff --git a/include/pisa/codec/qmx.hpp b/include/pisa/codec/qmx.hpp index 3fc10bbda..12e7492d0 100644 --- a/include/pisa/codec/qmx.hpp +++ b/include/pisa/codec/qmx.hpp @@ -1,44 +1,18 @@ #pragma once -#include "QMX/qmx.hpp" -#include "codec/block_codecs.hpp" +#include +#include +#include namespace pisa { + struct qmx_block { - static const uint64_t block_size = 128; - static const uint64_t overflow = 512; + static const uint64_t block_size; + static const uint64_t overflow; - static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) - { - assert(n <= block_size); - auto* src = const_cast(in); - if (n < block_size) { - interpolative_block::encode(src, sum_of_values, n, out); - return; - } - thread_local QMX::compress_integer_qmx_improved qmx_codec; - thread_local std::vector buf(2 * n * sizeof(uint32_t) + overflow); + static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out); - size_t out_len = qmx_codec.encode(buf.data(), buf.size(), in, n); - TightVariableByte::encode_single(out_len, out); - out.insert(out.end(), buf.data(), buf.data() + out_len); - } - static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) - { - static QMX::compress_integer_qmx_improved qmx_codec; // decodeBlock is thread-safe - assert(n <= block_size); - if (PISA_UNLIKELY(n < block_size)) { - return interpolative_block::decode(in, out, sum_of_values, n); - } - uint32_t enc_len = 0; - in = TightVariableByte::decode(in, &enc_len, 1); - std::vector buf(2 * n + overflow); - qmx_codec.decode(buf.data(), n, in, enc_len); - for (size_t i = 0; i < n; ++i) { - *out = buf[i]; - ++out; - } - return in + enc_len; - } + static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n); }; + } // namespace pisa diff --git a/include/pisa/codec/simdbp.hpp b/include/pisa/codec/simdbp.hpp index 42ba654a8..e20118ccc 100644 --- a/include/pisa/codec/simdbp.hpp +++ b/include/pisa/codec/simdbp.hpp @@ -1,40 +1,15 @@ #pragma once -#include "codec/block_codecs.hpp" -#include "util/util.hpp" +#include +#include #include -extern "C" { -#include "simdcomp/include/simdbitpacking.h" -} - namespace pisa { + struct simdbp_block { - static const uint64_t block_size = 128; - static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) - { - assert(n <= block_size); - auto* src = const_cast(in); - if (n < block_size) { - interpolative_block::encode(src, sum_of_values, n, out); - return; - } - uint32_t b = maxbits(in); - thread_local std::vector buf(8 * n); - uint8_t* buf_ptr = buf.data(); - *buf_ptr++ = b; - simdpackwithoutmask(src, (__m128i*)buf_ptr, b); - out.insert(out.end(), buf.data(), buf.data() + b * sizeof(__m128i) + 1); - } - static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) - { - assert(n <= block_size); - if (PISA_UNLIKELY(n < block_size)) { - return interpolative_block::decode(in, out, sum_of_values, n); - } - uint32_t b = *in++; - simdunpack((const __m128i*)in, out, b); - return in + b * sizeof(__m128i); - } + static const uint64_t block_size; + static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out); + static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n); }; + } // namespace pisa diff --git a/include/pisa/codec/simple16.hpp b/include/pisa/codec/simple16.hpp index 3c08ca823..b07fc8e3d 100644 --- a/include/pisa/codec/simple16.hpp +++ b/include/pisa/codec/simple16.hpp @@ -1,36 +1,19 @@ #pragma once -#include "FastPFor/headers/simple16.h" + +#include +#include +#include namespace pisa { struct simple16_block { - static const uint64_t block_size = 128; + static const uint64_t block_size; static void - encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out) - { - assert(n <= block_size); - thread_local FastPForLib::Simple16 codec; - thread_local std::vector buf(2 * 8 * block_size); - size_t out_len = buf.size(); - codec.encodeArray(in, n, reinterpret_cast(buf.data()), out_len); - out_len *= 4; - out.insert(out.end(), buf.data(), buf.data() + out_len); - } + encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out); static uint8_t const* - decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n) - { - assert(n <= block_size); - FastPForLib::Simple16 codec; - std::vector buf(2 * block_size); - - auto const* ret = reinterpret_cast( - codec.decodeArray(reinterpret_cast(in), 8 * n, buf.data(), n)); - for (size_t i = 0; i < n; ++i) { - *out++ = buf[i]; - } - return ret; - } + decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n); }; + } // namespace pisa diff --git a/include/pisa/codec/simple8b.hpp b/include/pisa/codec/simple8b.hpp index 52c307b3e..234d47834 100644 --- a/include/pisa/codec/simple8b.hpp +++ b/include/pisa/codec/simple8b.hpp @@ -1,30 +1,19 @@ #pragma once -#include "FastPFor/headers/simple8b.h" + +#include +#include +#include namespace pisa { struct simple8b_block { - static const uint64_t block_size = 128; + static const uint64_t block_size; static void - encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out) - { - assert(n <= block_size); - thread_local FastPForLib::Simple8b codec; - thread_local std::vector buf(2 * 8 * block_size); - size_t out_len = buf.size(); - codec.encodeArray(in, n, reinterpret_cast(buf.data()), out_len); - out_len *= 4; - out.insert(out.end(), buf.data(), buf.data() + out_len); - } + encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out); static uint8_t const* - decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n) - { - assert(n <= block_size); - FastPForLib::Simple8b codec; - return reinterpret_cast( - codec.decodeArray(reinterpret_cast(in), 8 * n, out, n)); - } + decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n); }; + } // namespace pisa diff --git a/include/pisa/codec/streamvbyte.hpp b/include/pisa/codec/streamvbyte.hpp index c584da841..adaa958fd 100644 --- a/include/pisa/codec/streamvbyte.hpp +++ b/include/pisa/codec/streamvbyte.hpp @@ -1,29 +1,17 @@ #pragma once -#include +#include +#include #include -#include "streamvbyte/include/streamvbyte.h" - namespace pisa { struct streamvbyte_block { - static const uint64_t block_size = 128; + static const uint64_t block_size; static void - encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out) - { - assert(n <= block_size); - auto* src = const_cast(in); - thread_local std::vector buf(streamvbyte_max_compressedbytes(block_size)); - size_t out_len = streamvbyte_encode(src, n, buf.data()); - out.insert(out.end(), buf.data(), buf.data() + out_len); - } + encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out); static uint8_t const* - decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n) - { - assert(n <= block_size); - auto read = streamvbyte_decode(in, out, n); - return in + read; - } + decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n); }; + } // namespace pisa diff --git a/include/pisa/codec/varintgb.hpp b/include/pisa/codec/varintgb.hpp index b2b791956..7439c4be4 100644 --- a/include/pisa/codec/varintgb.hpp +++ b/include/pisa/codec/varintgb.hpp @@ -2,8 +2,6 @@ #include "codec/block_codecs.hpp" #include -#include "FastPFor/headers/common.h" - using namespace std; namespace pisa { diff --git a/src/codec/maskedvbyte.cpp b/src/codec/maskedvbyte.cpp new file mode 100644 index 000000000..ca656b76c --- /dev/null +++ b/src/codec/maskedvbyte.cpp @@ -0,0 +1,39 @@ +#include "codec/maskedvbyte.hpp" + +#include "MaskedVByte/include/varintdecode.h" +#include "MaskedVByte/include/varintencode.h" +#include "codec/block_codecs.hpp" +#include "util/util.hpp" + +#include + +namespace pisa { + +const uint64_t maskedvbyte_block::block_size = 128; + +void maskedvbyte_block::encode( + uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) +{ + assert(n <= block_size); + auto* src = const_cast(in); + if (n < block_size) { + interpolative_block::encode(src, sum_of_values, n, out); + return; + } + thread_local std::vector buf(2 * block_size * sizeof(uint32_t)); + size_t out_len = vbyte_encode(src, n, buf.data()); + out.insert(out.end(), buf.data(), buf.data() + out_len); +} + +uint8_t const* +maskedvbyte_block::decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) +{ + assert(n <= block_size); + if (PISA_UNLIKELY(n < block_size)) { + return interpolative_block::decode(in, out, sum_of_values, n); + } + auto read = masked_vbyte_decode(in, out, n); + return in + read; +} + +} // namespace pisa diff --git a/src/codec/optpfor.cpp b/src/codec/optpfor.cpp new file mode 100644 index 000000000..cd566047a --- /dev/null +++ b/src/codec/optpfor.cpp @@ -0,0 +1,86 @@ +#include "codec/block_codecs.hpp" + +#include "FastPFor/headers/optpfor.h" + +namespace pisa { + +struct codec_type: FastPForLib::OPTPFor<4, FastPForLib::Simple16> { + uint8_t const* force_b{nullptr}; + + uint32_t findBestB(const uint32_t* in, uint32_t len) + { + // trick to force the choice of b from a parameter + if (force_b != nullptr) { + return *force_b; + } + + // this is mostly a cut&paste from FastPFor, but we stop the + // optimization early as the b to test becomes larger than maxb + uint32_t b = 0; + uint32_t bsize = std::numeric_limits::max(); + const uint32_t mb = FastPForLib::maxbits(in, in + len); + uint32_t i = 0; + while (mb > 28 + possLogs[i]) { + ++i; // some schemes such as Simple16 don't code numbers greater than 28 + } + + for (; i < possLogs.size(); i++) { + if (possLogs[i] > mb && possLogs[i] >= mb) { + break; + } + const uint32_t csize = tryB(possLogs[i], in, len); + + if (csize <= bsize) { + b = possLogs[i]; + bsize = csize; + } + } + return b; + } +}; + +const uint64_t optpfor_block::block_size = codec_type::BlockSize; + +void optpfor_block::encode( + uint32_t const* in, + uint32_t sum_of_values, + size_t n, + std::vector& out, + uint8_t const* b) // if non-null forces b +{ + thread_local codec_type optpfor_codec; + thread_local std::vector buf(2 * 4 * block_size); + assert(n <= block_size); + + if (n < block_size) { + interpolative_block::encode(in, sum_of_values, n, out); + return; + } + + size_t out_len = buf.size(); + + optpfor_codec.force_b = b; + optpfor_codec.encodeBlock(in, reinterpret_cast(buf.data()), out_len); + out_len *= 4; + out.insert(out.end(), buf.data(), buf.data() + out_len); +} + +uint8_t const* optpfor_block::decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) +{ + thread_local codec_type optpfor_codec; // pfor decoding is *not* thread-safe + assert(n <= block_size); + + if (PISA_UNLIKELY(n < block_size)) { + return interpolative_block::decode(in, out, sum_of_values, n); + } + + size_t out_len = block_size; + uint8_t const* ret; + + ret = reinterpret_cast( + optpfor_codec.decodeBlock(reinterpret_cast(in), out, out_len)); + assert(out_len == n); + return ret; +} + +} // namespace pisa diff --git a/src/codec/qmx.cpp b/src/codec/qmx.cpp new file mode 100644 index 000000000..4f332fb92 --- /dev/null +++ b/src/codec/qmx.cpp @@ -0,0 +1,46 @@ +#include "codec/qmx.hpp" + +#include "QMX/qmx.hpp" +#include "codec/block_codecs.hpp" + +namespace pisa { + +const uint64_t qmx_block::block_size = 128; +const uint64_t qmx_block::overflow = 512; + +void qmx_block::encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) +{ + assert(n <= qmx_block::block_size); + auto* src = const_cast(in); + if (n < qmx_block::block_size) { + interpolative_block::encode(src, sum_of_values, n, out); + return; + } + thread_local QMX::compress_integer_qmx_improved qmx_codec; + thread_local std::vector buf(2 * n * sizeof(std::uint32_t) + overflow); + + size_t out_len = qmx_codec.encode(buf.data(), buf.size(), in, n); + TightVariableByte::encode_single(out_len, out); + out.insert(out.end(), buf.data(), buf.data() + out_len); +} + +auto qmx_block::decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) + -> uint8_t const* +{ + static QMX::compress_integer_qmx_improved qmx_codec; // decodeBlock is thread-safe + assert(n <= qmx_block::block_size); + if (PISA_UNLIKELY(n < qmx_block::block_size)) { + return interpolative_block::decode(in, out, sum_of_values, n); + } + std::uint32_t enc_len = 0; + in = TightVariableByte::decode(in, &enc_len, 1); + std::vector buf(2 * n + qmx_block::overflow); + qmx_codec.decode(buf.data(), n, in, enc_len); + for (size_t i = 0; i < n; ++i) { + *out = buf[i]; + ++out; + } + return in + enc_len; +} + +} // namespace pisa diff --git a/src/codec/simdbp.cpp b/src/codec/simdbp.cpp new file mode 100644 index 000000000..9d5c54ec4 --- /dev/null +++ b/src/codec/simdbp.cpp @@ -0,0 +1,42 @@ +#include "codec/simdbp.hpp" + +#include "codec/block_codecs.hpp" +#include "util/util.hpp" +#include + +extern "C" { +#include "simdcomp/include/simdbitpacking.h" +} + +namespace pisa { + +const uint64_t simdbp_block::block_size = 128; + +void simdbp_block::encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) +{ + assert(n <= block_size); + auto* src = const_cast(in); + if (n < block_size) { + interpolative_block::encode(src, sum_of_values, n, out); + return; + } + uint32_t b = maxbits(in); + thread_local std::vector buf(8 * n); + uint8_t* buf_ptr = buf.data(); + *buf_ptr++ = b; + simdpackwithoutmask(src, (__m128i*)buf_ptr, b); + out.insert(out.end(), buf.data(), buf.data() + b * sizeof(__m128i) + 1); +} + +uint8_t const* simdbp_block::decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) +{ + assert(n <= block_size); + if (PISA_UNLIKELY(n < block_size)) { + return interpolative_block::decode(in, out, sum_of_values, n); + } + uint32_t b = *in++; + simdunpack((const __m128i*)in, out, b); + return in + b * sizeof(__m128i); +} + +} // namespace pisa diff --git a/src/codec/simple16.cpp b/src/codec/simple16.cpp new file mode 100644 index 000000000..be4dc0c2c --- /dev/null +++ b/src/codec/simple16.cpp @@ -0,0 +1,36 @@ +#include "codec/simple16.hpp" + +#include "FastPFor/headers/simple16.h" + +namespace pisa { + +const uint64_t simple16_block::block_size = 128; + +void simple16_block::encode( + uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out) +{ + assert(n <= block_size); + thread_local FastPForLib::Simple16 codec; + thread_local std::vector buf(2 * 8 * block_size); + size_t out_len = buf.size(); + codec.encodeArray(in, n, reinterpret_cast(buf.data()), out_len); + out_len *= 4; + out.insert(out.end(), buf.data(), buf.data() + out_len); +} + +uint8_t const* +simple16_block::decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n) +{ + assert(n <= block_size); + FastPForLib::Simple16 codec; + std::vector buf(2 * block_size); + + auto const* ret = reinterpret_cast( + codec.decodeArray(reinterpret_cast(in), 8 * n, buf.data(), n)); + for (size_t i = 0; i < n; ++i) { + *out++ = buf[i]; + } + return ret; +} + +} // namespace pisa diff --git a/src/codec/simple8b.cpp b/src/codec/simple8b.cpp new file mode 100644 index 000000000..7597df4c9 --- /dev/null +++ b/src/codec/simple8b.cpp @@ -0,0 +1,30 @@ +#include "codec/simple8b.hpp" + +#include "FastPFor/headers/simple8b.h" + +namespace pisa { + +const uint64_t simple8b_block::block_size = 128; + +void simple8b_block::encode( + uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out) +{ + assert(n <= block_size); + thread_local FastPForLib::Simple8b codec; + thread_local std::vector buf(2 * 8 * block_size); + size_t out_len = buf.size(); + codec.encodeArray(in, n, reinterpret_cast(buf.data()), out_len); + out_len *= 4; + out.insert(out.end(), buf.data(), buf.data() + out_len); +} + +uint8_t const* +simple8b_block::decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n) +{ + assert(n <= block_size); + FastPForLib::Simple8b codec; + return reinterpret_cast( + codec.decodeArray(reinterpret_cast(in), 8 * n, out, n)); +} + +} // namespace pisa diff --git a/src/codec/streamvbyte.cpp b/src/codec/streamvbyte.cpp new file mode 100644 index 000000000..bd35a4451 --- /dev/null +++ b/src/codec/streamvbyte.cpp @@ -0,0 +1,30 @@ +#include "codec/streamvbyte.hpp" + +#include +#include + +#include "streamvbyte/include/streamvbyte.h" + +namespace pisa { + +const uint64_t streamvbyte_block::block_size = 128; + +void streamvbyte_block::encode( + uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out) +{ + assert(n <= block_size); + auto* src = const_cast(in); + thread_local std::vector buf(streamvbyte_max_compressedbytes(block_size)); + size_t out_len = streamvbyte_encode(src, n, buf.data()); + out.insert(out.end(), buf.data(), buf.data() + out_len); +} + +uint8_t const* +streamvbyte_block::decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n) +{ + assert(n <= block_size); + auto read = streamvbyte_decode(in, out, n); + return in + read; +} + +} // namespace pisa From 9712a730ff30fb1c8f42021fd460917d0931654c Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Mon, 14 Jun 2021 14:48:30 -0400 Subject: [PATCH 2/3] Missing include --- CMakeLists.txt | 1 - include/pisa/forward_index.hpp | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fcf480e7b..f784cacc0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,7 +126,6 @@ target_link_libraries(pisa fmt::fmt range-v3 taily - # These should be made private in the future: PRIVATE gumbo::gumbo warcpp diff --git a/include/pisa/forward_index.hpp b/include/pisa/forward_index.hpp index 4a4af032d..1382f2574 100644 --- a/include/pisa/forward_index.hpp +++ b/include/pisa/forward_index.hpp @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include From f5945bb59924c66c184867ab1328120ef13e2edf Mon Sep 17 00:00:00 2001 From: J Mackenzie Date: Tue, 15 Jun 2021 10:52:20 +1000 Subject: [PATCH 3/3] Fix flag --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f784cacc0..7dd1aed1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ option(PISA_USE_PIC "Enable Position-Independent code globally" ON) option(PISA_CI_BUILD "Remove debug information from Debug build" ON) option(PISA_ENABLE_IPO "Enable Interprocedural Optimization, aka Link Time Optimization (LTO)" OFF) -if(ENABLE_IPO) +if(PISA_ENABLE_IPO) include(CheckIPOSupported) check_ipo_supported( RESULT