From cfa93984c31be632fe401dd79326154d9e0d385f Mon Sep 17 00:00:00 2001
From: Michal Siedlaczek <siedlaczek@pm.me>
Date: Mon, 14 Jun 2021 14:31:46 -0400
Subject: [PATCH 1/3] Move codec implementations to cpp and make codec
 dependencies private

---
 CMakeLists.txt                        |  25 +-
 benchmarks/perftest_interpolative.cpp |   1 +
 include/pisa/codec/VarIntG8IU.h       | 458 +++++++++++++-------------
 include/pisa/codec/block_codecs.hpp   |  78 +----
 include/pisa/codec/maskedvbyte.hpp    |  34 +-
 include/pisa/codec/qmx.hpp            |  44 +--
 include/pisa/codec/simdbp.hpp         |  39 +--
 include/pisa/codec/simple16.hpp       |  33 +-
 include/pisa/codec/simple8b.hpp       |  27 +-
 include/pisa/codec/streamvbyte.hpp    |  24 +-
 include/pisa/codec/varintgb.hpp       |   2 -
 src/codec/maskedvbyte.cpp             |  39 +++
 src/codec/optpfor.cpp                 |  86 +++++
 src/codec/qmx.cpp                     |  46 +++
 src/codec/simdbp.cpp                  |  42 +++
 src/codec/simple16.cpp                |  36 ++
 src/codec/simple8b.cpp                |  30 ++
 src/codec/streamvbyte.cpp             |  30 ++
 18 files changed, 605 insertions(+), 469 deletions(-)
 create mode 100644 src/codec/maskedvbyte.cpp
 create mode 100644 src/codec/optpfor.cpp
 create mode 100644 src/codec/qmx.cpp
 create mode 100644 src/codec/simdbp.cpp
 create mode 100644 src/codec/simple16.cpp
 create mode 100644 src/codec/simple8b.cpp
 create mode 100644 src/codec/streamvbyte.cpp
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 69b252d28..fcf480e7b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,6 +14,21 @@ option(PISA_ENABLE_CLANG_TIDY "Enable static analysis with clang-tidy" OFF)
 option(PISA_CLANG_TIDY_EXECUTABLE "clang-tidy executable path" "clang-tidy")
 option(PISA_USE_PIC "Enable Position-Independent code globally" ON)
 option(PISA_CI_BUILD "Remove debug information from Debug build" ON)
+option(PISA_ENABLE_IPO "Enable Interprocedural Optimization, aka Link Time Optimization (LTO)" OFF)
+
+# Turn on IPO/LTO only when the user requested it through PISA_ENABLE_IPO
+# and the active toolchain reports that it supports it.
+if(PISA_ENABLE_IPO)
+    include(CheckIPOSupported)
+    # ipo_supported receives the verdict; ipo_output the diagnostic text on failure.
+    check_ipo_supported(RESULT ipo_supported OUTPUT ipo_output)
+    if(ipo_supported)
+        # Becomes the default INTERPROCEDURAL_OPTIMIZATION of targets created later.
+        set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
+    else()
+        message(SEND_ERROR "IPO is not supported: ${ipo_output}")
+    endif()
+endif()
 
 if(PISA_USE_PIC)
     set(CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -112,11 +127,6 @@ target_link_libraries(pisa
         range-v3
         taily
     # These should be made private in the future:
-        FastPFor
-        streamvbyte
-        MaskedVByte
-        simdcomp
-        QMX
     PRIVATE
         gumbo::gumbo
         warcpp
@@ -124,6 +134,11 @@ target_link_libraries(pisa
         trecpp
         Porter2
         KrovetzStemmer
+        QMX
+        streamvbyte
+        MaskedVByte
+        simdcomp
+        FastPFor
 )
 target_include_directories(pisa PUBLIC external)
 
diff --git a/benchmarks/perftest_interpolative.cpp b/benchmarks/perftest_interpolative.cpp
index 416adacd1..9bbe4fa22 100644
--- a/benchmarks/perftest_interpolative.cpp
+++ b/benchmarks/perftest_interpolative.cpp
@@ -1,5 +1,6 @@
 #include <algorithm>
 #include <iostream>
+#include <numeric>
 
 #include "spdlog/spdlog.h"
 
diff --git a/include/pisa/codec/VarIntG8IU.h b/include/pisa/codec/VarIntG8IU.h
index 0ea3bedcd..b2aed704c 100644
--- a/include/pisa/codec/VarIntG8IU.h
+++ b/include/pisa/codec/VarIntG8IU.h
@@ -4,106 +4,109 @@
  * Apache License Version 2.0 http://www.apache.org/licenses/.
  */
 #if defined(_MSC_VER)
-#include <intrin.h>
+    #include <intrin.h>
 #else
-#include <x86intrin.h>
+    #include <x86intrin.h>
 #endif
 
 #ifdef __GNUC__
-#define PREDICT_FALSE(x) (__builtin_expect(x, 0))
+    #define PREDICT_FALSE(x) (__builtin_expect(x, 0))
 #else
-#define PREDICT_FALSE(x) x
+    #define PREDICT_FALSE(x) x
 #endif
 
+#include <cstring>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
 namespace pisa {
 
-class NotEnoughStorage : public std::runtime_error {
-public:
-  size_t required; // number of 32-bit symbols required
-  NotEnoughStorage(const size_t req)
-      : runtime_error(""), required(req){
+class NotEnoughStorage: public std::runtime_error {
+  public:
+    size_t required;  // number of 32-bit symbols required
+    explicit NotEnoughStorage(const size_t req)
+        : runtime_error(""),
+          required(req){
 
-                           };
+          };
 };
 
 class IntegerCODEC {
-public:
-  /**
-   * You specify input and input length, as well as
-   * output and output length. nvalue gets modified to
-   * reflect how much was used. If the new value of
-   * nvalue is more than the original value, we can
-   * consider this a buffer overrun.
-   *
-   * You are responsible for allocating the memory (length
-   * for *in and nvalue for *out).
-   */
-  virtual void encodeArray(const uint32_t *in, const size_t length,
-                           uint32_t *out, size_t &nvalue) = 0;
+  public:
+    /**
+     * You specify input and input length, as well as
+     * output and output length. nvalue gets modified to
+     * reflect how much was used. If the new value of
+     * nvalue is more than the original value, we can
+     * consider this a buffer overrun.
+     *
+     * You are responsible for allocating the memory (length
+     * for *in and nvalue for *out).
+     */
+    virtual void
+    encodeArray(const uint32_t* in, const size_t length, uint32_t* out, size_t& nvalue) = 0;
 
-  /**
-   * Usage is similar to decodeArray except that it returns a pointer
-   * incremented from in. In theory it should be in+length. If the
-   * returned pointer is less than in+length, then this generally means
-   * that the decompression is not finished (some scheme compress
-   * the bulk of the data one way, and they then they compress remaining
-   * integers using another scheme).
-   *
-   * As with encodeArray, you need to have length element allocated
-   * for *in and at least nvalue elements allocated for out. The value
-   * of the variable nvalue gets updated with the number actually use
-   * (if nvalue exceeds the original value, there might be a buffer
-   * overrun).
-   */
-  virtual const uint32_t *decodeArray(const uint32_t *in, const size_t length,
-                                      uint32_t *out, size_t &nvalue) = 0;
-  virtual ~IntegerCODEC() {}
+    /**
+     * Usage is similar to encodeArray except that it returns a pointer
+     * incremented from in. In theory it should be in+length. If the
+     * returned pointer is less than in+length, then this generally means
+     * that the decompression is not finished (some schemes compress
+     * the bulk of the data one way, and then they compress the remaining
+     * integers using another scheme).
+     *
+     * As with encodeArray, you need to have length elements allocated
+     * for *in and at least nvalue elements allocated for out. The value
+     * of the variable nvalue gets updated with the number actually used
+     * (if nvalue exceeds the original value, there might be a buffer
+     * overrun).
+     */
+    virtual const uint32_t*
+    decodeArray(const uint32_t* in, const size_t length, uint32_t* out, size_t& nvalue) = 0;
+    virtual ~IntegerCODEC() {}
 
-  /**
-   * Will compress the content of a vector into
-   * another vector.
-   *
-   * This is offered for convenience. It might be slow.
-   */
-  virtual std::vector<uint32_t> compress(const std::vector<uint32_t> &data) {
-    std::vector<uint32_t> compresseddata(data.size() * 2 +
-                                         1024); // allocate plenty of memory
-    size_t memavailable = compresseddata.size();
-    encodeArray(&data[0], data.size(), &compresseddata[0], memavailable);
-    compresseddata.resize(memavailable);
-    return compresseddata;
-  }
+    /**
+     * Will compress the content of a vector into
+     * another vector.
+     *
+     * This is offered for convenience. It might be slow.
+     */
+    virtual std::vector<uint32_t> compress(const std::vector<uint32_t>& data)
+    {
+        std::vector<uint32_t> compresseddata(data.size() * 2 + 1024);  // allocate plenty of memory
+        size_t memavailable = compresseddata.size();
+        encodeArray(&data[0], data.size(), &compresseddata[0], memavailable);
+        compresseddata.resize(memavailable);
+        return compresseddata;
+    }
 
-  /**
-   * Will uncompress the content of a vector into
-   * another vector. Some CODECs know exactly how much data to uncompress,
-   * others need to uncompress it all to know how data there is to uncompress...
-   * So it useful to have a hint (expected_uncompressed_size) that tells how
-   * much data there will be to uncompress. Otherwise, the code will
-   * try to guess, but the result is uncertain and inefficient. You really
-   * ought to keep track of how many symbols you had compressed.
-   *
-   * For convenience. Might be slow.
-   */
-  virtual std::vector<uint32_t>
-  uncompress(const std::vector<uint32_t> &compresseddata,
-             size_t expected_uncompressed_size = 0) {
-    std::vector<uint32_t> data(
-        expected_uncompressed_size); // allocate plenty of memory
-    size_t memavailable = data.size();
-    try {
-      decodeArray(&compresseddata[0], compresseddata.size(), &data[0],
-                  memavailable);
-    } catch (NotEnoughStorage &nes) {
-      data.resize(nes.required + 1024);
-      decodeArray(&compresseddata[0], compresseddata.size(), &data[0],
-                  memavailable);
+    /**
+     * Will uncompress the content of a vector into
+     * another vector. Some CODECs know exactly how much data to uncompress,
+     * others need to uncompress it all to know how much data there is to uncompress...
+     * So it is useful to have a hint (expected_uncompressed_size) that tells how
+     * much data there will be to uncompress. Otherwise, the code will
+     * try to guess, but the result is uncertain and inefficient. You really
+     * ought to keep track of how many symbols you had compressed.
+     *
+     * For convenience. Might be slow.
+     */
+    virtual std::vector<uint32_t>
+    uncompress(const std::vector<uint32_t>& compresseddata, size_t expected_uncompressed_size = 0)
+    {
+        std::vector<uint32_t> data(expected_uncompressed_size);  // allocate plenty of memory
+        size_t memavailable = data.size();
+        try {
+            decodeArray(&compresseddata[0], compresseddata.size(), &data[0], memavailable);
+        } catch (NotEnoughStorage& nes) {
+            data.resize(nes.required + 1024);
+            decodeArray(&compresseddata[0], compresseddata.size(), &data[0], memavailable);
+        }
+        data.resize(memavailable);
+        return data;
     }
-    data.resize(memavailable);
-    return data;
-  }
 
-  virtual std::string name() const = 0;
+    virtual std::string name() const = 0;
 };
 
 /**
@@ -123,177 +126,170 @@ class IntegerCODEC {
  *
  *
  */
-class VarIntG8IU : public IntegerCODEC {
+class VarIntG8IU: public IntegerCODEC {
+  public:
+    // For all possible values of the
+    // descriptor we build a table of any shuffle sequence
+    // that might be needed at decode time.
+    VarIntG8IU()
+    {
+        char mask[256][32];
+        for (int desc = 0; desc <= 255; desc++) {
+            int bitmask = 0x00000001;
+            int bitindex = 0;
+            // count number of 0 in the char
+            int complete = 0;
+            int ithSize[8];
+            int lastpos = -1;
+            while (bitindex < 8) {
+                if ((desc & bitmask) == 0) {
+                    ithSize[complete] = bitindex - lastpos;
+                    lastpos = bitindex;
+                    complete++;
+                }
+                bitindex++;
+                bitmask = bitmask << 1;
+            }
+            maskOutputSize[desc] = complete;
 
-public:
-  // For all possible values of the
-  // descriptor we build a table of any shuffle sequence
-  // that might be needed at decode time.
-  VarIntG8IU() {
-    char mask[256][32];
-    for (int desc = 0; desc <= 255; desc++) {
-      int bitmask = 0x00000001;
-      int bitindex = 0;
-      // count number of 0 in the char
-      int complete = 0;
-      int ithSize[8];
-      int lastpos = -1;
-      while (bitindex < 8) {
-        if ((desc & bitmask) == 0) {
-          ithSize[complete] = bitindex - lastpos;
-          lastpos = bitindex;
-          complete++;
+            int j = 0;
+            int k = 0;
+            for (int i = 0; i < complete; i++) {
+                for (int n = 0; n < 4; n++) {
+                    if (n < ithSize[i]) {
+                        mask[desc][k] = static_cast<unsigned char>(j);
+                        j = j + 1;
+                    } else {
+                        mask[desc][k] = -1;
+                    }
+                    k = k + 1;
+                }
+            }
         }
-        bitindex++;
-        bitmask = bitmask << 1;
-      }
-      maskOutputSize[desc] = complete;
-
-      int j = 0;
-      int k = 0;
-      for (int i = 0; i < complete; i++) {
-        for (int n = 0; n < 4; n++) {
-          if (n < ithSize[i]) {
-            mask[desc][k] = static_cast<unsigned char>(j);
-            j = j + 1;
-          } else {
-            mask[desc][k] = -1;
-          }
-          k = k + 1;
+        for (int desc = 0; desc <= 255; desc++) {
+            vecmask[desc][0] = _mm_lddqu_si128(reinterpret_cast<__m128i const*>(mask[desc]));
+            vecmask[desc][1] = _mm_lddqu_si128(reinterpret_cast<__m128i const*>(mask[desc] + 16));
         }
-      }
     }
-    for (int desc = 0; desc <= 255; desc++) {
-      vecmask[desc][0] =
-          _mm_lddqu_si128(reinterpret_cast<__m128i const *>(mask[desc]));
-      vecmask[desc][1] =
-          _mm_lddqu_si128(reinterpret_cast<__m128i const *>(mask[desc] + 16));
-    }
-  }
 
-  void encodeArray(const uint32_t *in, const size_t length, uint32_t *out,
-                   size_t &nvalue) {
-    const uint32_t *src = in;
-    size_t srclength = length * 4;
+    void encodeArray(const uint32_t* in, const size_t length, uint32_t* out, size_t& nvalue)
+    {
+        const uint32_t* src = in;
+        size_t srclength = length * 4;
 
-    unsigned char *dst = reinterpret_cast<unsigned char *>(out);
-    nvalue = nvalue * 4;
+        unsigned char* dst = reinterpret_cast<unsigned char*>(out);
+        nvalue = nvalue * 4;
 
-    size_t compressed_size = 0;
-    while (srclength > 0 && nvalue >= 9) {
-      compressed_size += encodeBlock(src, srclength, dst, nvalue);
+        size_t compressed_size = 0;
+        while (srclength > 0 && nvalue >= 9) {
+            compressed_size += encodeBlock(src, srclength, dst, nvalue);
+        }
+        // Output might not be a multiple of 4 so we make it so
+        nvalue = ((compressed_size + 3) / 4);
     }
-    // Ouput might not be a multiple of 4 so we make it so
-    nvalue = ((compressed_size + 3) / 4);
-  }
 
-  const uint32_t *decodeArray(const uint32_t *in, const size_t length,
-                              uint32_t *out, size_t &nvalue) {
+    const uint32_t* decodeArray(const uint32_t* in, const size_t length, uint32_t* out, size_t& nvalue)
+    {
+        const unsigned char* src = reinterpret_cast<const unsigned char*>(in);
+        const uint32_t* const initdst = out;
 
-    const unsigned char *src = reinterpret_cast<const unsigned char *>(in);
-    const uint32_t *const initdst = out;
+        uint32_t* dst = out;
+        size_t srclength = length * 4;
+        for (; srclength >= 22; srclength -= 8, src += 8) {
+            unsigned char desc = *src;
+            src += 1;
+            srclength -= 1;
+            const __m128i data = _mm_lddqu_si128(reinterpret_cast<__m128i const*>(src));
+            const __m128i result = _mm_shuffle_epi8(data, vecmask[desc][0]);
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), result);
+            int readSize = maskOutputSize[desc];
 
-    uint32_t *dst = out;
-    size_t srclength = length * 4;
-    for (; srclength >= 22; srclength -= 8, src += 8) {
-      unsigned char desc = *src;
-      src += 1;
-      srclength -= 1;
-      const __m128i data =
-          _mm_lddqu_si128(reinterpret_cast<__m128i const *>(src));
-      const __m128i result = _mm_shuffle_epi8(data, vecmask[desc][0]);
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), result);
-      int readSize = maskOutputSize[desc];
+            if (readSize > 4) {
+                const __m128i result2 =
+                    _mm_shuffle_epi8(data, vecmask[desc][1]);  //__builtin_ia32_pshufb128(data,
+                                                               // shf2);
+                _mm_storeu_si128(
+                    reinterpret_cast<__m128i*>(dst + 4),
+                    result2);  //__builtin_ia32_storedqu(dst + (16), result2);
+            }
+            dst += readSize;
+        }
+        while (srclength >= 9) {
+            unsigned char desc = *src;
+            src += 1;
+            srclength -= 1;
+            char buff[32];
+            memcpy(buff, src, 8);
+            const __m128i data = _mm_lddqu_si128(reinterpret_cast<__m128i const*>(buff));
+            const __m128i result = _mm_shuffle_epi8(data, vecmask[desc][0]);
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(buff), result);
+            int readSize = maskOutputSize[desc];
+            if (readSize > 4) {
+                const __m128i result2 = _mm_shuffle_epi8(data, vecmask[desc][1]);
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(buff + 16), result2);
+            }
+            memcpy(dst, buff, 4 * readSize);
+            dst += readSize;
+            srclength -= 8;
+            src += 8;
+        }
 
-      if (readSize > 4) {
-        const __m128i result2 = _mm_shuffle_epi8(
-            data, vecmask[desc][1]); //__builtin_ia32_pshufb128(data, shf2);
-        _mm_storeu_si128(
-            reinterpret_cast<__m128i *>(dst + 4),
-            result2); //__builtin_ia32_storedqu(dst + (16), result2);
-      }
-      dst += readSize;
-    }
-    while (srclength >= 9) {
-      unsigned char desc = *src;
-      src += 1;
-      srclength -= 1;
-      char buff[32];
-      memcpy(buff, src, 8);
-      const __m128i data =
-          _mm_lddqu_si128(reinterpret_cast<__m128i const *>(buff));
-      const __m128i result = _mm_shuffle_epi8(data, vecmask[desc][0]);
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(buff), result);
-      int readSize = maskOutputSize[desc];
-      if (readSize > 4) {
-        const __m128i result2 = _mm_shuffle_epi8(data, vecmask[desc][1]);
-        _mm_storeu_si128(reinterpret_cast<__m128i *>(buff + 16), result2);
-      }
-      memcpy(dst, buff, 4 * readSize);
-      dst += readSize;
-      srclength -= 8;
-      src += 8;
+        nvalue = (dst - initdst);
+        return reinterpret_cast<uint32_t*>((reinterpret_cast<uintptr_t>(src) + 3) & ~3);
     }
 
-    nvalue = (dst - initdst);
-    return reinterpret_cast<uint32_t *>((reinterpret_cast<uintptr_t>(src) + 3) &
-                                        ~3);
-  }
-
-  virtual std::string name() const { return std::string("VarIntG8IU"); }
+    virtual std::string name() const { return std::string("VarIntG8IU"); }
 
-  int encodeBlock(const uint32_t *&src, size_t &srclength, unsigned char *&dest,
-                  size_t &dstlength) {
-    unsigned char desc = 0xFF;
-    unsigned char bitmask = 0x01;
-    uint32_t buffer[8];
-    int ithSize[8];
-    int length = 0;
-    int numInt = 0;
+    int encodeBlock(const uint32_t*& src, size_t& srclength, unsigned char*& dest, size_t& dstlength)
+    {
+        unsigned char desc = 0xFF;
+        unsigned char bitmask = 0x01;
+        uint32_t buffer[8];
+        int ithSize[8];
+        int length = 0;
+        int numInt = 0;
 
-    while (srclength > 0) {
-      const uint32_t *temp = src;
-      int byteNeeded = getNumByteNeeded(*temp);
+        while (srclength > 0) {
+            const uint32_t* temp = src;
+            int byteNeeded = getNumByteNeeded(*temp);
 
-      if (PREDICT_FALSE(length + byteNeeded > 8)) {
-        break;
-      }
+            if (PREDICT_FALSE(length + byteNeeded > 8)) {
+                break;
+            }
 
-      // flip the correct bit in desc
-      bitmask = static_cast<unsigned char>(bitmask << (byteNeeded - 1));
-      desc = desc ^ bitmask;
-      bitmask = static_cast<unsigned char>(bitmask << 1);
+            // flip the correct bit in desc
+            bitmask = static_cast<unsigned char>(bitmask << (byteNeeded - 1));
+            desc = desc ^ bitmask;
+            bitmask = static_cast<unsigned char>(bitmask << 1);
 
-      ithSize[numInt] = byteNeeded;
-      length += byteNeeded;
-      buffer[numInt] = *temp;
-      src = src + 1;
-      srclength -= 4;
-      numInt++;
-    }
+            ithSize[numInt] = byteNeeded;
+            length += byteNeeded;
+            buffer[numInt] = *temp;
+            src = src + 1;
+            srclength -= 4;
+            numInt++;
+        }
 
-    dest[0] = desc;
-    int written = 1;
-    for (int i = 0; i < numInt; i++) {
-      int size = ithSize[i];
-      uint32_t value = buffer[i];
-      for (int j = 0; j < size; j++) {
-        dest[written] = static_cast<unsigned char>(value >> (j * 8));
-        written++;
-      }
+        dest[0] = desc;
+        int written = 1;
+        for (int i = 0; i < numInt; i++) {
+            int size = ithSize[i];
+            uint32_t value = buffer[i];
+            for (int j = 0; j < size; j++) {
+                dest[written] = static_cast<unsigned char>(value >> (j * 8));
+                written++;
+            }
+        }
+        dest += 9;
+        dstlength -= 9;
+        return 9;
     }
-    dest += 9;
-    dstlength -= 9;
-    return 9;
-  }
 
-protected:
-  int maskOutputSize[256];
-  __m128i vecmask[256][2];
+  protected:
+    int maskOutputSize[256];
+    __m128i vecmask[256][2];
 
-  int getNumByteNeeded(const uint32_t val) {
-    return ((__builtin_clz(val | 255) ^ 31) >> 3) + 1;
-  }
+    int getNumByteNeeded(const uint32_t val) { return ((__builtin_clz(val | 255) ^ 31) >> 3) + 1; }
 };
 
-} // namespace FastPFor
+}  // namespace pisa
diff --git a/include/pisa/codec/block_codecs.hpp b/include/pisa/codec/block_codecs.hpp
index feeb9fc7b..d4bd71234 100644
--- a/include/pisa/codec/block_codecs.hpp
+++ b/include/pisa/codec/block_codecs.hpp
@@ -1,8 +1,5 @@
 #pragma once
 
-#include "FastPFor/headers/optpfor.h"
-#include "FastPFor/headers/variablebyte.h"
-
 #include "VarIntG8IU.h"
 #include "interpolative_coding.hpp"
 #include "util/compiler_attribute.hpp"
@@ -169,85 +166,16 @@ struct interpolative_block {
 };
 
 struct optpfor_block {
-    struct codec_type: FastPForLib::OPTPFor<4, FastPForLib::Simple16<false>> {
-        uint8_t const* force_b{nullptr};
-
-        uint32_t findBestB(const uint32_t* in, uint32_t len)
-        {
-            // trick to force the choice of b from a parameter
-            if (force_b != nullptr) {
-                return *force_b;
-            }
-
-            // this is mostly a cut&paste from FastPFor, but we stop the
-            // optimization early as the b to test becomes larger than maxb
-            uint32_t b = 0;
-            uint32_t bsize = std::numeric_limits<uint32_t>::max();
-            const uint32_t mb = FastPForLib::maxbits(in, in + len);
-            uint32_t i = 0;
-            while (mb > 28 + possLogs[i]) {
-                ++i;  // some schemes such as Simple16 don't code numbers greater than 28
-            }
-
-            for (; i < possLogs.size(); i++) {
-                if (possLogs[i] > mb && possLogs[i] >= mb) {
-                    break;
-                }
-                const uint32_t csize = tryB(possLogs[i], in, len);
-
-                if (csize <= bsize) {
-                    b = possLogs[i];
-                    bsize = csize;
-                }
-            }
-            return b;
-        }
-    };
-
-    static const uint64_t block_size = codec_type::BlockSize;
+    static const uint64_t block_size;
 
     static void encode(
         uint32_t const* in,
         uint32_t sum_of_values,
         size_t n,
         std::vector<uint8_t>& out,
-        uint8_t const* b = nullptr)  // if non-null forces b
-    {
-        thread_local codec_type optpfor_codec;
-        thread_local std::vector<uint8_t> buf(2 * 4 * block_size);
-        assert(n <= block_size);
-
-        if (n < block_size) {
-            interpolative_block::encode(in, sum_of_values, n, out);
-            return;
-        }
-
-        size_t out_len = buf.size();
-
-        optpfor_codec.force_b = b;
-        optpfor_codec.encodeBlock(in, reinterpret_cast<uint32_t*>(buf.data()), out_len);
-        out_len *= 4;
-        out.insert(out.end(), buf.data(), buf.data() + out_len);
-    }
-
-    static uint8_t const* PISA_NOINLINE
-    decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n)
-    {
-        thread_local codec_type optpfor_codec;  // pfor decoding is *not* thread-safe
-        assert(n <= block_size);
-
-        if (PISA_UNLIKELY(n < block_size)) {
-            return interpolative_block::decode(in, out, sum_of_values, n);
-        }
-
-        size_t out_len = block_size;
-        uint8_t const* ret;
+        uint8_t const* b = nullptr);
 
-        ret = reinterpret_cast<uint8_t const*>(
-            optpfor_codec.decodeBlock(reinterpret_cast<uint32_t const*>(in), out, out_len));
-        assert(out_len == n);
-        return ret;
-    }
+    static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n);
 };
 
 struct varint_G8IU_block {
diff --git a/include/pisa/codec/maskedvbyte.hpp b/include/pisa/codec/maskedvbyte.hpp
index 9512e01e9..b12a59e62 100644
--- a/include/pisa/codec/maskedvbyte.hpp
+++ b/include/pisa/codec/maskedvbyte.hpp
@@ -1,35 +1,15 @@
 #pragma once
 
+#include <cstdint>
+#include <cstdio>
 #include <vector>
 
-#include "MaskedVByte/include/varintdecode.h"
-#include "MaskedVByte/include/varintencode.h"
-#include "codec/block_codecs.hpp"
-#include "util/util.hpp"
-
 namespace pisa {
+
 struct maskedvbyte_block {
-    static const uint64_t block_size = 128;
-    static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out)
-    {
-        assert(n <= block_size);
-        auto* src = const_cast<uint32_t*>(in);
-        if (n < block_size) {
-            interpolative_block::encode(src, sum_of_values, n, out);
-            return;
-        }
-        thread_local std::vector<uint8_t> buf(2 * block_size * sizeof(uint32_t));
-        size_t out_len = vbyte_encode(src, n, buf.data());
-        out.insert(out.end(), buf.data(), buf.data() + out_len);
-    }
-    static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n)
-    {
-        assert(n <= block_size);
-        if (PISA_UNLIKELY(n < block_size)) {
-            return interpolative_block::decode(in, out, sum_of_values, n);
-        }
-        auto read = masked_vbyte_decode(in, out, n);
-        return in + read;
-    }
+    static const uint64_t block_size;
+    static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out);
+    static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n);
 };
+
 }  // namespace pisa
diff --git a/include/pisa/codec/qmx.hpp b/include/pisa/codec/qmx.hpp
index 3fc10bbda..12e7492d0 100644
--- a/include/pisa/codec/qmx.hpp
+++ b/include/pisa/codec/qmx.hpp
@@ -1,44 +1,18 @@
 #pragma once
 
-#include "QMX/qmx.hpp"
-#include "codec/block_codecs.hpp"
+#include <cstdint>
+#include <cstdio>
+#include <vector>
 
 namespace pisa {
+
 struct qmx_block {
-    static const uint64_t block_size = 128;
-    static const uint64_t overflow = 512;
+    static const uint64_t block_size;
+    static const uint64_t overflow;
 
-    static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out)
-    {
-        assert(n <= block_size);
-        auto* src = const_cast<uint32_t*>(in);
-        if (n < block_size) {
-            interpolative_block::encode(src, sum_of_values, n, out);
-            return;
-        }
-        thread_local QMX::compress_integer_qmx_improved qmx_codec;
-        thread_local std::vector<uint8_t> buf(2 * n * sizeof(uint32_t) + overflow);
+    static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out);
 
-        size_t out_len = qmx_codec.encode(buf.data(), buf.size(), in, n);
-        TightVariableByte::encode_single(out_len, out);
-        out.insert(out.end(), buf.data(), buf.data() + out_len);
-    }
-    static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n)
-    {
-        static QMX::compress_integer_qmx_improved qmx_codec;  // decodeBlock is thread-safe
-        assert(n <= block_size);
-        if (PISA_UNLIKELY(n < block_size)) {
-            return interpolative_block::decode(in, out, sum_of_values, n);
-        }
-        uint32_t enc_len = 0;
-        in = TightVariableByte::decode(in, &enc_len, 1);
-        std::vector<uint32_t> buf(2 * n + overflow);
-        qmx_codec.decode(buf.data(), n, in, enc_len);
-        for (size_t i = 0; i < n; ++i) {
-            *out = buf[i];
-            ++out;
-        }
-        return in + enc_len;
-    }
+    static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n);
 };
+
 }  // namespace pisa
diff --git a/include/pisa/codec/simdbp.hpp b/include/pisa/codec/simdbp.hpp
index 42ba654a8..e20118ccc 100644
--- a/include/pisa/codec/simdbp.hpp
+++ b/include/pisa/codec/simdbp.hpp
@@ -1,40 +1,15 @@
 #pragma once
 
-#include "codec/block_codecs.hpp"
-#include "util/util.hpp"
+#include <cstdint>
+#include <cstdio>
 #include <vector>
 
-extern "C" {
-#include "simdcomp/include/simdbitpacking.h"
-}
-
 namespace pisa {
+
 struct simdbp_block {
-    static const uint64_t block_size = 128;
-    static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out)
-    {
-        assert(n <= block_size);
-        auto* src = const_cast<uint32_t*>(in);
-        if (n < block_size) {
-            interpolative_block::encode(src, sum_of_values, n, out);
-            return;
-        }
-        uint32_t b = maxbits(in);
-        thread_local std::vector<uint8_t> buf(8 * n);
-        uint8_t* buf_ptr = buf.data();
-        *buf_ptr++ = b;
-        simdpackwithoutmask(src, (__m128i*)buf_ptr, b);
-        out.insert(out.end(), buf.data(), buf.data() + b * sizeof(__m128i) + 1);
-    }
-    static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n)
-    {
-        assert(n <= block_size);
-        if (PISA_UNLIKELY(n < block_size)) {
-            return interpolative_block::decode(in, out, sum_of_values, n);
-        }
-        uint32_t b = *in++;
-        simdunpack((const __m128i*)in, out, b);
-        return in + b * sizeof(__m128i);
-    }
+    static const uint64_t block_size;
+    static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out);
+    static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n);
 };
+
 }  // namespace pisa
diff --git a/include/pisa/codec/simple16.hpp b/include/pisa/codec/simple16.hpp
index 3c08ca823..b07fc8e3d 100644
--- a/include/pisa/codec/simple16.hpp
+++ b/include/pisa/codec/simple16.hpp
@@ -1,36 +1,19 @@
 #pragma once
-#include "FastPFor/headers/simple16.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
 
 namespace pisa {
 
 struct simple16_block {
-    static const uint64_t block_size = 128;
+    static const uint64_t block_size;  // defined in src/codec/simple16.cpp
 
     static void
-    encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector<uint8_t>& out)
-    {
-        assert(n <= block_size);
-        thread_local FastPForLib::Simple16<false> codec;
-        thread_local std::vector<uint8_t> buf(2 * 8 * block_size);
-        size_t out_len = buf.size();
-        codec.encodeArray(in, n, reinterpret_cast<uint32_t*>(buf.data()), out_len);
-        out_len *= 4;
-        out.insert(out.end(), buf.data(), buf.data() + out_len);
-    }
+    encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector<uint8_t>& out);
 
     static uint8_t const*
-    decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n)
-    {
-        assert(n <= block_size);
-        FastPForLib::Simple16<false> codec;
-        std::vector<uint32_t> buf(2 * block_size);
-
-        auto const* ret = reinterpret_cast<uint8_t const*>(
-            codec.decodeArray(reinterpret_cast<uint32_t const*>(in), 8 * n, buf.data(), n));
-        for (size_t i = 0; i < n; ++i) {
-            *out++ = buf[i];
-        }
-        return ret;
-    }
+    decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n);
 };
+
 }  // namespace pisa
diff --git a/include/pisa/codec/simple8b.hpp b/include/pisa/codec/simple8b.hpp
index 52c307b3e..234d47834 100644
--- a/include/pisa/codec/simple8b.hpp
+++ b/include/pisa/codec/simple8b.hpp
@@ -1,30 +1,19 @@
 #pragma once
-#include "FastPFor/headers/simple8b.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
 
 namespace pisa {
 
 struct simple8b_block {
-    static const uint64_t block_size = 128;
+    static const uint64_t block_size;  // defined in src/codec/simple8b.cpp
 
     static void
-    encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector<uint8_t>& out)
-    {
-        assert(n <= block_size);
-        thread_local FastPForLib::Simple8b<false> codec;
-        thread_local std::vector<uint8_t> buf(2 * 8 * block_size);
-        size_t out_len = buf.size();
-        codec.encodeArray(in, n, reinterpret_cast<uint32_t*>(buf.data()), out_len);
-        out_len *= 4;
-        out.insert(out.end(), buf.data(), buf.data() + out_len);
-    }
+    encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector<uint8_t>& out);
 
     static uint8_t const*
-    decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n)
-    {
-        assert(n <= block_size);
-        FastPForLib::Simple8b<false> codec;
-        return reinterpret_cast<uint8_t const*>(
-            codec.decodeArray(reinterpret_cast<uint32_t const*>(in), 8 * n, out, n));
-    }
+    decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n);
 };
+
 }  // namespace pisa
diff --git a/include/pisa/codec/streamvbyte.hpp b/include/pisa/codec/streamvbyte.hpp
index c584da841..adaa958fd 100644
--- a/include/pisa/codec/streamvbyte.hpp
+++ b/include/pisa/codec/streamvbyte.hpp
@@ -1,29 +1,17 @@
 #pragma once
 
-#include <cassert>
+#include <cstddef>
+#include <cstdint>
 #include <vector>
 
-#include "streamvbyte/include/streamvbyte.h"
-
 namespace pisa {
 
 struct streamvbyte_block {
-    static const uint64_t block_size = 128;
+    static const uint64_t block_size;  // defined in src/codec/streamvbyte.cpp
     static void
-    encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector<uint8_t>& out)
-    {
-        assert(n <= block_size);
-        auto* src = const_cast<uint32_t*>(in);
-        thread_local std::vector<uint8_t> buf(streamvbyte_max_compressedbytes(block_size));
-        size_t out_len = streamvbyte_encode(src, n, buf.data());
-        out.insert(out.end(), buf.data(), buf.data() + out_len);
-    }
+    encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector<uint8_t>& out);
     static uint8_t const*
-    decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n)
-    {
-        assert(n <= block_size);
-        auto read = streamvbyte_decode(in, out, n);
-        return in + read;
-    }
+    decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n);
 };
+
 }  // namespace pisa
diff --git a/include/pisa/codec/varintgb.hpp b/include/pisa/codec/varintgb.hpp
index b2b791956..7439c4be4 100644
--- a/include/pisa/codec/varintgb.hpp
+++ b/include/pisa/codec/varintgb.hpp
@@ -2,8 +2,6 @@
 #include "codec/block_codecs.hpp"
 #include <vector>
 
-#include "FastPFor/headers/common.h"
-
 using namespace std;
 
 namespace pisa {
diff --git a/src/codec/maskedvbyte.cpp b/src/codec/maskedvbyte.cpp
new file mode 100644
index 000000000..ca656b76c
--- /dev/null
+++ b/src/codec/maskedvbyte.cpp
@@ -0,0 +1,39 @@
+#include "codec/maskedvbyte.hpp"
+
+#include "MaskedVByte/include/varintdecode.h"
+#include "MaskedVByte/include/varintencode.h"
+#include "codec/block_codecs.hpp"
+#include "util/util.hpp"
+
+#include <vector>
+
+namespace pisa {
+
+const uint64_t maskedvbyte_block::block_size = 128;  // shorter blocks use interpolative coding
+
+void maskedvbyte_block::encode(
+    uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out)
+{
+    assert(n <= block_size);
+    auto* values = const_cast<uint32_t*>(in);
+    if (n >= block_size) {  // full block: MaskedVByte, staged in a per-thread scratch buffer
+        thread_local std::vector<uint8_t> scratch(2 * block_size * sizeof(uint32_t));
+        size_t const written = vbyte_encode(values, n, scratch.data());
+        out.insert(out.end(), scratch.data(), scratch.data() + written);
+        return;
+    }
+    interpolative_block::encode(values, sum_of_values, n, out);  // partial block
+}
+
+uint8_t const*
+maskedvbyte_block::decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n)
+{
+    assert(n <= block_size);
+    if (PISA_UNLIKELY(n < block_size)) {
+        // Partial blocks are written by interpolative_block in encode().
+        return interpolative_block::decode(in, out, sum_of_values, n);
+    }
+    return in + masked_vbyte_decode(in, out, n);  // return value used as bytes consumed
+}
+
+}  // namespace pisa
diff --git a/src/codec/optpfor.cpp b/src/codec/optpfor.cpp
new file mode 100644
index 000000000..cd566047a
--- /dev/null
+++ b/src/codec/optpfor.cpp
@@ -0,0 +1,86 @@
+#include "codec/block_codecs.hpp"
+
+#include "FastPFor/headers/optpfor.h"
+
+namespace pisa {
+
+struct codec_type: FastPForLib::OPTPFor<4, FastPForLib::Simple16<false>> {
+    // When non-null, findBestB returns *force_b instead of searching.
+    uint8_t const* force_b{nullptr};
+
+    // Chooses the bit width b: the forced value if set, otherwise the possLogs candidate
+    // with the smallest tryB() result (on ties the later candidate wins).
+    uint32_t findBestB(const uint32_t* in, uint32_t len)
+    {
+        if (force_b != nullptr) {
+            return *force_b;
+        }
+
+        // Mostly a cut&paste from FastPFor, except that the search stops
+        // as soon as the candidate b becomes larger than mb.
+        const uint32_t mb = FastPForLib::maxbits(in, in + len);
+        uint32_t best_b = 0;
+        uint32_t best_cost = std::numeric_limits<uint32_t>::max();
+
+        // some schemes such as Simple16 don't code numbers greater than 28
+        uint32_t idx = 0;
+        while (mb > 28 + possLogs[idx]) {
+            ++idx;
+        }
+
+        for (; idx < possLogs.size() && possLogs[idx] <= mb; ++idx) {
+            const uint32_t cost = tryB(possLogs[idx], in, len);
+            if (cost <= best_cost) {
+                best_b = possLogs[idx];
+                best_cost = cost;
+            }
+        }
+        return best_b;
+    }
+};
+
+const uint64_t optpfor_block::block_size = codec_type::BlockSize;  // inherited from OPTPFor
+
+// Appends one encoded block to `out`. Blocks shorter than block_size are handed to
+// interpolative_block; full blocks go through the per-thread OPTPFor codec.
+void optpfor_block::encode(
+    uint32_t const* in,
+    uint32_t sum_of_values,
+    size_t n,
+    std::vector<uint8_t>& out,
+    uint8_t const* b)  // if non-null forces b
+{
+    thread_local codec_type codec;
+    thread_local std::vector<uint8_t> scratch(2 * 4 * block_size);
+    assert(n <= block_size);
+
+    if (n < block_size) {
+        interpolative_block::encode(in, sum_of_values, n, out);
+        return;
+    }
+    codec.force_b = b;
+    size_t words = scratch.size();  // read back after encodeBlock as the output word count
+    codec.encodeBlock(in, reinterpret_cast<uint32_t*>(scratch.data()), words);
+    // The output is addressed as uint32_t words, so 4 bytes are copied per word.
+    out.insert(out.end(), scratch.data(), scratch.data() + words * 4);
+}
+
+uint8_t const* optpfor_block::decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n)
+{
+    // One codec per thread, because pfor decoding is *not* thread-safe.
+    thread_local codec_type codec;
+    assert(n <= block_size);
+
+    if (PISA_UNLIKELY(n < block_size)) {
+        // Short blocks are stored with interpolative coding (see encode()).
+        return interpolative_block::decode(in, out, sum_of_values, n);
+    }
+
+    size_t decoded = block_size;
+    auto const* words = reinterpret_cast<uint32_t const*>(in);
+    auto const* next = reinterpret_cast<uint8_t const*>(codec.decodeBlock(words, out, decoded));
+    assert(decoded == n);
+    return next;
+}
+
+}  // namespace pisa
diff --git a/src/codec/qmx.cpp b/src/codec/qmx.cpp
new file mode 100644
index 000000000..4f332fb92
--- /dev/null
+++ b/src/codec/qmx.cpp
@@ -0,0 +1,46 @@
+#include "codec/qmx.hpp"
+
+#include "QMX/qmx.hpp"
+#include "codec/block_codecs.hpp"
+
+namespace pisa {
+
+const uint64_t qmx_block::block_size = 128;
+const uint64_t qmx_block::overflow = 512;  // slack added to the QMX scratch buffers
+
+void qmx_block::encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out)
+{
+    assert(n <= block_size);
+    if (n < block_size) {
+        interpolative_block::encode(const_cast<std::uint32_t*>(in), sum_of_values, n, out);
+        return;
+    }
+    // n == block_size here (asserted upper bound, smaller n returned above).
+    thread_local QMX::compress_integer_qmx_improved codec;
+    thread_local std::vector<uint8_t> scratch(2 * n * sizeof(std::uint32_t) + overflow);
+    size_t encoded_bytes = codec.encode(scratch.data(), scratch.size(), in, n);
+    // The payload is preceded by its length so that decode() knows how far to advance.
+    TightVariableByte::encode_single(encoded_bytes, out);
+    out.insert(out.end(), scratch.data(), scratch.data() + encoded_bytes);
+}
+
+auto qmx_block::decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n)
+    -> uint8_t const*
+{
+    static QMX::compress_integer_qmx_improved codec;  // decodeBlock is thread-safe
+    assert(n <= block_size);
+    if (PISA_UNLIKELY(n < block_size)) {
+        return interpolative_block::decode(in, out, sum_of_values, n);
+    }
+    // Read the length prefix written by encode(), then decode into an oversized buffer.
+    std::uint32_t payload_bytes = 0;
+    uint8_t const* payload = TightVariableByte::decode(in, &payload_bytes, 1);
+    std::vector<std::uint32_t> decoded(2 * n + overflow);
+    codec.decode(decoded.data(), n, payload, payload_bytes);
+    for (size_t pos = 0; pos != n; ++pos) {
+        out[pos] = decoded[pos];  // hand back only the n requested values
+    }
+    return payload + payload_bytes;
+}
+
+}  // namespace pisa
diff --git a/src/codec/simdbp.cpp b/src/codec/simdbp.cpp
new file mode 100644
index 000000000..9d5c54ec4
--- /dev/null
+++ b/src/codec/simdbp.cpp
@@ -0,0 +1,42 @@
+#include "codec/simdbp.hpp"
+
+#include "codec/block_codecs.hpp"
+#include "util/util.hpp"
+#include <vector>
+
+extern "C" {
+#include "simdcomp/include/simdbitpacking.h"
+}
+
+namespace pisa {
+
+const uint64_t simdbp_block::block_size = 128;  // shorter blocks use interpolative coding
+
+void simdbp_block::encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out)
+{
+    assert(n <= block_size);
+    auto* values = const_cast<uint32_t*>(in);
+    if (n < block_size) {
+        interpolative_block::encode(values, sum_of_values, n, out);
+        return;
+    }
+    // Layout of a full block: one byte holding the bit width, then the packed payload.
+    uint32_t const bit_width = maxbits(in);
+    thread_local std::vector<uint8_t> scratch(8 * n);
+    scratch[0] = bit_width;
+    simdpackwithoutmask(values, reinterpret_cast<__m128i*>(scratch.data() + 1), bit_width);
+    out.insert(out.end(), scratch.data(), scratch.data() + bit_width * sizeof(__m128i) + 1);
+}
+
+uint8_t const* simdbp_block::decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n)
+{
+    assert(n <= block_size);
+    if (PISA_UNLIKELY(n < block_size)) {
+        return interpolative_block::decode(in, out, sum_of_values, n);
+    }
+    uint32_t const bit_width = *in;  // leading byte stores the bit width; payload follows
+    simdunpack(reinterpret_cast<__m128i const*>(in + 1), out, bit_width);
+    return in + 1 + bit_width * sizeof(__m128i);
+}
+
+}  // namespace pisa
diff --git a/src/codec/simple16.cpp b/src/codec/simple16.cpp
new file mode 100644
index 000000000..be4dc0c2c
--- /dev/null
+++ b/src/codec/simple16.cpp
@@ -0,0 +1,36 @@
+#include "codec/simple16.hpp"
+
+#include "FastPFor/headers/simple16.h"
+
+namespace pisa {
+
+const uint64_t simple16_block::block_size = 128;  // largest n accepted by encode/decode
+
+void simple16_block::encode(
+    uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector<uint8_t>& out)
+{
+    assert(n <= block_size);
+    thread_local FastPForLib::Simple16<false> simple16;
+    thread_local std::vector<uint8_t> scratch(2 * 8 * block_size);
+    size_t words = scratch.size();
+    simple16.encodeArray(in, n, reinterpret_cast<uint32_t*>(scratch.data()), words);
+    // The codec output is addressed as uint32_t words, so 4 bytes are copied per word.
+    out.insert(out.end(), scratch.data(), scratch.data() + words * 4);
+}
+
+uint8_t const*
+simple16_block::decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n)
+{
+    assert(n <= block_size);
+    FastPForLib::Simple16<false> simple16;
+    // Decode into a buffer twice the block size, then copy the n values to the caller.
+    std::vector<uint32_t> decoded(2 * block_size);
+    auto const* words = reinterpret_cast<uint32_t const*>(in);
+    auto const* next = simple16.decodeArray(words, 8 * n, decoded.data(), n);
+    for (size_t pos = 0; pos != n; ++pos) {
+        out[pos] = decoded[pos];
+    }
+    return reinterpret_cast<uint8_t const*>(next);
+}
+
+}  // namespace pisa
diff --git a/src/codec/simple8b.cpp b/src/codec/simple8b.cpp
new file mode 100644
index 000000000..7597df4c9
--- /dev/null
+++ b/src/codec/simple8b.cpp
@@ -0,0 +1,30 @@
+#include "codec/simple8b.hpp"
+
+#include "FastPFor/headers/simple8b.h"
+
+namespace pisa {
+
+const uint64_t simple8b_block::block_size = 128;  // largest n accepted by encode/decode
+
+void simple8b_block::encode(
+    uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector<uint8_t>& out)
+{
+    assert(n <= block_size);
+    thread_local FastPForLib::Simple8b<false> simple8b;
+    thread_local std::vector<uint8_t> scratch(2 * 8 * block_size);
+    size_t words = scratch.size();
+    simple8b.encodeArray(in, n, reinterpret_cast<uint32_t*>(scratch.data()), words);
+    // The codec output is addressed as uint32_t words, so 4 bytes are copied per word.
+    out.insert(out.end(), scratch.data(), scratch.data() + words * 4);
+}
+
+uint8_t const*
+simple8b_block::decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n)
+{
+    assert(n <= block_size);
+    FastPForLib::Simple8b<false> simple8b;
+    auto const* words = reinterpret_cast<uint32_t const*>(in);  // input viewed as 32-bit words
+    return reinterpret_cast<uint8_t const*>(simple8b.decodeArray(words, 8 * n, out, n));
+}
+
+}  // namespace pisa
diff --git a/src/codec/streamvbyte.cpp b/src/codec/streamvbyte.cpp
new file mode 100644
index 000000000..bd35a4451
--- /dev/null
+++ b/src/codec/streamvbyte.cpp
@@ -0,0 +1,30 @@
+#include "codec/streamvbyte.hpp"
+
+#include <cassert>
+#include <vector>
+
+#include "streamvbyte/include/streamvbyte.h"
+
+namespace pisa {
+
+const uint64_t streamvbyte_block::block_size = 128;  // largest n accepted by encode/decode
+
+void streamvbyte_block::encode(
+    uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector<uint8_t>& out)
+{
+    assert(n <= block_size);
+    // Scratch space sized once per thread for the largest block this codec accepts.
+    thread_local std::vector<uint8_t> scratch(streamvbyte_max_compressedbytes(block_size));
+    size_t const written = streamvbyte_encode(const_cast<uint32_t*>(in), n, scratch.data());
+    out.insert(out.end(), scratch.data(), scratch.data() + written);
+}
+
+uint8_t const*
+streamvbyte_block::decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n)
+{
+    assert(n <= block_size);
+    // The value returned by streamvbyte_decode is used as the input bytes consumed.
+    return in + streamvbyte_decode(in, out, n);
+}
+
+}  // namespace pisa

From 9712a730ff30fb1c8f42021fd460917d0931654c Mon Sep 17 00:00:00 2001
From: Michal Siedlaczek <siedlaczek@pm.me>
Date: Mon, 14 Jun 2021 14:48:30 -0400
Subject: [PATCH 2/3] Missing include

---
 CMakeLists.txt                 | 1 -
 include/pisa/forward_index.hpp | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fcf480e7b..f784cacc0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -126,7 +126,6 @@ target_link_libraries(pisa
         fmt::fmt
         range-v3
         taily
-    # These should be made private in the future:
     PRIVATE
         gumbo::gumbo
         warcpp
diff --git a/include/pisa/forward_index.hpp b/include/pisa/forward_index.hpp
index 4a4af032d..1382f2574 100644
--- a/include/pisa/forward_index.hpp
+++ b/include/pisa/forward_index.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <cstdint>
+#include <fstream>
 #include <string>
 #include <vector>
 

From f5945bb59924c66c184867ab1328120ef13e2edf Mon Sep 17 00:00:00 2001
From: J Mackenzie <JMMackenzie@users.noreply.github.com>
Date: Tue, 15 Jun 2021 10:52:20 +1000
Subject: [PATCH 3/3] Fix flag

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f784cacc0..7dd1aed1a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,7 @@ option(PISA_USE_PIC "Enable Position-Independent code globally" ON)
 option(PISA_CI_BUILD "Remove debug information from Debug build" ON)
 option(PISA_ENABLE_IPO "Enable Interprocedural Optimization, aka Link Time Optimization (LTO)" OFF)
 
-if(ENABLE_IPO)
+if(PISA_ENABLE_IPO)
     include(CheckIPOSupported)
     check_ipo_supported(
         RESULT