diff --git a/.travis.yml b/.travis.yml index 792a7c0..fdde70b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -78,6 +78,7 @@ script: - ./bin/test-ansv - ./bin/test-suffixtree - ./bin/test-gsa + - ./bin/test-desa - mpiexec -np 4 ./bin/test-psac - mpiexec -np 13 ./bin/test-psac - mpiexec -np 4 ./bin/test-ansv @@ -86,6 +87,8 @@ script: - mpiexec -np 13 ./bin/test-gsa - mpiexec -np 4 ./bin/test-gsa - mpiexec -np 4 ./bin/test-ss + - mpiexec -np 4 ./bin/test-desa + after_success: # only collect coverage if compiled with gcc diff --git a/.ycm_extra_conf.py b/.ycm_extra_conf.py index 34a8e50..5cc49cc 100644 --- a/.ycm_extra_conf.py +++ b/.ycm_extra_conf.py @@ -86,6 +86,10 @@ '-I', './ext', '-I', +'./ext/mxx', +'-I', +'../ext/mxx', +'-I', './ext/mxx/include', '-I', '../ext/mxx/include', diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a60bcb..1dbbc1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,134 +3,64 @@ cmake_minimum_required(VERSION 2.6) # project settings project(psac) +#### Options +OPTION(PSAC_BUILD_EXES "Build all psac executables (command line tools & benchmark scripts)" ON) +OPTION(PSAC_BUILD_TESTS "Build unit tests" ON) +OPTION(PSAC_ENABLE_COVERAGE "Enable code coverage reporting" OFF) + ##### General Compilation Settings # Initialize CXXFLAGS. -#set(CMAKE_CXX_FLAGS "-Wall -O0 -g --std=c++11 -D_GLIBCXX_DEBUG") -#set(CMAKE_CXX_FLAGS "-Wall -Wuninitialized -O0 -g --std=c++11") -#set(CMAKE_CXX_FLAGS "-Wall -O3 -funroll-loops -msse3 --std=c++11 -g -march=native") -#set(CMAKE_CXX_FLAGS "-Wall -O3 -funroll-loops -msse3 -DNDEBUG --std=c++11 -march=native") -#set(CMAKE_CXX_FLAGS "-Wall -O3 -DNDEBUG --std=c++11 -Wuninitialized -g") - - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wuninitialized --std=c++11") -set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -O0") set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -march=native") -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE} -g") - -# Add these standard paths to the search paths for FIND_LIBRARY -# to find libraries from these locations first -if(UNIX) - set(CMAKE_LIBRARY_PATH "${CMAKE_LIBRARY_PATH} /lib /usr/lib") -endif() - -# -------------------------------------------------------------- -# Indicate CMake 2.7 and above that we don't want to mix relative -# and absolute paths in linker lib lists. -# Run "cmake --help-policy CMP0003" for more information. -# -------------------------------------------------------------- -if(COMMAND cmake_policy) - cmake_policy(SET CMP0003 NEW) -endif() - - -#### MPI -find_package(MPI REQUIRED) -if (MPI_FOUND) - #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MPI_COMPILE_FLAGS}") - #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MPI_COMPILE_FLAGS}") - #set(CMAKE_LINK_FLAGS "${CMAKE_LINK_FLAGS} ${MPI_LINK_FLAGS}") - set(EXTRA_LIBS ${EXTRA_LIBS} ${MPI_LIBRARIES}) - include_directories(SYSTEM ${MPI_INCLUDE_PATH}) -else (MPI_FOUND) - message(SEND_ERROR "This application cannot compile without MPI") -endif (MPI_FOUND) - - - - -## add ScoreP support - - -option(ENABLE_PROFILING_SCOREP "Enable ScoreP profiling support." 
OFF) - -#if(ENABLE_PROFILING_SCOREP) -# message(STATUS "Profiling with ScoreP enabled.") -# -# execute_process(COMMAND scorep-config --cxxflags --nocompiler -# OUTPUT_VARIABLE SCOREP_CXX_FLAGS) -# ##RESULT_VARIABLE SCOREP_CONFIG_RETURN) -# #if(NOT SCOREP_CONFIG_RETURN EQUAL 0) -# # message(FATAL_ERROR "Can NOT execute 'scorep-config' at $ENV{SCOREP_ROOT}/bin/scorep-config - check file permissions") -# #endif() -# execute_process(COMMAND scorep-config --mpp=mpi --ldflags -# OUTPUT_VARIABLE SCOREP_LD_FLAGS) -# execute_process(COMMAND scorep-config --mpp=mpi --libs -# OUTPUT_VARIABLE SCOREP_LIBS) -# string(STRIP "${SCOREP_LIBS}" SCOREP_LIBS) -# -# #execute_process(COMMAND scorep-config --mpp=mpi --libs -# # OUTPUT_VARIABLE SCOREP_LIBFLAGS) -# #string(STRIP "${SCOREP_LIBFLAGS}" SCOREP_LIBFLAGS) -# -# # subsystem iniialization file -# execute_process(COMMAND scorep-config --mpp=mpi --nocuda --noopencl --adapter-init -# OUTPUT_VARIABLE SCOREP_INIT_FILE) -# file(WRITE ${CMAKE_BINARY_DIR}/scorep_init.c "${SCOREP_INIT_FILE}") -# -# #if(SCOREP_ENABLE) -# set(SCOREP_SRCFILES "${CMAKE_BINARY_DIR}/scorep_init.c") -# #endif(SCOREP_ENABLE) -# -# -# # TODO: possibly put the LIBS to target_link_libraries -# set(CMAKE_CXX_LINK_FLAGS "${CMAKE_CXX_LINK_FLAGS} ${SCOREP_LD_FLAGS} ${SCOREP_LIBS}") -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SCOREP_CXX_FLAGS}") -# #set(EXTRA_LIBS ${EXTRA_LIBS} ${SCOREP_LIBS}) -#endif(ENABLE_PROFILING_SCOREP) - - -if(ENABLE_PROFILING_SCOREP) - message(STATUS "Profiling with ScoreP enabled.") - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "scorep") - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "scorep") -endif(ENABLE_PROFILING_SCOREP) ### Test Coverage -OPTION(ENABLE_COVERAGE "Enable code coverage reporting" OFF) -if(ENABLE_COVERAGE) +if(PSAC_ENABLE_COVERAGE) # turn off stack protection for gcov coverage, because the stack protector shows # up as a never taken branch, and thus turns any last statement in a function # with a stack procetor into a partially covered statement. 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage -fno-stack-protector") -endif(ENABLE_COVERAGE) +endif(PSAC_ENABLE_COVERAGE) ###### Executable and Libraries # Save libs and executables in the same place -set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib CACHE PATH "Output directory for libraries" ) -set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin CACHE PATH "Output directory for applications" ) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -include_directories("${PROJECT_SOURCE_DIR}/include") -include_directories("${PROJECT_SOURCE_DIR}/ext/mxx") -include_directories("${PROJECT_SOURCE_DIR}/ext/mxx/include") -include_directories("${PROJECT_SOURCE_DIR}/ext") -include_directories("${PROJECT_SOURCE_DIR}/ext/cxx-prettyprint") +## psac header library +add_library(psaclib INTERFACE) +target_include_directories(psaclib INTERFACE include) + +## psac dependencies # load mxx and its gtest wrapper -add_subdirectory("${PROJECT_SOURCE_DIR}/ext/mxx") +set(MXX_BUILD_TESTS OFF CACHE BOOL "disable building mxx tests" FORCE) +add_subdirectory(ext/mxx) +target_link_libraries(psaclib INTERFACE mxx) # include libdivsufsort (with 64bit support but without examples) set(BUILD_DIVSUFSORT64 ON CACHE BOOL "enables divsufsort 64bit functions" FORCE) -set(BUILD_EXAMPLES OFF CACHE BOOL "enables divsufsort 64bit functions" FORCE) -add_subdirectory("${PROJECT_SOURCE_DIR}/ext/libdivsufsort") -include_directories(${libdivsufsort_BINARY_DIR}/include) -include_directories(${libdivsufsort_SOURCE_DIR}/include) +set(BUILD_EXAMPLES OFF CACHE BOOL "enables divsufsort examples" FORCE) +add_subdirectory(ext/libdivsufsort) + +# prettyprint +target_include_directories(psaclib INTERFACE ext/cxx-prettyprint) +# divsufsort (integrate into psac as `psac-dss-lib`) +add_library(psac-dss-lib INTERFACE) +target_link_libraries(psac-dss-lib INTERFACE psaclib) +target_include_directories(psac-dss-lib INTERFACE ${libdivsufsort_BINARY_DIR}/include) +target_link_libraries(psac-dss-lib INTERFACE divsufsort divsufsort64) -# add own subdirectories -add_subdirectory(src) -add_subdirectory(src/tests) + +## build executables +if(PSAC_BUILD_EXES) + add_subdirectory(src) +endif() # build tests -add_subdirectory(test) +if (PSAC_BUILD_TESTS) + add_subdirectory(test) +endif() diff --git a/ext/mxx b/ext/mxx index f09a41c..4f19d42 160000 --- a/ext/mxx +++ b/ext/mxx @@ -1 +1 @@ -Subproject commit f09a41cde6ba705eaf87a2d3798abed0ed9d1fd4 +Subproject commit 4f19d4293b015f631f2a47dbe255091aed36a8ca diff --git a/include/alphabet.hpp b/include/alphabet.hpp index 3dda846..e028958 100644 --- a/include/alphabet.hpp +++ b/include/alphabet.hpp @@ -19,7 +19,10 @@ #include #include #include +#include +#include #include +#include #include "bitops.hpp" @@ -41,6 +44,7 @@ std::string rand_dna(std::size_t size, int seed) { return str; } +/// scans the input sequence and updates the given frequency histogram of characters template void update_histogram(Iterator begin, Iterator end, std::vector& hist) { using char_type = typename std::iterator_traits::value_type; @@ -54,6 +58,7 @@ void update_histogram(Iterator begin, Iterator end, std::vector& hist) { } } +/// Returns a frequency histogram of characters from a given input sequence template std::vector get_histogram(Iterator begin, Iterator end, std::size_t size = 0) { if (size == 0) @@ -63,6 +68,7 @@ std::vector get_histogram(Iterator begin, Iterator end, std::size_t size = 0) return 
hist; } +/// Returns a frequency histogram of characters from a given distributed stringset (collective call) template std::vector alphabet_histogram(const StringSet& ss, const mxx::comm& comm) { std::vector hist(256, 0); @@ -74,6 +80,7 @@ std::vector alphabet_histogram(const StringSet& ss, const mxx::comm& co return out_hist; } +/// Returns a frequency histogram of characters from a given input sequence. template std::vector alphabet_histogram(InputIterator begin, InputIterator end) { static_assert(std::is_same::value_type, char>::value, "Iterator must be of value type `char`."); @@ -82,6 +89,7 @@ std::vector alphabet_histogram(InputIterator begin, InputIterator end) return hist; } +/// Returns a frequency histogram of characters from a given distributed input sequence (collective call) template std::vector alphabet_histogram(InputIterator begin, InputIterator end, const mxx::comm& comm) { static_assert(std::is_same::value_type, char>::value, "Iterator must be of value type `char`."); @@ -92,15 +100,18 @@ std::vector alphabet_histogram(InputIterator begin, InputIterator end, } - - +/** + * @brief Class representing an alphabet with character size <= 2 bytes. + */ template class alphabet { static_assert(sizeof(CharType) <= 2, "Dynamic alphabet supports at most 2-byte (16bits) alphabets"); public: - + /// the character type of the alphabet using char_type = CharType; + /// unsigned version of `char_type` using uchar_type = typename std::make_unsigned::type; + /// limit of `uchar_type`, defines size of mapping table static constexpr uchar_type max_uchar = std::numeric_limits::max(); /// executes the given function on every char in the alphabet @@ -109,7 +120,7 @@ class alphabet { inline void for_each_char(Func f) const { for (uchar_type c = 0; ; ++c) { if (chars_used[c]) { - f(c); + f(static_cast(c)); } if (c == max_uchar) break; @@ -118,25 +129,22 @@ class alphabet { private: + /// bitmap of characters in the alphabet std::vector chars_used; /// maps each unsigned char to a new integer using at most log(sigma+1) bits std::vector mapping_table; + /// maps the encoded characters back to their original std::vector inverse_mapping; + /// alphabet size (number of unique characters excl. 
'\0') unsigned int m_sigma; + /// number of bits each character can be represented/encoded as unsigned int m_bits_per_char; - inline void init_mapping_table() { - mapping_table.resize(max_uchar+1, 0); - uint16_t mapped = 1; // start with 1 to include special char '\0' ('$') - for_each_char([&](uchar_type c) { - mapping_table[c] = mapped; - ++mapped; - }); - } - inline void init_sizes() { + /// initializes the alphabet size and bit sizes + void init_sizes() { unsigned int num_chars = 0; for_each_char([&](uchar_type) { ++num_chars; @@ -145,13 +153,25 @@ class alphabet { m_bits_per_char = ceillog2(m_sigma+1); } - inline void init_inverse_mapping() { + /// initializes the alphabet mapping table + void init_mapping_table() { + mapping_table.resize(max_uchar+1, 0); + uint16_t mapped = 1; // start with 1 to include special char '\0' ('$') + for_each_char([&](uchar_type c) { + mapping_table[c] = mapped; + ++mapped; + }); + } + + /// initializes the inverse mapping table + void init_inverse_mapping() { inverse_mapping.push_back('\0'); for_each_char([&](uchar_type c) { inverse_mapping.push_back(c); }); } + /// constructs the alphabet from a given character frequency histogram template alphabet(const std::vector& hist) { assert(hist.size() == max_uchar+1); @@ -173,14 +193,14 @@ class alphabet { alphabet& operator=(const alphabet&) = default; alphabet& operator=(alphabet&&) = default; + /// Constructs the alphabet from a given character frequency histogram template static alphabet from_hist(const std::vector& hist) { alphabet a(hist); return a; } - - // sequential version + /// Constructs the alphabet from a given sequence template static alphabet from_sequence(Iterator begin, Iterator end) { static_assert(std::is_same::value_type>::value, "Character type of alphabet must match the value type of input sequence"); @@ -188,8 +208,7 @@ class alphabet { return alphabet::from_hist(alphabet_hist); } - - // MPI parallel version (performs reduction over histogram) + /// Constructs the alphabet from a given distributed sequence (collective call) template static alphabet from_sequence(Iterator begin, Iterator end, const mxx::comm& comm) { static_assert(std::is_same::value_type>::value, "Character type of alphabet must match the value type of input sequence"); @@ -198,42 +217,51 @@ class alphabet { return alphabet::from_hist(alphabet_hist); } - static alphabet from_string(const std::string& str, const mxx::comm& comm) { + /// Constructs the alphabet from a given string + static alphabet from_string(const std::basic_string& str) { + return alphabet::from_sequence(str.begin(), str.end()); + } + + /// Constructs the alphabet from a given distributed string (collective call) + static alphabet from_string(const std::basic_string& str, const mxx::comm& comm) { return alphabet::from_sequence(str.begin(), str.end(), comm); } + /// Constructs the alphabet from a distributed stringset (collective call) template static alphabet from_stringset(const StringSet& ss, const mxx::comm& comm) { std::vector alphabet_hist = alphabet_histogram(ss, comm); return alphabet::from_hist(alphabet_hist); } + /// Returns the number of unique characters in the alphabet (i.e. 
the + // alphabet size) inline unsigned int sigma() const { return m_sigma; } + /// Returns the alphabet size (same as .sigma()) + inline unsigned int size() const { + return m_sigma; + } + + /// Returns the number of bits a single character can be represented by inline unsigned int bits_per_char() const { return m_bits_per_char; } + /// Returns the number of characters that fit into the given word_type template inline unsigned int chars_per_word() const { static_assert(sizeof(word_type) >= sizeof(char_type), "expecting the word_type to be larger than the char type"); unsigned int bits_per_word = sizeof(word_type)*8; - // TODO: this is currently a "work-around": if the type is signed, we - // can't use the msb, thus we need to subtract one + // if the type is signed, we can't use the msb, thus we need to subtract one if (std::is_signed::value) --bits_per_word; return bits_per_word/bits_per_char(); } - inline uchar_type encode(char_type c) const { - uchar_type uc = c; - assert(0 <= uc && uc < mapping_table.size()); - return mapping_table[uc]; - } - - + /// Returns the unique characters in this alphabet inline std::vector unique_chars() const { std::vector result; for_each_char([&](uchar_type c) { @@ -242,9 +270,81 @@ class alphabet { return result; } + /// encodes the given character to use at most `bits_per_char` bits + inline uchar_type encode(char_type c) const { + uchar_type uc = c; + assert(0 <= uc && uc < mapping_table.size()); + return mapping_table[uc]; + } + + /// Returns the decoded character from the encoded (compressed) character inline char_type decode(uchar_type c) const { return inverse_mapping[c]; } + + /// equality check + bool operator==(const alphabet& o) const { + return o.chars_used == this->chars_used && o.m_sigma == this->m_sigma; + } + + /// in-equality check + bool operator!=(const alphabet& o) const { + return !(*this == o); + } + + /// write alphabet to file + void write_to(std::ostream& os) const { + for_each_char([&os](char_type c) { + os.write(&c,sizeof(c)); + }); + } + + /// read alphabet from file + static alphabet read_from(std::istream& is) { + // read in everything + std::basic_string s; + char_type c; + while (is.read(&c,sizeof(char_type))) { + s.push_back(c); + } + return from_string(s); + } + + /// write alphabet to file + void write(const std::string& filename) const { + std::ofstream f(filename); + write_to(f); + } + + /// write alphabet to file (collective call) + void write(const std::string& filename, const mxx::comm& comm) const { + // only one processor writes the file + if (comm.rank() == 0) { + write(filename); + } + } + + /// read alphabet from file + void read(const std::string& filename) { + std::ifstream f(filename); + *this = alphabet::read_from(f); + } + + /// read alphabet from file (collective call) + void read(const std::string& filename, const mxx::comm& comm) { + std::basic_string s; + // only one processor reads the file and then broadcasts the alphabet + if (comm.rank() == 0) { + std::ifstream f(filename); + char_type c; + while (f.read(&c,sizeof(char_type))) { + s.push_back(c); + } + } + mxx::bcast(s, 0, comm); + + *this = alphabet::from_string(s); + } }; template @@ -379,8 +479,7 @@ class int_alphabet { template inline unsigned int chars_per_word() const { unsigned int bits_per_word = sizeof(word_type)*8; - // TODO: this is currently a "work-around": if the type is signed, we - // can't use the msb, thus we need to subtract one + // if the type is signed, we can't use the msb, thus we need to subtract one if 
(std::is_signed::value) --bits_per_word; return bits_per_word/bits_per_char(); @@ -397,6 +496,9 @@ class int_alphabet { // c + min_char - 1 return c - offset; } + + // TODO: + // write/read file IO }; template diff --git a/include/desa.hpp b/include/desa.hpp new file mode 100644 index 0000000..36ffa0a --- /dev/null +++ b/include/desa.hpp @@ -0,0 +1,716 @@ +/* + * Copyright 2018 Georgia Institute of Technology + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DESA_HPP +#define DESA_HPP + +#include "suffix_array.hpp" +#include "rmq.hpp" +#include "lookup_table.hpp" +#include "partition.hpp" +#include "dstrings.hpp" + +#include +#include +#include +#include + +#define DESA_CONSTR_NAIVE_LC 0 + + +/** + * @brief Redistribute a distributed vector `v`, so that the calling processor + * contains global indexes [gbeg,gend). + * + * This is a collective call. + * + * @return The redistributed vector with `gend-gbeg` elements corresponding to + * global elements [gbeg, gend). + */ +template +std::vector redistr(const std::vector& v, size_t gbeg, size_t gend, const mxx::comm& comm) { + assert(gbeg <= gend); + std::vector res(gend - gbeg); + // TODO: implememnt more efficient version for this (using send/recv only) + mxx::redo_arbit_decomposition(v.begin(), v.end(), res.begin(), gend - gbeg, comm); + return res; +} + +/** + * @brief Redistribute the distributed suffix array `sa`, + * such that the calling processor contains global indexes [gbeg,gend). + * + * This is a collective call. + */ +template +void redistr_sa(suffix_array& sa, size_t gbeg, size_t gend, const mxx::comm& comm) { + sa.local_SA = redistr(sa.local_SA, gbeg, gend, comm); + sa.local_LCP = redistr(sa.local_LCP, gbeg, gend, comm); + if (!sa.local_B.empty()) { + sa.local_B = redistr(sa.local_B, gbeg, gend, comm); + } + if (CONSTRUCT_LC) { + sa.local_Lc = redistr(sa.local_Lc, gbeg, gend, comm); + } +} + + + + +template +struct tllt { + using range_t = std::pair; + lookup_index idx; + size_t n; + + template + void construct(const suffix_array& sa, const std::string& local_str, const mxx::comm& comm) { + // choose a "good" `q` + unsigned int l = sa.alpha.bits_per_char(); + unsigned int log_size = 24; // 2^24 = 16 M (*8 = 128 M) + unsigned int q = log_size / l; + + // `q` might have to be choosen smaller if the input is very small + if (sa.local_size < q) { + q = sa.local_size; + q = std::max(q,1u); + } + q = mxx::allreduce(q, mxx::min(), comm); + if (q < log_size / l && comm.rank() == 0) { + std::cerr << "[WARNING] q for q-mer partitioning was choosen smaller (q=" << q << " vs opt_q=" << log_size/l << ") due to small local size." 
<< std::endl; + } + + // construct kmer lookup table + std::vector hist = kmer_hist(local_str.begin(), local_str.end(), q, sa.alpha, comm); + idx.construct_from_hist(hist, q, sa.alpha); + } + + inline index_t minmatch() const { + return idx.k; + } + + // for partitioning + const std::vector& prefix() const { + return idx.table; + } + std::vector& prefix() { + return idx.table; + } + + template + inline range_t lookup(const String& P) const { + return idx.lookup(P); + } +}; + + + // partition information: + // after partitioning subtrees, this processor now contains the following + // indeces: + + // TODO: separate class for `arbitrary_distribution`: + // containing also the `lt_proc` and `part` tables from above and the info below + // and provide some useful functions + +struct gen_dist { + /// globally number of elements + size_t global_size; + /// global index of first element on this processor + size_t my_begin; + /// global index of first element on next processor + size_t my_end; + /// = my_end - my_begin: number of elements on this processor + size_t local_size; + + /// partition: assigns each processor it's lookup table index (shared copy) + + /// inverse partition mapping + // map each processor start index -> processor index + std::map lt_proc; // XXX: possibly optimize: use vector and binary search + + int target_processor(size_t gidx) { + auto ub = lt_proc.upper_bound(gidx); // returns iterator to first element > gidx + --ub; // then the previous element is the target processor + return ub->second; + } + + void check_correct(const mxx::comm& comm) const { + // check that these are all exclusive? + size_t next_beg = mxx::left_shift(my_begin, comm); + size_t prev_end = mxx::right_shift(my_end, comm); + + assert(my_begin <= my_end); + if (comm.rank() > 0) { + assert(prev_end == my_begin); + } else { + assert(my_begin == 0); + } + + if (comm.rank()+1 < comm.size()) { + assert(next_beg == my_end); + } else { + assert(my_end == global_size); + } + } + + void print_imbalance_stats(const mxx::comm& comm) const { + size_t min_size = mxx::allreduce(local_size, mxx::min(), comm); + size_t max_size = mxx::allreduce(local_size, mxx::max(), comm); + + std::vector local_sizes = mxx::gather(local_size, 0, comm); + + size_t ex_size = global_size / comm.size(); + + double imbalance = (max_size - ex_size)*1. / ex_size; + + if (comm.rank() == 0) { + std::cout << "Repartition load im-balance: " << imbalance*100. << "%, range: [" << min_size << "," << max_size << "]" << std::endl; + std::cout << " sizes: " << local_sizes << std::endl; + } + } + + template + static gen_dist from_prefix_sizes(const std::vector& table, const mxx::comm& comm) { + mxx::section_timer t; + gen_dist d; + d.global_size = table.back(); + // partition the lookup index by processors + std::vector part = partition(table, comm.size()); + for (int i = 0; i < comm.size(); ++i) { + size_t kmer_l = part[i]; + size_t proc_begin = (kmer_l == 0) ? 0 : table[kmer_l-1]; + d.lt_proc[proc_begin] = i; + } + + // kmers from: [part[comm.rank()] .. part[comm.rank()+1]) + size_t my_kmer_l = part[comm.rank()]; + size_t my_kmer_r = (comm.rank()+1 == comm.size()) ? table.size() : part[comm.rank()+1]; + + // kmer table is incl-prefix histogram, thus my_kmer_l is not the start global address + // instead the start global address should be the previous kmer count (ie, tl[my_kmer_l-1]) + // the exclusive right global index of my global range should then be: + // my_kmer_r-1 + d.my_begin = (my_kmer_l == 0) ? 
0 : table[my_kmer_l-1]; + d.my_end = (my_kmer_r == 0) ? 0 : table[my_kmer_r-1]; + d.local_size = d.my_end - d.my_begin; + +#ifndef NDEBUG + d.check_correct(comm); +#endif + t.end_section("repartition: 1D-partition"); + return d; + } +}; +/** + * @brief Distributed Enhanced Suffix Array + * + * @tparam index_t Index type used for SA, RMQ, TL-lookup-table. + */ +template > +struct dist_desa { + /// Distributed Suffix Array & LCP array, and L_c array + suffix_array sa; + + /// LCP iterator type for RMQ + using it_t = typename std::vector::const_iterator; + + /// RMQ of local LCP + rmq minq; + + /// Top-Level lookup table (shared copy) + //lookup_index lt; // shared kmer lookup table + TLI tli; + + /// global size of input and arrays + size_t n; + + // save the string + std::string local_str; + // block distribution of string + mxx::blk_dist str_dist; + + // distribution of subtrees (general distribution of global range) + gen_dist subtree_dist; + + /// type of query results: typeof [l,r) + using range_t = std::pair; + + /// creates an empty DESA + dist_desa(const mxx::comm& c) : sa(c) {} + + /// naive implementation of L_c construciton + // (used only for runtime comparison with the better algorithm) +#if DESA_CONSTR_NAIVE_LC + std::vector local_Lc; + + void naive_construct_Lc() { + size_t local_size = sa.local_SA.size(); + sa.comm.with_subset(sa.local_SA.size() > 0, [&](const mxx::comm& comm) { + // Note: + // LCP[i] = lcp(SA[i-1],SA[i]) + // Lc[i] = S[SA[i-1]+LCP[i]], i=1,...n-1 + index_t prev_SA = mxx::right_shift(sa.local_SA.back(), comm); + + mxx::blk_dist dist(sa.n, comm.size(), comm.rank()); + MXX_ASSERT(dist.local_size() == local_str.size()); + + std::vector counts(comm.size(), 0); + if (comm.rank() > 0) { + if (prev_SA + sa.local_LCP[0] < sa.n) { + counts[dist.rank_of(prev_SA + sa.local_LCP[0])]++; + } + } + for (size_t i = 0; i < local_size; ++i) { + if (sa.local_SA[i-1] + sa.local_LCP[i] < sa.n) { + counts[dist.rank_of(sa.local_SA[i-1] + sa.local_LCP[i])]++; + } + } + std::vector offsets = mxx::local_exscan(counts); + size_t total_count = std::accumulate(counts.begin(), counts.end(), static_cast(0)); + + std::vector charidx(total_count); + // add bucketed requests + if (comm.rank() > 0) { + if (prev_SA + sa.local_LCP[0] < sa.n) { + charidx[offsets[dist.rank_of(prev_SA + sa.local_LCP[0])]++] = prev_SA + sa.local_LCP[0]; + } + } + for (size_t i = 0; i < local_size; ++i) { + if (sa.local_SA[i-1] + sa.local_LCP[i] < sa.n) { + charidx[offsets[dist.rank_of(sa.local_SA[i-1] + sa.local_LCP[i])]++] = sa.local_SA[i-1] + sa.local_LCP[i]; + } + } + + std::vector resp_chars = bulk_rma(local_str.begin(), local_str.end(), charidx, counts, comm); + charidx.clear(); + + offsets = mxx::local_exscan(counts); + + local_Lc.resize(local_size); + + // add bucketed requests + if (comm.rank() > 0) { + if (prev_SA + sa.local_LCP[0] < sa.n) { + local_Lc[0] = resp_chars[offsets[dist.rank_of(prev_SA + sa.local_LCP[0])]++]; + } + } + for (size_t i = 0; i < local_size; ++i) { + if (sa.local_SA[i-1] + sa.local_LCP[i] < sa.n) { + local_Lc[i] = resp_chars[offsets[dist.rank_of(sa.local_SA[i-1] + sa.local_LCP[i])]++]; + } + } + }); + } +#endif + + void repartition(const mxx::comm& comm) { + mxx::section_timer t; + redistr_sa(sa, subtree_dist.my_begin, subtree_dist.my_end, comm); + t.end_section("repartition: redistribute"); + } + + /** + * @brief Constructs the distributed DESA given the block + * distributed string given by [begin,end). 
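 *
 * A minimal usage sketch (illustrative; assumes `comm` is the mxx::comm and
 * `local_str` holds this rank's block of the input; the index type
 * `uint64_t` and the pattern are example choices, not fixed by this class):
 *
 *   dist_desa<uint64_t> idx(comm);
 *   idx.construct(local_str.begin(), local_str.end(), comm);   // collective
 *   auto r = idx.locate_possible(std::string("ACGT"));         // collective
 *   // r is the [l,r) range of suffix array positions that may match the
 *   // pattern (possibly including false positives, hence "possible")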
+ */ + template + void construct(Iterator begin, Iterator end, const mxx::comm& comm) { + mxx::section_timer t; + /// we need a copy of the input string for aligning queries + local_str = std::string(begin, end); // copy input string into data structure + n = mxx::allreduce(local_str.size(), comm); + str_dist = mxx::blk_dist(n, comm.size(), comm.rank()); + t.end_section("desa_construct: cpy str"); + + // create SA/LCP + sa.construct(local_str.begin(), local_str.end()); // move comm from constructor into .construct !? + MXX_ASSERT(sa.local_SA.size() == str_dist.local_size()); + t.end_section("desa_construct: SA/LCP construct"); + + tli.construct(sa, local_str, comm); + t.end_section("desa_construct: TLI construct"); + + subtree_dist = gen_dist::from_prefix_sizes(tli.prefix(), comm); + t.end_section("desa_construct: 1-D partition"); + repartition(comm); + subtree_dist.print_imbalance_stats(comm); + t.end_section("desa_construct: repartition"); + + // TODO: LCP might need some 1 element overlaps on boundaries (or different distribution??) + // construct RMQ on local LCP + if (subtree_dist.local_size > 0) { + minq = rmq(sa.local_LCP.begin(), sa.local_LCP.end()); + } + t.end_section("desa_construct: RMQ construct"); + +#if DESA_CONSTR_NAIVE_LC + naive_construct_Lc(); + t.end_section("desa_construct: Lc naive construct"); +#endif + } + + /// write distributed DESA to files using parallel IO + void write(const std::string& basename, const mxx::comm& comm) { + // writes only the SA, the rest gets re-created from the SA and input string + sa.write(basename); + } + + /// read and initialize the distributed DESA from file + void read(const std::string& string_file, const std::string& basename, const mxx::comm& comm) { + mxx::section_timer t; + // read input string + local_str = mxx::file_block_decompose(string_file.c_str(), comm); + n = mxx::allreduce(local_str.size(), comm); + str_dist = mxx::blk_dist(n, comm.size(), comm.rank()); + t.end_section("desa_read: read str"); + + // read SA + sa.read(basename); + t.end_section("desa_read: read SA"); + + tli.construct(sa, local_str, comm); + t.end_section("desa_read: init q-mer hist"); + + subtree_dist = gen_dist::from_prefix_sizes(tli.prefix(), comm); + repartition(comm); + subtree_dist.print_imbalance_stats(comm); + t.end_section("desa_read: repartition"); + + // construct RMQ(LCP) + if (subtree_dist.local_size > 0) { + minq = rmq(sa.local_LCP.begin(), sa.local_LCP.end()); + } + t.end_section("desa_read: RMQ construct"); + } + + // a ST node is virtually represented by it's interval [l,r] and it's first + // child split point `i1`, where LCP[i1] = minLCP[l..r] is the string + // depths `q` of the node. 
`c` is P[q], the (q+1)th char in P + inline void find_child(size_t& l, size_t& i1, size_t& r, size_t& q, char c) { + assert(l < r); + assert(l <= i1); + assert(i1 <= r); + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->sa.local_Lc[i1]; // == S[SA[l]+lcpv] for first iter + if (lc == c) { + r = i1-1; + break; + } + l = i1; + if (l == r) + break; + + i1 = this->minq(l+1, r); + } while (l < r && sa.local_LCP[i1] == q); + + if (sa.local_LCP[i1] == q) { + if (l+1 < r) { + i1 = this->minq(l+1, r); + } else { + i1 = l; + } + } + q = sa.local_LCP[i1]; + } + +#if 0 + template + inline range_t local_locate_possible(const String& P, size_t l, size_t r) { + assert(my_begin <= l && r < my_end); + size_t m = P.size(); + + // convert to local coords + l -= my_begin; + r -= my_begin; + + if (P.size() > lt.k && l <= r) { + // further narrow down search space + if (l < r) { + size_t i = this->minq(l+1, r); + size_t q = sa.local_LCP[i]; + assert(q >= lt.k); + + // FIXME: the check for l < i shouldn't be necessary, but + // somehow it happens sometimes and leads to error in find_child + while (q < m && l < r && l < i) { + this->find_child(l, i, r, q, P[q]); + } + } + + } + return range_t(l+my_begin, r+1+my_begin); + } +#else + + // manually in-lined version + template + inline range_t local_locate_possible(const String& P, size_t l, size_t r) { + assert(subtree_dist.my_begin <= l && r < subtree_dist.my_end); + size_t m = P.size(); + + // convert to local coords + l -= subtree_dist.my_begin; + r -= subtree_dist.my_begin; + if (P.size() > tli.minmatch() && l <= r) { + // further narrow down search space + if (l < r) { + + size_t i = this->minq(l+1, r); + size_t q = sa.local_LCP[i]; + + while (q < m && l < r && l < i) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // check if we've reached the end of the pattern + if (q >= m) { + break; + } + + char c = P[q]; + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = sa.local_Lc[i]; // == S[SA[l]+lcpv] for first iter + if (lc == c) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + i = this->minq(l+1, r); + } while (l < r && sa.local_LCP[i] == q); + + if (sa.local_LCP[i] == q) { + if (l+1 < r) { + i = this->minq(l+1, r); + } else { + i = l; + } + } + q = sa.local_LCP[i]; + } + } + } + return range_t(l+subtree_dist.my_begin, r+1+subtree_dist.my_begin); + } + +#endif + + template + range_t local_locate_possible(const String& P) { + // only if `P` is on local processor + index_t l, r; + std::tie(l, r) = tli.lookup(P); + if (l == r) { + return range_t(l,l); + } + --r; // convert [l,r) to [l,r] + + return local_locate_possible(P, l, r); + } + + /// execute in parallel (each processor checks the lookup table) and + /// proceeds only if it's the owner of the pattern + template + range_t locate_possible(const String& P) { + // only if `P` is on local processor + index_t l, r; + std::tie(l, r) = tli.lookup(P); + if (l == r) { + return range_t(l,l); + } + --r; // convert [l,r) to [l,r] + + if (P.size() <= tli.minmatch()) { + return range_t(l,r+1); + } + + range_t result; + int owner = -1; + if (subtree_dist.my_begin <= l && r < subtree_dist.my_end) { + result = local_locate_possible(P, l, r); + owner = sa.comm.rank(); + } + owner = mxx::allreduce(owner, mxx::max(), sa.comm); + mxx::bcast(result, owner, sa.comm); + + return result; + } + + std::vector bulk_locate(const strings& ss) { + mxx::section_timer timer(std::cerr, sa.comm); + size_t nstrings = 
ss.nstrings; + std::vector send_data_sizes(sa.comm.size()); // per processor sum of string sizes + + std::vector sprocs(nstrings, -1); + std::vector solutions(nstrings); + + std::vector send_counts(sa.comm.size()); + + for (size_t i = 0; i < ss.nstrings; ++i) { + mystring s; + s.ptr = ss.str_begins[i]; + s.len = ss.str_lens[i]; + range_t res = tli.lookup(s); + // get processor for this range + int sproc = subtree_dist.target_processor(res.first); + + if (s.len > tli.minmatch() && res.second > res.first) { + // this one needs further consideration + if (sproc != sa.comm.rank()) { + assert(res.second <= subtree_dist.my_begin || res.first >= subtree_dist.my_end); + sprocs[i] = sproc; + ++send_counts[sproc]; + } else { + assert(subtree_dist.my_begin <= res.first && res.second <= subtree_dist.my_end); + // can be solved locally (do these while sending?) + + if (res.first < res.second) { + // local query + res = local_locate_possible(s, res.first, res.second-1); + } + solutions[i] = res; + } + } else { + // solution is known + // save soluton, don't send this one + solutions[i] = res; + } + } + + timer.end_section("Phase I: local queries"); + + // communicate the patterns to where they can be answered + strings recv_ss = all2all_strings(ss, sprocs, send_counts, sa.comm); + timer.end_section("Phase I: all2all patterns"); + + std::vector recv_res(recv_ss.nstrings); + // locally continue querying the received strings + for (size_t i = 0; i < recv_ss.nstrings; ++i) { + // need to lookup again, then query + mystring P; + P.ptr = recv_ss.str_begins[i]; + P.len = recv_ss.str_lens[i]; + + range_t res = tli.lookup(P); + assert(subtree_dist.my_begin <= res.first && res.second <= subtree_dist.my_end); + assert(P.len > tli.minmatch()); + + if (res.first < res.second) { + // local query + res = local_locate_possible(P, res.first, res.second-1); + } + recv_res[i] = res; + } + + timer.end_section("Phase II: local_locate_possible"); + + // rule out false positives + // - take first or first few !?, sent to SA[first] for string alignment and checks + // - string is still equally block distributed! + //std::vector sa_idx; + // steps: + // - bucket patterns and results by rank_of(SA[l]) + // - all2all of patterns, results, SA[l], and originating processor? + // - local string compare (might cross boundaries!) [for now assume single boundary?] + // - reverse all all2alls? 
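        // Illustrative example of the bucketing below: with n = 12 and a block
        // distribution over 3 ranks (rank r holds S[4r..4r+3]), a candidate match
        // whose first suffix is SA[l] = 9 is verified on rank str_dist.rank_of(9) == 2,
        // since that rank holds the substring S[8..11] the pattern is compared against.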
+ std::vector rank_sa(recv_ss.nstrings); + std::vector fp_send_counts(sa.comm.size()); + for (size_t i = 0; i < recv_ss.nstrings; ++i) { + size_t stridx = sa.local_SA[recv_res[i].first - subtree_dist.my_begin]; + rank_sa[i] = str_dist.rank_of(stridx); + fp_send_counts[rank_sa[i]]++; + } + std::vector stridxs(recv_ss.nstrings); + std::vector fp_offset = mxx::local_exscan(fp_send_counts); + for (size_t i = 0; i < recv_ss.nstrings; ++i) { + stridxs[fp_offset[rank_sa[i]]++] = sa.local_SA[recv_res[i].first - subtree_dist.my_begin]; + } + + std::vector fp_recv_counts = mxx::all2all(fp_send_counts, sa.comm); + stridxs = mxx::all2allv(stridxs, fp_send_counts, sa.comm); + strings fp_ss = all2all_strings(recv_ss, rank_sa, fp_send_counts, sa.comm); + + timer.end_section("all2all patterns for cmp"); + + + std::vector overlap_sidx; + std::vector overlap_strs; + // strcmp the patterns to the stridxs in the underlying string data + // + std::vector fp_cmp(stridxs.size()); + for (size_t i = 0; i < stridxs.size(); ++i) { + bool match = true; + mystring P; + P.ptr = fp_ss.str_begins[i]; + P.len = fp_ss.str_lens[i]; + // compare P with `stridxs` + size_t cmp_len = std::min(str_dist.iprefix_size() - stridxs[i], P.len); + for (size_t j = 0; j < cmp_len; ++j) { + match = match && (local_str[stridxs[i] - str_dist.eprefix_size()+j] == P.ptr[j]); + } + if (match && cmp_len < P.len) { + overlap_sidx.emplace_back(stridxs[i]); + overlap_strs.emplace_back(i); + } + fp_cmp[i] = match; + } + + timer.end_section("Phase III: local strcmp"); + + + // TODO: right shift those in overlap_xxx and cmp on next processor + + // return results all the way back to originating processor + fp_cmp = mxx::all2allv(fp_cmp, fp_recv_counts, fp_send_counts, sa.comm); + fp_offset = mxx::local_exscan(fp_send_counts); + for (size_t i = 0; i < recv_ss.nstrings; ++i) { + if (!fp_cmp[fp_offset[rank_sa[i]]++]) { + recv_res[i] = range_t(recv_res[i].first, recv_res[i].first); + } + } + + timer.end_section("return P3 results"); + + + /// Phase IV: send shit back to its origin + + // TODO: re-use this from within the all2all_strings + std::vector recv_counts = mxx::all2all(send_counts, sa.comm); + + // send back results + std::vector ret_res = mxx::all2allv(recv_res, recv_counts, sa.comm); + + // iterate through the original strings and add the results in correct place + std::vector offset = mxx::local_exscan(send_counts); + for (size_t i = 0; i < ss.nstrings; ++i) { + if (sprocs[i] >= 0) { + // save received solutions + solutions[i] = ret_res[offset[sprocs[i]]++]; + } + } + + timer.end_section("return P2 results"); + + return solutions; + } +}; + +#endif // DESA_HPP diff --git a/include/dstrings.hpp b/include/dstrings.hpp new file mode 100644 index 0000000..8f736bf --- /dev/null +++ b/include/dstrings.hpp @@ -0,0 +1,284 @@ +/* + * Copyright 2018 Georgia Institute of Technology + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef DSTRINGS_HPP +#define DSTRINGS_HPP + +/** + * This file implements a simple distributed stringset, used mainly for + * loading query-patterns. + * In this representation, strings are split between processors, such + * that each stirng is fully contained within a processor. + */ + + +#include +#include +#include +#include +#include + +#include +#include +#include + +/** + * @brief A simple string representation that does not own its own storage. + * + * Not necessarily '\0' terminated. Just a pointer and length. + * + * Implements some simple string operations, so that it can be used + * interchangably with std::string in some functions. Namely implements: + * + * - size(), length() + * - operator[] + * - data() + */ +struct mystring { + /// pointer to string start + char* ptr; + /// string length + size_t len; + + /// returns the size (length) of the string + size_t size() const { + return len; + } + + /// returns the size (length) of the string + size_t length() const { + return len; + } + + /// returns a reference to the `i`th character + char& operator[](size_t i) const { + return ptr[i]; + } + + /// returns a reference to the `i`th character + char& operator[](size_t i) { + return ptr[i]; + } + + /// returns a pointer to the first character + char* data() { + return ptr; + } + + /// returns a pointer to the first character + const char* data() const { + return ptr; + } +}; + + +/** + * @brief A simple stringset with shared storage. + * + * The data for all stirngs is stored in `data`. + * Each string `i` is represented by a pointer to its start + * `str_begins[i]` and its length `str_lens[i]`. + */ +struct strings { + /// raw data + std::vector data; + /// number of strings + size_t nstrings; + /// pointer to first char for each string + std::vector str_begins; + /// length of each string + std::vector str_lens; + static constexpr char sep = '\n'; + + // TODO: constructors?? + // TODO: iterators through strings? 
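    // Layout example (illustrative): from_vec({"AC", "GGT"}) produces
    //   data       = {'A','C','\n','G','G','T','\0'}  (sep between strings, '\0' at the end)
    //   nstrings   = 2
    //   str_begins = {&data[0], &data[3]}
    //   str_lens   = {2, 3}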
+ + static strings from_vec(const std::vector& v) { + strings ss; + ss.nstrings = v.size(); + size_t total_size = 0; + for (size_t i = 0; i < v.size(); ++i) { + total_size += v[i].size() + 1; + } + ss.data.resize(total_size); + + ss.str_begins.resize(ss.nstrings); + ss.str_lens.resize(ss.nstrings); + + char* out = ss.data.data(); + for (size_t i = 0; i < v.size(); ++i) { + ss.str_begins[i] = out; + ss.str_lens[i] = v[i].size(); + memcpy(out, &v[i][0], v[i].size()); + out += v[i].size(); + if (i+1 < v.size()) { + *(out++) = '\n'; + } else { + *(out++) = '\0'; + } + } + + // replace very last sep with 0 + + return ss; + } + + void parse() { + // parse the strings by seperator into str_begins, and str_lens + // given that the data vector contains the string data + // now parse in mem data + nstrings = 0; + str_begins.clear(); + str_lens.clear(); + + size_t pos = 0; + while (pos < data.size()) { + // skip seperators + while (pos < data.size() && data[pos] == '\n') { + ++pos; + } + size_t str_beg = pos; + while (pos < data.size() && data[pos] != '\n') { + ++pos; + } + if (pos > str_beg && pos < data.size()) { + // save string info + nstrings++; + str_begins.emplace_back(data.data() + str_beg); + str_lens.emplace_back(pos - str_beg); + } + } + } + + static strings from_string(const std::string& s) { + strings ss; + + // copy into ss + ss.data.resize(s.size()+1); + memcpy(ss.data.data(), &s[0], s.size()+1); + ss.parse(); + + return ss; + } + + static strings from_dfile(const std::string& filename, const mxx::comm& comm) { + size_t size = mxx::get_filesize(filename.c_str()); + mxx::blk_dist dist(size, comm.size(), comm.rank()); + + // open file and seek to my pos + std::ifstream t(filename); + t.seekg(dist.eprefix_size()); + // scan for first newline (sep?) 
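        // every rank except rank 0 advances its start to just past the first '\n'
        // at or after its block boundary, so each pattern line is read by exactly
        // one rank; `my_end` is then simply the next rank's start (obtained via
        // mxx::left_shift below)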
+ size_t my_offset = 0; + if (comm.rank() == 0) { + my_offset = 0; + } else { + my_offset = dist.eprefix_size(); + // find first '\n' + char c; + while (t.get(c) && c != '\n') { + ++my_offset; + } + if (my_offset < size) { + ++my_offset; // advance one further + } + } + + size_t my_end = mxx::left_shift(my_offset, comm); + if (comm.rank() + 1 == comm.size()) { + my_end = size; + } + + // create rangebuf + mxx::rangebuf rb(my_offset, my_end-my_offset, t.rdbuf()); + // read file (range) buffer into string stream + std::stringstream sstr; + sstr << &rb; + + std::string local_str(sstr.str()); + + return strings::from_string(local_str); + } +}; + +std::ostream& operator<<(std::ostream& os, const strings& ss) { + os << "{nstrings=" << ss.nstrings << ", nbytes=" << ss.data.size() << ", ["; + for (size_t i = 0; i < ss.nstrings; ++i) { + std::string s(ss.str_begins[i], ss.str_begins[i] + ss.str_lens[i]); + os << "\"" << s << "\""; + if (i+1 < ss.nstrings) + os << ", "; + } + os << "]}"; + return os; +} + +strings all2all_strings(const strings& ss, std::vector& target, std::vector& send_counts, const mxx::comm& comm) { + std::vector offset = mxx::local_exscan(send_counts); + size_t send_num = offset.back() + send_counts.back(); + + // string lengths to send + std::vector send_lens(send_num); // str-lens in bucketed order of strings + std::vector send_data_sizes(comm.size()); // total data size per target proc + for (size_t i = 0; i < ss.nstrings; ++i) { + if (target[i] >= 0 && target[i] != comm.rank()) { + send_lens[offset[target[i]]++] = ss.str_lens[i]; + send_data_sizes[target[i]] += ss.str_lens[i]; + } + } + + std::vector data_offsets = mxx::local_exscan(send_data_sizes); + size_t send_data_size = data_offsets.back() + send_data_sizes.back(); + std::vector send_data(send_data_size); // buffer for sending + // create a "sorted" send buffer + for (size_t i = 0; i < ss.nstrings; ++i) { + if (target[i] >= 0 && target[i] != comm.rank()) { + memcpy(&send_data[data_offsets[target[i]]], ss.str_begins[i], ss.str_lens[i]); + data_offsets[target[i]] += ss.str_lens[i]; + // TODO: just copy this extra char from `ss`? 
+ //send_data[data_offsets[sprocs[i]]] = '\n'; + //data_offsets[sprocs[i]] += 1; + // keep string lengths + //send_lens[lens_offset[target[i]]++] = ss.str_lens[i]; + } + } + + // send/recv the sequence lengths + std::vector recv_counts = mxx::all2all(send_counts, comm); + std::vector recv_lens = mxx::all2allv(send_lens, send_counts, recv_counts, comm); + + // send/recv the string data of the patterns + std::vector recv_data_sizes = mxx::all2all(send_data_sizes, comm); + std::vector recv_data = mxx::all2allv(send_data, send_data_sizes, recv_data_sizes, comm); + + // create `strings` data structure from received data + strings recv_ss; + recv_ss.data.swap(recv_data); // FIXME allocate memory for data + recv_ss.nstrings = std::accumulate(recv_counts.begin(), recv_counts.end(), static_cast(0)); + recv_ss.str_lens = recv_lens; + recv_ss.str_begins.resize(recv_ss.nstrings); + + // init str_begins + size_t beg = 0; + for (size_t i = 0; i < recv_ss.nstrings; ++i) { + recv_ss.str_begins[i] = recv_ss.data.data() + beg; + beg += recv_ss.str_lens[i]; + } + + return recv_ss; +} + +#endif // DSTRINGS_HPP diff --git a/include/kmer.hpp b/include/kmer.hpp index e02150e..55cf2d4 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -125,6 +125,8 @@ void par_for_each_kmer(InputIterator begin, InputIterator end, unsigned int k, c if (kmer_mask == 0) kmer_mask = ~static_cast(0); + MXX_ASSERT(std::distance(begin,end) >= k-1); + // fill first k-mer (until k-1 positions) and send to left processor // filling k-mer with first character = MSBs for lexicographical ordering InputIterator str_it = begin; diff --git a/include/lookup_table.hpp b/include/lookup_table.hpp new file mode 100644 index 0000000..6c522ab --- /dev/null +++ b/include/lookup_table.hpp @@ -0,0 +1,151 @@ +/* + * Copyright 2018 Georgia Institute of Technology + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LOOKUP_TABLE_HPP +#define LOOKUP_TABLE_HPP + +#include +#include + +#include "alphabet.hpp" +#include "kmer.hpp" + +// Top-Level Lookup-Table (small kmer index) + +/** + * @brief Top-Level Lookup-Table + * + * The table is represented as a prefix-sum over the k-mer histogram. + * + * @tparam index_t Index type (unsigned integer). Should be big enough to hold + * the number of total elements indexed. + */ +template +struct lookup_index { + /// prefixsum of k-mer histogram + std::vector table; + /// k + unsigned int k; + /// alphabet + alphabet alpha; + /// output type + using range_t = std::pair; + + /** + * @brief Constructs the lookup table from a k-mer histogram. + * + * @param hist k-mer histogram + * @param k + * @param a alphabet for underlying string. + */ + template + void construct_from_hist(const std::vector& hist, unsigned int k, const alphabet& a) { + this->alpha = a; + this->k = k; + + if (table.size() != hist.size()) { + table.resize(hist.size()); + } + std::partial_sum(hist.begin(), hist.end(), table.begin()); + } + + /** + * @brief Constructs the lookup table from a given string and alphabet. 
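 *
 * For example (illustrative): with a DNA alphabet (sigma = 4, hence 3 bits per
 * character, since one code point is reserved for '\0') and bits = 24, this
 * yields k = 24/3 = 8 and a table with 2^24 entries.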
+ * + * @param begin Iterator the string start. + * @param end Iterator to string end. + * @param bits Used to compute `k` as `k = bits / log(alphabet size)`. + * The table will have size 2^bits. + * @param a The alphabet of the string. + */ + template + void construct(Iterator begin, Iterator end, unsigned int bits, const alphabet& a) { + this->alpha = a; + + unsigned int l = this->alpha.bits_per_char(); + this->k = bits / l; + assert(k >= 1); + + // scan through string to create kmer histogram + table = kmer_hist(begin, end, k, alpha); + + // prefix sum over hist + std::partial_sum(table.begin(), table.end(), table.begin()); + } + + /** + * @brief Constructs the lookup table from a given string. + * + * @param begin Iterator the string start. + * @param end Iterator to string end. + * @param bits Used to compute `k` as `k = bits / log(alphabet size)`. + * The table will have size 2^bits. + */ + template + void construct(Iterator begin, Iterator end, unsigned int bits) { + // get alphabet? + alphabet a = alphabet::from_sequence(begin, end); + this->construct(begin, end, bits, a); + } + + /** + * @brief Queries the lookup table for a given string patter `P`. + * + * Returns the range [l,r) of suffixes for the first `k` characters of `P`. + * + * @param P Input string pattern. + * + * @return Range [l,r) of sorted suffixes where `P[0..k-1]` occurs. + */ + template + range_t lookup(const String& P) const { + unsigned int l = alpha.bits_per_char(); + assert(l*k < sizeof(index_t)*8); + + if (P.size() >= k) { + // create kmer from P + index_t kmer = 0; + for (size_t i = 0; i < k; ++i) { + kmer <<= l; + kmer |= alpha.encode(P[i]); + } + + if (kmer == 0) { + return range_t(0, table[0]); + } else { + return range_t(table[kmer-1],table[kmer]); + } + } else { + // create partial kmer + index_t kmer = 0; + for (size_t i = 0; i < P.size(); ++i) { + kmer <<= l; + kmer |= alpha.encode(P[i]); + } + size_t extra = k - P.size(); + for (size_t i = P.size(); i < k; ++i) { + kmer <<= l; + } + if (kmer == 0) { + return range_t(0, table[kmer + (1 << (extra*l))-1]); + } else { + return range_t(table[kmer-1], table[kmer + (1 << (extra*l))-1]); + } + } + } +}; + +#endif // LOOKUP_TABLE_HPP diff --git a/include/partition.hpp b/include/partition.hpp new file mode 100644 index 0000000..0502969 --- /dev/null +++ b/include/partition.hpp @@ -0,0 +1,115 @@ +/* + * Copyright 2018 Georgia Institute of Technology + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef PARTITION_HPP +#define PARTITION_HPP + +#include + +/** + * @brief Simple recursive 1D-partitioning via bisection. + * + * Partitions bins given by an inclusive prefix sum of bin sizes. + * Recursively partitions index range [l, r) of `tl` onto + * processors (or partitons) [p0,p0+p). + * + * @param tl Number of elements in each bin given as an incl. prefix sum + * (partial_sum) of bin sizes. + * @param l Left index (incl.) [l,r). + * @param r Right index (excl.) [l,r). + * @param p0 First partition index. 
+ * @param p Number of partitions + * @param pstart Returns the assigned bins in pstart[p0],...,pstart[p0+p-1] + * where each `pstart[i]` is the index (number) of the first + * bin assigned to partition `i`. I.e., each element is between + * [0,tl.size()]. + * Partition `i` is assigned bins `pstart[i],..,pstart[i+1]` + * or `pstart[i],...,tl.size()` for the last partition. + */ +template +void rec_partition(const std::vector& tl, size_t l, size_t r, int p0, int p, std::vector& pstart) { + if (p == 1) { + return; + } + + if (l == r) { + for (int i = p0+1; i < p0+p; ++i) { + pstart[i] = l; + } + return; + } + + // p odd: (p+1)/2, (p-1)/2 weighted halfs (extreme case: 2,1) + size_t excl = l == 0 ? 0 : tl[l-1]; + size_t num = tl[r-1] - excl; + + // compute how many elements for each side would be optimal + size_t left_size; + int left_p; + if (p % 2 == 0) { + left_size = num/2; + left_p = p / 2; + } else { + // (p+1)/ 2 to the left, (p-1)/2 to the right + left_size = (num*(p+1))/(2*p); + left_p = (p+1)/2; + } + + // find first index which is larger + auto it = std::lower_bound(tl.begin()+l, tl.begin()+r, excl+left_size); + size_t i = it - tl.begin(); + + // decide whether this or the previous split is better (closer to `left size`) + if (i > 0) { + if (tl[i] - (left_size+excl) > (left_size+excl) - tl[i-1]) { + --i; + } + } + + // left partition is now [l, i], convert to [l, i) + ++i; + + // save the result + pstart[p0+left_p] = i; + + // p even: find i in tl closest to (tl[l]+tl[r])/2 + rec_partition(tl, l, i, p0, left_p, pstart); + rec_partition(tl, i, r, p0 + left_p, p - left_p, pstart); +} + +/** + * @brief Simple recursive 1D-partitioning via bisection. + * + * Partitions bins given by an inclusive prefix sum of bin sizes. + * + * @param tl Number of elements in each bin given as an incl. prefix sum + * (partial_sum) of bin sizes. + * @param p Number of partitions + * @return Returns for each parititon, the assigned bins. + * Each `pstart[i]` is the index (number) of the first + * bin assigned to partition `i`. + * I.e., each element is between [0,tl.size()]. + * Partition `i` is assigned bins `pstart[i],..,pstart[i+1]` + * or `pstart[i],...,tl.size()` for the last partition. 
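 *
 * Worked example (illustrative): for tl = {4, 9, 12, 20} and p = 2, the
 * bisection places the split at the bin boundary closest to half the total
 * weight (10), returning pstart = {0, 2}; partition 0 gets bins 0..1
 * (9 elements) and partition 1 gets bins 2..3 (11 elements).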
+ */ +template +std::vector partition(const std::vector& tl, int p) { + // partition by recursive weighted bisection + std::vector pstarts(p, 0); + rec_partition(tl, 0, tl.size(), 0, p, pstarts); + return pstarts; +} + +#endif // PARTITION_HPP diff --git a/include/seq_query.hpp b/include/seq_query.hpp new file mode 100644 index 0000000..05b6830 --- /dev/null +++ b/include/seq_query.hpp @@ -0,0 +1,908 @@ + +#include +#include +#include +#include + +#include + +// psac stuff: +//#include +#include +#include +#include +#include + +#include + +#define RMQ_USE_SDSL 0 + +#if RMQ_USE_SDSL +#include +#endif + +#ifndef SEQ_QUERY_HPP +#define SEQ_QUERY_HPP + +namespace my { +struct timer { + using clock_type = std::chrono::steady_clock; + using time_point = clock_type::time_point; + using duration = time_point::duration; + + time_point ts; + duration elapsed; + duration total; + + timer() : ts(), elapsed(0), total(0) {} + + inline void tic() { + ts = clock_type::now(); + } + + inline void toc() { + elapsed = clock_type::now() - ts; + total += elapsed; + } + + template + inline typename precision::rep get_time() const { + return std::chrono::duration_cast(elapsed).count(); + } + + template + inline typename precision::rep get_total_time() const { + return std::chrono::duration_cast(total).count(); + } + + inline std::chrono::nanoseconds::rep get_ns() const { + return get_time(); + } + + inline std::chrono::nanoseconds::rep get_total_ns() const { + return get_total_time(); + } +}; +} // namepsace my + +// LCP from SA and string?? + +// libdivsufsort wrapper +//#include + +/// TODO: compare SA, SA+LCP, ESA, and DESA querying +/// (opt) compare against sdsl (although ours requires tons more mem...) + + +// TODO: speedup this comparison function using AVX etc +// compares pattern P lexicographically to S[pos...) and returns: +// <0: P is smaller +// =0: P is equal +// >0: P is larger +inline int cmp_pattern(const std::string& P, const std::string& S, size_t pos) { + if (pos >= S.size()) { + return 1; + } + return strncmp(&P[0], &S[pos], P.size()); +} + +template +inline int cmp_pattern(const std::string& P, Iterator strbeg, size_t n, size_t pos) { + if (pos >= n) { + return 1; + } + return strncmp(&P[0], &strbeg[pos], P.size()); +} + + +// simple binary search for given condition +// assuming for the sequence `data`, all cond(data[i]) == false come before all +// cond(data[i]) == true. This returns the smallest index i such that +// cond(data[i]] == true, and returns data.size() if none exist +template +size_t binary_search(const std::vector& data, Cond cond) { + if (data.empty()) { + return 0; + } + if (cond(data.front())) { + return 0; + } + if (!cond(data.back())) { + return data.size(); + } + + size_t l = 0; + size_t r = data.size(); + + while (l + 1 < r) { + size_t m = (l+r) >> 1; + if (!cond(data[m])) { + l = m; + } else { + r = m; + } + } + return r; +} + + +// O(P log(n)) binary search for pattern using only string and SA +template +std::pair locate_sa(const std::string& S, const std::vector& SA, const std::string& P) { + // binary search using SA and S + assert(S.size() == SA.size()); + + // TODO: can be speed up by first searching until we hit a midpoint for which cmp == 0, then do sepearate lb and ub + index_t lb = binary_search(SA, [&](index_t pos) { return cmp_pattern(P, S, pos) <= 0; }); + index_t ub = binary_search(SA, [&](index_t pos) { return cmp_pattern(P, S, pos) < 0; }); + return std::pair(lb,ub); +} + +// locate using LCP information (M&M use interval tree, we use RMQ !?) 
+// O(P + log(n)) + + +// returns the offset for the first differening character between x and y +// this compares until at most n characters in. if x[0..n) == y[0..n), then +// this returns `n` +int my_stricmp(const char* x, const char* y, size_t n, size_t& len) { + len = 0; + while (len < n && x[len] == y[len]) { + ++len; + } + if (len == n) + return 0; + else if (x[len] < y[len]) + return -1; + else + return 1; +} + +// computes the lcp between the suffix S[pos+offset...) and the pattern P[offset...) +inline size_t lcp_offset(const std::string& P, const std::string& S, size_t pos, size_t offset) { + assert(pos+offset <= S.size()); + size_t max = std::min(S.size() - pos, P.size()); + size_t i = offset; + while (i < max && P[i] == S[pos+i]) { + ++i; + } + return i; +} + +inline size_t lcp_offset(const std::string& P, const char* strbegin, size_t n, size_t pos, size_t offset) { + assert(pos+offset <= n); + size_t max = std::min(n - pos, P.size()); + size_t i = offset; + while (i < max && P[i] == strbegin[pos+i]) { + ++i; + } + return i; +} + +template +size_t lb_rmq(const std::vector& LCP, const RMQ& minq, size_t l, size_t r, index_t q) { + // we have a `q` match at `r` and trying to find the lower bound + // LCP[r] = lcp(SA[r-1],SA[r]) + if (LCP[r] < q) + return r; // `r` is the lower bound + + //typename std::vector::const_iterator b = LCP.begin(); + + while (l + 1 < r) { + size_t mid = (l+r)/2; + index_t x = LCP[minq(mid+1, r)]; + if (x < q) { + l = mid; + } else { + r = mid; + } + } + + return r; +} + +template +size_t ub_rmq(const std::vector& LCP, const RMQ& minq, size_t l, size_t r, index_t q) { + // we have a `q` match at `l` and trying to find the upper bound + // LCP[l+1] = lcp(SA[l],SA[l+1]) + if (LCP[l+1] < q) + return l; // `l` is the upper bound + + + //typename std::vector::const_iterator b = LCP.begin(); + + // one-sided binary search + while (l + 1 < r) { + size_t mid = (l+r)/2; + index_t x = LCP[minq(l+1, mid)]; + if (x < q) { + r = mid; + } else { + l = mid; + } + } + + return l; +} + +template +struct sa_index { + + size_t n; + const char* strbegin; + const char* strend; + std::vector SA; + + my::timer rmqtimer; + + template + void construct(Iterator begin, Iterator end) { + strbegin = &(*begin); + strend = &(*end); + dss::construct(begin, end, SA); + n = SA.size(); + } + + std::pair locate(const std::string& P) { + // TODO: can be speed up by first searching until we hit a midpoint for which cmp == 0, then do sepearate lb and ub + index_t lb = binary_search(SA, [&](index_t pos) { return cmp_pattern(P, strbegin, n, pos) <= 0; }); + index_t ub = binary_search(SA, [&](index_t pos) { return cmp_pattern(P, strbegin, n, pos) < 0; }); + return std::pair(lb,ub); + } +}; + +template +struct salcp_index : public sa_index { + + std::vector LCP; + + template + void construct(Iterator begin, Iterator end) { + sa_index::construct(begin, end); + // construct LCP from SA: + // create ISA needed for constructing LCP + std::vector ISA(this->n); + for (size_t i = 0; i < this->n; ++i) { + ISA[this->SA[i]] = i; + } + std::vector LCP; + lcp_from_sa(std::string(this->strbegin, this->strend), this->SA, ISA, this->LCP); + } +}; + + +// O(sigma*P) using repeated RMQ on LCP [Fischer & Heun 2007] +template +struct esa_index : public salcp_index { + using it_t = typename std::vector::const_iterator; + +#if RMQ_USE_SDSL + sdsl::rmq_succinct_sct<> minq; +#else + rmq minq; +#endif + + template + void construct(Iterator begin, Iterator end) { + salcp_index::construct(begin, end); + // construct RMQ 
ontop of LCP +#if RMQ_USE_SDSL + minq = sdsl::rmq_succinct_sct<>(&this->LCP); +#else + minq = rmq(this->LCP.begin(), this->LCP.end()); +#endif + } + + std::pair locate(const std::string& P) const { + + size_t n = this->n; + size_t m = P.size(); + size_t l = 0; + size_t r = n-1; + + size_t q = 0; // size of current match + + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + + // l, r <- getChild(q, c, l, r): + + // get first child interval and depth + //it_t ii = minq.query(b+l+1, b+r+1); // this may have already been run by the previous step + //size_t i = ii - b; + size_t i = minq(l+1, r); + index_t lcpv = this->LCP[i]; + assert(lcpv >= q); + + // check skipped characters (XXX optional?) + // characters in P[q..lcpv), compare to [l] + bool match = true; + for (size_t j = q; q < lcpv; ++q) { + match &= (P[j] == this->strbegin[this->SA[l]+j]); + } + if (!match) { + return std::pair(l,l); + } + + // check if we've reached the end of the pattern + if (lcpv >= m) { + return std::pair(l, r+1); + } + + char c = P[lcpv]; + do { + // `i` is the lcp(SA[i-1],SA[i]) + char Lc = this->strbegin[this->SA[i-1]+lcpv]; // == S[SA[l]+lcpv] for first iter + //char Rc = S[SA[i]+lcpv]; + if (Lc == c) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + i = minq(l+1, r); + } while (l < r && this->LCP[i] == lcpv); + + if (this->strbegin[this->SA[l]+lcpv] == c) { + // found the interval we were looking for + q = lcpv+1; + } else { + return std::pair(l,l); + } + + } + return std::pair(l, r+1); + } +}; + +// O(P + log(n)) via binary search range queries on LCP +// original M&M does this via pre-computing the RLCP and LLCP queries for +// all possible (l,mid,r) points of the binary search +// +// this version uses binary search on an underlying RMQ +template +struct bs_esa_index : public esa_index { + using it_t = typename std::vector::const_iterator; + + std::pair locate(const std::string& P) const { + size_t n = this->n; + size_t m = P.size(); + size_t l = 0; + size_t r = n-1; + size_t llcp = lcp_offset(P, this->strbegin, n, this->SA[0], 0); + size_t rlcp = lcp_offset(P, this->strbegin, n, this->SA[n-1], 0); + + index_t q = 0; // num of chars matched + + if (llcp < m && P[llcp] < this->strbegin[this->SA[0]+llcp]) { + // first suffix is larger than `P` -> no match + return std::pair(0,0); + } else if (llcp == m) { + // first suffixes matches, just find ub + r = ub_rmq(this->LCP, this->minq, 0, n-1, m); + return std::pair(0,r+1); + } + + if (rlcp < m && P[rlcp] > this->strbegin[this->SA[n-1]+rlcp]) { + // last suffix is smaller than `P` -> no match + return std::pair(n,n); + } else if (rlcp == m) { + // last suffix matches `P` -> find lb + l = lb_rmq(this->LCP, this->minq, 0, n-1, m); + return std::pair(l, n); + } + + + while (l+1 < r) { + + size_t mid = (l+r) / 2; + index_t pos = this->SA[mid]; + + if (llcp >= rlcp) { + // sharing more with left than right + size_t i = this->minq(l+1, mid); + q = this->LCP[i]; // minLCP for suffixes in SA[l..m] + if (q >= llcp) { + q = lcp_offset(P, this->strbegin, n, pos, llcp); + } + } else { + size_t i = this->minq(mid+1, r); // minLCP for suffixes in SA[m..r] + q = this->LCP[i]; + if (q >= rlcp) { + q = lcp_offset(P, this->strbegin, n, pos, rlcp); + } + } + + if (q == m) { + // found _a_ match, now find left and right boundaries + assert(llcp < m && rlcp < m); + + // find ub + r = ub_rmq(this->LCP, this->minq, mid, r, m); + + // find lb + l = 
lb_rmq(this->LCP, this->minq, l, mid, m); + + return std::pair(l, r+1); + + } else if (P[q] <= this->strbegin[pos+q]) { + r = mid; + rlcp = q; + } else { + l = mid; + llcp = q; + } + } + // found no match + size_t lb = r; + return std::pair(lb, lb); + } +}; + +template +struct desa_index : public esa_index { + using it_t = typename esa_index::it_t; + // adds Lc array of characters + std::vector Lc; + + template + void construct(Iterator begin, Iterator end) { + esa_index::construct(begin, end); + + // NOTE: + // LCP[i] = lcp(SA[i-1],SA[i]) + // differing chars (needed for decision) + // left: S[SA[i-1]+LCP[i]] (need only one of these) + // right: S[SA[i] +LCP[i]] + // construct char array Lc[i] = S[SA[i-1]+LCP[i]], i=1,...n-1 + Lc.resize(this->n); + for (size_t i = 1; i < this->n; ++i) { + Lc[i] = this->strbegin[this->SA[i-1]+this->LCP[i]]; + } + } + + // a ST node is virtually represented by it's interval [l,r] and it's first + // child split point `i1`, where LCP[i1] = minLCP[l..r] is the string + // depths `q` of the node. `c` is P[q], the (q+1)th char in P + inline void find_child(size_t& l, size_t& i1, size_t& r, size_t& q, char c) const { + assert(l < r); + assert(l <= i1); + assert(i1 <= r); + do { + // `i` is the lcp(sa[i-1],sa[i]) + char lc = this->lc[i1]; // == s[sa[l]+lcpv] for first iter + if (lc == c) { + r = i1-1; + break; + } + l = i1; + if (l == r) + break; + + this->rmqtimer.tic(); + i1 = this->minq(l+1, r); + this->rmqtimer.toc(); + } while (l < r && this->lcp[i1] == q); + + if (this->lcp[i1] == q) { + if (l+1 < r) { + this->rmqtimer.tic(); + i1 = this->minq(l+1, r); + this->rmqtimer.toc(); + } else { + i1 = l; + } + } + q = this->lcp[i1]; + } + + /* + std::pair locate_possible(const std::string& P) { + size_t n = this->n; + size_t m = P.size(); + + size_t l = 0; + size_t r = n-1; + //size_t q = 0; // size of current match + + if (l == r) { + return std::pair(l, r+1); + } + + + this->rmqtimer.tic(); + size_t i = this->minq(l+1, r); + this->rmqtimer.toc(); + size_t q = this->LCP[i]; + + while (q < m && l < r) { + find_child(l, i, r, q, P[q]); + } + + return std::pair(l, r+1); + } + + std::pair locate(const std::string& P) { + std::pair res = locate_possible(P); + // check if pattern matches + if (res.first < res.second) { + int cmp = strncmp(&this->strbegin[this->SA[res.first]], &P[0], P.size()); + if (cmp == 0) { + return res; + } else { + return std::pair(res.first, res.first); + } + } + return res; + } + */ + + /* + std::pair locate(const std::string& P) { + + size_t n = this->n; + size_t m = P.size(); + size_t l = 0; + size_t r = n-1; + + size_t q = 0; // size of current match + + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // get first child interval and depth + size_t i = this->minq(l+1, r); + index_t lcpv = this->LCP[i]; + assert(lcpv >= q); + + // check if we've reached the end of the pattern + if (lcpv >= m) { + break; + } + + char c = P[lcpv]; + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->Lc[i]; // == S[SA[l]+lcpv] for first iter + if (lc == c) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + + i = this->minq(l+1, r); + } while (l < r && this->LCP[i] == lcpv); + + if (this->strbegin[this->SA[l]+lcpv] == c) { + // found the interval we were looking for + q = lcpv+1; + } else { + return std::pair(l,l); + } + } + // check if pattern matches + if (l <= r) { + int cmp = strncmp(&this->strbegin[this->SA[l]], &P[0], 
P.size()); + if (cmp != 0) { + return std::pair(l, l); + } + } + return std::pair(l, r+1); + } + */ + + + template + inline std::pair locate_possible(const String& P) const { + + size_t n = this->n; + size_t m = P.size(); + size_t l = 0; + size_t r = n-1; + + // get first child interval and depth + size_t i = this->minq(l+1, r); + index_t q = this->LCP[i]; + + // blind search + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // check if we've reached the end of the pattern + if (q >= m) { + break; + } + + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->Lc[i]; // == S[SA[l]+lcpv] for first iter + if (lc == P[q]) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + + i = this->minq(l+1, r); + } while (l < r && this->LCP[i] == q); + + if (this->LCP[i] == q) { + if (l+1 < r) { + i = this->minq(l+1, r); + } else { + i = l; + } + } + q = this->LCP[i]; + } + return std::pair(l, r+1); + } + + + std::pair locate(const std::string& P) const { + + size_t n = this->n; + size_t m = P.size(); + size_t l = 0; + size_t r = n-1; + + // get first child interval and depth + size_t i = this->minq(l+1, r); + index_t q = this->LCP[i]; + + // blind search + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // check if we've reached the end of the pattern + if (q >= m) { + break; + } + + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->Lc[i]; // == S[SA[l]+lcpv] for first iter + if (lc == P[q]) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + + i = this->minq(l+1, r); + } while (l < r && this->LCP[i] == q); + + if (this->LCP[i] == q) { + if (l+1 < r) { + i = this->minq(l+1, r); + } else { + i = l; + } + } + q = this->LCP[i]; + } + + // check if pattern matches the string + if (l <= r) { + int cmp = strncmp(&this->strbegin[this->SA[l]], &P[0], P.size()); + if (cmp != 0) { + return std::pair(l, l); + } + } + return std::pair(l, r+1); + } +}; + + +template +struct lookup_desa_index : public desa_index { + lookup_index tl; + + template + void construct(Iterator begin, Iterator end) { + desa_index::construct(begin, end); + tl.construct(begin, end, 16); // automatically size `k` given table size and use alphabet size + } + + std::pair locate(const std::string& P) const { + size_t m = P.size(); + + index_t l, r; + std::tie(l, r) = tl.lookup(P); + if (l == r) { + return std::pair(l,l); + } + --r; // convert [l,r) to [l,r] + + if (P.size() > tl.k && l <= r) { + // further narrow down search space + if (l < r) { + + size_t i = this->minq(l+1, r); + index_t q = this->LCP[i]; + assert(q >= tl.k); + + // blind search + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // check if we've reached the end of the pattern + if (q >= m) { + break; + } + + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->Lc[i]; // == S[SA[l]+q] for first iter + if (lc == P[q]) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + i = this->minq(l+1, r); + } while (l < r && this->LCP[i] == q); + + if (this->LCP[i] == q) { + if (l+1 < r) { + i = this->minq(l+1, r); + } else { + i = l; + } + } + q = this->LCP[i]; + } + } + + // check if pattern matches + if (l <= r) { + int cmp = strncmp(&this->strbegin[this->SA[l]], &P[0], P.size()); + if (cmp == 
0) { + return std::pair(l, r+1); + } else { + // no match + return std::pair(l, l); + } + } + } + return std::pair(l, r+1); + } + /* + std::pair locate(const std::string& P) { + size_t n = this->n; + size_t m = P.size(); + + index_t l, r; + std::tie(l, r) = tl.lookup(P); + if (l == r) { + return std::pair(l,l); + } + --r; // convert [l,r) to [l,r] + + if (P.size() > tl.k && l <= r) { + // further narrow down search space + if (l < r) { + + size_t i = this->minq(l+1, r); + size_t q = this->LCP[i]; + assert(q >= tl.k); + + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // get first child interval and depth + size_t i = this->minq(l+1, r); + index_t lcpv = this->LCP[i]; + assert(lcpv >= q); + + // check if we've reached the end of the pattern + if (lcpv >= m) { + break; + } + + char c = P[lcpv]; + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->Lc[i]; // == S[SA[l]+lcpv] for first iter + if (lc == c) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + i = this->minq(l+1, r); + } while (l < r && this->LCP[i] == lcpv); + + if (this->strbegin[this->SA[l]+lcpv] == c) { + // found the interval we were looking for + q = lcpv+1; + } else { + return std::pair(l,l); + } + } + } + + // check if pattern matches + if (l <= r) { + int cmp = strncmp(&this->strbegin[this->SA[l]], &P[0], P.size()); + if (cmp == 0) { + return std::pair(l, r+1); + } else { + // no match + return std::pair(l, l); + } + } + } + return std::pair(l, r+1); + } + */ + /* + std::pair locate(const std::string& P) { + size_t m = P.size(); + + index_t l, r; + std::tie(l, r) = tl.lookup(P); + if (l == r) { + return std::pair(l,l); + } + --r; // convert [l,r) to [l,r] + + if (P.size() > tl.k && l <= r) { + // further narrow down search space + if (l < r) { + this->rmqtimer.tic(); + size_t i = this->minq(l+1, r); + this->rmqtimer.toc(); + size_t q = this->LCP[i]; + assert(q >= tl.k); + + while (q < m && l < r) { + this->find_child(l, i, r, q, P[q]); + } + } + + // check if pattern matches + if (l <= r) { + int cmp = strncmp(&this->strbegin[this->SA[l]], &P[0], P.size()); + if (cmp == 0) { + return std::pair(l, r+1); + } else { + // no match + return std::pair(l, l); + } + } + } + return std::pair(l, r+1); + } + */ +}; + + + +#endif // SEQ_QUERY_HPP diff --git a/include/suffix_array.hpp b/include/suffix_array.hpp index 2e9869a..64dfb2c 100644 --- a/include/suffix_array.hpp +++ b/include/suffix_array.hpp @@ -139,17 +139,29 @@ void write_dist_int_array(const std::string& basename, const std::vector& vec template std::vector read_dist_int_array(const std::string& filename, const mxx::comm& comm) { static_assert(std::is_integral::value && !std::is_same::value, "This function works only for integral types excluding bool"); - // TODO: make sure the file ending matches the type size? 
- mxx::coll_file f(filename, comm); - f.open(MPI_MODE_RDONLY); - - size_t nbytes = f.get_size(); - size_t n = nbytes / sizeof(T); + std::ifstream f(filename, std::ios::binary | std::ios::ate); + size_t n = f.tellg() / sizeof(T); mxx::blk_dist d(n, comm); + // Use C++ fileIO if local size is larger than INT_MAX + // because some MPI implementations will silently fail to load the input + // file otherwise std::vector data(d.local_size()); - f.read_ordered(d.local_size(), data.data()); + if ((n + comm.size())*sizeof(T) / comm.size() >= std::numeric_limits::max()) { + if (comm.rank() == 0) { + std::cerr << "[WARNING] local size is larger than INT_MAX, using C++ file IO instead of MPI IO" << std::endl; + } + f.seekg(d.eprefix_size(), std::ios::beg); + f.read((char*)data.data(), d.local_size()*sizeof(T)); + } else { + // use MPI File IO + f.close(); + mxx::coll_file f2(filename, comm); + f2.open(MPI_MODE_RDONLY); + f2.read_ordered(d.local_size(), data.data()); + } + return data; } @@ -225,6 +237,8 @@ void write(const std::string& basename) { if (_CONSTRUCT_LC) { write_dist_int_array(basename + ".lc", local_Lc, comm); } + + alpha.write(basename + ".alpha", comm); } // load from file using parallel IO @@ -244,6 +258,8 @@ void read(const std::string& basename) { } } + alpha.read(basename + ".alpha", comm); + // initialize the rest init_size(local_SA.size()); } diff --git a/include/tldt.hpp b/include/tldt.hpp new file mode 100644 index 0000000..05a2cff --- /dev/null +++ b/include/tldt.hpp @@ -0,0 +1,473 @@ +/* + * Copyright 2018 Georgia Institute of Technology + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TLDT_HPP +#define TLDT_HPP + +#include + +#include +#include +#include + +#include +#include + +#include +#include + + +template +std::vector sample_lcp(const std::vector& LCP, index_t maxsize) { + + index_t n = LCP.size(); + + struct node { + index_t lcp; // == LCP[pos] + index_t pos; // index of node + index_t l; // index of left parent + }; + + std::stack st; + st.push(node{0,0,0}); + + index_t total_out = 1; + std::vector do_output(n, false); + do_output[0] = true; + + for (index_t i = 1; i < n; ++i) { + + while (!st.empty() && st.top().lcp > LCP[i]) { + // new node is smaller LCP + // -> "complete" all nodes which are on the stack + node& u = st.top(); + + // u.pos has range [u.l, .. , i) + index_t parent_size = i - u.l; + if (parent_size > maxsize) { + // output but in inverse order !? 
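// i.e., node u is sampled exactly when its parent interval [u.l, i) spans more than
// maxsize positions; together with the check in seq_check_sample below, this keeps
// consecutive sampled indices at most maxsize apart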
+ do_output[u.pos] = true; + ++total_out; + } + st.pop(); + } + + if (st.empty()) { + // cant happen, because always: LCP[0] = 0 + assert(false); + } else if (st.top().lcp == LCP[i]) { + st.push(node{LCP[i], i, st.top().l}); + if (LCP[i] == 0) { + do_output[i] = true; + ++total_out; + } + } else { + assert(st.top().lcp < LCP[i]); + st.push(node{LCP[i], i, st.top().pos}); + } + } + // virtually, there is a 0 at the very end of the LCP array -> output them all + while (!st.empty() && st.top().lcp > 0) { + // new node is smaller LCP + // -> "complete" all nodes which are on the stack + node& u = st.top(); + + // u.pos has range [u.l, .. , i) + index_t parent_size = n - u.l; + if (parent_size > maxsize) { + do_output[u.pos] = true; + ++total_out; + } + st.pop(); + } + + // TODO: output everything still in the queue? + + std::vector idx(total_out); + index_t j = 0; + for (index_t i = 0; i < n; ++i) { + if (do_output[i]) { + idx[j] = i; + ++j; + } + } + return idx; +} + + +template +std::vector sample_lcp_indirect(const std::vector& LCP, index_t maxsize) { + assert(LCP[0] == 0); + + std::vector samples; // all indices for output, eventually the result + samples.push_back(0); + size_t n = LCP.size(); + + std::vector st; // keep track of where each sequence starts? best to use indces into `samples`!? + //st.push(0); // not necessary, 0 never gets "popped" + + //lcp_t minval = LCP[1]; + + for (size_t i = 1; i < LCP.size(); ++i) { + index_t l = LCP[i]; + index_t topl = LCP[samples.back()]; + + while (!st.empty() && topl > l) { + // l is rightmatch for topl and all its equals, their left match is in stack + index_t lsidx = st.back(); // index of parent in `samples` + index_t lnsv = samples[lsidx]; // index of parent in `LCP` + st.pop_back(); + + index_t parent_size = i - lnsv; + if (parent_size > maxsize) { + // here i can output everything on the stack and quit the while loop + // ie, everything in samples is safe, also `i` necessarily is safe and everything smaller to come after! + // so i can push i, keep the stack empty. empty stack means: everything larger than samples.back() gets pushed to both, l == samples.back() -> nothing to stack, push samples. l < samples.back() -> output and safe, ie. not onto stack?? + // stack contains parent of those elements which may be removed at a later time + //st = std::stack(); + st.clear(); + break; + } else { + // elements are not output, but skipped + // [x y z L L L] + // ^ + // 2 6, resize to 3 + samples.resize(lsidx + 1); + } + + topl = LCP[samples.back()]; + // if i clear off the entire stack without outputting anything, what happens? + } + + if (topl < l) { // only if new larger sequence do we push the stack + st.push_back(samples.size()-1); + samples.push_back(i); + } else if (topl == l) { + samples.push_back(i); + } else { + assert(st.empty()); + samples.push_back(i); + } + } + + // at the end we assume we get a 0 element. 
we go through the remaining stack + // and remove everything that isn't large enough until the distance becomes big enough + index_t topl = LCP[samples.back()]; + while (!st.empty() && topl > 0) { + index_t lsidx = st.back(); // index of parent in `samples` + index_t lnsv = samples[lsidx]; // index of parent in `LCP` + st.pop_back(); + + index_t parent_size = n - lnsv; + if (parent_size > maxsize) { + break; + } else { + samples.resize(lsidx + 1); + } + } + + return samples; +} + +template +std::vector send_vec_left(const std::vector& vec, const mxx::comm& c) { + size_t send_size = vec.size(); + size_t recv_size = 0; + mxx::datatype dt = mxx::get_datatype(); + mxx::datatype size_dt = mxx::get_datatype(); + int dst = c.rank() > 0 ? c.rank() - 1 : MPI_PROC_NULL; + int src = c.rank() + 1 < c.size() ? c.rank() + 1 : MPI_PROC_NULL; + MPI_Sendrecv(&send_size, 1, size_dt.type(), dst, 0, + &recv_size, 1, size_dt.type(), src, 0, c, MPI_STATUS_IGNORE); + + std::vector res(recv_size); + MPI_Sendrecv(const_cast(vec.data()), send_size, dt.type(), dst, 0, + res.data(), recv_size, dt.type(), src, 0, c, MPI_STATUS_IGNORE); + + return res; +} + +// if this is not included as part of google test, define our own assert functions! +#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ +#define ASSERT_TRUE(x) {if (!(x)) { std::cerr << "[ERROR]: Assertion failed in " __FILE__ ":" << __LINE__ << std::endl;return ;}} std::cerr << "" +#define ASSERT_EQ(x, y) ASSERT_TRUE((x) == (y)) +#define ASSERT_GT(x, y) ASSERT_TRUE((x) > (y)) +#define ASSERT_LT(x, y) ASSERT_TRUE((x) < (y)) +#endif + +#define PV(x, c) mxx::sync_cout(c) << "[" << c.rank() << "] " << "" # x " = " << x << std::endl; + +template +void seq_check_sample(const std::vector& LCP, const std::vector& off, index_t maxsize) { + size_t n = LCP.size(); + // create sampling + std::vector lcp(off.size()); + for (size_t i = 0; i < off.size(); ++i) { + ASSERT_TRUE(0 <= off[i] && off[i] < n); + lcp[i] = LCP[off[i]]; + } + // check correctness + // (EXPECTED, ACTUAL) + ASSERT_TRUE(off.size() > 0); + ASSERT_EQ(0u, off[0]); + + for (size_t i = 1; i < off.size(); ++i) { + // everything between off[i-1], .., off[i] should have LCP > than those two + ASSERT_TRUE(off[i] - off[i-1] <= maxsize); // TODO: +-1 errors? + size_t minlcp = std::max(LCP[off[i-1]], LCP[off[i]]); + for (size_t j = off[i-1]+1; j < off[i]; ++j) { + ASSERT_TRUE(0 <= j && j < n); + ASSERT_TRUE(LCP[j] > minlcp); // << "LCP values skipped must be smaller than the larger of the two surrounding ones"; + } + } + + // so far: + // this tests the necessary conditions: intervals are valid, and not larger than the max + + // next: + // need to also check they are the largest not exceeding the max, + // ie. 
check that parent ranges of everything selected exceed the max + + std::stack st; + st.push(0); + + for (size_t i = 0; i < off.size(); ++i) { + while (lcp[st.top()] > lcp[i]) { + // skip all equal to find parent + size_t l = lcp[st.top()]; + while (lcp[st.top()] == l) { + st.pop(); + } + + // parent range should be larger than max size + ASSERT_TRUE(off[i] - off[st.top()] > maxsize); + } + st.push(i); + } + + + while (lcp[st.top()] > 0) { + // skip all equal to find parent + size_t l = lcp[st.top()]; + while (lcp[st.top()] == l) { + st.pop(); + } + + // parent range should be larger than max size + ASSERT_TRUE(n - off[st.top()] > maxsize); + } + +} + +// distributed version for sampling the LCP array +// -> used for TLDT construction +template +std::vector sample_lcp_distr(const std::vector& local_LCP, index_t maxsize, const mxx::comm& comm) { + + // set up global sizes + size_t n = mxx::allreduce(local_LCP.size(), comm); + mxx::blk_dist dist(n, comm); + size_t prefix = dist.eprefix_size(); + + std::vector samples; // all indices for output, eventually the result + samples.push_back(prefix); + + std::vector st; // stack of missing right matches, index into samples + + std::vector left_st; // index into `samples`, stack of missing left matches + left_st.push_back(0); + + // minimum seen so far + lcp_t minval = local_LCP[0]; + + for (size_t i = 1; i < local_LCP.size(); ++i) { + index_t l = local_LCP[i]; + index_t topl = local_LCP[samples.back()-prefix]; + + while (!st.empty() && topl > l) { + // l is rightmatch for topl and all its equals, their left match is in stack + index_t lsidx = st.back(); // index of parent in `samples` + index_t lnsv = samples[lsidx]; // index of parent in `LCP` + st.pop_back(); + + index_t parent_size = i+prefix - lnsv; + if (parent_size > maxsize) { + // here i can output everything on the stack and quit the while loop + // ie, everything in samples is safe, also `i` necessarily is safe and everything smaller to come after! + // so i can push i, keep the stack empty. empty stack means: everything larger than samples.back() gets pushed to both, l == samples.back() -> nothing to stack, push samples. l < samples.back() -> output and safe, ie. not onto stack?? 
+ // stack contains parent of those elements which may be removed at a later time + st.clear(); + break; + } else { + // elements are not output, but skipped + samples.resize(lsidx + 1); + } + + topl = local_LCP[samples.back()-prefix]; + } + + if (topl < l) { // only if new larger sequence do we push the stack + st.push_back(samples.size()-1); // st always points toward right-most equal + samples.push_back(i+prefix); + } else if (topl == l) { + samples.push_back(i+prefix); + } else { + assert(st.empty()); + samples.push_back(i+prefix); + if (l < minval) { + minval = l; + if (samples[left_st.back()]-prefix <= maxsize) { + left_st.push_back(samples.size()-1); + } else { + left_st.back() = samples.size() - 1; + } + } + } + } + + using pair_t = std::pair; + std::vector left(left_st.size()); + for (size_t i = 0; i < left_st.size(); ++i) { + left[i].second = samples[left_st[i]]; + left[i].first = local_LCP[samples[left_st[i]]-prefix]; + } + + std::vector right = send_vec_left(left, comm); + if (comm.rank() == comm.size()-1) { + right.emplace_back(0, n); + } + + // use `right` to solve remaining elements in stack `st` + index_t first_out = right.size()-1; + for (size_t i = 0; i < right.size(); ++i) { + lcp_t l = right[i].first; + index_t topl = local_LCP[samples.back()-prefix]; + while (!st.empty() && topl > l) { // right[i] is right match for samples.back() + index_t lsidx = st.back(); // index of left parent in `samples` + index_t lnsv = samples[lsidx]; // index of left parent in `LCP` + st.pop_back(); + + index_t parent_size = right[i].second - lnsv; + if (parent_size > maxsize) { + // here, I usually clear the stack, all remain! + // thus right[i] as a parent is also a valid output! + st.clear(); + break; + } else { + // remove from output + samples.resize(lsidx + 1); + } + topl = local_LCP[samples.back()-prefix]; + } + + + // what is right[i]'s parent range? + // + // 3 cases: + if (topl < right[i].first) { + // topl < right[i] -> samples.back() is lnsv for right[i] + if (i + 1 < right.size() && right[i+1].second - samples.back() > maxsize) { + first_out = i; + break; + } + } else if (topl == l) { + // parent range is larger + if (i + 1 < right.size() && right[i+1].second - samples[st.back()] > maxsize) { + first_out = i; + break; + } + } else if (st.empty()) { + // empty and topl > right[i] => def output `i` + first_out = i; + break; + } + } + + // send back start point! 
+ index_t leftstart = mxx::right_shift(first_out, comm); + + // remove elements from front if they have a parent range smaller than maxsize + if (comm.rank() > 0 && leftstart > 0) { + std::vector tmp; + tmp.swap(samples); + samples = std::vector(tmp.begin() + left_st[leftstart], tmp.end()); + } + + return samples; +} + +template +struct tldt { + desa_index idx; // uses a sequential DESA underneath + + using range_t = std::pair; + + std::vector offsets; + size_t n; + + template + void construct(const suffix_array& sa, const std::string& local_str, const mxx::comm& comm) { + // sample to 100 + n = sa.n; + size_t prefix = sa.part.eprefix_size(); + index_t maxsize = sa.n / comm.size() / 128; + if (maxsize < 2) + maxsize = 2; // smallest parent interval + std::vector local_off = sample_lcp_distr(sa.local_LCP, maxsize, comm); + + // sample LCP and Lc at local_off + std::vector local_sample_lcp(local_off.size()); + std::vector local_sample_lc(local_off.size()); + + for (size_t i = 0; i < local_off.size(); ++i) { + local_sample_lcp[i] = sa.local_LCP[local_off[i] - prefix]; + local_sample_lc[i] = sa.local_Lc[local_off[i] - prefix]; + } + + // allgather offsets and sampled lcp and Lc + offsets = mxx::allgatherv(local_off, comm); + idx.n = offsets.size(); + idx.LCP = mxx::allgatherv(local_sample_lcp, comm); + idx.Lc = mxx::allgatherv(local_sample_lc, comm); + + // construct rmq + idx.minq = rmq::const_iterator,index_t>(idx.LCP.begin(), idx.LCP.end()); + } + + inline index_t minmatch() const { + return 1; + } + + // for partitioning + std::vector prefix() const { + // return inclusive prefix sum of bin size + // (offsets is already the exclusive prefix sum, thus just need to shift by one) + std::vector ps(offsets.size()); + for (size_t i = 0; i < offsets.size()-1; ++i) { + ps[i] = offsets[i+1]; + } + ps.back() = n; + return ps; + } + + template + inline range_t lookup(const String& P) const { + range_t r = idx.locate_possible(P); + return range_t(offsets[r.first], r.second == offsets.size() ? 
n : offsets[r.second]); + } +}; + +#endif // TLDT_HPP diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f9c8705..52323ea 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,24 +1,25 @@ -cmake_minimum_required(VERSION 2.6) +project(psac-src) -# project settings -project(psac-main) +# for tclap command line parsing header library +include_directories(../ext/) -################# -# shared libs # -################# -##################### -# main executable # -##################### +###################### +# main executables # +###################### # our main executable for suffix array construction add_executable(psac psac.cpp) -target_link_libraries(psac ${EXTRA_LIBS} rt) +target_link_libraries(psac psac-dss-lib) add_executable(gsac gsac.cpp) -target_link_libraries(gsac ${EXTRA_LIBS} rt) -target_link_libraries(gsac divsufsort) -target_link_libraries(gsac divsufsort64) +target_link_libraries(gsac psac-dss-lib) + +add_executable(desa-main desa_main.cpp) +target_link_libraries(desa-main psac-dss-lib) + +add_executable(test-slcp test_slcp.cpp) +target_link_libraries(test-slcp psac-dss-lib) ################ # benchmarks # @@ -26,34 +27,38 @@ target_link_libraries(gsac divsufsort64) # benchmarks of our different internal methods add_executable(benchmark_sac benchmark.cpp) -target_link_libraries(benchmark_sac ${EXTRA_LIBS} rt) +target_link_libraries(benchmark_sac psaclib) # benchmark with different values of `k` for initial k-mer sorting add_executable(benchmark_k benchmark_k.cpp) -target_link_libraries(benchmark_k ${EXTRA_LIBS} rt) +target_link_libraries(benchmark_k psaclib) # benchmark ANSV add_executable(benchmark-ansv benchmark_ansv.cpp) -target_link_libraries(benchmark-ansv ${EXTRA_LIBS} rt) +target_link_libraries(benchmark-ansv psaclib) + ################ # tools/utils # ################ + add_executable(print64 print64.cpp) +add_executable(mkpattern mkpattern.cpp) + +add_executable(kmer-stats kmer_partition.cpp) +target_link_libraries(kmer-stats psac-dss-lib) + + ################ # divsufsort # ################ # divsufsort executable (supporting 32 and 64 bits) add_executable(dss dss.cpp) -target_link_libraries(dss ${EXTRA_LIBS}) -target_link_libraries(dss divsufsort) -target_link_libraries(dss divsufsort64) +target_link_libraries(dss psac-dss-lib) # compare our algorithm against divsufsort and check correctness add_executable(psac-vs-dss psac_vs_dss.cpp) -target_link_libraries(psac-vs-dss ${EXTRA_LIBS} rt) -target_link_libraries(psac-vs-dss divsufsort) -target_link_libraries(psac-vs-dss divsufsort64) +target_link_libraries(psac-vs-dss psac-dss-lib) diff --git a/src/desa_main.cpp b/src/desa_main.cpp new file mode 100644 index 0000000..99cc830 --- /dev/null +++ b/src/desa_main.cpp @@ -0,0 +1,117 @@ + +#include +#include +#include +#include +#include + +// using TCLAP for command line parsing +#include + +// mxx dependencies +#include +#include +#include +#include + +#include +#include +#include + +#include "seq_query.hpp" +#include "desa.hpp" +#include "tldt.hpp" + +// define index and TLI types for experiment +using index_t = uint64_t; +using TLI_t = tldt; +//using TLI_t = tllt; +using desa_t = dist_desa; + +int main(int argc, char *argv[]) { + // set up mxx / MPI + mxx::env e(argc, argv); + mxx::env::set_exception_on_error(); + mxx::comm c; + mxx::print_node_distribution(c); + + // given a file, compute suffix array and lcp array, how to do count/locate query? 
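    // Example invocations (illustrative; file names are placeholders, the flags are
    // the TCLAP options defined below):
    //   mpiexec -np 4 ./desa-main -f input.txt --construct -o idx_base
    //   mpiexec -np 4 ./desa-main -f input.txt --load idx_base -q patterns.txt
    // where patterns.txt holds one pattern per line, e.g. as produced by the mkpattern tool.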
+ // via sa_range() + /* + if (argc < 3) { + std::cerr << "Usage: ./xx " << std::endl; + } + */ + try { + // define commandline usage + TCLAP::CmdLine cmd("Distributed Enhanced Suffix Array"); + TCLAP::ValueArg fileArg("f", "file", "Input string filename.", true, "", "filename"); + cmd.add(fileArg); + TCLAP::ValueArg loadArg("l", "load", "Load index from given basename", false, "", "filename"); + TCLAP::SwitchArg constructArg("c", "construct", "Construct SA/LCP/Lc from input file", false); + cmd.xorAdd(loadArg, constructArg); // either load or construct SA/LCP + TCLAP::ValueArg outArg("o", "outfile", "Output file base name. If --construct was used, this stores the resulting DESA.", false, "", "filename"); + cmd.add(outArg); + + TCLAP::ValueArg queryArg("q", "query", "Query file for benchmarking querying.", false, "", "filename"); + cmd.add(queryArg); + cmd.parse(argc, argv); + + + mxx::section_timer t; + + // create distributed DESA class + using range_t = desa_t::range_t; + desa_t idx(c); + + if (constructArg.getValue()) { + if (c.rank() == 0) { + std::cout << "constructing DESA (SA+LCP+LC)..." << std::endl; + } + // read input file into in-memory string + std::string input_str = mxx::file_block_decompose(fileArg.getValue().c_str(), c); + t.end_section("read input file"); + // construct DESA from scratch + idx.construct(input_str.begin(), input_str.end(), c); + t.end_section("construct idx"); + + if (outArg.getValue() != "") { + // store DESA to given basename + if (c.rank() == 0) { + std::cout << "saving DESA to basename `" << outArg.getValue() << "` ..." << std::endl; + } + idx.write(outArg.getValue(), c); + } + } else { + if (outArg.getValue() != "") { + if (c.rank() == 0) { + std::cerr << "WARNING: --outfile argument will be ignored since the input is loaded from file (don't use in conjuction with --load)." << std::endl; + } + } + if (c.rank() == 0) { + std::cout << "loading DESA (SA+LCP+LC) from basename `" << loadArg.getValue() << "` ..." << std::endl; + } + idx.read(fileArg.getValue(), loadArg.getValue(), c); + + } + + // query benchmarking? + if (queryArg.getValue() != "") { + strings ss = strings::from_dfile(queryArg.getValue(), c); + t.end_section("read patterns file"); + + // run locate a couple of times + int reps = 10; + for (int i = 0; i < reps; ++i) { + std::vector mysols = idx.bulk_locate(ss); + t.end_section("bulk_locate"); + } + } + + } catch (TCLAP::ArgException& e) { + std::cerr << "error: " << e.error() << " for arg " << e.argId() << std::endl; + exit(EXIT_FAILURE); + } + + return 0; +} diff --git a/src/kmer_partition.cpp b/src/kmer_partition.cpp new file mode 100644 index 0000000..611687a --- /dev/null +++ b/src/kmer_partition.cpp @@ -0,0 +1,233 @@ +/* + * Copyright 2015 Georgia Institute of Technology + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +// using TCLAP for command line parsing +#include + +// mxx dependency +#include + +// suffix array construction +#include +#include // for random DNA +#include +#include +#include +#include +#include + +std::string file2string(const std::string& filename) { + // read input file into in-memory string + std::string input_str; + std::ifstream t(filename.c_str()); + std::stringstream buffer; + buffer << t.rdbuf(); + input_str = buffer.str(); + return input_str; +} + + +template +struct tl_desa { + index_t total_size; + std::vector lc; + std::vector lcp; + std::vector off; // offsets + + using range_t = std::pair; + + using it_t = typename std::vector::const_iterator; + rmq minq; + + void construct(const std::vector& LCP, const std::vector Lc, index_t maxsize) { + // local ansv !? + off = sample_lcp(LCP, maxsize); + + // create sampled DESA + std::cout << "creating TL DT with " << off.size() << " els" << std::endl; + lcp.resize(off.size()); + lc.resize(off.size()); + + for (size_t i = 0; i < off.size(); ++i) { + lcp[i] = LCP[off[i]]; + lc[i] = Lc[off[i]]; + } + + // constrct RMQ over new sampled LCP + minq = rmq(lcp.begin(), lcp.end()); + } + + + range_t locate(const std::string& P) { + + size_t n = lcp.size(); + size_t m = P.size(); + size_t l = 0; + size_t r = n-1; + + // get first child interval and depth + size_t i = this->minq(l+1, r); + index_t q = this->lcp[i]; + + // blind search + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // check if we've reached the end of the pattern + if (q >= m) { + break; + } + + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->lc[i]; // == S[SA[l]+lcpv] for first iter + if (lc == P[q]) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + + i = this->minq(l+1, r); + } while (l < r && this->lcp[i] == q); + + if (this->lcp[i] == q) { + if (l+1 < r) { + i = this->minq(l+1, r); + } else { + i = l; + } + } + q = this->lcp[i]; + } + + // return the range using offsets + if (r == n-1) { + return range_t(off[l], total_size); + } else { + return range_t(off[l], off[r+1]); + } + } +}; + +// table: inclusive prefix sum? +void evaluate_partition(const std::vector& table) { + + for (int p = 2; p <= 1024; p <<= 1) { + // partitining by bisection + std::vector part = partition(table, p); + + // partition pointers to partition size + int start = 0; + std::vector part_size(p, 0); + for (int i = 0; i < p; ++i) { + int end = (i+1 < p) ? part[i+1] : table.size(); + //table[start] // inclusive !? 
+ // table[i] = + // hist[i] = table[i] - table[i-1] + // sum [start,end) -> + // table[end-1] - table[start-1] + part_size[i] = table[end-1]; + if (start > 0) + part_size[i] -= table[start-1]; + + start = end; + } + + size_t max_size = *std::max_element(part_size.begin(), part_size.end()); + size_t min_size = *std::min_element(part_size.begin(), part_size.end()); + size_t avg = table.back() / p; + + std::cout << "for p=" << p << " size range [" << max_size << "," << min_size << "], load imbalance: " << (max_size - avg)*100./avg << " %" << std::endl; + } +} + +int main(int argc, char *argv[]) +{ + // set up mxx / MPI + mxx::env e(argc, argv); + + try { + // define commandline usage + TCLAP::CmdLine cmd("Load Imbalance study for partitioning of kmers"); + TCLAP::ValueArg fileArg("f", "file", "Input filename.", true, "", "filename"); + cmd.add(fileArg); + TCLAP::SwitchArg trieArg("t", "trie", "use dynamic TL trie", false); + cmd.add(trieArg); + cmd.parse(argc, argv); + + if (!trieArg.getValue()) { + int bits = 25; + + std::string str = file2string(fileArg.getValue()); + alphabet a = alphabet::from_sequence(str.begin(), str.end()); + unsigned int l = a.bits_per_char(); + std::cout << "alphabet: " << a << std::endl; + int k = bits / l; + std::cout << "using k = " << k << std::endl; + + // scan through string to create kmer histogram + std::vector hist = kmer_hist(str.begin(), str.end(), k, a); + std::vector table(hist.size()); + std::cout << "kmer table size: " << table.size()*sizeof(size_t) / 1024 << " kiB" << std::endl; + + std::partial_sum(hist.begin(), hist.end(), table.begin()); + evaluate_partition(table); + + } else { + // use top level trie + std::cout << "loading string" << std::endl; + std::string str = file2string(fileArg.getValue()); + std::cout << "construting DESA" << std::endl; + desa_index idx; + idx.construct(str.begin(), str.end()); + + + std::cout << "constructing TL DESA" << std::endl; + tl_desa tl; + size_t maxsize = str.size() / (1024*128); + tl.construct(idx.LCP, idx.Lc, maxsize); + std::cout << "constructed TL with " << tl.lcp.size() << " elements" << std::endl; + std::cout << "TL total size: " << tl.off.size()*(2*sizeof(size_t)+1) / 1024 << " kiB" << std::endl; + + // use TL offsets and create inclusive prefix sum + std::vector table(tl.off.size()); + + for (size_t i = 0; i+1 < table.size(); ++i) { + table[i] = tl.off[i+1]; + } + table.back() = str.size(); + + evaluate_partition(table); + } + + // catch any TCLAP exception + } catch (TCLAP::ArgException& e) { + std::cerr << "error: " << e.error() << " for arg " << e.argId() << std::endl; + } + + return 0; +} diff --git a/src/mkpattern.cpp b/src/mkpattern.cpp new file mode 100644 index 0000000..cefa3da --- /dev/null +++ b/src/mkpattern.cpp @@ -0,0 +1,58 @@ +#include +#include +#include +#include +#include +#include + +std::string read_file(const std::string& filename) { + // read input file into in-memory string + std::string input_str; + std::ifstream t(filename.c_str()); + std::stringstream buffer; + buffer << t.rdbuf(); + input_str = buffer.str(); + return input_str; +} + +std::vector rand_patterns(const std::string& str, size_t len, size_t num) { + std::vector patterns; + patterns.reserve(num); + std::minstd_rand g; + std::uniform_int_distribution d(0, str.size()-len-1); + for (size_t i = 0; i < num; ++i) { + size_t start = d(g); + patterns.emplace_back(str.begin() + start, str.begin() + start + len); + } + return patterns; +} + +int main(int argc, char *argv[]) +{ + if (argc < 4) { + std::cerr << "Usage: 
./mkpattern " << std::endl; + exit(EXIT_FAILURE); + } + + std::string filename(argv[1]); + int num = std::stoi(argv[2]); + int len = std::stoi(argv[3]); + + if (num <= 0 || len < 1) { + std::cerr << "Usage: ./mkpattern " << std::endl; + exit(EXIT_FAILURE); + } + + // read input file + std::string input = read_file(filename); + + // generate random patterns of given length + std::vector patterns = rand_patterns(input, len, num); + + // output patterns in text format + for (const std::string& P : patterns) { + std::cout << P << std::endl; + } + + return 0; +} diff --git a/src/print64.cpp b/src/print64.cpp index 62dd4fa..85c2b3c 100644 --- a/src/print64.cpp +++ b/src/print64.cpp @@ -30,8 +30,13 @@ int main(int argc, char** argv) { } // read file - std::vector buf(size/8); - int bytes_read = fread(&buf[0], sizeof(uint64_t), size/8, f); + size_t count = size/sizeof(uint64_t); + std::vector buf(count); + size_t read_count = fread(&buf[0], sizeof(uint64_t), count, f); + if (read_count != count) { + std::cerr << "Unexpected error reading file." << std::endl; + exit(EXIT_FAILURE); + } // output file in decimal format for (size_t i = 0; i < buf.size(); ++i) { diff --git a/src/test_slcp.cpp b/src/test_slcp.cpp new file mode 100644 index 0000000..15e43ad --- /dev/null +++ b/src/test_slcp.cpp @@ -0,0 +1,43 @@ + + +#include +#include + +#include + +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) +{ + mxx::env e(argc, argv); + mxx::comm c; + + size_t n = 1000; + size_t maxsize = 20; + + salcp_index idx; + if (c.rank() == 0) { + std::string s = rand_dna(n, 13); + //std::cout << "s = " << s << std::endl; + idx.construct(s.begin(), s.end()); + } + std::vector local_LCP = stable_distribute(idx.LCP, c); + + + std::vector off = sample_lcp_distr(local_LCP, maxsize, c); + + + std::vector goff = mxx::gatherv(off, 0, c); + + if (c.rank() == 0) { + seq_check_sample(idx.LCP, goff, maxsize); + } + + + return 0; +} diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt deleted file mode 100644 index e172452..0000000 --- a/src/tests/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -cmake_minimum_required(VERSION 2.6) - -# project settings -project(psac-tests) - -# RMQ -add_executable(test_rmq test_rmq.cpp) -# Bitops -add_executable(test_bitops test_bitops.cpp) - diff --git a/src/tests/test_bitops.cpp b/src/tests/test_bitops.cpp deleted file mode 100644 index 7e608ee..0000000 --- a/src/tests/test_bitops.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2015 Georgia Institute of Technology - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include "bitops.hpp" - - -// TODO do these things as google test -int main() -{ - // run a large set of random tests - unsigned int sum = 0; - int num_tests = 100000000; - for (int i = 0; i < num_tests; ++i) - { - uint64_t x = 0; - x |= static_cast(rand()) << 32; - x |= static_cast(rand()); - - unsigned int t1 = trailing_zeros(x); - unsigned int t2 = reference_trailing_zeros(x); - //unsigned int t2 = t1; - //sum += t2; - - unsigned int l1 = leading_zeros(x); - unsigned int l2 = reference_leading_zeros(x); - //unsigned int l2 = l1; - //sum += l2; - - if (t1 != t2) { - std::cerr << "Error! trailing zeros: " << t1 << " != " << t2 << std::endl; - } - if (l1 != l2) { - std::cerr << "Error! leading zeros: " << l1 << " != " << l2 << std::endl; - } - } - return sum; -} diff --git a/src/tests/test_rmq.cpp b/src/tests/test_rmq.cpp deleted file mode 100644 index 5004d67..0000000 --- a/src/tests/test_rmq.cpp +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright 2015 Georgia Institute of Technology - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -#include "rmq.hpp" - -template class std::vector; - -class rmq_tester -{ - std::size_t size; - std::vector els; - rmq::iterator>* minquery; -public: - rmq_tester(std::size_t size) : size(size), els(size) - { - std::generate(els.begin(), els.end(), [](){return std::rand();}); - minquery = new rmq::iterator>(els.begin(), els.end()); - } - - bool test(int start, int end) - { - std::vector::iterator min_it = minquery->query(els.begin()+start, els.begin()+end); - if (*min_it != *std::min_element(els.begin()+start, els.begin()+end)) - { - std::cerr << "ERROR: range(" << start << "," << end << ")=" << *(min_it) << " at i=" << min_it - els.begin() << std::endl; - return false; - } - else - return true; - } - - void test_all() - { - for (std::size_t i = 0; i < size; ++i) - for (std::size_t j = i+1; j < size; ++j) - { - if (!test(i,j)) - { - std::cerr << "Error with range(" << i << "," << j << ")" << std::endl; - exit(EXIT_FAILURE); - } - } - } - - ~rmq_tester() - { - delete minquery; - } -}; - -void timing_comp(std::size_t size) -{ - auto start_time = std::chrono::high_resolution_clock::now(); - std::vector els(size); - std::generate(els.begin(), els.end(), [](){return std::rand() % 1000;}); - - auto now = std::chrono::high_resolution_clock::now(); - long duration_ms = std::chrono::duration_cast(now - start_time).count(); - start_time = now; - std::cerr << "Generate input: " << duration_ms << std::endl; - - rmq::iterator> rmq(els.begin(), els.end()); - - now = std::chrono::high_resolution_clock::now(); - duration_ms = std::chrono::duration_cast(now - start_time).count(); - std::cerr << "Construction RMQ: " << duration_ms << std::endl; - - int rand_tests = size; - //double rmq_queries = 0.0; - //double minel_queries = 0.0; - start_time = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < rand_tests; ++i) - { - // get random range and start timing - int 
range_start = std::rand() % (size - 2); - int range_end = std::rand() % (size - range_start - 1) + range_start; - - // query RMQ - auto min = rmq.query(els.begin()+range_start, els.begin()+range_end); - - // stop timing - //now = std::chrono::high_resolution_clock::now(); - //duration_ms = std::chrono::duration_cast(now - start_time).count(); - //rmq_queries += duration_ms; - //start_time = now; - - // use linear std::min_element - auto min2 = std::min_element(els.begin()+range_start, els.begin() + range_end); - - // stop time - //now = std::chrono::high_resolution_clock::now(); - //duration_ms = std::chrono::duration_cast(now - start_time).count(); - //minel_queries += duration_ms; - - if (*min != *min2) - { - std::cerr << "ERROR: different mins for range(" << range_start << "," << range_end << ")" << std::endl; - std::cerr << "rmq at " << min - els.begin() << " = " << *min << std::endl; - std::cerr << "mne at " << min2 - els.begin() << " = " << *min2 << std::endl; - exit(EXIT_FAILURE); - } - else - std::cerr << "." << std::endl; - } - - now = std::chrono::high_resolution_clock::now(); - duration_ms = std::chrono::duration_cast(now - start_time).count(); - std::cout << "RMQ queries total time: " << duration_ms/1000.0 << "s" << std::endl; - //std::cout << "std::min_element queries total time: " << minel_queries/1000.0 << "s" << std::endl; -} - -int main(int argc, char *argv[]) -{ - //timing_comp(1000000037); - /* - for (int size = 1; size < 1000; ++size) - { - rmq_tester rmq(size); - std::cerr << "Testing with size: " << size << std::endl; - rmq.test_all(); - } - */ - //rmq_tester rmq(10000); - //rmq.test_all(); - timing_comp(100000); - - - - return 0; -} diff --git a/src/tldt.cpp b/src/tldt.cpp new file mode 100644 index 0000000..d91147b --- /dev/null +++ b/src/tldt.cpp @@ -0,0 +1,250 @@ + +#include +#include +#include +#include +#include + +// using TCLAP for command line parsing +#include + +// mxx dependencies +#include +#include +#include +#include + +#include +#include +#include +#include +//#include +//#include + +#include "seq_query.hpp" +#include "desa.hpp" + + +template +struct tl_desa { + index_t total_size; + std::vector lc; + std::vector lcp; + std::vector off; // offsets + + using range_t = std::pair; + + using it_t = typename std::vector::const_iterator; + rmq minq; + + void construct(const std::vector& LCP, const std::vector Lc, index_t maxsize) { + // local ansv !? + index_t n = LCP.size(); + total_size = n; + + struct node { + index_t lcp; + index_t pos; + index_t l; + }; + + std::stack st; + st.push(node{0,0,0,0}); + + index_t total_out = 1; + std::vector do_output(n, false); + do_output[0] = true; + + for (index_t i = 1; i < n; ++i) { + if (LCP[i] == 0) { + do_output[i] = true; + continue; + } + while (!st.empty() && st.top().lcp > LCP[i]) { + node u = st.top(); + st.pop(); + // u.pos has range [u.l, .. , i) + index_t parent_size = i - u.l; + if (parent_size > maxsize) { + // output but in inverse order !? 
+ do_output[u.pos] = true; + ++total_out; + } + } + + if (st.empty()) { + // cant happen + assert(false); + } else if (st.top().lcp == LCP[i]) { + st.push(node{LCP[i], i, st.top().l}); + } else { + assert(st.top().lcp < LCP[i]); + st.push(node{LCP[i], i, st.top().pos}); + } + } + + std::cout << "creating TL DT with " << total_out << " els" << std::endl; + lcp.resize(total_out); + lc.resize(total_out); + off.resize(total_out); + + index_t j = 0; + for (index_t i = 0; i < n; ++i) { + if (do_output[i]) { + lcp[j] = sa.LCP[i]; + lc[j] = sa.Lc[i]; + off[j] = i; + ++j; + } + } + + // constrct RMQ over new sampled LCP + minq = rmq(lcp.begin(), lcp.end()); + } + + + range_t locate(const std::string& P) { + + size_t n = lcp.size(); + size_t m = P.size(); + size_t l = 0; + size_t r = n-1; + + // get first child interval and depth + size_t i = this->minq(l+1, r); + index_t q = this->lcp[i]; + + // blind search + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // check if we've reached the end of the pattern + if (q >= m) { + break; + } + + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->lc[i]; // == S[SA[l]+lcpv] for first iter + if (lc == P[q]) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + + i = this->minq(l+1, r); + } while (l < r && this->lcp[i] == q); + + if (this->lcp[i] == q) { + if (l+1 < r) { + i = this->minq(l+1, r); + } else { + i = l; + } + } + q = this->lcp[i]; + } + + // return the range using offsets + if (r == n-1) { + return range_t(off[l], total_size); + } else { + return range_t(off[l], off[r+1]); + } + } +}; + + +int main(int argc, char *argv[]) { + // set up mxx / MPI + mxx::env e(argc, argv); + mxx::env::set_exception_on_error(); + mxx::comm c; + mxx::print_node_distribution(c); + + // given a file, compute suffix array and lcp array, how to do count/locate query? + // via sa_range() + /* + if (argc < 3) { + std::cerr << "Usage: ./xx " << std::endl; + } + */ + try { + // define commandline usage + TCLAP::CmdLine cmd("Distributed Enhanced Suffix Array"); + TCLAP::ValueArg fileArg("f", "file", "Input string filename.", true, "", "filename"); + cmd.add(fileArg); + TCLAP::ValueArg loadArg("l", "load", "Load index from given basename", false, "", "filename"); + TCLAP::SwitchArg constructArg("c", "construct", "Construct SA/LCP/Lc from input file", false); + cmd.xorAdd(loadArg, constructArg); // either load or construct SA/LCP + TCLAP::ValueArg outArg("o", "outfile", "Output file base name. If --construct was used, this stores the resulting DESA.", false, "", "filename"); + cmd.add(outArg); + + TCLAP::ValueArg queryArg("q", "query", "Query file for benchmarking querying.", false, "", "filename"); + cmd.add(queryArg); + cmd.parse(argc, argv); + + + mxx::section_timer t; + + // create distributed DESA class + using index_t = uint64_t; + using range_t = dist_desa::range_t; + dist_desa idx(c); + + if (constructArg.getValue()) { + if (c.rank() == 0) { + std::cout << "constructing DESA (SA+LCP+LC)..." 
+                std::cout << "constructing DESA (SA+LCP+LC)..." << std::endl;
+            }
+            // read input file into in-memory string
+            std::string input_str = mxx::file_block_decompose(fileArg.getValue().c_str(), c);
+            t.end_section("read input file");
+            // construct DESA from scratch
+            idx.construct(input_str.begin(), input_str.end(), c);
+            t.end_section("construct idx");
+
+            if (outArg.getValue() != "") {
+                // store DESA to given basename
+                if (c.rank() == 0) {
+                    std::cout << "saving DESA to basename `" << outArg.getValue() << "` ..." << std::endl;
+                }
+                idx.write(outArg.getValue(), c);
+            }
+        } else {
+            if (outArg.getValue() != "") {
+                if (c.rank() == 0) {
+                    std::cerr << "WARNING: --outfile argument will be ignored since the input is loaded from file (don't use in conjunction with --load)." << std::endl;
+                }
+            }
+            if (c.rank() == 0) {
+                std::cout << "loading DESA (SA+LCP+LC) from basename `" << loadArg.getValue() << "` ..." << std::endl;
+            }
+            idx.read(fileArg.getValue(), loadArg.getValue(), c);
+
+        }
+
+        // query benchmarking?
+        if (queryArg.getValue() != "") {
+            strings ss = strings::from_dfile(queryArg.getValue(), c);
+            t.end_section("read patterns file");
+
+            // run locate a couple of times
+            int reps = 10;
+            for (int i = 0; i < reps; ++i) {
+                std::vector mysols = idx.bulk_locate(ss);
+                t.end_section("bulk_locate");
+            }
+        }
+
+    } catch (TCLAP::ArgException& e) {
+        std::cerr << "error: " << e.error() << " for arg " << e.argId() << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    return 0;
+}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 7ee9532..ee0a64a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -5,28 +5,35 @@ project(psac-test)

 # bitops TODO: don't use mxx for these sequential tests!
 add_executable(test-bitops test_bitops.cpp)
-target_link_libraries(test-bitops mxx-gtest-main)
+target_link_libraries(test-bitops psaclib mxx-gtest-main)

 add_executable(test-rmq test_rmq.cpp)
-target_link_libraries(test-rmq mxx-gtest-main)
+target_link_libraries(test-rmq psaclib mxx-gtest-main)
+
+add_executable(test-samplelcp test_samplelcp.cpp)
+target_link_libraries(test-samplelcp psac-dss-lib mxx-gtest-main)

 # Parallel tests
 add_executable(test-ansv test_ansv.cpp)
-target_link_libraries(test-ansv mxx-gtest-main)
+target_link_libraries(test-ansv psaclib mxx-gtest-main)

 add_executable(test-ss test_stringset.cpp)
-target_link_libraries(test-ss mxx-gtest-main)
+target_link_libraries(test-ss psaclib mxx-gtest-main)

 add_executable(test-gsa test_gsa.cpp)
-target_link_libraries(test-gsa mxx-gtest-main rt)
+target_link_libraries(test-gsa psaclib mxx-gtest-main)

 add_executable(test-suffixtree test_suffixtree.cpp)
-target_link_libraries(test-suffixtree mxx-gtest-main rt)
+target_link_libraries(test-suffixtree psaclib mxx-gtest-main)

 add_executable(test-psac test_psac.cpp)
-target_link_libraries(test-psac mxx-gtest-main)
-target_link_libraries(test-psac divsufsort)
-target_link_libraries(test-psac divsufsort64)
+target_link_libraries(test-psac psac-dss-lib mxx-gtest-main)
+
+add_executable(test-seq-query test_seq_query.cpp)
+target_link_libraries(test-seq-query psac-dss-lib mxx-gtest-main)
+
+add_executable(test-desa test_desa.cpp)
+target_link_libraries(test-desa psac-dss-lib mxx-gtest-main)

 # standalone tests
 #add_executable(test-ss test_stringset.cpp)
@@ -34,4 +41,4 @@ target_link_libraries(test-psac divsufsort64)

 # standalone tests
 add_executable(test-lcp test_glcp.cpp)
-target_link_libraries(test-lcp ${EXTRA_LIBS} rt)
+target_link_libraries(test-lcp psaclib)
diff --git a/test/test_desa.cpp b/test/test_desa.cpp
new file mode 100644
index 0000000..1370dab
--- /dev/null
+++ b/test/test_desa.cpp
@@ -0,0 +1,154 @@
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "seq_query.hpp"
+#include "desa.hpp"
+#include "tldt.hpp"
+
+
+using index_t = uint64_t;
+using range_t = dist_desa::range_t;
+
+
+inline std::vector<std::pair<std::string, range_t>> mississippi_testcase() {
+    using r = range_t;
+
+    std::vector<std::pair<std::string, range_t>> patterns;
+    patterns.emplace_back("i", r(0,4));
+    patterns.emplace_back("mississip", r(4,5));
+    patterns.emplace_back("ss", r(9,11));
+    patterns.emplace_back("pp", r(6,7));
+
+    patterns.emplace_back("m", r(4,5));
+    patterns.emplace_back("pi", r(5,6));
+    patterns.emplace_back("is", r(2,4));
+    patterns.emplace_back("ssi", r(9,11));
+
+    patterns.emplace_back("p", r(5,7));
+    patterns.emplace_back("ississippi", r(3,4));
+    patterns.emplace_back("si", r(7,9));
+    patterns.emplace_back("miss", r(4,5));
+
+    patterns.emplace_back("s", r(7,11));
+    patterns.emplace_back("ip", r(1,2));
+    patterns.emplace_back("ppi", r(6,7));
+    patterns.emplace_back("mississippi", r(4,5));
+
+    return patterns;
+}
+
+
+TEST(DesaTest,MississippiLocatePossible) {
+    mxx::comm c;
+    ASSERT_TRUE(c.size() <= 7) << "the `mississippi` test shouldn't run with more processors than character pairs";
+
+    std::string s;
+    if (c.rank() == 0) {
+        s = "mississippi";
+    }
+    std::string input_str = mxx::stable_distribute(s, c);
+    // construct DESA
+    dist_desa idx(c);
+    idx.construct(input_str.begin(), input_str.end(), c);
+
+
+    auto patterns = mississippi_testcase();
+
+    // locate single pattern in distributed representation (collective op)
+    for (auto pat : patterns) {
+        std::string P = pat.first;
+        range_t exp_res = pat.second;
+
+        range_t r = idx.locate_possible(P);
+        // (EXPECTED, ACTUAL)
+        EXPECT_EQ(exp_res, r);
+    }
+}
+
+TEST(DesaTest, FileIO) {
+    mxx::comm c;
+    ASSERT_TRUE(c.size() <= 7) << "the `mississippi` test shouldn't run with more processors than character pairs";
+
+    std::string s;
+    if (c.rank() == 0) {
+        s = "mississippi";
+        std::ofstream f("miss.str");
+        f.write(&s[0],s.size());
+    }
+    std::string input_str = mxx::stable_distribute(s, c);
+    // construct DESA
+    dist_desa desa(c);
+    desa.construct(input_str.begin(), input_str.end(), c);
+
+    desa.write("desa_miss", c);
+
+    dist_desa desa2(c);
+    desa2.read("miss.str", "desa_miss", c);
+    EXPECT_EQ(desa.sa.local_SA, desa2.sa.local_SA);
+    EXPECT_EQ(desa.sa.local_LCP, desa2.sa.local_LCP);
+    EXPECT_EQ(desa.sa.local_Lc, desa2.sa.local_Lc);
+    EXPECT_EQ(desa.tli.idx.table, desa2.tli.idx.table);
+    EXPECT_EQ(desa.tli.idx.alpha, desa2.tli.idx.alpha);
+
+    // TODO: implement test for loading with smaller/larger number of processors
+    // and still check that querying works correctly
+    // -> introduce function to test queries for a given desa
+}
+
+#define PV(x, c) mxx::sync_cout(c) << "[" << c.rank() << "] " << "" # x " = " << x << std::endl;
+
+TEST(DesaTest,MississippiBulkLocatePossible) {
+    mxx::comm c;
+
+    ASSERT_TRUE(c.size() <= 7) << "the `mississippi` test shouldn't run with more processors than character pairs";
+
+    std::string s;
+    if (c.rank() == 0) {
+        s = "mississippi";
+    }
+    std::string input_str = mxx::stable_distribute(s, c);
+    // construct DESA
+    dist_desa> idx(c);
+    idx.construct(input_str.begin(), input_str.end(), c);
+
+    PV(input_str, c);
+    PV(idx.sa.local_LCP, c);
+    PV(idx.sa.local_SA, c);
+    PV(idx.tli.prefix(), c);
+    PV(idx.tli.idx.LCP, c);
+
+
+    // create `strings` from patterns
+    auto patterns = mississippi_testcase();
+    mxx::blk_dist dist(patterns.size(), c.size(), c.rank());
+
+
+    std::vector my_patterns(dist.local_size());
+
+    for (size_t i = 0; i < dist.local_size(); ++i) {
+        my_patterns[i] = patterns[i+dist.eprefix_size()].first;
+    }
+
+    strings ss = strings::from_vec(my_patterns);
+    std::vector mysols = idx.bulk_locate(ss);
+
+    // check results
+    for (size_t i = 0; i < dist.local_size(); ++i) {
+        EXPECT_EQ(patterns[i+dist.eprefix_size()].second, mysols[i]);
+    }
+}
diff --git a/test/test_psac.cpp b/test/test_psac.cpp
index c0d8771..80116b8 100644
--- a/test/test_psac.cpp
+++ b/test/test_psac.cpp
@@ -324,6 +324,8 @@ TEST(PSAC, FileIO) {
     EXPECT_EQ(sa.local_LCP, sa2.local_LCP);
     EXPECT_EQ(sa.local_Lc, sa2.local_Lc);

+    EXPECT_EQ(sa.alpha, sa2.alpha);
+
     std::vector gsa = mxx::gatherv(sa.local_SA, 0, c);
     std::vector glcp = mxx::gatherv(sa.local_LCP, 0, c);
     std::vector glc = mxx::gatherv(sa.local_Lc, 0, c);
@@ -332,6 +334,7 @@ TEST(PSAC, FileIO) {
     c.with_subset(c.rank() < c.size() - 2, [&](const mxx::comm& sc) {
         suffix_array sa3(sc);
         sa3.read("miss");
+        EXPECT_EQ(sa.alpha, sa3.alpha);
         // gather and check
         EXPECT_EQ(gsa, mxx::gatherv(sa3.local_SA, 0, sc));
         EXPECT_EQ(glcp, mxx::gatherv(sa3.local_LCP, 0, sc));
diff --git a/test/test_samplelcp.cpp b/test/test_samplelcp.cpp
new file mode 100644
index 0000000..4c80d91
--- /dev/null
+++ b/test/test_samplelcp.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2019 Georgia Institute of Technology
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @brief Unit tests for sampling the LCP
+ */
+
+#include
+
+#include
+
+#include
+#include
+
+#include // TODO: rename/move code to different file?
+#include // for rand_dna
+#include // for sequential SA/LCP construction
+#include
+
+
+
+TEST(PsacSampleLCP, SeqSmall) {
+    // sequentially get SA and LCP for random or easy input!?
+    size_t n = 10000;
+    size_t maxsize = 78;
+    std::string s = rand_dna(n, 13);
+
+
+    salcp_index idx;
+    idx.construct(s.begin(), s.end());
+
+
+    std::vector off = sample_lcp_indirect(idx.LCP, maxsize);
+
+
+    // check correctness
+    // (EXPECTED, ACTUAL)
+    seq_check_sample(idx.LCP, off, maxsize);
+}
+
+TEST(PsacSampleLCP, ParSmall) {
+    // sequentially get SA and LCP for random or easy input!?
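+    // flow below: build SA/LCP sequentially on rank 0 only, block-distribute
+    // the LCP over the communicator, sample it with the distributed variant,
+    // then gather the sampled offsets back to rank 0 and verify them against
+    // the full LCP using the same sequential checker as above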
+    size_t n = 100000;
+    size_t maxsize = 123;
+
+    mxx::comm c;
+
+    salcp_index idx;
+    if (c.rank() == 0) {
+        std::string s = rand_dna(n, 13);
+        idx.construct(s.begin(), s.end());
+    }
+
+    std::vector local_LCP = mxx::stable_distribute(idx.LCP, c);
+
+    std::vector local_off = sample_lcp_distr(local_LCP, maxsize, c);
+
+    std::vector off = mxx::gatherv(local_off, 0, c);
+
+    if (c.rank() == 0) {
+        seq_check_sample(idx.LCP, off, maxsize);
+    }
+}
diff --git a/test/test_seq_query.cpp b/test/test_seq_query.cpp
new file mode 100644
index 0000000..1b524cf
--- /dev/null
+++ b/test/test_seq_query.cpp
@@ -0,0 +1,83 @@
+
+#include
+
+#include
+
+#include
+#include // TODO: move partition tests elsewhere
+
+
+/* type parameterized index test */
+
+template
+class SeqIndexTest : public ::testing::Test {
+  public:
+    IdxType idx;
+};
+
+using index_t = uint64_t;
+using MyTypes = ::testing::Types, esa_index, bs_esa_index, desa_index, lookup_desa_index>;
+TYPED_TEST_CASE(SeqIndexTest, MyTypes);
+
+// search for patterns inside `mississippi`
+TYPED_TEST(SeqIndexTest, Locate_mississippi) {
+    std::string input_str = "mississippi";
+    using r = std::pair;
+
+    TypeParam idx;
+    idx.construct(input_str.begin(), input_str.end());
+
+    // TODO: flip arguments to: (EXPECTED, ACTUAL)
+    EXPECT_EQ(idx.locate("i"), r(0,4));
+    EXPECT_EQ(idx.locate("is"), r(2,4));
+    EXPECT_EQ(idx.locate("ip"), r(1,2));
+    EXPECT_EQ(idx.locate("ississippi"), r(3,4));
+
+    EXPECT_EQ(idx.locate("m"), r(4,5));
+    EXPECT_EQ(idx.locate("miss"), r(4,5));
+    EXPECT_EQ(idx.locate("mississip"), r(4,5));
+    EXPECT_EQ(idx.locate("mississippi"), r(4,5));
+
+    EXPECT_EQ(idx.locate("p"), r(5,7));
+    EXPECT_EQ(idx.locate("pi"), r(5,6));
+    EXPECT_EQ(idx.locate("pp"), r(6,7));
+    EXPECT_EQ(idx.locate("ppi"), r(6,7));
+
+    EXPECT_EQ(idx.locate("s"), r(7,11));
+    EXPECT_EQ(idx.locate("ss"), r(9,11));
+    EXPECT_EQ(idx.locate("si"), r(7,9));
+    EXPECT_EQ(idx.locate("ssi"), r(9,11));
+
+    // expect not to find these:
+    for (auto P : {"misx", "ississippii", "issississi", "mippi", "piss"}) {
+        r res = idx.locate(P);
+        EXPECT_EQ(res.first, res.second);
+
+    }
+}
+
+// TODO: test case for testing random string and random patterns
+// check correctness by checking left and right next SA suffixes
+
+
+
+TEST(Partition, part0) {
+    lookup_desa_index idx;
+    std::string input_str = "mississippi";
+    idx.construct(input_str.begin(), input_str.end());
+
+    int p = 6;
+    std::vector part = partition(idx.tl.table, p);
+    std::cout << "part: " << part << std::endl;
+
+    // print size stats
+    for (int i = 0; i < p; ++i) {
+        size_t start = part[i];
+        size_t end = i+1 < p ? part[i+1] : idx.tl.table.size()-1;
+        size_t num = idx.tl.table[end] - idx.tl.table[start];
+        std::cout << "[p " << i << "]: [" << start << "," << end << "): " << num*100. / idx.tl.table.back() << " % share" << std::endl;
+    }
+}
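+
+// Illustrative follow-up sketch (same assumptions as `part0` above: `partition`
+// returns at least p start offsets into the top-level lookup table, and the
+// index type is declared as in `part0`): the bucket boundaries should be
+// non-decreasing and stay inside the table.
+TEST(Partition, part0_boundaries) {
+    lookup_desa_index idx;
+    std::string input_str = "mississippi";
+    idx.construct(input_str.begin(), input_str.end());
+
+    int p = 6;
+    auto part = partition(idx.tl.table, p);
+
+    for (int i = 0; i < p; ++i) {
+        if (i > 0) {
+            EXPECT_LE(part[i-1], part[i]);          // bucket starts are sorted
+        }
+        EXPECT_LT(part[i], idx.tl.table.size());    // and point inside the table
+    }
+}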