diff --git a/.travis.yml b/.travis.yml index 792a7c0..fdde70b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -78,6 +78,7 @@ script: - ./bin/test-ansv - ./bin/test-suffixtree - ./bin/test-gsa + - ./bin/test-desa - mpiexec -np 4 ./bin/test-psac - mpiexec -np 13 ./bin/test-psac - mpiexec -np 4 ./bin/test-ansv @@ -86,6 +87,8 @@ script: - mpiexec -np 13 ./bin/test-gsa - mpiexec -np 4 ./bin/test-gsa - mpiexec -np 4 ./bin/test-ss + - mpiexec -np 4 ./bin/test-desa + after_success: # only collect coverage if compiled with gcc diff --git a/.ycm_extra_conf.py b/.ycm_extra_conf.py index 34a8e50..5cc49cc 100644 --- a/.ycm_extra_conf.py +++ b/.ycm_extra_conf.py @@ -86,6 +86,10 @@ '-I', './ext', '-I', +'./ext/mxx', +'-I', +'../ext/mxx', +'-I', './ext/mxx/include', '-I', '../ext/mxx/include', diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a60bcb..1dbbc1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,134 +3,64 @@ cmake_minimum_required(VERSION 2.6) # project settings project(psac) +#### Options +OPTION(PSAC_BUILD_EXES "Build all psac executables (command line tools & benchmark scripts)" ON) +OPTION(PSAC_BUILD_TESTS "Build unit tests" ON) +OPTION(PSAC_ENABLE_COVERAGE "Enable code coverage reporting" OFF) + ##### General Compilation Settings # Initialize CXXFLAGS. -#set(CMAKE_CXX_FLAGS "-Wall -O0 -g --std=c++11 -D_GLIBCXX_DEBUG") -#set(CMAKE_CXX_FLAGS "-Wall -Wuninitialized -O0 -g --std=c++11") -#set(CMAKE_CXX_FLAGS "-Wall -O3 -funroll-loops -msse3 --std=c++11 -g -march=native") -#set(CMAKE_CXX_FLAGS "-Wall -O3 -funroll-loops -msse3 -DNDEBUG --std=c++11 -march=native") -#set(CMAKE_CXX_FLAGS "-Wall -O3 -DNDEBUG --std=c++11 -Wuninitialized -g") - - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wuninitialized --std=c++11") -set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -O0") set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -march=native") -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE} -g") - -# Add these standard paths to the search paths for FIND_LIBRARY -# to find libraries from these locations first -if(UNIX) - set(CMAKE_LIBRARY_PATH "${CMAKE_LIBRARY_PATH} /lib /usr/lib") -endif() - -# -------------------------------------------------------------- -# Indicate CMake 2.7 and above that we don't want to mix relative -# and absolute paths in linker lib lists. -# Run "cmake --help-policy CMP0003" for more information. -# -------------------------------------------------------------- -if(COMMAND cmake_policy) - cmake_policy(SET CMP0003 NEW) -endif() - - -#### MPI -find_package(MPI REQUIRED) -if (MPI_FOUND) - #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MPI_COMPILE_FLAGS}") - #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MPI_COMPILE_FLAGS}") - #set(CMAKE_LINK_FLAGS "${CMAKE_LINK_FLAGS} ${MPI_LINK_FLAGS}") - set(EXTRA_LIBS ${EXTRA_LIBS} ${MPI_LIBRARIES}) - include_directories(SYSTEM ${MPI_INCLUDE_PATH}) -else (MPI_FOUND) - message(SEND_ERROR "This application cannot compile without MPI") -endif (MPI_FOUND) - - - - -## add ScoreP support - - -option(ENABLE_PROFILING_SCOREP "Enable ScoreP profiling support." 
OFF) - -#if(ENABLE_PROFILING_SCOREP) -# message(STATUS "Profiling with ScoreP enabled.") -# -# execute_process(COMMAND scorep-config --cxxflags --nocompiler -# OUTPUT_VARIABLE SCOREP_CXX_FLAGS) -# ##RESULT_VARIABLE SCOREP_CONFIG_RETURN) -# #if(NOT SCOREP_CONFIG_RETURN EQUAL 0) -# # message(FATAL_ERROR "Can NOT execute 'scorep-config' at $ENV{SCOREP_ROOT}/bin/scorep-config - check file permissions") -# #endif() -# execute_process(COMMAND scorep-config --mpp=mpi --ldflags -# OUTPUT_VARIABLE SCOREP_LD_FLAGS) -# execute_process(COMMAND scorep-config --mpp=mpi --libs -# OUTPUT_VARIABLE SCOREP_LIBS) -# string(STRIP "${SCOREP_LIBS}" SCOREP_LIBS) -# -# #execute_process(COMMAND scorep-config --mpp=mpi --libs -# # OUTPUT_VARIABLE SCOREP_LIBFLAGS) -# #string(STRIP "${SCOREP_LIBFLAGS}" SCOREP_LIBFLAGS) -# -# # subsystem iniialization file -# execute_process(COMMAND scorep-config --mpp=mpi --nocuda --noopencl --adapter-init -# OUTPUT_VARIABLE SCOREP_INIT_FILE) -# file(WRITE ${CMAKE_BINARY_DIR}/scorep_init.c "${SCOREP_INIT_FILE}") -# -# #if(SCOREP_ENABLE) -# set(SCOREP_SRCFILES "${CMAKE_BINARY_DIR}/scorep_init.c") -# #endif(SCOREP_ENABLE) -# -# -# # TODO: possibly put the LIBS to target_link_libraries -# set(CMAKE_CXX_LINK_FLAGS "${CMAKE_CXX_LINK_FLAGS} ${SCOREP_LD_FLAGS} ${SCOREP_LIBS}") -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SCOREP_CXX_FLAGS}") -# #set(EXTRA_LIBS ${EXTRA_LIBS} ${SCOREP_LIBS}) -#endif(ENABLE_PROFILING_SCOREP) - - -if(ENABLE_PROFILING_SCOREP) - message(STATUS "Profiling with ScoreP enabled.") - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "scorep") - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "scorep") -endif(ENABLE_PROFILING_SCOREP) ### Test Coverage -OPTION(ENABLE_COVERAGE "Enable code coverage reporting" OFF) -if(ENABLE_COVERAGE) +if(PSAC_ENABLE_COVERAGE) # turn off stack protection for gcov coverage, because the stack protector shows # up as a never taken branch, and thus turns any last statement in a function # with a stack procetor into a partially covered statement. 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage -fno-stack-protector") -endif(ENABLE_COVERAGE) +endif(PSAC_ENABLE_COVERAGE) ###### Executable and Libraries # Save libs and executables in the same place -set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib CACHE PATH "Output directory for libraries" ) -set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin CACHE PATH "Output directory for applications" ) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -include_directories("${PROJECT_SOURCE_DIR}/include") -include_directories("${PROJECT_SOURCE_DIR}/ext/mxx") -include_directories("${PROJECT_SOURCE_DIR}/ext/mxx/include") -include_directories("${PROJECT_SOURCE_DIR}/ext") -include_directories("${PROJECT_SOURCE_DIR}/ext/cxx-prettyprint") +## psac header library +add_library(psaclib INTERFACE) +target_include_directories(psaclib INTERFACE include) + +## psac dependencies # load mxx and its gtest wrapper -add_subdirectory("${PROJECT_SOURCE_DIR}/ext/mxx") +set(MXX_BUILD_TESTS OFF CACHE BOOL "disable building mxx tests" FORCE) +add_subdirectory(ext/mxx) +target_link_libraries(psaclib INTERFACE mxx) # include libdivsufsort (with 64bit support but without examples) set(BUILD_DIVSUFSORT64 ON CACHE BOOL "enables divsufsort 64bit functions" FORCE) -set(BUILD_EXAMPLES OFF CACHE BOOL "enables divsufsort 64bit functions" FORCE) -add_subdirectory("${PROJECT_SOURCE_DIR}/ext/libdivsufsort") -include_directories(${libdivsufsort_BINARY_DIR}/include) -include_directories(${libdivsufsort_SOURCE_DIR}/include) +set(BUILD_EXAMPLES OFF CACHE BOOL "enables divsufsort examples" FORCE) +add_subdirectory(ext/libdivsufsort) + +# prettyprint +target_include_directories(psaclib INTERFACE ext/cxx-prettyprint) +# divsufsort (integrate into psac as `psac-dss-lib`) +add_library(psac-dss-lib INTERFACE) +target_link_libraries(psac-dss-lib INTERFACE psaclib) +target_include_directories(psac-dss-lib INTERFACE ${libdivsufsort_BINARY_DIR}/include) +target_link_libraries(psac-dss-lib INTERFACE divsufsort divsufsort64) -# add own subdirectories -add_subdirectory(src) -add_subdirectory(src/tests) + +## build executables +if(PSAC_BUILD_EXES) + add_subdirectory(src) +endif() # build tests -add_subdirectory(test) +if (PSAC_BUILD_TESTS) + add_subdirectory(test) +endif() diff --git a/ext/mxx b/ext/mxx index f09a41c..4f19d42 160000 --- a/ext/mxx +++ b/ext/mxx @@ -1 +1 @@ -Subproject commit f09a41cde6ba705eaf87a2d3798abed0ed9d1fd4 +Subproject commit 4f19d4293b015f631f2a47dbe255091aed36a8ca diff --git a/include/alphabet.hpp b/include/alphabet.hpp index 3dda846..e028958 100644 --- a/include/alphabet.hpp +++ b/include/alphabet.hpp @@ -19,7 +19,10 @@ #include #include #include +#include +#include #include +#include #include "bitops.hpp" @@ -41,6 +44,7 @@ std::string rand_dna(std::size_t size, int seed) { return str; } +/// scans the input sequence and updates the given frequency histogram of characters template void update_histogram(Iterator begin, Iterator end, std::vector& hist) { using char_type = typename std::iterator_traits::value_type; @@ -54,6 +58,7 @@ void update_histogram(Iterator begin, Iterator end, std::vector& hist) { } } +/// Returns a frequency histogram of characters from a given input sequence template std::vector get_histogram(Iterator begin, Iterator end, std::size_t size = 0) { if (size == 0) @@ -63,6 +68,7 @@ std::vector get_histogram(Iterator begin, Iterator end, std::size_t size = 0) return 
hist; } +/// Returns a frequency histogram of characters from a given distributed stringset (collective call) template std::vector alphabet_histogram(const StringSet& ss, const mxx::comm& comm) { std::vector hist(256, 0); @@ -74,6 +80,7 @@ std::vector alphabet_histogram(const StringSet& ss, const mxx::comm& co return out_hist; } +/// Returns a frequency histogram of characters from a given input sequence. template std::vector alphabet_histogram(InputIterator begin, InputIterator end) { static_assert(std::is_same::value_type, char>::value, "Iterator must be of value type `char`."); @@ -82,6 +89,7 @@ std::vector alphabet_histogram(InputIterator begin, InputIterator end) return hist; } +/// Returns a frequency histogram of characters from a given distributed input sequence (collective call) template std::vector alphabet_histogram(InputIterator begin, InputIterator end, const mxx::comm& comm) { static_assert(std::is_same::value_type, char>::value, "Iterator must be of value type `char`."); @@ -92,15 +100,18 @@ std::vector alphabet_histogram(InputIterator begin, InputIterator end, } - - +/** + * @brief Class representing an alphabet with character size <= 2 bytes. + */ template class alphabet { static_assert(sizeof(CharType) <= 2, "Dynamic alphabet supports at most 2-byte (16bits) alphabets"); public: - + /// the character type of the alphabet using char_type = CharType; + /// unsigned version of `char_type` using uchar_type = typename std::make_unsigned::type; + /// limit of `uchar_type`, defines size of mapping table static constexpr uchar_type max_uchar = std::numeric_limits::max(); /// executes the given function on every char in the alphabet @@ -109,7 +120,7 @@ class alphabet { inline void for_each_char(Func f) const { for (uchar_type c = 0; ; ++c) { if (chars_used[c]) { - f(c); + f(static_cast(c)); } if (c == max_uchar) break; @@ -118,25 +129,22 @@ class alphabet { private: + /// bitmap of characters in the alphabet std::vector chars_used; /// maps each unsigned char to a new integer using at most log(sigma+1) bits std::vector mapping_table; + /// maps the encoded characters back to their original std::vector inverse_mapping; + /// alphabet size (number of unique characters excl. 
'\0') unsigned int m_sigma; + /// number of bits each character can be represented/encoded as unsigned int m_bits_per_char; - inline void init_mapping_table() { - mapping_table.resize(max_uchar+1, 0); - uint16_t mapped = 1; // start with 1 to include special char '\0' ('$') - for_each_char([&](uchar_type c) { - mapping_table[c] = mapped; - ++mapped; - }); - } - inline void init_sizes() { + /// initializes the alphabet size and bit sizes + void init_sizes() { unsigned int num_chars = 0; for_each_char([&](uchar_type) { ++num_chars; @@ -145,13 +153,25 @@ class alphabet { m_bits_per_char = ceillog2(m_sigma+1); } - inline void init_inverse_mapping() { + /// initializes the alphabet mapping table + void init_mapping_table() { + mapping_table.resize(max_uchar+1, 0); + uint16_t mapped = 1; // start with 1 to include special char '\0' ('$') + for_each_char([&](uchar_type c) { + mapping_table[c] = mapped; + ++mapped; + }); + } + + /// initializes the inverse mapping table + void init_inverse_mapping() { inverse_mapping.push_back('\0'); for_each_char([&](uchar_type c) { inverse_mapping.push_back(c); }); } + /// constructs the alphabet from a given character frequency histogram template alphabet(const std::vector& hist) { assert(hist.size() == max_uchar+1); @@ -173,14 +193,14 @@ class alphabet { alphabet& operator=(const alphabet&) = default; alphabet& operator=(alphabet&&) = default; + /// Constructs the alphabet from a given character frequency histogram template static alphabet from_hist(const std::vector& hist) { alphabet a(hist); return a; } - - // sequential version + /// Constructs the alphabet from a given sequence template static alphabet from_sequence(Iterator begin, Iterator end) { static_assert(std::is_same::value_type>::value, "Character type of alphabet must match the value type of input sequence"); @@ -188,8 +208,7 @@ class alphabet { return alphabet::from_hist(alphabet_hist); } - - // MPI parallel version (performs reduction over histogram) + /// Constructs the alphabet from a given distributed sequence (collective call) template static alphabet from_sequence(Iterator begin, Iterator end, const mxx::comm& comm) { static_assert(std::is_same::value_type>::value, "Character type of alphabet must match the value type of input sequence"); @@ -198,42 +217,51 @@ class alphabet { return alphabet::from_hist(alphabet_hist); } - static alphabet from_string(const std::string& str, const mxx::comm& comm) { + /// Constructs the alphabet from a given string + static alphabet from_string(const std::basic_string& str) { + return alphabet::from_sequence(str.begin(), str.end()); + } + + /// Constructs the alphabet from a given distributed string (collective call) + static alphabet from_string(const std::basic_string& str, const mxx::comm& comm) { return alphabet::from_sequence(str.begin(), str.end(), comm); } + /// Constructs the alphabet from a distributed stringset (collective call) template static alphabet from_stringset(const StringSet& ss, const mxx::comm& comm) { std::vector alphabet_hist = alphabet_histogram(ss, comm); return alphabet::from_hist(alphabet_hist); } + /// Returns the number of unique characters in the alphabet (i.e. 
the + // alphabet size) inline unsigned int sigma() const { return m_sigma; } + /// Returns the alphabet size (same as .sigma()) + inline unsigned int size() const { + return m_sigma; + } + + /// Returns the number of bits a single character can be represented by inline unsigned int bits_per_char() const { return m_bits_per_char; } + /// Returns the number of characters that fit into the given word_type template inline unsigned int chars_per_word() const { static_assert(sizeof(word_type) >= sizeof(char_type), "expecting the word_type to be larger than the char type"); unsigned int bits_per_word = sizeof(word_type)*8; - // TODO: this is currently a "work-around": if the type is signed, we - // can't use the msb, thus we need to subtract one + // if the type is signed, we can't use the msb, thus we need to subtract one if (std::is_signed::value) --bits_per_word; return bits_per_word/bits_per_char(); } - inline uchar_type encode(char_type c) const { - uchar_type uc = c; - assert(0 <= uc && uc < mapping_table.size()); - return mapping_table[uc]; - } - - + /// Returns the unique characters in this alphabet inline std::vector unique_chars() const { std::vector result; for_each_char([&](uchar_type c) { @@ -242,9 +270,81 @@ class alphabet { return result; } + /// encodes the given character to use at most `bits_per_char` bits + inline uchar_type encode(char_type c) const { + uchar_type uc = c; + assert(0 <= uc && uc < mapping_table.size()); + return mapping_table[uc]; + } + + /// Returns the decoded character from the encoded (compressed) character inline char_type decode(uchar_type c) const { return inverse_mapping[c]; } + + /// equality check + bool operator==(const alphabet& o) const { + return o.chars_used == this->chars_used && o.m_sigma == this->m_sigma; + } + + /// in-equality check + bool operator!=(const alphabet& o) const { + return !(*this == o); + } + + /// write alphabet to file + void write_to(std::ostream& os) const { + for_each_char([&os](char_type c) { + os.write(&c,sizeof(c)); + }); + } + + /// read alphabet from file + static alphabet read_from(std::istream& is) { + // read in everything + std::basic_string s; + char_type c; + while (is.read(&c,sizeof(char_type))) { + s.push_back(c); + } + return from_string(s); + } + + /// write alphabet to file + void write(const std::string& filename) const { + std::ofstream f(filename); + write_to(f); + } + + /// write alphabet to file (collective call) + void write(const std::string& filename, const mxx::comm& comm) const { + // only one processor writes the file + if (comm.rank() == 0) { + write(filename); + } + } + + /// read alphabet from file + void read(const std::string& filename) { + std::ifstream f(filename); + *this = alphabet::read_from(f); + } + + /// read alphabet from file (collective call) + void read(const std::string& filename, const mxx::comm& comm) { + std::basic_string s; + // only one processor reads the file and then broadcasts the alphabet + if (comm.rank() == 0) { + std::ifstream f(filename); + char_type c; + while (f.read(&c,sizeof(char_type))) { + s.push_back(c); + } + } + mxx::bcast(s, 0, comm); + + *this = alphabet::from_string(s); + } }; template @@ -379,8 +479,7 @@ class int_alphabet { template inline unsigned int chars_per_word() const { unsigned int bits_per_word = sizeof(word_type)*8; - // TODO: this is currently a "work-around": if the type is signed, we - // can't use the msb, thus we need to subtract one + // if the type is signed, we can't use the msb, thus we need to subtract one if 
(std::is_signed::value) --bits_per_word; return bits_per_word/bits_per_char(); @@ -397,6 +496,9 @@ class int_alphabet { // c + min_char - 1 return c - offset; } + + // TODO: + // write/read file IO }; template diff --git a/include/desa.hpp b/include/desa.hpp new file mode 100644 index 0000000..36ffa0a --- /dev/null +++ b/include/desa.hpp @@ -0,0 +1,716 @@ +/* + * Copyright 2018 Georgia Institute of Technology + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DESA_HPP +#define DESA_HPP + +#include "suffix_array.hpp" +#include "rmq.hpp" +#include "lookup_table.hpp" +#include "partition.hpp" +#include "dstrings.hpp" + +#include +#include +#include +#include + +#define DESA_CONSTR_NAIVE_LC 0 + + +/** + * @brief Redistribute a distributed vector `v`, so that the calling processor + * contains global indexes [gbeg,gend). + * + * This is a collective call. + * + * @return The redistributed vector with `gend-gbeg` elements corresponding to + * global elements [gbeg, gend). + */ +template +std::vector redistr(const std::vector& v, size_t gbeg, size_t gend, const mxx::comm& comm) { + assert(gbeg <= gend); + std::vector res(gend - gbeg); + // TODO: implememnt more efficient version for this (using send/recv only) + mxx::redo_arbit_decomposition(v.begin(), v.end(), res.begin(), gend - gbeg, comm); + return res; +} + +/** + * @brief Redistribute the distributed suffix array `sa`, + * such that the calling processor contains global indexes [gbeg,gend). + * + * This is a collective call. + */ +template +void redistr_sa(suffix_array& sa, size_t gbeg, size_t gend, const mxx::comm& comm) { + sa.local_SA = redistr(sa.local_SA, gbeg, gend, comm); + sa.local_LCP = redistr(sa.local_LCP, gbeg, gend, comm); + if (!sa.local_B.empty()) { + sa.local_B = redistr(sa.local_B, gbeg, gend, comm); + } + if (CONSTRUCT_LC) { + sa.local_Lc = redistr(sa.local_Lc, gbeg, gend, comm); + } +} + + + + +template +struct tllt { + using range_t = std::pair; + lookup_index idx; + size_t n; + + template + void construct(const suffix_array& sa, const std::string& local_str, const mxx::comm& comm) { + // choose a "good" `q` + unsigned int l = sa.alpha.bits_per_char(); + unsigned int log_size = 24; // 2^24 = 16 M (*8 = 128 M) + unsigned int q = log_size / l; + + // `q` might have to be choosen smaller if the input is very small + if (sa.local_size < q) { + q = sa.local_size; + q = std::max(q,1u); + } + q = mxx::allreduce(q, mxx::min(), comm); + if (q < log_size / l && comm.rank() == 0) { + std::cerr << "[WARNING] q for q-mer partitioning was choosen smaller (q=" << q << " vs opt_q=" << log_size/l << ") due to small local size." 
<< std::endl; + } + + // construct kmer lookup table + std::vector hist = kmer_hist(local_str.begin(), local_str.end(), q, sa.alpha, comm); + idx.construct_from_hist(hist, q, sa.alpha); + } + + inline index_t minmatch() const { + return idx.k; + } + + // for partitioning + const std::vector& prefix() const { + return idx.table; + } + std::vector& prefix() { + return idx.table; + } + + template + inline range_t lookup(const String& P) const { + return idx.lookup(P); + } +}; + + + // partition information: + // after partitioning subtrees, this processor now contains the following + // indeces: + + // TODO: separate class for `arbitrary_distribution`: + // containing also the `lt_proc` and `part` tables from above and the info below + // and provide some useful functions + +struct gen_dist { + /// globally number of elements + size_t global_size; + /// global index of first element on this processor + size_t my_begin; + /// global index of first element on next processor + size_t my_end; + /// = my_end - my_begin: number of elements on this processor + size_t local_size; + + /// partition: assigns each processor it's lookup table index (shared copy) + + /// inverse partition mapping + // map each processor start index -> processor index + std::map lt_proc; // XXX: possibly optimize: use vector and binary search + + int target_processor(size_t gidx) { + auto ub = lt_proc.upper_bound(gidx); // returns iterator to first element > gidx + --ub; // then the previous element is the target processor + return ub->second; + } + + void check_correct(const mxx::comm& comm) const { + // check that these are all exclusive? + size_t next_beg = mxx::left_shift(my_begin, comm); + size_t prev_end = mxx::right_shift(my_end, comm); + + assert(my_begin <= my_end); + if (comm.rank() > 0) { + assert(prev_end == my_begin); + } else { + assert(my_begin == 0); + } + + if (comm.rank()+1 < comm.size()) { + assert(next_beg == my_end); + } else { + assert(my_end == global_size); + } + } + + void print_imbalance_stats(const mxx::comm& comm) const { + size_t min_size = mxx::allreduce(local_size, mxx::min(), comm); + size_t max_size = mxx::allreduce(local_size, mxx::max(), comm); + + std::vector local_sizes = mxx::gather(local_size, 0, comm); + + size_t ex_size = global_size / comm.size(); + + double imbalance = (max_size - ex_size)*1. / ex_size; + + if (comm.rank() == 0) { + std::cout << "Repartition load im-balance: " << imbalance*100. << "%, range: [" << min_size << "," << max_size << "]" << std::endl; + std::cout << " sizes: " << local_sizes << std::endl; + } + } + + template + static gen_dist from_prefix_sizes(const std::vector& table, const mxx::comm& comm) { + mxx::section_timer t; + gen_dist d; + d.global_size = table.back(); + // partition the lookup index by processors + std::vector part = partition(table, comm.size()); + for (int i = 0; i < comm.size(); ++i) { + size_t kmer_l = part[i]; + size_t proc_begin = (kmer_l == 0) ? 0 : table[kmer_l-1]; + d.lt_proc[proc_begin] = i; + } + + // kmers from: [part[comm.rank()] .. part[comm.rank()+1]) + size_t my_kmer_l = part[comm.rank()]; + size_t my_kmer_r = (comm.rank()+1 == comm.size()) ? table.size() : part[comm.rank()+1]; + + // kmer table is incl-prefix histogram, thus my_kmer_l is not the start global address + // instead the start global address should be the previous kmer count (ie, tl[my_kmer_l-1]) + // the exclusive right global index of my global range should then be: + // my_kmer_r-1 + d.my_begin = (my_kmer_l == 0) ? 
0 : table[my_kmer_l-1]; + d.my_end = (my_kmer_r == 0) ? 0 : table[my_kmer_r-1]; + d.local_size = d.my_end - d.my_begin; + +#ifndef NDEBUG + d.check_correct(comm); +#endif + t.end_section("repartition: 1D-partition"); + return d; + } +}; +/** + * @brief Distributed Enhanced Suffix Array + * + * @tparam index_t Index type used for SA, RMQ, TL-lookup-table. + */ +template > +struct dist_desa { + /// Distributed Suffix Array & LCP array, and L_c array + suffix_array sa; + + /// LCP iterator type for RMQ + using it_t = typename std::vector::const_iterator; + + /// RMQ of local LCP + rmq minq; + + /// Top-Level lookup table (shared copy) + //lookup_index lt; // shared kmer lookup table + TLI tli; + + /// global size of input and arrays + size_t n; + + // save the string + std::string local_str; + // block distribution of string + mxx::blk_dist str_dist; + + // distribution of subtrees (general distribution of global range) + gen_dist subtree_dist; + + /// type of query results: typeof [l,r) + using range_t = std::pair; + + /// creates an empty DESA + dist_desa(const mxx::comm& c) : sa(c) {} + + /// naive implementation of L_c construciton + // (used only for runtime comparison with the better algorithm) +#if DESA_CONSTR_NAIVE_LC + std::vector local_Lc; + + void naive_construct_Lc() { + size_t local_size = sa.local_SA.size(); + sa.comm.with_subset(sa.local_SA.size() > 0, [&](const mxx::comm& comm) { + // Note: + // LCP[i] = lcp(SA[i-1],SA[i]) + // Lc[i] = S[SA[i-1]+LCP[i]], i=1,...n-1 + index_t prev_SA = mxx::right_shift(sa.local_SA.back(), comm); + + mxx::blk_dist dist(sa.n, comm.size(), comm.rank()); + MXX_ASSERT(dist.local_size() == local_str.size()); + + std::vector counts(comm.size(), 0); + if (comm.rank() > 0) { + if (prev_SA + sa.local_LCP[0] < sa.n) { + counts[dist.rank_of(prev_SA + sa.local_LCP[0])]++; + } + } + for (size_t i = 0; i < local_size; ++i) { + if (sa.local_SA[i-1] + sa.local_LCP[i] < sa.n) { + counts[dist.rank_of(sa.local_SA[i-1] + sa.local_LCP[i])]++; + } + } + std::vector offsets = mxx::local_exscan(counts); + size_t total_count = std::accumulate(counts.begin(), counts.end(), static_cast(0)); + + std::vector charidx(total_count); + // add bucketed requests + if (comm.rank() > 0) { + if (prev_SA + sa.local_LCP[0] < sa.n) { + charidx[offsets[dist.rank_of(prev_SA + sa.local_LCP[0])]++] = prev_SA + sa.local_LCP[0]; + } + } + for (size_t i = 0; i < local_size; ++i) { + if (sa.local_SA[i-1] + sa.local_LCP[i] < sa.n) { + charidx[offsets[dist.rank_of(sa.local_SA[i-1] + sa.local_LCP[i])]++] = sa.local_SA[i-1] + sa.local_LCP[i]; + } + } + + std::vector resp_chars = bulk_rma(local_str.begin(), local_str.end(), charidx, counts, comm); + charidx.clear(); + + offsets = mxx::local_exscan(counts); + + local_Lc.resize(local_size); + + // add bucketed requests + if (comm.rank() > 0) { + if (prev_SA + sa.local_LCP[0] < sa.n) { + local_Lc[0] = resp_chars[offsets[dist.rank_of(prev_SA + sa.local_LCP[0])]++]; + } + } + for (size_t i = 0; i < local_size; ++i) { + if (sa.local_SA[i-1] + sa.local_LCP[i] < sa.n) { + local_Lc[i] = resp_chars[offsets[dist.rank_of(sa.local_SA[i-1] + sa.local_LCP[i])]++]; + } + } + }); + } +#endif + + void repartition(const mxx::comm& comm) { + mxx::section_timer t; + redistr_sa(sa, subtree_dist.my_begin, subtree_dist.my_end, comm); + t.end_section("repartition: redistribute"); + } + + /** + * @brief Constructs the distributed DESA given the block + * distributed string given by [begin,end). 
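 *
 * A minimal usage sketch (illustrative; assumes `comm` is the mxx::comm and
 * `local_str` holds this rank's block of the input; the index type
 * `uint64_t` and the pattern are example choices, not fixed by this class):
 *
 *   dist_desa<uint64_t> idx(comm);
 *   idx.construct(local_str.begin(), local_str.end(), comm);   // collective
 *   auto r = idx.locate_possible(std::string("ACGT"));         // collective
 *   // r is the [l,r) range of suffix array positions that may match the
 *   // pattern (possibly including false positives, hence "possible")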
+ */ + template + void construct(Iterator begin, Iterator end, const mxx::comm& comm) { + mxx::section_timer t; + /// we need a copy of the input string for aligning queries + local_str = std::string(begin, end); // copy input string into data structure + n = mxx::allreduce(local_str.size(), comm); + str_dist = mxx::blk_dist(n, comm.size(), comm.rank()); + t.end_section("desa_construct: cpy str"); + + // create SA/LCP + sa.construct(local_str.begin(), local_str.end()); // move comm from constructor into .construct !? + MXX_ASSERT(sa.local_SA.size() == str_dist.local_size()); + t.end_section("desa_construct: SA/LCP construct"); + + tli.construct(sa, local_str, comm); + t.end_section("desa_construct: TLI construct"); + + subtree_dist = gen_dist::from_prefix_sizes(tli.prefix(), comm); + t.end_section("desa_construct: 1-D partition"); + repartition(comm); + subtree_dist.print_imbalance_stats(comm); + t.end_section("desa_construct: repartition"); + + // TODO: LCP might need some 1 element overlaps on boundaries (or different distribution??) + // construct RMQ on local LCP + if (subtree_dist.local_size > 0) { + minq = rmq(sa.local_LCP.begin(), sa.local_LCP.end()); + } + t.end_section("desa_construct: RMQ construct"); + +#if DESA_CONSTR_NAIVE_LC + naive_construct_Lc(); + t.end_section("desa_construct: Lc naive construct"); +#endif + } + + /// write distributed DESA to files using parallel IO + void write(const std::string& basename, const mxx::comm& comm) { + // writes only the SA, the rest gets re-created from the SA and input string + sa.write(basename); + } + + /// read and initialize the distributed DESA from file + void read(const std::string& string_file, const std::string& basename, const mxx::comm& comm) { + mxx::section_timer t; + // read input string + local_str = mxx::file_block_decompose(string_file.c_str(), comm); + n = mxx::allreduce(local_str.size(), comm); + str_dist = mxx::blk_dist(n, comm.size(), comm.rank()); + t.end_section("desa_read: read str"); + + // read SA + sa.read(basename); + t.end_section("desa_read: read SA"); + + tli.construct(sa, local_str, comm); + t.end_section("desa_read: init q-mer hist"); + + subtree_dist = gen_dist::from_prefix_sizes(tli.prefix(), comm); + repartition(comm); + subtree_dist.print_imbalance_stats(comm); + t.end_section("desa_read: repartition"); + + // construct RMQ(LCP) + if (subtree_dist.local_size > 0) { + minq = rmq(sa.local_LCP.begin(), sa.local_LCP.end()); + } + t.end_section("desa_read: RMQ construct"); + } + + // a ST node is virtually represented by it's interval [l,r] and it's first + // child split point `i1`, where LCP[i1] = minLCP[l..r] is the string + // depths `q` of the node. 
`c` is P[q], the (q+1)th char in P + inline void find_child(size_t& l, size_t& i1, size_t& r, size_t& q, char c) { + assert(l < r); + assert(l <= i1); + assert(i1 <= r); + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->sa.local_Lc[i1]; // == S[SA[l]+lcpv] for first iter + if (lc == c) { + r = i1-1; + break; + } + l = i1; + if (l == r) + break; + + i1 = this->minq(l+1, r); + } while (l < r && sa.local_LCP[i1] == q); + + if (sa.local_LCP[i1] == q) { + if (l+1 < r) { + i1 = this->minq(l+1, r); + } else { + i1 = l; + } + } + q = sa.local_LCP[i1]; + } + +#if 0 + template + inline range_t local_locate_possible(const String& P, size_t l, size_t r) { + assert(my_begin <= l && r < my_end); + size_t m = P.size(); + + // convert to local coords + l -= my_begin; + r -= my_begin; + + if (P.size() > lt.k && l <= r) { + // further narrow down search space + if (l < r) { + size_t i = this->minq(l+1, r); + size_t q = sa.local_LCP[i]; + assert(q >= lt.k); + + // FIXME: the check for l < i shouldn't be necessary, but + // somehow it happens sometimes and leads to error in find_child + while (q < m && l < r && l < i) { + this->find_child(l, i, r, q, P[q]); + } + } + + } + return range_t(l+my_begin, r+1+my_begin); + } +#else + + // manually in-lined version + template + inline range_t local_locate_possible(const String& P, size_t l, size_t r) { + assert(subtree_dist.my_begin <= l && r < subtree_dist.my_end); + size_t m = P.size(); + + // convert to local coords + l -= subtree_dist.my_begin; + r -= subtree_dist.my_begin; + if (P.size() > tli.minmatch() && l <= r) { + // further narrow down search space + if (l < r) { + + size_t i = this->minq(l+1, r); + size_t q = sa.local_LCP[i]; + + while (q < m && l < r && l < i) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // check if we've reached the end of the pattern + if (q >= m) { + break; + } + + char c = P[q]; + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = sa.local_Lc[i]; // == S[SA[l]+lcpv] for first iter + if (lc == c) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + i = this->minq(l+1, r); + } while (l < r && sa.local_LCP[i] == q); + + if (sa.local_LCP[i] == q) { + if (l+1 < r) { + i = this->minq(l+1, r); + } else { + i = l; + } + } + q = sa.local_LCP[i]; + } + } + } + return range_t(l+subtree_dist.my_begin, r+1+subtree_dist.my_begin); + } + +#endif + + template + range_t local_locate_possible(const String& P) { + // only if `P` is on local processor + index_t l, r; + std::tie(l, r) = tli.lookup(P); + if (l == r) { + return range_t(l,l); + } + --r; // convert [l,r) to [l,r] + + return local_locate_possible(P, l, r); + } + + /// execute in parallel (each processor checks the lookup table) and + /// proceeds only if it's the owner of the pattern + template + range_t locate_possible(const String& P) { + // only if `P` is on local processor + index_t l, r; + std::tie(l, r) = tli.lookup(P); + if (l == r) { + return range_t(l,l); + } + --r; // convert [l,r) to [l,r] + + if (P.size() <= tli.minmatch()) { + return range_t(l,r+1); + } + + range_t result; + int owner = -1; + if (subtree_dist.my_begin <= l && r < subtree_dist.my_end) { + result = local_locate_possible(P, l, r); + owner = sa.comm.rank(); + } + owner = mxx::allreduce(owner, mxx::max(), sa.comm); + mxx::bcast(result, owner, sa.comm); + + return result; + } + + std::vector bulk_locate(const strings& ss) { + mxx::section_timer timer(std::cerr, sa.comm); + size_t nstrings = 
ss.nstrings; + std::vector send_data_sizes(sa.comm.size()); // per processor sum of string sizes + + std::vector sprocs(nstrings, -1); + std::vector solutions(nstrings); + + std::vector send_counts(sa.comm.size()); + + for (size_t i = 0; i < ss.nstrings; ++i) { + mystring s; + s.ptr = ss.str_begins[i]; + s.len = ss.str_lens[i]; + range_t res = tli.lookup(s); + // get processor for this range + int sproc = subtree_dist.target_processor(res.first); + + if (s.len > tli.minmatch() && res.second > res.first) { + // this one needs further consideration + if (sproc != sa.comm.rank()) { + assert(res.second <= subtree_dist.my_begin || res.first >= subtree_dist.my_end); + sprocs[i] = sproc; + ++send_counts[sproc]; + } else { + assert(subtree_dist.my_begin <= res.first && res.second <= subtree_dist.my_end); + // can be solved locally (do these while sending?) + + if (res.first < res.second) { + // local query + res = local_locate_possible(s, res.first, res.second-1); + } + solutions[i] = res; + } + } else { + // solution is known + // save soluton, don't send this one + solutions[i] = res; + } + } + + timer.end_section("Phase I: local queries"); + + // communicate the patterns to where they can be answered + strings recv_ss = all2all_strings(ss, sprocs, send_counts, sa.comm); + timer.end_section("Phase I: all2all patterns"); + + std::vector recv_res(recv_ss.nstrings); + // locally continue querying the received strings + for (size_t i = 0; i < recv_ss.nstrings; ++i) { + // need to lookup again, then query + mystring P; + P.ptr = recv_ss.str_begins[i]; + P.len = recv_ss.str_lens[i]; + + range_t res = tli.lookup(P); + assert(subtree_dist.my_begin <= res.first && res.second <= subtree_dist.my_end); + assert(P.len > tli.minmatch()); + + if (res.first < res.second) { + // local query + res = local_locate_possible(P, res.first, res.second-1); + } + recv_res[i] = res; + } + + timer.end_section("Phase II: local_locate_possible"); + + // rule out false positives + // - take first or first few !?, sent to SA[first] for string alignment and checks + // - string is still equally block distributed! + //std::vector sa_idx; + // steps: + // - bucket patterns and results by rank_of(SA[l]) + // - all2all of patterns, results, SA[l], and originating processor? + // - local string compare (might cross boundaries!) [for now assume single boundary?] + // - reverse all all2alls? 
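        // Illustrative example of the bucketing below: with n = 12 and a block
        // distribution over 3 ranks (rank r holds S[4r..4r+3]), a candidate match
        // whose first suffix is SA[l] = 9 is verified on rank str_dist.rank_of(9) == 2,
        // since that rank holds the substring S[8..11] the pattern is compared against.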
+ std::vector rank_sa(recv_ss.nstrings); + std::vector fp_send_counts(sa.comm.size()); + for (size_t i = 0; i < recv_ss.nstrings; ++i) { + size_t stridx = sa.local_SA[recv_res[i].first - subtree_dist.my_begin]; + rank_sa[i] = str_dist.rank_of(stridx); + fp_send_counts[rank_sa[i]]++; + } + std::vector stridxs(recv_ss.nstrings); + std::vector fp_offset = mxx::local_exscan(fp_send_counts); + for (size_t i = 0; i < recv_ss.nstrings; ++i) { + stridxs[fp_offset[rank_sa[i]]++] = sa.local_SA[recv_res[i].first - subtree_dist.my_begin]; + } + + std::vector fp_recv_counts = mxx::all2all(fp_send_counts, sa.comm); + stridxs = mxx::all2allv(stridxs, fp_send_counts, sa.comm); + strings fp_ss = all2all_strings(recv_ss, rank_sa, fp_send_counts, sa.comm); + + timer.end_section("all2all patterns for cmp"); + + + std::vector overlap_sidx; + std::vector overlap_strs; + // strcmp the patterns to the stridxs in the underlying string data + // + std::vector fp_cmp(stridxs.size()); + for (size_t i = 0; i < stridxs.size(); ++i) { + bool match = true; + mystring P; + P.ptr = fp_ss.str_begins[i]; + P.len = fp_ss.str_lens[i]; + // compare P with `stridxs` + size_t cmp_len = std::min(str_dist.iprefix_size() - stridxs[i], P.len); + for (size_t j = 0; j < cmp_len; ++j) { + match = match && (local_str[stridxs[i] - str_dist.eprefix_size()+j] == P.ptr[j]); + } + if (match && cmp_len < P.len) { + overlap_sidx.emplace_back(stridxs[i]); + overlap_strs.emplace_back(i); + } + fp_cmp[i] = match; + } + + timer.end_section("Phase III: local strcmp"); + + + // TODO: right shift those in overlap_xxx and cmp on next processor + + // return results all the way back to originating processor + fp_cmp = mxx::all2allv(fp_cmp, fp_recv_counts, fp_send_counts, sa.comm); + fp_offset = mxx::local_exscan(fp_send_counts); + for (size_t i = 0; i < recv_ss.nstrings; ++i) { + if (!fp_cmp[fp_offset[rank_sa[i]]++]) { + recv_res[i] = range_t(recv_res[i].first, recv_res[i].first); + } + } + + timer.end_section("return P3 results"); + + + /// Phase IV: send shit back to its origin + + // TODO: re-use this from within the all2all_strings + std::vector recv_counts = mxx::all2all(send_counts, sa.comm); + + // send back results + std::vector ret_res = mxx::all2allv(recv_res, recv_counts, sa.comm); + + // iterate through the original strings and add the results in correct place + std::vector offset = mxx::local_exscan(send_counts); + for (size_t i = 0; i < ss.nstrings; ++i) { + if (sprocs[i] >= 0) { + // save received solutions + solutions[i] = ret_res[offset[sprocs[i]]++]; + } + } + + timer.end_section("return P2 results"); + + return solutions; + } +}; + +#endif // DESA_HPP diff --git a/include/dstrings.hpp b/include/dstrings.hpp new file mode 100644 index 0000000..8f736bf --- /dev/null +++ b/include/dstrings.hpp @@ -0,0 +1,284 @@ +/* + * Copyright 2018 Georgia Institute of Technology + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef DSTRINGS_HPP +#define DSTRINGS_HPP + +/** + * This file implements a simple distributed stringset, used mainly for + * loading query-patterns. + * In this representation, strings are split between processors, such + * that each stirng is fully contained within a processor. + */ + + +#include +#include +#include +#include +#include + +#include +#include +#include + +/** + * @brief A simple string representation that does not own its own storage. + * + * Not necessarily '\0' terminated. Just a pointer and length. + * + * Implements some simple string operations, so that it can be used + * interchangably with std::string in some functions. Namely implements: + * + * - size(), length() + * - operator[] + * - data() + */ +struct mystring { + /// pointer to string start + char* ptr; + /// string length + size_t len; + + /// returns the size (length) of the string + size_t size() const { + return len; + } + + /// returns the size (length) of the string + size_t length() const { + return len; + } + + /// returns a reference to the `i`th character + char& operator[](size_t i) const { + return ptr[i]; + } + + /// returns a reference to the `i`th character + char& operator[](size_t i) { + return ptr[i]; + } + + /// returns a pointer to the first character + char* data() { + return ptr; + } + + /// returns a pointer to the first character + const char* data() const { + return ptr; + } +}; + + +/** + * @brief A simple stringset with shared storage. + * + * The data for all stirngs is stored in `data`. + * Each string `i` is represented by a pointer to its start + * `str_begins[i]` and its length `str_lens[i]`. + */ +struct strings { + /// raw data + std::vector data; + /// number of strings + size_t nstrings; + /// pointer to first char for each string + std::vector str_begins; + /// length of each string + std::vector str_lens; + static constexpr char sep = '\n'; + + // TODO: constructors?? + // TODO: iterators through strings? 
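    // Layout example (illustrative): from_vec({"AC", "GGT"}) produces
    //   data       = {'A','C','\n','G','G','T','\0'}  (sep between strings, '\0' at the end)
    //   nstrings   = 2
    //   str_begins = {&data[0], &data[3]}
    //   str_lens   = {2, 3}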
+ + static strings from_vec(const std::vector& v) { + strings ss; + ss.nstrings = v.size(); + size_t total_size = 0; + for (size_t i = 0; i < v.size(); ++i) { + total_size += v[i].size() + 1; + } + ss.data.resize(total_size); + + ss.str_begins.resize(ss.nstrings); + ss.str_lens.resize(ss.nstrings); + + char* out = ss.data.data(); + for (size_t i = 0; i < v.size(); ++i) { + ss.str_begins[i] = out; + ss.str_lens[i] = v[i].size(); + memcpy(out, &v[i][0], v[i].size()); + out += v[i].size(); + if (i+1 < v.size()) { + *(out++) = '\n'; + } else { + *(out++) = '\0'; + } + } + + // replace very last sep with 0 + + return ss; + } + + void parse() { + // parse the strings by seperator into str_begins, and str_lens + // given that the data vector contains the string data + // now parse in mem data + nstrings = 0; + str_begins.clear(); + str_lens.clear(); + + size_t pos = 0; + while (pos < data.size()) { + // skip seperators + while (pos < data.size() && data[pos] == '\n') { + ++pos; + } + size_t str_beg = pos; + while (pos < data.size() && data[pos] != '\n') { + ++pos; + } + if (pos > str_beg && pos < data.size()) { + // save string info + nstrings++; + str_begins.emplace_back(data.data() + str_beg); + str_lens.emplace_back(pos - str_beg); + } + } + } + + static strings from_string(const std::string& s) { + strings ss; + + // copy into ss + ss.data.resize(s.size()+1); + memcpy(ss.data.data(), &s[0], s.size()+1); + ss.parse(); + + return ss; + } + + static strings from_dfile(const std::string& filename, const mxx::comm& comm) { + size_t size = mxx::get_filesize(filename.c_str()); + mxx::blk_dist dist(size, comm.size(), comm.rank()); + + // open file and seek to my pos + std::ifstream t(filename); + t.seekg(dist.eprefix_size()); + // scan for first newline (sep?) 
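        // every rank except rank 0 advances its start to just past the first '\n'
        // at or after its block boundary, so each pattern line is read by exactly
        // one rank; `my_end` is then simply the next rank's start (obtained via
        // mxx::left_shift below)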
+ size_t my_offset = 0; + if (comm.rank() == 0) { + my_offset = 0; + } else { + my_offset = dist.eprefix_size(); + // find first '\n' + char c; + while (t.get(c) && c != '\n') { + ++my_offset; + } + if (my_offset < size) { + ++my_offset; // advance one further + } + } + + size_t my_end = mxx::left_shift(my_offset, comm); + if (comm.rank() + 1 == comm.size()) { + my_end = size; + } + + // create rangebuf + mxx::rangebuf rb(my_offset, my_end-my_offset, t.rdbuf()); + // read file (range) buffer into string stream + std::stringstream sstr; + sstr << &rb; + + std::string local_str(sstr.str()); + + return strings::from_string(local_str); + } +}; + +std::ostream& operator<<(std::ostream& os, const strings& ss) { + os << "{nstrings=" << ss.nstrings << ", nbytes=" << ss.data.size() << ", ["; + for (size_t i = 0; i < ss.nstrings; ++i) { + std::string s(ss.str_begins[i], ss.str_begins[i] + ss.str_lens[i]); + os << "\"" << s << "\""; + if (i+1 < ss.nstrings) + os << ", "; + } + os << "]}"; + return os; +} + +strings all2all_strings(const strings& ss, std::vector& target, std::vector& send_counts, const mxx::comm& comm) { + std::vector offset = mxx::local_exscan(send_counts); + size_t send_num = offset.back() + send_counts.back(); + + // string lengths to send + std::vector send_lens(send_num); // str-lens in bucketed order of strings + std::vector send_data_sizes(comm.size()); // total data size per target proc + for (size_t i = 0; i < ss.nstrings; ++i) { + if (target[i] >= 0 && target[i] != comm.rank()) { + send_lens[offset[target[i]]++] = ss.str_lens[i]; + send_data_sizes[target[i]] += ss.str_lens[i]; + } + } + + std::vector data_offsets = mxx::local_exscan(send_data_sizes); + size_t send_data_size = data_offsets.back() + send_data_sizes.back(); + std::vector send_data(send_data_size); // buffer for sending + // create a "sorted" send buffer + for (size_t i = 0; i < ss.nstrings; ++i) { + if (target[i] >= 0 && target[i] != comm.rank()) { + memcpy(&send_data[data_offsets[target[i]]], ss.str_begins[i], ss.str_lens[i]); + data_offsets[target[i]] += ss.str_lens[i]; + // TODO: just copy this extra char from `ss`? 
+ //send_data[data_offsets[sprocs[i]]] = '\n'; + //data_offsets[sprocs[i]] += 1; + // keep string lengths + //send_lens[lens_offset[target[i]]++] = ss.str_lens[i]; + } + } + + // send/recv the sequence lengths + std::vector recv_counts = mxx::all2all(send_counts, comm); + std::vector recv_lens = mxx::all2allv(send_lens, send_counts, recv_counts, comm); + + // send/recv the string data of the patterns + std::vector recv_data_sizes = mxx::all2all(send_data_sizes, comm); + std::vector recv_data = mxx::all2allv(send_data, send_data_sizes, recv_data_sizes, comm); + + // create `strings` data structure from received data + strings recv_ss; + recv_ss.data.swap(recv_data); // FIXME allocate memory for data + recv_ss.nstrings = std::accumulate(recv_counts.begin(), recv_counts.end(), static_cast(0)); + recv_ss.str_lens = recv_lens; + recv_ss.str_begins.resize(recv_ss.nstrings); + + // init str_begins + size_t beg = 0; + for (size_t i = 0; i < recv_ss.nstrings; ++i) { + recv_ss.str_begins[i] = recv_ss.data.data() + beg; + beg += recv_ss.str_lens[i]; + } + + return recv_ss; +} + +#endif // DSTRINGS_HPP diff --git a/include/kmer.hpp b/include/kmer.hpp index e02150e..55cf2d4 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -125,6 +125,8 @@ void par_for_each_kmer(InputIterator begin, InputIterator end, unsigned int k, c if (kmer_mask == 0) kmer_mask = ~static_cast(0); + MXX_ASSERT(std::distance(begin,end) >= k-1); + // fill first k-mer (until k-1 positions) and send to left processor // filling k-mer with first character = MSBs for lexicographical ordering InputIterator str_it = begin; diff --git a/include/lookup_table.hpp b/include/lookup_table.hpp new file mode 100644 index 0000000..6c522ab --- /dev/null +++ b/include/lookup_table.hpp @@ -0,0 +1,151 @@ +/* + * Copyright 2018 Georgia Institute of Technology + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LOOKUP_TABLE_HPP +#define LOOKUP_TABLE_HPP + +#include +#include + +#include "alphabet.hpp" +#include "kmer.hpp" + +// Top-Level Lookup-Table (small kmer index) + +/** + * @brief Top-Level Lookup-Table + * + * The table is represented as a prefix-sum over the k-mer histogram. + * + * @tparam index_t Index type (unsigned integer). Should be big enough to hold + * the number of total elements indexed. + */ +template +struct lookup_index { + /// prefixsum of k-mer histogram + std::vector table; + /// k + unsigned int k; + /// alphabet + alphabet alpha; + /// output type + using range_t = std::pair; + + /** + * @brief Constructs the lookup table from a k-mer histogram. + * + * @param hist k-mer histogram + * @param k + * @param a alphabet for underlying string. + */ + template + void construct_from_hist(const std::vector& hist, unsigned int k, const alphabet& a) { + this->alpha = a; + this->k = k; + + if (table.size() != hist.size()) { + table.resize(hist.size()); + } + std::partial_sum(hist.begin(), hist.end(), table.begin()); + } + + /** + * @brief Constructs the lookup table from a given string and alphabet. 
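 *
 * For example (illustrative): with a DNA alphabet (sigma = 4, hence 3 bits per
 * character, since one code point is reserved for '\0') and bits = 24, this
 * yields k = 24/3 = 8 and a table with 2^24 entries.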
+ * + * @param begin Iterator the string start. + * @param end Iterator to string end. + * @param bits Used to compute `k` as `k = bits / log(alphabet size)`. + * The table will have size 2^bits. + * @param a The alphabet of the string. + */ + template + void construct(Iterator begin, Iterator end, unsigned int bits, const alphabet& a) { + this->alpha = a; + + unsigned int l = this->alpha.bits_per_char(); + this->k = bits / l; + assert(k >= 1); + + // scan through string to create kmer histogram + table = kmer_hist(begin, end, k, alpha); + + // prefix sum over hist + std::partial_sum(table.begin(), table.end(), table.begin()); + } + + /** + * @brief Constructs the lookup table from a given string. + * + * @param begin Iterator the string start. + * @param end Iterator to string end. + * @param bits Used to compute `k` as `k = bits / log(alphabet size)`. + * The table will have size 2^bits. + */ + template + void construct(Iterator begin, Iterator end, unsigned int bits) { + // get alphabet? + alphabet a = alphabet::from_sequence(begin, end); + this->construct(begin, end, bits, a); + } + + /** + * @brief Queries the lookup table for a given string patter `P`. + * + * Returns the range [l,r) of suffixes for the first `k` characters of `P`. + * + * @param P Input string pattern. + * + * @return Range [l,r) of sorted suffixes where `P[0..k-1]` occurs. + */ + template + range_t lookup(const String& P) const { + unsigned int l = alpha.bits_per_char(); + assert(l*k < sizeof(index_t)*8); + + if (P.size() >= k) { + // create kmer from P + index_t kmer = 0; + for (size_t i = 0; i < k; ++i) { + kmer <<= l; + kmer |= alpha.encode(P[i]); + } + + if (kmer == 0) { + return range_t(0, table[0]); + } else { + return range_t(table[kmer-1],table[kmer]); + } + } else { + // create partial kmer + index_t kmer = 0; + for (size_t i = 0; i < P.size(); ++i) { + kmer <<= l; + kmer |= alpha.encode(P[i]); + } + size_t extra = k - P.size(); + for (size_t i = P.size(); i < k; ++i) { + kmer <<= l; + } + if (kmer == 0) { + return range_t(0, table[kmer + (1 << (extra*l))-1]); + } else { + return range_t(table[kmer-1], table[kmer + (1 << (extra*l))-1]); + } + } + } +}; + +#endif // LOOKUP_TABLE_HPP diff --git a/include/partition.hpp b/include/partition.hpp new file mode 100644 index 0000000..0502969 --- /dev/null +++ b/include/partition.hpp @@ -0,0 +1,115 @@ +/* + * Copyright 2018 Georgia Institute of Technology + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef PARTITION_HPP +#define PARTITION_HPP + +#include + +/** + * @brief Simple recursive 1D-partitioning via bisection. + * + * Partitions bins given by an inclusive prefix sum of bin sizes. + * Recursively partitions index range [l, r) of `tl` onto + * processors (or partitons) [p0,p0+p). + * + * @param tl Number of elements in each bin given as an incl. prefix sum + * (partial_sum) of bin sizes. + * @param l Left index (incl.) [l,r). + * @param r Right index (excl.) [l,r). + * @param p0 First partition index. 
+ * @param p Number of partitions + * @param pstart Returns the assigned bins in pstart[p0],...,pstart[p0+p-1] + * where each `pstart[i]` is the index (number) of the first + * bin assigned to partition `i`. I.e., each element is between + * [0,tl.size()]. + * Partition `i` is assigned bins `pstart[i],..,pstart[i+1]` + * or `pstart[i],...,tl.size()` for the last partition. + */ +template +void rec_partition(const std::vector& tl, size_t l, size_t r, int p0, int p, std::vector& pstart) { + if (p == 1) { + return; + } + + if (l == r) { + for (int i = p0+1; i < p0+p; ++i) { + pstart[i] = l; + } + return; + } + + // p odd: (p+1)/2, (p-1)/2 weighted halfs (extreme case: 2,1) + size_t excl = l == 0 ? 0 : tl[l-1]; + size_t num = tl[r-1] - excl; + + // compute how many elements for each side would be optimal + size_t left_size; + int left_p; + if (p % 2 == 0) { + left_size = num/2; + left_p = p / 2; + } else { + // (p+1)/ 2 to the left, (p-1)/2 to the right + left_size = (num*(p+1))/(2*p); + left_p = (p+1)/2; + } + + // find first index which is larger + auto it = std::lower_bound(tl.begin()+l, tl.begin()+r, excl+left_size); + size_t i = it - tl.begin(); + + // decide whether this or the previous split is better (closer to `left size`) + if (i > 0) { + if (tl[i] - (left_size+excl) > (left_size+excl) - tl[i-1]) { + --i; + } + } + + // left partition is now [l, i], convert to [l, i) + ++i; + + // save the result + pstart[p0+left_p] = i; + + // p even: find i in tl closest to (tl[l]+tl[r])/2 + rec_partition(tl, l, i, p0, left_p, pstart); + rec_partition(tl, i, r, p0 + left_p, p - left_p, pstart); +} + +/** + * @brief Simple recursive 1D-partitioning via bisection. + * + * Partitions bins given by an inclusive prefix sum of bin sizes. + * + * @param tl Number of elements in each bin given as an incl. prefix sum + * (partial_sum) of bin sizes. + * @param p Number of partitions + * @return Returns for each parititon, the assigned bins. + * Each `pstart[i]` is the index (number) of the first + * bin assigned to partition `i`. + * I.e., each element is between [0,tl.size()]. + * Partition `i` is assigned bins `pstart[i],..,pstart[i+1]` + * or `pstart[i],...,tl.size()` for the last partition. 
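 *
 * Worked example (illustrative): for tl = {4, 9, 12, 20} and p = 2, the
 * bisection places the split at the bin boundary closest to half the total
 * weight (10), returning pstart = {0, 2}; partition 0 gets bins 0..1
 * (9 elements) and partition 1 gets bins 2..3 (11 elements).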
+ */ +template +std::vector partition(const std::vector& tl, int p) { + // partition by recursive weighted bisection + std::vector pstarts(p, 0); + rec_partition(tl, 0, tl.size(), 0, p, pstarts); + return pstarts; +} + +#endif // PARTITION_HPP diff --git a/include/seq_query.hpp b/include/seq_query.hpp new file mode 100644 index 0000000..05b6830 --- /dev/null +++ b/include/seq_query.hpp @@ -0,0 +1,908 @@ + +#include +#include +#include +#include + +#include + +// psac stuff: +//#include +#include +#include +#include +#include + +#include + +#define RMQ_USE_SDSL 0 + +#if RMQ_USE_SDSL +#include +#endif + +#ifndef SEQ_QUERY_HPP +#define SEQ_QUERY_HPP + +namespace my { +struct timer { + using clock_type = std::chrono::steady_clock; + using time_point = clock_type::time_point; + using duration = time_point::duration; + + time_point ts; + duration elapsed; + duration total; + + timer() : ts(), elapsed(0), total(0) {} + + inline void tic() { + ts = clock_type::now(); + } + + inline void toc() { + elapsed = clock_type::now() - ts; + total += elapsed; + } + + template + inline typename precision::rep get_time() const { + return std::chrono::duration_cast(elapsed).count(); + } + + template + inline typename precision::rep get_total_time() const { + return std::chrono::duration_cast(total).count(); + } + + inline std::chrono::nanoseconds::rep get_ns() const { + return get_time(); + } + + inline std::chrono::nanoseconds::rep get_total_ns() const { + return get_total_time(); + } +}; +} // namepsace my + +// LCP from SA and string?? + +// libdivsufsort wrapper +//#include + +/// TODO: compare SA, SA+LCP, ESA, and DESA querying +/// (opt) compare against sdsl (although ours requires tons more mem...) + + +// TODO: speedup this comparison function using AVX etc +// compares pattern P lexicographically to S[pos...) and returns: +// <0: P is smaller +// =0: P is equal +// >0: P is larger +inline int cmp_pattern(const std::string& P, const std::string& S, size_t pos) { + if (pos >= S.size()) { + return 1; + } + return strncmp(&P[0], &S[pos], P.size()); +} + +template +inline int cmp_pattern(const std::string& P, Iterator strbeg, size_t n, size_t pos) { + if (pos >= n) { + return 1; + } + return strncmp(&P[0], &strbeg[pos], P.size()); +} + + +// simple binary search for given condition +// assuming for the sequence `data`, all cond(data[i]) == false come before all +// cond(data[i]) == true. This returns the smallest index i such that +// cond(data[i]] == true, and returns data.size() if none exist +template +size_t binary_search(const std::vector& data, Cond cond) { + if (data.empty()) { + return 0; + } + if (cond(data.front())) { + return 0; + } + if (!cond(data.back())) { + return data.size(); + } + + size_t l = 0; + size_t r = data.size(); + + while (l + 1 < r) { + size_t m = (l+r) >> 1; + if (!cond(data[m])) { + l = m; + } else { + r = m; + } + } + return r; +} + + +// O(P log(n)) binary search for pattern using only string and SA +template +std::pair locate_sa(const std::string& S, const std::vector& SA, const std::string& P) { + // binary search using SA and S + assert(S.size() == SA.size()); + + // TODO: can be speed up by first searching until we hit a midpoint for which cmp == 0, then do sepearate lb and ub + index_t lb = binary_search(SA, [&](index_t pos) { return cmp_pattern(P, S, pos) <= 0; }); + index_t ub = binary_search(SA, [&](index_t pos) { return cmp_pattern(P, S, pos) < 0; }); + return std::pair(lb,ub); +} + +// locate using LCP information (M&M use interval tree, we use RMQ !?) 
+// O(P + log(n)) + + +// returns the offset for the first differening character between x and y +// this compares until at most n characters in. if x[0..n) == y[0..n), then +// this returns `n` +int my_stricmp(const char* x, const char* y, size_t n, size_t& len) { + len = 0; + while (len < n && x[len] == y[len]) { + ++len; + } + if (len == n) + return 0; + else if (x[len] < y[len]) + return -1; + else + return 1; +} + +// computes the lcp between the suffix S[pos+offset...) and the pattern P[offset...) +inline size_t lcp_offset(const std::string& P, const std::string& S, size_t pos, size_t offset) { + assert(pos+offset <= S.size()); + size_t max = std::min(S.size() - pos, P.size()); + size_t i = offset; + while (i < max && P[i] == S[pos+i]) { + ++i; + } + return i; +} + +inline size_t lcp_offset(const std::string& P, const char* strbegin, size_t n, size_t pos, size_t offset) { + assert(pos+offset <= n); + size_t max = std::min(n - pos, P.size()); + size_t i = offset; + while (i < max && P[i] == strbegin[pos+i]) { + ++i; + } + return i; +} + +template +size_t lb_rmq(const std::vector& LCP, const RMQ& minq, size_t l, size_t r, index_t q) { + // we have a `q` match at `r` and trying to find the lower bound + // LCP[r] = lcp(SA[r-1],SA[r]) + if (LCP[r] < q) + return r; // `r` is the lower bound + + //typename std::vector::const_iterator b = LCP.begin(); + + while (l + 1 < r) { + size_t mid = (l+r)/2; + index_t x = LCP[minq(mid+1, r)]; + if (x < q) { + l = mid; + } else { + r = mid; + } + } + + return r; +} + +template +size_t ub_rmq(const std::vector& LCP, const RMQ& minq, size_t l, size_t r, index_t q) { + // we have a `q` match at `l` and trying to find the upper bound + // LCP[l+1] = lcp(SA[l],SA[l+1]) + if (LCP[l+1] < q) + return l; // `l` is the upper bound + + + //typename std::vector::const_iterator b = LCP.begin(); + + // one-sided binary search + while (l + 1 < r) { + size_t mid = (l+r)/2; + index_t x = LCP[minq(l+1, mid)]; + if (x < q) { + r = mid; + } else { + l = mid; + } + } + + return l; +} + +template +struct sa_index { + + size_t n; + const char* strbegin; + const char* strend; + std::vector SA; + + my::timer rmqtimer; + + template + void construct(Iterator begin, Iterator end) { + strbegin = &(*begin); + strend = &(*end); + dss::construct(begin, end, SA); + n = SA.size(); + } + + std::pair locate(const std::string& P) { + // TODO: can be speed up by first searching until we hit a midpoint for which cmp == 0, then do sepearate lb and ub + index_t lb = binary_search(SA, [&](index_t pos) { return cmp_pattern(P, strbegin, n, pos) <= 0; }); + index_t ub = binary_search(SA, [&](index_t pos) { return cmp_pattern(P, strbegin, n, pos) < 0; }); + return std::pair(lb,ub); + } +}; + +template +struct salcp_index : public sa_index { + + std::vector LCP; + + template + void construct(Iterator begin, Iterator end) { + sa_index::construct(begin, end); + // construct LCP from SA: + // create ISA needed for constructing LCP + std::vector ISA(this->n); + for (size_t i = 0; i < this->n; ++i) { + ISA[this->SA[i]] = i; + } + std::vector LCP; + lcp_from_sa(std::string(this->strbegin, this->strend), this->SA, ISA, this->LCP); + } +}; + + +// O(sigma*P) using repeated RMQ on LCP [Fischer & Heun 2007] +template +struct esa_index : public salcp_index { + using it_t = typename std::vector::const_iterator; + +#if RMQ_USE_SDSL + sdsl::rmq_succinct_sct<> minq; +#else + rmq minq; +#endif + + template + void construct(Iterator begin, Iterator end) { + salcp_index::construct(begin, end); + // construct RMQ 
ontop of LCP +#if RMQ_USE_SDSL + minq = sdsl::rmq_succinct_sct<>(&this->LCP); +#else + minq = rmq(this->LCP.begin(), this->LCP.end()); +#endif + } + + std::pair locate(const std::string& P) const { + + size_t n = this->n; + size_t m = P.size(); + size_t l = 0; + size_t r = n-1; + + size_t q = 0; // size of current match + + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + + // l, r <- getChild(q, c, l, r): + + // get first child interval and depth + //it_t ii = minq.query(b+l+1, b+r+1); // this may have already been run by the previous step + //size_t i = ii - b; + size_t i = minq(l+1, r); + index_t lcpv = this->LCP[i]; + assert(lcpv >= q); + + // check skipped characters (XXX optional?) + // characters in P[q..lcpv), compare to [l] + bool match = true; + for (size_t j = q; q < lcpv; ++q) { + match &= (P[j] == this->strbegin[this->SA[l]+j]); + } + if (!match) { + return std::pair(l,l); + } + + // check if we've reached the end of the pattern + if (lcpv >= m) { + return std::pair(l, r+1); + } + + char c = P[lcpv]; + do { + // `i` is the lcp(SA[i-1],SA[i]) + char Lc = this->strbegin[this->SA[i-1]+lcpv]; // == S[SA[l]+lcpv] for first iter + //char Rc = S[SA[i]+lcpv]; + if (Lc == c) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + i = minq(l+1, r); + } while (l < r && this->LCP[i] == lcpv); + + if (this->strbegin[this->SA[l]+lcpv] == c) { + // found the interval we were looking for + q = lcpv+1; + } else { + return std::pair(l,l); + } + + } + return std::pair(l, r+1); + } +}; + +// O(P + log(n)) via binary search range queries on LCP +// original M&M does this via pre-computing the RLCP and LLCP queries for +// all possible (l,mid,r) points of the binary search +// +// this version uses binary search on an underlying RMQ +template +struct bs_esa_index : public esa_index { + using it_t = typename std::vector::const_iterator; + + std::pair locate(const std::string& P) const { + size_t n = this->n; + size_t m = P.size(); + size_t l = 0; + size_t r = n-1; + size_t llcp = lcp_offset(P, this->strbegin, n, this->SA[0], 0); + size_t rlcp = lcp_offset(P, this->strbegin, n, this->SA[n-1], 0); + + index_t q = 0; // num of chars matched + + if (llcp < m && P[llcp] < this->strbegin[this->SA[0]+llcp]) { + // first suffix is larger than `P` -> no match + return std::pair(0,0); + } else if (llcp == m) { + // first suffixes matches, just find ub + r = ub_rmq(this->LCP, this->minq, 0, n-1, m); + return std::pair(0,r+1); + } + + if (rlcp < m && P[rlcp] > this->strbegin[this->SA[n-1]+rlcp]) { + // last suffix is smaller than `P` -> no match + return std::pair(n,n); + } else if (rlcp == m) { + // last suffix matches `P` -> find lb + l = lb_rmq(this->LCP, this->minq, 0, n-1, m); + return std::pair(l, n); + } + + + while (l+1 < r) { + + size_t mid = (l+r) / 2; + index_t pos = this->SA[mid]; + + if (llcp >= rlcp) { + // sharing more with left than right + size_t i = this->minq(l+1, mid); + q = this->LCP[i]; // minLCP for suffixes in SA[l..m] + if (q >= llcp) { + q = lcp_offset(P, this->strbegin, n, pos, llcp); + } + } else { + size_t i = this->minq(mid+1, r); // minLCP for suffixes in SA[m..r] + q = this->LCP[i]; + if (q >= rlcp) { + q = lcp_offset(P, this->strbegin, n, pos, rlcp); + } + } + + if (q == m) { + // found _a_ match, now find left and right boundaries + assert(llcp < m && rlcp < m); + + // find ub + r = ub_rmq(this->LCP, this->minq, mid, r, m); + + // find lb + l = 
lb_rmq(this->LCP, this->minq, l, mid, m); + + return std::pair(l, r+1); + + } else if (P[q] <= this->strbegin[pos+q]) { + r = mid; + rlcp = q; + } else { + l = mid; + llcp = q; + } + } + // found no match + size_t lb = r; + return std::pair(lb, lb); + } +}; + +template +struct desa_index : public esa_index { + using it_t = typename esa_index::it_t; + // adds Lc array of characters + std::vector Lc; + + template + void construct(Iterator begin, Iterator end) { + esa_index::construct(begin, end); + + // NOTE: + // LCP[i] = lcp(SA[i-1],SA[i]) + // differing chars (needed for decision) + // left: S[SA[i-1]+LCP[i]] (need only one of these) + // right: S[SA[i] +LCP[i]] + // construct char array Lc[i] = S[SA[i-1]+LCP[i]], i=1,...n-1 + Lc.resize(this->n); + for (size_t i = 1; i < this->n; ++i) { + Lc[i] = this->strbegin[this->SA[i-1]+this->LCP[i]]; + } + } + + // a ST node is virtually represented by it's interval [l,r] and it's first + // child split point `i1`, where LCP[i1] = minLCP[l..r] is the string + // depths `q` of the node. `c` is P[q], the (q+1)th char in P + inline void find_child(size_t& l, size_t& i1, size_t& r, size_t& q, char c) const { + assert(l < r); + assert(l <= i1); + assert(i1 <= r); + do { + // `i` is the lcp(sa[i-1],sa[i]) + char lc = this->lc[i1]; // == s[sa[l]+lcpv] for first iter + if (lc == c) { + r = i1-1; + break; + } + l = i1; + if (l == r) + break; + + this->rmqtimer.tic(); + i1 = this->minq(l+1, r); + this->rmqtimer.toc(); + } while (l < r && this->lcp[i1] == q); + + if (this->lcp[i1] == q) { + if (l+1 < r) { + this->rmqtimer.tic(); + i1 = this->minq(l+1, r); + this->rmqtimer.toc(); + } else { + i1 = l; + } + } + q = this->lcp[i1]; + } + + /* + std::pair locate_possible(const std::string& P) { + size_t n = this->n; + size_t m = P.size(); + + size_t l = 0; + size_t r = n-1; + //size_t q = 0; // size of current match + + if (l == r) { + return std::pair(l, r+1); + } + + + this->rmqtimer.tic(); + size_t i = this->minq(l+1, r); + this->rmqtimer.toc(); + size_t q = this->LCP[i]; + + while (q < m && l < r) { + find_child(l, i, r, q, P[q]); + } + + return std::pair(l, r+1); + } + + std::pair locate(const std::string& P) { + std::pair res = locate_possible(P); + // check if pattern matches + if (res.first < res.second) { + int cmp = strncmp(&this->strbegin[this->SA[res.first]], &P[0], P.size()); + if (cmp == 0) { + return res; + } else { + return std::pair(res.first, res.first); + } + } + return res; + } + */ + + /* + std::pair locate(const std::string& P) { + + size_t n = this->n; + size_t m = P.size(); + size_t l = 0; + size_t r = n-1; + + size_t q = 0; // size of current match + + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // get first child interval and depth + size_t i = this->minq(l+1, r); + index_t lcpv = this->LCP[i]; + assert(lcpv >= q); + + // check if we've reached the end of the pattern + if (lcpv >= m) { + break; + } + + char c = P[lcpv]; + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->Lc[i]; // == S[SA[l]+lcpv] for first iter + if (lc == c) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + + i = this->minq(l+1, r); + } while (l < r && this->LCP[i] == lcpv); + + if (this->strbegin[this->SA[l]+lcpv] == c) { + // found the interval we were looking for + q = lcpv+1; + } else { + return std::pair(l,l); + } + } + // check if pattern matches + if (l <= r) { + int cmp = strncmp(&this->strbegin[this->SA[l]], &P[0], 
P.size()); + if (cmp != 0) { + return std::pair(l, l); + } + } + return std::pair(l, r+1); + } + */ + + + template + inline std::pair locate_possible(const String& P) const { + + size_t n = this->n; + size_t m = P.size(); + size_t l = 0; + size_t r = n-1; + + // get first child interval and depth + size_t i = this->minq(l+1, r); + index_t q = this->LCP[i]; + + // blind search + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // check if we've reached the end of the pattern + if (q >= m) { + break; + } + + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->Lc[i]; // == S[SA[l]+lcpv] for first iter + if (lc == P[q]) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + + i = this->minq(l+1, r); + } while (l < r && this->LCP[i] == q); + + if (this->LCP[i] == q) { + if (l+1 < r) { + i = this->minq(l+1, r); + } else { + i = l; + } + } + q = this->LCP[i]; + } + return std::pair(l, r+1); + } + + + std::pair locate(const std::string& P) const { + + size_t n = this->n; + size_t m = P.size(); + size_t l = 0; + size_t r = n-1; + + // get first child interval and depth + size_t i = this->minq(l+1, r); + index_t q = this->LCP[i]; + + // blind search + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // check if we've reached the end of the pattern + if (q >= m) { + break; + } + + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->Lc[i]; // == S[SA[l]+lcpv] for first iter + if (lc == P[q]) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + + i = this->minq(l+1, r); + } while (l < r && this->LCP[i] == q); + + if (this->LCP[i] == q) { + if (l+1 < r) { + i = this->minq(l+1, r); + } else { + i = l; + } + } + q = this->LCP[i]; + } + + // check if pattern matches the string + if (l <= r) { + int cmp = strncmp(&this->strbegin[this->SA[l]], &P[0], P.size()); + if (cmp != 0) { + return std::pair(l, l); + } + } + return std::pair(l, r+1); + } +}; + + +template +struct lookup_desa_index : public desa_index { + lookup_index tl; + + template + void construct(Iterator begin, Iterator end) { + desa_index::construct(begin, end); + tl.construct(begin, end, 16); // automatically size `k` given table size and use alphabet size + } + + std::pair locate(const std::string& P) const { + size_t m = P.size(); + + index_t l, r; + std::tie(l, r) = tl.lookup(P); + if (l == r) { + return std::pair(l,l); + } + --r; // convert [l,r) to [l,r] + + if (P.size() > tl.k && l <= r) { + // further narrow down search space + if (l < r) { + + size_t i = this->minq(l+1, r); + index_t q = this->LCP[i]; + assert(q >= tl.k); + + // blind search + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // check if we've reached the end of the pattern + if (q >= m) { + break; + } + + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->Lc[i]; // == S[SA[l]+q] for first iter + if (lc == P[q]) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + i = this->minq(l+1, r); + } while (l < r && this->LCP[i] == q); + + if (this->LCP[i] == q) { + if (l+1 < r) { + i = this->minq(l+1, r); + } else { + i = l; + } + } + q = this->LCP[i]; + } + } + + // check if pattern matches + if (l <= r) { + int cmp = strncmp(&this->strbegin[this->SA[l]], &P[0], P.size()); + if (cmp == 
0) { + return std::pair(l, r+1); + } else { + // no match + return std::pair(l, l); + } + } + } + return std::pair(l, r+1); + } + /* + std::pair locate(const std::string& P) { + size_t n = this->n; + size_t m = P.size(); + + index_t l, r; + std::tie(l, r) = tl.lookup(P); + if (l == r) { + return std::pair(l,l); + } + --r; // convert [l,r) to [l,r] + + if (P.size() > tl.k && l <= r) { + // further narrow down search space + if (l < r) { + + size_t i = this->minq(l+1, r); + size_t q = this->LCP[i]; + assert(q >= tl.k); + + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // get first child interval and depth + size_t i = this->minq(l+1, r); + index_t lcpv = this->LCP[i]; + assert(lcpv >= q); + + // check if we've reached the end of the pattern + if (lcpv >= m) { + break; + } + + char c = P[lcpv]; + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->Lc[i]; // == S[SA[l]+lcpv] for first iter + if (lc == c) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + i = this->minq(l+1, r); + } while (l < r && this->LCP[i] == lcpv); + + if (this->strbegin[this->SA[l]+lcpv] == c) { + // found the interval we were looking for + q = lcpv+1; + } else { + return std::pair(l,l); + } + } + } + + // check if pattern matches + if (l <= r) { + int cmp = strncmp(&this->strbegin[this->SA[l]], &P[0], P.size()); + if (cmp == 0) { + return std::pair(l, r+1); + } else { + // no match + return std::pair(l, l); + } + } + } + return std::pair(l, r+1); + } + */ + /* + std::pair locate(const std::string& P) { + size_t m = P.size(); + + index_t l, r; + std::tie(l, r) = tl.lookup(P); + if (l == r) { + return std::pair(l,l); + } + --r; // convert [l,r) to [l,r] + + if (P.size() > tl.k && l <= r) { + // further narrow down search space + if (l < r) { + this->rmqtimer.tic(); + size_t i = this->minq(l+1, r); + this->rmqtimer.toc(); + size_t q = this->LCP[i]; + assert(q >= tl.k); + + while (q < m && l < r) { + this->find_child(l, i, r, q, P[q]); + } + } + + // check if pattern matches + if (l <= r) { + int cmp = strncmp(&this->strbegin[this->SA[l]], &P[0], P.size()); + if (cmp == 0) { + return std::pair(l, r+1); + } else { + // no match + return std::pair(l, l); + } + } + } + return std::pair(l, r+1); + } + */ +}; + + + +#endif // SEQ_QUERY_HPP diff --git a/include/suffix_array.hpp b/include/suffix_array.hpp index 2e9869a..64dfb2c 100644 --- a/include/suffix_array.hpp +++ b/include/suffix_array.hpp @@ -139,17 +139,29 @@ void write_dist_int_array(const std::string& basename, const std::vector& vec template std::vector read_dist_int_array(const std::string& filename, const mxx::comm& comm) { static_assert(std::is_integral::value && !std::is_same::value, "This function works only for integral types excluding bool"); - // TODO: make sure the file ending matches the type size? 
- mxx::coll_file f(filename, comm); - f.open(MPI_MODE_RDONLY); - - size_t nbytes = f.get_size(); - size_t n = nbytes / sizeof(T); + std::ifstream f(filename, std::ios::binary | std::ios::ate); + size_t n = f.tellg() / sizeof(T); mxx::blk_dist d(n, comm); + // Use C++ fileIO if local size is larger than INT_MAX + // because some MPI implementations will silently fail to load the input + // file otherwise std::vector data(d.local_size()); - f.read_ordered(d.local_size(), data.data()); + if ((n + comm.size())*sizeof(T) / comm.size() >= std::numeric_limits::max()) { + if (comm.rank() == 0) { + std::cerr << "[WARNING] local size is larger than INT_MAX, using C++ file IO instead of MPI IO" << std::endl; + } + f.seekg(d.eprefix_size(), std::ios::beg); + f.read((char*)data.data(), d.local_size()*sizeof(T)); + } else { + // use MPI File IO + f.close(); + mxx::coll_file f2(filename, comm); + f2.open(MPI_MODE_RDONLY); + f2.read_ordered(d.local_size(), data.data()); + } + return data; } @@ -225,6 +237,8 @@ void write(const std::string& basename) { if (_CONSTRUCT_LC) { write_dist_int_array(basename + ".lc", local_Lc, comm); } + + alpha.write(basename + ".alpha", comm); } // load from file using parallel IO @@ -244,6 +258,8 @@ void read(const std::string& basename) { } } + alpha.read(basename + ".alpha", comm); + // initialize the rest init_size(local_SA.size()); } diff --git a/include/tldt.hpp b/include/tldt.hpp new file mode 100644 index 0000000..05a2cff --- /dev/null +++ b/include/tldt.hpp @@ -0,0 +1,473 @@ +/* + * Copyright 2018 Georgia Institute of Technology + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TLDT_HPP +#define TLDT_HPP + +#include + +#include +#include +#include + +#include +#include + +#include +#include + + +template +std::vector sample_lcp(const std::vector& LCP, index_t maxsize) { + + index_t n = LCP.size(); + + struct node { + index_t lcp; // == LCP[pos] + index_t pos; // index of node + index_t l; // index of left parent + }; + + std::stack st; + st.push(node{0,0,0}); + + index_t total_out = 1; + std::vector do_output(n, false); + do_output[0] = true; + + for (index_t i = 1; i < n; ++i) { + + while (!st.empty() && st.top().lcp > LCP[i]) { + // new node is smaller LCP + // -> "complete" all nodes which are on the stack + node& u = st.top(); + + // u.pos has range [u.l, .. , i) + index_t parent_size = i - u.l; + if (parent_size > maxsize) { + // output but in inverse order !? 
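// i.e., node u is sampled exactly when its parent interval [u.l, i) spans more than
// maxsize positions; together with the check in seq_check_sample below, this keeps
// consecutive sampled indices at most maxsize apart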
+ do_output[u.pos] = true; + ++total_out; + } + st.pop(); + } + + if (st.empty()) { + // cant happen, because always: LCP[0] = 0 + assert(false); + } else if (st.top().lcp == LCP[i]) { + st.push(node{LCP[i], i, st.top().l}); + if (LCP[i] == 0) { + do_output[i] = true; + ++total_out; + } + } else { + assert(st.top().lcp < LCP[i]); + st.push(node{LCP[i], i, st.top().pos}); + } + } + // virtually, there is a 0 at the very end of the LCP array -> output them all + while (!st.empty() && st.top().lcp > 0) { + // new node is smaller LCP + // -> "complete" all nodes which are on the stack + node& u = st.top(); + + // u.pos has range [u.l, .. , i) + index_t parent_size = n - u.l; + if (parent_size > maxsize) { + do_output[u.pos] = true; + ++total_out; + } + st.pop(); + } + + // TODO: output everything still in the queue? + + std::vector idx(total_out); + index_t j = 0; + for (index_t i = 0; i < n; ++i) { + if (do_output[i]) { + idx[j] = i; + ++j; + } + } + return idx; +} + + +template +std::vector sample_lcp_indirect(const std::vector& LCP, index_t maxsize) { + assert(LCP[0] == 0); + + std::vector samples; // all indices for output, eventually the result + samples.push_back(0); + size_t n = LCP.size(); + + std::vector st; // keep track of where each sequence starts? best to use indces into `samples`!? + //st.push(0); // not necessary, 0 never gets "popped" + + //lcp_t minval = LCP[1]; + + for (size_t i = 1; i < LCP.size(); ++i) { + index_t l = LCP[i]; + index_t topl = LCP[samples.back()]; + + while (!st.empty() && topl > l) { + // l is rightmatch for topl and all its equals, their left match is in stack + index_t lsidx = st.back(); // index of parent in `samples` + index_t lnsv = samples[lsidx]; // index of parent in `LCP` + st.pop_back(); + + index_t parent_size = i - lnsv; + if (parent_size > maxsize) { + // here i can output everything on the stack and quit the while loop + // ie, everything in samples is safe, also `i` necessarily is safe and everything smaller to come after! + // so i can push i, keep the stack empty. empty stack means: everything larger than samples.back() gets pushed to both, l == samples.back() -> nothing to stack, push samples. l < samples.back() -> output and safe, ie. not onto stack?? + // stack contains parent of those elements which may be removed at a later time + //st = std::stack(); + st.clear(); + break; + } else { + // elements are not output, but skipped + // [x y z L L L] + // ^ + // 2 6, resize to 3 + samples.resize(lsidx + 1); + } + + topl = LCP[samples.back()]; + // if i clear off the entire stack without outputting anything, what happens? + } + + if (topl < l) { // only if new larger sequence do we push the stack + st.push_back(samples.size()-1); + samples.push_back(i); + } else if (topl == l) { + samples.push_back(i); + } else { + assert(st.empty()); + samples.push_back(i); + } + } + + // at the end we assume we get a 0 element. 
we go through the remaining stack + // and remove everything that isn't large enough until the distance becomes big enough + index_t topl = LCP[samples.back()]; + while (!st.empty() && topl > 0) { + index_t lsidx = st.back(); // index of parent in `samples` + index_t lnsv = samples[lsidx]; // index of parent in `LCP` + st.pop_back(); + + index_t parent_size = n - lnsv; + if (parent_size > maxsize) { + break; + } else { + samples.resize(lsidx + 1); + } + } + + return samples; +} + +template +std::vector send_vec_left(const std::vector& vec, const mxx::comm& c) { + size_t send_size = vec.size(); + size_t recv_size = 0; + mxx::datatype dt = mxx::get_datatype(); + mxx::datatype size_dt = mxx::get_datatype(); + int dst = c.rank() > 0 ? c.rank() - 1 : MPI_PROC_NULL; + int src = c.rank() + 1 < c.size() ? c.rank() + 1 : MPI_PROC_NULL; + MPI_Sendrecv(&send_size, 1, size_dt.type(), dst, 0, + &recv_size, 1, size_dt.type(), src, 0, c, MPI_STATUS_IGNORE); + + std::vector res(recv_size); + MPI_Sendrecv(const_cast(vec.data()), send_size, dt.type(), dst, 0, + res.data(), recv_size, dt.type(), src, 0, c, MPI_STATUS_IGNORE); + + return res; +} + +// if this is not included as part of google test, define our own assert functions! +#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ +#define ASSERT_TRUE(x) {if (!(x)) { std::cerr << "[ERROR]: Assertion failed in " __FILE__ ":" << __LINE__ << std::endl;return ;}} std::cerr << "" +#define ASSERT_EQ(x, y) ASSERT_TRUE((x) == (y)) +#define ASSERT_GT(x, y) ASSERT_TRUE((x) > (y)) +#define ASSERT_LT(x, y) ASSERT_TRUE((x) < (y)) +#endif + +#define PV(x, c) mxx::sync_cout(c) << "[" << c.rank() << "] " << "" # x " = " << x << std::endl; + +template +void seq_check_sample(const std::vector& LCP, const std::vector& off, index_t maxsize) { + size_t n = LCP.size(); + // create sampling + std::vector lcp(off.size()); + for (size_t i = 0; i < off.size(); ++i) { + ASSERT_TRUE(0 <= off[i] && off[i] < n); + lcp[i] = LCP[off[i]]; + } + // check correctness + // (EXPECTED, ACTUAL) + ASSERT_TRUE(off.size() > 0); + ASSERT_EQ(0u, off[0]); + + for (size_t i = 1; i < off.size(); ++i) { + // everything between off[i-1], .., off[i] should have LCP > than those two + ASSERT_TRUE(off[i] - off[i-1] <= maxsize); // TODO: +-1 errors? + size_t minlcp = std::max(LCP[off[i-1]], LCP[off[i]]); + for (size_t j = off[i-1]+1; j < off[i]; ++j) { + ASSERT_TRUE(0 <= j && j < n); + ASSERT_TRUE(LCP[j] > minlcp); // << "LCP values skipped must be smaller than the larger of the two surrounding ones"; + } + } + + // so far: + // this tests the necessary conditions: intervals are valid, and not larger than the max + + // next: + // need to also check they are the largest not exceeding the max, + // ie. 
check that parent ranges of everything selected exceed the max + + std::stack st; + st.push(0); + + for (size_t i = 0; i < off.size(); ++i) { + while (lcp[st.top()] > lcp[i]) { + // skip all equal to find parent + size_t l = lcp[st.top()]; + while (lcp[st.top()] == l) { + st.pop(); + } + + // parent range should be larger than max size + ASSERT_TRUE(off[i] - off[st.top()] > maxsize); + } + st.push(i); + } + + + while (lcp[st.top()] > 0) { + // skip all equal to find parent + size_t l = lcp[st.top()]; + while (lcp[st.top()] == l) { + st.pop(); + } + + // parent range should be larger than max size + ASSERT_TRUE(n - off[st.top()] > maxsize); + } + +} + +// distributed version for sampling the LCP array +// -> used for TLDT construction +template +std::vector sample_lcp_distr(const std::vector& local_LCP, index_t maxsize, const mxx::comm& comm) { + + // set up global sizes + size_t n = mxx::allreduce(local_LCP.size(), comm); + mxx::blk_dist dist(n, comm); + size_t prefix = dist.eprefix_size(); + + std::vector samples; // all indices for output, eventually the result + samples.push_back(prefix); + + std::vector st; // stack of missing right matches, index into samples + + std::vector left_st; // index into `samples`, stack of missing left matches + left_st.push_back(0); + + // minimum seen so far + lcp_t minval = local_LCP[0]; + + for (size_t i = 1; i < local_LCP.size(); ++i) { + index_t l = local_LCP[i]; + index_t topl = local_LCP[samples.back()-prefix]; + + while (!st.empty() && topl > l) { + // l is rightmatch for topl and all its equals, their left match is in stack + index_t lsidx = st.back(); // index of parent in `samples` + index_t lnsv = samples[lsidx]; // index of parent in `LCP` + st.pop_back(); + + index_t parent_size = i+prefix - lnsv; + if (parent_size > maxsize) { + // here i can output everything on the stack and quit the while loop + // ie, everything in samples is safe, also `i` necessarily is safe and everything smaller to come after! + // so i can push i, keep the stack empty. empty stack means: everything larger than samples.back() gets pushed to both, l == samples.back() -> nothing to stack, push samples. l < samples.back() -> output and safe, ie. not onto stack?? 
+ // stack contains parent of those elements which may be removed at a later time + st.clear(); + break; + } else { + // elements are not output, but skipped + samples.resize(lsidx + 1); + } + + topl = local_LCP[samples.back()-prefix]; + } + + if (topl < l) { // only if new larger sequence do we push the stack + st.push_back(samples.size()-1); // st always points toward right-most equal + samples.push_back(i+prefix); + } else if (topl == l) { + samples.push_back(i+prefix); + } else { + assert(st.empty()); + samples.push_back(i+prefix); + if (l < minval) { + minval = l; + if (samples[left_st.back()]-prefix <= maxsize) { + left_st.push_back(samples.size()-1); + } else { + left_st.back() = samples.size() - 1; + } + } + } + } + + using pair_t = std::pair; + std::vector left(left_st.size()); + for (size_t i = 0; i < left_st.size(); ++i) { + left[i].second = samples[left_st[i]]; + left[i].first = local_LCP[samples[left_st[i]]-prefix]; + } + + std::vector right = send_vec_left(left, comm); + if (comm.rank() == comm.size()-1) { + right.emplace_back(0, n); + } + + // use `right` to solve remaining elements in stack `st` + index_t first_out = right.size()-1; + for (size_t i = 0; i < right.size(); ++i) { + lcp_t l = right[i].first; + index_t topl = local_LCP[samples.back()-prefix]; + while (!st.empty() && topl > l) { // right[i] is right match for samples.back() + index_t lsidx = st.back(); // index of left parent in `samples` + index_t lnsv = samples[lsidx]; // index of left parent in `LCP` + st.pop_back(); + + index_t parent_size = right[i].second - lnsv; + if (parent_size > maxsize) { + // here, I usually clear the stack, all remain! + // thus right[i] as a parent is also a valid output! + st.clear(); + break; + } else { + // remove from output + samples.resize(lsidx + 1); + } + topl = local_LCP[samples.back()-prefix]; + } + + + // what is right[i]'s parent range? + // + // 3 cases: + if (topl < right[i].first) { + // topl < right[i] -> samples.back() is lnsv for right[i] + if (i + 1 < right.size() && right[i+1].second - samples.back() > maxsize) { + first_out = i; + break; + } + } else if (topl == l) { + // parent range is larger + if (i + 1 < right.size() && right[i+1].second - samples[st.back()] > maxsize) { + first_out = i; + break; + } + } else if (st.empty()) { + // empty and topl > right[i] => def output `i` + first_out = i; + break; + } + } + + // send back start point! 
+ index_t leftstart = mxx::right_shift(first_out, comm); + + // remove elements from front if they have a parent range smaller than maxsize + if (comm.rank() > 0 && leftstart > 0) { + std::vector tmp; + tmp.swap(samples); + samples = std::vector(tmp.begin() + left_st[leftstart], tmp.end()); + } + + return samples; +} + +template +struct tldt { + desa_index idx; // uses a sequential DESA underneath + + using range_t = std::pair; + + std::vector offsets; + size_t n; + + template + void construct(const suffix_array& sa, const std::string& local_str, const mxx::comm& comm) { + // sample to 100 + n = sa.n; + size_t prefix = sa.part.eprefix_size(); + index_t maxsize = sa.n / comm.size() / 128; + if (maxsize < 2) + maxsize = 2; // smallest parent interval + std::vector local_off = sample_lcp_distr(sa.local_LCP, maxsize, comm); + + // sample LCP and Lc at local_off + std::vector local_sample_lcp(local_off.size()); + std::vector local_sample_lc(local_off.size()); + + for (size_t i = 0; i < local_off.size(); ++i) { + local_sample_lcp[i] = sa.local_LCP[local_off[i] - prefix]; + local_sample_lc[i] = sa.local_Lc[local_off[i] - prefix]; + } + + // allgather offsets and sampled lcp and Lc + offsets = mxx::allgatherv(local_off, comm); + idx.n = offsets.size(); + idx.LCP = mxx::allgatherv(local_sample_lcp, comm); + idx.Lc = mxx::allgatherv(local_sample_lc, comm); + + // construct rmq + idx.minq = rmq::const_iterator,index_t>(idx.LCP.begin(), idx.LCP.end()); + } + + inline index_t minmatch() const { + return 1; + } + + // for partitioning + std::vector prefix() const { + // return inclusive prefix sum of bin size + // (offsets is already the exclusive prefix sum, thus just need to shift by one) + std::vector ps(offsets.size()); + for (size_t i = 0; i < offsets.size()-1; ++i) { + ps[i] = offsets[i+1]; + } + ps.back() = n; + return ps; + } + + template + inline range_t lookup(const String& P) const { + range_t r = idx.locate_possible(P); + return range_t(offsets[r.first], r.second == offsets.size() ? 
n : offsets[r.second]); + } +}; + +#endif // TLDT_HPP diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f9c8705..52323ea 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,24 +1,25 @@ -cmake_minimum_required(VERSION 2.6) +project(psac-src) -# project settings -project(psac-main) +# for tclap command line parsing header library +include_directories(../ext/) -################# -# shared libs # -################# -##################### -# main executable # -##################### +###################### +# main executables # +###################### # our main executable for suffix array construction add_executable(psac psac.cpp) -target_link_libraries(psac ${EXTRA_LIBS} rt) +target_link_libraries(psac psac-dss-lib) add_executable(gsac gsac.cpp) -target_link_libraries(gsac ${EXTRA_LIBS} rt) -target_link_libraries(gsac divsufsort) -target_link_libraries(gsac divsufsort64) +target_link_libraries(gsac psac-dss-lib) + +add_executable(desa-main desa_main.cpp) +target_link_libraries(desa-main psac-dss-lib) + +add_executable(test-slcp test_slcp.cpp) +target_link_libraries(test-slcp psac-dss-lib) ################ # benchmarks # @@ -26,34 +27,38 @@ target_link_libraries(gsac divsufsort64) # benchmarks of our different internal methods add_executable(benchmark_sac benchmark.cpp) -target_link_libraries(benchmark_sac ${EXTRA_LIBS} rt) +target_link_libraries(benchmark_sac psaclib) # benchmark with different values of `k` for initial k-mer sorting add_executable(benchmark_k benchmark_k.cpp) -target_link_libraries(benchmark_k ${EXTRA_LIBS} rt) +target_link_libraries(benchmark_k psaclib) # benchmark ANSV add_executable(benchmark-ansv benchmark_ansv.cpp) -target_link_libraries(benchmark-ansv ${EXTRA_LIBS} rt) +target_link_libraries(benchmark-ansv psaclib) + ################ # tools/utils # ################ + add_executable(print64 print64.cpp) +add_executable(mkpattern mkpattern.cpp) + +add_executable(kmer-stats kmer_partition.cpp) +target_link_libraries(kmer-stats psac-dss-lib) + + ################ # divsufsort # ################ # divsufsort executable (supporting 32 and 64 bits) add_executable(dss dss.cpp) -target_link_libraries(dss ${EXTRA_LIBS}) -target_link_libraries(dss divsufsort) -target_link_libraries(dss divsufsort64) +target_link_libraries(dss psac-dss-lib) # compare our algorithm against divsufsort and check correctness add_executable(psac-vs-dss psac_vs_dss.cpp) -target_link_libraries(psac-vs-dss ${EXTRA_LIBS} rt) -target_link_libraries(psac-vs-dss divsufsort) -target_link_libraries(psac-vs-dss divsufsort64) +target_link_libraries(psac-vs-dss psac-dss-lib) diff --git a/src/desa_main.cpp b/src/desa_main.cpp new file mode 100644 index 0000000..99cc830 --- /dev/null +++ b/src/desa_main.cpp @@ -0,0 +1,117 @@ + +#include +#include +#include +#include +#include + +// using TCLAP for command line parsing +#include + +// mxx dependencies +#include +#include +#include +#include + +#include +#include +#include + +#include "seq_query.hpp" +#include "desa.hpp" +#include "tldt.hpp" + +// define index and TLI types for experiment +using index_t = uint64_t; +using TLI_t = tldt; +//using TLI_t = tllt; +using desa_t = dist_desa; + +int main(int argc, char *argv[]) { + // set up mxx / MPI + mxx::env e(argc, argv); + mxx::env::set_exception_on_error(); + mxx::comm c; + mxx::print_node_distribution(c); + + // given a file, compute suffix array and lcp array, how to do count/locate query? 
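    // Example invocations (illustrative; file names are placeholders, the flags are
    // the TCLAP options defined below):
    //   mpiexec -np 4 ./desa-main -f input.txt --construct -o idx_base
    //   mpiexec -np 4 ./desa-main -f input.txt --load idx_base -q patterns.txt
    // where patterns.txt holds one pattern per line, e.g. as produced by the mkpattern tool.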
+ // via sa_range() + /* + if (argc < 3) { + std::cerr << "Usage: ./xx " << std::endl; + } + */ + try { + // define commandline usage + TCLAP::CmdLine cmd("Distributed Enhanced Suffix Array"); + TCLAP::ValueArg fileArg("f", "file", "Input string filename.", true, "", "filename"); + cmd.add(fileArg); + TCLAP::ValueArg loadArg("l", "load", "Load index from given basename", false, "", "filename"); + TCLAP::SwitchArg constructArg("c", "construct", "Construct SA/LCP/Lc from input file", false); + cmd.xorAdd(loadArg, constructArg); // either load or construct SA/LCP + TCLAP::ValueArg outArg("o", "outfile", "Output file base name. If --construct was used, this stores the resulting DESA.", false, "", "filename"); + cmd.add(outArg); + + TCLAP::ValueArg queryArg("q", "query", "Query file for benchmarking querying.", false, "", "filename"); + cmd.add(queryArg); + cmd.parse(argc, argv); + + + mxx::section_timer t; + + // create distributed DESA class + using range_t = desa_t::range_t; + desa_t idx(c); + + if (constructArg.getValue()) { + if (c.rank() == 0) { + std::cout << "constructing DESA (SA+LCP+LC)..." << std::endl; + } + // read input file into in-memory string + std::string input_str = mxx::file_block_decompose(fileArg.getValue().c_str(), c); + t.end_section("read input file"); + // construct DESA from scratch + idx.construct(input_str.begin(), input_str.end(), c); + t.end_section("construct idx"); + + if (outArg.getValue() != "") { + // store DESA to given basename + if (c.rank() == 0) { + std::cout << "saving DESA to basename `" << outArg.getValue() << "` ..." << std::endl; + } + idx.write(outArg.getValue(), c); + } + } else { + if (outArg.getValue() != "") { + if (c.rank() == 0) { + std::cerr << "WARNING: --outfile argument will be ignored since the input is loaded from file (don't use in conjuction with --load)." << std::endl; + } + } + if (c.rank() == 0) { + std::cout << "loading DESA (SA+LCP+LC) from basename `" << loadArg.getValue() << "` ..." << std::endl; + } + idx.read(fileArg.getValue(), loadArg.getValue(), c); + + } + + // query benchmarking? + if (queryArg.getValue() != "") { + strings ss = strings::from_dfile(queryArg.getValue(), c); + t.end_section("read patterns file"); + + // run locate a couple of times + int reps = 10; + for (int i = 0; i < reps; ++i) { + std::vector mysols = idx.bulk_locate(ss); + t.end_section("bulk_locate"); + } + } + + } catch (TCLAP::ArgException& e) { + std::cerr << "error: " << e.error() << " for arg " << e.argId() << std::endl; + exit(EXIT_FAILURE); + } + + return 0; +} diff --git a/src/kmer_partition.cpp b/src/kmer_partition.cpp new file mode 100644 index 0000000..611687a --- /dev/null +++ b/src/kmer_partition.cpp @@ -0,0 +1,233 @@ +/* + * Copyright 2015 Georgia Institute of Technology + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +// using TCLAP for command line parsing +#include + +// mxx dependency +#include + +// suffix array construction +#include +#include // for random DNA +#include +#include +#include +#include +#include + +std::string file2string(const std::string& filename) { + // read input file into in-memory string + std::string input_str; + std::ifstream t(filename.c_str()); + std::stringstream buffer; + buffer << t.rdbuf(); + input_str = buffer.str(); + return input_str; +} + + +template +struct tl_desa { + index_t total_size; + std::vector lc; + std::vector lcp; + std::vector off; // offsets + + using range_t = std::pair; + + using it_t = typename std::vector::const_iterator; + rmq minq; + + void construct(const std::vector& LCP, const std::vector Lc, index_t maxsize) { + // local ansv !? + off = sample_lcp(LCP, maxsize); + + // create sampled DESA + std::cout << "creating TL DT with " << off.size() << " els" << std::endl; + lcp.resize(off.size()); + lc.resize(off.size()); + + for (size_t i = 0; i < off.size(); ++i) { + lcp[i] = LCP[off[i]]; + lc[i] = Lc[off[i]]; + } + + // constrct RMQ over new sampled LCP + minq = rmq(lcp.begin(), lcp.end()); + } + + + range_t locate(const std::string& P) { + + size_t n = lcp.size(); + size_t m = P.size(); + size_t l = 0; + size_t r = n-1; + + // get first child interval and depth + size_t i = this->minq(l+1, r); + index_t q = this->lcp[i]; + + // blind search + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // check if we've reached the end of the pattern + if (q >= m) { + break; + } + + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->lc[i]; // == S[SA[l]+lcpv] for first iter + if (lc == P[q]) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + + i = this->minq(l+1, r); + } while (l < r && this->lcp[i] == q); + + if (this->lcp[i] == q) { + if (l+1 < r) { + i = this->minq(l+1, r); + } else { + i = l; + } + } + q = this->lcp[i]; + } + + // return the range using offsets + if (r == n-1) { + return range_t(off[l], total_size); + } else { + return range_t(off[l], off[r+1]); + } + } +}; + +// table: inclusive prefix sum? +void evaluate_partition(const std::vector& table) { + + for (int p = 2; p <= 1024; p <<= 1) { + // partitining by bisection + std::vector part = partition(table, p); + + // partition pointers to partition size + int start = 0; + std::vector part_size(p, 0); + for (int i = 0; i < p; ++i) { + int end = (i+1 < p) ? part[i+1] : table.size(); + //table[start] // inclusive !? 
+ // table[i] = + // hist[i] = table[i] - table[i-1] + // sum [start,end) -> + // table[end-1] - table[start-1] + part_size[i] = table[end-1]; + if (start > 0) + part_size[i] -= table[start-1]; + + start = end; + } + + size_t max_size = *std::max_element(part_size.begin(), part_size.end()); + size_t min_size = *std::min_element(part_size.begin(), part_size.end()); + size_t avg = table.back() / p; + + std::cout << "for p=" << p << " size range [" << max_size << "," << min_size << "], load imbalance: " << (max_size - avg)*100./avg << " %" << std::endl; + } +} + +int main(int argc, char *argv[]) +{ + // set up mxx / MPI + mxx::env e(argc, argv); + + try { + // define commandline usage + TCLAP::CmdLine cmd("Load Imbalance study for partitioning of kmers"); + TCLAP::ValueArg fileArg("f", "file", "Input filename.", true, "", "filename"); + cmd.add(fileArg); + TCLAP::SwitchArg trieArg("t", "trie", "use dynamic TL trie", false); + cmd.add(trieArg); + cmd.parse(argc, argv); + + if (!trieArg.getValue()) { + int bits = 25; + + std::string str = file2string(fileArg.getValue()); + alphabet a = alphabet::from_sequence(str.begin(), str.end()); + unsigned int l = a.bits_per_char(); + std::cout << "alphabet: " << a << std::endl; + int k = bits / l; + std::cout << "using k = " << k << std::endl; + + // scan through string to create kmer histogram + std::vector hist = kmer_hist(str.begin(), str.end(), k, a); + std::vector table(hist.size()); + std::cout << "kmer table size: " << table.size()*sizeof(size_t) / 1024 << " kiB" << std::endl; + + std::partial_sum(hist.begin(), hist.end(), table.begin()); + evaluate_partition(table); + + } else { + // use top level trie + std::cout << "loading string" << std::endl; + std::string str = file2string(fileArg.getValue()); + std::cout << "construting DESA" << std::endl; + desa_index idx; + idx.construct(str.begin(), str.end()); + + + std::cout << "constructing TL DESA" << std::endl; + tl_desa tl; + size_t maxsize = str.size() / (1024*128); + tl.construct(idx.LCP, idx.Lc, maxsize); + std::cout << "constructed TL with " << tl.lcp.size() << " elements" << std::endl; + std::cout << "TL total size: " << tl.off.size()*(2*sizeof(size_t)+1) / 1024 << " kiB" << std::endl; + + // use TL offsets and create inclusive prefix sum + std::vector table(tl.off.size()); + + for (size_t i = 0; i+1 < table.size(); ++i) { + table[i] = tl.off[i+1]; + } + table.back() = str.size(); + + evaluate_partition(table); + } + + // catch any TCLAP exception + } catch (TCLAP::ArgException& e) { + std::cerr << "error: " << e.error() << " for arg " << e.argId() << std::endl; + } + + return 0; +} diff --git a/src/mkpattern.cpp b/src/mkpattern.cpp new file mode 100644 index 0000000..cefa3da --- /dev/null +++ b/src/mkpattern.cpp @@ -0,0 +1,58 @@ +#include +#include +#include +#include +#include +#include + +std::string read_file(const std::string& filename) { + // read input file into in-memory string + std::string input_str; + std::ifstream t(filename.c_str()); + std::stringstream buffer; + buffer << t.rdbuf(); + input_str = buffer.str(); + return input_str; +} + +std::vector rand_patterns(const std::string& str, size_t len, size_t num) { + std::vector patterns; + patterns.reserve(num); + std::minstd_rand g; + std::uniform_int_distribution d(0, str.size()-len-1); + for (size_t i = 0; i < num; ++i) { + size_t start = d(g); + patterns.emplace_back(str.begin() + start, str.begin() + start + len); + } + return patterns; +} + +int main(int argc, char *argv[]) +{ + if (argc < 4) { + std::cerr << "Usage: 
./mkpattern " << std::endl; + exit(EXIT_FAILURE); + } + + std::string filename(argv[1]); + int num = std::stoi(argv[2]); + int len = std::stoi(argv[3]); + + if (num <= 0 || len < 1) { + std::cerr << "Usage: ./mkpattern " << std::endl; + exit(EXIT_FAILURE); + } + + // read input file + std::string input = read_file(filename); + + // generate random patterns of given length + std::vector patterns = rand_patterns(input, len, num); + + // output patterns in text format + for (const std::string& P : patterns) { + std::cout << P << std::endl; + } + + return 0; +} diff --git a/src/print64.cpp b/src/print64.cpp index 62dd4fa..85c2b3c 100644 --- a/src/print64.cpp +++ b/src/print64.cpp @@ -30,8 +30,13 @@ int main(int argc, char** argv) { } // read file - std::vector buf(size/8); - int bytes_read = fread(&buf[0], sizeof(uint64_t), size/8, f); + size_t count = size/sizeof(uint64_t); + std::vector buf(count); + size_t read_count = fread(&buf[0], sizeof(uint64_t), count, f); + if (read_count != count) { + std::cerr << "Unexpected error reading file." << std::endl; + exit(EXIT_FAILURE); + } // output file in decimal format for (size_t i = 0; i < buf.size(); ++i) { diff --git a/src/test_slcp.cpp b/src/test_slcp.cpp new file mode 100644 index 0000000..15e43ad --- /dev/null +++ b/src/test_slcp.cpp @@ -0,0 +1,43 @@ + + +#include +#include + +#include + +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) +{ + mxx::env e(argc, argv); + mxx::comm c; + + size_t n = 1000; + size_t maxsize = 20; + + salcp_index idx; + if (c.rank() == 0) { + std::string s = rand_dna(n, 13); + //std::cout << "s = " << s << std::endl; + idx.construct(s.begin(), s.end()); + } + std::vector local_LCP = stable_distribute(idx.LCP, c); + + + std::vector off = sample_lcp_distr(local_LCP, maxsize, c); + + + std::vector goff = mxx::gatherv(off, 0, c); + + if (c.rank() == 0) { + seq_check_sample(idx.LCP, goff, maxsize); + } + + + return 0; +} diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt deleted file mode 100644 index e172452..0000000 --- a/src/tests/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -cmake_minimum_required(VERSION 2.6) - -# project settings -project(psac-tests) - -# RMQ -add_executable(test_rmq test_rmq.cpp) -# Bitops -add_executable(test_bitops test_bitops.cpp) - diff --git a/src/tests/test_bitops.cpp b/src/tests/test_bitops.cpp deleted file mode 100644 index 7e608ee..0000000 --- a/src/tests/test_bitops.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2015 Georgia Institute of Technology - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include "bitops.hpp" - - -// TODO do these things as google test -int main() -{ - // run a large set of random tests - unsigned int sum = 0; - int num_tests = 100000000; - for (int i = 0; i < num_tests; ++i) - { - uint64_t x = 0; - x |= static_cast(rand()) << 32; - x |= static_cast(rand()); - - unsigned int t1 = trailing_zeros(x); - unsigned int t2 = reference_trailing_zeros(x); - //unsigned int t2 = t1; - //sum += t2; - - unsigned int l1 = leading_zeros(x); - unsigned int l2 = reference_leading_zeros(x); - //unsigned int l2 = l1; - //sum += l2; - - if (t1 != t2) { - std::cerr << "Error! trailing zeros: " << t1 << " != " << t2 << std::endl; - } - if (l1 != l2) { - std::cerr << "Error! leading zeros: " << l1 << " != " << l2 << std::endl; - } - } - return sum; -} diff --git a/src/tests/test_rmq.cpp b/src/tests/test_rmq.cpp deleted file mode 100644 index 5004d67..0000000 --- a/src/tests/test_rmq.cpp +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright 2015 Georgia Institute of Technology - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -#include "rmq.hpp" - -template class std::vector; - -class rmq_tester -{ - std::size_t size; - std::vector els; - rmq::iterator>* minquery; -public: - rmq_tester(std::size_t size) : size(size), els(size) - { - std::generate(els.begin(), els.end(), [](){return std::rand();}); - minquery = new rmq::iterator>(els.begin(), els.end()); - } - - bool test(int start, int end) - { - std::vector::iterator min_it = minquery->query(els.begin()+start, els.begin()+end); - if (*min_it != *std::min_element(els.begin()+start, els.begin()+end)) - { - std::cerr << "ERROR: range(" << start << "," << end << ")=" << *(min_it) << " at i=" << min_it - els.begin() << std::endl; - return false; - } - else - return true; - } - - void test_all() - { - for (std::size_t i = 0; i < size; ++i) - for (std::size_t j = i+1; j < size; ++j) - { - if (!test(i,j)) - { - std::cerr << "Error with range(" << i << "," << j << ")" << std::endl; - exit(EXIT_FAILURE); - } - } - } - - ~rmq_tester() - { - delete minquery; - } -}; - -void timing_comp(std::size_t size) -{ - auto start_time = std::chrono::high_resolution_clock::now(); - std::vector els(size); - std::generate(els.begin(), els.end(), [](){return std::rand() % 1000;}); - - auto now = std::chrono::high_resolution_clock::now(); - long duration_ms = std::chrono::duration_cast(now - start_time).count(); - start_time = now; - std::cerr << "Generate input: " << duration_ms << std::endl; - - rmq::iterator> rmq(els.begin(), els.end()); - - now = std::chrono::high_resolution_clock::now(); - duration_ms = std::chrono::duration_cast(now - start_time).count(); - std::cerr << "Construction RMQ: " << duration_ms << std::endl; - - int rand_tests = size; - //double rmq_queries = 0.0; - //double minel_queries = 0.0; - start_time = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < rand_tests; ++i) - { - // get random range and start timing - int 
range_start = std::rand() % (size - 2); - int range_end = std::rand() % (size - range_start - 1) + range_start; - - // query RMQ - auto min = rmq.query(els.begin()+range_start, els.begin()+range_end); - - // stop timing - //now = std::chrono::high_resolution_clock::now(); - //duration_ms = std::chrono::duration_cast(now - start_time).count(); - //rmq_queries += duration_ms; - //start_time = now; - - // use linear std::min_element - auto min2 = std::min_element(els.begin()+range_start, els.begin() + range_end); - - // stop time - //now = std::chrono::high_resolution_clock::now(); - //duration_ms = std::chrono::duration_cast(now - start_time).count(); - //minel_queries += duration_ms; - - if (*min != *min2) - { - std::cerr << "ERROR: different mins for range(" << range_start << "," << range_end << ")" << std::endl; - std::cerr << "rmq at " << min - els.begin() << " = " << *min << std::endl; - std::cerr << "mne at " << min2 - els.begin() << " = " << *min2 << std::endl; - exit(EXIT_FAILURE); - } - else - std::cerr << "." << std::endl; - } - - now = std::chrono::high_resolution_clock::now(); - duration_ms = std::chrono::duration_cast(now - start_time).count(); - std::cout << "RMQ queries total time: " << duration_ms/1000.0 << "s" << std::endl; - //std::cout << "std::min_element queries total time: " << minel_queries/1000.0 << "s" << std::endl; -} - -int main(int argc, char *argv[]) -{ - //timing_comp(1000000037); - /* - for (int size = 1; size < 1000; ++size) - { - rmq_tester rmq(size); - std::cerr << "Testing with size: " << size << std::endl; - rmq.test_all(); - } - */ - //rmq_tester rmq(10000); - //rmq.test_all(); - timing_comp(100000); - - - - return 0; -} diff --git a/src/tldt.cpp b/src/tldt.cpp new file mode 100644 index 0000000..d91147b --- /dev/null +++ b/src/tldt.cpp @@ -0,0 +1,250 @@ + +#include +#include +#include +#include +#include + +// using TCLAP for command line parsing +#include + +// mxx dependencies +#include +#include +#include +#include + +#include +#include +#include +#include +//#include +//#include + +#include "seq_query.hpp" +#include "desa.hpp" + + +template +struct tl_desa { + index_t total_size; + std::vector lc; + std::vector lcp; + std::vector off; // offsets + + using range_t = std::pair; + + using it_t = typename std::vector::const_iterator; + rmq minq; + + void construct(const std::vector& LCP, const std::vector Lc, index_t maxsize) { + // local ansv !? + index_t n = LCP.size(); + total_size = n; + + struct node { + index_t lcp; + index_t pos; + index_t l; + }; + + std::stack st; + st.push(node{0,0,0,0}); + + index_t total_out = 1; + std::vector do_output(n, false); + do_output[0] = true; + + for (index_t i = 1; i < n; ++i) { + if (LCP[i] == 0) { + do_output[i] = true; + continue; + } + while (!st.empty() && st.top().lcp > LCP[i]) { + node u = st.top(); + st.pop(); + // u.pos has range [u.l, .. , i) + index_t parent_size = i - u.l; + if (parent_size > maxsize) { + // output but in inverse order !? 
+ do_output[u.pos] = true; + ++total_out; + } + } + + if (st.empty()) { + // cant happen + assert(false); + } else if (st.top().lcp == LCP[i]) { + st.push(node{LCP[i], i, st.top().l}); + } else { + assert(st.top().lcp < LCP[i]); + st.push(node{LCP[i], i, st.top().pos}); + } + } + + std::cout << "creating TL DT with " << total_out << " els" << std::endl; + lcp.resize(total_out); + lc.resize(total_out); + off.resize(total_out); + + index_t j = 0; + for (index_t i = 0; i < n; ++i) { + if (do_output[i]) { + lcp[j] = sa.LCP[i]; + lc[j] = sa.Lc[i]; + off[j] = i; + ++j; + } + } + + // constrct RMQ over new sampled LCP + minq = rmq(lcp.begin(), lcp.end()); + } + + + range_t locate(const std::string& P) { + + size_t n = lcp.size(); + size_t m = P.size(); + size_t l = 0; + size_t r = n-1; + + // get first child interval and depth + size_t i = this->minq(l+1, r); + index_t q = this->lcp[i]; + + // blind search + while (q < m && l < r) { + + // NOTE: LCP[i] = lcp(SA[i-1],SA[i]), LCP[0] = 0 + // using [l,r] as an inclusive SA range + // corresponding to LCP query range [l+1,r] + + // check if we've reached the end of the pattern + if (q >= m) { + break; + } + + do { + // `i` is the lcp(SA[i-1],SA[i]) + char lc = this->lc[i]; // == S[SA[l]+lcpv] for first iter + if (lc == P[q]) { + r = i-1; + break; + } + l = i; + if (l == r) + break; + + i = this->minq(l+1, r); + } while (l < r && this->lcp[i] == q); + + if (this->lcp[i] == q) { + if (l+1 < r) { + i = this->minq(l+1, r); + } else { + i = l; + } + } + q = this->lcp[i]; + } + + // return the range using offsets + if (r == n-1) { + return range_t(off[l], total_size); + } else { + return range_t(off[l], off[r+1]); + } + } +}; + + +int main(int argc, char *argv[]) { + // set up mxx / MPI + mxx::env e(argc, argv); + mxx::env::set_exception_on_error(); + mxx::comm c; + mxx::print_node_distribution(c); + + // given a file, compute suffix array and lcp array, how to do count/locate query? + // via sa_range() + /* + if (argc < 3) { + std::cerr << "Usage: ./xx " << std::endl; + } + */ + try { + // define commandline usage + TCLAP::CmdLine cmd("Distributed Enhanced Suffix Array"); + TCLAP::ValueArg fileArg("f", "file", "Input string filename.", true, "", "filename"); + cmd.add(fileArg); + TCLAP::ValueArg loadArg("l", "load", "Load index from given basename", false, "", "filename"); + TCLAP::SwitchArg constructArg("c", "construct", "Construct SA/LCP/Lc from input file", false); + cmd.xorAdd(loadArg, constructArg); // either load or construct SA/LCP + TCLAP::ValueArg outArg("o", "outfile", "Output file base name. If --construct was used, this stores the resulting DESA.", false, "", "filename"); + cmd.add(outArg); + + TCLAP::ValueArg queryArg("q", "query", "Query file for benchmarking querying.", false, "", "filename"); + cmd.add(queryArg); + cmd.parse(argc, argv); + + + mxx::section_timer t; + + // create distributed DESA class + using index_t = uint64_t; + using range_t = dist_desa::range_t; + dist_desa idx(c); + + if (constructArg.getValue()) { + if (c.rank() == 0) { + std::cout << "constructing DESA (SA+LCP+LC)..." 
+                std::cout << "constructing DESA (SA+LCP+LC)..." << std::endl;
+            }
+            // read input file into in-memory string
+            std::string input_str = mxx::file_block_decompose(fileArg.getValue().c_str(), c);
+            t.end_section("read input file");
+            // construct DESA from scratch
+            idx.construct(input_str.begin(), input_str.end(), c);
+            t.end_section("construct idx");
+
+            if (outArg.getValue() != "") {
+                // store DESA to given basename
+                if (c.rank() == 0) {
+                    std::cout << "saving DESA to basename `" << outArg.getValue() << "` ..." << std::endl;
+                }
+                idx.write(outArg.getValue(), c);
+            }
+        } else {
+            if (outArg.getValue() != "") {
+                if (c.rank() == 0) {
+                    std::cerr << "WARNING: --outfile argument will be ignored since the input is loaded from file (don't use in conjunction with --load)." << std::endl;
+                }
+            }
+            if (c.rank() == 0) {
+                std::cout << "loading DESA (SA+LCP+LC) from basename `" << loadArg.getValue() << "` ..." << std::endl;
+            }
+            idx.read(fileArg.getValue(), loadArg.getValue(), c);
+
+        }
+
+        // query benchmarking?
+        if (queryArg.getValue() != "") {
+            strings ss = strings::from_dfile(queryArg.getValue(), c);
+            t.end_section("read patterns file");
+
+            // run locate a couple of times
+            int reps = 10;
+            for (int i = 0; i < reps; ++i) {
+                std::vector mysols = idx.bulk_locate(ss);
+                t.end_section("bulk_locate");
+            }
+        }
+
+    } catch (TCLAP::ArgException& e) {
+        std::cerr << "error: " << e.error() << " for arg " << e.argId() << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    return 0;
+}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 7ee9532..ee0a64a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -5,28 +5,35 @@ project(psac-test)

 # bitops TODO: don't use mxx for these sequential tests!
 add_executable(test-bitops test_bitops.cpp)
-target_link_libraries(test-bitops mxx-gtest-main)
+target_link_libraries(test-bitops psaclib mxx-gtest-main)

 add_executable(test-rmq test_rmq.cpp)
-target_link_libraries(test-rmq mxx-gtest-main)
+target_link_libraries(test-rmq psaclib mxx-gtest-main)
+
+add_executable(test-samplelcp test_samplelcp.cpp)
+target_link_libraries(test-samplelcp psac-dss-lib mxx-gtest-main)

 # Parallel tests
 add_executable(test-ansv test_ansv.cpp)
-target_link_libraries(test-ansv mxx-gtest-main)
+target_link_libraries(test-ansv psaclib mxx-gtest-main)

 add_executable(test-ss test_stringset.cpp)
-target_link_libraries(test-ss mxx-gtest-main)
+target_link_libraries(test-ss psaclib mxx-gtest-main)

 add_executable(test-gsa test_gsa.cpp)
-target_link_libraries(test-gsa mxx-gtest-main rt)
+target_link_libraries(test-gsa psaclib mxx-gtest-main)

 add_executable(test-suffixtree test_suffixtree.cpp)
-target_link_libraries(test-suffixtree mxx-gtest-main rt)
+target_link_libraries(test-suffixtree psaclib mxx-gtest-main)

 add_executable(test-psac test_psac.cpp)
-target_link_libraries(test-psac mxx-gtest-main)
-target_link_libraries(test-psac divsufsort)
-target_link_libraries(test-psac divsufsort64)
+target_link_libraries(test-psac psac-dss-lib mxx-gtest-main)
+
+add_executable(test-seq-query test_seq_query.cpp)
+target_link_libraries(test-seq-query psac-dss-lib mxx-gtest-main)
+
+add_executable(test-desa test_desa.cpp)
+target_link_libraries(test-desa psac-dss-lib mxx-gtest-main)

 # standalone tests
 #add_executable(test-ss test_stringset.cpp)
@@ -34,4 +41,4 @@ target_link_libraries(test-psac divsufsort64)

 # standalone tests
 add_executable(test-lcp test_glcp.cpp)
-target_link_libraries(test-lcp ${EXTRA_LIBS} rt)
+target_link_libraries(test-lcp psaclib)
diff --git a/test/test_desa.cpp b/test/test_desa.cpp
new file mode 100644
index 0000000..1370dab
--- /dev/null
+++ b/test/test_desa.cpp
@@ -0,0 +1,154 @@
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "seq_query.hpp"
+#include "desa.hpp"
+#include "tldt.hpp"
+
+
+using index_t = uint64_t;
+using range_t = dist_desa::range_t;
+
+
+inline std::vector<std::pair<std::string, range_t>> mississippi_testcase() {
+    using r = range_t;
+
+    std::vector<std::pair<std::string, range_t>> patterns;
+    patterns.emplace_back("i", r(0,4));
+    patterns.emplace_back("mississip", r(4,5));
+    patterns.emplace_back("ss", r(9,11));
+    patterns.emplace_back("pp", r(6,7));
+
+    patterns.emplace_back("m", r(4,5));
+    patterns.emplace_back("pi", r(5,6));
+    patterns.emplace_back("is", r(2,4));
+    patterns.emplace_back("ssi", r(9,11));
+
+    patterns.emplace_back("p", r(5,7));
+    patterns.emplace_back("ississippi", r(3,4));
+    patterns.emplace_back("si", r(7,9));
+    patterns.emplace_back("miss", r(4,5));
+
+    patterns.emplace_back("s", r(7,11));
+    patterns.emplace_back("ip", r(1,2));
+    patterns.emplace_back("ppi", r(6,7));
+    patterns.emplace_back("mississippi", r(4,5));
+
+    return patterns;
+}
+
+
+TEST(DesaTest,MississippiLocatePossible) {
+    mxx::comm c;
+    ASSERT_TRUE(c.size() <= 7) << "the `mississippi` test shouldn't run with more processors than character pairs";
+
+    std::string s;
+    if (c.rank() == 0) {
+        s = "mississippi";
+    }
+    std::string input_str = mxx::stable_distribute(s, c);
+    // construct DESA
+    dist_desa idx(c);
+    idx.construct(input_str.begin(), input_str.end(), c);
+
+
+    auto patterns = mississippi_testcase();
+
+    // locate single pattern in distributed representation (collective op)
+    for (auto pat : patterns) {
+        std::string P = pat.first;
+        range_t exp_res = pat.second;
+
+        range_t r = idx.locate_possible(P);
+        // (EXPECTED, ACTUAL)
+        EXPECT_EQ(exp_res, r);
+    }
+}
+
+TEST(DesaTest, FileIO) {
+    mxx::comm c;
+    ASSERT_TRUE(c.size() <= 7) << "the `mississippi` test shouldn't run with more processors than character pairs";
+
+    std::string s;
+    if (c.rank() == 0) {
+        s = "mississippi";
+        std::ofstream f("miss.str");
+        f.write(&s[0],s.size());
+    }
+    std::string input_str = mxx::stable_distribute(s, c);
+    // construct DESA
+    dist_desa desa(c);
+    desa.construct(input_str.begin(), input_str.end(), c);
+
+    desa.write("desa_miss", c);
+
+    dist_desa desa2(c);
+    desa2.read("miss.str", "desa_miss", c);
+    EXPECT_EQ(desa.sa.local_SA, desa2.sa.local_SA);
+    EXPECT_EQ(desa.sa.local_LCP, desa2.sa.local_LCP);
+    EXPECT_EQ(desa.sa.local_Lc, desa2.sa.local_Lc);
+    EXPECT_EQ(desa.tli.idx.table, desa2.tli.idx.table);
+    EXPECT_EQ(desa.tli.idx.alpha, desa2.tli.idx.alpha);
+
+    // TODO: implement test for loading with smaller/larger number of processors
+    // and still check that querying works correctly
+    // -> introduce function to test queries for a given desa
+}
+
+#define PV(x, c) mxx::sync_cout(c) << "[" << c.rank() << "] " << "" # x " = " << x << std::endl;
+
+TEST(DesaTest,MississippiBulkLocatePossible) {
+    mxx::comm c;
+
+    ASSERT_TRUE(c.size() <= 7) << "the `mississippi` test shouldn't run with more processors than character pairs";
+
+    std::string s;
+    if (c.rank() == 0) {
+        s = "mississippi";
+    }
+    std::string input_str = mxx::stable_distribute(s, c);
+    // construct DESA
+    dist_desa> idx(c);
+    idx.construct(input_str.begin(), input_str.end(), c);
+
+    PV(input_str, c);
+    PV(idx.sa.local_LCP, c);
+    PV(idx.sa.local_SA, c);
+    PV(idx.tli.prefix(), c);
+    PV(idx.tli.idx.LCP, c);
+
+
+    // create `strings` from patterns
+    auto patterns = mississippi_testcase();
+    mxx::blk_dist dist(patterns.size(), c.size(), c.rank());
+
+
+    std::vector my_patterns(dist.local_size());
+
+    for (size_t i = 0; i < dist.local_size(); ++i) {
+        my_patterns[i] = patterns[i+dist.eprefix_size()].first;
+    }
+
+    strings ss = strings::from_vec(my_patterns);
+    std::vector mysols = idx.bulk_locate(ss);
+
+    // check results
+    for (size_t i = 0; i < dist.local_size(); ++i) {
+        EXPECT_EQ(patterns[i+dist.eprefix_size()].second, mysols[i]);
+    }
+}
diff --git a/test/test_psac.cpp b/test/test_psac.cpp
index c0d8771..80116b8 100644
--- a/test/test_psac.cpp
+++ b/test/test_psac.cpp
@@ -324,6 +324,8 @@ TEST(PSAC, FileIO) {
     EXPECT_EQ(sa.local_LCP, sa2.local_LCP);
     EXPECT_EQ(sa.local_Lc, sa2.local_Lc);

+    EXPECT_EQ(sa.alpha, sa2.alpha);
+
     std::vector gsa = mxx::gatherv(sa.local_SA, 0, c);
     std::vector glcp = mxx::gatherv(sa.local_LCP, 0, c);
     std::vector glc = mxx::gatherv(sa.local_Lc, 0, c);
@@ -332,6 +334,7 @@ TEST(PSAC, FileIO) {
     c.with_subset(c.rank() < c.size() - 2, [&](const mxx::comm& sc) {
         suffix_array sa3(sc);
         sa3.read("miss");
+        EXPECT_EQ(sa.alpha, sa3.alpha);
         // gather and check
         EXPECT_EQ(gsa, mxx::gatherv(sa3.local_SA, 0, sc));
         EXPECT_EQ(glcp, mxx::gatherv(sa3.local_LCP, 0, sc));
diff --git a/test/test_samplelcp.cpp b/test/test_samplelcp.cpp
new file mode 100644
index 0000000..4c80d91
--- /dev/null
+++ b/test/test_samplelcp.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2019 Georgia Institute of Technology
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @brief Unit tests for sampling the LCP
+ */
+
+#include
+
+#include
+
+#include
+#include
+
+#include // TODO: rename/move code to different file?
+#include // for rand_dna
+#include // for sequential SA/LCP construction
+#include
+
+
+
+TEST(PsacSampleLCP, SeqSmall) {
+    // sequentially get SA and LCP for random or easy input!?
+    size_t n = 10000;
+    size_t maxsize = 78;
+    std::string s = rand_dna(n, 13);
+
+
+    salcp_index idx;
+    idx.construct(s.begin(), s.end());
+
+
+    std::vector off = sample_lcp_indirect(idx.LCP, maxsize);
+
+
+    // check correctness
+    // (EXPECTED, ACTUAL)
+    seq_check_sample(idx.LCP, off, maxsize);
+}
+
+TEST(PsacSampleLCP, ParSmall) {
+    // sequentially get SA and LCP for random or easy input!?
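+    // flow below: build SA/LCP sequentially on rank 0 only, block-distribute
+    // the LCP over the communicator, sample it with the distributed variant,
+    // then gather the sampled offsets back to rank 0 and verify them against
+    // the full LCP using the same sequential checker as above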
+    size_t n = 100000;
+    size_t maxsize = 123;
+
+    mxx::comm c;
+
+    salcp_index idx;
+    if (c.rank() == 0) {
+        std::string s = rand_dna(n, 13);
+        idx.construct(s.begin(), s.end());
+    }
+
+    std::vector local_LCP = mxx::stable_distribute(idx.LCP, c);
+
+    std::vector local_off = sample_lcp_distr(local_LCP, maxsize, c);
+
+    std::vector off = mxx::gatherv(local_off, 0, c);
+
+    if (c.rank() == 0) {
+        seq_check_sample(idx.LCP, off, maxsize);
+    }
+}
diff --git a/test/test_seq_query.cpp b/test/test_seq_query.cpp
new file mode 100644
index 0000000..1b524cf
--- /dev/null
+++ b/test/test_seq_query.cpp
@@ -0,0 +1,83 @@
+
+#include
+
+#include
+
+#include
+#include // TODO: move partition tests elsewhere
+
+
+/* type parameterized index test */
+
+template
+class SeqIndexTest : public ::testing::Test {
+  public:
+    IdxType idx;
+};
+
+using index_t = uint64_t;
+using MyTypes = ::testing::Types, esa_index, bs_esa_index, desa_index, lookup_desa_index>;
+TYPED_TEST_CASE(SeqIndexTest, MyTypes);
+
+// search for patterns inside `mississippi`
+TYPED_TEST(SeqIndexTest, Locate_mississippi) {
+    std::string input_str = "mississippi";
+    using r = std::pair;
+
+    TypeParam idx;
+    idx.construct(input_str.begin(), input_str.end());
+
+    // TODO: flip arguments to: (EXPECTED, ACTUAL)
+    EXPECT_EQ(idx.locate("i"), r(0,4));
+    EXPECT_EQ(idx.locate("is"), r(2,4));
+    EXPECT_EQ(idx.locate("ip"), r(1,2));
+    EXPECT_EQ(idx.locate("ississippi"), r(3,4));
+
+    EXPECT_EQ(idx.locate("m"), r(4,5));
+    EXPECT_EQ(idx.locate("miss"), r(4,5));
+    EXPECT_EQ(idx.locate("mississip"), r(4,5));
+    EXPECT_EQ(idx.locate("mississippi"), r(4,5));
+
+    EXPECT_EQ(idx.locate("p"), r(5,7));
+    EXPECT_EQ(idx.locate("pi"), r(5,6));
+    EXPECT_EQ(idx.locate("pp"), r(6,7));
+    EXPECT_EQ(idx.locate("ppi"), r(6,7));
+
+    EXPECT_EQ(idx.locate("s"), r(7,11));
+    EXPECT_EQ(idx.locate("ss"), r(9,11));
+    EXPECT_EQ(idx.locate("si"), r(7,9));
+    EXPECT_EQ(idx.locate("ssi"), r(9,11));
+
+    // expect not to find these:
+    for (auto P : {"misx", "ississippii", "issississi", "mippi", "piss"}) {
+        r res = idx.locate(P);
+        EXPECT_EQ(res.first, res.second);
+
+    }
+}
+
+// TODO: test case for testing random string and random patterns
+// check correctness by checking left and right next SA suffixes
+
+
+
+TEST(Partition, part0) {
+    lookup_desa_index idx;
+    std::string input_str = "mississippi";
+    idx.construct(input_str.begin(), input_str.end());
+
+    int p = 6;
+    std::vector part = partition(idx.tl.table, p);
+    std::cout << "part: " << part << std::endl;
+
+    // print size stats
+    for (int i = 0; i < p; ++i) {
+        size_t start = part[i];
+        size_t end = i+1 < p ? part[i+1] : idx.tl.table.size()-1;
+        size_t num = idx.tl.table[end] - idx.tl.table[start];
+        std::cout << "[p " << i << "]: [" << start << "," << end << "): " << num*100. / idx.tl.table.back() << " % share" << std::endl;
+    }
+}
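+
+// Illustrative follow-up sketch (same assumptions as `part0` above: `partition`
+// returns at least p start offsets into the top-level lookup table, and the
+// index type is declared as in `part0`): the bucket boundaries should be
+// non-decreasing and stay inside the table.
+TEST(Partition, part0_boundaries) {
+    lookup_desa_index idx;
+    std::string input_str = "mississippi";
+    idx.construct(input_str.begin(), input_str.end());
+
+    int p = 6;
+    auto part = partition(idx.tl.table, p);
+
+    for (int i = 0; i < p; ++i) {
+        if (i > 0) {
+            EXPECT_LE(part[i-1], part[i]);          // bucket starts are sorted
+        }
+        EXPECT_LT(part[i], idx.tl.table.size());    // and point inside the table
+    }
+}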