Skip to content

Commit

Permalink
Merge pull request #4352 from vgteam/split-out-band-padding
Browse files Browse the repository at this point in the history
Split out band padding algorithm from MultipathAligner
  • Loading branch information
adamnovak authored Jul 29, 2024
2 parents 9f18067 + b55be13 commit 62bb596
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 38 deletions.
45 changes: 45 additions & 0 deletions src/algorithms/pad_band.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/**
* \file pad_band.cpp
*
* Defines implementation for band padding functions for banded global alignment.
*/

#include "pad_band.hpp"

#include <cmath>

namespace vg {
namespace algorithms {

std::function<size_t(const Alignment&, const HandleGraph&)> pad_band_random_walk(double band_padding_multiplier, size_t band_padding_memo_size) {

// We're goign to capture this vector by value into the closure
std::vector<size_t> band_padding_memo;

// Fill it in to initialize
band_padding_memo.resize(band_padding_memo_size);
for (size_t i = 0; i < band_padding_memo.size(); i++) {
band_padding_memo[i] = size_t(band_padding_multiplier * sqrt(i)) + 1;
}

function<size_t(const Alignment&, const HandleGraph&)> choose_band_padding = [band_padding_multiplier, band_padding_memo](const Alignment& seq, const HandleGraph& graph) {
size_t read_length = seq.sequence().size();
return read_length < band_padding_memo.size() ? band_padding_memo.at(read_length)
: size_t(band_padding_multiplier * sqrt(read_length)) + 1;
};

// And return the closure which now owns the memo table.
return choose_band_padding;
}

std::function<size_t(const Alignment&, const HandleGraph&)> pad_band_constant(size_t band_padding) {
// don't dynamically choose band padding, shim constant value into a function type
function<size_t(const Alignment&,const HandleGraph&)> constant_padding = [band_padding](const Alignment& seq, const HandleGraph& graph) {
return band_padding;
};

return constant_padding;
}

}
}
28 changes: 28 additions & 0 deletions src/algorithms/pad_band.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef VG_ALGORITHMS_PAD_BAND_HPP_INCLUDED
#define VG_ALGORITHMS_PAD_BAND_HPP_INCLUDED

/**
* \file pad_band.hpp
*
* Defines algorithm for computing band padding for banded alignment.
*/

#include "../handle.hpp"
#include <vg/vg.pb.h>

namespace vg {
namespace algorithms {

using namespace std;

/// Get a band padding function that uses the expected distance of a random
/// walk, memoized out to the given length.
std::function<size_t(const Alignment&, const HandleGraph&)> pad_band_random_walk(double band_padding_multiplier = 1.0, size_t band_padding_memo_size = 2000);

/// Get a band padding function that uses a constant value.
std::function<size_t(const Alignment&, const HandleGraph&)> pad_band_constant(size_t band_padding);

}
}

#endif
9 changes: 3 additions & 6 deletions src/multipath_alignment_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include "algorithms/extract_connecting_graph.hpp"
#include "algorithms/extract_extending_graph.hpp"
#include "algorithms/pad_band.hpp"

//#define debug_multipath_alignment
//#define debug_decompose_algorithm
Expand Down Expand Up @@ -4231,10 +4232,6 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap
SnarlDistanceIndex* dist_index, const function<pair<id_t, bool>(id_t)>* project,
bool allow_negative_scores, unordered_map<handle_t, bool>* left_align_strand) {

// don't dynamically choose band padding, shim constant value into a function type
function<size_t(const Alignment&,const HandleGraph&)> constant_padding = [&](const Alignment& seq, const HandleGraph& graph) {
return band_padding;
};
align(alignment,
align_graph,
aligner,
Expand All @@ -4246,7 +4243,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap
max_tail_length,
simplify_topologies,
unmergeable_len,
constant_padding,
algorithms::pad_band_constant(band_padding),
multipath_aln_out,
cutting_snarls,
dist_index,
Expand Down Expand Up @@ -5186,7 +5183,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap
bool score_anchors_as_matches, size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap,
double pessimistic_tail_gap_multiplier, size_t max_tail_length,
bool simplify_topologies, size_t unmergeable_len,
function<size_t(const Alignment&,const HandleGraph&)> band_padding_function,
const function<size_t(const Alignment&,const HandleGraph&)>& band_padding_function,
multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls,
SnarlDistanceIndex* dist_index, const function<pair<id_t, bool>(id_t)>* project,
bool allow_negative_scores, unordered_map<handle_t, bool>* left_align_strand) {
Expand Down
2 changes: 1 addition & 1 deletion src/multipath_alignment_graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ namespace vg {
/// with topologically_order_subpaths() before trying to run DP on it.
void align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, bool score_anchors_as_matches,
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, size_t max_tail_length,
bool simplify_topologies, size_t unmergeable_len, function<size_t(const Alignment&,const HandleGraph&)> band_padding_function,
bool simplify_topologies, size_t unmergeable_len, const function<size_t(const Alignment&,const HandleGraph&)>& band_padding_function,
multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr, SnarlDistanceIndex* dist_index = nullptr,
const function<pair<id_t, bool>(id_t)>* project = nullptr, bool allow_negative_scores = false,
unordered_map<handle_t, bool>* left_align_strand = nullptr);
Expand Down
23 changes: 2 additions & 21 deletions src/multipath_mapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include "algorithms/jump_along_path.hpp"
#include "algorithms/ref_path_distance.hpp"
#include "algorithms/component.hpp"
#include "algorithms/pad_band.hpp"

namespace vg {

Expand All @@ -60,6 +61,7 @@ namespace vg {
haplo::ScoreProvider* haplo_score_provider, SnarlManager* snarl_manager,
SnarlDistanceIndex* distance_index) :
BaseMapper(graph, gcsa_index, lcp_array, haplo_score_provider),
choose_band_padding(algorithms::pad_band_random_walk(1.0, 0)),
snarl_manager(snarl_manager),
distance_index(distance_index),
path_component_index(distance_index ? nullptr : new PathComponentIndex(graph)),
Expand Down Expand Up @@ -895,15 +897,6 @@ namespace vg {
}
}
}

void MultipathMapper::init_band_padding_memo() {
band_padding_memo.clear();
band_padding_memo.resize(band_padding_memo_size);

for (size_t i = 0; i < band_padding_memo.size(); i++) {
band_padding_memo[i] = size_t(band_padding_multiplier * sqrt(i)) + 1;
}
}

void MultipathMapper::set_alignment_scores(const int8_t* score_matrix, int8_t gap_open, int8_t gap_extend,
int8_t full_length_bonus) {
Expand Down Expand Up @@ -6248,12 +6241,6 @@ namespace vg {
}
#endif

function<size_t(const Alignment&, const HandleGraph&)> choose_band_padding = [&](const Alignment& seq, const HandleGraph& graph) {
size_t read_length = seq.sequence().size();
return read_length < band_padding_memo.size() ? band_padding_memo.at(read_length)
: size_t(band_padding_multiplier * sqrt(read_length)) + 1;
};

// do the connecting alignments and fill out the multipath_alignment_t object
multi_aln_graph.align(alignment, *align_dag, aligner, true, num_alt_alns, dynamic_max_alt_alns, max_alignment_gap,
use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, std::numeric_limits<size_t>::max(),
Expand Down Expand Up @@ -6305,12 +6292,6 @@ namespace vg {
multi_aln_graph.topological_sort(topological_order);
multi_aln_graph.remove_transitive_edges(topological_order);

function<size_t(const Alignment&, const HandleGraph&)> choose_band_padding = [&](const Alignment& seq, const HandleGraph& graph) {
size_t read_length = seq.sequence().end() - seq.sequence().begin();
return read_length < band_padding_memo.size() ? band_padding_memo.at(read_length)
: size_t(band_padding_multiplier * sqrt(read_length)) + 1;
};

// do the connecting alignments and fill out the multipath_alignment_t object
multi_aln_graph.align(alignment, subgraph, aligner, false, num_alt_alns, dynamic_max_alt_alns, max_alignment_gap,
use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, std::numeric_limits<size_t>::max(),
Expand Down
11 changes: 3 additions & 8 deletions src/multipath_mapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,6 @@ namespace vg {
/// Experimental: skeleton code for predicting path distance from minimum distance
void determine_distance_correlation();

/// Should be called once after construction, or any time the band padding multiplier is changed
void init_band_padding_memo();

using AlignerClient::set_alignment_scores;

/// Set the algner scoring parameters and create the stored aligner instances. The
Expand Down Expand Up @@ -115,7 +112,6 @@ namespace vg {
size_t max_tail_merge_supress_length = 4;
bool suppress_tail_anchors = false;
size_t min_tail_anchor_length = 3;
double band_padding_multiplier = 1.0;
bool use_pessimistic_tail_alignment = false;
double pessimistic_gap_multiplier = 0.0;
bool restrained_graph_extraction = false;
Expand All @@ -136,7 +132,6 @@ namespace vg {
int max_fanout_base_quality = 20;
int max_fans_out = 5;
size_t max_p_value_memo_size = 500;
size_t band_padding_memo_size = 2000;
double max_exponential_rate_intercept = 0.612045;
double max_exponential_rate_slope = 0.000555181;
double max_exponential_shape_intercept = 12.136;
Expand Down Expand Up @@ -203,6 +198,9 @@ namespace vg {
// the maximum number of pairs of each motif that we will consider during spliced alignment
size_t max_motif_pairs = 1024;
unordered_set<path_handle_t> ref_path_handles;

// A function for computing band padding
std::function<size_t(const Alignment&, const HandleGraph&)> choose_band_padding;

//static size_t PRUNE_COUNTER;
//static size_t SUBGRAPH_TOTAL;
Expand Down Expand Up @@ -688,9 +686,6 @@ namespace vg {
static thread_local unordered_map<double, vector<int64_t>> pessimistic_gap_memo;
static const size_t gap_memo_max_size;

// a memo for transcendental band padidng function (gets initialized at construction)
vector<size_t> band_padding_memo;

#ifdef mpmap_instrument_mem_statistics
public:
ofstream _mem_stats;
Expand Down
4 changes: 2 additions & 2 deletions src/subcommand/mpmap_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#include <vg/io/vpkg.hpp>
#include "../algorithms/component.hpp"
#include "../algorithms/pad_band.hpp"
#include "../multipath_mapper.hpp"
#include "../mem_accelerator.hpp"
#include "../surjector.hpp"
Expand Down Expand Up @@ -1900,8 +1901,7 @@ int main_mpmap(int argc, char** argv) {
}
multipath_mapper.adjust_alignments_for_base_quality = qual_adjusted;
multipath_mapper.strip_bonuses = strip_full_length_bonus;
multipath_mapper.band_padding_multiplier = band_padding_multiplier;
multipath_mapper.init_band_padding_memo();
multipath_mapper.choose_band_padding = vg::algorithms::pad_band_random_walk(band_padding_multiplier);

// set mem finding parameters
multipath_mapper.hit_max = hit_max;
Expand Down

1 comment on commit 62bb596

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17355 seconds

Please sign in to comment.