Skip to content

Commit

Permalink
Merge pull request #4371 from vgteam/left-align-better
Browse files Browse the repository at this point in the history
Even more better base-level alignments when surjecting long reads
  • Loading branch information
jeizenga authored Aug 11, 2024
2 parents 683aa18 + fd85a78 commit 10d0f96
Show file tree
Hide file tree
Showing 11 changed files with 169 additions and 135 deletions.
30 changes: 23 additions & 7 deletions src/algorithms/pad_band.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,43 @@
namespace vg {
namespace algorithms {

std::function<size_t(const Alignment&, const HandleGraph&)> pad_band_random_walk(double band_padding_multiplier, size_t band_padding_memo_size) {

std::function<size_t(const Alignment&, const HandleGraph&)> pad_band_random_walk_internal(double band_padding_multiplier, size_t band_padding_memo_size, size_t max_padding,
const std::function<size_t(const Alignment&, const HandleGraph&)> size_function) {
// We're goign to capture this vector by value into the closure
std::vector<size_t> band_padding_memo;

// Fill it in to initialize
band_padding_memo.resize(band_padding_memo_size);
for (size_t i = 0; i < band_padding_memo.size(); i++) {
band_padding_memo[i] = size_t(band_padding_multiplier * sqrt(i)) + 1;
band_padding_memo[i] = std::min<size_t>(max_padding, size_t(band_padding_multiplier * sqrt(i)) + 1);
}

function<size_t(const Alignment&, const HandleGraph&)> choose_band_padding = [band_padding_multiplier, band_padding_memo](const Alignment& seq, const HandleGraph& graph) {
size_t read_length = seq.sequence().size();
return read_length < band_padding_memo.size() ? band_padding_memo.at(read_length)
: size_t(band_padding_multiplier * sqrt(read_length)) + 1;

function<size_t(const Alignment&, const HandleGraph&)> choose_band_padding =
[band_padding_multiplier, band_padding_memo, size_function, max_padding](const Alignment& seq, const HandleGraph& graph) {
size_t size = size_function(seq, graph);
return size < band_padding_memo.size() ? band_padding_memo.at(size)
: std::min<size_t>(max_padding, size_t(band_padding_multiplier * sqrt(size)) + 1);
};

// And return the closure which now owns the memo table.
return choose_band_padding;
}

std::function<size_t(const Alignment&, const HandleGraph&)> pad_band_random_walk(double band_padding_multiplier, size_t band_padding_memo_size, size_t max_padding) {
return pad_band_random_walk_internal(band_padding_multiplier, band_padding_memo_size, max_padding,
[](const Alignment& seq, const HandleGraph&) {
return seq.sequence().size();
});
}

std::function<size_t(const Alignment&, const HandleGraph&)> pad_band_min_random_walk(double band_padding_multiplier, size_t band_padding_memo_size, size_t max_padding) {
return pad_band_random_walk_internal(band_padding_multiplier, band_padding_memo_size, max_padding,
[](const Alignment& seq, const HandleGraph& graph) {
return std::min(seq.sequence().size(), graph.get_total_length());
});
}

std::function<size_t(const Alignment&, const HandleGraph&)> pad_band_constant(size_t band_padding) {
// don't dynamically choose band padding, shim constant value into a function type
function<size_t(const Alignment&,const HandleGraph&)> constant_padding = [band_padding](const Alignment& seq, const HandleGraph& graph) {
Expand Down
14 changes: 12 additions & 2 deletions src/algorithms/pad_band.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,25 @@

#include "../handle.hpp"
#include <vg/vg.pb.h>
#include <limits>

namespace vg {
namespace algorithms {

using namespace std;

/// Get a band padding function that uses the expected distance of a random
/// Get a band padding function that scales with the expected distance of a random
/// walk, memoized out to the given length.
std::function<size_t(const Alignment&, const HandleGraph&)> pad_band_random_walk(double band_padding_multiplier = 1.0, size_t band_padding_memo_size = 2000);
std::function<size_t(const Alignment&, const HandleGraph&)> pad_band_random_walk(double band_padding_multiplier = 1.0,
size_t band_padding_memo_size = 2000,
size_t max_padding = std::numeric_limits<size_t>::max());

/// Get a band padding function that scales the expected distance of a random
/// walk, memoized out to the given length, using the minimum of graph size and
/// read size as the length.
std::function<size_t(const Alignment&, const HandleGraph&)> pad_band_min_random_walk(double band_padding_multiplier = 1.0,
size_t band_padding_memo_size = 2000,
size_t max_padding = std::numeric_limits<size_t>::max());

/// Get a band padding function that uses a constant value.
std::function<size_t(const Alignment&, const HandleGraph&)> pad_band_constant(size_t band_padding);
Expand Down
60 changes: 20 additions & 40 deletions src/aligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1421,8 +1421,7 @@ void Aligner::align_pinned_multi(Alignment& alignment, vector<Alignment>& alt_al
}

void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g,
int32_t band_padding, bool permissive_banding,
const unordered_map<handle_t, bool>* left_align_strand) const {
int32_t band_padding, bool permissive_banding) const {

if (alignment.sequence().empty()) {
// we can save time by using a specialized deletion aligner for empty strings
Expand All @@ -1447,8 +1446,7 @@ void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g,
g,
band_padding,
permissive_banding,
false,
left_align_strand);
false);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
} else if (best_score <= numeric_limits<int16_t>::max() && worst_score >= numeric_limits<int16_t>::min()) {
Expand All @@ -1457,8 +1455,7 @@ void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g,
g,
band_padding,
permissive_banding,
false,
left_align_strand);
false);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
} else if (best_score <= numeric_limits<int32_t>::max() && worst_score >= numeric_limits<int32_t>::min()) {
Expand All @@ -1467,8 +1464,7 @@ void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g,
g,
band_padding,
permissive_banding,
false,
left_align_strand);
false);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
} else {
Expand All @@ -1477,16 +1473,14 @@ void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g,
g,
band_padding,
permissive_banding,
false,
left_align_strand);
false);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
}
}

void Aligner::align_global_banded_multi(Alignment& alignment, vector<Alignment>& alt_alignments, const HandleGraph& g,
int32_t max_alt_alns, int32_t band_padding, bool permissive_banding,
const unordered_map<handle_t, bool>* left_align_strand) const {
int32_t max_alt_alns, int32_t band_padding, bool permissive_banding) const {

if (alignment.sequence().empty()) {
// we can save time by using a specialized deletion aligner for empty strings
Expand All @@ -1511,8 +1505,7 @@ void Aligner::align_global_banded_multi(Alignment& alignment, vector<Alignment>&
max_alt_alns,
band_padding,
permissive_banding,
false,
left_align_strand);
false);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
} else if (best_score <= numeric_limits<int16_t>::max() && worst_score >= numeric_limits<int16_t>::min()) {
Expand All @@ -1523,8 +1516,7 @@ void Aligner::align_global_banded_multi(Alignment& alignment, vector<Alignment>&
max_alt_alns,
band_padding,
permissive_banding,
false,
left_align_strand);
false);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
} else if (best_score <= numeric_limits<int32_t>::max() && worst_score >= numeric_limits<int32_t>::min()) {
Expand All @@ -1535,8 +1527,7 @@ void Aligner::align_global_banded_multi(Alignment& alignment, vector<Alignment>&
max_alt_alns,
band_padding,
permissive_banding,
false,
left_align_strand);
false);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
} else {
Expand All @@ -1547,8 +1538,7 @@ void Aligner::align_global_banded_multi(Alignment& alignment, vector<Alignment>&
max_alt_alns,
band_padding,
permissive_banding,
false,
left_align_strand);
false);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
}
Expand Down Expand Up @@ -2105,8 +2095,7 @@ void QualAdjAligner::align_pinned_multi(Alignment& alignment, vector<Alignment>&
}

void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph& g,
int32_t band_padding, bool permissive_banding,
const unordered_map<handle_t, bool>* left_align_strand) const {
int32_t band_padding, bool permissive_banding) const {

if (alignment.sequence().empty()) {
// we can save time by using a specialized deletion aligner for empty strings
Expand All @@ -2129,8 +2118,7 @@ void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph
g,
band_padding,
permissive_banding,
true,
left_align_strand);
true);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
} else if (best_score <= numeric_limits<int16_t>::max() && worst_score >= numeric_limits<int16_t>::min()) {
Expand All @@ -2139,8 +2127,7 @@ void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph
g,
band_padding,
permissive_banding,
true,
left_align_strand);
true);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
} else if (best_score <= numeric_limits<int32_t>::max() && worst_score >= numeric_limits<int32_t>::min()) {
Expand All @@ -2149,8 +2136,7 @@ void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph
g,
band_padding,
permissive_banding,
true,
left_align_strand);
true);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
} else {
Expand All @@ -2159,16 +2145,14 @@ void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph
g,
band_padding,
permissive_banding,
true,
left_align_strand);
true);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
}
}

void QualAdjAligner::align_global_banded_multi(Alignment& alignment, vector<Alignment>& alt_alignments, const HandleGraph& g,
int32_t max_alt_alns, int32_t band_padding, bool permissive_banding,
const unordered_map<handle_t, bool>* left_align_strand) const {
int32_t max_alt_alns, int32_t band_padding, bool permissive_banding) const {

if (alignment.sequence().empty()) {
// we can save time by using a specialized deletion aligner for empty strings
Expand All @@ -2193,8 +2177,7 @@ void QualAdjAligner::align_global_banded_multi(Alignment& alignment, vector<Alig
max_alt_alns,
band_padding,
permissive_banding,
true,
left_align_strand);
true);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
} else if (best_score <= numeric_limits<int16_t>::max() && worst_score >= numeric_limits<int16_t>::min()) {
Expand All @@ -2205,8 +2188,7 @@ void QualAdjAligner::align_global_banded_multi(Alignment& alignment, vector<Alig
max_alt_alns,
band_padding,
permissive_banding,
true,
left_align_strand);
true);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
} else if (best_score <= numeric_limits<int32_t>::max() && worst_score >= numeric_limits<int32_t>::min()) {
Expand All @@ -2217,8 +2199,7 @@ void QualAdjAligner::align_global_banded_multi(Alignment& alignment, vector<Alig
max_alt_alns,
band_padding,
permissive_banding,
true,
left_align_strand);
true);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
} else {
Expand All @@ -2229,8 +2210,7 @@ void QualAdjAligner::align_global_banded_multi(Alignment& alignment, vector<Alig
max_alt_alns,
band_padding,
permissive_banding,
true,
left_align_strand);
true);

band_graph.align(score_matrix, nt_table, gap_open, gap_extension);
}
Expand Down
Loading

1 comment on commit 10d0f96

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

14 tests passed, 0 tests failed and 0 tests skipped in 12710 seconds

Please sign in to comment.