kaori
A C++ library for barcode extraction and matching
Loading...
Searching...
No Matches
ScanTemplate.hpp
Go to the documentation of this file.
1#ifndef KAORI_SCAN_TEMPLATE_HPP
2#define KAORI_SCAN_TEMPLATE_HPP
3
4#include <bitset>
5#include <vector>
6#include <stdexcept>
7#include <string>
8
9#include "utils.hpp"
10
17namespace kaori {
18
36template<size_t max_size>
38private:
39 static constexpr size_t N = max_size * 4;
40
41public:
47
57 length(template_length), forward(search_forward(strand)), reverse(search_reverse(strand))
58 {
59 if (length > max_size) {
60 throw std::runtime_error("maximum template size should be " + std::to_string(max_size) + " bp");
61 }
62
63 if (forward) {
64 for (size_t i = 0; i < length; ++i) {
65 char b = template_seq[i];
66 if (b != '-') {
67 add_base_to_hash(forward_ref, b);
68 add_mask_to_hash(forward_mask);
69 forward_mask_ambiguous.set(i);
70 } else {
71 shift_hash(forward_ref);
72 shift_hash(forward_mask);
73 add_variable_base(forward_variables, i);
74 }
75 }
76 } else {
77 // Forward variable regions are always defined.
78 for (size_t i = 0; i < length; ++i) {
79 char b = template_seq[i];
80 if (b == '-') {
81 add_variable_base(forward_variables, i);
82 }
83 }
84 }
85
86 if (reverse) {
87 for (size_t i = 0; i < length; ++i) {
88 char b = template_seq[length - i - 1];
89 if (b != '-') {
90 add_base_to_hash(reverse_ref, complement_base(b));
91 add_mask_to_hash(reverse_mask);
92 reverse_mask_ambiguous.set(i);
93 } else {
94 shift_hash(reverse_ref);
95 shift_hash(reverse_mask);
96 add_variable_base(reverse_variables, i);
97 }
98 }
99 }
100 }
101
102public:
106 struct State {
111 size_t position = static_cast<size_t>(-1); // overflow should be sane.
112
118
124
129 bool finished = false;
130
134 std::bitset<N> state;
135 const char * seq;
136 size_t len;
137
138 std::bitset<N/4> ambiguous; // we only need a yes/no for the ambiguous state, so we can use a smaller bitset.
139 size_t last_ambiguous; // contains the position of the most recent ambiguous base; should only be read if any_ambiguous = true.
140 bool any_ambiguous = false; // indicates whether ambiguous.count() > 0.
144 };
145
156 State initialize(const char* read_seq, size_t read_length) const {
157 State out;
158 out.seq = read_seq;
159 out.len = read_length;
160
161 if (length <= read_length) {
162 for (size_t i = 0; i < length - 1; ++i) {
163 char base = read_seq[i];
164
165 if (is_standard_base(base)) {
166 add_base_to_hash(out.state, base);
167
168 if (out.any_ambiguous) {
169 out.ambiguous <<= 1;
170 }
171 } else {
172 add_other_to_hash(out.state);
173
174 if (out.any_ambiguous) {
175 out.ambiguous <<= 1;
176 } else {
177 out.any_ambiguous = true;
178 }
179 out.ambiguous.set(0);
180 out.last_ambiguous = i;
181 }
182 }
183 } else {
184 out.finished = true;
185 }
186
187 return out;
188 }
189
198 void next(State& state) const {
199 size_t right = state.position + length;
200 char base = state.seq[right];
201
202 if (is_standard_base(base)) {
203 add_base_to_hash(state.state, base); // no need to trim off the end, the mask will handle that.
204 if (state.any_ambiguous) {
205 state.ambiguous <<= 1;
206
207 // If the last ambiguous position is equal to 'position', the
208 // ensuing increment to the latter will shift it out of the
209 // hash... at which point, we've got no ambiguity left.
210 if (state.last_ambiguous == state.position) {
211 state.any_ambiguous = false;
212 }
213 }
214
215 } else {
217
218 if (state.any_ambiguous) {
219 state.ambiguous <<= 1;
220 } else {
221 state.any_ambiguous = true;
222 }
223 state.ambiguous.set(0);
224 state.last_ambiguous = right;
225 }
226
227 ++state.position;
228 full_match(state);
229 if (right + 1 == state.len) {
230 state.finished = true;
231 }
232
233 return;
234 }
235
236private:
237 std::bitset<N> forward_ref, forward_mask;
238 std::bitset<N> reverse_ref, reverse_mask;
239 size_t length;
240 int mismatches;
241 bool forward, reverse;
242
243 std::bitset<N/4> forward_mask_ambiguous; // we only need a yes/no for whether a position is an ambiguous base, so we can use a smaller bitset.
244 std::bitset<N/4> reverse_mask_ambiguous;
245
246 static void add_mask_to_hash(std::bitset<N>& current) {
248 current.set(0);
249 current.set(1);
250 current.set(2);
251 current.set(3);
252 return;
253 }
254
255 static int strand_match(const State& match, const std::bitset<N>& ref, const std::bitset<N>& mask, const std::bitset<N/4>& mask_ambiguous) {
256 // pop count here is equal to the number of non-ambiguous mismatches *
257 // 2 + number of ambiguous mismatches * 3. This is because
258 // non-ambiguous bases are encoded by 1 set bit per 4 bases (so 2 are
259 // left after a XOR'd mismatch), while ambiguous mismatches are encoded
260 // by all set bits per 4 bases (which means that 3 are left after XOR).
261 size_t pcount = ((match.state & mask) ^ ref).count();
262
263 if (match.any_ambiguous) {
264 size_t acount = (match.ambiguous & mask_ambiguous).count();
265 return (pcount - acount) / 2; // i.e., acount + (pcount - acount * 3) / 2;
266 } else {
267 return pcount / 2;
268 }
269 }
270
271 void full_match(State& match) const {
272 if (forward) {
273 match.forward_mismatches = strand_match(match, forward_ref, forward_mask, forward_mask_ambiguous);
274 }
275 if (reverse) {
276 match.reverse_mismatches = strand_match(match, reverse_ref, reverse_mask, reverse_mask_ambiguous);
277 }
278 }
279
280private:
281 std::vector<std::pair<int, int> > forward_variables, reverse_variables;
282
283public:
294 template<bool reverse = false>
295 const std::vector<std::pair<int, int> >& variable_regions() const {
296 if constexpr(reverse) {
297 return reverse_variables;
298 } else {
299 return forward_variables;
300 }
301 }
302
303private:
304 static void add_variable_base(std::vector<std::pair<int, int> >& variables, int i) {
305 if (!variables.empty()) {
306 auto& last = variables.back().second;
307 if (last == i) {
308 ++last;
309 return;
310 }
311 }
312 variables.emplace_back(i, i + 1);
313 return;
314 }
315};
316
317}
318
319#endif
Scan a read sequence for the template sequence.
Definition ScanTemplate.hpp:37
ScanTemplate()
Definition ScanTemplate.hpp:46
void next(State &state) const
Definition ScanTemplate.hpp:198
const std::vector< std::pair< int, int > > & variable_regions() const
Definition ScanTemplate.hpp:295
ScanTemplate(const char *template_seq, size_t template_length, SearchStrand strand)
Definition ScanTemplate.hpp:56
State initialize(const char *read_seq, size_t read_length) const
Definition ScanTemplate.hpp:156
Namespace for the kaori barcode-matching library.
Definition BarcodePool.hpp:13
Details on the current match to the read sequence.
Definition ScanTemplate.hpp:106
size_t position
Definition ScanTemplate.hpp:111
int reverse_mismatches
Definition ScanTemplate.hpp:123
int forward_mismatches
Definition ScanTemplate.hpp:117
bool finished
Definition ScanTemplate.hpp:129