kaori
A C++ library for barcode extraction and matching
Loading...
Searching...
No Matches
ScanTemplate.hpp
Go to the documentation of this file.
1#ifndef KAORI_SCAN_TEMPLATE_HPP
2#define KAORI_SCAN_TEMPLATE_HPP
3
4#include <bitset>
5#include <vector>
6#include <stdexcept>
7#include <string>
8
9#include "utils.hpp"
10
17namespace kaori {
18
37template<SeqLength max_size_>
39private:
40 static constexpr SeqLength N = max_size_ * 4;
41
42public:
47 ScanTemplate() = default;
48
57 ScanTemplate(const char* template_seq, SeqLength template_length, SearchStrand strand) :
58 my_length(template_length),
59 my_forward(search_forward(strand)),
60 my_reverse(search_reverse(strand))
61 {
62 if (my_length > max_size_) {
63 throw std::runtime_error("maximum template size should be " + std::to_string(max_size_) + " bp");
64 }
65
66 if (my_forward) {
67 for (SeqLength i = 0; i < my_length; ++i) {
68 char b = template_seq[i];
69 if (b != '-') {
70 add_base_to_hash(my_forward_ref, b);
71 add_mask_to_hash(my_forward_mask);
72 my_forward_mask_ambiguous.set(i);
73 } else {
74 shift_hash(my_forward_ref);
75 shift_hash(my_forward_mask);
76 add_variable_base(my_forward_variables, i);
77 }
78 }
79 } else {
80 // Forward variable regions are always defined.
81 for (SeqLength i = 0; i < my_length; ++i) {
82 char b = template_seq[i];
83 if (b == '-') {
84 add_variable_base(my_forward_variables, i);
85 }
86 }
87 }
88
89 if (my_reverse) {
90 for (SeqLength i = 0; i < my_length; ++i) {
91 char b = template_seq[my_length - i - 1];
92 if (b != '-') {
93 add_base_to_hash(my_reverse_ref, complement_base(b));
94 add_mask_to_hash(my_reverse_mask);
95 my_reverse_mask_ambiguous.set(i);
96 } else {
97 shift_hash(my_reverse_ref);
98 shift_hash(my_reverse_mask);
99 add_variable_base(my_reverse_variables, i);
100 }
101 }
102 }
103 }
104
105public:
109 struct State {
114 SeqLength position = static_cast<SeqLength>(-1); // set to -1 so that the first call to next() overflows to 0.
115
121
127
132 bool finished = false;
133
137 std::bitset<N> state;
138 const char * seq;
139 SeqLength len;
140
141 std::bitset<N/4> ambiguous; // we only need a yes/no for the ambiguous state, so we can use a smaller bitset.
142 SeqLength last_ambiguous; // contains the position of the most recent ambiguous base; should only be read if any_ambiguous = true.
143 bool any_ambiguous = false; // indicates whether ambiguous.count() > 0.
147 };
148
159 State initialize(const char* read_seq, SeqLength read_length) const {
160 State out;
161 out.seq = read_seq;
162 out.len = read_length;
163
164 if (my_length <= read_length) {
165 for (SeqLength i = 0; i < my_length - 1; ++i) {
166 char base = read_seq[i];
167
168 if (is_standard_base(base)) {
169 add_base_to_hash(out.state, base);
170
171 if (out.any_ambiguous) {
172 out.ambiguous <<= 1;
173 }
174 } else {
175 add_other_to_hash(out.state);
176
177 if (out.any_ambiguous) {
178 out.ambiguous <<= 1;
179 } else {
180 out.any_ambiguous = true;
181 }
182 out.ambiguous.set(0);
183 out.last_ambiguous = i;
184 }
185 }
186 } else {
187 out.finished = true;
188 }
189
190 return out;
191 }
192
201 void next(State& state) const {
202 SeqLength right = state.position + my_length;
203 char base = state.seq[right];
204
205 if (is_standard_base(base)) {
206 add_base_to_hash(state.state, base); // no need to trim off the end, the mask will handle that.
207 if (state.any_ambiguous) {
208 state.ambiguous <<= 1;
209
210 // If the last ambiguous position is equal to 'position', the
211 // ensuing increment to the latter will shift it out of the
212 // hash... at which point, we've got no ambiguity left.
213 if (state.last_ambiguous == state.position) {
214 state.any_ambiguous = false;
215 }
216 }
217
218 } else {
219 add_other_to_hash(state.state);
220
221 if (state.any_ambiguous) {
222 state.ambiguous <<= 1;
223 } else {
224 state.any_ambiguous = true;
225 }
226 state.ambiguous.set(0);
227 state.last_ambiguous = right;
228 }
229
230 ++state.position;
231 full_match(state);
232 if (right + 1 == state.len) {
233 state.finished = true;
234 }
235
236 return;
237 }
238
239private:
240 std::bitset<N> my_forward_ref, my_forward_mask;
241 std::bitset<N> my_reverse_ref, my_reverse_mask;
242 SeqLength my_length;
243 bool my_forward, my_reverse;
244
245 std::bitset<N/4> my_forward_mask_ambiguous; // we only need a yes/no for whether a position is an ambiguous base, so we can use a smaller bitset.
246 std::bitset<N/4> my_reverse_mask_ambiguous;
247
248 static void add_mask_to_hash(std::bitset<N>& current) {
249 shift_hash(current);
250 current.set(0);
251 current.set(1);
252 current.set(2);
253 current.set(3);
254 return;
255 }
256
257 static int strand_match(const State& match, const std::bitset<N>& ref, const std::bitset<N>& mask, const std::bitset<N/4>& mask_ambiguous) {
258 // pop count here is equal to the number of non-ambiguous mismatches *
259 // 2 + number of ambiguous mismatches * 3. This is because
260 // non-ambiguous bases are encoded by 1 set bit per 4 bases (so 2 are
261 // left after a XOR'd mismatch), while ambiguous mismatches are encoded
262 // by all set bits per 4 bases (which means that 3 are left after XOR).
263 int pcount = ((match.state & mask) ^ ref).count();
264
265 if (match.any_ambiguous) {
266 int acount = (match.ambiguous & mask_ambiguous).count();
267 return (pcount - acount) / 2; // i.e., acount + (pcount - acount * 3) / 2;
268 } else {
269 return pcount / 2;
270 }
271 }
272
273 void full_match(State& match) const {
274 if (my_forward) {
275 match.forward_mismatches = strand_match(match, my_forward_ref, my_forward_mask, my_forward_mask_ambiguous);
276 }
277 if (my_reverse) {
278 match.reverse_mismatches = strand_match(match, my_reverse_ref, my_reverse_mask, my_reverse_mask_ambiguous);
279 }
280 }
281
282private:
283 std::vector<std::pair<SeqLength, SeqLength> > my_forward_variables, my_reverse_variables;
284
285public:
291 const std::vector<std::pair<SeqLength, SeqLength> >& forward_variable_regions() const {
292 return my_forward_variables;
293 }
294
300 const std::vector<std::pair<SeqLength, SeqLength> >& reverse_variable_regions() const {
301 return my_reverse_variables;
302 }
303
308 const std::vector<std::pair<SeqLength, SeqLength> >& variable_regions(bool reverse) const {
309 if (reverse) {
311 } else {
313 }
314 }
315
316private:
317 static void add_variable_base(std::vector<std::pair<SeqLength, SeqLength> >& variables, SeqLength i) {
318 if (!variables.empty()) {
319 auto& last = variables.back().second;
320 if (last == i) {
321 ++last;
322 return;
323 }
324 }
325 variables.emplace_back(i, i + 1);
326 return;
327 }
328};
329
330}
331
332#endif
Scan a read sequence for the template sequence.
Definition ScanTemplate.hpp:38
ScanTemplate(const char *template_seq, SeqLength template_length, SearchStrand strand)
Definition ScanTemplate.hpp:57
State initialize(const char *read_seq, SeqLength read_length) const
Definition ScanTemplate.hpp:159
const std::vector< std::pair< SeqLength, SeqLength > > & forward_variable_regions() const
Definition ScanTemplate.hpp:291
void next(State &state) const
Definition ScanTemplate.hpp:201
const std::vector< std::pair< SeqLength, SeqLength > > & variable_regions(bool reverse) const
Definition ScanTemplate.hpp:308
ScanTemplate()=default
const std::vector< std::pair< SeqLength, SeqLength > > & reverse_variable_regions() const
Definition ScanTemplate.hpp:300
Namespace for the kaori barcode-matching library.
Definition BarcodePool.hpp:16
std::size_t SeqLength
Definition utils.hpp:37
SearchStrand
Definition utils.hpp:31
Details on the current match to the read sequence.
Definition ScanTemplate.hpp:109
int forward_mismatches
Definition ScanTemplate.hpp:120
int reverse_mismatches
Definition ScanTemplate.hpp:126
bool finished
Definition ScanTemplate.hpp:132
SeqLength position
Definition ScanTemplate.hpp:114
Utilites for sequence matching.