kaori
A C++ library for barcode extraction and matching
Loading...
Searching...
No Matches
BarcodeSearch.hpp
Go to the documentation of this file.
1#ifndef KAORI_VARIABLE_LIBRARY_HPP
2#define KAORI_VARIABLE_LIBRARY_HPP
3
4#include "BarcodePool.hpp"
5#include "MismatchTrie.hpp"
6#include "utils.hpp"
7
8#include <cstddef>
9#include <unordered_map>
10#include <string>
11#include <vector>
12#include <array>
13
20namespace kaori {
21
25template<typename Trie_>
26inline void fill_library(const std::vector<const char*>& options, std::unordered_map<std::string, BarcodeIndex>& exact, Trie_& trie, bool reverse) {
27 std::size_t len = trie.length();
28 auto nopt = options.size();
29
30 for (decltype(nopt) i = 0; i < nopt; ++i) {
31 auto ptr = options[i];
32
33 std::string current;
34 if (!reverse) {
35 current = std::string(ptr, ptr + len);
36 } else {
37 current.reserve(len);
38 for (size_t j = 0; j < len; ++j) {
39 current += complement_base<true, true>(ptr[len - j - 1]);
40 }
41 }
42
43 // Note that this must be called, even if the sequence is duplicated;
44 // otherwise the trie's internal counter will not be properly incremented.
45 auto status = trie.add(current.c_str());
46
47 if (!status.has_ambiguous) {
48 if (!status.is_duplicate || status.duplicate_replaced) {
49 exact[current] = i;
50 } else if (status.duplicate_cleared) {
51 exact[current] = STATUS_UNMATCHED;
52 }
53 }
54 }
55
56 trie.optimize();
57 return;
58}
71public:
75 struct Options {
80
84 bool reverse = false;
85
89 DuplicateAction duplicates = DuplicateAction::ERROR;
90 };
91
92public:
98
103 SimpleBarcodeSearch(const BarcodePool& barcode_pool, const Options& options) :
104 my_trie(barcode_pool.length(), options.duplicates),
105 my_max_mm(options.max_mismatches)
106 {
107 fill_library(barcode_pool.pool(), my_exact, my_trie, options.reverse);
108 }
109
110private:
111 AnyMismatches my_trie;
112 int my_max_mm;
113 std::unordered_map<std::string, BarcodeIndex> my_exact;
114
115 struct CacheEntry {
116 CacheEntry() = default;
117 CacheEntry(BarcodeIndex index, int mismatches) : index(index), mismatches(mismatches) {}
118 BarcodeIndex index;
119 int mismatches;
120 };
121 std::unordered_map<std::string, CacheEntry> my_cache;
122
123public:
130 struct State {
137
143 int mismatches = 0;
144
148 std::unordered_map<std::string, CacheEntry> cache;
152 };
153
160 return State();
161 }
162
170 void reduce(State& state) {
171 my_cache.merge(state.cache);
172 state.cache.clear();
173 }
174
175public:
185 void search(const std::string& search_seq, State& state) const {
186 search(search_seq, state, my_max_mm);
187 return;
188 }
189
202 void search(const std::string& search_seq, State& state, int allowed_mismatches) const {
203 auto it = my_exact.find(search_seq);
204 if (it != my_exact.end()) {
205 state.index = it->second;
206 state.mismatches = 0;
207 return;
208 }
209
210 auto set_from_cache = [&](const CacheEntry& cached) -> void {
211 if (cached.mismatches > allowed_mismatches) {
212 // technically cached.mismatches is only a lower bound if index == UNMATCHED,
213 // but if it's already UNMATCHED, then the result will be UNMATCHED either way.
214 state.index = STATUS_UNMATCHED;
215 } else {
216 state.index = cached.index;
217 }
218 state.mismatches = cached.mismatches;
219 };
220
221 auto cIt = my_cache.find(search_seq);
222 if (cIt != my_cache.end()) {
223 set_from_cache(cIt->second);
224 return;
225 }
226
227 auto lIt = state.cache.find(search_seq);
228 if (lIt != state.cache.end()) {
229 set_from_cache(lIt->second);
230 return;
231 }
232
233 auto missed = my_trie.search(search_seq.c_str(), allowed_mismatches);
234 if (is_barcode_index_ok(missed.index)) {
235 // No need to check against allowed_mismatches, as we explicitly searched for that in the trie.
236 state.index = missed.index;
237 state.mismatches = missed.mismatches;
238 state.cache[search_seq] = CacheEntry(missed.index, missed.mismatches);
239 return;
240 }
241
242 // The trie search breaks early when it hits allowed_mismatches, but
243 // allowed_mismatches might be different across calls to search(). If
244 // we break early and report a miss, the miss will be cached and
245 // returned in cases where there is a higher cap (and thus might
246 // actually be a hit). As such, we should only store a miss in the
247 // cache when the requested number of mismatches is equal to the
248 // maximum number of mismatches that was specified in the constructor.
249 //
250 // Of course, if the search failed because of ambiguity, then it would
251 // have failed even if we were searching with maximum mismatches;
252 // so we happily cache that.
253 if (allowed_mismatches == my_max_mm || missed.index == STATUS_AMBIGUOUS) {
254 state.cache[search_seq] = CacheEntry(missed.index, missed.mismatches);
255 }
256
257 state.index = missed.index;
258 state.mismatches = missed.mismatches;
259 }
260};
261
271template<int num_segments_>
273public:
277 struct Options {
282 Options(int max_mismatch_per_segment = 0) {
283 max_mismatches.fill(max_mismatch_per_segment);
284 }
285
291 std::array<int, num_segments_> max_mismatches;
292
298 bool reverse = false;
299
303 DuplicateAction duplicates = DuplicateAction::ERROR;
304 };
305
306public:
312
320 const BarcodePool& barcode_pool,
321 std::array<SeqLength, num_segments_> segments,
322 const Options& options
323 ) :
324 my_trie(
325 [&]{
326 if (options.reverse) {
327 std::reverse(segments.begin(), segments.end());
328 }
329 return segments;
330 }(),
331 options.duplicates
332 ),
333 my_max_mm(
334 [&]{
335 auto copy = options.max_mismatches;
336 if (options.reverse) {
337 std::reverse(copy.begin(), copy.end());
338 }
339 return copy;
340 }()
341 )
342 {
343 if (barcode_pool.length() != my_trie.length()) {
344 throw std::runtime_error("variable sequences should have the same length as the sum of segment lengths");
345 }
346 fill_library(barcode_pool.pool(), my_exact, my_trie, options.reverse);
347 }
348
349private:
350 SegmentedMismatches<num_segments_> my_trie;
351 std::array<int, num_segments_> my_max_mm;
352 std::unordered_map<std::string, BarcodeIndex> my_exact;
353
354 struct CacheEntry {
355 CacheEntry() = default;
356 CacheEntry(BarcodeIndex index, int mismatches, std::array<int, num_segments_> per_segment) :
357 index(index), mismatches(mismatches), per_segment(per_segment) {}
358 BarcodeIndex index;
359 int mismatches;
360 std::array<int, num_segments_> per_segment;
361 };
362 std::unordered_map<std::string, CacheEntry> my_cache;
363
364public:
371 struct State {
378
384 int mismatches = 0;
385
391 std::array<int, num_segments_> per_segment;
392
396 State() : per_segment() {}
397
398 std::unordered_map<std::string, CacheEntry> cache;
402 };
403
410 return State();
411 }
412
420 void reduce(State& state) {
421 my_cache.merge(state.cache);
422 state.cache.clear();
423 }
424
425public:
435 void search(const std::string& search_seq, State& state) const {
436 search(search_seq, state, my_max_mm);
437 return;
438 }
439
452 void search(const std::string& search_seq, State& state, std::array<int, num_segments_> allowed_mismatches) const {
453 auto it = my_exact.find(search_seq);
454 if (it != my_exact.end()) {
455 state.index = it->second;
456 state.mismatches = 0;
457 std::fill_n(state.per_segment.begin(), num_segments_, 0);
458 return;
459 }
460
461 auto set_from_cache = [&](const CacheEntry& cached) -> void {
462 state.mismatches = cached.mismatches;
463 state.per_segment = cached.per_segment;
464 for (int s = 0; s < num_segments_; ++s) {
465 if (cached.per_segment[s] > allowed_mismatches[s]) {
466 // technically cached.mismatches is only a lower bound if index == UNMATCHED,
467 // but if it's already UNMATCHED, then the result will be UNMATCHED either way.
468 state.index = STATUS_UNMATCHED;
469 return;
470 }
471 }
472 state.index = cached.index;
473 };
474
475 auto cIt = my_cache.find(search_seq);
476 if (cIt != my_cache.end()) {
477 set_from_cache(cIt->second);
478 return;
479 }
480
481 auto lIt = state.cache.find(search_seq);
482 if (lIt != state.cache.end()) {
483 set_from_cache(lIt->second);
484 return;
485 }
486
487 auto missed = my_trie.search(search_seq.c_str(), allowed_mismatches);
488 if (is_barcode_index_ok(missed.index)) {
489 // No need to check against allowed_mismatches, as we explicitly searched for that in the trie.
490 state.index = missed.index;
491 state.mismatches = missed.mismatches;
492 state.per_segment = missed.per_segment;
493 state.cache[search_seq] = CacheEntry(missed.index, missed.mismatches, missed.per_segment);
494 return;
495 }
496
497 // The trie search breaks early when it hits allowed_mismatches, but
498 // the allowed_mismatches might be different across calls to search().
499 // If we break early and report a miss, the miss will be cached and
500 // returned in cases where there is a higher cap (and thus might
501 // actually be a hit). As such, we should only store a miss in the
502 // cache when the requested number of mismatches is equal to the
503 // maximum number of mismatches that was specified in the constructor.
504 //
505 // Of course, if the search failed because of ambiguity, then it would
506 // have failed even if we were searching with maximum mismatches;
507 // so we happily cache that.
508 if (allowed_mismatches == my_max_mm || missed.index == STATUS_AMBIGUOUS) {
509 state.cache[search_seq] = CacheEntry(missed.index, missed.mismatches, missed.per_segment);
510 }
511
512 state.index = missed.index;
513 state.mismatches = missed.mismatches;
514 state.per_segment = missed.per_segment;
515 }
516};
517
518}
519
520#endif
Defines the BarcodePool class.
Defines trie-based classes for mismatch-tolerant sequence matching.
Search for barcodes with mismatches anywhere.
Definition MismatchTrie.hpp:383
Result search(const char *search_seq, int max_mismatches) const
Definition MismatchTrie.hpp:470
Pool of barcode sequences.
Definition BarcodePool.hpp:24
const std::vector< const char * > & pool() const
Definition BarcodePool.hpp:63
Search against known barcode sequences with segmented mismatches.
Definition BarcodeSearch.hpp:272
State initialize() const
Definition BarcodeSearch.hpp:409
SegmentedBarcodeSearch(const BarcodePool &barcode_pool, std::array< SeqLength, num_segments_ > segments, const Options &options)
Definition BarcodeSearch.hpp:319
void reduce(State &state)
Definition BarcodeSearch.hpp:420
void search(const std::string &search_seq, State &state) const
Definition BarcodeSearch.hpp:435
void search(const std::string &search_seq, State &state, std::array< int, num_segments_ > allowed_mismatches) const
Definition BarcodeSearch.hpp:452
Search against known barcodes.
Definition BarcodeSearch.hpp:70
SimpleBarcodeSearch(const BarcodePool &barcode_pool, const Options &options)
Definition BarcodeSearch.hpp:103
void search(const std::string &search_seq, State &state, int allowed_mismatches) const
Definition BarcodeSearch.hpp:202
void search(const std::string &search_seq, State &state) const
Definition BarcodeSearch.hpp:185
State initialize() const
Definition BarcodeSearch.hpp:159
void reduce(State &state)
Definition BarcodeSearch.hpp:170
Namespace for the kaori barcode-matching library.
Definition BarcodePool.hpp:16
constexpr BarcodeIndex STATUS_AMBIGUOUS
Definition utils.hpp:53
constexpr BarcodeIndex STATUS_UNMATCHED
Definition utils.hpp:48
DuplicateAction
Definition utils.hpp:26
bool is_barcode_index_ok(BarcodeIndex index)
Definition utils.hpp:60
std::vector< constchar * >::size_type BarcodeIndex
Definition utils.hpp:43
Optional parameters for a SegmentedBarcodeSearch.
Definition BarcodeSearch.hpp:277
std::array< int, num_segments_ > max_mismatches
Definition BarcodeSearch.hpp:291
bool reverse
Definition BarcodeSearch.hpp:298
DuplicateAction duplicates
Definition BarcodeSearch.hpp:303
Options(int max_mismatch_per_segment=0)
Definition BarcodeSearch.hpp:282
State of the search.
Definition BarcodeSearch.hpp:371
std::array< int, num_segments_ > per_segment
Definition BarcodeSearch.hpp:391
int mismatches
Definition BarcodeSearch.hpp:384
BarcodeIndex index
Definition BarcodeSearch.hpp:377
Optional parameters for SimpleBarcodeSearch.
Definition BarcodeSearch.hpp:75
bool reverse
Definition BarcodeSearch.hpp:84
int max_mismatches
Definition BarcodeSearch.hpp:79
DuplicateAction duplicates
Definition BarcodeSearch.hpp:89
State of the search.
Definition BarcodeSearch.hpp:130
BarcodeIndex index
Definition BarcodeSearch.hpp:136
int mismatches
Definition BarcodeSearch.hpp:143
Utilites for sequence matching.