kaori
A C++ library for barcode extraction and matching
Loading...
Searching...
No Matches
BarcodeSearch.hpp
Go to the documentation of this file.
1#ifndef KAORI_VARIABLE_LIBRARY_HPP
2#define KAORI_VARIABLE_LIBRARY_HPP
3
4#include "BarcodePool.hpp"
5#include "MismatchTrie.hpp"
6#include "utils.hpp"
7#include <unordered_map>
8#include <string>
9#include <vector>
10#include <array>
11
18namespace kaori {
19
23template<class Trie>
24void fill_library(
25 const std::vector<const char*>& options,
26 std::unordered_map<std::string, int>& exact,
27 Trie& trie,
28 bool reverse
29) {
30 size_t len = trie.get_length();
31
32 for (size_t i = 0; i < options.size(); ++i) {
33 auto ptr = options[i];
34
35 std::string current;
36 if (!reverse) {
37 current = std::string(ptr, ptr + len);
38 } else {
39 current.reserve(len);
40 for (size_t j = 0; j < len; ++j) {
41 current += complement_base<true, true>(ptr[len - j - 1]);
42 }
43 }
44
45 // Note that this must be called, even if the sequence is duplicated;
46 // otherwise the trie's internal counter will not be properly incremented.
47 auto status = trie.add(current.c_str());
48
49 if (!status.has_ambiguous) {
50 if (!status.is_duplicate || status.duplicate_replaced) {
51 exact[current] = i;
52 } else if (status.duplicate_cleared) {
53 exact[current] = -1;
54 }
55 }
56 }
57
58 trie.optimize();
59 return;
60}
61
62template<class Methods, class Cache, class Trie, class Result, class Mismatch>
63void matcher_in_the_rye(const std::string& x, const Cache& cache, const Trie& trie, Result& res, const Mismatch& mismatches, const Mismatch& max_mismatches) {
64 // Seeing if it's any of the caches; otherwise searching the trie.
65 auto cit = cache.find(x);
66 if (cit == cache.end()) {
67 auto lit = res.cache.find(x);
68 if (lit != res.cache.end()) {
69 Methods::update(res, lit->second, mismatches);
70
71 } else {
72 auto missed = trie.search(x.c_str(), mismatches);
73
74 // The trie search breaks early when it hits the mismatch cap,
75 // but the cap might be different across calls. If we break
76 // early and report a miss, the miss will be cached and
77 // returned in cases where there is a higher cap (and thus
78 // might actually be a hit). As such, we should only store a
79 // miss in the cache when the requested number of mismatches is
80 // equal to the maximum value specified in the constructor.
81 if (Methods::index(missed) >= 0 || mismatches == max_mismatches) {
82 res.cache[x] = missed;
83 }
84
85 // No need to pass the requested number of mismatches,
86 // as we explicitly searched for that in the trie.
87 Methods::update(res, missed);
88 }
89 } else {
90 Methods::update(res, cit->second, mismatches);
91 }
92 return;
93}
106public:
110 struct Options {
115
119 bool reverse = false;
120
124 DuplicateAction duplicates = DuplicateAction::ERROR;
125 };
126
127public:
133
138 SimpleBarcodeSearch(const BarcodePool& barcode_pool, const Options& options) :
139 trie(barcode_pool.length, options.duplicates),
140 max_mm(options.max_mismatches)
141 {
142 fill_library(barcode_pool.pool, exact, trie, options.reverse);
143 return;
144 }
145
146public:
153 struct State {
158 int index = 0;
159
164 int mismatches = 0;
165
169 std::unordered_map<std::string, std::pair<int, int> > cache;
173 };
174
181 return State();
182 }
183
192 void reduce(State& state) {
193 cache.merge(state.cache);
194 state.cache.clear();
195 }
196
197private:
198 struct Methods {
199 static int index(const std::pair<int, int>& val) {
200 return val.first;
201 }
202
203 static void update(State& state, const std::pair<int, int>& val) {
204 state.index = val.first;
205 state.mismatches = val.second;
206 return;
207 }
208
209 static void update(State& state, const std::pair<int, int>& val, int mismatches) {
210 state.index = (val.second > mismatches ? -1 : val.first);
211 state.mismatches = val.second;
212 return;
213 }
214 };
215
216public:
226 void search(const std::string& search_seq, State& state) const {
227 search(search_seq, state, max_mm);
228 return;
229 }
230
243 void search(const std::string& search_seq, State& state, int allowed_mismatches) const {
244 auto it = exact.find(search_seq);
245 if (it != exact.end()) {
246 state.index = it->second;
247 state.mismatches = 0;
248 } else {
249 matcher_in_the_rye<Methods>(search_seq, cache, trie, state, allowed_mismatches, max_mm);
250 }
251 }
252
253private:
254 std::unordered_map<std::string, int> exact;
255 AnyMismatches trie;
256 std::unordered_map<std::string, std::pair<int, int> > cache;
257 int max_mm;
258};
259
263// Using some template recursion instead of a fixed-length loop.
264// Maybe it compiles down to the same thing.
265template<size_t total, size_t position>
266struct HasMore {
267 static bool check(const std::array<int, total>& left, const std::array<int, total>& right) {
268 return (HasMore<total, position + 1>::check(left, right) || left[position] > right[position]);
269 }
270};
271
272template<size_t total>
273struct HasMore<total, total> {
274 static bool check(const std::array<int, total>&, const std::array<int, total>&) { return false; }
275};
289template<size_t num_segments>
291public:
295 struct Options {
300 Options(int max_mismatch_per_segment = 0) {
301 max_mismatches.fill(max_mismatch_per_segment);
302 }
303
309 std::array<int, num_segments> max_mismatches;
310
315 bool reverse = false;
316
320 DuplicateAction duplicates = DuplicateAction::ERROR;
321 };
322
323public:
328
336 const BarcodePool& barcode_pool,
337 std::array<int, num_segments> segments,
338 const Options& options
339 ) :
340 trie(
341 (!options.reverse ?
342 segments :
343 [&]{
344 auto copy = segments;
345 std::reverse(copy.begin(), copy.end());
346 return copy;
347 }()
348 ),
349 options.duplicates
350 ),
351 max_mm(
352 (!options.reverse ?
353 options.max_mismatches :
354 [&]{
355 auto copy = options.max_mismatches;
356 std::reverse(copy.begin(), copy.end());
357 return copy;
358 }()
359 )
360 )
361 {
362 if (barcode_pool.length != trie.get_length()) {
363 throw std::runtime_error("variable sequences should have the same length as the sum of segment lengths");
364 }
365 fill_library(barcode_pool.pool, exact, trie, options.reverse);
366 return;
367 }
368
369public:
376 struct State {
381 int index = 0;
382
387 int mismatches = 0;
388
393 std::array<int, num_segments> per_segment;
394
398 State() : per_segment() {}
399
400 std::unordered_map<std::string, typename SegmentedMismatches<num_segments>::Result> cache;
404 };
405
412 return State();
413 }
414
423 void reduce(State& state) {
424 cache.merge(state.cache);
425 state.cache.clear();
426 }
427
428private:
429 typedef typename SegmentedMismatches<num_segments>::Result SegmentedResult;
430
431 struct Methods {
432 static int index(const SegmentedResult& val) {
433 return val.index;
434 }
435
436 static void update(State& state, const SegmentedResult& val) {
437 state.index = val.index;
438 state.mismatches = val.total;
439 state.per_segment = val.per_segment;
440 return;
441 }
442
443 static void update(State& state, const SegmentedResult& val, const std::array<int, num_segments>& mismatches) {
444 state.index = (HasMore<num_segments, 0>::check(val.per_segment, mismatches) ? -1 : val.index);
445 state.mismatches = val.total;
446 state.per_segment = val.per_segment;
447 return;
448 }
449 };
450
451public:
461 void search(const std::string& search_seq, State& state) const {
462 search(search_seq, state, max_mm);
463 return;
464 }
465
478 void search(const std::string& search_seq, State& state, std::array<int, num_segments> allowed_mismatches) const {
479 auto it = exact.find(search_seq);
480 if (it != exact.end()) {
481 state.index = it->second;
482 state.mismatches = 0;
483 std::fill_n(state.per_segment.begin(), num_segments, 0);
484 } else {
485 matcher_in_the_rye<Methods>(search_seq, cache, trie, state, allowed_mismatches, max_mm);
486 }
487 }
488
489private:
490 std::unordered_map<std::string, int> exact;
492 std::unordered_map<std::string, SegmentedResult> cache;
493 std::array<int, num_segments> max_mm;
494};
495
496}
497
498#endif
Defines the BarcodePool class.
Defines the MismatchTrie class and its subclasses.
Search for barcodes with mismatches anywhere.
Definition MismatchTrie.hpp:419
Search for known barcode sequences with segmented mismatches.
Definition BarcodeSearch.hpp:290
void reduce(State &state)
Definition BarcodeSearch.hpp:423
SegmentedBarcodeSearch()
Definition BarcodeSearch.hpp:327
void search(const std::string &search_seq, State &state) const
Definition BarcodeSearch.hpp:461
State initialize() const
Definition BarcodeSearch.hpp:411
SegmentedBarcodeSearch(const BarcodePool &barcode_pool, std::array< int, num_segments > segments, const Options &options)
Definition BarcodeSearch.hpp:335
void search(const std::string &search_seq, State &state, std::array< int, num_segments > allowed_mismatches) const
Definition BarcodeSearch.hpp:478
Search for barcodes with segmented mismatches.
Definition MismatchTrie.hpp:514
Search for known barcode sequences.
Definition BarcodeSearch.hpp:105
SimpleBarcodeSearch(const BarcodePool &barcode_pool, const Options &options)
Definition BarcodeSearch.hpp:138
void search(const std::string &search_seq, State &state, int allowed_mismatches) const
Definition BarcodeSearch.hpp:243
void search(const std::string &search_seq, State &state) const
Definition BarcodeSearch.hpp:226
SimpleBarcodeSearch()
Definition BarcodeSearch.hpp:132
State initialize() const
Definition BarcodeSearch.hpp:180
void reduce(State &state)
Definition BarcodeSearch.hpp:192
Namespace for the kaori barcode-matching library.
Definition BarcodePool.hpp:13
Pool of barcode sequences for a variable region.
Definition BarcodePool.hpp:21
std::vector< const char * > pool
Definition BarcodePool.hpp:54
Optional parameters for a SegmentedBarcodeSearch.
Definition BarcodeSearch.hpp:295
std::array< int, num_segments > max_mismatches
Definition BarcodeSearch.hpp:309
Options(int max_mismatch_per_segment=0)
Definition BarcodeSearch.hpp:300
bool reverse
Definition BarcodeSearch.hpp:315
DuplicateAction duplicates
Definition BarcodeSearch.hpp:320
State of the search.
Definition BarcodeSearch.hpp:376
std::array< int, num_segments > per_segment
Definition BarcodeSearch.hpp:393
int index
Definition BarcodeSearch.hpp:381
int mismatches
Definition BarcodeSearch.hpp:387
Result of the segmented search.
Definition MismatchTrie.hpp:540
int index
Definition MismatchTrie.hpp:553
Optional parameters for SimpleBarcodeSearch.
Definition BarcodeSearch.hpp:110
bool reverse
Definition BarcodeSearch.hpp:119
int max_mismatches
Definition BarcodeSearch.hpp:114
DuplicateAction duplicates
Definition BarcodeSearch.hpp:124
State of the search.
Definition BarcodeSearch.hpp:153
int index
Definition BarcodeSearch.hpp:158
int mismatches
Definition BarcodeSearch.hpp:164