kaori
A C++ library for barcode extraction and matching
Loading...
Searching...
No Matches
DualBarcodesPairedEnd.hpp
Go to the documentation of this file.
1#ifndef KAORI_DUAL_BARCODES_HPP
2#define KAORI_DUAL_BARCODES_HPP
3
4#include "../ScanTemplate.hpp"
6#include "../utils.hpp"
7
14namespace kaori {
15
26template<SeqLength max_size_>
28public:
32 struct Options {
37 bool use_first = true;
38
43
48 SearchStrand strand1 = SearchStrand::FORWARD;
49
54
59 SearchStrand strand2 = SearchStrand::FORWARD;
60
64 DuplicateAction duplicates = DuplicateAction::ERROR;
65
71 bool random = false;
72 };
73
74public:
93 const char* template_seq1, SeqLength template_length1, const BarcodePool& barcode_pool1,
94 const char* template_seq2, SeqLength template_length2, const BarcodePool& barcode_pool2,
95 const Options& options
96 ) :
97 my_search_reverse1(search_reverse(options.strand1)),
98 my_search_reverse2(search_reverse(options.strand2)),
99 my_constant1(template_seq1, template_length1, options.strand1),
100 my_constant2(template_seq2, template_length2, options.strand2),
101 my_max_mm1(options.max_mismatches1),
102 my_max_mm2(options.max_mismatches2),
103 my_randomized(options.random),
104 my_use_first(options.use_first)
105 {
106 auto num_options = barcode_pool1.size();
107 if (num_options != barcode_pool2.size()) {
108 throw std::runtime_error("both barcode pools should be of the same length");
109 }
110 my_counts.resize(num_options);
111
112 SeqLength len1;
113 {
114 const auto& regions = my_constant1.forward_variable_regions();
115 if (regions.size() != 1) {
116 throw std::runtime_error("expected one variable region in the first constant template");
117 }
118 len1 = regions[0].second - regions[0].first;
119 if (len1 != barcode_pool1.length()) {
120 throw std::runtime_error("length of variable sequences (" + std::to_string(barcode_pool1.length()) +
121 ") should be the same as the variable region (" + std::to_string(len1) + ")");
122 }
123 }
124
125 SeqLength len2;
126 {
127 const auto& regions = my_constant2.forward_variable_regions();
128 if (regions.size() != 1) {
129 throw std::runtime_error("expected one variable region in the second constant template");
130 }
131 len2 = regions[0].second - regions[0].first;
132 if (len2 != barcode_pool2.length()) {
133 throw std::runtime_error("length of variable sequences (" + std::to_string(barcode_pool2.length()) +
134 ") should be the same as the variable region (" + std::to_string(len2) + ")");
135 }
136 }
137
138 // Constructing the combined strings.
139 std::vector<std::string> combined;
140 combined.reserve(num_options);
141
142 for (BarcodeIndex i = 0; i < num_options; ++i) {
143 std::string current;
144
145 auto ptr1 = barcode_pool1[i];
146 if (my_search_reverse1) {
147 for (SeqLength j = 0; j < len1; ++j) {
148 current += complement_base<true, true>(ptr1[len1 - j - 1]);
149 }
150 } else {
151 current.insert(current.end(), ptr1, ptr1 + len1);
152 }
153
154 auto ptr2 = barcode_pool2[i];
155 if (my_search_reverse2) {
156 for (SeqLength j = 0; j < len2; ++j) {
157 current += complement_base<true, true>(ptr2[len2 - j - 1]);
158 }
159 } else {
160 current.insert(current.end(), ptr2, ptr2 + len2);
161 }
162
163 combined.push_back(std::move(current));
164 }
165
166 // Constructing the combined varlib.
167 BarcodePool combined_set(combined);
168 my_varlib = SegmentedBarcodeSearch<2>(
169 combined_set,
170 std::array<SeqLength, 2>{ len1, len2 },
171 [&]{
172 typename SegmentedBarcodeSearch<2>::Options bopt;
173 bopt.max_mismatches = { my_max_mm1, my_max_mm2 };
174 bopt.duplicates = options.duplicates;
175 bopt.reverse = false; // we already handle strandedness when creating 'combined_set'.
176 return bopt;
177 }()
178 );
179 }
180
181private:
182 bool my_search_reverse1, my_search_reverse2;
183
184 ScanTemplate<max_size_> my_constant1, my_constant2;
185 SegmentedBarcodeSearch<2> my_varlib;
186 int my_max_mm1, my_max_mm2;
187
188 bool my_randomized;
189 bool my_use_first = true;
190
191 std::vector<Count> my_counts;
192 Count my_total = 0;
193
194public:
198 struct State {
199 State() = default;
200 State(typename std::vector<Count>::size_type n) : counts(n) {}
201
202 std::vector<Count> counts;
203 Count total = 0;
204
205 std::pair<std::string, int> first_match;
206 std::vector<std::pair<std::string, int> > second_matches;
207 std::string combined;
208
209 // Default constructors should be called in this case, so it should be fine.
210 typename SegmentedBarcodeSearch<2>::State details;
211 };
212
213 State initialize() const {
214 return State(my_counts.size());
215 }
216
217 void reduce(State& s) {
218 my_varlib.reduce(s.details);
219 auto csize = my_counts.size();
220 for (decltype(csize) i = 0; i < csize; ++i) {
221 my_counts[i] += s.counts[i];
222 }
223 my_total += s.total;
224 }
225
226 constexpr static bool use_names = false;
231private:
232 static void fill_store(std::pair<std::string, int>& first_match, const char* start, const char* end, int mm) {
233 first_match.first.clear();
234 first_match.first.insert(first_match.first.end(), start, end);
235 first_match.second = mm;
236 return;
237 }
238
239 static void fill_store(std::vector<std::pair<std::string, int> >& second_matches, const char* start, const char* end, int mm) {
240 second_matches.emplace_back(std::string(start, end), mm);
241 return;
242 }
243
244 template<class Store>
245 static bool inner_process(
246 bool reverse,
247 const ScanTemplate<max_size_>& constant,
248 int max_mm,
249 const char* against,
250 typename ScanTemplate<max_size_>::State& deets,
251 Store& store)
252 {
253 while (!deets.finished) {
254 constant.next(deets);
255 if (reverse) {
256 if (deets.reverse_mismatches <= max_mm) {
257 const auto& reg = constant.reverse_variable_regions()[0];
258 auto start = against + deets.position;
259 fill_store(store, start + reg.first, start + reg.second, deets.reverse_mismatches);
260 return true;
261 }
262 } else {
263 if (deets.forward_mismatches <= max_mm) {
264 const auto& reg = constant.forward_variable_regions()[0];
265 auto start = against + deets.position;
266 fill_store(store, start + reg.first, start + reg.second, deets.forward_mismatches);
267 return true;
268 }
269 }
270 }
271 return false;
272 }
273
274 bool process_first(State& state, const std::pair<const char*, const char*>& against1, const std::pair<const char*, const char*>& against2) const {
275 auto deets1 = my_constant1.initialize(against1.first, against1.second - against1.first);
276 auto deets2 = my_constant2.initialize(against2.first, against2.second - against2.first);
277
278 state.second_matches.clear();
279 typedef decltype(state.second_matches.size()) Size;
280
281 auto checker = [&](Size idx2) -> bool {
282 const auto& current2 = state.second_matches[idx2];
283 state.combined = state.first_match.first;
284 state.combined += current2.first; // on a separate line to avoid creating a std::string intermediate.
285 my_varlib.search(state.combined, state.details, std::array<int, 2>{ my_max_mm1 - state.first_match.second, my_max_mm2 - current2.second });
286
287 if (is_barcode_index_ok(state.details.index)) {
288 ++state.counts[state.details.index];
289 return true;
290 } else {
291 return false;
292 }
293 };
294
295 // Looping over all hits of the second for each hit of the first read.
296 // This is done in a slightly convoluted way; we only search for
297 // all hits of the second read _after_ we find the first hit of the
298 // first read, so as to avoid a wasted search on the second read
299 // if we never found a hit on the first read.
300 while (inner_process(my_search_reverse1, my_constant1, my_max_mm1, against1.first, deets1, state.first_match)) {
301 if (!deets2.finished) {
302 // Alright, populating the second match buffer. We also
303 // return immediately if any of them form a valid
304 // combination with the first hit of the first read.
305 while (inner_process(my_search_reverse2, my_constant2, my_max_mm2, against2.first, deets2, state.second_matches)) {
306 if (checker(state.second_matches.size() - 1)) {
307 return true;
308 }
309 }
310 if (state.second_matches.empty()) {
311 break;
312 }
313 } else {
314 // And then this part does all the pairwise comparisons with
315 // every hit in the first read.
316 for (Size i = 0, end = state.second_matches.size(); i < end; ++i) {
317 if (checker(i)) {
318 return true;
319 }
320 }
321 }
322 }
323
324 return false;
325 }
326
327 std::pair<BarcodeIndex, int> process_best(State& state, const std::pair<const char*, const char*>& against1, const std::pair<const char*, const char*>& against2) const {
328 auto deets1 = my_constant1.initialize(against1.first, against1.second - against1.first);
329 auto deets2 = my_constant2.initialize(against2.first, against2.second - against2.first);
330
331 // Getting all hits on the second read, and then looping over that
332 // vector for each hit of the first read. We have to do all pairwise
333 // comparisons anyway to find the best hit.
334 state.second_matches.clear();
335 while (inner_process(my_search_reverse2, my_constant2, my_max_mm2, against2.first, deets2, state.second_matches)) {}
336
338 int best_mismatches = my_max_mm1 + my_max_mm2 + 1;
339 auto num_second_matches = state.second_matches.size();
340
341 if (!state.second_matches.empty()) {
342 while (inner_process(my_search_reverse1, my_constant1, my_max_mm1, against1.first, deets1, state.first_match)) {
343 for (decltype(num_second_matches) i = 0; i < num_second_matches; ++i) {
344 const auto& current2 = state.second_matches[i];
345
346 state.combined = state.first_match.first;
347 state.combined += current2.first; // separate line is deliberate.
348 my_varlib.search(state.combined, state.details, std::array<int, 2>{ my_max_mm1 - state.first_match.second, my_max_mm2 - current2.second });
349
350 if (is_barcode_index_ok(state.details.index)) {
351 int cur_mismatches = state.details.mismatches + state.first_match.second + current2.second;
352 if (cur_mismatches < best_mismatches) {
353 chosen = state.details.index;
354 best_mismatches = cur_mismatches;
355 } else if (cur_mismatches == best_mismatches && chosen != state.details.index) { // ambiguous.
356 chosen = STATUS_AMBIGUOUS;
357 }
358 }
359 }
360 }
361 }
362
363 return std::make_pair(chosen, best_mismatches);
364 }
365
366public:
370 bool process(State& state, const std::pair<const char*, const char*>& r1, const std::pair<const char*, const char*>& r2) const {
371 bool found;
372
373 if (my_use_first) {
374 found = process_first(state, r1, r2);
375 if (!found && my_randomized) {
376 found = process_first(state, r2, r1);
377 }
378
379 } else {
380 auto best = process_best(state, r1, r2);
381 if (my_randomized) {
382 auto best2 = process_best(state, r2, r1);
383 if (!is_barcode_index_ok(best.first) || best.second > best2.second) {
384 best = best2;
385 } else if (best.second == best2.second && best.first != best2.first) {
386 best.first = STATUS_AMBIGUOUS; // ambiguous.
387 }
388 }
389
390 found = is_barcode_index_ok(best.first);
391 if (found) {
392 ++state.counts[best.first];
393 }
394 }
395
396 ++state.total;
397 return found;
398 }
403public:
409 const std::vector<Count>& get_counts() const {
410 return my_counts;
411 }
412
416 Count get_total() const {
417 return my_total;
418 }
419};
420
424// Soft-deprecated back-compatible aliases.
425template<SeqLength max_size_>
426using DualBarcodes = DualBarcodesPairedEnd<max_size_>;
431}
432
433#endif
Search for barcode sequences.
Defines the ScanTemplate class.
Pool of barcode sequences.
Definition BarcodePool.hpp:24
SeqLength length() const
Definition BarcodePool.hpp:70
BarcodeIndex size() const
Definition BarcodePool.hpp:77
Handler for dual barcodes.
Definition DualBarcodesPairedEnd.hpp:27
const std::vector< Count > & get_counts() const
Definition DualBarcodesPairedEnd.hpp:409
Count get_total() const
Definition DualBarcodesPairedEnd.hpp:416
DualBarcodesPairedEnd(const char *template_seq1, SeqLength template_length1, const BarcodePool &barcode_pool1, const char *template_seq2, SeqLength template_length2, const BarcodePool &barcode_pool2, const Options &options)
Definition DualBarcodesPairedEnd.hpp:92
Namespace for the kaori barcode-matching library.
Definition BarcodePool.hpp:16
std::size_t SeqLength
Definition utils.hpp:37
constexpr BarcodeIndex STATUS_AMBIGUOUS
Definition utils.hpp:53
SearchStrand
Definition utils.hpp:31
constexpr BarcodeIndex STATUS_UNMATCHED
Definition utils.hpp:48
DuplicateAction
Definition utils.hpp:26
bool is_barcode_index_ok(BarcodeIndex index)
Definition utils.hpp:60
std::vector< constchar * >::size_type BarcodeIndex
Definition utils.hpp:43
unsigned long long Count
Definition utils.hpp:67
Optional parameters for DualBarcodesPairedEnd.
Definition DualBarcodesPairedEnd.hpp:32
SearchStrand strand2
Definition DualBarcodesPairedEnd.hpp:59
int max_mismatches1
Definition DualBarcodesPairedEnd.hpp:42
bool use_first
Definition DualBarcodesPairedEnd.hpp:37
SearchStrand strand1
Definition DualBarcodesPairedEnd.hpp:48
bool random
Definition DualBarcodesPairedEnd.hpp:71
DuplicateAction duplicates
Definition DualBarcodesPairedEnd.hpp:64
int max_mismatches2
Definition DualBarcodesPairedEnd.hpp:53
Utilites for sequence matching.