kaori
A C++ library for barcode extraction and matching
Loading...
Searching...
No Matches
DualBarcodesPairedEnd.hpp
Go to the documentation of this file.
1#ifndef KAORI_DUAL_BARCODES_HPP
2#define KAORI_DUAL_BARCODES_HPP
3
4#include "../ScanTemplate.hpp"
5#include "../BarcodeSearch.hpp"
6#include "../utils.hpp"
7
14namespace kaori {
15
26template<size_t max_size>
28public:
32 struct Options {
37 bool use_first = true;
38
43
48 SearchStrand strand1 = SearchStrand::FORWARD;
49
54
59 SearchStrand strand2 = SearchStrand::FORWARD;
60
64 DuplicateAction duplicates = DuplicateAction::ERROR;
65
71 bool random = false;
72 };
73
74public:
93 const char* template_seq1, size_t template_length1, const BarcodePool& barcode_pool1,
94 const char* template_seq2, size_t template_length2, const BarcodePool& barcode_pool2,
95 const Options& options
96 ) :
97 search_reverse1(search_reverse(options.strand1)),
98 search_reverse2(search_reverse(options.strand2)),
99 constant1(template_seq1, template_length1, options.strand1),
100 constant2(template_seq2, template_length2, options.strand2),
101 max_mm1(options.max_mismatches1),
102 max_mm2(options.max_mismatches2),
103 randomized(options.random),
104 use_first(options.use_first)
105 {
106 auto num_options = barcode_pool1.size();
107 if (num_options != barcode_pool2.size()) {
108 throw std::runtime_error("both barcode pools should be of the same length");
109 }
110 counts.resize(num_options);
111
112 size_t len1;
113 {
114 const auto& regions = constant1.variable_regions();
115 if (regions.size() != 1) {
116 throw std::runtime_error("expected one variable region in the first constant template");
117 }
118 len1 = regions[0].second - regions[0].first;
119 if (len1 != barcode_pool1.length) {
120 throw std::runtime_error("length of variable sequences (" + std::to_string(barcode_pool1.length) +
121 ") should be the same as the variable region (" + std::to_string(len1) + ")");
122 }
123 }
124
125 size_t len2;
126 {
127 const auto& regions = constant2.variable_regions();
128 if (regions.size() != 1) {
129 throw std::runtime_error("expected one variable region in the second constant template");
130 }
131 len2 = regions[0].second - regions[0].first;
132 if (len2 != barcode_pool2.length) {
133 throw std::runtime_error("length of variable sequences (" + std::to_string(barcode_pool2.length) +
134 ") should be the same as the variable region (" + std::to_string(len2) + ")");
135 }
136 }
137
138 // Constructing the combined strings.
139 std::vector<std::string> combined;
140 combined.reserve(num_options);
141
142 for (size_t i = 0; i < num_options; ++i) {
143 std::string current;
144
145 auto ptr1 = barcode_pool1[i];
146 if (search_reverse1) {
147 for (size_t j = 0; j < len1; ++j) {
148 current += complement_base<true, true>(ptr1[len1 - j - 1]);
149 }
150 } else {
151 current.insert(current.end(), ptr1, ptr1 + len1);
152 }
153
154 auto ptr2 = barcode_pool2[i];
155 if (search_reverse2) {
156 for (size_t j = 0; j < len2; ++j) {
157 current += complement_base<true, true>(ptr2[len2 - j - 1]);
158 }
159 } else {
160 current.insert(current.end(), ptr2, ptr2 + len2);
161 }
162
163 combined.push_back(std::move(current));
164 }
165
166 // Constructing the combined varlib.
167 BarcodePool combined_set(combined);
169 combined_set,
170 std::array<int, 2>{ static_cast<int>(len1), static_cast<int>(len2) },
171 [&]{
173 bopt.max_mismatches = { max_mm1, max_mm2 };
174 bopt.duplicates = options.duplicates;
175 bopt.reverse = false; // we already handle strandedness when creating 'combined_set'.
176 return bopt;
177 }()
178 );
179 }
180
181public:
185 struct State {
186 State(size_t n = 0) : counts(n) {}
187 std::vector<int> counts;
188 int total = 0;
189
190 std::pair<std::string, int> first_match;
191 std::vector<std::pair<std::string, int> > second_matches;
192 std::string combined;
193
194 // Default constructors should be called in this case, so it should be fine.
195 typename SegmentedBarcodeSearch<2>::State details;
196 };
197
198 State initialize() const {
199 return State(counts.size());
200 }
201
202 void reduce(State& s) {
203 varlib.reduce(s.details);
204 for (size_t i = 0; i < counts.size(); ++i) {
205 counts[i] += s.counts[i];
206 }
207 total += s.total;
208 }
209
210 constexpr static bool use_names = false;
215private:
216 static void fill_store(std::pair<std::string, int>& first_match, const char* start, const char* end, int mm) {
217 first_match.first.clear();
218 first_match.first.insert(first_match.first.end(), start, end);
219 first_match.second = mm;
220 return;
221 }
222
223 static void fill_store(std::vector<std::pair<std::string, int> >& second_matches, const char* start, const char* end, int mm) {
224 second_matches.emplace_back(std::string(start, end), mm);
225 return;
226 }
227
228 template<class Store>
229 static bool inner_process(
230 bool reverse,
231 const ScanTemplate<max_size>& constant,
232 int max_mm,
233 const char* against,
234 typename ScanTemplate<max_size>::State& deets,
235 Store& store)
236 {
237 while (!deets.finished) {
238 constant.next(deets);
239 if (reverse) {
240 if (deets.reverse_mismatches <= max_mm) {
241 const auto& reg = constant.template variable_regions<true>()[0];
242 auto start = against + deets.position;
243 fill_store(store, start + reg.first, start + reg.second, deets.reverse_mismatches);
244 return true;
245 }
246 } else {
247 if (deets.forward_mismatches <= max_mm) {
248 const auto& reg = constant.variable_regions()[0];
249 auto start = against + deets.position;
250 fill_store(store, start + reg.first, start + reg.second, deets.forward_mismatches);
251 return true;
252 }
253 }
254 }
255 return false;
256 }
257
258 bool process_first(State& state, const std::pair<const char*, const char*>& against1, const std::pair<const char*, const char*>& against2) const {
259 auto deets1 = constant1.initialize(against1.first, against1.second - against1.first);
260 auto deets2 = constant2.initialize(against2.first, against2.second - against2.first);
261
262 state.second_matches.clear();
263
264 auto checker = [&](size_t idx2) -> bool {
265 const auto& current2 = state.second_matches[idx2];
266 state.combined = state.first_match.first;
267 state.combined += current2.first; // on a separate line to avoid creating a std::string intermediate.
268 varlib.search(state.combined, state.details, std::array<int, 2>{ max_mm1 - state.first_match.second, max_mm2 - current2.second });
269
270 if (state.details.index >= 0) {
271 ++state.counts[state.details.index];
272 return true;
273 } else {
274 return false;
275 }
276 };
277
278 // Looping over all hits of the second for each hit of the first read.
279 // This is done in a slightly convoluted way; we only search for
280 // all hits of the second read _after_ we find the first hit of the
281 // first read, so as to avoid a wasted search on the second read
282 // if we never found a hit on the first read.
283 while (inner_process(search_reverse1, constant1, max_mm1, against1.first, deets1, state.first_match)) {
284 if (!deets2.finished) {
285 // Alright, populating the second match buffer. We also
286 // return immediately if any of them form a valid
287 // combination with the first hit of the first read.
288 while (inner_process(search_reverse2, constant2, max_mm2, against2.first, deets2, state.second_matches)) {
289 if (checker(state.second_matches.size() - 1)) {
290 return true;
291 }
292 }
293 if (state.second_matches.empty()) {
294 break;
295 }
296 } else {
297 // And then this part does all the pairwise comparisons with
298 // every hit in the first read.
299 for (size_t i = 0; i < state.second_matches.size(); ++i) {
300 if (checker(i)) {
301 return true;
302 }
303 }
304 }
305 }
306
307 return false;
308 }
309
310 std::pair<int, int> process_best(State& state, const std::pair<const char*, const char*>& against1, const std::pair<const char*, const char*>& against2) const {
311 auto deets1 = constant1.initialize(against1.first, against1.second - against1.first);
312 auto deets2 = constant2.initialize(against2.first, against2.second - against2.first);
313
314 // Getting all hits on the second read, and then looping over that
315 // vector for each hit of the first read. We have to do all pairwise
316 // comparisons anyway to find the best hit.
317 state.second_matches.clear();
318 while (inner_process(search_reverse2, constant2, max_mm2, against2.first, deets2, state.second_matches)) {}
319
320 int chosen = -1;
321 int best_mismatches = max_mm1 + max_mm2 + 1;
322 size_t num_second_matches = state.second_matches.size();
323
324 if (!state.second_matches.empty()) {
325 while (inner_process(search_reverse1, constant1, max_mm1, against1.first, deets1, state.first_match)) {
326 for (size_t i = 0; i < num_second_matches; ++i) {
327 const auto& current2 = state.second_matches[i];
328
329 state.combined = state.first_match.first;
330 state.combined += current2.first; // separate line is deliberate.
331 varlib.search(state.combined, state.details, std::array<int, 2>{ max_mm1 - state.first_match.second, max_mm2 - current2.second });
332
333 if (state.details.index >= 0) {
334 int cur_mismatches = state.details.mismatches + state.first_match.second + current2.second;
335 if (cur_mismatches < best_mismatches) {
336 chosen = state.details.index;
337 best_mismatches = cur_mismatches;
338 } else if (cur_mismatches == best_mismatches && chosen != state.details.index) { // ambiguous.
339 chosen = -1;
340 }
341 }
342 }
343 }
344 }
345
346 return std::make_pair(chosen, best_mismatches);
347 }
348
349public:
353 bool process(State& state, const std::pair<const char*, const char*>& r1, const std::pair<const char*, const char*>& r2) const {
354 bool found;
355
356 if (use_first) {
357 found = process_first(state, r1, r2);
358 if (!found && randomized) {
359 found = process_first(state, r2, r1);
360 }
361
362 } else {
363 auto best = process_best(state, r1, r2);
364 if (randomized) {
365 auto best2 = process_best(state, r2, r1);
366 if (best.first < 0 || best.second > best2.second) {
367 best = best2;
368 } else if (best.second == best2.second && best.first != best2.first) {
369 best.first = -1; // ambiguous.
370 }
371 }
372
373 found = best.first >= 0;
374 if (found) {
375 ++state.counts[best.first];
376 }
377 }
378
379 ++state.total;
380 return found;
381 }
386private:
387 bool search_reverse1, search_reverse2;
388
389 ScanTemplate<max_size> constant1, constant2;
390 SegmentedBarcodeSearch<2> varlib;
391 int max_mm1, max_mm2;
392
393 bool randomized;
394 bool use_first = true;
395
396 std::vector<int> counts;
397 int total = 0;
398
399public:
405 const std::vector<int>& get_counts() const {
406 return counts;
407 }
408
412 int get_total() const {
413 return total;
414 }
415};
416
420// Soft-deprecated back-compatible aliases.
421template<size_t max_size>
422using DualBarcodes = DualBarcodesPairedEnd<max_size>;
427}
428
429#endif
Handler for dual barcodes.
Definition DualBarcodesPairedEnd.hpp:27
const std::vector< int > & get_counts() const
Definition DualBarcodesPairedEnd.hpp:405
DualBarcodesPairedEnd(const char *template_seq1, size_t template_length1, const BarcodePool &barcode_pool1, const char *template_seq2, size_t template_length2, const BarcodePool &barcode_pool2, const Options &options)
Definition DualBarcodesPairedEnd.hpp:92
int get_total() const
Definition DualBarcodesPairedEnd.hpp:412
Search for known barcode sequences with segmented mismatches.
Definition BarcodeSearch.hpp:290
void reduce(State &state)
Definition BarcodeSearch.hpp:423
void search(const std::string &search_seq, State &state) const
Definition BarcodeSearch.hpp:461
Namespace for the kaori barcode-matching library.
Definition BarcodePool.hpp:13
Pool of barcode sequences for a variable region.
Definition BarcodePool.hpp:21
size_t length
Definition BarcodePool.hpp:59
size_t size() const
Definition BarcodePool.hpp:64
Optional parameters for DualBarcodesPairedEnd.
Definition DualBarcodesPairedEnd.hpp:32
int max_mismatches1
Definition DualBarcodesPairedEnd.hpp:42
bool random
Definition DualBarcodesPairedEnd.hpp:71
int max_mismatches2
Definition DualBarcodesPairedEnd.hpp:53
SearchStrand strand2
Definition DualBarcodesPairedEnd.hpp:59
DuplicateAction duplicates
Definition DualBarcodesPairedEnd.hpp:64
bool use_first
Definition DualBarcodesPairedEnd.hpp:37
SearchStrand strand1
Definition DualBarcodesPairedEnd.hpp:48
Optional parameters for a SegmentedBarcodeSearch.
Definition BarcodeSearch.hpp:295
std::array< int, num_segments > max_mismatches
Definition BarcodeSearch.hpp:309
bool reverse
Definition BarcodeSearch.hpp:315
DuplicateAction duplicates
Definition BarcodeSearch.hpp:320