kaori
A C++ library for barcode extraction and matching
Loading...
Searching...
No Matches
DualBarcodesSingleEnd.hpp
Go to the documentation of this file.
1#ifndef KAORI_DUAL_BARCODES_SINGLE_END_HPP
2#define KAORI_DUAL_BARCODES_SINGLE_END_HPP
3
4#include "../ScanTemplate.hpp"
6#include "../utils.hpp"
7
8#include <array>
9#include <vector>
10
17namespace kaori {
18
30template<SeqLength max_size_>
32public:
36 struct Options {
41
45 bool use_first = true;
46
50 SearchStrand strand = SearchStrand::FORWARD;
51
55 DuplicateAction duplicates = DuplicateAction::ERROR;
56 };
57
58public:
68 DualBarcodesSingleEnd(const char* template_seq, SeqLength template_length, const std::vector<BarcodePool>& barcode_pools, const Options& options) :
69 my_forward(search_forward(options.strand)),
70 my_reverse(search_reverse(options.strand)),
71 my_max_mm(options.max_mismatches),
72 my_use_first(options.use_first),
73 my_constant_matcher(template_seq, template_length, options.strand)
74 {
75 const auto& regions = my_constant_matcher.forward_variable_regions();
76 my_num_variable = regions.size();
77 if (barcode_pools.size() != my_num_variable) {
78 throw std::runtime_error("length of 'barcode_pools' should equal the number of variable regions");
79 }
80
81 for (decltype(my_num_variable) i = 0; i < my_num_variable; ++i) {
82 SeqLength rlen = regions[i].second - regions[i].first;
83 SeqLength vlen = barcode_pools[i].length();
84 if (vlen != rlen) {
85 throw std::runtime_error("length of variable region " + std::to_string(i + 1) + " (" + std::to_string(rlen) +
86 ") should be the same as its sequences (" + std::to_string(vlen) + ")");
87 }
88 }
89
90 BarcodeIndex num_choices = 0;
91 if (my_num_variable) {
92 num_choices = barcode_pools[0].size();
93 for (decltype(my_num_variable) i = 1; i < my_num_variable; ++i) {
94 if (num_choices != barcode_pools[i].size()) {
95 throw std::runtime_error("all entries of 'barcode_pools' should have the same length");
96 }
97 }
98 }
99 my_counts.resize(num_choices);
100
101 // Constructing the combined varlib.
102 std::vector<std::string> combined(num_choices);
103 for (decltype(my_num_variable) v = 0; v < my_num_variable; ++v) {
104 const auto& curpool = barcode_pools[v];
105 SeqLength n = curpool.length();
106 for (BarcodeIndex c = 0; c < num_choices; ++c) {
107 auto ptr = curpool[c];
108 combined[c].insert(combined[c].end(), ptr, ptr + n);
109 }
110 }
111
113 bopt.max_mismatches = options.max_mismatches;
114 bopt.duplicates = options.duplicates;
115
116 if (my_forward) {
117 bopt.reverse = false;
118 my_forward_lib = SimpleBarcodeSearch(combined, bopt);
119 }
120
121 if (my_reverse) {
122 bopt.reverse = true;
123 my_reverse_lib = SimpleBarcodeSearch(combined, bopt);
124 }
125 }
126
127private:
128 bool my_forward;
129 bool my_reverse;
130 int my_max_mm;
131 bool my_use_first;
132
133 ScanTemplate<max_size_> my_constant_matcher;
134 std::size_t my_num_variable;
135
136 SimpleBarcodeSearch my_forward_lib, my_reverse_lib;
137 std::vector<Count > my_counts;
138 Count my_total = 0;
139
140public:
144 struct State {
145 State() = default;
146 State(typename std::vector<Count>::size_type n) : counts(n) {}
147
148 std::vector<Count> counts;
149 Count total = 0;
150
151 std::string buffer;
152
153 // Default constructors should be called in this case, so it should be fine.
154 typename SimpleBarcodeSearch::State forward_details, reverse_details;
155 };
160private:
161 std::pair<BarcodeIndex, int> find_match(
162 bool reverse,
163 const char* seq,
164 SeqLength position,
165 int obs_mismatches,
166 const SimpleBarcodeSearch& lib,
167 typename SimpleBarcodeSearch::State state,
168 std::string& buffer
169 ) const {
170 const auto& regions = my_constant_matcher.variable_regions(reverse);
171 buffer.clear();
172
173 for (decltype(my_num_variable) r = 0; r < my_num_variable; ++r) {
174 auto start = seq + position;
175 buffer.insert(buffer.end(), start + regions[r].first, start + regions[r].second);
176 }
177
178 lib.search(buffer, state, my_max_mm - obs_mismatches);
179 return std::make_pair(state.index, obs_mismatches + state.mismatches);
180 }
181
182 std::pair<BarcodeIndex, int> forward_match(const char* seq, const typename ScanTemplate<max_size_>::State& deets, State& state) const {
183 return find_match(false, seq, deets.position, deets.forward_mismatches, my_forward_lib, state.forward_details, state.buffer);
184 }
185
186 std::pair<BarcodeIndex, int> reverse_match(const char* seq, const typename ScanTemplate<max_size_>::State& deets, State& state) const {
187 return find_match(true, seq, deets.position, deets.reverse_mismatches, my_reverse_lib, state.reverse_details, state.buffer);
188 }
189
190private:
191 bool process_first(State& state, const std::pair<const char*, const char*>& x) const {
192 auto deets = my_constant_matcher.initialize(x.first, x.second - x.first);
193
194 while (!deets.finished) {
195 my_constant_matcher.next(deets);
196
197 if (my_forward && deets.forward_mismatches <= my_max_mm) {
198 auto id = forward_match(x.first, deets, state).first;
199 if (is_barcode_index_ok(id)) {
200 ++state.counts[id];
201 return true;
202 }
203 }
204
205 if (my_reverse && deets.reverse_mismatches <= my_max_mm) {
206 auto id = reverse_match(x.first, deets, state).first;
207 if (is_barcode_index_ok(id)) {
208 ++state.counts[id];
209 return true;
210 }
211 }
212 }
213 return false;
214 }
215
216 bool process_best(State& state, const std::pair<const char*, const char*>& x) const {
217 auto deets = my_constant_matcher.initialize(x.first, x.second - x.first);
218 bool found = false;
219 int best_mismatches = my_max_mm + 1;
221
222 auto update = [&](std::pair<BarcodeIndex, int> match) -> void {
223 if (!is_barcode_index_ok(match.first)){
224 return;
225 }
226 if (match.second == best_mismatches) {
227 if (best_id != match.first) { // ambiguous.
228 found = false;
229 }
230 } else if (match.second < best_mismatches) {
231 // A further optimization at this point would be to narrow
232 // max_mm to the current 'best_mismatches'. But
233 // this probably isn't worth it.
234
235 found = true;
236 best_mismatches = match.second;
237 best_id = match.first;
238 }
239 };
240
241 while (!deets.finished) {
242 my_constant_matcher.next(deets);
243
244 if (my_forward && deets.forward_mismatches <= my_max_mm) {
245 update(forward_match(x.first, deets, state));
246 }
247
248 if (my_reverse && deets.reverse_mismatches <= my_max_mm) {
249 update(reverse_match(x.first, deets, state));
250 }
251 }
252
253 if (found) {
254 ++state.counts[best_id];
255 }
256 return found;
257 }
258
259public:
263 State initialize() const {
264 return State(my_counts.size());
265 }
266
267 void reduce(State& s) {
268 if (my_forward) {
269 my_forward_lib.reduce(s.forward_details);
270 }
271 if (my_reverse) {
272 my_reverse_lib.reduce(s.reverse_details);
273 }
274
275 for (decltype(my_counts.size()) i = 0, end = my_counts.size(); i < end; ++i) {
276 my_counts[i] += s.counts[i];
277 }
278 my_total += s.total;
279 return;
280 }
281
282 bool process(State& state, const std::pair<const char*, const char*>& x) const {
283 ++state.total;
284 if (my_use_first) {
285 return process_first(state, x);
286 } else {
287 return process_best(state, x);
288 }
289 }
290
291 static constexpr bool use_names = false;
296public:
302 const std::vector<Count>& get_counts() const {
303 return my_counts;
304 }
305
309 Count get_total() const {
310 return my_total;
311 }
312};
313
314}
315
316#endif
Search for barcode sequences.
Defines the ScanTemplate class.
Handler for single-end dual barcodes.
Definition DualBarcodesSingleEnd.hpp:31
Count get_total() const
Definition DualBarcodesSingleEnd.hpp:309
const std::vector< Count > & get_counts() const
Definition DualBarcodesSingleEnd.hpp:302
DualBarcodesSingleEnd(const char *template_seq, SeqLength template_length, const std::vector< BarcodePool > &barcode_pools, const Options &options)
Definition DualBarcodesSingleEnd.hpp:68
Scan a read sequence for the template sequence.
Definition ScanTemplate.hpp:38
State initialize(const char *read_seq, SeqLength read_length) const
Definition ScanTemplate.hpp:159
void next(State &state) const
Definition ScanTemplate.hpp:201
const std::vector< std::pair< SeqLength, SeqLength > > & variable_regions(bool reverse) const
Definition ScanTemplate.hpp:308
Search against known barcodes.
Definition BarcodeSearch.hpp:70
void reduce(State &state)
Definition BarcodeSearch.hpp:170
Namespace for the kaori barcode-matching library.
Definition BarcodePool.hpp:16
std::size_t SeqLength
Definition utils.hpp:37
SearchStrand
Definition utils.hpp:31
constexpr BarcodeIndex STATUS_UNMATCHED
Definition utils.hpp:48
DuplicateAction
Definition utils.hpp:26
bool is_barcode_index_ok(BarcodeIndex index)
Definition utils.hpp:60
std::vector< constchar * >::size_type BarcodeIndex
Definition utils.hpp:43
unsigned long long Count
Definition utils.hpp:67
Optional parameters for DualBarcodeSingleEnd.
Definition DualBarcodesSingleEnd.hpp:36
int max_mismatches
Definition DualBarcodesSingleEnd.hpp:40
DuplicateAction duplicates
Definition DualBarcodesSingleEnd.hpp:55
bool use_first
Definition DualBarcodesSingleEnd.hpp:45
SearchStrand strand
Definition DualBarcodesSingleEnd.hpp:50
Optional parameters for SimpleBarcodeSearch.
Definition BarcodeSearch.hpp:75
bool reverse
Definition BarcodeSearch.hpp:84
int max_mismatches
Definition BarcodeSearch.hpp:79
DuplicateAction duplicates
Definition BarcodeSearch.hpp:89
Utilites for sequence matching.