kaori
A C++ library for barcode extraction and matching
Loading...
Searching...
No Matches
CombinatorialBarcodesSingleEnd.hpp
Go to the documentation of this file.
1#ifndef KAORI_COMBINATORIAL_BARCODES_SINGLE_END_HPP
2#define KAORI_COMBINATORIAL_BARCODES_SINGLE_END_HPP
3
4#include "../ScanTemplate.hpp"
5#include "../BarcodeSearch.hpp"
6#include "../utils.hpp"
7
8#include <array>
9#include <vector>
10
17namespace kaori {
18
30template<size_t max_size, size_t num_variable>
32public:
36 struct Options {
41
45 bool use_first = true;
46
50 SearchStrand strand = SearchStrand::FORWARD;
51
55 DuplicateAction duplicates = DuplicateAction::ERROR;
56 };
57
58public:
70 template<class BarcodePoolContainer>
71 CombinatorialBarcodesSingleEnd(const char* template_seq, size_t template_length, const BarcodePoolContainer& barcode_pools, const Options& options) :
72 forward(search_forward(options.strand)),
73 reverse(search_reverse(options.strand)),
74 max_mm(options.max_mismatches),
75 use_first(options.use_first),
76 constant_matcher(template_seq, template_length, options.strand)
77 {
78 const auto& regions = constant_matcher.variable_regions();
79 if (regions.size() != num_variable) {
80 throw std::runtime_error("expected " + std::to_string(num_variable) + " variable regions in the constant template");
81 }
82 if (barcode_pools.size() != num_variable) {
83 throw std::runtime_error("length of 'barcode_pools' should be equal to the number of variable regions");
84 }
85
86 for (size_t i = 0; i < num_variable; ++i) {
87 size_t rlen = regions[i].second - regions[i].first;
88 size_t vlen = barcode_pools[i].length;
89 if (vlen != rlen) {
90 throw std::runtime_error("length of variable region " + std::to_string(i + 1) + " (" + std::to_string(rlen) +
91 ") should be the same as its sequences (" + std::to_string(vlen) + ")");
92 }
93 }
94
95 // We'll be using this later.
96 for (size_t i = 0; i < num_variable; ++i) {
97 num_options[i] = barcode_pools[i].pool.size();
98 }
99
101 bopt.max_mismatches = options.max_mismatches;
102 bopt.duplicates = options.duplicates;
103
104 if (forward) {
105 bopt.reverse = false;
106 for (size_t i = 0; i < num_variable; ++i) {
107 forward_lib[i] = SimpleBarcodeSearch(barcode_pools[i], bopt);
108 }
109 }
110
111 if (reverse) {
112 bopt.reverse = true;
113 for (size_t i = 0; i < num_variable; ++i) {
114 reverse_lib[i] = SimpleBarcodeSearch(barcode_pools[num_variable - i - 1], bopt);
115 }
116 }
117 }
118
126 use_first = t;
127 return *this;
128 }
129
130public:
134 struct State {
135 std::vector<std::array<int, num_variable> >collected;
136 int total = 0;
137
138 std::array<int, num_variable> temp;
139 std::string buffer;
140
141 // Default constructors should be called in this case, so it should be fine.
142 std::array<typename SimpleBarcodeSearch::State, num_variable> forward_details, reverse_details;
143 };
148private:
149 template<bool reverse>
150 std::pair<bool, int> find_match(
151 const char* seq,
152 size_t position,
153 int obs_mismatches,
154 const std::array<SimpleBarcodeSearch, num_variable>& libs,
155 std::array<typename SimpleBarcodeSearch::State, num_variable>& states,
156 std::array<int, num_variable>& temp,
157 std::string& buffer
158 ) const {
159 const auto& regions = constant_matcher.template variable_regions<reverse>();
160
161 for (size_t r = 0; r < num_variable; ++r) {
162 auto range = regions[r];
163 auto start = seq + position;
164 buffer.clear(); // clear and insert preserves buffer's existing heap allocation.
165 buffer.insert(buffer.end(), start + range.first, start + range.second);
166
167 auto& curstate = states[r];
168 libs[r].search(buffer, curstate, max_mm - obs_mismatches);
169 if (curstate.index < 0) {
170 return std::make_pair(false, 0);
171 }
172
173 obs_mismatches += curstate.mismatches;
174 if (obs_mismatches > max_mm) {
175 return std::make_pair(false, 0);
176 }
177
178 if constexpr(reverse) {
179 temp[num_variable - r - 1] = curstate.index;
180 } else {
181 temp[r] = curstate.index;
182 }
183 }
184
185 return std::make_pair(true, obs_mismatches);
186 }
187
188 std::pair<bool, int> forward_match(const char* seq, const typename ScanTemplate<max_size>::State& deets, State& state) const {
189 return find_match<false>(seq, deets.position, deets.forward_mismatches, forward_lib, state.forward_details, state.temp, state.buffer);
190 }
191
192 std::pair<bool, int> reverse_match(const char* seq, const typename ScanTemplate<max_size>::State& deets, State& state) const {
193 return find_match<true>(seq, deets.position, deets.reverse_mismatches, reverse_lib, state.reverse_details, state.temp, state.buffer);
194 }
195
196private:
197 void process_first(State& state, const std::pair<const char*, const char*>& x) const {
198 auto deets = constant_matcher.initialize(x.first, x.second - x.first);
199
200 while (!deets.finished) {
201 constant_matcher.next(deets);
202
203 if (forward && deets.forward_mismatches <= max_mm) {
204 if (forward_match(x.first, deets, state).first) {
205 state.collected.push_back(state.temp);
206 return;
207 }
208 }
209
210 if (reverse && deets.reverse_mismatches <= max_mm) {
211 if (reverse_match(x.first, deets, state).first) {
212 state.collected.push_back(state.temp);
213 return;
214 }
215 }
216 }
217 }
218
219 void process_best(State& state, const std::pair<const char*, const char*>& x) const {
220 auto deets = constant_matcher.initialize(x.first, x.second - x.first);
221 bool found = false;
222 int best_mismatches = max_mm + 1;
223 std::array<int, num_variable> best_id;
224
225 auto update = [&](std::pair<bool, int> match) -> void {
226 if (match.first && match.second <= best_mismatches) {
227 if (match.second == best_mismatches) {
228 if (best_id != state.temp) { // ambiguous.
229 found = false;
230 }
231 } else {
232 // A further optimization at this point would be to narrow
233 // max_mm to the current 'best_mismatches'. But
234 // this probably isn't worth it.
235
236 found = true;
237 best_mismatches = match.second;
238 best_id = state.temp;
239 }
240 }
241 };
242
243 while (!deets.finished) {
244 constant_matcher.next(deets);
245
246 if (forward && deets.forward_mismatches <= max_mm) {
247 update(forward_match(x.first, deets, state));
248 }
249
250 if (reverse && deets.reverse_mismatches <= max_mm) {
251 update(reverse_match(x.first, deets, state));
252 }
253 }
254
255 if (found) {
256 state.collected.push_back(best_id);
257 }
258 }
259
260public:
264 State initialize() const {
265 return State();
266 }
267
268 void reduce(State& s) {
269 if (forward) {
270 for (size_t r = 0; r < num_variable; ++r) {
271 forward_lib[r].reduce(s.forward_details[r]);
272 }
273 }
274 if (reverse) {
275 for (size_t r = 0; r < num_variable; ++r) {
276 reverse_lib[r].reduce(s.reverse_details[r]);
277 }
278 }
279
280 combinations.insert(combinations.end(), s.collected.begin(), s.collected.end());
281 total += s.total;
282 return;
283 }
284
285 void process(State& state, const std::pair<const char*, const char*>& x) const {
286 if (use_first) {
287 process_first(state, x);
288 } else {
289 process_best(state, x);
290 }
291 ++state.total;
292 }
293
294 static constexpr bool use_names = false;
299public:
303 void sort() {
304 sort_combinations(combinations, num_options);
305 }
306
310 const std::vector<std::array<int, num_variable> >& get_combinations() const {
311 return combinations;
312 }
313
317 int get_total() const {
318 return total;
319 }
320private:
321 bool forward;
322 bool reverse;
323 int max_mm;
324 bool use_first;
325 size_t nregions;
326
327 ScanTemplate<max_size> constant_matcher;
328 std::array<SimpleBarcodeSearch, num_variable> forward_lib, reverse_lib;
329 std::array<size_t, num_variable> num_options;
330
331 std::vector<std::array<int, num_variable> > combinations;
332 int total = 0;
333};
334
335}
336
337#endif
Handler for single-end combinatorial barcodes.
Definition CombinatorialBarcodesSingleEnd.hpp:31
CombinatorialBarcodesSingleEnd & set_first(bool t=true)
Definition CombinatorialBarcodesSingleEnd.hpp:125
const std::vector< std::array< int, num_variable > > & get_combinations() const
Definition CombinatorialBarcodesSingleEnd.hpp:310
void sort()
Definition CombinatorialBarcodesSingleEnd.hpp:303
int get_total() const
Definition CombinatorialBarcodesSingleEnd.hpp:317
CombinatorialBarcodesSingleEnd(const char *template_seq, size_t template_length, const BarcodePoolContainer &barcode_pools, const Options &options)
Definition CombinatorialBarcodesSingleEnd.hpp:71
Scan a read sequence for the template sequence.
Definition ScanTemplate.hpp:37
Search for known barcode sequences.
Definition BarcodeSearch.hpp:105
Namespace for the kaori barcode-matching library.
Definition BarcodePool.hpp:13
Optional parameters for CombinatorialBarcodeSingleEnd.
Definition CombinatorialBarcodesSingleEnd.hpp:36
SearchStrand strand
Definition CombinatorialBarcodesSingleEnd.hpp:50
int max_mismatches
Definition CombinatorialBarcodesSingleEnd.hpp:40
bool use_first
Definition CombinatorialBarcodesSingleEnd.hpp:45
DuplicateAction duplicates
Definition CombinatorialBarcodesSingleEnd.hpp:55
Optional parameters for SimpleBarcodeSearch.
Definition BarcodeSearch.hpp:110
bool reverse
Definition BarcodeSearch.hpp:119
int max_mismatches
Definition BarcodeSearch.hpp:114
DuplicateAction duplicates
Definition BarcodeSearch.hpp:124