kaori
A C++ library for barcode extraction and matching
Loading...
Searching...
No Matches
CombinatorialBarcodesSingleEnd.hpp
Go to the documentation of this file.
1#ifndef KAORI_COMBINATORIAL_BARCODES_SINGLE_END_HPP
2#define KAORI_COMBINATORIAL_BARCODES_SINGLE_END_HPP
3
4#include "../ScanTemplate.hpp"
6#include "../utils.hpp"
7
8#include <array>
9#include <vector>
10#include <unordered_map>
11
18namespace kaori {
19
31template<SeqLength max_size_, int num_variable_>
33public:
37 struct Options {
42
46 bool use_first = true;
47
51 SearchStrand strand = SearchStrand::FORWARD;
52
56 DuplicateAction duplicates = DuplicateAction::ERROR;
57 };
58
59public:
71 template<class BarcodePoolContainer>
72 CombinatorialBarcodesSingleEnd(const char* template_seq, SeqLength template_length, const BarcodePoolContainer& barcode_pools, const Options& options) :
73 my_forward(search_forward(options.strand)),
74 my_reverse(search_reverse(options.strand)),
75 my_max_mm(options.max_mismatches),
76 my_use_first(options.use_first),
77 my_constant_matcher(template_seq, template_length, options.strand)
78 {
79 const auto& regions = my_constant_matcher.forward_variable_regions();
80 if (regions.size() != num_variable_) {
81 throw std::runtime_error("expected " + std::to_string(num_variable_) + " variable regions in the constant template");
82 }
83 if (barcode_pools.size() != num_variable_) {
84 throw std::runtime_error("length of 'barcode_pools' should be equal to the number of variable regions");
85 }
86
87 for (int i = 0; i < num_variable_; ++i) {
88 SeqLength rlen = regions[i].second - regions[i].first;
89 SeqLength vlen = barcode_pools[i].length();
90 if (vlen != rlen) {
91 throw std::runtime_error("length of variable region " + std::to_string(i + 1) + " (" + std::to_string(rlen) +
92 ") should be the same as its sequences (" + std::to_string(vlen) + ")");
93 }
94 }
95
96 // We'll be using this later.
97 for (int i = 0; i < num_variable_; ++i) {
98 my_pool_size[i] = barcode_pools[i].pool().size();
99 }
100
102 bopt.max_mismatches = options.max_mismatches;
103 bopt.duplicates = options.duplicates;
104
105 if (my_forward) {
106 bopt.reverse = false;
107 for (int i = 0; i < num_variable_; ++i) {
108 my_forward_lib[i] = SimpleBarcodeSearch(barcode_pools[i], bopt);
109 }
110 }
111
112 if (my_reverse) {
113 bopt.reverse = true;
114 for (int i = 0; i < num_variable_; ++i) {
115 my_reverse_lib[i] = SimpleBarcodeSearch(barcode_pools[num_variable_ - i - 1], bopt);
116 }
117 }
118 }
119
127 my_use_first = t;
128 return *this;
129 }
130
131private:
132 bool my_forward;
133 bool my_reverse;
134 int my_max_mm;
135 bool my_use_first;
136
137 ScanTemplate<max_size_> my_constant_matcher;
138 std::array<SimpleBarcodeSearch, num_variable_> my_forward_lib, my_reverse_lib;
139 std::array<BarcodeIndex, num_variable_> my_pool_size;
140
141 std::unordered_map<std::array<BarcodeIndex, num_variable_>, Count, CombinationHash<num_variable_> > my_combinations;
142 Count my_total = 0;
143
144public:
148 struct State {
149 std::unordered_map<std::array<BarcodeIndex, num_variable_>, Count, CombinationHash<num_variable_> > collected;
150 Count total = 0;
151
152 std::array<BarcodeIndex, num_variable_> temp;
153 std::string buffer;
154
155 // Default constructors should be called in this case, so it should be fine.
156 std::array<typename SimpleBarcodeSearch::State, num_variable_> forward_details, reverse_details;
157 };
162private:
163 std::pair<bool, int> find_match(
164 bool reverse,
165 const char* seq,
166 SeqLength position,
167 int obs_mismatches,
168 const std::array<SimpleBarcodeSearch, num_variable_>& libs,
169 std::array<typename SimpleBarcodeSearch::State, num_variable_>& states,
170 std::array<BarcodeIndex, num_variable_>& temp,
171 std::string& buffer
172 ) const {
173 const auto& regions = my_constant_matcher.variable_regions(reverse);
174
175 for (int r = 0; r < num_variable_; ++r) {
176 const auto& range = regions[r];
177 auto start = seq + position;
178 buffer.clear(); // clear and insert preserves buffer's existing heap allocation.
179 buffer.insert(buffer.end(), start + range.first, start + range.second);
180
181 auto& curstate = states[r];
182 libs[r].search(buffer, curstate, my_max_mm - obs_mismatches);
183 if (!is_barcode_index_ok(curstate.index)) {
184 return std::make_pair(false, 0);
185 }
186
187 obs_mismatches += curstate.mismatches;
188 if (obs_mismatches > my_max_mm) {
189 return std::make_pair(false, 0);
190 }
191
192 if (reverse) {
193 temp[num_variable_ - r - 1] = curstate.index;
194 } else {
195 temp[r] = curstate.index;
196 }
197 }
198
199 return std::make_pair(true, obs_mismatches);
200 }
201
202 std::pair<bool, int> forward_match(const char* seq, const typename ScanTemplate<max_size_>::State& deets, State& state) const {
203 return find_match(false, seq, deets.position, deets.forward_mismatches, my_forward_lib, state.forward_details, state.temp, state.buffer);
204 }
205
206 std::pair<bool, int> reverse_match(const char* seq, const typename ScanTemplate<max_size_>::State& deets, State& state) const {
207 return find_match(true, seq, deets.position, deets.reverse_mismatches, my_reverse_lib, state.reverse_details, state.temp, state.buffer);
208 }
209
210private:
211 void process_first(State& state, const std::pair<const char*, const char*>& x) const {
212 auto deets = my_constant_matcher.initialize(x.first, x.second - x.first);
213
214 while (!deets.finished) {
215 my_constant_matcher.next(deets);
216
217 if (my_forward && deets.forward_mismatches <= my_max_mm) {
218 if (forward_match(x.first, deets, state).first) {
219 ++state.collected[state.temp];
220 return;
221 }
222 }
223
224 if (my_reverse && deets.reverse_mismatches <= my_max_mm) {
225 if (reverse_match(x.first, deets, state).first) {
226 ++state.collected[state.temp];
227 return;
228 }
229 }
230 }
231 }
232
233 void process_best(State& state, const std::pair<const char*, const char*>& x) const {
234 auto deets = my_constant_matcher.initialize(x.first, x.second - x.first);
235 bool found = false;
236 int best_mismatches = my_max_mm + 1;
237 std::array<BarcodeIndex, num_variable_> best_id;
238
239 auto update = [&](std::pair<bool, int> match) -> void {
240 if (match.first && match.second <= best_mismatches) {
241 if (match.second == best_mismatches) {
242 if (best_id != state.temp) { // ambiguous.
243 found = false;
244 }
245 } else {
246 // A further optimization at this point would be to narrow
247 // max_mm to the current 'best_mismatches'. But
248 // this probably isn't worth it.
249
250 found = true;
251 best_mismatches = match.second;
252 best_id = state.temp;
253 }
254 }
255 };
256
257 while (!deets.finished) {
258 my_constant_matcher.next(deets);
259
260 if (my_forward && deets.forward_mismatches <= my_max_mm) {
261 update(forward_match(x.first, deets, state));
262 }
263
264 if (my_reverse && deets.reverse_mismatches <= my_max_mm) {
265 update(reverse_match(x.first, deets, state));
266 }
267 }
268
269 if (found) {
270 ++state.collected[best_id];
271 }
272 }
273
274public:
278 State initialize() const {
279 return State();
280 }
281
282 void reduce(State& s) {
283 if (my_forward) {
284 for (int r = 0; r < num_variable_; ++r) {
285 my_forward_lib[r].reduce(s.forward_details[r]);
286 }
287 }
288 if (my_reverse) {
289 for (int r = 0; r < num_variable_; ++r) {
290 my_reverse_lib[r].reduce(s.reverse_details[r]);
291 }
292 }
293
294 for (const auto& col : s.collected) {
295 my_combinations[col.first] += col.second;
296 }
297 my_total += s.total;
298 return;
299 }
300
301 void process(State& state, const std::pair<const char*, const char*>& x) const {
302 if (my_use_first) {
303 process_first(state, x);
304 } else {
305 process_best(state, x);
306 }
307 ++state.total;
308 }
309
310 static constexpr bool use_names = false;
315public:
319 const std::unordered_map<std::array<BarcodeIndex, num_variable_>, Count, CombinationHash<num_variable_> >& get_combinations() const {
320 return my_combinations;
321 }
322
326 Count get_total() const {
327 return my_total;
328 }
329};
330
331}
332
333#endif
Search for barcode sequences.
Defines the ScanTemplate class.
Hash a combination of barcode indices.
Definition utils.hpp:286
Handler for single-end combinatorial barcodes.
Definition CombinatorialBarcodesSingleEnd.hpp:32
const std::unordered_map< std::array< BarcodeIndex, num_variable_ >, Count, CombinationHash< num_variable_ > > & get_combinations() const
Definition CombinatorialBarcodesSingleEnd.hpp:319
CombinatorialBarcodesSingleEnd & set_first(bool t=true)
Definition CombinatorialBarcodesSingleEnd.hpp:126
Count get_total() const
Definition CombinatorialBarcodesSingleEnd.hpp:326
CombinatorialBarcodesSingleEnd(const char *template_seq, SeqLength template_length, const BarcodePoolContainer &barcode_pools, const Options &options)
Definition CombinatorialBarcodesSingleEnd.hpp:72
Scan a read sequence for the template sequence.
Definition ScanTemplate.hpp:38
State initialize(const char *read_seq, SeqLength read_length) const
Definition ScanTemplate.hpp:159
void next(State &state) const
Definition ScanTemplate.hpp:201
const std::vector< std::pair< SeqLength, SeqLength > > & variable_regions(bool reverse) const
Definition ScanTemplate.hpp:308
Search against known barcodes.
Definition BarcodeSearch.hpp:70
Namespace for the kaori barcode-matching library.
Definition BarcodePool.hpp:16
std::size_t SeqLength
Definition utils.hpp:37
SearchStrand
Definition utils.hpp:31
DuplicateAction
Definition utils.hpp:26
bool is_barcode_index_ok(BarcodeIndex index)
Definition utils.hpp:60
unsigned long long Count
Definition utils.hpp:67
Optional parameters for CombinatorialBarcodeSingleEnd.
Definition CombinatorialBarcodesSingleEnd.hpp:37
bool use_first
Definition CombinatorialBarcodesSingleEnd.hpp:46
DuplicateAction duplicates
Definition CombinatorialBarcodesSingleEnd.hpp:56
SearchStrand strand
Definition CombinatorialBarcodesSingleEnd.hpp:51
int max_mismatches
Definition CombinatorialBarcodesSingleEnd.hpp:41
Optional parameters for SimpleBarcodeSearch.
Definition BarcodeSearch.hpp:75
bool reverse
Definition BarcodeSearch.hpp:84
int max_mismatches
Definition BarcodeSearch.hpp:79
DuplicateAction duplicates
Definition BarcodeSearch.hpp:89
Utilites for sequence matching.