SDSL  3.0.0
Succinct Data Structure Library
csa_sada.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
8 #ifndef INCLUDED_SDSL_CSA_SADA
9 #define INCLUDED_SDSL_CSA_SADA
10 
11 #include <algorithm>
12 #include <cassert>
13 #include <cstring> // for strlen
14 #include <iomanip>
15 #include <iostream>
16 #include <iterator>
17 
20 #include <sdsl/enc_vector.hpp>
21 #include <sdsl/int_vector.hpp>
22 #include <sdsl/iterators.hpp>
24 #include <sdsl/util.hpp>
25 
26 namespace sdsl
27 {
28 
30 
41 template <class t_enc_vec = enc_vector<>, // Vector type used to store the Psi-function
42  uint32_t t_dens = 32, // Sample density for suffix array (SA) values
43  uint32_t t_inv_dens = 64, // Sample density for inverse suffix array (ISA) values
44  class t_sa_sample_strat = sa_order_sa_sampling<>, // Policy class for the SA sampling.
45  class t_isa_sample_strat = isa_sampling<>, // Policy class for ISA sampling.
46  class t_alphabet_strat = byte_alphabet // Policy class for the representation of the alphabet.
47  >
48 class csa_sada
49 {
50  static_assert(is_enc_vec<t_enc_vec>::value, "First template argument has to be of type env_vector.");
51  static_assert(t_dens > 0, "Second template argument has to be greater then 0.");
52  static_assert(t_inv_dens > 0, "Third template argument has to be greater then 0.");
53  static_assert(std::is_same<typename sampling_tag<t_sa_sample_strat>::type, sa_sampling_tag>::value,
54  "Forth template argument has to be a suffix array sampling strategy.");
55  static_assert(std::is_same<typename sampling_tag<t_isa_sample_strat>::type, isa_sampling_tag>::value,
56  "Fifth template argument has to be a inverse suffix array sampling strategy.");
57  static_assert(is_alphabet<t_alphabet_strat>::value, "Sixth template argument has to be a alphabet strategy.");
58 
59  friend class bwt_of_csa_psi<csa_sada>;
60 
61  public:
62  enum
63  {
64  sa_sample_dens = t_dens,
65  isa_sample_dens = t_inv_dens
66  };
67 
68  typedef uint64_t value_type;
71  typedef const value_type const_reference;
74  typedef const pointer const_pointer;
77  typedef ptrdiff_t difference_type;
78  typedef t_enc_vec enc_vector_type;
85  typedef typename t_sa_sample_strat::template type<csa_sada> sa_sample_type;
86  typedef typename t_isa_sample_strat::template type<csa_sada> isa_sample_type;
87  typedef t_alphabet_strat alphabet_type;
88  typedef typename alphabet_type::alphabet_category alphabet_category;
89  typedef typename alphabet_type::comp_char_type comp_char_type;
90  typedef typename alphabet_type::char_type char_type; // Note: This is the char type of the CSA not the WT!
91  typedef typename alphabet_type::string_type string_type;
92  typedef csa_sada csa_type;
93 
96 
97  friend class traverse_csa_psi<csa_sada, true>;
98  friend class traverse_csa_psi<csa_sada, false>;
99 
100  static const uint32_t linear_decode_limit = 100000;
101 
102  private:
103  enc_vector_type m_psi; // psi function
104  sa_sample_type m_sa_sample; // suffix array samples
105  isa_sample_type m_isa_sample; // inverse suffix array samples
106  alphabet_type m_alphabet; // alphabet component
107 
108  mutable std::vector<uint64_t> m_psi_buf; // buffer for decoded psi values
109 
110  void create_buffer()
111  {
112  if (enc_vector_type::sample_dens < linear_decode_limit)
113  {
114  m_psi_buf = std::vector<uint64_t>(enc_vector_type::sample_dens + 1);
115  }
116  }
117 
118  public:
119  const typename alphabet_type::char2comp_type & char2comp = m_alphabet.char2comp;
120  const typename alphabet_type::comp2char_type & comp2char = m_alphabet.comp2char;
121  const typename alphabet_type::C_type & C = m_alphabet.C;
122  const typename alphabet_type::sigma_type & sigma = m_alphabet.sigma;
123  const psi_type & psi = m_psi;
124  const lf_type lf = lf_type(*this);
125  const bwt_type bwt = bwt_type(*this);
126  const isa_type isa = isa_type(*this);
127  const bwt_type L = bwt_type(*this);
129  const text_type text = text_type(*this);
130  const sa_sample_type & sa_sample = m_sa_sample;
131  const isa_sample_type & isa_sample = m_isa_sample;
132 
134  csa_sada() { create_buffer(); }
137 
139  csa_sada(const csa_sada & csa)
140  : m_psi(csa.m_psi)
141  , m_sa_sample(csa.m_sa_sample)
142  , m_isa_sample(csa.m_isa_sample)
143  , m_alphabet(csa.m_alphabet)
144  {
145  create_buffer();
146  m_isa_sample.set_vector(&m_sa_sample);
147  }
148 
151  : m_psi(std::move(csa.m_psi))
152  , m_sa_sample(std::move(csa.m_sa_sample))
153  , m_isa_sample(std::move(csa.m_isa_sample))
154  , m_alphabet(std::move(csa.m_alphabet))
155  {
156  create_buffer();
157  m_isa_sample.set_vector(&m_sa_sample);
158  }
159 
160  csa_sada(cache_config & config);
161 
163 
168  size_type size() const { return m_psi.size(); }
169 
171 
174  static size_type max_size() { return t_enc_vec::max_size(); }
175 
177 
180  bool empty() const { return m_psi.empty(); }
181 
183 
186  const_iterator begin() const { return const_iterator(this, 0); }
187 
189 
192  const_iterator end() const { return const_iterator(this, size()); }
193 
195 
201  inline value_type operator[](size_type i) const;
202 
204 
207  csa_sada & operator=(const csa_sada & csa)
208  {
209  if (this != &csa)
210  {
211  csa_sada tmp(csa);
212  *this = std::move(tmp);
213  }
214  return *this;
215  }
216 
218 
222  {
223  if (this != &csa)
224  {
225  m_psi = std::move(csa.m_psi);
226  m_sa_sample = std::move(csa.m_sa_sample);
227  m_isa_sample = std::move(csa.m_isa_sample);
228  m_isa_sample.set_vector(&m_sa_sample);
229  m_alphabet = std::move(csa.m_alphabet);
230  m_psi_buf = std::move(csa.m_psi_buf);
231  }
232  return *this;
233  }
234 
236  bool operator==(csa_sada const & other) const noexcept
237  {
238  return (m_psi == other.m_psi) && (m_sa_sample == other.m_sa_sample) && (m_isa_sample == other.m_isa_sample) &&
239  (m_alphabet == other.m_alphabet);
240  }
241 
243  bool operator!=(csa_sada const & other) const noexcept { return !(*this == other); }
244 
246 
249  size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const;
250 
252 
254  void load(std::istream & in);
255 
256  template <typename archive_t>
257  void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const;
258 
259  template <typename archive_t>
260  void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar);
261 
262  uint32_t get_sample_dens() const { return t_dens; }
263 
264  private:
265  // Calculates how many symbols c are in the prefix [0..i-1] of the BWT of the original text.
266  /*
267  * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in [0..size()]\f$.
268  * \param c The symbol to count the occurrences in the prefix.
269  * \returns The number of occurrences of symbol c in the prefix [0..i-1] of the BWT.
270  * \par Time complexity
271  * \f$ \Order{\log n t_{\Psi}} \f$
272  */
273  size_type rank_bwt(size_type i, const char_type c) const
274  {
275  comp_char_type cc = char2comp[c];
276  if (cc == 0 and c != 0) // character is not in the text => return 0
277  return 0;
278  if (i == 0) return 0;
279  assert(i <= size());
280 
281  size_type lower_b, upper_b; // lower_b inclusive, upper_b exclusive
282 
283  const size_type sd = m_psi.get_sample_dens();
284  size_type lower_sb = (C[cc] + sd - 1) / sd; // lower_sb inclusive
285  size_type upper_sb = (C[cc + 1] + sd - 1) / sd; // upper_sb exclusive
286  while (lower_sb + 1 < upper_sb)
287  {
288  size_type mid = (lower_sb + upper_sb) / 2;
289  if (m_psi.sample(mid) >= i)
290  upper_sb = mid;
291  else
292  lower_sb = mid;
293  }
294 
295  if (lower_sb == upper_sb)
296  { // the interval was smaller than sd
297  lower_b = C[cc];
298  upper_b = C[cc + 1];
299  }
300  else if (lower_sb > (C[cc] + sd - 1) / sd)
301  { // main case
302  // TODO: don't use get_inter_sampled_values if t_dens is really
303  // large
304  lower_b = lower_sb * sd;
305  if (0 == m_psi_buf.size())
306  {
307  upper_b = std::min(upper_sb * sd, C[cc + 1]);
308  goto finish;
309  }
310  uint64_t * p = m_psi_buf.data();
311  // extract the psi values between two samples
312  m_psi.get_inter_sampled_values(lower_sb, p);
313  p = m_psi_buf.data();
314  uint64_t smpl = m_psi.sample(lower_sb);
315  // handle border cases
316  if (lower_b + m_psi.get_sample_dens() >= C[cc + 1])
317  m_psi_buf[C[cc + 1] - lower_b] = size() - smpl;
318  else
319  m_psi_buf[m_psi.get_sample_dens()] = size() - smpl;
320  // search the result linear
321  while ((*p++) + smpl < i)
322  ;
323 
324  return p - 1 - m_psi_buf.data() + lower_b - C[cc];
325  }
326  else
327  { // lower_b == (m_C[cc]+sd-1)/sd and lower_sb < upper_sb
328  if (m_psi.sample(lower_sb) >= i)
329  {
330  lower_b = C[cc];
331  upper_b = lower_sb * sd + 1;
332  }
333  else
334  {
335  lower_b = lower_sb * sd;
336  upper_b = std::min(upper_sb * sd, C[cc + 1]);
337  }
338  }
339  finish:
340  // binary search the interval [C[cc]..C[cc+1]-1] for the result
341  // size_type lower_b = m_C[cc], upper_b = m_C[cc+1]; // lower_b inclusive, upper_b exclusive
342  while (lower_b + 1 < upper_b)
343  {
344  size_type mid = (lower_b + upper_b) / 2;
345  if (m_psi[mid] >= i)
346  upper_b = mid;
347  else
348  lower_b = mid;
349  }
350  if (lower_b > C[cc])
351  return lower_b - C[cc] + 1;
352  else
353  { // lower_b == m_C[cc]
354  return m_psi[lower_b] < i; // 1 if m_psi[lower_b]<i, 0 otherwise
355  }
356  }
357 
358  // Calculates the position of the i-th c in the BWT of the original text.
359  /*
360  * \param i The i-th occurrence. \f$i\in [1..rank_bwt(size(),c)]\f$.
361  * \param c Symbol c.
362  * \returns The position of the i-th c in the BWT or size() if c does occur less then i times.
363  * \par Time complexity
364  * \f$ \Order{t_{\Psi}} \f$
365  */
366  size_type select_bwt(size_type i, const char_type c) const
367  {
368  assert(i > 0);
369  comp_char_type cc = char2comp[c];
370  if (cc == 0 and c != 0) // character is not in the text => return 0
371  return size();
372  assert(cc != 255);
373  if (C[cc] + i - 1 < C[cc + 1]) { return m_psi[C[cc] + i - 1]; }
374  else
375  return size();
376  }
377 };
378 
379 // == template functions ==
380 
381 template <class t_enc_vec,
382  uint32_t t_dens,
383  uint32_t t_inv_dens,
384  class t_sa_sample_strat,
385  class t_isa,
386  class t_alphabet_strat>
388 {
389  create_buffer();
390  if (!cache_file_exists(key_bwt<alphabet_type::int_width>(), config)) { return; }
391  size_type n = 0;
392  {
394  cache_file_name(key_bwt<alphabet_type::int_width>(), config));
395  n = bwt_buf.size();
396  auto event = memory_monitor::event("construct csa-alpbabet");
397  m_alphabet = alphabet_type(bwt_buf, n);
398  }
399  {
400  auto event = memory_monitor::event("sample SA");
401  m_sa_sample = sa_sample_type(config);
402  }
403  {
404  auto event = memory_monitor::event("sample ISA");
405  isa_sample_type isa_s(config, &m_sa_sample);
406  util::swap_support(m_isa_sample, isa_s, &m_sa_sample, (const sa_sample_type *)nullptr);
407  }
408  // if ( config.delete_files ) {
409  // remove_from_cache<int_vector<>>(conf::KEY_SA, config);
410  // }
411 
412  int_vector<> cnt_chr(sigma, 0, bits::hi(n) + 1);
413  for (typename alphabet_type::sigma_type i = 0; i < sigma; ++i) { cnt_chr[i] = C[i]; }
414  // calculate psi
415  {
416  auto event = memory_monitor::event("construct PSI");
418  cache_file_name(key_bwt<alphabet_type::int_width>(), config));
419  std::string psi_file = cache_file_name(conf::KEY_PSI, config);
420  auto psi = write_out_mapper<>::create(psi_file, n, bits::hi(n) + 1);
421  for (size_type i = 0; i < n; ++i) { psi[cnt_chr[char2comp[bwt_buf[i]]]++] = i; }
423  }
424  {
425  auto event = memory_monitor::event("encode PSI");
427  m_psi = t_enc_vec(psi_buf);
428  }
429 }
430 
431 template <class t_enc_vec,
432  uint32_t t_dens,
433  uint32_t t_inv_dens,
434  class t_sa_sample_strat,
435  class t_isa,
436  class t_alphabet_strat>
438  size_type i) const -> value_type
439 {
440  size_type off = 0;
441  while (!m_sa_sample.is_sampled(i))
442  { // while i mod t_dens != 0 (SA[i] is not sampled)
443  i = psi[i]; // go to the position where SA[i]+1 is located
444  ++off; // add 1 to the offset
445  }
446  value_type result = m_sa_sample[i];
447  if (result < off) { return m_psi.size() - (off - result); }
448  else
449  return result - off;
450 }
451 
452 template <class t_enc_vec,
453  uint32_t t_dens,
454  uint32_t t_inv_dens,
455  class t_sa_sample_strat,
456  class t_isa,
457  class t_alphabet_strat>
459  std::ostream & out,
461  std::string name) const -> size_type
462 {
463  structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
464  size_type written_bytes = 0;
465  written_bytes += m_psi.serialize(out, child, "psi");
466  written_bytes += m_sa_sample.serialize(out, child, "sa_samples");
467  written_bytes += m_isa_sample.serialize(out, child, "isa_samples");
468  written_bytes += m_alphabet.serialize(out, child, "alphabet");
469  structure_tree::add_size(child, written_bytes);
470  return written_bytes;
471 }
472 
473 template <class t_enc_vec,
474  uint32_t t_dens,
475  uint32_t t_inv_dens,
476  class t_sa_sample_strat,
477  class t_isa,
478  class t_alphabet_strat>
480 {
481  m_psi.load(in);
482  m_sa_sample.load(in);
483  m_isa_sample.load(in, &m_sa_sample);
484  m_alphabet.load(in);
485 }
486 
487 template <class t_enc_vec,
488  uint32_t t_dens,
489  uint32_t t_inv_dens,
490  class t_sa_sample_strat,
491  class t_isa,
492  class t_alphabet_strat>
493 template <typename archive_t>
495  archive_t & ar) const
496 {
497  ar(CEREAL_NVP(m_psi));
498  ar(CEREAL_NVP(m_sa_sample));
499  ar(CEREAL_NVP(m_isa_sample));
500  ar(CEREAL_NVP(m_alphabet));
501 }
502 
503 template <class t_enc_vec,
504  uint32_t t_dens,
505  uint32_t t_inv_dens,
506  class t_sa_sample_strat,
507  class t_isa,
508  class t_alphabet_strat>
509 template <typename archive_t>
511  archive_t & ar)
512 {
513  ar(CEREAL_NVP(m_psi));
514  ar(CEREAL_NVP(m_sa_sample));
515  ar(CEREAL_NVP(m_isa_sample));
516  m_isa_sample.set_vector(&m_sa_sample);
517  ar(CEREAL_NVP(m_alphabet));
518 }
519 
520 } // end namespace sdsl
521 #endif
#define CEREAL_NVP(X)
Definition: cereal.hpp:30
A wrapper for the bwt of a compressed suffix array that is based on the function.
A class for the Compressed Suffix Array (CSA) proposed by Sadakane for practical implementation.
Definition: csa_sada.hpp:49
t_isa_sample_strat::template type< csa_sada > isa_sample_type
Definition: csa_sada.hpp:86
alphabet_type::string_type string_type
Definition: csa_sada.hpp:91
uint32_t get_sample_dens() const
Definition: csa_sada.hpp:262
alphabet_type::alphabet_category alphabet_category
Definition: csa_sada.hpp:88
csa_tag index_category
Definition: csa_sada.hpp:94
isa_of_csa_psi< csa_sada > isa_type
Definition: csa_sada.hpp:82
bwt_of_csa_psi< csa_sada > bwt_type
Definition: csa_sada.hpp:81
const alphabet_type::comp2char_type & comp2char
Definition: csa_sada.hpp:120
const first_row_type F
Definition: csa_sada.hpp:128
const bwt_type bwt
Definition: csa_sada.hpp:125
t_enc_vec enc_vector_type
Definition: csa_sada.hpp:78
const_iterator begin() const
Returns a const_iterator to the first element.
Definition: csa_sada.hpp:186
const_reference reference
Definition: csa_sada.hpp:72
const isa_type isa
Definition: csa_sada.hpp:126
csa_sada & operator=(const csa_sada &csa)
Assignment Copy Operator.
Definition: csa_sada.hpp:207
csa_sada csa_type
Definition: csa_sada.hpp:92
t_sa_sample_strat::template type< csa_sada > sa_sample_type
Definition: csa_sada.hpp:85
const bwt_type L
Definition: csa_sada.hpp:127
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serialize to a stream.
Definition: csa_sada.hpp:458
bool operator!=(csa_sada const &other) const noexcept
Inequality operator.
Definition: csa_sada.hpp:243
alphabet_type::comp_char_type comp_char_type
Definition: csa_sada.hpp:89
bool empty() const
Returns if the data strucutre is empty.
Definition: csa_sada.hpp:180
static size_type max_size()
Returns the largest size that csa_sada can ever have.
Definition: csa_sada.hpp:174
static const uint32_t linear_decode_limit
Definition: csa_sada.hpp:100
csa_sada(csa_sada &&csa)
Move constructor.
Definition: csa_sada.hpp:150
value_type operator[](size_type i) const
[]-operator
Definition: csa_sada.hpp:437
const_reference * pointer
Definition: csa_sada.hpp:73
const_iterator iterator
Definition: csa_sada.hpp:70
bool operator==(csa_sada const &other) const noexcept
Equality operator.
Definition: csa_sada.hpp:236
uint64_t value_type
Definition: csa_sada.hpp:68
t_alphabet_strat alphabet_type
Definition: csa_sada.hpp:87
void load(std::istream &in)
Load from a stream.
Definition: csa_sada.hpp:479
const psi_type & psi
Definition: csa_sada.hpp:123
const lf_type lf
Definition: csa_sada.hpp:124
const isa_sample_type & isa_sample
Definition: csa_sada.hpp:131
~csa_sada()
Default Destructor.
Definition: csa_sada.hpp:136
text_of_csa< csa_sada > text_type
Definition: csa_sada.hpp:83
csa_sada(const csa_sada &csa)
Copy constructor.
Definition: csa_sada.hpp:139
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Definition: csa_sada.hpp:494
first_row_of_csa< csa_sada > first_row_type
Definition: csa_sada.hpp:84
int_vector ::size_type size_type
Definition: csa_sada.hpp:75
size_type size() const
Number of elements in the .
Definition: csa_sada.hpp:168
alphabet_type::char_type char_type
Definition: csa_sada.hpp:90
ptrdiff_t difference_type
Definition: csa_sada.hpp:77
const pointer const_pointer
Definition: csa_sada.hpp:74
random_access_const_iterator< csa_sada > const_iterator
Definition: csa_sada.hpp:69
enc_vector_type psi_type
Definition: csa_sada.hpp:79
const text_type text
Definition: csa_sada.hpp:129
const alphabet_type::C_type & C
Definition: csa_sada.hpp:121
const alphabet_type::char2comp_type & char2comp
Definition: csa_sada.hpp:119
csa_sada & operator=(csa_sada &&csa)
Assignment Move Operator.
Definition: csa_sada.hpp:221
const_iterator end() const
Returns a const_iterator to the element after the last element.
Definition: csa_sada.hpp:192
traverse_csa_psi< csa_sada, false > lf_type
Definition: csa_sada.hpp:80
const sa_sample_type & sa_sample
Definition: csa_sada.hpp:130
size_type csa_size_type
Definition: csa_sada.hpp:76
const value_type const_reference
Definition: csa_sada.hpp:71
const alphabet_type::sigma_type & sigma
Definition: csa_sada.hpp:122
psi_tag extract_category
Definition: csa_sada.hpp:95
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Definition: csa_sada.hpp:510
csa_sada()
Default Constructor.
Definition: csa_sada.hpp:134
uint64_t size() const
Returns the number of elements currently stored.
A generic vector class for integers of width .
Definition: int_vector.hpp:253
static mm_event_proxy event(const std::string &name)
Generic iterator for a random access container.
Definition: iterators.hpp:24
static void add_size(structure_tree_node *v, uint64_t value)
static structure_tree_node * add_child(structure_tree_node *v, const std::string &name, const std::string &type)
static int_vector_mapper< t_width > create(const std::string &key, cache_config &config)
csa_alphabet_strategy.hpp includes different strategy classes for representing an alphabet of a CSA.
csa_sampling_strategy.hpp includes different strategy classes for suffix array sampling in the CSAs.
enc_vector.hpp contains the sdsl::enc_vector class.
int_vector.hpp contains the sdsl::int_vector class.
iterators.hpp contains an generic iterator for random access containers.
constexpr char KEY_PSI[]
Definition: config.hpp:43
Namespace for the succinct data structure library.
std::string cache_file_name(const std::string &key, const cache_config &config)
Returns the file name of the resource.
Definition: io.hpp:630
void register_cache_file(const std::string &key, cache_config &config)
Register the existing resource specified by the key to the cache.
Definition: io.hpp:656
bool cache_file_exists(const std::string &key, const cache_config &config)
Checks if the resource specified by the key exists in the cache.
Definition: io.hpp:672
static SDSL_CONSTEXPR uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:661
Helper class for construction process.
Definition: config.hpp:67
suffix_array_helper.hpp contains some helper classes for CSTs
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.