SDSL  3.0.0
Succinct Data Structure Library
lcp_support_tree2.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
4 #ifndef INCLUDED_SDSL_SUPPORT_TREE2
5 #define INCLUDED_SDSL_SUPPORT_TREE2
6 
7 #include <iostream>
8 #include <string>
9 
10 #include <sdsl/lcp.hpp>
11 #include <sdsl/rank_support_v.hpp>
13 #include <sdsl/util.hpp>
14 #include <sdsl/wt_huff.hpp>
15 
16 namespace sdsl
17 {
18 
19 // Forward declaration of helper method
20 template <uint32_t t_dens, uint8_t t_bwt_width>
21 void construct_first_child_and_lf_lcp(int_vector_buffer<> &,
22  int_vector_buffer<t_bwt_width> &,
23  const std::string &,
24  const std::string &,
25  int_vector<> &);
26 
36 template <uint32_t t_dens, class t_cst>
38 {
39  public:
43  typedef const value_type const_reference;
46  typedef const pointer const_pointer;
49  typedef t_cst cst_type;
51 
53 
54  enum
55  {
58  sa_order = 0
59  };
60 
61  template <class CST>
62  struct type
63  {
65  };
66 
67  private:
68  const cst_type * m_cst;
69  small_lcp_type m_small_lcp; // vector for lcp values < 254
70  int_vector<> m_big_lcp; // vector for lcp values >= 254
71 
72  public:
75 
81 
83 
86  _lcp_support_tree2(cache_config & config, const cst_type * cst = nullptr)
87  {
88  m_cst = cst;
89 
91  std::string bwt_file = cache_file_name(key_bwt<t_cst::csa_type::alphabet_type::int_width>(), config);
93 
94  std::string sml_lcp_file = tmp_file(config, "_fc_lf_lcp_sml");
95  std::string big_lcp_file = tmp_file(config, "_fc_lf_lcp_big");
96 
97  construct_first_child_and_lf_lcp<t_dens>(lcp_buf, bwt_buf, sml_lcp_file, big_lcp_file, m_big_lcp);
98  int_vector_buffer<8> sml_lcp_buf(sml_lcp_file);
99 
100  m_small_lcp = small_lcp_type(sml_lcp_buf.begin(), sml_lcp_buf.end(), config.dir);
101  sml_lcp_buf.close(true);
102  sdsl::remove(big_lcp_file);
103  }
104 
105  void set_cst(const cst_type * cst) { m_cst = cst; }
106 
107  size_type size() const { return m_cst->size(); }
108 
110 
111  size_type empty() const { return m_small_lcp.empty(); }
112 
114  const_iterator begin() const { return const_iterator(this, 0); }
115 
117  const_iterator end() const { return const_iterator(this, size()); }
118 
120 
125  {
126  size_type idx, offset = 0;
127  uint8_t val;
128  start:
129  idx = m_cst->tlcp_idx(i);
130  val = m_small_lcp[idx];
131  if (val < 254)
132  {
133  return val; // - offset;
134  }
135  else if (val == 254)
136  { // if lcp value is >= 254 and position i is reducible
137  i = m_cst->csa.lf[i]; // i = LF[i] // (*m_psi)(i);
138  ++offset; // goto lcp value, which is one bigger
139  goto start;
140  }
141  else
142  { // if lcp value is >= 254 and (not reducable or sampled)
143  return m_big_lcp[m_small_lcp.rank(idx, 255)] - offset;
144  }
145  }
146 
148  size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
149  {
150  structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
151  size_type written_bytes = 0;
152  written_bytes += m_small_lcp.serialize(out, child, "small_lcp");
153  written_bytes += m_big_lcp.serialize(out, child, "large_lcp");
154  structure_tree::add_size(child, written_bytes);
155  return written_bytes;
156  }
157 
159  void load(std::istream & in, const t_cst * cst = nullptr)
160  {
161  m_small_lcp.load(in);
162  m_big_lcp.load(in);
163  m_cst = cst;
164  }
165 
166  template <typename archive_t>
167  void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
168  {
169  ar(CEREAL_NVP(m_small_lcp));
170  ar(CEREAL_NVP(m_big_lcp));
171  }
172 
173  template <typename archive_t>
174  void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar)
175  {
176  ar(CEREAL_NVP(m_small_lcp));
177  ar(CEREAL_NVP(m_big_lcp));
178  }
179 
181  bool operator==(_lcp_support_tree2 const & other) const noexcept
182  {
183  return (m_small_lcp == other.m_small_lcp) && (m_big_lcp == other.m_big_lcp);
184  }
185 
187  bool operator!=(_lcp_support_tree2 const & other) const noexcept { return !(*this == other); }
188 };
189 
191 template <uint32_t t_dens = 16>
193 {
194  template <class t_cst>
196 };
197 
203 template <uint32_t t_dens, uint8_t t_bwt_width>
206  const std::string & small_lcp_file,
207  const std::string & big_lcp_file,
208  int_vector<> & big_lcp)
209 {
211  const size_type M = 255; // limit for values represented in the small LCP part
212  size_type buf_len = 1000000;
213  lcp_buf.buffersize(buf_len);
214  bwt_buf.buffersize(buf_len);
215  size_type n = lcp_buf.size();
216 
217  int_vector_buffer<8> sml_lcp_out(small_lcp_file, std::ios::out);
218 
219  osfstream big_lcp_out(big_lcp_file, std::ios::out | std::ios::trunc | std::ios::binary);
220 
221  size_type fc_cnt = 0; // number of lcp values at the first child r
222  size_type fc_cnt_big = 0; // number of lcp values at the first child which are big and not reducible
223  size_type fc_cnt_big2 = 0;
224  sorted_multi_stack_support vec_stack(n); // occupies 2n bits
225  bit_vector is_big_and_not_reducable(n, 0); // initialized with 0s
226  bool is_one_big_and_not_reducable = false; // all positions have to be reducible
227 
228  size_type y, max_lcp = 0;
229  uint64_t last_bwti = 0, val;
230  for (size_type i = 0, x; i < n; ++i)
231  {
232  x = lcp_buf[i];
233  is_one_big_and_not_reducable = false;
234 
235  while (!vec_stack.empty() and x < vec_stack.top())
236  {
237  y = vec_stack.top();
238  is_one_big_and_not_reducable |= is_big_and_not_reducable[vec_stack.size() - 1];
239  if (vec_stack.pop())
240  { // if y was the last copy of y on the stack
241  if (y > M - 2)
242  {
243  if (is_one_big_and_not_reducable)
244  {
245  val = M;
246  big_lcp_out.write((char *)&y, sizeof(y));
247  ++fc_cnt_big;
248  if (y > max_lcp) max_lcp = y;
249  }
250  else
251  {
252  val = M - 1;
253  ++fc_cnt_big2;
254  }
255  }
256  else
257  {
258  val = y;
259  }
260  sml_lcp_out.push_back(val);
261  ++fc_cnt;
262  is_one_big_and_not_reducable = false;
263  }
264  }
265  if (x > M - 2 and (0 == i or last_bwti != bwt_buf[i] or x % t_dens == 0))
266  {
267  is_big_and_not_reducable[vec_stack.size()] = 1;
268  }
269  else
270  {
271  is_big_and_not_reducable[vec_stack.size()] = 0;
272  }
273  vec_stack.push(x);
274  last_bwti = bwt_buf[i];
275  }
276 
277  while (!vec_stack.empty())
278  {
279  y = vec_stack.top();
280  if (vec_stack.pop())
281  {
282  if (y > M - 2)
283  {
284  if (is_big_and_not_reducable[vec_stack.size()])
285  {
286  val = M;
287  big_lcp_out.write((char *)&y, sizeof(y));
288  ++fc_cnt_big;
289  if (y > max_lcp) max_lcp = y;
290  }
291  else
292  {
293  val = M - 1;
294  ++fc_cnt_big2;
295  }
296  }
297  else
298  {
299  val = y;
300  }
301  sml_lcp_out.push_back(val);
302  ++fc_cnt;
303  }
304  }
305 
306  big_lcp_out.close();
307  isfstream big_lcp_in(big_lcp_file, std::ios::in | std::ios::binary);
308  big_lcp.width(bits::hi(max_lcp) + 1);
309  big_lcp.resize(fc_cnt_big);
310 
311  for (size_type i = 0; i < fc_cnt_big; ++i)
312  {
313  big_lcp_in.read((char *)&y, sizeof(y));
314  big_lcp[i] = y;
315  }
316 }
317 
318 } // namespace sdsl
319 #endif
#define CEREAL_NVP(X)
Definition: cereal.hpp:30
An lcp array class for cst_sct3 and cst_sada.
const_iterator end() const
Returns a const_iterator to the element after the last element.
_lcp_support_tree2()
Default constructor.
random_access_const_iterator< _lcp_support_tree2 > const_iterator
_lcp_support_tree2(cache_config &config, const cst_type *cst=nullptr)
Constructor.
_lcp_support_tree2 & operator=(_lcp_support_tree2 &&)=default
int_vector ::size_type size_type
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
lcp_tree_and_lf_compressed_tag lcp_category
bool operator!=(_lcp_support_tree2 const &other) const noexcept
Inequality operator.
wt_huff< bit_vector, rank_support_v5<>, select_support_scan< 1 >, select_support_scan< 0 > > small_lcp_type
void set_cst(const cst_type *cst)
_lcp_support_tree2(_lcp_support_tree2 &&)=default
_lcp_support_tree2 & operator=(const _lcp_support_tree2 &)=default
int_vector ::value_type value_type
value_type operator[](size_type i) const
[]-operator
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
const value_type const_reference
bool operator==(_lcp_support_tree2 const &other) const noexcept
Equality operator.
const_iterator begin() const
Returns a const_iterator to the first element.
_lcp_support_tree2(const _lcp_support_tree2 &)=default
Copy / Move constructor.
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serialize to a stream.
int_vector ::difference_type difference_type
void load(std::istream &in, const t_cst *cst=nullptr)
Load from a stream.
uint64_t size() const
Returns the number of elements currently stored.
void close(bool remove_file=false)
Close the int_vector_buffer.
uint64_t buffersize() const
Returns the buffersize in bytes.
void push_back(const uint64_t value)
Appends the given element value to the end of the int_vector_buffer.
void load(std::istream &in)
Load the int_vector for a stream.
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
Definition: int_vector.hpp:619
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the int_vector to a stream.
static size_type max_size() noexcept
Maximum size of the int_vector.
Definition: int_vector.hpp:566
void resize(const size_type size)
Resize the int_vector in terms of elements.
Definition: int_vector.hpp:545
void close()
Close the stream.
Definition: sfstream.hpp:87
Generic iterator for a random access container.
Definition: iterators.hpp:24
A class supporting linear time select queries.
Stack which contains elements from [0..n] in sorted order. Duplicates are possible.
size_type size() const
Returns the number of element is the stack.
bool empty() const
Returns if the stack is empty.
size_type top() const
Returns the topmost index on the stack.
bool pop()
Pop the topmost index of the stack.
bool push(size_type x)
Push the index x of vector vec onto the stack.
static void add_size(structure_tree_node *v, uint64_t value)
static structure_tree_node * add_child(structure_tree_node *v, const std::string &name, const std::string &type)
A prefix code-shaped wavelet.
Definition: wt_pc.hpp:44
size_type rank(size_type i, value_type c) const
Calculates how many symbols c are in the prefix [0..i-1].
Definition: wt_pc.hpp:335
void load(std::istream &in)
Loads the data structure from the given istream.
Definition: wt_pc.hpp:676
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the data structure into the given ostream.
Definition: wt_pc.hpp:660
bool empty() const
Returns whether the wavelet tree contains no data.
Definition: wt_pc.hpp:287
lcp.hpp contains classes for lcp information.
constexpr char KEY_LCP[]
Definition: config.hpp:44
int_vector ::size_type size_type
Namespace for the succinct data structure library.
std::string cache_file_name(const std::string &key, const cache_config &config)
Returns the file name of the resource.
Definition: io.hpp:630
std::string tmp_file(const cache_config &config, std::string name_part="")
Returns a name for a temporary file. I.e. the name was not used before.
Definition: io.hpp:698
int remove(const std::string &)
Remove a file.
Definition: ram_fs.hpp:194
void construct_first_child_and_lf_lcp(int_vector_buffer<> &, int_vector_buffer< t_bwt_width > &, const std::string &, const std::string &, int_vector<> &)
rank_support_v.hpp contains rank_support_v.
sorted_multi_stack_support.hpp contains a data structure for a stack which contains elements from [0....
static SDSL_CONSTEXPR uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:661
Helper class for construction process.
Definition: config.hpp:67
std::string dir
Definition: config.hpp:71
Helper class which provides _lcp_support_tree2 the context of a CST.
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.
wt_huff.hpp contains a class for a Huffman shaped wavelet tree over byte sequences.