SDSL  3.0.0
Succinct Data Structure Library
construct_lcp_helper.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
4 #ifndef INCLUDED_SDSL_CONSTRUCT_LCP_HELPER
5 #define INCLUDED_SDSL_CONSTRUCT_LCP_HELPER
6 
7 #include <list>
8 #include <queue>
9 #include <vector>
10 
11 #include <sdsl/int_vector.hpp>
12 
13 namespace sdsl
14 {
15 
17 
26 inline void insert_lcp_values(int_vector<> & partial_lcp,
27  bit_vector & index_done,
28  std::string lcp_file,
29  uint64_t max_lcp_value,
30  uint64_t lcp_value_offset)
31 {
32  std::string tmp_lcp_file = lcp_file + "_TMP";
33  const uint64_t buffer_size = 1000000; // has to be a multiple of 64
35  int_vector_buffer<> lcp_buffer(lcp_file, std::ios::in, buffer_size); // open lcp_file
36  uint64_t n = lcp_buffer.size();
37 
38  // open tmp_lcp_file
39  uint8_t int_width = bits::hi(max_lcp_value) + 1;
40  int_vector_buffer<> out_buf(tmp_lcp_file, std::ios::out, buffer_size, int_width); // Output buffer
41  // Write values into buffer
42  for (size_type i = 0, calc_idx = 0; i < n; ++i)
43  {
44  if (index_done[i])
45  { // If value was already calculated
46  out_buf[i] = lcp_buffer[i]; // Copy value
47  }
48  else
49  {
50  if (partial_lcp[calc_idx])
51  { // If value was calculated now
52  // Insert value
53  out_buf[i] = partial_lcp[calc_idx] + lcp_value_offset;
54  index_done[i] = true;
55  }
56  ++calc_idx;
57  }
58  }
59  lcp_buffer.close();
60  out_buf.close();
61  // Close file and replace old file with new one
62  sdsl::rename(tmp_lcp_file, lcp_file);
63 }
64 
65 template <class tWT>
66 void create_C_array(std::vector<uint64_t> & C, const tWT & wt)
67 {
68  uint64_t quantity; // quantity of characters in interval
69  std::vector<unsigned char> cs(wt.sigma); // list of characters in the interval
70  std::vector<uint64_t> rank_c_i(wt.sigma); // number of occurrence of character in [0 .. i-1]
71  std::vector<uint64_t> rank_c_j(wt.sigma); // number of occurrence of character in [0 .. j-1]
72 
73  C = std::vector<uint64_t>(257, 0);
74  interval_symbols(wt, 0, wt.size(), quantity, cs, rank_c_i, rank_c_j);
75  for (uint64_t i = 0; i < quantity; ++i)
76  {
77  unsigned char c = cs[i];
78  C[c + 1] = rank_c_j[i];
79  }
80  for (uint64_t i = 1; i < C.size() - 1; ++i) { C[i + 1] += C[i]; }
81 }
82 
84 {
85  typedef bit_vector::size_type size_type;
86  typedef std::queue<uint8_t> tQ;
87 
88  private:
89  static const uint32_t m_buffer_size = 10000; // 409600;
90  uint8_t m_write_buf[m_buffer_size];
91  uint8_t m_read_buf[m_buffer_size];
92  size_type m_widx; // write index
93  size_type m_ridx; // read index
94  bool m_sync; // are read and write buffer the same?
95  size_type m_disk_buffered_blocks; // number of blocks written to disk and not read again yet
96  char m_c;
97  size_type m_rb; // read blocks
98  size_type m_wb; // written blocks
99 
100  std::string m_file_name;
101 
102  std::fstream m_stream;
103 
104  public:
106  : m_widx(0)
107  , m_ridx(0)
108  , m_sync(true)
109  , m_disk_buffered_blocks(0)
110  , m_c('?')
111  , m_rb(0)
112  , m_wb(0)
113  {}
114 
115  void init(const std::string & dir, char c)
116  {
117  m_c = c;
118  m_file_name = dir + "buffered_char_queue_" + util::to_string(util::pid());
119  // m_stream.rdbuf()->pubsetbuf(0, 0);
120  }
121 
123  {
124  m_stream.close();
125  sdsl::remove(m_file_name);
126  }
127 
128  void push_back(uint8_t x)
129  {
130  m_write_buf[m_widx] = x;
131  if (m_sync) { m_read_buf[m_widx] = x; }
132  ++m_widx;
133  if (m_widx == m_buffer_size)
134  {
135  if (!m_sync)
136  { // if not sync, write block to disk
137  if (!m_stream.is_open())
138  {
139  m_stream.open(m_file_name, std::ios::in | std::ios::out | std::ios::binary | std::ios::trunc);
140  }
141  m_stream.seekp(m_buffer_size * (m_wb++), std::ios::beg);
142  m_stream.write((char *)m_write_buf, m_buffer_size);
143  ++m_disk_buffered_blocks;
144  }
145  m_sync = 0;
146  m_widx = 0;
147  }
148  }
149 
150  uint8_t pop_front()
151  {
152  uint8_t x = m_read_buf[m_ridx];
153  ++m_ridx;
154  if (m_ridx == m_buffer_size)
155  {
156  if (m_disk_buffered_blocks > 0)
157  {
158  m_stream.seekg(m_buffer_size * (m_rb++), std::ios::beg);
159  m_stream.read((char *)m_read_buf, m_buffer_size);
160  --m_disk_buffered_blocks;
161  }
162  else
163  { // m_disk_buffered_blocks == 0
164  m_sync = 1;
165  memcpy(m_read_buf, m_write_buf, m_widx + 1);
166  }
167  m_ridx = 0;
168  }
169  return x;
170  }
171 };
172 
175 
176 template <class size_type_class>
177 void push_front_m_index(size_type_class i,
178  uint8_t c,
179  tLI (&m_list)[256],
180  uint8_t (&m_chars)[256],
181  size_type_class & m_char_count)
182 {
183  if (m_list[c].empty()) { m_chars[m_char_count++] = c; }
184  m_list[c].push_front(i);
185 }
186 
187 template <class size_type_class>
188 void push_back_m_index(size_type_class i,
189  uint8_t c,
190  tLI (&m_list)[256],
191  uint8_t (&m_chars)[256],
192  size_type_class & m_char_count)
193 {
194  if (m_list[c].empty()) { m_chars[m_char_count++] = c; }
195  m_list[c].push_back(i);
196 }
197 
198 } // namespace sdsl
199 
200 #endif
void init(const std::string &dir, char c)
uint64_t size() const
Returns the number of elements currently stored.
void close(bool remove_file=false)
Close the int_vector_buffer.
int_vector_size_type size_type
Definition: int_vector.hpp:266
int_vector.hpp contains the sdsl::int_vector class.
int_vector ::size_type size_type
uint64_t pid()
std::string to_string(const T &t, int w=1)
Namespace for the succinct data structure library.
void push_back_m_index(size_type_class i, uint8_t c, tLI(&m_list)[256], uint8_t(&m_chars)[256], size_type_class &m_char_count)
void interval_symbols(const t_wt &wt, typename t_wt::size_type i, typename t_wt::size_type j, typename t_wt::size_type &k, std::vector< typename t_wt::value_type > &cs, std::vector< typename t_wt::size_type > &rank_c_i, std::vector< typename t_wt::size_type > &rank_c_j)
For each symbol c in wt[i..j-1] get rank(i,c) and rank(j,c).
void create_C_array(std::vector< uint64_t > &C, const tWT &wt)
void insert_lcp_values(int_vector<> &partial_lcp, bit_vector &index_done, std::string lcp_file, uint64_t max_lcp_value, uint64_t lcp_value_offset)
Merges a partial LCP array into the LCP array on disk.
bool empty(const range_type &r)
Empty range check.
Definition: wt_helper.hpp:782
int rename(const std::string &old_filename, const std::string &new_filename)
Rename a file.
Definition: ram_fs.hpp:204
std::vector< int_vector<>::size_type > tVI
int remove(const std::string &)
Remove a file.
Definition: ram_fs.hpp:194
std::list< int_vector<>::size_type > tLI
void push_front_m_index(size_type_class i, uint8_t c, tLI(&m_list)[256], uint8_t(&m_chars)[256], size_type_class &m_char_count)
static SDSL_CONSTEXPR uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:661