SDSL  3.0.0
Succinct Data Structure Library
coder_comma.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
8 #ifndef SDSL_CODER_COMMA_INCLUDED
9 #define SDSL_CODER_COMMA_INCLUDED
10 
11 #include <algorithm>
12 #include <array>
13 #include <math.h>
14 
15 #include <sdsl/bits.hpp>
16 #include <sdsl/config.hpp>
17 
18 namespace sdsl
19 {
20 
21 namespace coder
22 {
23 
25 
46 template <uint8_t t_width = 2>
47 class comma
48 {
49  private:
50  static_assert(t_width > 1 && t_width <= 32, "comma coder: Width must be in interval [2,32]");
51 
52  static constexpr size_t base_fits_in_64(uint32_t base, uint64_t product, size_t res)
53  {
54  return product == 0 ? res : base_fits_in_64(base, product / base, res + 1);
55  }
56 
57  // base in which numbers are coded
58  static const uint32_t base = (1 << t_width) - 1;
59 
60  // table needed for computation of encoding lengths.
61  // table contains entries of the kind (index, base^index)
62  // to know how much digits a number needs to be encoded.
63  static constexpr size_t codelentbllen = base_fits_in_64(base, 0xFFFFFFFFFFFFFFFFULL, 0);
64 
65  static struct impl
66  {
67  std::array<uint64_t, codelentbllen> codelentbl;
68  impl()
69  {
70  // intialize codelentbl
71  uint64_t n = 1;
72  for (size_t i = 0; i < codelentbllen; i++)
73  {
74  codelentbl[i] = n;
75  n = (n << t_width) - n; // n = n * base
76  }
77  }
78  } data;
79 
80  // helper function to encode a single number without
81  // termination digit
82  static void encode_in_base(uint64_t x, uint64_t *& z, uint8_t & offset);
83 
84  public:
85  typedef uint64_t size_type;
86  static const uint8_t min_codeword_length = t_width; // 0 needs t_width bits as termination
87 
89 
91  // the value w in comma code.
94  static uint8_t encoding_length(uint64_t w);
95 
97  // at bit position start_idx.
98  /* \param x Positive integer to encode.
99  * \param z Raw data of vector to write the encoded form of x.
100  * \param start_idx Beginning bit index to write the encoded form ox x in z.
101  */
102  static void encode(uint64_t x, uint64_t *& z, uint8_t & offset);
103 
105  /* \param v vector containing positive integer values
106  * \param z vector to put the encoded values
107  */
108  template <class int_vector>
109  static bool encode(const int_vector & v, int_vector & z);
110 
112 
114  // in the bitstring "data"
115  /* \param data Bitstring
116  * \param start_idx Starting index of the decoding.
117  * \param n Number of values to decode from the bitstring.
118  * \param it Iterator to store the values.
119  */
120  template <bool t_sumup, bool t_inc, class t_iter>
121  static uint64_t decode(const uint64_t * data, const size_type start_idx, size_type n, t_iter it = (t_iter) nullptr);
122 
124  // beginning at start_idx in the bitstring "data"
125  // and return the sum of these values.
134  static uint64_t decode_prefix_sum(const uint64_t * data, const size_type start_idx, size_type n);
135 
137  // beginning at start_idx ending at end_idx (exclusive)
138  // in the bitstring "data"
139  // and return the sum of these values.
150  static uint64_t decode_prefix_sum(const uint64_t * data,
151  const size_type start_idx,
152  const size_type end_idx,
153  size_type n);
154 
156  // and store them in vector v.
160  template <class int_vector>
161  static bool decode(const int_vector & z, int_vector & v);
162 
163  // interface needs this function for whatever :>
164  template <class int_vector>
165  static uint64_t * raw_data(int_vector & v)
166  {
167  return v.m_data;
168  }
169 };
170 
172 
174 
176 
177 template <uint8_t t_width>
178 typename comma<t_width>::impl comma<t_width>::data;
179 
180 template <uint8_t t_width>
181 inline uint8_t comma<t_width>::encoding_length(uint64_t w)
182 {
183  // use function table and binary search to determine the number of digits
184  // needed to encode w in given base.
185  uint8_t numdigits = std::upper_bound(data.codelentbl.begin(), data.codelentbl.end(), w) - data.codelentbl.begin();
186  // finally calculate length.
187  // Don't forget termination character on calculations ;)
188  return (numdigits + 1) * t_width;
189 }
190 
191 template <uint8_t t_width>
192 void comma<t_width>::encode_in_base(uint64_t x, uint64_t *& z, uint8_t & offset)
193 {
194  if (x)
195  {
196  uint32_t digit = x % base; // get next digit
197  // encode digits with higher order
198  encode_in_base(x / base, z, offset);
199  // and write own digit
200  bits::write_int_and_move(z, digit, offset, t_width);
201  }
202 }
203 
204 template <uint8_t t_width>
205 inline void comma<t_width>::encode(uint64_t x, uint64_t *& z, uint8_t & offset)
206 {
207  // encode x itself
208  encode_in_base(x, z, offset);
209  // and append the termination digit
210  bits::write_int_and_move(z, base, offset, t_width);
211 }
212 
213 template <uint8_t t_width>
214 template <class int_vector>
216 {
217  // first, find out how much bits vector z needs to save values
218  typedef typename int_vector::size_type size_type;
219  size_type z_bit_size = 0;
220  for (typename int_vector::const_iterator it = v.begin(), end = v.end(); it != end; ++it)
221  {
222  z_bit_size += encoding_length(*it);
223  }
224 
225  // trim vector z to correct size
226  z.width(v.width());
227  z.bit_resize(z_bit_size); // for future may check if resizing works
228  z.shrink_to_fit();
229 
230  // iterate again and save values in z
231  uint64_t * z_data = z.m_data;
232  uint8_t offset = 0;
233  for (typename int_vector::const_iterator it = v.begin(), end = v.end(); it != end; ++it)
234  {
235  encode(*it, z_data, offset);
236  }
237  return true;
238 }
239 
241 
242 template <uint8_t t_width>
243 template <bool t_sumup, bool t_inc, class t_iter>
244 inline uint64_t comma<t_width>::decode(const uint64_t * data, const size_type start_idx, size_type n, t_iter it)
245 {
246  data += (start_idx >> 6); // jump to byte offset
247  uint8_t offset = start_idx & 0x3F; // and calculate bit offset
248  uint64_t value = 0;
249  for (size_type i = 0; i < n; i++)
250  {
251  // read next value
252  uint64_t v = 0;
253  for (uint32_t digit = (uint32_t)bits::read_int_and_move(data, offset, t_width); // read first digit
254  digit != base; // while digit is not the terminating digit
255  v = (v << t_width) - v + digit, // v = v * base + digit
256  digit = (uint32_t)bits::read_int_and_move(data, offset, t_width))
257  ; // and read next digit
258  // now decide how to handle value
259  value = (t_sumup) ? value + v : v;
260  if (t_inc) *(it++) = value;
261  }
262  return value;
263 }
264 
265 template <uint8_t t_width>
266 uint64_t comma<t_width>::decode_prefix_sum(const uint64_t * data, const size_type start_idx, size_type n)
267 {
268  // easiest seems to be to use already build function decode...
269  return decode<true, false, int *>(data, start_idx, n);
270  // Note for above: 3rd template parameter ca be any pntr except void *
271 }
272 
273 template <uint8_t t_width>
274 uint64_t comma<t_width>::decode_prefix_sum(const uint64_t * data,
275  const size_type start_idx,
276  SDSL_UNUSED const size_type end_idx,
277  size_type n)
278 {
279  // end index does not change anything here...
280  return decode_prefix_sum(data, start_idx, n);
281 }
282 
283 template <uint8_t t_width>
284 template <class int_vector>
286 {
287  // check if bit size is dividable through t_width.
288  if (z.bit_size() % t_width != 0) return false;
289 
290  // calculate num of overall digits in z (including terminating digit)
291  uint64_t numOfDigits = z.bit_size() / t_width;
292  // iteration vars for z vector
293  const uint64_t * z_data = z.data();
294  uint8_t z_offset = 0;
295  // utility to count number of entries in z, and last read digit
296  uint32_t digit = base;
297  typename int_vector::size_type n = 0;
298 
299  // iterate over all digits. each time a termination digit is
300  // detected, a encoded number in vector ends.
301  while (numOfDigits--)
302  {
303  digit = (uint32_t)bits::read_int_and_move(z_data, z_offset, t_width);
304  if (digit == base) n++; // termination digit detected
305  }
306 
307  // also, ensure last read digit was a termination digit
308  if (digit != base) return false;
309 
310  // resize vector v
311  v.width(z.width());
312  v.resize(n);
313  v.shrink_to_fit();
314 
315  // and finally decode and save result in v
316  decode<false, true>(z.data(), 0, n, v.begin());
317  return true;
318 }
319 
320 } // end of namespace coder
321 } // end of namespace sdsl
322 
323 #endif
bits.hpp contains the sdsl::bits class.
A class to encode and decode between comma code and binary code.
Definition: coder_comma.hpp:48
static uint64_t decode_prefix_sum(const uint64_t *data, const size_type start_idx, const size_type end_idx, size_type n)
Decode n comma gamma encoded integers.
static uint8_t encoding_length(uint64_t w)
Get the number of bits that are necessary to encode.
static uint64_t decode_prefix_sum(const uint64_t *data, const size_type start_idx, size_type n)
Decode n comma gamma encoded integers.
static void encode(uint64_t x, uint64_t *&z, uint8_t &offset)
Encode one positive integer x to an int_vector.
static const uint8_t min_codeword_length
Definition: coder_comma.hpp:86
static uint64_t decode(const uint64_t *data, const size_type start_idx, size_type n, t_iter it=(t_iter) nullptr)
Decode n comma encoded values beginning at start_idx.
static uint64_t * raw_data(int_vector &v)
A generic vector class for integers of width .
Definition: int_vector.hpp:253
iterator end() noexcept
Iterator that points to the element after the last element of int_vector.
Definition: int_vector.hpp:788
int_vector_size_type size_type
Definition: int_vector.hpp:266
size_type bit_size() const noexcept
The number of bits in the int_vector.
Definition: int_vector.hpp:571
const uint64_t * data() const noexcept
Pointer to the raw data of the int_vector.
Definition: int_vector.hpp:590
void shrink_to_fit()
Free unused allocated memory.
Definition: int_vector.hpp:530
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
Definition: int_vector.hpp:619
void resize(const size_type size)
Resize the int_vector in terms of elements.
Definition: int_vector.hpp:545
iterator begin() noexcept
Iterator that points to the first element of the int_vector.
Definition: int_vector.hpp:783
#define SDSL_UNUSED
Definition: config.hpp:13
int_vector ::size_type size_type
Namespace for the succinct data structure library.
static SDSL_CONSTEXPR void write_int_and_move(uint64_t *&word, uint64_t x, uint8_t &offset, const uint8_t len)
Writes value x to an bit position in an array and moves the bit-pointer.
Definition: bits.hpp:752
static SDSL_CONSTEXPR uint64_t read_int_and_move(const uint64_t *&word, uint8_t &offset, const uint8_t len=64)
Reads a value from a bit position in an array and moved the bit-pointer.
Definition: bits.hpp:805