SDSL  3.0.0
Succinct Data Structure Library
construct_sa.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
9 #ifndef INCLUDED_SDSL_CONSTRUCT_SA
10 #define INCLUDED_SDSL_CONSTRUCT_SA
11 
12 #include <sdsl/config.hpp>
14 #include <sdsl/construct_sa_se.hpp>
15 #include <sdsl/divsufsort.hpp>
16 #include <sdsl/int_vector.hpp>
17 #include <sdsl/qsufsort.hpp>
18 
19 namespace sdsl
20 {
21 
23 
44 inline void construct_sa_se(cache_config & config)
45 {
46  int_vector<8> text;
48 
49  if (text.size() <= 2)
50  {
51  // If text is c$ or $ write suffix array [1, 0] or [0]
52  int_vector_buffer<> sa(cache_file_name(conf::KEY_SA, config), std::ios::out, 8, 2);
53  if (text.size() == 2) { sa.push_back(1); }
54  sa.push_back(0);
55  }
56  else
57  {
58  _construct_sa_se<int_vector<8>>(text, cache_file_name(conf::KEY_SA, config), 256, 0);
59  }
61 }
62 
63 namespace algorithm
64 {
65 
66 //
67 // Forward declarations
68 //----------------------------------------------------------
69 
71 
78 template <typename t_int_vec>
79 void calculate_sa(const unsigned char * c, typename t_int_vec::size_type len, t_int_vec & sa)
80 {
81  typedef typename t_int_vec::size_type size_type;
82  constexpr uint8_t t_width = t_int_vec::fixed_int_width;
83  if (len <= 1)
84  { // handle special case
85  sa.width(1);
86  sa.resize(len);
87  if (len > 0) sa[0] = 0;
88  return;
89  }
90  bool small_file = (sizeof(len) <= 4 or len < 0x7FFFFFFFULL);
91  if (small_file)
92  {
93  uint8_t sa_width = sa.width();
94  if (32 == t_width or (0 == t_width and 32 >= sa_width))
95  {
96  sa.width(32);
97  sa.resize(len);
98  divsufsort(c, (int32_t *)sa.data(), (int32_t)len);
99  // copy integers back to the right positions
100  if (sa_width != 32)
101  {
102  for (size_type i = 0, p = 0; i < len; ++i, p += sa_width)
103  {
104  sa.set_int(p, sa.get_int(i << 5, 32), sa_width);
105  }
106  sa.width(sa_width);
107  sa.resize(len);
108  }
109  }
110  else
111  {
112  if (sa.width() < bits::hi(len) + 1)
113  {
114  throw std::logic_error("width of int_vector is to small for the text!!!");
115  }
116  int_vector<> sufarray(len, 0, 32);
117  divsufsort(c, (int32_t *)sufarray.data(), (int32_t)len);
118  sa.resize(len);
119  for (size_type i = 0; i < len; ++i) { sa[i] = sufarray[i]; }
120  }
121  }
122  else
123  {
124  uint8_t sa_width = sa.width();
125  sa.width(64);
126  sa.resize(len);
127  divsufsort64(c, (int64_t *)sa.data(), len);
128  // copy integers back to the right positions
129  if (sa_width != 64)
130  {
131  for (size_type i = 0, p = 0; i < len; ++i, p += sa_width)
132  {
133  sa.set_int(p, sa.get_int(i << 6, 64), sa_width);
134  }
135  sa.width(sa_width);
136  sa.resize(len);
137  }
138  }
139 }
140 
141 } // end namespace algorithm
142 
144 
159 template <uint8_t t_width>
161 {
162  static_assert(t_width == 0 or t_width == 8,
163  "construct_sa: width must be `0` for integer alphabet and `8` for byte alphabet");
165  if (t_width == 8)
166  {
167  if (construct_config().byte_algo_sa == LIBDIVSUFSORT)
168  {
169  read_only_mapper<t_width> text(KEY_TEXT, config);
170  auto sa = write_out_mapper<0>::create(cache_file_name(conf::KEY_SA, config), 0, bits::hi(text.size()) + 1);
171  // call divsufsort
172  algorithm::calculate_sa((const unsigned char *)text.data(), text.size(), sa);
173  }
174  else if (construct_config().byte_algo_sa == SE_SAIS)
175  {
176  construct_sa_se(config);
177  }
178  }
179  else if (t_width == 0)
180  {
181  // call qsufsort
182  int_vector<> sa;
183  sdsl::qsufsort::construct_sa(sa, cache_file_name(KEY_TEXT, config).c_str(), 0);
184  store_to_cache(sa, conf::KEY_SA, config);
185  }
186  else
187  {
188  std::cerr << "Unknown alphabet type" << std::endl;
189  }
190 }
191 
192 } // end namespace sdsl
193 
194 #endif
void push_back(const uint64_t value)
Appends the given element value to the end of the int_vector_buffer.
const uint64_t * data() const
const uint64_t * data() const noexcept
Pointer to the raw data of the int_vector.
Definition: int_vector.hpp:590
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
Definition: int_vector.hpp:619
size_type size() const noexcept
The number of elements in the int_vector.
static int_vector_mapper< t_width > create(const std::string &key, cache_config &config)
int_vector.hpp contains the sdsl::int_vector class.
void calculate_sa(const unsigned char *c, typename t_int_vec::size_type len, t_int_vec &sa)
Calculates the Suffix Array for a text.
constexpr char KEY_SA[]
Definition: config.hpp:37
constexpr char KEY_TEXT[]
Definition: config.hpp:41
int_vector ::size_type size_type
void construct_sa(int_vector_type &sa, const char *file, uint8_t num_bytes)
Construct a suffix array for the sequence stored in a file.
Definition: qsufsort.hpp:60
Namespace for the succinct data structure library.
std::string cache_file_name(const std::string &key, const cache_config &config)
Returns the file name of the resource.
Definition: io.hpp:630
@ LIBDIVSUFSORT
Definition: config.hpp:61
@ SE_SAIS
Definition: config.hpp:62
construct_config_data & construct_config()
int32_t divsufsort(const uint8_t *T, saidx_t *SA, saidx_t n)
void construct_sa_se(cache_config &config)
Constructs the Suffix Array (SA) from text over byte-alphabet.
bool load_from_file(T &v, const std::string &file)
Load sdsl-object v from a file.
Definition: io.hpp:901
void register_cache_file(const std::string &key, cache_config &config)
Register the existing resource specified by the key to the cache.
Definition: io.hpp:656
void construct_sa(cache_config &config)
Constructs the Suffix Array (SA) from text over byte- or integer-alphabet.
bool store_to_cache(const T &v, const std::string &key, cache_config &config, bool add_type_hash=false)
Stores the object v as a resource in the cache.
Definition: io.hpp:737
int32_t divsufsort64(const uint8_t *T, int64_t *SA, int64_t n)
qsufsort.hpp contains the interface for the suffix array construction algorithm of Larsson.
static SDSL_CONSTEXPR uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:661
Helper class for construction process.
Definition: config.hpp:67
Helper classes to transform width=0 and width=8 to corresponding text key.
Definition: config.hpp:91