RESTinio
Loading...
Searching...
No Matches
utf8_checker.hpp
Go to the documentation of this file.
1/*
2 * RESTinio
3 */
4
5/*!
6 * @file
7 * @brief An implementation of checker for UTF-8 sequences.
8 *
9 * @since v.0.6.5
10 */
11
12#pragma once
13
14#include <restinio/compiler_features.hpp>
15
16#include <cstdint>
17
18namespace restinio
19{
20
21namespace utils
22{
23
24//
25// utf8_checker_t
26//
27
28/*!
29 * @brief Helper class for checking UTF-8 byte sequence during parsing
30 * URI or incoming byte stream.
31 */
33{
34 //! Enumeration of all possible checker states.
46
47 //! The current UNICODE symbol.
48 /*!
49 * Contains a valid value only if some bytes were successfully
50 * processed by process_byte() and the current state is
51 * wait_first_byte.
52 */
54
55 //! The current state of the checker.
57
//! Handles the leading byte of a sequence: classifies it by its high-bit
//! pattern and stores its payload bits in m_current_symbol.
//!
//! NOTE(review): the statement that follows each branch (presumably the
//! m_state transition) is not visible in this listing -- confirm against
//! the original header.
58 void
59 on_first_byte( std::uint8_t byte ) noexcept
60 {
61 if( byte <= 0x7Fu ) // 0xxxxxxx: the byte itself is the value.
62 {
64 m_current_symbol = byte;
65 }
66 else if( 0xC0u == (byte & 0xE0u) ) // 110xxxxx: keep the low 5 bits.
67 {
69 m_current_symbol = (byte & 0x1Fu);
70 }
71 else if( 0xE0u == (byte & 0xF0u) ) // 1110xxxx: keep the low 4 bits.
72 {
74 m_current_symbol = (byte & 0x0Fu);
75 }
76 else if( 0xF0u == (byte & 0xF8u) ) // 11110xxx: keep the low 3 bits.
77 {
79 m_current_symbol = (byte & 0x07u);
80 }
81 else
82 {
83 // Because UTF-8 can represent only ranges from:
84 //
85 // 0000 0000-0000 007F
86 // 0000 0080-0000 07FF
87 // 0000 0800-0000 FFFF
88 // 0001 0000-0010 FFFF
89 //
90 // There is no need to check masks like 0b111110xx and so on.
91 //
92 // See https://datatracker.ietf.org/doc/html/rfc3629
93 //
95 }
96 }
97
//! Handles the second (last) byte of a two-byte sequence.
//!
//! A byte matching 10xxxxxx contributes its low 6 bits to
//! m_current_symbol, then the accumulated value is checked for
//! overlong encoding.
//!
//! NOTE(review): the statements executed on the accept/reject paths
//! (presumably m_state assignments) are not visible in this listing.
98 void
99 on_second_of_two( std::uint8_t byte ) noexcept
100 {
101 if( 0x80u == (byte & 0xC0u) ) // Must look like 10xxxxxx.
102 {
103 m_current_symbol <<= 6;
104 m_current_symbol |= (byte & 0x3Fu);
105
106 // Check for an overlong sequence.
107 // The valid range for the two-byte representation is 0x0080..0x07FF.
108 if( m_current_symbol < 0x0080u )
109 {
110 // The value is too small, it's overlong.
112 }
113 else
114 // There is no need to check the result value against
115 // invalid ranges (0xD800..0xDFFF and 0x110000..)
116 // because two bytes can only represent 0x0080..0x07FF.
118 }
119 else
120 {
122 }
123 }
124
//! Handles the second byte of a three-byte sequence.
//!
//! A byte matching 10xxxxxx contributes its low 6 bits to
//! m_current_symbol; no range check is visible at this step.
//!
//! NOTE(review): the statements executed on the accept/reject paths
//! (presumably m_state assignments) are not visible in this listing.
125 void
126 on_second_of_three( std::uint8_t byte ) noexcept
127 {
128 if( 0x80u == (byte & 0xC0u) ) // Must look like 10xxxxxx.
129 {
130 m_current_symbol <<= 6;
131 m_current_symbol |= (byte & 0x3Fu);
132
134 }
135 else
136 {
138 }
139 }
140
//! Handles the second byte of a four-byte sequence.
//!
//! A byte matching 10xxxxxx contributes its low 6 bits to
//! m_current_symbol; no range check is visible at this step.
//!
//! NOTE(review): the statements executed on the accept/reject paths
//! (presumably m_state assignments) are not visible in this listing.
141 void
142 on_second_of_four( std::uint8_t byte ) noexcept
143 {
144 if( 0x80u == (byte & 0xC0u) ) // Must look like 10xxxxxx.
145 {
146 m_current_symbol <<= 6;
147 m_current_symbol |= (byte & 0x3Fu);
148
150 }
151 else
152 {
154 }
155 }
156
//! Handles the third (last) byte of a three-byte sequence.
//!
//! A byte matching 10xxxxxx contributes its low 6 bits to
//! m_current_symbol, then the accumulated value is checked for
//! overlong encoding and for the 0xD800..0xDFFF range.
//!
//! NOTE(review): the statements executed on the accept/reject paths
//! (presumably m_state assignments) are not visible in this listing.
157 void
158 on_third_of_three( std::uint8_t byte ) noexcept
159 {
160 if( 0x80u == (byte & 0xC0u) ) // Must look like 10xxxxxx.
161 {
162 m_current_symbol <<= 6;
163 m_current_symbol |= (byte & 0x3Fu);
164
165 // Check for an overlong sequence.
166 // The valid range for the three-byte representation is 0x0800..0xFFFF.
167 if( m_current_symbol < 0x0800u )
168 {
169 // The value is too small, it's overlong.
171 }
172 else
173 {
174 // It's necessary to check for the illegal code points 0xD800..0xDFFF.
175 if( m_current_symbol >= 0xD800 && m_current_symbol <= 0xDFFF )
177 else
179 }
180 }
181 else
182 {
184 }
185 }
186
//! Handles the third byte of a four-byte sequence.
//!
//! A byte matching 10xxxxxx contributes its low 6 bits to
//! m_current_symbol; no range check is visible at this step.
//!
//! NOTE(review): the statements executed on the accept/reject paths
//! (presumably m_state assignments) are not visible in this listing.
187 void
188 on_third_of_four( std::uint8_t byte ) noexcept
189 {
190 if( 0x80u == (byte & 0xC0u) ) // Must look like 10xxxxxx.
191 {
192 m_current_symbol <<= 6;
193 m_current_symbol |= (byte & 0x3Fu);
194
196 }
197 else
198 {
200 }
201 }
202
//! Handles the fourth (last) byte of a four-byte sequence.
//!
//! A byte matching 10xxxxxx contributes its low 6 bits to
//! m_current_symbol, then the accumulated value is checked for
//! overlong encoding and for values of 0x110000 and above.
//!
//! NOTE(review): the statements executed on the accept/reject paths
//! (presumably m_state assignments) are not visible in this listing.
203 void
204 on_fourth_of_four( std::uint8_t byte ) noexcept
205 {
206 if( 0x80u == (byte & 0xC0u) ) // Must look like 10xxxxxx.
207 {
208 m_current_symbol <<= 6;
209 m_current_symbol |= (byte & 0x3Fu);
210
211 // Check for an overlong sequence.
212 // The valid range for the four-byte representation is 0x10000..0x10FFFF.
213 if( m_current_symbol < 0x10000u )
214 {
215 // The value is too small, it's overlong.
217 }
218 else
219 {
220 // It's necessary to check for values above 0x10FFFF.
221 // There is no need to check the 0xD800..0xDFFF range because
222 // any value below 0x10000 was already rejected by the overlong check.
223 if( m_current_symbol >= 0x110000 )
225 else
227 }
228 }
229 else
230 {
232 }
233 }
234
235public:
236 utf8_checker_t() = default; // Defaulted constructor. NOTE(review): the member declarations (and any in-class initializers) are not visible in this listing.
237
238 /*!
239 * Checks another byte.
240 *
241 * @note
242 * The actual value of the current symbol can be obtained only if
243 * process_byte() returns `true` and the subsequent call to
244 * finalized() returns `true`:
245 *
246 * @code
247 * utf8_checker_t checker;
248 * for( const auto ch : some_string )
249 * {
250 * if( checker.process_byte( static_cast<std::uint8_t>(ch) ) )
251 * {
252 * if( checker.finalized() )
253 * process_unicode_symbol( checker.current_symbol() );
254 * }
255 * else
256 * {
257 * ... // Invalid sequence found!
258 * break;
259 * }
260 * }
261 * @endcode
262 *
263 * @param byte The next byte of the input stream.
264 *
265 * @retval true if the sequence is still valid and the next byte
266 * can be given to the next call to process_byte().
267 *
268 * @retval false if the sequence is invalid and there is no sense
 * in continuing to call process_byte().
 */
269 [[nodiscard]]
270 bool
271 process_byte( std::uint8_t byte ) noexcept
272 {
 // Dispatch the byte to the handler that matches the current state.
 // NOTE(review): most `case` labels and some handler calls are not
 // visible in this listing; only the surviving lines are shown.
273 switch( m_state )
274 {
276 on_first_byte( byte );
277 break;
278
280 on_second_of_two( byte );
281 break;
282
285 break;
286
289 break;
290
293 break;
294
296 on_third_of_four( byte );
297 break;
298
301 break;
302
303 case state_t::invalid:
304 // Nothing to do: the byte is ignored and `false` is returned below.
305 break;
306 }
307
 // The byte is accepted unless the checker is (now) in the invalid state.
308 return (state_t::invalid != m_state);
309 }
310
311 /*!
312 * @return true if the current sequence is finalized.
313 */
314 [[nodiscard]]
315 bool
316 finalized() const noexcept
317 {
 // NOTE(review): the return statement is not visible in this listing.
319 }
320
321 /*!
322 * Returns the object to its initial state.
323 */
324 void
325 reset() noexcept
326 {
327 m_current_symbol = 0u; // Drop any partially collected symbol value.
 // NOTE(review): one more statement follows in the original (presumably
 // resetting m_state); it is not visible in this listing.
329 }
330
331 /*!
332 * Get the collected value of the current symbol.
333 *
334 * @note
335 * It returns the actual value only if:
336 *
337 * - some bytes were successfully fed into process_byte();
338 * - finalized() returns `true`.
339 */
340 [[nodiscard]]
341 std::uint32_t
342 current_symbol() const noexcept { return m_current_symbol; }
343};
344
345} /* namespace utils */
346
347} /* namespace restinio */
Helper class for checking UTF-8 byte sequence during parsing URI or incoming byte stream.
void on_fourth_of_four(std::uint8_t byte) noexcept
bool process_byte(std::uint8_t byte) noexcept
void on_third_of_three(std::uint8_t byte) noexcept
void on_second_of_three(std::uint8_t byte) noexcept
state_t
Enumeration of all possible checker states.
bool finalized() const noexcept
void on_second_of_four(std::uint8_t byte) noexcept
void on_third_of_four(std::uint8_t byte) noexcept
std::uint32_t current_symbol() const noexcept
void on_first_byte(std::uint8_t byte) noexcept
std::uint32_t m_current_symbol
The current UNICODE symbol.
void on_second_of_two(std::uint8_t byte) noexcept
state_t m_state
The current state of the checker.