RESTinio
Loading...
Searching...
No Matches
percent_encoding.hpp
Go to the documentation of this file.
1/*
2 restinio
3*/
4
5/*!
6 Percent encoding routine.
7*/
8
9#pragma once
10
11#include <string>
12
13#include <restinio/impl/include_fmtlib.hpp>
14
15#include <restinio/string_view.hpp>
16#include <restinio/exception.hpp>
17#include <restinio/expected.hpp>
18
19#include <restinio/utils/utf8_checker.hpp>
20
21namespace restinio
22{
23
24namespace utils
25{
26
27/*!
28 * @brief The default traits for escaping and unescaping symbols in
29 * a query string.
30 *
31 * Unescaped asterisk is not allowed.
32 *
33 * @since v.0.4.9.1
34 */
36{
//! Tells whether a char may stay unescaped in a query string.
/*!
 * A char is ordinary when it is an ASCII digit, an ASCII letter or one
 * of `-`, `.`, `~`, `_`. Everything else (including `*`) has to be
 * percent-encoded.
 *
 * @param c The char to be checked.
 * @return true if @a c doesn't require escaping.
 */
[[nodiscard]]
static constexpr bool
ordinary_char( char c ) noexcept
{
	return
		( '0' <= c && c <= '9' ) ||
		( 'a' <= c && c <= 'z' ) ||
		( 'A' <= c && c <= 'Z' ) ||
		'-' == c ||
		'.' == c ||
		'~' == c ||
		'_' == c;
}
49};
50
51/*!
52 * @brief Traits for escaping and unescaping symbols in
53 * a query string in correspondence with application/x-www-form-urlencoded
54 * rules.
55 *
56 * Reference for more details: https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer
57 *
58 * @since v.0.6.5
59 */
61{
//! Tells whether a char may stay unescaped in a query string.
/*!
 * A char is ordinary when it is an ASCII digit, an ASCII letter or one
 * of `*`, `-`, `.`, `_`. Note that `~` is not in this set.
 *
 * @param c The char to be checked.
 * @return true if @a c doesn't require escaping.
 */
[[nodiscard]]
static constexpr bool
ordinary_char( char c ) noexcept
{
	return
		( '0' <= c && c <= '9' ) ||
		( 'a' <= c && c <= 'z' ) ||
		( 'A' <= c && c <= 'Z' ) ||
		'*' == c ||
		'-' == c ||
		'.' == c ||
		'_' == c;
}
74};
75
76/*!
77 * @brief Traits for escaping and unescaping symbols in
78 * a query string in very relaxed mode.
79 *
80 * In that mode all characters described in that rule from
81 * [RFC3986](https://tools.ietf.org/html/rfc3986) can be used as unescaped:
82@verbatim
83query = *( pchar / "/" / "?" )
84pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
85unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
86reserved = gen-delims / sub-delims
87gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
88sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
89 / "*" / "+" / "," / ";" / "="
90@endverbatim
91 *
92 * Additionally, these traits allow an unescaped space character.
93 *
94 * @since v.0.6.5
95 */
97{
//! Tells whether a char may stay unescaped in a query string.
/*!
 * A char is ordinary when it is a space, an ASCII letter or digit,
 * or one of the unreserved, gen-delims or sub-delims symbols listed
 * in the table below.
 *
 * @note
 * The NUL char is never ordinary. The table is scanned only up to its
 * terminating zero, so the terminator itself can't produce a match
 * (a std::strchr-based lookup would report a match for NUL).
 *
 * @param c The char to be checked.
 * @return true if @a c doesn't require escaping.
 */
[[nodiscard]]
static constexpr bool
ordinary_char( char c ) noexcept
{
	constexpr char allowed_chars[] =
		" " // Space
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ" // ALPHA
		"abcdefghijklmnopqrstuvwxyz"
		"0123456789" // DIGIT
		"-._~" // unreserved
		":/?#[]@" // gen-delims
		"!$&'()*+,;="; // sub-delims

	for( const char * p = allowed_chars; '\0' != *p; ++p )
	{
		if( *p == c )
			return true;
	}

	return false;
}
110};
111
112/*!
113 * @brief The traits for escaping and unescaping symbols in
114 * JavaScript-compatible mode.
115 *
116 * The following symbols are allowed to be unescaped:
117 * `-`, `.`, `~`, `_`, `*`, `!`, `'`, `(`, `)`
118 *
119 * @note
120 * The list of allowed symbols was extended in v.0.6.5.
121 *
122 * @since v.0.4.9.1, v.0.6.5
123 */
125{
//! Tells whether a char may stay unescaped in a query string.
/*!
 * A char is ordinary when it is an ASCII digit, an ASCII letter or one
 * of `-`, `.`, `~`, `_`, `*`, `!`, `'`, `(`, `)`.
 *
 * @param c The char to be checked.
 * @return true if @a c doesn't require escaping.
 */
[[nodiscard]]
static constexpr bool
ordinary_char( char c ) noexcept
{
	return
		( '0' <= c && c <= '9' ) ||
		( 'a' <= c && c <= 'z' ) ||
		( 'A' <= c && c <= 'Z' ) ||
		'-' == c ||
		'.' == c ||
		'~' == c ||
		'_' == c ||
		'*' == c ||
		'!' == c ||
		'\'' == c ||
		'(' == c ||
		')' == c;
}
143};
144
145/*!
146 * @brief Type that indicates that unescaping of percent-encoded symbols
147 * completed successfully.
148 *
149 * @since v.0.6.5
150 */
152
153/*!
154 * @brief Type that indicates a failure of unescaping of percent-encoded
155 * symbols.
156 *
157 * @since v.0.6.5
158 */
class unescape_percent_encoding_failure_t
{
	//! Description of a failure.
	std::string m_description;

public:
	//! Initializing constructor.
	/*!
	 * @param description Text that explains the failure. The value is
	 * moved into the object.
	 */
	unescape_percent_encoding_failure_t(
		std::string description )
		:	m_description{ std::move(description) }
	{}

	//! Get a reference to the description of the failure.
	[[nodiscard]]
	const std::string &
	description() const noexcept { return m_description; }

	//! Get out the value of the description of the failure.
	/*!
	 * This method is intended for cases when this description should be moved
	 * elsewhere (to another object like unescape_percent_encoding_failure_t or
	 * to some exception-like object).
	 *
	 * @attention
	 * The stored description is moved out, so the object holds
	 * a moved-from string after the call.
	 */
	[[nodiscard]]
	std::string
	giveout_description() noexcept { return std::move(m_description); }
};
185
186namespace impl
187{
188
//! Checks that a char is a hexadecimal digit in either letter case.
/*!
 * Only ASCII chars `0`-`9`, `a`-`f` and `A`-`F` are accepted;
 * the check doesn't depend on the current locale.
 *
 * @param c The char to be checked.
 * @return true if @a c is a hexadecimal digit.
 */
[[nodiscard]]
constexpr inline bool
is_hexdigit( char c ) noexcept
{
	return
		( '0' <= c && c <= '9' ) ||
		( 'a' <= c && c <= 'f' ) ||
		( 'A' <= c && c <= 'F' );
}
197
//! Builds a byte from two hexadecimal digits of a `%XX` sequence.
/*!
 * @pre Both @a c1 and @a c2 are hexadecimal digits (in any letter case).
 * The result is meaningless for any other input.
 *
 * @param c1 The digit that holds the high four bits.
 * @param c2 The digit that holds the low four bits.
 * @return The byte with the value `(c1 << 4) | c2`.
 */
[[nodiscard]]
inline char
extract_escaped_char( char c1, char c2 ) noexcept
{
	// The arithmetic is done in unsigned int: with a signed char
	// a high nibble >= 8 shifted by four bits doesn't fit into the type.
	const auto hex_value = []( char digit ) noexcept -> unsigned int {
		if( '0' <= digit && digit <= '9' )
			return static_cast<unsigned int>( digit - '0' );

		// Setting bit 0x20 turns 'A'-'F' into 'a'-'f'.
		return 10u + static_cast<unsigned int>( ( digit | 0x20 ) - 'a' );
	};

	const unsigned int decoded = ( hex_value( c1 ) << 4 ) | hex_value( c2 );

	return static_cast<char>( static_cast<unsigned char>( decoded ) );
}
223
224//
225// do_unescape_percent_encoding
226//
227/*!
228 * @brief The actual implementation of unescape-percent-encoding procedure.
229 *
230 * @since v.0.6.5
231 */
232template<
233 typename Traits,
234 typename Chars_Collector >
235[[nodiscard]]
240 const string_view_t data,
241 Chars_Collector && collector )
242{
243 std::size_t chars_to_handle = data.size();
244 const char * d = data.data();
245
246 utf8_checker_t utf8_checker;
247 bool expect_next_utf8_byte = false;
248
249 const auto current_pos = [&d, &data]() noexcept { return d - data.data(); };
250
251 while( 0 < chars_to_handle )
252 {
253 char c = *d;
254 if( expect_next_utf8_byte && '%' != c )
255 return make_unexpected( unescape_percent_encoding_failure_t{
256 fmt::format(
258 "next byte from UTF-8 sequence expected at {}" ),
259 current_pos() )
260 } );
261
262 if( '%' == c )
263 {
264 if( chars_to_handle >= 3 &&
265 is_hexdigit( d[ 1 ] ) &&
266 is_hexdigit( d[ 2 ] ) )
267 {
268 const auto ch = extract_escaped_char( d[ 1 ], d[ 2 ] );
269 if( !utf8_checker.process_byte( static_cast<std::uint8_t>(ch) ) )
270 return make_unexpected( unescape_percent_encoding_failure_t{
271 fmt::format(
273 "invalid UTF-8 sequence detected at {}" ),
274 current_pos() )
275 } );
276
277 collector( ch );
278 chars_to_handle -= 3;
279 d += 3;
280
281 expect_next_utf8_byte = !utf8_checker.finalized();
282 if( !expect_next_utf8_byte )
283 utf8_checker.reset();
284 }
285 else
286 {
287 return make_unexpected( unescape_percent_encoding_failure_t{
288 fmt::format(
290 "invalid escape sequence at pos {}" ),
291 current_pos() )
292 } );
293 }
294 }
295 else if( '+' == c )
296 {
297 collector( ' ' );
298 --chars_to_handle;
299 ++d;
300 }
301 else if( Traits::ordinary_char( c ) )
302 {
303 collector( c );
304 --chars_to_handle;
305 ++d;
306 }
307 else
308 {
309 return make_unexpected( unescape_percent_encoding_failure_t{
310 fmt::format(
312 "invalid non-escaped char with code {:#02X} at pos: {}" ),
313 c,
314 current_pos() )
315 } );
316 }
317 }
318
319 if( expect_next_utf8_byte )
320 return make_unexpected( unescape_percent_encoding_failure_t{
321 fmt::format(
322 RESTINIO_FMT_FORMAT_STRING( "unfinished UTF-8 sequence" ) )
323 } );
324
326}
327
328} /* namespace impl */
329
330//! Percent encoding.
331//! \{
332template< typename Traits = restinio_default_unescape_traits >
333[[nodiscard]]
334std::string
335escape_percent_encoding( const string_view_t data )
336{
337 std::string result;
338 const auto escaped_chars_count = static_cast<std::size_t>(
339 std::count_if(
340 data.begin(),
341 data.end(),
342 []( auto c ){ return !Traits::ordinary_char(c); } ));
343
344 if( 0 == escaped_chars_count )
345 {
346 // No escaped chars.
347 result.assign( data.data(), data.size() );
348 }
349 else
350 {
351 // Having escaped chars.
352 result.reserve( data.size() + 2*escaped_chars_count );
353 for( auto c : data )
354 {
355 if( Traits::ordinary_char( c ) )
356 result += c;
357 else
358 {
359 result += fmt::format( RESTINIO_FMT_FORMAT_STRING( "%{:02X}" ), c );
360 }
361 }
362 }
363
364 return result;
365}
366
367template< typename Traits = restinio_default_unescape_traits >
368[[nodiscard]]
369std::string
370unescape_percent_encoding( const string_view_t data )
371{
372 std::string result;
373 result.reserve( data.size() );
374
375 auto r = impl::do_unescape_percent_encoding<Traits>(
376 data,
377 [&result]( char ch ) { result += ch; } );
378 if( !r )
379 throw exception_t{ r.error().giveout_description() };
380
381 return result;
382}
383
384/*!
385 * @brief Helper function for unescaping percent-encoded string.
386 *
387 * This function doesn't throw if some character can't be unescaped or
388 * some ill-formed sequence is found.
389 *
390 * @note
391 * This function is not noexcept and can throw on other types of
392 * failures (like inability to allocate memory).
393 *
394 * @since v.0.6.5
395 */
396template< typename Traits = restinio_default_unescape_traits >
397[[nodiscard]]
399try_unescape_percent_encoding( const string_view_t data )
400{
401 std::string result;
402 result.reserve( data.size() );
403
404 auto r = impl::do_unescape_percent_encoding<Traits>(
405 data,
406 [&result]( char ch ) { result += ch; } );
407 if( !r )
408 return make_unexpected( std::move(r.error()) );
409
410 return std::move(result);
411}
412
413template< typename Traits = restinio_default_unescape_traits >
414[[nodiscard]]
415std::size_t
416inplace_unescape_percent_encoding( char * data, std::size_t size )
417{
418 std::size_t result_size = 0u;
419 char * dest = data;
420
421 auto r = impl::do_unescape_percent_encoding<Traits>(
422 string_view_t{ data, size },
423 [&result_size, &dest]( char ch ) {
424 *dest++ = ch;
425 ++result_size;
426 } );
427 if( !r )
428 throw exception_t{ r.error().giveout_description() };
429
430 return result_size;
431}
432
433/*!
434 * @brief Helper function for unescaping percent-encoded string inplace.
435 *
436 * This function doesn't throw if some character can't be unescaped or
437 * some ill-formed sequence is found.
438 *
439 * @note
440 * This function is not noexcept and can throw on other types of
441 * failures.
442 *
443 * @since v.0.6.5
444 */
445template< typename Traits = restinio_default_unescape_traits >
446[[nodiscard]]
448try_inplace_unescape_percent_encoding( char * data, std::size_t size )
449{
450 std::size_t result_size = 0u;
451 char * dest = data;
452
453 auto r = impl::do_unescape_percent_encoding<Traits>(
454 string_view_t{ data, size },
455 [&result_size, &dest]( char ch ) {
456 *dest++ = ch;
457 ++result_size;
458 } );
459 if( !r )
460 return make_unexpected( std::move(r.error()) );
461
462 return result_size;
463}
464
465//! \}
466
468{
469
471{
472
473namespace impl
474{
475
476/*!
477 * @brief Is this symbol a part of unreserved set?
478 *
479 * See https://tools.ietf.org/html/rfc3986#section-2.3 for more details.
480 *
481 * @since v.0.6.2
482 */
483[[nodiscard]]
484constexpr inline bool
485is_unreserved_char( const char ch ) noexcept
486{
487 // In this version of RESTinio class restinio_default_unescape_traits
488 // already implements necessary check.
490}
491
492/*!
493 * @brief Internal helper to perform the main logic of enumeration
494 * of symbols in URI.
495 *
496 * Inspects the content of \a what and calls \a one_byte_handler if
497 * a single character should be used as output, otherwise calls
498 * \a three_bytes_handler (if percent-encoding sequence from three chars
499 * should be passed to the output as is).
500 *
501 * @attention
502 * Throws if invalid UTF-8 sequence is found.
503 *
504 * @since v.0.6.5
505 */
506template<
507 typename One_Byte_Handler,
508 typename Three_Byte_Handler >
509void
511 string_view_t what,
512 One_Byte_Handler && one_byte_handler,
513 Three_Byte_Handler && three_byte_handler )
514{
515 using namespace restinio::utils::impl;
516
517 std::size_t chars_to_handle = what.size();
518 const char * d = what.data();
519
520 utf8_checker_t utf8_checker;
521 bool expect_next_utf8_byte = false;
522
523 const auto current_pos = [&d, &what]() noexcept { return d - what.data(); };
524
525 while( 0 < chars_to_handle )
526 {
527 if( expect_next_utf8_byte && '%' != *d )
528 throw exception_t{
529 fmt::format(
531 "next byte from UTF-8 sequence expected at {}" ),
532 current_pos() )
533 };
534
535 if( '%' != *d )
536 {
537 // Just one symbol to the output.
538 one_byte_handler( *d );
539 ++d;
540 --chars_to_handle;
541 }
542 else if( chars_to_handle >= 3 &&
543 is_hexdigit( d[ 1 ] ) && is_hexdigit( d[ 2 ] ) )
544 {
545 const char ch = extract_escaped_char( d[ 1 ], d[ 2 ] );
546 if( !utf8_checker.process_byte( static_cast<std::uint8_t>(ch) ) )
547 throw exception_t{
548 fmt::format(
550 "invalid UTF-8 sequence detected at {}" ),
551 current_pos() )
552 };
553
554 bool keep_three_bytes = true;
555
556 if( utf8_checker.finalized() )
557 {
558 expect_next_utf8_byte = false;
559
560 const auto symbol = utf8_checker.current_symbol();
561 utf8_checker.reset();
562
563 if( symbol < 0x80u )
564 {
565 const char ascii_char = static_cast<char>(symbol);
566 if( is_unreserved_char( ascii_char ) )
567 {
568 // percent encoded char will be replaced by one char.
569 one_byte_handler( ascii_char );
570 keep_three_bytes = false;
571 }
572 }
573 }
574 else
575 {
576 expect_next_utf8_byte = true;
577 }
578
579 if( keep_three_bytes )
580 {
581 // this part of multi-byte char will go to the output as is.
582 three_byte_handler( d[ 0 ], d[ 1 ], d[ 2 ] );
583 }
584
585 chars_to_handle -= 3;
586 d += 3u;
587 }
588 else
589 {
590 throw exception_t{
591 fmt::format(
592 RESTINIO_FMT_FORMAT_STRING( "invalid escape sequence at pos {}" ),
593 current_pos() )
594 };
595 }
596 }
597
598 if( expect_next_utf8_byte )
599 throw exception_t{
600 fmt::format( RESTINIO_FMT_FORMAT_STRING( "unfinished UTF-8 sequence" ) ) };
601}
602
603} /* namespace impl */
604
605/*!
606 * @brief Calculate the size of a buffer to hold normalized value of a URI.
607 *
608 * If @a what has some chars from unreserved set in percent-encoded form
609 * then this function returns the size of a buffer to hold normalized value
610 * of @a what. Otherwise the original size of @a what is returned.
611 *
612 * @note
613 * This function throws if @a what has an invalid value.
614 *
615 * @since v.0.6.2
616 */
617[[nodiscard]]
618inline std::size_t
620 string_view_t what )
621{
622 std::size_t calculated_capacity = 0u;
623
624 impl::run_normalization_algo( what,
625 [&calculated_capacity]( char ) noexcept {
626 ++calculated_capacity;
627 },
628 [&calculated_capacity]( char, char, char ) noexcept {
629 calculated_capacity += 3u;
630 } );
631
632 return calculated_capacity;
633}
634
635/*!
636 * @brief Perform normalization of URI value.
637 *
638 * Copies the content of @a what into @a dest and replaces the
639 * percent-encoded representation of chars from unreserved set into
640 * their normal values.
641 *
642 * @attention
643 * The capacity of @a dest should be enough to hold the result value.
644 * It's assumed that estimate_required_capacity() is called before that
645 * function and the result of estimate_required_capacity() is used for
646 * allocation of a buffer for @a dest.
647 *
648 * @note
649 * This function throws if @a what has an invalid value.
650 *
651 * @since v.0.6.2
652 */
653inline void
655 string_view_t what,
656 char * dest )
657{
658 impl::run_normalization_algo( what,
659 [&dest]( char ch ) noexcept {
660 *dest++ = ch;
661 },
662 [&dest]( char ch1, char ch2, char ch3 ) noexcept {
663 dest[ 0 ] = ch1;
664 dest[ 1 ] = ch2;
665 dest[ 2 ] = ch3;
666 dest += 3;
667 } );
668}
669
670} /* namespace unreserved_chars */
671
672} /* namespace uri_normalization */
673
674} /* namespace utils */
675
676} /* namespace restinio */
Exception class for all exceptions thrown by RESTinio.
Definition exception.hpp:26
Type that indicates a failure of unescaping of percent-encoded symbols.
const std::string & description() const noexcept
Get a reference to the description of the failure.
std::string giveout_description() noexcept
Get out the value of the description of the failure.
std::string m_description
Description of a failure.
Helper class for checking UTF-8 byte sequence during parsing URI or incoming byte stream.
bool process_byte(std::uint8_t byte) noexcept
bool finalized() const noexcept
std::uint32_t current_symbol() const noexcept
#define RESTINIO_FMT_FORMAT_STRING(s)
char extract_escaped_char(char c1, char c2)
expected_t< unescape_percent_encoding_success_t, unescape_percent_encoding_failure_t > do_unescape_percent_encoding(const string_view_t data, Chars_Collector &&collector)
The actual implementation of unescape-percent-encoding procedure.
constexpr bool is_unreserved_char(const char ch) noexcept
Is this symbol a part of unreserved set?
void run_normalization_algo(string_view_t what, One_Byte_Handler &&one_byte_handler, Three_Byte_Handler &&three_byte_handler)
Internal helper to perform the main logic of enumeration of symbols in URI.
std::size_t estimate_required_capacity(string_view_t what)
Calculate the size of a buffer to hold normalized value of a URI.
void normalize_to(string_view_t what, char *dest)
Perform normalization of URI value.
std::string escape_percent_encoding(const string_view_t data)
Percent encoding.
std::string unescape_percent_encoding(const string_view_t data)
std::size_t inplace_unescape_percent_encoding(char *data, std::size_t size)
expected_t< std::size_t, unescape_percent_encoding_failure_t > try_inplace_unescape_percent_encoding(char *data, std::size_t size)
Helper function for unescaping percent-encoded string inplace.
expected_t< std::string, unescape_percent_encoding_failure_t > try_unescape_percent_encoding(const string_view_t data)
Helper function for unescaping percent-encoded string.
The traits for escaping and unexcaping symbols in JavaScript-compatible mode.
static constexpr bool ordinary_char(char c) noexcept
Traits for escaping and unescaping symbols in a query string in very relaxed mode.
static bool ordinary_char(char c) noexcept
The default traits for escaping and unexcaping symbols in a query string.
static constexpr bool ordinary_char(char c) noexcept
Type that indicates that unescaping of percent-encoded symbols completed successfully.
Traits for escaping and unexcaping symbols in a query string in correspondence with application/x-www...
static constexpr bool ordinary_char(char c) noexcept