RESTinio
percent_encoding.hpp
Go to the documentation of this file.
1 /*
2  restinio
3 */
4 
5 /*!
6  Percent encoding routine.
7 */
8 
9 #pragma once
10 
11 #include <string>
12 
13 #include <restinio/impl/include_fmtlib.hpp>
14 
15 #include <restinio/string_view.hpp>
16 #include <restinio/exception.hpp>
17 #include <restinio/expected.hpp>
18 
19 #include <restinio/utils/utf8_checker.hpp>
20 
21 namespace restinio
22 {
23 
24 namespace utils
25 {
26 
27 /*!
28  * @brief The default traits for escaping and unescaping symbols in
29  * a query string.
30  *
31  * Unescaped asterisk is not allowed.
32  *
33  * @since v.0.4.9.1
34  */
36 {
37  static constexpr bool
38  ordinary_char( char c ) noexcept
39  {
40  return
41  ( '0' <= c && c <= '9' ) ||
42  ( 'a' <= c && c <= 'z' ) ||
43  ( 'A' <= c && c <= 'Z' ) ||
44  '-' == c ||
45  '.' == c ||
46  '~' == c ||
47  '_' == c;
48  }
49 };
50 
51 /*!
52  * @brief Traits for escaping and unescaping symbols in
53  * a query string in correspondence with application/x-www-form-urlencoded
54  * rules.
55  *
56  * Reference for more details: https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer
57  *
58  * @since v.0.6.5
59  */
61 {
62  static constexpr bool
63  ordinary_char( char c ) noexcept
64  {
65  return
66  ( '0' <= c && c <= '9' ) ||
67  ( 'a' <= c && c <= 'z' ) ||
68  ( 'A' <= c && c <= 'Z' ) ||
69  '*' == c ||
70  '-' == c ||
71  '.' == c ||
72  '_' == c;
73  }
74 };
75 
76 /*!
77  * @brief Traits for escaping and unescaping symbols in
78  * a query string in very relaxed mode.
79  *
80  * In that mode all characters described in the following rule from
81  * [RFC3986](https://tools.ietf.org/html/rfc3986) can be used unescaped:
82 @verbatim
83 query = *( pchar / "/" / "?" )
84 pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
85 unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
86 reserved = gen-delims / sub-delims
87 gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
88 sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
89  / "*" / "+" / "," / ";" / "="
90 @endverbatim
91  *
92  * Additionally, these traits allow the use of an unescaped space character.
93  *
94  * @since v.0.6.5
95  */
97 {
98  static bool
99  ordinary_char( char c ) noexcept
100  {
101  return nullptr != std::strchr(
102  " " // Space
103  "ABCDEFGHIJKLMNOPQRSTUVWXYZ" // ALPHA
104  "abcdefghijklmnopqrstuvwxyz"
105  "0123456789" // DIGIT
106  "-._~" // unreserved
107  ":/?#[]@" // gen-delims
108  "!$&'()*+,;=", c );
109  }
110 };
111 
112 /*!
113  * @brief The traits for escaping and unescaping symbols in
114  * JavaScript-compatible mode.
115  *
116  * The following symbols are allowed to be unescaped:
117  * `-`, `.`, `~`, `_`, `*`, `!`, `'`, `(`, `)`
118  *
119  * @note
120  * The list of allowed symbols was extended in v.0.6.5.
121  *
122  * @since v.0.4.9.1, v.0.6.5
123  */
125 {
126  static constexpr bool
127  ordinary_char( char c ) noexcept
128  {
129  return
130  ( '0' <= c && c <= '9' ) ||
131  ( 'a' <= c && c <= 'z' ) ||
132  ( 'A' <= c && c <= 'Z' ) ||
133  '-' == c ||
134  '.' == c ||
135  '~' == c ||
136  '_' == c ||
137  '*' == c ||
138  '!' == c ||
139  '\'' == c ||
140  '(' == c ||
141  ')' == c;
142  }
143 };
144 
145 /*!
146  * @brief Type that indicates that unescaping of percent-encoded symbols
147  * completed successfully.
148  *
149  * @since v.0.6.5
150  */
152 
153 /*!
154  * @brief Type that indicates a failure of unescaping of percent-encoded
155  * symbols.
156  *
157  * @since v.0.6.5
158  */
160 {
161  //! Description of a failure.
163 
164 public:
166  std::string description )
168  {}
169 
170  //! Get a reference to the description of the failure.
172  const std::string &
173  description() const noexcept { return m_description; }
174 
175  //! Get out the value of the description of the failure.
176  /*!
177  * This method is intended for cases when this description should be moved
178  * elsewhere (to another object like unescape_percent_encoding_failure_t or
179  * to some exception-like object).
180  */
182  std::string
183  giveout_description() noexcept { return std::move(m_description); }
184 };
185 
186 namespace impl
187 {
188 
/*!
 * @brief Check whether a character is an ASCII hexadecimal digit.
 *
 * Only `0`-`9`, `a`-`f` and `A`-`F` are accepted. The check is done by
 * plain character comparisons, so it does not depend on the current locale.
 *
 * @param c The character to examine.
 * @return true if @a c is a hexadecimal digit, false otherwise.
 */
constexpr inline bool
is_hexdigit( char c ) noexcept
{
	return
		( '0' <= c && c <= '9' ) ||
		( 'a' <= c && c <= 'f' ) ||
		( 'A' <= c && c <= 'F' );
}
197 
/*!
 * @brief Build one byte from the two hexadecimal digits of a
 * percent-encoded triplet.
 *
 * @attention
 * Both arguments are expected to be hexadecimal digits (see is_hexdigit()).
 * The function does not validate them; for other characters the result is
 * meaningless.
 *
 * @param c1 The digit for the high 4 bits of the result.
 * @param c2 The digit for the low 4 bits of the result.
 * @return The byte with the value `(c1 << 4) + c2` in hexadecimal terms.
 */
inline char
extract_escaped_char( char c1, char c2 ) noexcept
{
	// Numeric value of a single hexadecimal digit.
	// The arithmetic is done in unsigned int, so there is no implicit
	// narrowing to char and no shift of a negative value.
	const auto nibble = []( char c ) noexcept -> unsigned int {
		if( '0' <= c && c <= '9' )
			return static_cast< unsigned int >( c - '0' );

		// Setting bit 0x20 folds 'A'..'F' onto 'a'..'f'.
		return 10u + static_cast< unsigned int >( ( c | 0x20 ) - 'a' );
	};

	const unsigned int value = ( nibble( c1 ) << 4 ) + nibble( c2 );

	// Only the low 8 bits are meaningful; convert to char in one explicit step.
	return static_cast< char >( static_cast< unsigned char >( value & 0xFFu ) );
}
223 
224 //
225 // do_unescape_percent_encoding
226 //
227 /*!
228  * @brief The actual implementation of unescape-percent-encoding procedure.
229  *
230  * @since v.0.6.5
231  */
232 template<
233  typename Traits,
234  typename Chars_Collector >
235 RESTINIO_NODISCARD
236 expected_t<
240  const string_view_t data,
242 {
244  const char * d = data.data();
245 
247  bool expect_next_utf8_byte = false;
248 
249  const auto current_pos = [&d, &data]() noexcept { return d - data.data(); };
250 
251  while( 0 < chars_to_handle )
252  {
253  char c = *d;
254  if( expect_next_utf8_byte && '%' != c )
256  fmt::format(
257  "next byte from UTF-8 sequence expected at {}",
258  current_pos() )
259  } );
260 
261  if( '%' == c )
262  {
263  if( chars_to_handle >= 3 &&
264  is_hexdigit( d[ 1 ] ) &&
265  is_hexdigit( d[ 2 ] ) )
266  {
267  const auto ch = extract_escaped_char( d[ 1 ], d[ 2 ] );
268  if( !utf8_checker.process_byte( static_cast<std::uint8_t>(ch) ) )
270  fmt::format( "invalid UTF-8 sequence detected at {}",
271  current_pos() )
272  } );
273 
274  collector( ch );
275  chars_to_handle -= 3;
276  d += 3;
277 
279  if( !expect_next_utf8_byte )
281  }
282  else
283  {
285  fmt::format(
286  "invalid escape sequence at pos {}", current_pos() )
287  } );
288  }
289  }
290  else if( '+' == c )
291  {
292  collector( ' ' );
293  --chars_to_handle;
294  ++d;
295  }
296  else if( Traits::ordinary_char( c ) )
297  {
298  collector( c );
299  --chars_to_handle;
300  ++d;
301  }
302  else
303  {
305  fmt::format(
306  "invalid non-escaped char with code {:#02X} at pos: {}",
307  c,
308  current_pos() )
309  } );
310  }
311  }
312 
315  fmt::format( "unfinished UTF-8 sequence" )
316  } );
317 
319 }
320 
321 } /* namespace impl */
322 
323 //! Percent encoding.
324 //! \{
325 template< typename Traits = restinio_default_unescape_traits >
327 std::string
329 {
330  std::string result;
331  const auto escaped_chars_count = static_cast<std::size_t>(
332  std::count_if(
333  data.begin(),
334  data.end(),
335  []( auto c ){ return !Traits::ordinary_char(c); } ));
336 
337  if( 0 == escaped_chars_count )
338  {
339  // No escaped chars.
340  result.assign( data.data(), data.size() );
341  }
342  else
343  {
344  // Having escaped chars.
346  for( auto c : data )
347  {
348  if( Traits::ordinary_char( c ) )
349  result += c;
350  else
351  {
352  result += fmt::format( "%{:02X}", c );
353  }
354  }
355  }
356 
357  return result;
358 }
359 
360 template< typename Traits = restinio_default_unescape_traits >
362 std::string
364 {
365  std::string result;
366  result.reserve( data.size() );
367 
369  data,
370  [&result]( char ch ) { result += ch; } );
371  if( !r )
372  throw exception_t{ r.error().giveout_description() };
373 
374  return result;
375 }
376 
377 /*!
378  * @brief Helper function for unescaping percent-encoded string.
379  *
380  * This function doesn't throw if some character can't be unescaped or
381  * some ill-formed sequence is found.
382  *
383  * @note
384  * This function is not noexcept and can throw on other types of
385  * failures (like inability to allocate memory).
386  *
387  * @since v.0.6.5
388  */
389 template< typename Traits = restinio_default_unescape_traits >
393 {
394  std::string result;
395  result.reserve( data.size() );
396 
398  data,
399  [&result]( char ch ) { result += ch; } );
400  if( !r )
401  return make_unexpected( std::move(r.error()) );
402 
403  return std::move(result);
404 }
405 
406 template< typename Traits = restinio_default_unescape_traits >
408 std::size_t
410 {
411  std::size_t result_size = 0u;
412  char * dest = data;
413 
415  string_view_t{ data, size },
416  [&result_size, &dest]( char ch ) {
417  *dest++ = ch;
418  ++result_size;
419  } );
420  if( !r )
421  throw exception_t{ r.error().giveout_description() };
422 
423  return result_size;
424 }
425 
426 /*!
427  * @brief Helper function for unescaping percent-encoded string inplace.
428  *
429  * This function doesn't throw if some character can't be unescaped or
430  * some ill-formed sequence is found.
431  *
432  * @note
433  * This function is not noexcept and can throw on other types of
434  * failures.
435  *
436  * @since v.0.6.5
437  */
438 template< typename Traits = restinio_default_unescape_traits >
442 {
443  std::size_t result_size = 0u;
444  char * dest = data;
445 
447  string_view_t{ data, size },
448  [&result_size, &dest]( char ch ) {
449  *dest++ = ch;
450  ++result_size;
451  } );
452  if( !r )
453  return make_unexpected( std::move(r.error()) );
454 
455  return result_size;
456 }
457 
458 //! \}
459 
461 {
462 
464 {
465 
466 namespace impl
467 {
468 
469 /*!
470  * @brief Is this symbol a part of unreserved set?
471  *
472  * See https://tools.ietf.org/html/rfc3986#section-2.3 for more details.
473  *
474  * @since v.0.6.2
475  */
477 constexpr inline bool
478 is_unreserved_char( const char ch ) noexcept
479 {
480  // In this version of RESTinio class restinio_default_unescape_traits
481  // already implements necessary check.
483 }
484 
485 /*!
486  * @brief Internal helper to perform the main logic of enumeration
487  * of symbols in URI.
488  *
489  * Inspects the content of \a what and calls \a one_byte_handler if
490  * a single character should be used as output, otherwise calls
491  * \a three_byte_handler (if a percent-encoded sequence of three chars
492  * should be passed to the output as is).
493  *
494  * @attention
495  * Throws if invalid UTF-8 sequence is found.
496  *
497  * @since v.0.6.5
498  */
499 template<
500  typename One_Byte_Handler,
501  typename Three_Byte_Handler >
502 void
507 {
508  using namespace restinio::utils::impl;
509 
511  const char * d = what.data();
512 
514  bool expect_next_utf8_byte = false;
515 
516  const auto current_pos = [&d, &what]() noexcept { return d - what.data(); };
517 
518  while( 0 < chars_to_handle )
519  {
520  if( expect_next_utf8_byte && '%' != *d )
521  throw exception_t{
522  fmt::format( "next byte from UTF-8 sequence expected at {}",
523  current_pos() )
524  };
525 
526  if( '%' != *d )
527  {
528  // Just one symbol to the output.
529  one_byte_handler( *d );
530  ++d;
531  --chars_to_handle;
532  }
533  else if( chars_to_handle >= 3 &&
534  is_hexdigit( d[ 1 ] ) && is_hexdigit( d[ 2 ] ) )
535  {
536  const char ch = extract_escaped_char( d[ 1 ], d[ 2 ] );
537  if( !utf8_checker.process_byte( static_cast<std::uint8_t>(ch) ) )
538  throw exception_t{
539  fmt::format( "invalid UTF-8 sequence detected at {}",
540  current_pos() )
541  };
542 
543  bool keep_three_bytes = true;
544 
545  if( utf8_checker.finalized() )
546  {
547  expect_next_utf8_byte = false;
548 
549  const auto symbol = utf8_checker.current_symbol();
551 
552  if( symbol < 0x80u )
553  {
554  const char ascii_char = static_cast<char>(symbol);
556  {
557  // percent encoded char will be replaced by one char.
559  keep_three_bytes = false;
560  }
561  }
562  }
563  else
564  {
565  expect_next_utf8_byte = true;
566  }
567 
568  if( keep_three_bytes )
569  {
570  // this part of multi-byte char will go to the output as is.
571  three_byte_handler( d[ 0 ], d[ 1 ], d[ 2 ] );
572  }
573 
574  chars_to_handle -= 3;
575  d += 3u;
576  }
577  else
578  {
579  throw exception_t{
580  fmt::format( "invalid escape sequence at pos {}", current_pos() )
581  };
582  }
583  }
584 
586  throw exception_t{ fmt::format( "unfinished UTF-8 sequence" ) };
587 }
588 
589 } /* namespace impl */
590 
591 /*!
592  * @brief Calculate the size of a buffer to hold normalized value of a URI.
593  *
594  * If @a what has some chars from unreserved set in percent-encoded form
595  * then this function returns the size of a buffer to hold normalized value
596  * of @a what. Otherwise the original size of @a what is returned.
597  *
598  * @note
599  * This function throws if @a what has an invalid value.
600  *
601  * @since v.0.6.2
602  */
604 inline std::size_t
607 {
609 
611  [&calculated_capacity]( char ) noexcept {
613  },
614  [&calculated_capacity]( char, char, char ) noexcept {
615  calculated_capacity += 3u;
616  } );
617 
618  return calculated_capacity;
619 }
620 
621 /*!
622  * @brief Perform normalization of URI value.
623  *
624  * Copies the content of @a what into @a dest and replaces the
625  * percent-encoded representation of chars from unreserved set into
626  * their normal values.
627  *
628  * @attention
629  * The capacity of @a dest should be enough to hold the result value.
630  * It's assumed that estimate_required_capacity() is called before that
631  * function and the result of estimate_required_capacity() is used for
632  * allocation of a buffer for @a dest.
633  *
634  * @note
635  * This function throws if @a what has an invalid value.
636  *
637  * @since v.0.6.2
638  */
639 inline void
642  char * dest )
643 {
645  [&dest]( char ch ) noexcept {
646  *dest++ = ch;
647  },
648  [&dest]( char ch1, char ch2, char ch3 ) noexcept {
649  dest[ 0 ] = ch1;
650  dest[ 1 ] = ch2;
651  dest[ 2 ] = ch3;
652  dest += 3;
653  } );
654 }
655 
656 } /* namespace unreserved_chars */
657 
658 } /* namespace uri_normalization */
659 
660 } /* namespace utils */
661 
662 } /* namespace restinio */
static constexpr bool ordinary_char(char c) noexcept
static bool ordinary_char(char c) noexcept
The traits for escaping and unescaping symbols in JavaScript-compatible mode.
std::string m_description
Description of a failure.
RESTINIO_NODISCARD std::string giveout_description() noexcept
Get out the value of the description of the failure.
The default traits for escaping and unescaping symbols in a query string.
Traits for escaping and unescaping symbols in a query string in very relaxed mode.
void normalize_to(string_view_t what, char *dest)
Perform normalization of URI value.
char extract_escaped_char(char c1, char c2)
Traits for escaping and unescaping symbols in a query string in correspondence with application/x-www...
static constexpr bool ordinary_char(char c) noexcept
static constexpr bool ordinary_char(char c) noexcept
RESTINIO_NODISCARD expected_t< unescape_percent_encoding_success_t, unescape_percent_encoding_failure_t > do_unescape_percent_encoding(const string_view_t data, Chars_Collector &&collector)
The actual implementation of unescape-percent-encoding procedure.
std::enable_if< std::is_same< Parameter_Container, query_string_params_t >::value||std::is_same< Parameter_Container, router::route_params_t >::value, optional_t< Value_Type > >::type opt_value(const Parameter_Container &params, string_view_t key)
Gets the value of a parameter specified by key wrapped in optional_t<Value_Type> if parameter exists ...
Definition: value_or.hpp:64
Type that indicates that unescaping of percent-encoded symbols completed successfully.