1 | // class template regex -*- C++ -*- |
2 | |
3 | // Copyright (C) 2013-2024 Free Software Foundation, Inc. |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free |
6 | // software; you can redistribute it and/or modify it under the |
7 | // terms of the GNU General Public License as published by the |
8 | // Free Software Foundation; either version 3, or (at your option) |
9 | // any later version. |
10 | |
11 | // This library is distributed in the hope that it will be useful, |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | // GNU General Public License for more details. |
15 | |
16 | // Under Section 7 of GPL version 3, you are granted additional |
17 | // permissions described in the GCC Runtime Library Exception, version |
18 | // 3.1, as published by the Free Software Foundation. |
19 | |
20 | // You should have received a copy of the GNU General Public License and |
21 | // a copy of the GCC Runtime Library Exception along with this program; |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
23 | // <http://www.gnu.org/licenses/>. |
24 | |
25 | /** |
26 | * @file bits/regex_scanner.tcc |
27 | * This is an internal header file, included by other library headers. |
28 | * Do not attempt to use it directly. @headername{regex} |
29 | */ |
30 | |
31 | // FIXME make comments doxygen format. |
32 | |
33 | // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep |
34 | // and awk |
35 | // 1) grep is basic except '\n' is treated as '|' |
36 | // 2) egrep is extended except '\n' is treated as '|' |
37 | // 3) awk is extended except special escaping rules, and there's no |
38 | // back-reference. |
39 | // |
40 | // References: |
41 | // |
42 | // ECMAScript: ECMA-262 15.10 |
43 | // |
44 | // basic, extended: |
45 | // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html |
46 | // |
47 | // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html |
48 | |
49 | namespace std _GLIBCXX_VISIBILITY(default) |
50 | { |
51 | _GLIBCXX_BEGIN_NAMESPACE_VERSION |
52 | |
53 | namespace __detail |
54 | { |
55 | template<typename _CharT> |
56 | _Scanner<_CharT>:: |
57 | _Scanner(const _CharT* __begin, const _CharT* __end, |
58 | _FlagT __flags, std::locale __loc) |
59 | : _ScannerBase(__flags), |
60 | _M_current(__begin), _M_end(__end), |
61 | _M_ctype(std::use_facet<_CtypeT>(__loc)), |
62 | _M_eat_escape(_M_is_ecma() |
63 | ? &_Scanner::_M_eat_escape_ecma |
64 | : &_Scanner::_M_eat_escape_posix) |
65 | { _M_advance(); } |
66 | |
67 | template<typename _CharT> |
68 | void |
69 | _Scanner<_CharT>:: |
70 | _M_advance() |
71 | { |
72 | if (_M_current == _M_end) |
73 | { |
74 | _M_token = _S_token_eof; |
75 | return; |
76 | } |
77 | |
78 | if (_M_state == _S_state_normal) |
79 | _M_scan_normal(); |
80 | else if (_M_state == _S_state_in_bracket) |
81 | _M_scan_in_bracket(); |
82 | else if (_M_state == _S_state_in_brace) |
83 | _M_scan_in_brace(); |
84 | else |
85 | { |
86 | __glibcxx_assert(!"unexpected state while processing regex" ); |
87 | } |
88 | } |
89 | |
90 | // Differences between styles: |
91 | // 1) "\(", "\)", "\{" in basic. It's not escaping. |
92 | // 2) "(?:", "(?=", "(?!" in ECMAScript. |
93 | template<typename _CharT> |
94 | void |
95 | _Scanner<_CharT>:: |
96 | _M_scan_normal() |
97 | { |
98 | auto __c = *_M_current++; |
99 | |
100 | if (__builtin_strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr) |
101 | { |
102 | _M_token = _S_token_ord_char; |
103 | _M_value.assign(1, __c); |
104 | return; |
105 | } |
106 | if (__c == '\\') |
107 | { |
108 | if (_M_current == _M_end) |
109 | __throw_regex_error( |
110 | ecode: regex_constants::error_escape, |
111 | what: "Invalid escape at end of regular expression" ); |
112 | |
113 | if (!_M_is_basic() |
114 | || (*_M_current != '(' |
115 | && *_M_current != ')' |
116 | && *_M_current != '{')) |
117 | { |
118 | (this->*_M_eat_escape)(); |
119 | return; |
120 | } |
121 | __c = *_M_current++; |
122 | } |
123 | if (__c == '(') |
124 | { |
125 | if (_M_is_ecma() && *_M_current == '?') |
126 | { |
127 | if (++_M_current == _M_end) |
128 | __throw_regex_error(ecode: regex_constants::error_paren); |
129 | |
130 | if (*_M_current == ':') |
131 | { |
132 | ++_M_current; |
133 | _M_token = _S_token_subexpr_no_group_begin; |
134 | } |
135 | else if (*_M_current == '=') |
136 | { |
137 | ++_M_current; |
138 | _M_token = _S_token_subexpr_lookahead_begin; |
139 | _M_value.assign(1, 'p'); |
140 | } |
141 | else if (*_M_current == '!') |
142 | { |
143 | ++_M_current; |
144 | _M_token = _S_token_subexpr_lookahead_begin; |
145 | _M_value.assign(1, 'n'); |
146 | } |
147 | else |
148 | __throw_regex_error(ecode: regex_constants::error_paren, |
149 | what: "Invalid '(?...)' zero-width assertion " |
150 | "in regular expression" ); |
151 | } |
152 | else if (_M_flags & regex_constants::nosubs) |
153 | _M_token = _S_token_subexpr_no_group_begin; |
154 | else |
155 | _M_token = _S_token_subexpr_begin; |
156 | } |
157 | else if (__c == ')') |
158 | _M_token = _S_token_subexpr_end; |
159 | else if (__c == '[') |
160 | { |
161 | _M_state = _S_state_in_bracket; |
162 | _M_at_bracket_start = true; |
163 | if (_M_current != _M_end && *_M_current == '^') |
164 | { |
165 | _M_token = _S_token_bracket_neg_begin; |
166 | ++_M_current; |
167 | } |
168 | else |
169 | _M_token = _S_token_bracket_begin; |
170 | } |
171 | else if (__c == '{') |
172 | { |
173 | _M_state = _S_state_in_brace; |
174 | _M_token = _S_token_interval_begin; |
175 | } |
176 | else if (__builtin_expect(__c == _CharT(0), false)) |
177 | { |
178 | if (!_M_is_ecma()) |
179 | __throw_regex_error(ecode: regex_constants::_S_null); |
180 | _M_token = _S_token_ord_char; |
181 | _M_value.assign(1, __c); |
182 | } |
183 | else if (__c != ']' && __c != '}') |
184 | { |
185 | auto __it = _M_token_tbl; |
186 | auto __narrowc = _M_ctype.narrow(__c, '\0'); |
187 | for (; __it->first != '\0'; ++__it) |
188 | if (__it->first == __narrowc) |
189 | { |
190 | _M_token = __it->second; |
191 | return; |
192 | } |
193 | __glibcxx_assert(!"unexpected special character in regex" ); |
194 | } |
195 | else |
196 | { |
197 | _M_token = _S_token_ord_char; |
198 | _M_value.assign(1, __c); |
199 | } |
200 | } |
201 | |
202 | // Differences between styles: |
203 | // 1) different semantics of "[]" and "[^]". |
204 | // 2) Escaping in bracket expr. |
205 | template<typename _CharT> |
206 | void |
207 | _Scanner<_CharT>:: |
208 | _M_scan_in_bracket() |
209 | { |
210 | if (_M_current == _M_end) |
211 | __throw_regex_error(ecode: regex_constants::error_brack); |
212 | |
213 | auto __c = *_M_current++; |
214 | |
215 | if (__c == '-') |
216 | _M_token = _S_token_bracket_dash; |
217 | else if (__c == '[') |
218 | { |
219 | if (_M_current == _M_end) |
220 | __throw_regex_error(ecode: regex_constants::error_brack, |
221 | what: "Incomplete '[[' character class in " |
222 | "regular expression" ); |
223 | |
224 | if (*_M_current == '.') |
225 | { |
226 | _M_token = _S_token_collsymbol; |
227 | _M_eat_class(*_M_current++); |
228 | } |
229 | else if (*_M_current == ':') |
230 | { |
231 | _M_token = _S_token_char_class_name; |
232 | _M_eat_class(*_M_current++); |
233 | } |
234 | else if (*_M_current == '=') |
235 | { |
236 | _M_token = _S_token_equiv_class_name; |
237 | _M_eat_class(*_M_current++); |
238 | } |
239 | else |
240 | { |
241 | _M_token = _S_token_ord_char; |
242 | _M_value.assign(1, __c); |
243 | } |
244 | } |
245 | // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted |
246 | // literally. So "[]]" and "[^]]" are valid regexes. See the testcases |
247 | // `.../empty_range.cc`. |
248 | else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) |
249 | { |
250 | _M_token = _S_token_bracket_end; |
251 | _M_state = _S_state_normal; |
252 | } |
253 | // ECMAScript and awk permits escaping in bracket. |
254 | else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) |
255 | (this->*_M_eat_escape)(); |
256 | else |
257 | { |
258 | _M_token = _S_token_ord_char; |
259 | _M_value.assign(1, __c); |
260 | } |
261 | _M_at_bracket_start = false; |
262 | } |
263 | |
264 | // Differences between styles: |
265 | // 1) "\}" in basic style. |
266 | template<typename _CharT> |
267 | void |
268 | _Scanner<_CharT>:: |
269 | _M_scan_in_brace() |
270 | { |
271 | if (_M_current == _M_end) |
272 | __throw_regex_error(ecode: regex_constants::error_brace); |
273 | |
274 | auto __c = *_M_current++; |
275 | |
276 | if (_M_ctype.is(_CtypeT::digit, __c)) |
277 | { |
278 | _M_token = _S_token_dup_count; |
279 | _M_value.assign(1, __c); |
280 | while (_M_current != _M_end |
281 | && _M_ctype.is(_CtypeT::digit, *_M_current)) |
282 | _M_value += *_M_current++; |
283 | } |
284 | else if (__c == ',') |
285 | _M_token = _S_token_comma; |
286 | // basic use \}. |
287 | else if (_M_is_basic()) |
288 | { |
289 | if (__c == '\\' && _M_current != _M_end && *_M_current == '}') |
290 | { |
291 | _M_state = _S_state_normal; |
292 | _M_token = _S_token_interval_end; |
293 | ++_M_current; |
294 | } |
295 | else |
296 | __throw_regex_error(ecode: regex_constants::error_badbrace); |
297 | } |
298 | else if (__c == '}') |
299 | { |
300 | _M_state = _S_state_normal; |
301 | _M_token = _S_token_interval_end; |
302 | } |
303 | else |
304 | __throw_regex_error(ecode: regex_constants::error_badbrace); |
305 | } |
306 | |
307 | template<typename _CharT> |
308 | void |
309 | _Scanner<_CharT>:: |
310 | _M_eat_escape_ecma() |
311 | { |
312 | if (_M_current == _M_end) |
313 | __throw_regex_error(ecode: regex_constants::error_escape); |
314 | |
315 | auto __c = *_M_current++; |
316 | auto __pos = _M_find_escape(c: _M_ctype.narrow(__c, '\0')); |
317 | |
318 | if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket)) |
319 | { |
320 | _M_token = _S_token_ord_char; |
321 | _M_value.assign(1, *__pos); |
322 | } |
323 | else if (__c == 'b') |
324 | { |
325 | _M_token = _S_token_word_bound; |
326 | _M_value.assign(1, 'p'); |
327 | } |
328 | else if (__c == 'B') |
329 | { |
330 | _M_token = _S_token_word_bound; |
331 | _M_value.assign(1, 'n'); |
332 | } |
333 | // N3376 28.13 |
334 | else if (__c == 'd' |
335 | || __c == 'D' |
336 | || __c == 's' |
337 | || __c == 'S' |
338 | || __c == 'w' |
339 | || __c == 'W') |
340 | { |
341 | _M_token = _S_token_quoted_class; |
342 | _M_value.assign(1, __c); |
343 | } |
344 | else if (__c == 'c') |
345 | { |
346 | if (_M_current == _M_end) |
347 | __throw_regex_error(ecode: regex_constants::error_escape, |
348 | what: "invalid '\\cX' control character in " |
349 | "regular expression" ); |
350 | _M_token = _S_token_ord_char; |
351 | _M_value.assign(1, *_M_current++); |
352 | } |
353 | else if (__c == 'x' || __c == 'u') |
354 | { |
355 | _M_value.clear(); |
356 | const int __n = __c == 'x' ? 2 : 4; |
357 | for (int __i = 0; __i < __n; __i++) |
358 | { |
359 | if (_M_current == _M_end |
360 | || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) |
361 | __throw_regex_error(ecode: regex_constants::error_escape, |
362 | what: __n == 2 |
363 | ? "Invalid '\\xNN' control character in " |
364 | "regular expression" |
365 | : "Invalid '\\uNNNN' control character in " |
366 | "regular expression" ); |
367 | _M_value += *_M_current++; |
368 | } |
369 | _M_token = _S_token_hex_num; |
370 | } |
371 | // ECMAScript recognizes multi-digit back-references. |
372 | else if (_M_ctype.is(_CtypeT::digit, __c)) |
373 | { |
374 | _M_value.assign(1, __c); |
375 | while (_M_current != _M_end |
376 | && _M_ctype.is(_CtypeT::digit, *_M_current)) |
377 | _M_value += *_M_current++; |
378 | _M_token = _S_token_backref; |
379 | } |
380 | else |
381 | { |
382 | _M_token = _S_token_ord_char; |
383 | _M_value.assign(1, __c); |
384 | } |
385 | } |
386 | |
387 | // Differences between styles: |
388 | // 1) Extended doesn't support backref, but basic does. |
389 | template<typename _CharT> |
390 | void |
391 | _Scanner<_CharT>:: |
392 | _M_eat_escape_posix() |
393 | { |
394 | if (_M_current == _M_end) |
395 | __throw_regex_error(ecode: regex_constants::error_escape); |
396 | |
397 | auto __c = *_M_current; |
398 | auto __pos = __builtin_strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')); |
399 | |
400 | if (__pos != nullptr && *__pos != '\0') |
401 | { |
402 | _M_token = _S_token_ord_char; |
403 | _M_value.assign(1, __c); |
404 | } |
405 | // We MUST judge awk before handling backrefs. There's no backref in awk. |
406 | else if (_M_is_awk()) |
407 | { |
408 | _M_eat_escape_awk(); |
409 | return; |
410 | } |
411 | else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0') |
412 | { |
413 | _M_token = _S_token_backref; |
414 | _M_value.assign(1, __c); |
415 | } |
416 | else |
417 | { |
418 | #ifdef __STRICT_ANSI__ |
419 | // POSIX says it is undefined to escape ordinary characters |
420 | __throw_regex_error(regex_constants::error_escape); |
421 | #else |
422 | _M_token = _S_token_ord_char; |
423 | _M_value.assign(1, __c); |
424 | #endif |
425 | } |
426 | ++_M_current; |
427 | } |
428 | |
429 | template<typename _CharT> |
430 | void |
431 | _Scanner<_CharT>:: |
432 | _M_eat_escape_awk() |
433 | { |
434 | auto __c = *_M_current++; |
435 | auto __pos = _M_find_escape(c: _M_ctype.narrow(__c, '\0')); |
436 | |
437 | if (__pos != nullptr) |
438 | { |
439 | _M_token = _S_token_ord_char; |
440 | _M_value.assign(1, *__pos); |
441 | } |
442 | // \ddd for oct representation |
443 | else if (_M_ctype.is(_CtypeT::digit, __c) |
444 | && __c != '8' |
445 | && __c != '9') |
446 | { |
447 | _M_value.assign(1, __c); |
448 | for (int __i = 0; |
449 | __i < 2 |
450 | && _M_current != _M_end |
451 | && _M_ctype.is(_CtypeT::digit, *_M_current) |
452 | && *_M_current != '8' |
453 | && *_M_current != '9'; |
454 | __i++) |
455 | _M_value += *_M_current++; |
456 | _M_token = _S_token_oct_num; |
457 | return; |
458 | } |
459 | else |
460 | __throw_regex_error(ecode: regex_constants::error_escape); |
461 | } |
462 | |
463 | // Eats a character class or throws an exception. |
464 | // __ch could be ':', '.' or '=', _M_current is the char after ']' when |
465 | // returning. |
466 | template<typename _CharT> |
467 | void |
468 | _Scanner<_CharT>:: |
469 | _M_eat_class(char __ch) |
470 | { |
471 | for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;) |
472 | _M_value += *_M_current++; |
473 | if (_M_current == _M_end |
474 | || *_M_current++ != __ch |
475 | || _M_current == _M_end // skip __ch |
476 | || *_M_current++ != ']') // skip ']' |
477 | { |
478 | __throw_regex_error(ecode: __ch == ':' ? regex_constants::error_ctype |
479 | : regex_constants::error_collate); |
480 | } |
481 | } |
482 | |
483 | #ifdef _GLIBCXX_DEBUG |
484 | template<typename _CharT> |
485 | std::ostream& |
486 | _Scanner<_CharT>:: |
487 | _M_print(std::ostream& __ostr) |
488 | { |
489 | switch (_M_token) |
490 | { |
491 | case _S_token_anychar: |
492 | __ostr << "any-character\n" ; |
493 | break; |
494 | case _S_token_backref: |
495 | __ostr << "backref\n" ; |
496 | break; |
497 | case _S_token_bracket_begin: |
498 | __ostr << "bracket-begin\n" ; |
499 | break; |
500 | case _S_token_bracket_neg_begin: |
501 | __ostr << "bracket-neg-begin\n" ; |
502 | break; |
503 | case _S_token_bracket_end: |
504 | __ostr << "bracket-end\n" ; |
505 | break; |
506 | case _S_token_char_class_name: |
507 | __ostr << "char-class-name \"" << _M_value << "\"\n" ; |
508 | break; |
509 | case _S_token_closure0: |
510 | __ostr << "closure0\n" ; |
511 | break; |
512 | case _S_token_closure1: |
513 | __ostr << "closure1\n" ; |
514 | break; |
515 | case _S_token_collsymbol: |
516 | __ostr << "collsymbol \"" << _M_value << "\"\n" ; |
517 | break; |
518 | case _S_token_comma: |
519 | __ostr << "comma\n" ; |
520 | break; |
521 | case _S_token_dup_count: |
522 | __ostr << "dup count: " << _M_value << "\n" ; |
523 | break; |
524 | case _S_token_eof: |
525 | __ostr << "EOF\n" ; |
526 | break; |
527 | case _S_token_equiv_class_name: |
528 | __ostr << "equiv-class-name \"" << _M_value << "\"\n" ; |
529 | break; |
530 | case _S_token_interval_begin: |
531 | __ostr << "interval begin\n" ; |
532 | break; |
533 | case _S_token_interval_end: |
534 | __ostr << "interval end\n" ; |
535 | break; |
536 | case _S_token_line_begin: |
537 | __ostr << "line begin\n" ; |
538 | break; |
539 | case _S_token_line_end: |
540 | __ostr << "line end\n" ; |
541 | break; |
542 | case _S_token_opt: |
543 | __ostr << "opt\n" ; |
544 | break; |
545 | case _S_token_or: |
546 | __ostr << "or\n" ; |
547 | break; |
548 | case _S_token_ord_char: |
549 | __ostr << "ordinary character: \"" << _M_value << "\"\n" ; |
550 | break; |
551 | case _S_token_subexpr_begin: |
552 | __ostr << "subexpr begin\n" ; |
553 | break; |
554 | case _S_token_subexpr_no_group_begin: |
555 | __ostr << "no grouping subexpr begin\n" ; |
556 | break; |
557 | case _S_token_subexpr_lookahead_begin: |
558 | __ostr << "lookahead subexpr begin\n" ; |
559 | break; |
560 | case _S_token_subexpr_end: |
561 | __ostr << "subexpr end\n" ; |
562 | break; |
563 | case _S_token_unknown: |
564 | __ostr << "-- unknown token --\n" ; |
565 | break; |
566 | case _S_token_oct_num: |
567 | __ostr << "oct number " << _M_value << "\n" ; |
568 | break; |
569 | case _S_token_hex_num: |
570 | __ostr << "hex number " << _M_value << "\n" ; |
571 | break; |
572 | case _S_token_quoted_class: |
573 | __ostr << "quoted class " << "\\" << _M_value << "\n" ; |
574 | break; |
575 | default: |
576 | _GLIBCXX_DEBUG_ASSERT(false); |
577 | } |
578 | return __ostr; |
579 | } |
580 | #endif |
581 | |
582 | } // namespace __detail |
583 | _GLIBCXX_END_NAMESPACE_VERSION |
584 | } // namespace |
585 | |