VirtualBox

source: kBuild/vendor/grep/2.12/lib/regexec.c

Last change on this file was 2595, checked in by bird, 12 years ago

gnu grep version 2.12 (grep-2.12.tar.xz, md5sum=8d2f0346d08b13c18afb81f0e8aa1e2f)

  • Property svn:eol-style set to native
File size: 128.3 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, see <http://www.gnu.org/licenses/>. */
18
19#include "verify.h"
20#include "intprops.h"
21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 Idx n) internal_function;
23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24static void match_ctx_free (re_match_context_t *cache) internal_function;
25static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
26 Idx str_idx, Idx from, Idx to)
27 internal_function;
28static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
29 internal_function;
30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
31 Idx str_idx) internal_function;
32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 Idx node, Idx str_idx)
34 internal_function;
35static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, Idx last_node,
37 Idx last_str_idx)
38 internal_function;
39static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, Idx length,
41 Idx start, Idx last_start, Idx stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, Idx length1,
46 const char *string2, Idx length2,
47 Idx start, regoff_t range,
48 struct re_registers *regs,
49 Idx stop, bool ret_len) internal_function;
50static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
51 const char *string, Idx length, Idx start,
52 regoff_t range, Idx stop,
53 struct re_registers *regs,
54 bool ret_len) internal_function;
55static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
56 Idx nregs, int regs_allocated) internal_function;
57static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
58 internal_function;
59static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
60 Idx *p_match_first) internal_function;
61static Idx check_halt_state_context (const re_match_context_t *mctx,
62 const re_dfastate_t *state, Idx idx)
63 internal_function;
64static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
65 regmatch_t *prev_idx_match, Idx cur_node,
66 Idx cur_idx, Idx nmatch) internal_function;
67static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
68 Idx str_idx, Idx dest_node, Idx nregs,
69 regmatch_t *regs,
70 re_node_set *eps_via_nodes)
71 internal_function;
72static reg_errcode_t set_regs (const regex_t *preg,
73 const re_match_context_t *mctx,
74 size_t nmatch, regmatch_t *pmatch,
75 bool fl_backtrack) internal_function;
76static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
77 internal_function;
78
79#ifdef RE_ENABLE_I18N
80static int sift_states_iter_mb (const re_match_context_t *mctx,
81 re_sift_context_t *sctx,
82 Idx node_idx, Idx str_idx, Idx max_str_idx)
83 internal_function;
84#endif /* RE_ENABLE_I18N */
85static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
86 re_sift_context_t *sctx)
87 internal_function;
88static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
89 re_sift_context_t *sctx, Idx str_idx,
90 re_node_set *cur_dest)
91 internal_function;
92static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
93 re_sift_context_t *sctx,
94 Idx str_idx,
95 re_node_set *dest_nodes)
96 internal_function;
97static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
98 re_node_set *dest_nodes,
99 const re_node_set *candidates)
100 internal_function;
101static bool check_dst_limits (const re_match_context_t *mctx,
102 const re_node_set *limits,
103 Idx dst_node, Idx dst_idx, Idx src_node,
104 Idx src_idx) internal_function;
105static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
106 int boundaries, Idx subexp_idx,
107 Idx from_node, Idx bkref_idx)
108 internal_function;
109static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
110 Idx limit, Idx subexp_idx,
111 Idx node, Idx str_idx,
112 Idx bkref_idx) internal_function;
113static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
114 re_node_set *dest_nodes,
115 const re_node_set *candidates,
116 re_node_set *limits,
117 struct re_backref_cache_entry *bkref_ents,
118 Idx str_idx) internal_function;
119static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
120 re_sift_context_t *sctx,
121 Idx str_idx, const re_node_set *candidates)
122 internal_function;
123static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
124 re_dfastate_t **dst,
125 re_dfastate_t **src, Idx num)
126 internal_function;
127static re_dfastate_t *find_recover_state (reg_errcode_t *err,
128 re_match_context_t *mctx) internal_function;
129static re_dfastate_t *transit_state (reg_errcode_t *err,
130 re_match_context_t *mctx,
131 re_dfastate_t *state) internal_function;
132static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
133 re_match_context_t *mctx,
134 re_dfastate_t *next_state)
135 internal_function;
136static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
137 re_node_set *cur_nodes,
138 Idx str_idx) internal_function;
139#if 0
140static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
141 re_match_context_t *mctx,
142 re_dfastate_t *pstate)
143 internal_function;
144#endif
145#ifdef RE_ENABLE_I18N
146static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
147 re_dfastate_t *pstate)
148 internal_function;
149#endif /* RE_ENABLE_I18N */
150static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
151 const re_node_set *nodes)
152 internal_function;
153static reg_errcode_t get_subexp (re_match_context_t *mctx,
154 Idx bkref_node, Idx bkref_str_idx)
155 internal_function;
156static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
157 const re_sub_match_top_t *sub_top,
158 re_sub_match_last_t *sub_last,
159 Idx bkref_node, Idx bkref_str)
160 internal_function;
161static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
162 Idx subexp_idx, int type) internal_function;
163static reg_errcode_t check_arrival (re_match_context_t *mctx,
164 state_array_t *path, Idx top_node,
165 Idx top_str, Idx last_node, Idx last_str,
166 int type) internal_function;
167static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
168 Idx str_idx,
169 re_node_set *cur_nodes,
170 re_node_set *next_nodes)
171 internal_function;
172static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
173 re_node_set *cur_nodes,
174 Idx ex_subexp, int type)
175 internal_function;
176static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
177 re_node_set *dst_nodes,
178 Idx target, Idx ex_subexp,
179 int type) internal_function;
180static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
181 re_node_set *cur_nodes, Idx cur_str,
182 Idx subexp_num, int type)
183 internal_function;
184static bool build_trtable (const re_dfa_t *dfa,
185 re_dfastate_t *state) internal_function;
186#ifdef RE_ENABLE_I18N
187static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
188 const re_string_t *input, Idx idx)
189 internal_function;
190# ifdef _LIBC
191static unsigned int find_collation_sequence_value (const unsigned char *mbs,
192 size_t name_len)
193 internal_function;
194# endif /* _LIBC */
195#endif /* RE_ENABLE_I18N */
196static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
197 const re_dfastate_t *state,
198 re_node_set *states_node,
199 bitset_t *states_ch) internal_function;
200static bool check_node_accept (const re_match_context_t *mctx,
201 const re_token_t *node, Idx idx)
202 internal_function;
203static reg_errcode_t extend_buffers (re_match_context_t *mctx)
204 internal_function;
205
206
207/* Entry point for POSIX code. */
208
209/* regexec searches for a given pattern, specified by PREG, in the
210 string STRING.
211
212 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
213 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
214 least NMATCH elements, and we set them to the offsets of the
215 corresponding matched substrings.
216
217 EFLAGS specifies "execution flags" which affect matching: if
218 REG_NOTBOL is set, then ^ does not match at the beginning of the
219 string; if REG_NOTEOL is set, then $ does not match at the end.
220
221 We return 0 if we find a match and REG_NOMATCH if not. */
222
223int
224regexec (preg, string, nmatch, pmatch, eflags)
225 const regex_t *_Restrict_ preg;
226 const char *_Restrict_ string;
227 size_t nmatch;
228 regmatch_t pmatch[_Restrict_arr_];
229 int eflags;
230{
231 reg_errcode_t err;
232 Idx start, length;
233#ifdef _LIBC
234 re_dfa_t *dfa = preg->buffer;
235#endif
236
237 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
238 return REG_BADPAT;
239
240 if (eflags & REG_STARTEND)
241 {
242 start = pmatch[0].rm_so;
243 length = pmatch[0].rm_eo;
244 }
245 else
246 {
247 start = 0;
248 length = strlen (string);
249 }
250
251 __libc_lock_lock (dfa->lock);
252 if (preg->no_sub)
253 err = re_search_internal (preg, string, length, start, length,
254 length, 0, NULL, eflags);
255 else
256 err = re_search_internal (preg, string, length, start, length,
257 length, nmatch, pmatch, eflags);
258 __libc_lock_unlock (dfa->lock);
259 return err != REG_NOERROR;
260}
261
262#ifdef _LIBC
263# include <shlib-compat.h>
264versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
265
266# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
267__typeof__ (__regexec) __compat_regexec;
268
269int
270attribute_compat_text_section
271__compat_regexec (const regex_t *_Restrict_ preg,
272 const char *_Restrict_ string, size_t nmatch,
273 regmatch_t pmatch[], int eflags)
274{
275 return regexec (preg, string, nmatch, pmatch,
276 eflags & (REG_NOTBOL | REG_NOTEOL));
277}
278compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
279# endif
280#endif
281
282/* Entry points for GNU code. */
283
284/* re_match, re_search, re_match_2, re_search_2
285
286 The former two functions operate on STRING with length LENGTH,
287 while the later two operate on concatenation of STRING1 and STRING2
288 with lengths LENGTH1 and LENGTH2, respectively.
289
290 re_match() matches the compiled pattern in BUFP against the string,
291 starting at index START.
292
293 re_search() first tries matching at index START, then it tries to match
294 starting from index START + 1, and so on. The last start position tried
295 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
296 way as re_match().)
297
298 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
299 the first STOP characters of the concatenation of the strings should be
300 concerned.
301
302 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
303 and all groups is stored in REGS. (For the "_2" variants, the offsets are
304 computed relative to the concatenation, not relative to the individual
305 strings.)
306
307 On success, re_match* functions return the length of the match, re_search*
308 return the position of the start of the match. Return value -1 means no
309 match was found and -2 indicates an internal error. */
310
311regoff_t
312re_match (bufp, string, length, start, regs)
313 struct re_pattern_buffer *bufp;
314 const char *string;
315 Idx length, start;
316 struct re_registers *regs;
317{
318 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
319}
320#ifdef _LIBC
321weak_alias (__re_match, re_match)
322#endif
323
324regoff_t
325re_search (bufp, string, length, start, range, regs)
326 struct re_pattern_buffer *bufp;
327 const char *string;
328 Idx length, start;
329 regoff_t range;
330 struct re_registers *regs;
331{
332 return re_search_stub (bufp, string, length, start, range, length, regs,
333 false);
334}
335#ifdef _LIBC
336weak_alias (__re_search, re_search)
337#endif
338
339regoff_t
340re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
341 struct re_pattern_buffer *bufp;
342 const char *string1, *string2;
343 Idx length1, length2, start, stop;
344 struct re_registers *regs;
345{
346 return re_search_2_stub (bufp, string1, length1, string2, length2,
347 start, 0, regs, stop, true);
348}
349#ifdef _LIBC
350weak_alias (__re_match_2, re_match_2)
351#endif
352
353regoff_t
354re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
355 struct re_pattern_buffer *bufp;
356 const char *string1, *string2;
357 Idx length1, length2, start, stop;
358 regoff_t range;
359 struct re_registers *regs;
360{
361 return re_search_2_stub (bufp, string1, length1, string2, length2,
362 start, range, regs, stop, false);
363}
364#ifdef _LIBC
365weak_alias (__re_search_2, re_search_2)
366#endif
367
368static regoff_t
369re_search_2_stub (struct re_pattern_buffer *bufp,
370 const char *string1, Idx length1,
371 const char *string2, Idx length2,
372 Idx start, regoff_t range, struct re_registers *regs,
373 Idx stop, bool ret_len)
374{
375 const char *str;
376 regoff_t rval;
377 Idx len = length1 + length2;
378 char *s = NULL;
379
380 verify (! TYPE_SIGNED (Idx));
381 if (BE (len < length1, 0))
382 return -2;
383 /* if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
384 return -2; */
385
386 /* Concatenate the strings. */
387 if (length2 > 0)
388 if (length1 > 0)
389 {
390 s = re_malloc (char, len);
391
392 if (BE (s == NULL, 0))
393 return -2;
394#ifdef _LIBC
395 memcpy (__mempcpy (s, string1, length1), string2, length2);
396#else
397 memcpy (s, string1, length1);
398 memcpy (s + length1, string2, length2);
399#endif
400 str = s;
401 }
402 else
403 str = string2;
404 else
405 str = string1;
406
407 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
408 ret_len);
409 re_free (s);
410 return rval;
411}
412
413/* The parameters have the same meaning as those of re_search.
414 Additional parameters:
415 If RET_LEN is true the length of the match is returned (re_match style);
416 otherwise the position of the match is returned. */
417
418static regoff_t
419re_search_stub (struct re_pattern_buffer *bufp,
420 const char *string, Idx length,
421 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
422 bool ret_len)
423{
424 reg_errcode_t result;
425 regmatch_t *pmatch;
426 Idx nregs;
427 regoff_t rval;
428 int eflags = 0;
429#ifdef _LIBC
430 re_dfa_t *dfa = bufp->buffer;
431#endif
432 Idx last_start = start + range;
433
434 /* Check for out-of-range. */
435 verify (! TYPE_SIGNED (Idx));
436 /* if (BE (start < 0, 0))
437 return -1; */
438 if (BE (start > length, 0))
439 return -1;
440 if (BE (length < last_start || (0 <= range && last_start < start), 0))
441 last_start = length;
442 else if (BE (/* last_start < 0 || */ (range < 0 && start <= last_start), 0))
443 last_start = 0;
444
445 __libc_lock_lock (dfa->lock);
446
447 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
448 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
449
450 /* Compile fastmap if we haven't yet. */
451 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
452 re_compile_fastmap (bufp);
453
454 if (BE (bufp->no_sub, 0))
455 regs = NULL;
456
457 /* We need at least 1 register. */
458 if (regs == NULL)
459 nregs = 1;
460 else if (BE (bufp->regs_allocated == REGS_FIXED
461 && regs->num_regs <= bufp->re_nsub, 0))
462 {
463 nregs = regs->num_regs;
464 if (BE (nregs < 1, 0))
465 {
466 /* Nothing can be copied to regs. */
467 regs = NULL;
468 nregs = 1;
469 }
470 }
471 else
472 nregs = bufp->re_nsub + 1;
473 pmatch = re_malloc (regmatch_t, nregs);
474 if (BE (pmatch == NULL, 0))
475 {
476 rval = -2;
477 goto out;
478 }
479
480 result = re_search_internal (bufp, string, length, start, last_start, stop,
481 nregs, pmatch, eflags);
482
483 rval = 0;
484
485 /* I hope we needn't fill ther regs with -1's when no match was found. */
486 if (result != REG_NOERROR)
487 rval = result == REG_NOMATCH ? -1 : -2;
488 else if (regs != NULL)
489 {
490 /* If caller wants register contents data back, copy them. */
491 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
492 bufp->regs_allocated);
493 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
494 rval = -2;
495 }
496
497 if (BE (rval == 0, 1))
498 {
499 if (ret_len)
500 {
501 assert (pmatch[0].rm_so == start);
502 rval = pmatch[0].rm_eo - start;
503 }
504 else
505 rval = pmatch[0].rm_so;
506 }
507 re_free (pmatch);
508 out:
509 __libc_lock_unlock (dfa->lock);
510 return rval;
511}
512
513static unsigned
514re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
515 int regs_allocated)
516{
517 int rval = REGS_REALLOCATE;
518 Idx i;
519 Idx need_regs = nregs + 1;
520 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
521 uses. */
522
523 /* Have the register data arrays been allocated? */
524 if (regs_allocated == REGS_UNALLOCATED)
525 { /* No. So allocate them with malloc. */
526 regs->start = re_malloc (regoff_t, need_regs);
527 if (BE (regs->start == NULL, 0))
528 return REGS_UNALLOCATED;
529 regs->end = re_malloc (regoff_t, need_regs);
530 if (BE (regs->end == NULL, 0))
531 {
532 re_free (regs->start);
533 return REGS_UNALLOCATED;
534 }
535 regs->num_regs = need_regs;
536 }
537 else if (regs_allocated == REGS_REALLOCATE)
538 { /* Yes. If we need more elements than were already
539 allocated, reallocate them. If we need fewer, just
540 leave it alone. */
541 if (BE (need_regs > regs->num_regs, 0))
542 {
543 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
544 regoff_t *new_end;
545 if (BE (new_start == NULL, 0))
546 return REGS_UNALLOCATED;
547 new_end = re_realloc (regs->end, regoff_t, need_regs);
548 if (BE (new_end == NULL, 0))
549 {
550 re_free (new_start);
551 return REGS_UNALLOCATED;
552 }
553 regs->start = new_start;
554 regs->end = new_end;
555 regs->num_regs = need_regs;
556 }
557 }
558 else
559 {
560 assert (regs_allocated == REGS_FIXED);
561 /* This function may not be called with REGS_FIXED and nregs too big. */
562 assert (regs->num_regs >= nregs);
563 rval = REGS_FIXED;
564 }
565
566 /* Copy the regs. */
567 for (i = 0; i < nregs; ++i)
568 {
569 regs->start[i] = pmatch[i].rm_so;
570 regs->end[i] = pmatch[i].rm_eo;
571 }
572 for ( ; i < regs->num_regs; ++i)
573 regs->start[i] = regs->end[i] = -1;
574
575 return rval;
576}
577
578/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
579 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
580 this memory for recording register information. STARTS and ENDS
581 must be allocated using the malloc library routine, and must each
582 be at least NUM_REGS * sizeof (regoff_t) bytes long.
583
584 If NUM_REGS == 0, then subsequent matches should allocate their own
585 register data.
586
587 Unless this function is called, the first search or match using
588 PATTERN_BUFFER will allocate its own register data, without
589 freeing the old data. */
590
591void
592re_set_registers (bufp, regs, num_regs, starts, ends)
593 struct re_pattern_buffer *bufp;
594 struct re_registers *regs;
595 __re_size_t num_regs;
596 regoff_t *starts, *ends;
597{
598 if (num_regs)
599 {
600 bufp->regs_allocated = REGS_REALLOCATE;
601 regs->num_regs = num_regs;
602 regs->start = starts;
603 regs->end = ends;
604 }
605 else
606 {
607 bufp->regs_allocated = REGS_UNALLOCATED;
608 regs->num_regs = 0;
609 regs->start = regs->end = NULL;
610 }
611}
612#ifdef _LIBC
613weak_alias (__re_set_registers, re_set_registers)
614#endif
615
616
617/* Entry points compatible with 4.2 BSD regex library. We don't define
618 them unless specifically requested. */
619
620#if defined _REGEX_RE_COMP || defined _LIBC
621int
622# ifdef _LIBC
623weak_function
624# endif
625re_exec (s)
626 const char *s;
627{
628 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
629}
630#endif /* _REGEX_RE_COMP */
631
632
633/* Internal entry point. */
634
635/* Searches for a compiled pattern PREG in the string STRING, whose
636 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
637 meaning as with regexec. LAST_START is START + RANGE, where
638 START and RANGE have the same meaning as with re_search.
639 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
640 otherwise return the error code.
641 Note: We assume front end functions already check ranges.
642 (0 <= LAST_START && LAST_START <= LENGTH) */
643
644static reg_errcode_t
645__attribute_warn_unused_result__
646re_search_internal (const regex_t *preg,
647 const char *string, Idx length,
648 Idx start, Idx last_start, Idx stop,
649 size_t nmatch, regmatch_t pmatch[],
650 int eflags)
651{
652 reg_errcode_t err;
653 const re_dfa_t *dfa = preg->buffer;
654 Idx left_lim, right_lim;
655 int incr;
656 bool fl_longest_match;
657 int match_kind;
658 Idx match_first;
659 Idx match_last = REG_MISSING;
660 Idx extra_nmatch;
661 bool sb;
662 int ch;
663#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
664 re_match_context_t mctx = { .dfa = dfa };
665#else
666 re_match_context_t mctx;
667#endif
668 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
669 && start != last_start && !preg->can_be_null)
670 ? preg->fastmap : NULL);
671 RE_TRANSLATE_TYPE t = preg->translate;
672
673#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
674 memset (&mctx, '\0', sizeof (re_match_context_t));
675 mctx.dfa = dfa;
676#endif
677
678 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
679 nmatch -= extra_nmatch;
680
681 /* Check if the DFA haven't been compiled. */
682 if (BE (preg->used == 0 || dfa->init_state == NULL
683 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
684 || dfa->init_state_begbuf == NULL, 0))
685 return REG_NOMATCH;
686
687#ifdef DEBUG
688 /* We assume front-end functions already check them. */
689 assert (0 <= last_start && last_start <= length);
690#endif
691
692 /* If initial states with non-begbuf contexts have no elements,
693 the regex must be anchored. If preg->newline_anchor is set,
694 we'll never use init_state_nl, so do not check it. */
695 if (dfa->init_state->nodes.nelem == 0
696 && dfa->init_state_word->nodes.nelem == 0
697 && (dfa->init_state_nl->nodes.nelem == 0
698 || !preg->newline_anchor))
699 {
700 if (start != 0 && last_start != 0)
701 return REG_NOMATCH;
702 start = last_start = 0;
703 }
704
705 /* We must check the longest matching, if nmatch > 0. */
706 fl_longest_match = (nmatch != 0 || dfa->nbackref);
707
708 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
709 preg->translate, (preg->syntax & RE_ICASE) != 0,
710 dfa);
711 if (BE (err != REG_NOERROR, 0))
712 goto free_return;
713 mctx.input.stop = stop;
714 mctx.input.raw_stop = stop;
715 mctx.input.newline_anchor = preg->newline_anchor;
716
717 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
718 if (BE (err != REG_NOERROR, 0))
719 goto free_return;
720
721 /* We will log all the DFA states through which the dfa pass,
722 if nmatch > 1, or this dfa has "multibyte node", which is a
723 back-reference or a node which can accept multibyte character or
724 multi character collating element. */
725 if (nmatch > 1 || dfa->has_mb_node)
726 {
727 /* Avoid overflow. */
728 if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
729 <= mctx.input.bufs_len), 0))
730 {
731 err = REG_ESPACE;
732 goto free_return;
733 }
734
735 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
736 if (BE (mctx.state_log == NULL, 0))
737 {
738 err = REG_ESPACE;
739 goto free_return;
740 }
741 }
742 else
743 mctx.state_log = NULL;
744
745 match_first = start;
746 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
747 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
748
749 /* Check incrementally whether of not the input string match. */
750 incr = (last_start < start) ? -1 : 1;
751 left_lim = (last_start < start) ? last_start : start;
752 right_lim = (last_start < start) ? start : last_start;
753 sb = dfa->mb_cur_max == 1;
754 match_kind =
755 (fastmap
756 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
757 | (start <= last_start ? 2 : 0)
758 | (t != NULL ? 1 : 0))
759 : 8);
760
761 for (;; match_first += incr)
762 {
763 err = REG_NOMATCH;
764 if (match_first < left_lim || right_lim < match_first)
765 goto free_return;
766
767 /* Advance as rapidly as possible through the string, until we
768 find a plausible place to start matching. This may be done
769 with varying efficiency, so there are various possibilities:
770 only the most common of them are specialized, in order to
771 save on code size. We use a switch statement for speed. */
772 switch (match_kind)
773 {
774 case 8:
775 /* No fastmap. */
776 break;
777
778 case 7:
779 /* Fastmap with single-byte translation, match forward. */
780 while (BE (match_first < right_lim, 1)
781 && !fastmap[t[(unsigned char) string[match_first]]])
782 ++match_first;
783 goto forward_match_found_start_or_reached_end;
784
785 case 6:
786 /* Fastmap without translation, match forward. */
787 while (BE (match_first < right_lim, 1)
788 && !fastmap[(unsigned char) string[match_first]])
789 ++match_first;
790
791 forward_match_found_start_or_reached_end:
792 if (BE (match_first == right_lim, 0))
793 {
794 ch = match_first >= length
795 ? 0 : (unsigned char) string[match_first];
796 if (!fastmap[t ? t[ch] : ch])
797 goto free_return;
798 }
799 break;
800
801 case 4:
802 case 5:
803 /* Fastmap without multi-byte translation, match backwards. */
804 while (match_first >= left_lim)
805 {
806 ch = match_first >= length
807 ? 0 : (unsigned char) string[match_first];
808 if (fastmap[t ? t[ch] : ch])
809 break;
810 --match_first;
811 }
812 if (match_first < left_lim)
813 goto free_return;
814 break;
815
816 default:
817 /* In this case, we can't determine easily the current byte,
818 since it might be a component byte of a multibyte
819 character. Then we use the constructed buffer instead. */
820 for (;;)
821 {
822 /* If MATCH_FIRST is out of the valid range, reconstruct the
823 buffers. */
824 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
825 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
826 {
827 err = re_string_reconstruct (&mctx.input, match_first,
828 eflags);
829 if (BE (err != REG_NOERROR, 0))
830 goto free_return;
831
832 offset = match_first - mctx.input.raw_mbs_idx;
833 }
834 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
835 Note that MATCH_FIRST must not be smaller than 0. */
836 ch = (match_first >= length
837 ? 0 : re_string_byte_at (&mctx.input, offset));
838 if (fastmap[ch])
839 break;
840 match_first += incr;
841 if (match_first < left_lim || match_first > right_lim)
842 {
843 err = REG_NOMATCH;
844 goto free_return;
845 }
846 }
847 break;
848 }
849
850 /* Reconstruct the buffers so that the matcher can assume that
851 the matching starts from the beginning of the buffer. */
852 err = re_string_reconstruct (&mctx.input, match_first, eflags);
853 if (BE (err != REG_NOERROR, 0))
854 goto free_return;
855
856#ifdef RE_ENABLE_I18N
857 /* Don't consider this char as a possible match start if it part,
858 yet isn't the head, of a multibyte character. */
859 if (!sb && !re_string_first_byte (&mctx.input, 0))
860 continue;
861#endif
862
863 /* It seems to be appropriate one, then use the matcher. */
864 /* We assume that the matching starts from 0. */
865 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
866 match_last = check_matching (&mctx, fl_longest_match,
867 start <= last_start ? &match_first : NULL);
868 if (match_last != REG_MISSING)
869 {
870 if (BE (match_last == REG_ERROR, 0))
871 {
872 err = REG_ESPACE;
873 goto free_return;
874 }
875 else
876 {
877 mctx.match_last = match_last;
878 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
879 {
880 re_dfastate_t *pstate = mctx.state_log[match_last];
881 mctx.last_node = check_halt_state_context (&mctx, pstate,
882 match_last);
883 }
884 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
885 || dfa->nbackref)
886 {
887 err = prune_impossible_nodes (&mctx);
888 if (err == REG_NOERROR)
889 break;
890 if (BE (err != REG_NOMATCH, 0))
891 goto free_return;
892 match_last = REG_MISSING;
893 }
894 else
895 break; /* We found a match. */
896 }
897 }
898
899 match_ctx_clean (&mctx);
900 }
901
902#ifdef DEBUG
903 assert (match_last != REG_MISSING);
904 assert (err == REG_NOERROR);
905#endif
906
907 /* Set pmatch[] if we need. */
908 if (nmatch > 0)
909 {
910 Idx reg_idx;
911
912 /* Initialize registers. */
913 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
914 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
915
916 /* Set the points where matching start/end. */
917 pmatch[0].rm_so = 0;
918 pmatch[0].rm_eo = mctx.match_last;
919 /* FIXME: This function should fail if mctx.match_last exceeds
920 the maximum possible regoff_t value. We need a new error
921 code REG_OVERFLOW. */
922
923 if (!preg->no_sub && nmatch > 1)
924 {
925 err = set_regs (preg, &mctx, nmatch, pmatch,
926 dfa->has_plural_match && dfa->nbackref > 0);
927 if (BE (err != REG_NOERROR, 0))
928 goto free_return;
929 }
930
931 /* At last, add the offset to each register, since we slid
932 the buffers so that we could assume that the matching starts
933 from 0. */
934 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
935 if (pmatch[reg_idx].rm_so != -1)
936 {
937#ifdef RE_ENABLE_I18N
938 if (BE (mctx.input.offsets_needed != 0, 0))
939 {
940 pmatch[reg_idx].rm_so =
941 (pmatch[reg_idx].rm_so == mctx.input.valid_len
942 ? mctx.input.valid_raw_len
943 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
944 pmatch[reg_idx].rm_eo =
945 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
946 ? mctx.input.valid_raw_len
947 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
948 }
949#else
950 assert (mctx.input.offsets_needed == 0);
951#endif
952 pmatch[reg_idx].rm_so += match_first;
953 pmatch[reg_idx].rm_eo += match_first;
954 }
955 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
956 {
957 pmatch[nmatch + reg_idx].rm_so = -1;
958 pmatch[nmatch + reg_idx].rm_eo = -1;
959 }
960
961 if (dfa->subexp_map)
962 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
963 if (dfa->subexp_map[reg_idx] != reg_idx)
964 {
965 pmatch[reg_idx + 1].rm_so
966 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
967 pmatch[reg_idx + 1].rm_eo
968 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
969 }
970 }
971
972 free_return:
973 re_free (mctx.state_log);
974 if (dfa->nbackref)
975 match_ctx_free (&mctx);
976 re_string_destruct (&mctx.input);
977 return err;
978}
979
980static reg_errcode_t
981__attribute_warn_unused_result__
982prune_impossible_nodes (re_match_context_t *mctx)
983{
984 const re_dfa_t *const dfa = mctx->dfa;
985 Idx halt_node, match_last;
986 reg_errcode_t ret;
987 re_dfastate_t **sifted_states;
988 re_dfastate_t **lim_states = NULL;
989 re_sift_context_t sctx;
990#ifdef DEBUG
991 assert (mctx->state_log != NULL);
992#endif
993 match_last = mctx->match_last;
994 halt_node = mctx->last_node;
995
996 /* Avoid overflow. */
997 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0))
998 return REG_ESPACE;
999
1000 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
1001 if (BE (sifted_states == NULL, 0))
1002 {
1003 ret = REG_ESPACE;
1004 goto free_return;
1005 }
1006 if (dfa->nbackref)
1007 {
1008 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
1009 if (BE (lim_states == NULL, 0))
1010 {
1011 ret = REG_ESPACE;
1012 goto free_return;
1013 }
1014 while (1)
1015 {
1016 memset (lim_states, '\0',
1017 sizeof (re_dfastate_t *) * (match_last + 1));
1018 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
1019 match_last);
1020 ret = sift_states_backward (mctx, &sctx);
1021 re_node_set_free (&sctx.limits);
1022 if (BE (ret != REG_NOERROR, 0))
1023 goto free_return;
1024 if (sifted_states[0] != NULL || lim_states[0] != NULL)
1025 break;
1026 do
1027 {
1028 --match_last;
1029 if (! REG_VALID_INDEX (match_last))
1030 {
1031 ret = REG_NOMATCH;
1032 goto free_return;
1033 }
1034 } while (mctx->state_log[match_last] == NULL
1035 || !mctx->state_log[match_last]->halt);
1036 halt_node = check_halt_state_context (mctx,
1037 mctx->state_log[match_last],
1038 match_last);
1039 }
1040 ret = merge_state_array (dfa, sifted_states, lim_states,
1041 match_last + 1);
1042 re_free (lim_states);
1043 lim_states = NULL;
1044 if (BE (ret != REG_NOERROR, 0))
1045 goto free_return;
1046 }
1047 else
1048 {
1049 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1050 ret = sift_states_backward (mctx, &sctx);
1051 re_node_set_free (&sctx.limits);
1052 if (BE (ret != REG_NOERROR, 0))
1053 goto free_return;
1054 if (sifted_states[0] == NULL)
1055 {
1056 ret = REG_NOMATCH;
1057 goto free_return;
1058 }
1059 }
1060 re_free (mctx->state_log);
1061 mctx->state_log = sifted_states;
1062 sifted_states = NULL;
1063 mctx->last_node = halt_node;
1064 mctx->match_last = match_last;
1065 ret = REG_NOERROR;
1066 free_return:
1067 re_free (sifted_states);
1068 re_free (lim_states);
1069 return ret;
1070}
1071
1072/* Acquire an initial state and return it.
1073 We must select appropriate initial state depending on the context,
1074 since initial states may have constraints like "\<", "^", etc.. */
1075
1076static inline re_dfastate_t *
1077__attribute ((always_inline)) internal_function
1078acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1079 Idx idx)
1080{
1081 const re_dfa_t *const dfa = mctx->dfa;
1082 if (dfa->init_state->has_constraint)
1083 {
1084 unsigned int context;
1085 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1086 if (IS_WORD_CONTEXT (context))
1087 return dfa->init_state_word;
1088 else if (IS_ORDINARY_CONTEXT (context))
1089 return dfa->init_state;
1090 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1091 return dfa->init_state_begbuf;
1092 else if (IS_NEWLINE_CONTEXT (context))
1093 return dfa->init_state_nl;
1094 else if (IS_BEGBUF_CONTEXT (context))
1095 {
1096 /* It is relatively rare case, then calculate on demand. */
1097 return re_acquire_state_context (err, dfa,
1098 dfa->init_state->entrance_nodes,
1099 context);
1100 }
1101 else
1102 /* Must not happen? */
1103 return dfa->init_state;
1104 }
1105 else
1106 return dfa->init_state;
1107}
1108
1109/* Check whether the regular expression match input string INPUT or not,
1110 and return the index where the matching end. Return REG_MISSING if
1111 there is no match, and return REG_ERROR in case of an error.
1112 FL_LONGEST_MATCH means we want the POSIX longest matching.
1113 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1114 next place where we may want to try matching.
1115 Note that the matcher assumes that the matching starts from the current
1116 index of the buffer. */
1117
1118static Idx
1119internal_function __attribute_warn_unused_result__
1120check_matching (re_match_context_t *mctx, bool fl_longest_match,
1121 Idx *p_match_first)
1122{
1123 const re_dfa_t *const dfa = mctx->dfa;
1124 reg_errcode_t err;
1125 Idx match = 0;
1126 Idx match_last = REG_MISSING;
1127 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1128 re_dfastate_t *cur_state;
1129 bool at_init_state = p_match_first != NULL;
1130 Idx next_start_idx = cur_str_idx;
1131
1132 err = REG_NOERROR;
1133 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1134 /* An initial state must not be NULL (invalid). */
1135 if (BE (cur_state == NULL, 0))
1136 {
1137 assert (err == REG_ESPACE);
1138 return REG_ERROR;
1139 }
1140
1141 if (mctx->state_log != NULL)
1142 {
1143 mctx->state_log[cur_str_idx] = cur_state;
1144
1145 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1146 later. E.g. Processing back references. */
1147 if (BE (dfa->nbackref, 0))
1148 {
1149 at_init_state = false;
1150 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1151 if (BE (err != REG_NOERROR, 0))
1152 return err;
1153
1154 if (cur_state->has_backref)
1155 {
1156 err = transit_state_bkref (mctx, &cur_state->nodes);
1157 if (BE (err != REG_NOERROR, 0))
1158 return err;
1159 }
1160 }
1161 }
1162
1163 /* If the RE accepts NULL string. */
1164 if (BE (cur_state->halt, 0))
1165 {
1166 if (!cur_state->has_constraint
1167 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1168 {
1169 if (!fl_longest_match)
1170 return cur_str_idx;
1171 else
1172 {
1173 match_last = cur_str_idx;
1174 match = 1;
1175 }
1176 }
1177 }
1178
1179 while (!re_string_eoi (&mctx->input))
1180 {
1181 re_dfastate_t *old_state = cur_state;
1182 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1183
1184 if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1185 && mctx->input.bufs_len < mctx->input.len)
1186 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1187 && mctx->input.valid_len < mctx->input.len))
1188 {
1189 err = extend_buffers (mctx);
1190 if (BE (err != REG_NOERROR, 0))
1191 {
1192 assert (err == REG_ESPACE);
1193 return REG_ERROR;
1194 }
1195 }
1196
1197 cur_state = transit_state (&err, mctx, cur_state);
1198 if (mctx->state_log != NULL)
1199 cur_state = merge_state_with_log (&err, mctx, cur_state);
1200
1201 if (cur_state == NULL)
1202 {
1203 /* Reached the invalid state or an error. Try to recover a valid
1204 state using the state log, if available and if we have not
1205 already found a valid (even if not the longest) match. */
1206 if (BE (err != REG_NOERROR, 0))
1207 return REG_ERROR;
1208
1209 if (mctx->state_log == NULL
1210 || (match && !fl_longest_match)
1211 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1212 break;
1213 }
1214
1215 if (BE (at_init_state, 0))
1216 {
1217 if (old_state == cur_state)
1218 next_start_idx = next_char_idx;
1219 else
1220 at_init_state = false;
1221 }
1222
1223 if (cur_state->halt)
1224 {
1225 /* Reached a halt state.
1226 Check the halt state can satisfy the current context. */
1227 if (!cur_state->has_constraint
1228 || check_halt_state_context (mctx, cur_state,
1229 re_string_cur_idx (&mctx->input)))
1230 {
1231 /* We found an appropriate halt state. */
1232 match_last = re_string_cur_idx (&mctx->input);
1233 match = 1;
1234
1235 /* We found a match, do not modify match_first below. */
1236 p_match_first = NULL;
1237 if (!fl_longest_match)
1238 break;
1239 }
1240 }
1241 }
1242
1243 if (p_match_first)
1244 *p_match_first += next_start_idx;
1245
1246 return match_last;
1247}
1248
1249/* Check NODE match the current context. */
1250
1251static bool
1252internal_function
1253check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1254{
1255 re_token_type_t type = dfa->nodes[node].type;
1256 unsigned int constraint = dfa->nodes[node].constraint;
1257 if (type != END_OF_RE)
1258 return false;
1259 if (!constraint)
1260 return true;
1261 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1262 return false;
1263 return true;
1264}
1265
1266/* Check the halt state STATE match the current context.
1267 Return 0 if not match, if the node, STATE has, is a halt node and
1268 match the context, return the node. */
1269
1270static Idx
1271internal_function
1272check_halt_state_context (const re_match_context_t *mctx,
1273 const re_dfastate_t *state, Idx idx)
1274{
1275 Idx i;
1276 unsigned int context;
1277#ifdef DEBUG
1278 assert (state->halt);
1279#endif
1280 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1281 for (i = 0; i < state->nodes.nelem; ++i)
1282 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1283 return state->nodes.elems[i];
1284 return 0;
1285}
1286
1287/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1288 corresponding to the DFA).
1289 Return the destination node, and update EPS_VIA_NODES;
1290 return REG_MISSING in case of errors. */
1291
1292static Idx
1293internal_function
1294proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1295 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1296 struct re_fail_stack_t *fs)
1297{
1298 const re_dfa_t *const dfa = mctx->dfa;
1299 Idx i;
1300 bool ok;
1301 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1302 {
1303 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1304 re_node_set *edests = &dfa->edests[node];
1305 Idx dest_node;
1306 ok = re_node_set_insert (eps_via_nodes, node);
1307 if (BE (! ok, 0))
1308 return REG_ERROR;
1309 /* Pick up a valid destination, or return REG_MISSING if none
1310 is found. */
1311 for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1312 {
1313 Idx candidate = edests->elems[i];
1314 if (!re_node_set_contains (cur_nodes, candidate))
1315 continue;
1316 if (dest_node == REG_MISSING)
1317 dest_node = candidate;
1318
1319 else
1320 {
1321 /* In order to avoid infinite loop like "(a*)*", return the second
1322 epsilon-transition if the first was already considered. */
1323 if (re_node_set_contains (eps_via_nodes, dest_node))
1324 return candidate;
1325
1326 /* Otherwise, push the second epsilon-transition on the fail stack. */
1327 else if (fs != NULL
1328 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1329 eps_via_nodes))
1330 return REG_ERROR;
1331
1332 /* We know we are going to exit. */
1333 break;
1334 }
1335 }
1336 return dest_node;
1337 }
1338 else
1339 {
1340 Idx naccepted = 0;
1341 re_token_type_t type = dfa->nodes[node].type;
1342
1343#ifdef RE_ENABLE_I18N
1344 if (dfa->nodes[node].accept_mb)
1345 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1346 else
1347#endif /* RE_ENABLE_I18N */
1348 if (type == OP_BACK_REF)
1349 {
1350 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1351 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1352 if (fs != NULL)
1353 {
1354 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1355 return REG_MISSING;
1356 else if (naccepted)
1357 {
1358 char *buf = (char *) re_string_get_buffer (&mctx->input);
1359 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1360 naccepted) != 0)
1361 return REG_MISSING;
1362 }
1363 }
1364
1365 if (naccepted == 0)
1366 {
1367 Idx dest_node;
1368 ok = re_node_set_insert (eps_via_nodes, node);
1369 if (BE (! ok, 0))
1370 return REG_ERROR;
1371 dest_node = dfa->edests[node].elems[0];
1372 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1373 dest_node))
1374 return dest_node;
1375 }
1376 }
1377
1378 if (naccepted != 0
1379 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1380 {
1381 Idx dest_node = dfa->nexts[node];
1382 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1383 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1384 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1385 dest_node)))
1386 return REG_MISSING;
1387 re_node_set_empty (eps_via_nodes);
1388 return dest_node;
1389 }
1390 }
1391 return REG_MISSING;
1392}
1393
1394static reg_errcode_t
1395internal_function __attribute_warn_unused_result__
1396push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1397 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1398{
1399 reg_errcode_t err;
1400 Idx num = fs->num++;
1401 if (fs->num == fs->alloc)
1402 {
1403 struct re_fail_stack_ent_t *new_array;
1404 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1405 * fs->alloc * 2));
1406 if (new_array == NULL)
1407 return REG_ESPACE;
1408 fs->alloc *= 2;
1409 fs->stack = new_array;
1410 }
1411 fs->stack[num].idx = str_idx;
1412 fs->stack[num].node = dest_node;
1413 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1414 if (fs->stack[num].regs == NULL)
1415 return REG_ESPACE;
1416 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1417 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1418 return err;
1419}
1420
1421static Idx
1422internal_function
1423pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1424 regmatch_t *regs, re_node_set *eps_via_nodes)
1425{
1426 Idx num = --fs->num;
1427 assert (REG_VALID_INDEX (num));
1428 *pidx = fs->stack[num].idx;
1429 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1430 re_node_set_free (eps_via_nodes);
1431 re_free (fs->stack[num].regs);
1432 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1433 return fs->stack[num].node;
1434}
1435
1436/* Set the positions where the subexpressions are starts/ends to registers
1437 PMATCH.
1438 Note: We assume that pmatch[0] is already set, and
1439 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1440
1441static reg_errcode_t
1442internal_function __attribute_warn_unused_result__
1443set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1444 regmatch_t *pmatch, bool fl_backtrack)
1445{
1446 const re_dfa_t *dfa = preg->buffer;
1447 Idx idx, cur_node;
1448 re_node_set eps_via_nodes;
1449 struct re_fail_stack_t *fs;
1450 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1451 regmatch_t *prev_idx_match;
1452 bool prev_idx_match_malloced = false;
1453
1454#ifdef DEBUG
1455 assert (nmatch > 1);
1456 assert (mctx->state_log != NULL);
1457#endif
1458 if (fl_backtrack)
1459 {
1460 fs = &fs_body;
1461 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1462 if (fs->stack == NULL)
1463 return REG_ESPACE;
1464 }
1465 else
1466 fs = NULL;
1467
1468 cur_node = dfa->init_node;
1469 re_node_set_init_empty (&eps_via_nodes);
1470
1471 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1472 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1473 else
1474 {
1475 prev_idx_match = re_malloc (regmatch_t, nmatch);
1476 if (prev_idx_match == NULL)
1477 {
1478 free_fail_stack_return (fs);
1479 return REG_ESPACE;
1480 }
1481 prev_idx_match_malloced = true;
1482 }
1483 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1484
1485 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1486 {
1487 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1488
1489 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1490 {
1491 Idx reg_idx;
1492 if (fs)
1493 {
1494 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1495 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1496 break;
1497 if (reg_idx == nmatch)
1498 {
1499 re_node_set_free (&eps_via_nodes);
1500 if (prev_idx_match_malloced)
1501 re_free (prev_idx_match);
1502 return free_fail_stack_return (fs);
1503 }
1504 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1505 &eps_via_nodes);
1506 }
1507 else
1508 {
1509 re_node_set_free (&eps_via_nodes);
1510 if (prev_idx_match_malloced)
1511 re_free (prev_idx_match);
1512 return REG_NOERROR;
1513 }
1514 }
1515
1516 /* Proceed to next node. */
1517 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1518 &eps_via_nodes, fs);
1519
1520 if (BE (! REG_VALID_INDEX (cur_node), 0))
1521 {
1522 if (BE (cur_node == REG_ERROR, 0))
1523 {
1524 re_node_set_free (&eps_via_nodes);
1525 if (prev_idx_match_malloced)
1526 re_free (prev_idx_match);
1527 free_fail_stack_return (fs);
1528 return REG_ESPACE;
1529 }
1530 if (fs)
1531 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1532 &eps_via_nodes);
1533 else
1534 {
1535 re_node_set_free (&eps_via_nodes);
1536 if (prev_idx_match_malloced)
1537 re_free (prev_idx_match);
1538 return REG_NOMATCH;
1539 }
1540 }
1541 }
1542 re_node_set_free (&eps_via_nodes);
1543 if (prev_idx_match_malloced)
1544 re_free (prev_idx_match);
1545 return free_fail_stack_return (fs);
1546}
1547
1548static reg_errcode_t
1549internal_function
1550free_fail_stack_return (struct re_fail_stack_t *fs)
1551{
1552 if (fs)
1553 {
1554 Idx fs_idx;
1555 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1556 {
1557 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1558 re_free (fs->stack[fs_idx].regs);
1559 }
1560 re_free (fs->stack);
1561 }
1562 return REG_NOERROR;
1563}
1564
1565static void
1566internal_function
1567update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1568 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1569{
1570 int type = dfa->nodes[cur_node].type;
1571 if (type == OP_OPEN_SUBEXP)
1572 {
1573 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1574
1575 /* We are at the first node of this sub expression. */
1576 if (reg_num < nmatch)
1577 {
1578 pmatch[reg_num].rm_so = cur_idx;
1579 pmatch[reg_num].rm_eo = -1;
1580 }
1581 }
1582 else if (type == OP_CLOSE_SUBEXP)
1583 {
1584 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1585 if (reg_num < nmatch)
1586 {
1587 /* We are at the last node of this sub expression. */
1588 if (pmatch[reg_num].rm_so < cur_idx)
1589 {
1590 pmatch[reg_num].rm_eo = cur_idx;
1591 /* This is a non-empty match or we are not inside an optional
1592 subexpression. Accept this right away. */
1593 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1594 }
1595 else
1596 {
1597 if (dfa->nodes[cur_node].opt_subexp
1598 && prev_idx_match[reg_num].rm_so != -1)
1599 /* We transited through an empty match for an optional
1600 subexpression, like (a?)*, and this is not the subexp's
1601 first match. Copy back the old content of the registers
1602 so that matches of an inner subexpression are undone as
1603 well, like in ((a?))*. */
1604 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1605 else
1606 /* We completed a subexpression, but it may be part of
1607 an optional one, so do not update PREV_IDX_MATCH. */
1608 pmatch[reg_num].rm_eo = cur_idx;
1609 }
1610 }
1611 }
1612}
1613
1614/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1615 and sift the nodes in each states according to the following rules.
1616 Updated state_log will be wrote to STATE_LOG.
1617
1618 Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1619 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1620 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1621 the LAST_NODE, we throw away the node 'a'.
1622 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1623 string 's' and transit to 'b':
1624 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1625 away the node 'a'.
1626 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1627 thrown away, we throw away the node 'a'.
1628 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1629 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1630 node 'a'.
1631 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1632 we throw away the node 'a'. */
1633
1634#define STATE_NODE_CONTAINS(state,node) \
1635 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1636
1637static reg_errcode_t
1638internal_function
1639sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1640{
1641 reg_errcode_t err;
1642 int null_cnt = 0;
1643 Idx str_idx = sctx->last_str_idx;
1644 re_node_set cur_dest;
1645
1646#ifdef DEBUG
1647 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1648#endif
1649
1650 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1651 transit to the last_node and the last_node itself. */
1652 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1653 if (BE (err != REG_NOERROR, 0))
1654 return err;
1655 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1656 if (BE (err != REG_NOERROR, 0))
1657 goto free_return;
1658
1659 /* Then check each states in the state_log. */
1660 while (str_idx > 0)
1661 {
1662 /* Update counters. */
1663 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1664 if (null_cnt > mctx->max_mb_elem_len)
1665 {
1666 memset (sctx->sifted_states, '\0',
1667 sizeof (re_dfastate_t *) * str_idx);
1668 re_node_set_free (&cur_dest);
1669 return REG_NOERROR;
1670 }
1671 re_node_set_empty (&cur_dest);
1672 --str_idx;
1673
1674 if (mctx->state_log[str_idx])
1675 {
1676 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1677 if (BE (err != REG_NOERROR, 0))
1678 goto free_return;
1679 }
1680
1681 /* Add all the nodes which satisfy the following conditions:
1682 - It can epsilon transit to a node in CUR_DEST.
1683 - It is in CUR_SRC.
1684 And update state_log. */
1685 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1686 if (BE (err != REG_NOERROR, 0))
1687 goto free_return;
1688 }
1689 err = REG_NOERROR;
1690 free_return:
1691 re_node_set_free (&cur_dest);
1692 return err;
1693}
1694
1695static reg_errcode_t
1696internal_function __attribute_warn_unused_result__
1697build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1698 Idx str_idx, re_node_set *cur_dest)
1699{
1700 const re_dfa_t *const dfa = mctx->dfa;
1701 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1702 Idx i;
1703
1704 /* Then build the next sifted state.
1705 We build the next sifted state on 'cur_dest', and update
1706 'sifted_states[str_idx]' with 'cur_dest'.
1707 Note:
1708 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1709 'cur_src' points the node_set of the old 'state_log[str_idx]'
1710 (with the epsilon nodes pre-filtered out). */
1711 for (i = 0; i < cur_src->nelem; i++)
1712 {
1713 Idx prev_node = cur_src->elems[i];
1714 int naccepted = 0;
1715 bool ok;
1716
1717#ifdef DEBUG
1718 re_token_type_t type = dfa->nodes[prev_node].type;
1719 assert (!IS_EPSILON_NODE (type));
1720#endif
1721#ifdef RE_ENABLE_I18N
1722 /* If the node may accept "multi byte". */
1723 if (dfa->nodes[prev_node].accept_mb)
1724 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1725 str_idx, sctx->last_str_idx);
1726#endif /* RE_ENABLE_I18N */
1727
1728 /* We don't check backreferences here.
1729 See update_cur_sifted_state(). */
1730 if (!naccepted
1731 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1732 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1733 dfa->nexts[prev_node]))
1734 naccepted = 1;
1735
1736 if (naccepted == 0)
1737 continue;
1738
1739 if (sctx->limits.nelem)
1740 {
1741 Idx to_idx = str_idx + naccepted;
1742 if (check_dst_limits (mctx, &sctx->limits,
1743 dfa->nexts[prev_node], to_idx,
1744 prev_node, str_idx))
1745 continue;
1746 }
1747 ok = re_node_set_insert (cur_dest, prev_node);
1748 if (BE (! ok, 0))
1749 return REG_ESPACE;
1750 }
1751
1752 return REG_NOERROR;
1753}
1754
1755/* Helper functions. */
1756
1757static reg_errcode_t
1758internal_function
1759clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1760{
1761 Idx top = mctx->state_log_top;
1762
1763 if ((next_state_log_idx >= mctx->input.bufs_len
1764 && mctx->input.bufs_len < mctx->input.len)
1765 || (next_state_log_idx >= mctx->input.valid_len
1766 && mctx->input.valid_len < mctx->input.len))
1767 {
1768 reg_errcode_t err;
1769 err = extend_buffers (mctx);
1770 if (BE (err != REG_NOERROR, 0))
1771 return err;
1772 }
1773
1774 if (top < next_state_log_idx)
1775 {
1776 memset (mctx->state_log + top + 1, '\0',
1777 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1778 mctx->state_log_top = next_state_log_idx;
1779 }
1780 return REG_NOERROR;
1781}
1782
1783static reg_errcode_t
1784internal_function
1785merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1786 re_dfastate_t **src, Idx num)
1787{
1788 Idx st_idx;
1789 reg_errcode_t err;
1790 for (st_idx = 0; st_idx < num; ++st_idx)
1791 {
1792 if (dst[st_idx] == NULL)
1793 dst[st_idx] = src[st_idx];
1794 else if (src[st_idx] != NULL)
1795 {
1796 re_node_set merged_set;
1797 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1798 &src[st_idx]->nodes);
1799 if (BE (err != REG_NOERROR, 0))
1800 return err;
1801 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1802 re_node_set_free (&merged_set);
1803 if (BE (err != REG_NOERROR, 0))
1804 return err;
1805 }
1806 }
1807 return REG_NOERROR;
1808}
1809
1810static reg_errcode_t
1811internal_function
1812update_cur_sifted_state (const re_match_context_t *mctx,
1813 re_sift_context_t *sctx, Idx str_idx,
1814 re_node_set *dest_nodes)
1815{
1816 const re_dfa_t *const dfa = mctx->dfa;
1817 reg_errcode_t err = REG_NOERROR;
1818 const re_node_set *candidates;
1819 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1820 : &mctx->state_log[str_idx]->nodes);
1821
1822 if (dest_nodes->nelem == 0)
1823 sctx->sifted_states[str_idx] = NULL;
1824 else
1825 {
1826 if (candidates)
1827 {
1828 /* At first, add the nodes which can epsilon transit to a node in
1829 DEST_NODE. */
1830 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1831 if (BE (err != REG_NOERROR, 0))
1832 return err;
1833
1834 /* Then, check the limitations in the current sift_context. */
1835 if (sctx->limits.nelem)
1836 {
1837 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1838 mctx->bkref_ents, str_idx);
1839 if (BE (err != REG_NOERROR, 0))
1840 return err;
1841 }
1842 }
1843
1844 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1845 if (BE (err != REG_NOERROR, 0))
1846 return err;
1847 }
1848
1849 if (candidates && mctx->state_log[str_idx]->has_backref)
1850 {
1851 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1852 if (BE (err != REG_NOERROR, 0))
1853 return err;
1854 }
1855 return REG_NOERROR;
1856}
1857
1858static reg_errcode_t
1859internal_function __attribute_warn_unused_result__
1860add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1861 const re_node_set *candidates)
1862{
1863 reg_errcode_t err = REG_NOERROR;
1864 Idx i;
1865
1866 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1867 if (BE (err != REG_NOERROR, 0))
1868 return err;
1869
1870 if (!state->inveclosure.alloc)
1871 {
1872 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1873 if (BE (err != REG_NOERROR, 0))
1874 return REG_ESPACE;
1875 for (i = 0; i < dest_nodes->nelem; i++)
1876 {
1877 err = re_node_set_merge (&state->inveclosure,
1878 dfa->inveclosures + dest_nodes->elems[i]);
1879 if (BE (err != REG_NOERROR, 0))
1880 return REG_ESPACE;
1881 }
1882 }
1883 return re_node_set_add_intersect (dest_nodes, candidates,
1884 &state->inveclosure);
1885}
1886
1887static reg_errcode_t
1888internal_function
1889sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1890 const re_node_set *candidates)
1891{
1892 Idx ecl_idx;
1893 reg_errcode_t err;
1894 re_node_set *inv_eclosure = dfa->inveclosures + node;
1895 re_node_set except_nodes;
1896 re_node_set_init_empty (&except_nodes);
1897 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1898 {
1899 Idx cur_node = inv_eclosure->elems[ecl_idx];
1900 if (cur_node == node)
1901 continue;
1902 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1903 {
1904 Idx edst1 = dfa->edests[cur_node].elems[0];
1905 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1906 ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1907 if ((!re_node_set_contains (inv_eclosure, edst1)
1908 && re_node_set_contains (dest_nodes, edst1))
1909 || (REG_VALID_NONZERO_INDEX (edst2)
1910 && !re_node_set_contains (inv_eclosure, edst2)
1911 && re_node_set_contains (dest_nodes, edst2)))
1912 {
1913 err = re_node_set_add_intersect (&except_nodes, candidates,
1914 dfa->inveclosures + cur_node);
1915 if (BE (err != REG_NOERROR, 0))
1916 {
1917 re_node_set_free (&except_nodes);
1918 return err;
1919 }
1920 }
1921 }
1922 }
1923 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1924 {
1925 Idx cur_node = inv_eclosure->elems[ecl_idx];
1926 if (!re_node_set_contains (&except_nodes, cur_node))
1927 {
1928 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1929 re_node_set_remove_at (dest_nodes, idx);
1930 }
1931 }
1932 re_node_set_free (&except_nodes);
1933 return REG_NOERROR;
1934}
1935
1936static bool
1937internal_function
1938check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1939 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1940{
1941 const re_dfa_t *const dfa = mctx->dfa;
1942 Idx lim_idx, src_pos, dst_pos;
1943
1944 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1945 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1946 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1947 {
1948 Idx subexp_idx;
1949 struct re_backref_cache_entry *ent;
1950 ent = mctx->bkref_ents + limits->elems[lim_idx];
1951 subexp_idx = dfa->nodes[ent->node].opr.idx;
1952
1953 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1954 subexp_idx, dst_node, dst_idx,
1955 dst_bkref_idx);
1956 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1957 subexp_idx, src_node, src_idx,
1958 src_bkref_idx);
1959
1960 /* In case of:
1961 <src> <dst> ( <subexp> )
1962 ( <subexp> ) <src> <dst>
1963 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1964 if (src_pos == dst_pos)
1965 continue; /* This is unrelated limitation. */
1966 else
1967 return true;
1968 }
1969 return false;
1970}
1971
1972static int
1973internal_function
1974check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1975 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1976{
1977 const re_dfa_t *const dfa = mctx->dfa;
1978 const re_node_set *eclosures = dfa->eclosures + from_node;
1979 Idx node_idx;
1980
1981 /* Else, we are on the boundary: examine the nodes on the epsilon
1982 closure. */
1983 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1984 {
1985 Idx node = eclosures->elems[node_idx];
1986 switch (dfa->nodes[node].type)
1987 {
1988 case OP_BACK_REF:
1989 if (bkref_idx != REG_MISSING)
1990 {
1991 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1992 do
1993 {
1994 Idx dst;
1995 int cpos;
1996
1997 if (ent->node != node)
1998 continue;
1999
2000 if (subexp_idx < BITSET_WORD_BITS
2001 && !(ent->eps_reachable_subexps_map
2002 & ((bitset_word_t) 1 << subexp_idx)))
2003 continue;
2004
2005 /* Recurse trying to reach the OP_OPEN_SUBEXP and
2006 OP_CLOSE_SUBEXP cases below. But, if the
2007 destination node is the same node as the source
2008 node, don't recurse because it would cause an
2009 infinite loop: a regex that exhibits this behavior
2010 is ()\1*\1* */
2011 dst = dfa->edests[node].elems[0];
2012 if (dst == from_node)
2013 {
2014 if (boundaries & 1)
2015 return -1;
2016 else /* if (boundaries & 2) */
2017 return 0;
2018 }
2019
2020 cpos =
2021 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2022 dst, bkref_idx);
2023 if (cpos == -1 /* && (boundaries & 1) */)
2024 return -1;
2025 if (cpos == 0 && (boundaries & 2))
2026 return 0;
2027
2028 if (subexp_idx < BITSET_WORD_BITS)
2029 ent->eps_reachable_subexps_map
2030 &= ~((bitset_word_t) 1 << subexp_idx);
2031 }
2032 while (ent++->more);
2033 }
2034 break;
2035
2036 case OP_OPEN_SUBEXP:
2037 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2038 return -1;
2039 break;
2040
2041 case OP_CLOSE_SUBEXP:
2042 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2043 return 0;
2044 break;
2045
2046 default:
2047 break;
2048 }
2049 }
2050
2051 return (boundaries & 2) ? 1 : 0;
2052}
2053
2054static int
2055internal_function
2056check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
2057 Idx subexp_idx, Idx from_node, Idx str_idx,
2058 Idx bkref_idx)
2059{
2060 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2061 int boundaries;
2062
2063 /* If we are outside the range of the subexpression, return -1 or 1. */
2064 if (str_idx < lim->subexp_from)
2065 return -1;
2066
2067 if (lim->subexp_to < str_idx)
2068 return 1;
2069
2070 /* If we are within the subexpression, return 0. */
2071 boundaries = (str_idx == lim->subexp_from);
2072 boundaries |= (str_idx == lim->subexp_to) << 1;
2073 if (boundaries == 0)
2074 return 0;
2075
2076 /* Else, examine epsilon closure. */
2077 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2078 from_node, bkref_idx);
2079}
2080
2081/* Check the limitations of sub expressions LIMITS, and remove the nodes
2082 which are against limitations from DEST_NODES. */
2083
2084static reg_errcode_t
2085internal_function
2086check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2087 const re_node_set *candidates, re_node_set *limits,
2088 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2089{
2090 reg_errcode_t err;
2091 Idx node_idx, lim_idx;
2092
2093 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2094 {
2095 Idx subexp_idx;
2096 struct re_backref_cache_entry *ent;
2097 ent = bkref_ents + limits->elems[lim_idx];
2098
2099 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2100 continue; /* This is unrelated limitation. */
2101
2102 subexp_idx = dfa->nodes[ent->node].opr.idx;
2103 if (ent->subexp_to == str_idx)
2104 {
2105 Idx ops_node = REG_MISSING;
2106 Idx cls_node = REG_MISSING;
2107 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2108 {
2109 Idx node = dest_nodes->elems[node_idx];
2110 re_token_type_t type = dfa->nodes[node].type;
2111 if (type == OP_OPEN_SUBEXP
2112 && subexp_idx == dfa->nodes[node].opr.idx)
2113 ops_node = node;
2114 else if (type == OP_CLOSE_SUBEXP
2115 && subexp_idx == dfa->nodes[node].opr.idx)
2116 cls_node = node;
2117 }
2118
2119 /* Check the limitation of the open subexpression. */
2120 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2121 if (REG_VALID_INDEX (ops_node))
2122 {
2123 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2124 candidates);
2125 if (BE (err != REG_NOERROR, 0))
2126 return err;
2127 }
2128
2129 /* Check the limitation of the close subexpression. */
2130 if (REG_VALID_INDEX (cls_node))
2131 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2132 {
2133 Idx node = dest_nodes->elems[node_idx];
2134 if (!re_node_set_contains (dfa->inveclosures + node,
2135 cls_node)
2136 && !re_node_set_contains (dfa->eclosures + node,
2137 cls_node))
2138 {
2139 /* It is against this limitation.
2140 Remove it form the current sifted state. */
2141 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2142 candidates);
2143 if (BE (err != REG_NOERROR, 0))
2144 return err;
2145 --node_idx;
2146 }
2147 }
2148 }
2149 else /* (ent->subexp_to != str_idx) */
2150 {
2151 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2152 {
2153 Idx node = dest_nodes->elems[node_idx];
2154 re_token_type_t type = dfa->nodes[node].type;
2155 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2156 {
2157 if (subexp_idx != dfa->nodes[node].opr.idx)
2158 continue;
2159 /* It is against this limitation.
2160 Remove it form the current sifted state. */
2161 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2162 candidates);
2163 if (BE (err != REG_NOERROR, 0))
2164 return err;
2165 }
2166 }
2167 }
2168 }
2169 return REG_NOERROR;
2170}
2171
2172static reg_errcode_t
2173internal_function __attribute_warn_unused_result__
2174sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2175 Idx str_idx, const re_node_set *candidates)
2176{
2177 const re_dfa_t *const dfa = mctx->dfa;
2178 reg_errcode_t err;
2179 Idx node_idx, node;
2180 re_sift_context_t local_sctx;
2181 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2182
2183 if (first_idx == REG_MISSING)
2184 return REG_NOERROR;
2185
2186 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2187
2188 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2189 {
2190 Idx enabled_idx;
2191 re_token_type_t type;
2192 struct re_backref_cache_entry *entry;
2193 node = candidates->elems[node_idx];
2194 type = dfa->nodes[node].type;
2195 /* Avoid infinite loop for the REs like "()\1+". */
2196 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2197 continue;
2198 if (type != OP_BACK_REF)
2199 continue;
2200
2201 entry = mctx->bkref_ents + first_idx;
2202 enabled_idx = first_idx;
2203 do
2204 {
2205 Idx subexp_len;
2206 Idx to_idx;
2207 Idx dst_node;
2208 bool ok;
2209 re_dfastate_t *cur_state;
2210
2211 if (entry->node != node)
2212 continue;
2213 subexp_len = entry->subexp_to - entry->subexp_from;
2214 to_idx = str_idx + subexp_len;
2215 dst_node = (subexp_len ? dfa->nexts[node]
2216 : dfa->edests[node].elems[0]);
2217
2218 if (to_idx > sctx->last_str_idx
2219 || sctx->sifted_states[to_idx] == NULL
2220 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2221 || check_dst_limits (mctx, &sctx->limits, node,
2222 str_idx, dst_node, to_idx))
2223 continue;
2224
2225 if (local_sctx.sifted_states == NULL)
2226 {
2227 local_sctx = *sctx;
2228 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2229 if (BE (err != REG_NOERROR, 0))
2230 goto free_return;
2231 }
2232 local_sctx.last_node = node;
2233 local_sctx.last_str_idx = str_idx;
2234 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2235 if (BE (! ok, 0))
2236 {
2237 err = REG_ESPACE;
2238 goto free_return;
2239 }
2240 cur_state = local_sctx.sifted_states[str_idx];
2241 err = sift_states_backward (mctx, &local_sctx);
2242 if (BE (err != REG_NOERROR, 0))
2243 goto free_return;
2244 if (sctx->limited_states != NULL)
2245 {
2246 err = merge_state_array (dfa, sctx->limited_states,
2247 local_sctx.sifted_states,
2248 str_idx + 1);
2249 if (BE (err != REG_NOERROR, 0))
2250 goto free_return;
2251 }
2252 local_sctx.sifted_states[str_idx] = cur_state;
2253 re_node_set_remove (&local_sctx.limits, enabled_idx);
2254
2255 /* mctx->bkref_ents may have changed, reload the pointer. */
2256 entry = mctx->bkref_ents + enabled_idx;
2257 }
2258 while (enabled_idx++, entry++->more);
2259 }
2260 err = REG_NOERROR;
2261 free_return:
2262 if (local_sctx.sifted_states != NULL)
2263 {
2264 re_node_set_free (&local_sctx.limits);
2265 }
2266
2267 return err;
2268}
2269
2270
2271#ifdef RE_ENABLE_I18N
2272static int
2273internal_function
2274sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2275 Idx node_idx, Idx str_idx, Idx max_str_idx)
2276{
2277 const re_dfa_t *const dfa = mctx->dfa;
2278 int naccepted;
2279 /* Check the node can accept "multi byte". */
2280 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2281 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2282 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2283 dfa->nexts[node_idx]))
2284 /* The node can't accept the "multi byte", or the
2285 destination was already thrown away, then the node
2286 could't accept the current input "multi byte". */
2287 naccepted = 0;
2288 /* Otherwise, it is sure that the node could accept
2289 'naccepted' bytes input. */
2290 return naccepted;
2291}
2292#endif /* RE_ENABLE_I18N */
2293
2294
2295
2296/* Functions for state transition. */
2297
2298/* Return the next state to which the current state STATE will transit by
2299 accepting the current input byte, and update STATE_LOG if necessary.
2300 If STATE can accept a multibyte char/collating element/back reference
2301 update the destination of STATE_LOG. */
2302
2303static re_dfastate_t *
2304internal_function __attribute_warn_unused_result__
2305transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2306 re_dfastate_t *state)
2307{
2308 re_dfastate_t **trtable;
2309 unsigned char ch;
2310
2311#ifdef RE_ENABLE_I18N
2312 /* If the current state can accept multibyte. */
2313 if (BE (state->accept_mb, 0))
2314 {
2315 *err = transit_state_mb (mctx, state);
2316 if (BE (*err != REG_NOERROR, 0))
2317 return NULL;
2318 }
2319#endif /* RE_ENABLE_I18N */
2320
2321 /* Then decide the next state with the single byte. */
2322#if 0
2323 if (0)
2324 /* don't use transition table */
2325 return transit_state_sb (err, mctx, state);
2326#endif
2327
2328 /* Use transition table */
2329 ch = re_string_fetch_byte (&mctx->input);
2330 for (;;)
2331 {
2332 trtable = state->trtable;
2333 if (BE (trtable != NULL, 1))
2334 return trtable[ch];
2335
2336 trtable = state->word_trtable;
2337 if (BE (trtable != NULL, 1))
2338 {
2339 unsigned int context;
2340 context
2341 = re_string_context_at (&mctx->input,
2342 re_string_cur_idx (&mctx->input) - 1,
2343 mctx->eflags);
2344 if (IS_WORD_CONTEXT (context))
2345 return trtable[ch + SBC_MAX];
2346 else
2347 return trtable[ch];
2348 }
2349
2350 if (!build_trtable (mctx->dfa, state))
2351 {
2352 *err = REG_ESPACE;
2353 return NULL;
2354 }
2355
2356 /* Retry, we now have a transition table. */
2357 }
2358}
2359
2360/* Update the state_log if we need */
2361static re_dfastate_t *
2362internal_function
2363merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2364 re_dfastate_t *next_state)
2365{
2366 const re_dfa_t *const dfa = mctx->dfa;
2367 Idx cur_idx = re_string_cur_idx (&mctx->input);
2368
2369 if (cur_idx > mctx->state_log_top)
2370 {
2371 mctx->state_log[cur_idx] = next_state;
2372 mctx->state_log_top = cur_idx;
2373 }
2374 else if (mctx->state_log[cur_idx] == 0)
2375 {
2376 mctx->state_log[cur_idx] = next_state;
2377 }
2378 else
2379 {
2380 re_dfastate_t *pstate;
2381 unsigned int context;
2382 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2383 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2384 the destination of a multibyte char/collating element/
2385 back reference. Then the next state is the union set of
2386 these destinations and the results of the transition table. */
2387 pstate = mctx->state_log[cur_idx];
2388 log_nodes = pstate->entrance_nodes;
2389 if (next_state != NULL)
2390 {
2391 table_nodes = next_state->entrance_nodes;
2392 *err = re_node_set_init_union (&next_nodes, table_nodes,
2393 log_nodes);
2394 if (BE (*err != REG_NOERROR, 0))
2395 return NULL;
2396 }
2397 else
2398 next_nodes = *log_nodes;
2399 /* Note: We already add the nodes of the initial state,
2400 then we don't need to add them here. */
2401
2402 context = re_string_context_at (&mctx->input,
2403 re_string_cur_idx (&mctx->input) - 1,
2404 mctx->eflags);
2405 next_state = mctx->state_log[cur_idx]
2406 = re_acquire_state_context (err, dfa, &next_nodes, context);
2407 /* We don't need to check errors here, since the return value of
2408 this function is next_state and ERR is already set. */
2409
2410 if (table_nodes != NULL)
2411 re_node_set_free (&next_nodes);
2412 }
2413
2414 if (BE (dfa->nbackref, 0) && next_state != NULL)
2415 {
2416 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2417 later. We must check them here, since the back references in the
2418 next state might use them. */
2419 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2420 cur_idx);
2421 if (BE (*err != REG_NOERROR, 0))
2422 return NULL;
2423
2424 /* If the next state has back references. */
2425 if (next_state->has_backref)
2426 {
2427 *err = transit_state_bkref (mctx, &next_state->nodes);
2428 if (BE (*err != REG_NOERROR, 0))
2429 return NULL;
2430 next_state = mctx->state_log[cur_idx];
2431 }
2432 }
2433
2434 return next_state;
2435}
2436
2437/* Skip bytes in the input that correspond to part of a
2438 multi-byte match, then look in the log for a state
2439 from which to restart matching. */
2440static re_dfastate_t *
2441internal_function
2442find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2443{
2444 re_dfastate_t *cur_state;
2445 do
2446 {
2447 Idx max = mctx->state_log_top;
2448 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2449
2450 do
2451 {
2452 if (++cur_str_idx > max)
2453 return NULL;
2454 re_string_skip_bytes (&mctx->input, 1);
2455 }
2456 while (mctx->state_log[cur_str_idx] == NULL);
2457
2458 cur_state = merge_state_with_log (err, mctx, NULL);
2459 }
2460 while (*err == REG_NOERROR && cur_state == NULL);
2461 return cur_state;
2462}
2463
2464/* Helper functions for transit_state. */
2465
2466/* From the node set CUR_NODES, pick up the nodes whose types are
2467 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2468 expression. And register them to use them later for evaluating the
2469 corresponding back references. */
2470
2471static reg_errcode_t
2472internal_function
2473check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2474 Idx str_idx)
2475{
2476 const re_dfa_t *const dfa = mctx->dfa;
2477 Idx node_idx;
2478 reg_errcode_t err;
2479
2480 /* TODO: This isn't efficient.
2481 Because there might be more than one nodes whose types are
2482 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2483 nodes.
2484 E.g. RE: (a){2} */
2485 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2486 {
2487 Idx node = cur_nodes->elems[node_idx];
2488 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2489 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2490 && (dfa->used_bkref_map
2491 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2492 {
2493 err = match_ctx_add_subtop (mctx, node, str_idx);
2494 if (BE (err != REG_NOERROR, 0))
2495 return err;
2496 }
2497 }
2498 return REG_NOERROR;
2499}
2500
2501#if 0
2502/* Return the next state to which the current state STATE will transit by
2503 accepting the current input byte. */
2504
2505static re_dfastate_t *
2506transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2507 re_dfastate_t *state)
2508{
2509 const re_dfa_t *const dfa = mctx->dfa;
2510 re_node_set next_nodes;
2511 re_dfastate_t *next_state;
2512 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2513 unsigned int context;
2514
2515 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2516 if (BE (*err != REG_NOERROR, 0))
2517 return NULL;
2518 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2519 {
2520 Idx cur_node = state->nodes.elems[node_cnt];
2521 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2522 {
2523 *err = re_node_set_merge (&next_nodes,
2524 dfa->eclosures + dfa->nexts[cur_node]);
2525 if (BE (*err != REG_NOERROR, 0))
2526 {
2527 re_node_set_free (&next_nodes);
2528 return NULL;
2529 }
2530 }
2531 }
2532 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2533 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2534 /* We don't need to check errors here, since the return value of
2535 this function is next_state and ERR is already set. */
2536
2537 re_node_set_free (&next_nodes);
2538 re_string_skip_bytes (&mctx->input, 1);
2539 return next_state;
2540}
2541#endif
2542
2543#ifdef RE_ENABLE_I18N
2544static reg_errcode_t
2545internal_function
2546transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2547{
2548 const re_dfa_t *const dfa = mctx->dfa;
2549 reg_errcode_t err;
2550 Idx i;
2551
2552 for (i = 0; i < pstate->nodes.nelem; ++i)
2553 {
2554 re_node_set dest_nodes, *new_nodes;
2555 Idx cur_node_idx = pstate->nodes.elems[i];
2556 int naccepted;
2557 Idx dest_idx;
2558 unsigned int context;
2559 re_dfastate_t *dest_state;
2560
2561 if (!dfa->nodes[cur_node_idx].accept_mb)
2562 continue;
2563
2564 if (dfa->nodes[cur_node_idx].constraint)
2565 {
2566 context = re_string_context_at (&mctx->input,
2567 re_string_cur_idx (&mctx->input),
2568 mctx->eflags);
2569 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2570 context))
2571 continue;
2572 }
2573
2574 /* How many bytes the node can accept? */
2575 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2576 re_string_cur_idx (&mctx->input));
2577 if (naccepted == 0)
2578 continue;
2579
2580 /* The node can accepts 'naccepted' bytes. */
2581 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2582 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2583 : mctx->max_mb_elem_len);
2584 err = clean_state_log_if_needed (mctx, dest_idx);
2585 if (BE (err != REG_NOERROR, 0))
2586 return err;
2587#ifdef DEBUG
2588 assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2589#endif
2590 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2591
2592 dest_state = mctx->state_log[dest_idx];
2593 if (dest_state == NULL)
2594 dest_nodes = *new_nodes;
2595 else
2596 {
2597 err = re_node_set_init_union (&dest_nodes,
2598 dest_state->entrance_nodes, new_nodes);
2599 if (BE (err != REG_NOERROR, 0))
2600 return err;
2601 }
2602 context = re_string_context_at (&mctx->input, dest_idx - 1,
2603 mctx->eflags);
2604 mctx->state_log[dest_idx]
2605 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2606 if (dest_state != NULL)
2607 re_node_set_free (&dest_nodes);
2608 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2609 return err;
2610 }
2611 return REG_NOERROR;
2612}
2613#endif /* RE_ENABLE_I18N */
2614
2615static reg_errcode_t
2616internal_function
2617transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2618{
2619 const re_dfa_t *const dfa = mctx->dfa;
2620 reg_errcode_t err;
2621 Idx i;
2622 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2623
2624 for (i = 0; i < nodes->nelem; ++i)
2625 {
2626 Idx dest_str_idx, prev_nelem, bkc_idx;
2627 Idx node_idx = nodes->elems[i];
2628 unsigned int context;
2629 const re_token_t *node = dfa->nodes + node_idx;
2630 re_node_set *new_dest_nodes;
2631
2632 /* Check whether 'node' is a backreference or not. */
2633 if (node->type != OP_BACK_REF)
2634 continue;
2635
2636 if (node->constraint)
2637 {
2638 context = re_string_context_at (&mctx->input, cur_str_idx,
2639 mctx->eflags);
2640 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2641 continue;
2642 }
2643
2644 /* 'node' is a backreference.
2645 Check the substring which the substring matched. */
2646 bkc_idx = mctx->nbkref_ents;
2647 err = get_subexp (mctx, node_idx, cur_str_idx);
2648 if (BE (err != REG_NOERROR, 0))
2649 goto free_return;
2650
2651 /* And add the epsilon closures (which is 'new_dest_nodes') of
2652 the backreference to appropriate state_log. */
2653#ifdef DEBUG
2654 assert (dfa->nexts[node_idx] != REG_MISSING);
2655#endif
2656 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2657 {
2658 Idx subexp_len;
2659 re_dfastate_t *dest_state;
2660 struct re_backref_cache_entry *bkref_ent;
2661 bkref_ent = mctx->bkref_ents + bkc_idx;
2662 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2663 continue;
2664 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2665 new_dest_nodes = (subexp_len == 0
2666 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2667 : dfa->eclosures + dfa->nexts[node_idx]);
2668 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2669 - bkref_ent->subexp_from);
2670 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2671 mctx->eflags);
2672 dest_state = mctx->state_log[dest_str_idx];
2673 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2674 : mctx->state_log[cur_str_idx]->nodes.nelem);
2675 /* Add 'new_dest_node' to state_log. */
2676 if (dest_state == NULL)
2677 {
2678 mctx->state_log[dest_str_idx]
2679 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2680 context);
2681 if (BE (mctx->state_log[dest_str_idx] == NULL
2682 && err != REG_NOERROR, 0))
2683 goto free_return;
2684 }
2685 else
2686 {
2687 re_node_set dest_nodes;
2688 err = re_node_set_init_union (&dest_nodes,
2689 dest_state->entrance_nodes,
2690 new_dest_nodes);
2691 if (BE (err != REG_NOERROR, 0))
2692 {
2693 re_node_set_free (&dest_nodes);
2694 goto free_return;
2695 }
2696 mctx->state_log[dest_str_idx]
2697 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2698 re_node_set_free (&dest_nodes);
2699 if (BE (mctx->state_log[dest_str_idx] == NULL
2700 && err != REG_NOERROR, 0))
2701 goto free_return;
2702 }
2703 /* We need to check recursively if the backreference can epsilon
2704 transit. */
2705 if (subexp_len == 0
2706 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2707 {
2708 err = check_subexp_matching_top (mctx, new_dest_nodes,
2709 cur_str_idx);
2710 if (BE (err != REG_NOERROR, 0))
2711 goto free_return;
2712 err = transit_state_bkref (mctx, new_dest_nodes);
2713 if (BE (err != REG_NOERROR, 0))
2714 goto free_return;
2715 }
2716 }
2717 }
2718 err = REG_NOERROR;
2719 free_return:
2720 return err;
2721}
2722
2723/* Enumerate all the candidates which the backreference BKREF_NODE can match
2724 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2725 Note that we might collect inappropriate candidates here.
2726 However, the cost of checking them strictly here is too high, then we
2727 delay these checking for prune_impossible_nodes(). */
2728
2729static reg_errcode_t
2730internal_function __attribute_warn_unused_result__
2731get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2732{
2733 const re_dfa_t *const dfa = mctx->dfa;
2734 Idx subexp_num, sub_top_idx;
2735 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2736 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2737 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2738 if (cache_idx != REG_MISSING)
2739 {
2740 const struct re_backref_cache_entry *entry
2741 = mctx->bkref_ents + cache_idx;
2742 do
2743 if (entry->node == bkref_node)
2744 return REG_NOERROR; /* We already checked it. */
2745 while (entry++->more);
2746 }
2747
2748 subexp_num = dfa->nodes[bkref_node].opr.idx;
2749
2750 /* For each sub expression */
2751 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2752 {
2753 reg_errcode_t err;
2754 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2755 re_sub_match_last_t *sub_last;
2756 Idx sub_last_idx, sl_str, bkref_str_off;
2757
2758 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2759 continue; /* It isn't related. */
2760
2761 sl_str = sub_top->str_idx;
2762 bkref_str_off = bkref_str_idx;
2763 /* At first, check the last node of sub expressions we already
2764 evaluated. */
2765 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2766 {
2767 regoff_t sl_str_diff;
2768 sub_last = sub_top->lasts[sub_last_idx];
2769 sl_str_diff = sub_last->str_idx - sl_str;
2770 /* The matched string by the sub expression match with the substring
2771 at the back reference? */
2772 if (sl_str_diff > 0)
2773 {
2774 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2775 {
2776 /* Not enough chars for a successful match. */
2777 if (bkref_str_off + sl_str_diff > mctx->input.len)
2778 break;
2779
2780 err = clean_state_log_if_needed (mctx,
2781 bkref_str_off
2782 + sl_str_diff);
2783 if (BE (err != REG_NOERROR, 0))
2784 return err;
2785 buf = (const char *) re_string_get_buffer (&mctx->input);
2786 }
2787 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2788 /* We don't need to search this sub expression any more. */
2789 break;
2790 }
2791 bkref_str_off += sl_str_diff;
2792 sl_str += sl_str_diff;
2793 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2794 bkref_str_idx);
2795
2796 /* Reload buf, since the preceding call might have reallocated
2797 the buffer. */
2798 buf = (const char *) re_string_get_buffer (&mctx->input);
2799
2800 if (err == REG_NOMATCH)
2801 continue;
2802 if (BE (err != REG_NOERROR, 0))
2803 return err;
2804 }
2805
2806 if (sub_last_idx < sub_top->nlasts)
2807 continue;
2808 if (sub_last_idx > 0)
2809 ++sl_str;
2810 /* Then, search for the other last nodes of the sub expression. */
2811 for (; sl_str <= bkref_str_idx; ++sl_str)
2812 {
2813 Idx cls_node;
2814 regoff_t sl_str_off;
2815 const re_node_set *nodes;
2816 sl_str_off = sl_str - sub_top->str_idx;
2817 /* The matched string by the sub expression match with the substring
2818 at the back reference? */
2819 if (sl_str_off > 0)
2820 {
2821 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2822 {
2823 /* If we are at the end of the input, we cannot match. */
2824 if (bkref_str_off >= mctx->input.len)
2825 break;
2826
2827 err = extend_buffers (mctx);
2828 if (BE (err != REG_NOERROR, 0))
2829 return err;
2830
2831 buf = (const char *) re_string_get_buffer (&mctx->input);
2832 }
2833 if (buf [bkref_str_off++] != buf[sl_str - 1])
2834 break; /* We don't need to search this sub expression
2835 any more. */
2836 }
2837 if (mctx->state_log[sl_str] == NULL)
2838 continue;
2839 /* Does this state have a ')' of the sub expression? */
2840 nodes = &mctx->state_log[sl_str]->nodes;
2841 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2842 OP_CLOSE_SUBEXP);
2843 if (cls_node == REG_MISSING)
2844 continue; /* No. */
2845 if (sub_top->path == NULL)
2846 {
2847 sub_top->path = calloc (sizeof (state_array_t),
2848 sl_str - sub_top->str_idx + 1);
2849 if (sub_top->path == NULL)
2850 return REG_ESPACE;
2851 }
2852 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2853 in the current context? */
2854 err = check_arrival (mctx, sub_top->path, sub_top->node,
2855 sub_top->str_idx, cls_node, sl_str,
2856 OP_CLOSE_SUBEXP);
2857 if (err == REG_NOMATCH)
2858 continue;
2859 if (BE (err != REG_NOERROR, 0))
2860 return err;
2861 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2862 if (BE (sub_last == NULL, 0))
2863 return REG_ESPACE;
2864 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2865 bkref_str_idx);
2866 if (err == REG_NOMATCH)
2867 continue;
2868 }
2869 }
2870 return REG_NOERROR;
2871}
2872
2873/* Helper functions for get_subexp(). */
2874
2875/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2876 If it can arrive, register the sub expression expressed with SUB_TOP
2877 and SUB_LAST. */
2878
2879static reg_errcode_t
2880internal_function
2881get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2882 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2883{
2884 reg_errcode_t err;
2885 Idx to_idx;
2886 /* Can the subexpression arrive the back reference? */
2887 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2888 sub_last->str_idx, bkref_node, bkref_str,
2889 OP_OPEN_SUBEXP);
2890 if (err != REG_NOERROR)
2891 return err;
2892 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2893 sub_last->str_idx);
2894 if (BE (err != REG_NOERROR, 0))
2895 return err;
2896 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2897 return clean_state_log_if_needed (mctx, to_idx);
2898}
2899
2900/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2901 Search '(' if FL_OPEN, or search ')' otherwise.
2902 TODO: This function isn't efficient...
2903 Because there might be more than one nodes whose types are
2904 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2905 nodes.
2906 E.g. RE: (a){2} */
2907
2908static Idx
2909internal_function
2910find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2911 Idx subexp_idx, int type)
2912{
2913 Idx cls_idx;
2914 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2915 {
2916 Idx cls_node = nodes->elems[cls_idx];
2917 const re_token_t *node = dfa->nodes + cls_node;
2918 if (node->type == type
2919 && node->opr.idx == subexp_idx)
2920 return cls_node;
2921 }
2922 return REG_MISSING;
2923}
2924
2925/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2926 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2927 heavily reused.
2928 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2929
2930static reg_errcode_t
2931internal_function __attribute_warn_unused_result__
2932check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2933 Idx top_str, Idx last_node, Idx last_str, int type)
2934{
2935 const re_dfa_t *const dfa = mctx->dfa;
2936 reg_errcode_t err = REG_NOERROR;
2937 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2938 re_dfastate_t *cur_state = NULL;
2939 re_node_set *cur_nodes, next_nodes;
2940 re_dfastate_t **backup_state_log;
2941 unsigned int context;
2942
2943 subexp_num = dfa->nodes[top_node].opr.idx;
2944 /* Extend the buffer if we need. */
2945 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2946 {
2947 re_dfastate_t **new_array;
2948 Idx old_alloc = path->alloc;
2949 Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2950 Idx new_alloc;
2951 if (BE (IDX_MAX - old_alloc < incr_alloc, 0))
2952 return REG_ESPACE;
2953 new_alloc = old_alloc + incr_alloc;
2954 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2955 return REG_ESPACE;
2956 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2957 if (BE (new_array == NULL, 0))
2958 return REG_ESPACE;
2959 path->array = new_array;
2960 path->alloc = new_alloc;
2961 memset (new_array + old_alloc, '\0',
2962 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2963 }
2964
2965 str_idx = path->next_idx ? path->next_idx : top_str;
2966
2967 /* Temporary modify MCTX. */
2968 backup_state_log = mctx->state_log;
2969 backup_cur_idx = mctx->input.cur_idx;
2970 mctx->state_log = path->array;
2971 mctx->input.cur_idx = str_idx;
2972
2973 /* Setup initial node set. */
2974 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2975 if (str_idx == top_str)
2976 {
2977 err = re_node_set_init_1 (&next_nodes, top_node);
2978 if (BE (err != REG_NOERROR, 0))
2979 return err;
2980 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2981 if (BE (err != REG_NOERROR, 0))
2982 {
2983 re_node_set_free (&next_nodes);
2984 return err;
2985 }
2986 }
2987 else
2988 {
2989 cur_state = mctx->state_log[str_idx];
2990 if (cur_state && cur_state->has_backref)
2991 {
2992 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2993 if (BE (err != REG_NOERROR, 0))
2994 return err;
2995 }
2996 else
2997 re_node_set_init_empty (&next_nodes);
2998 }
2999 if (str_idx == top_str || (cur_state && cur_state->has_backref))
3000 {
3001 if (next_nodes.nelem)
3002 {
3003 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3004 subexp_num, type);
3005 if (BE (err != REG_NOERROR, 0))
3006 {
3007 re_node_set_free (&next_nodes);
3008 return err;
3009 }
3010 }
3011 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3012 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3013 {
3014 re_node_set_free (&next_nodes);
3015 return err;
3016 }
3017 mctx->state_log[str_idx] = cur_state;
3018 }
3019
3020 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
3021 {
3022 re_node_set_empty (&next_nodes);
3023 if (mctx->state_log[str_idx + 1])
3024 {
3025 err = re_node_set_merge (&next_nodes,
3026 &mctx->state_log[str_idx + 1]->nodes);
3027 if (BE (err != REG_NOERROR, 0))
3028 {
3029 re_node_set_free (&next_nodes);
3030 return err;
3031 }
3032 }
3033 if (cur_state)
3034 {
3035 err = check_arrival_add_next_nodes (mctx, str_idx,
3036 &cur_state->non_eps_nodes,
3037 &next_nodes);
3038 if (BE (err != REG_NOERROR, 0))
3039 {
3040 re_node_set_free (&next_nodes);
3041 return err;
3042 }
3043 }
3044 ++str_idx;
3045 if (next_nodes.nelem)
3046 {
3047 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3048 if (BE (err != REG_NOERROR, 0))
3049 {
3050 re_node_set_free (&next_nodes);
3051 return err;
3052 }
3053 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3054 subexp_num, type);
3055 if (BE (err != REG_NOERROR, 0))
3056 {
3057 re_node_set_free (&next_nodes);
3058 return err;
3059 }
3060 }
3061 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3062 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3063 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3064 {
3065 re_node_set_free (&next_nodes);
3066 return err;
3067 }
3068 mctx->state_log[str_idx] = cur_state;
3069 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3070 }
3071 re_node_set_free (&next_nodes);
3072 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3073 : &mctx->state_log[last_str]->nodes);
3074 path->next_idx = str_idx;
3075
3076 /* Fix MCTX. */
3077 mctx->state_log = backup_state_log;
3078 mctx->input.cur_idx = backup_cur_idx;
3079
3080 /* Then check the current node set has the node LAST_NODE. */
3081 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3082 return REG_NOERROR;
3083
3084 return REG_NOMATCH;
3085}
3086
3087/* Helper functions for check_arrival. */
3088
3089/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3090 to NEXT_NODES.
3091 TODO: This function is similar to the functions transit_state*(),
3092 however this function has many additional works.
3093 Can't we unify them? */
3094
3095static reg_errcode_t
3096internal_function __attribute_warn_unused_result__
3097check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3098 re_node_set *cur_nodes, re_node_set *next_nodes)
3099{
3100 const re_dfa_t *const dfa = mctx->dfa;
3101 bool ok;
3102 Idx cur_idx;
3103#ifdef RE_ENABLE_I18N
3104 reg_errcode_t err = REG_NOERROR;
3105#endif
3106 re_node_set union_set;
3107 re_node_set_init_empty (&union_set);
3108 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3109 {
3110 int naccepted = 0;
3111 Idx cur_node = cur_nodes->elems[cur_idx];
3112#ifdef DEBUG
3113 re_token_type_t type = dfa->nodes[cur_node].type;
3114 assert (!IS_EPSILON_NODE (type));
3115#endif
3116#ifdef RE_ENABLE_I18N
3117 /* If the node may accept "multi byte". */
3118 if (dfa->nodes[cur_node].accept_mb)
3119 {
3120 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3121 str_idx);
3122 if (naccepted > 1)
3123 {
3124 re_dfastate_t *dest_state;
3125 Idx next_node = dfa->nexts[cur_node];
3126 Idx next_idx = str_idx + naccepted;
3127 dest_state = mctx->state_log[next_idx];
3128 re_node_set_empty (&union_set);
3129 if (dest_state)
3130 {
3131 err = re_node_set_merge (&union_set, &dest_state->nodes);
3132 if (BE (err != REG_NOERROR, 0))
3133 {
3134 re_node_set_free (&union_set);
3135 return err;
3136 }
3137 }
3138 ok = re_node_set_insert (&union_set, next_node);
3139 if (BE (! ok, 0))
3140 {
3141 re_node_set_free (&union_set);
3142 return REG_ESPACE;
3143 }
3144 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3145 &union_set);
3146 if (BE (mctx->state_log[next_idx] == NULL
3147 && err != REG_NOERROR, 0))
3148 {
3149 re_node_set_free (&union_set);
3150 return err;
3151 }
3152 }
3153 }
3154#endif /* RE_ENABLE_I18N */
3155 if (naccepted
3156 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3157 {
3158 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3159 if (BE (! ok, 0))
3160 {
3161 re_node_set_free (&union_set);
3162 return REG_ESPACE;
3163 }
3164 }
3165 }
3166 re_node_set_free (&union_set);
3167 return REG_NOERROR;
3168}
3169
3170/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3171 CUR_NODES, however exclude the nodes which are:
3172 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3173 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3174*/
3175
3176static reg_errcode_t
3177internal_function
3178check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3179 Idx ex_subexp, int type)
3180{
3181 reg_errcode_t err;
3182 Idx idx, outside_node;
3183 re_node_set new_nodes;
3184#ifdef DEBUG
3185 assert (cur_nodes->nelem);
3186#endif
3187 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3188 if (BE (err != REG_NOERROR, 0))
3189 return err;
3190 /* Create a new node set NEW_NODES with the nodes which are epsilon
3191 closures of the node in CUR_NODES. */
3192
3193 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3194 {
3195 Idx cur_node = cur_nodes->elems[idx];
3196 const re_node_set *eclosure = dfa->eclosures + cur_node;
3197 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3198 if (outside_node == REG_MISSING)
3199 {
3200 /* There are no problematic nodes, just merge them. */
3201 err = re_node_set_merge (&new_nodes, eclosure);
3202 if (BE (err != REG_NOERROR, 0))
3203 {
3204 re_node_set_free (&new_nodes);
3205 return err;
3206 }
3207 }
3208 else
3209 {
3210 /* There are problematic nodes, re-calculate incrementally. */
3211 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3212 ex_subexp, type);
3213 if (BE (err != REG_NOERROR, 0))
3214 {
3215 re_node_set_free (&new_nodes);
3216 return err;
3217 }
3218 }
3219 }
3220 re_node_set_free (cur_nodes);
3221 *cur_nodes = new_nodes;
3222 return REG_NOERROR;
3223}
3224
3225/* Helper function for check_arrival_expand_ecl.
3226 Check incrementally the epsilon closure of TARGET, and if it isn't
3227 problematic append it to DST_NODES. */
3228
3229static reg_errcode_t
3230internal_function __attribute_warn_unused_result__
3231check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3232 Idx target, Idx ex_subexp, int type)
3233{
3234 Idx cur_node;
3235 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3236 {
3237 bool ok;
3238
3239 if (dfa->nodes[cur_node].type == type
3240 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3241 {
3242 if (type == OP_CLOSE_SUBEXP)
3243 {
3244 ok = re_node_set_insert (dst_nodes, cur_node);
3245 if (BE (! ok, 0))
3246 return REG_ESPACE;
3247 }
3248 break;
3249 }
3250 ok = re_node_set_insert (dst_nodes, cur_node);
3251 if (BE (! ok, 0))
3252 return REG_ESPACE;
3253 if (dfa->edests[cur_node].nelem == 0)
3254 break;
3255 if (dfa->edests[cur_node].nelem == 2)
3256 {
3257 reg_errcode_t err;
3258 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3259 dfa->edests[cur_node].elems[1],
3260 ex_subexp, type);
3261 if (BE (err != REG_NOERROR, 0))
3262 return err;
3263 }
3264 cur_node = dfa->edests[cur_node].elems[0];
3265 }
3266 return REG_NOERROR;
3267}
3268
3269
3270/* For all the back references in the current state, calculate the
3271 destination of the back references by the appropriate entry
3272 in MCTX->BKREF_ENTS. */
3273
3274static reg_errcode_t
3275internal_function __attribute_warn_unused_result__
3276expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3277 Idx cur_str, Idx subexp_num, int type)
3278{
3279 const re_dfa_t *const dfa = mctx->dfa;
3280 reg_errcode_t err;
3281 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3282 struct re_backref_cache_entry *ent;
3283
3284 if (cache_idx_start == REG_MISSING)
3285 return REG_NOERROR;
3286
3287 restart:
3288 ent = mctx->bkref_ents + cache_idx_start;
3289 do
3290 {
3291 Idx to_idx, next_node;
3292
3293 /* Is this entry ENT is appropriate? */
3294 if (!re_node_set_contains (cur_nodes, ent->node))
3295 continue; /* No. */
3296
3297 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3298 /* Calculate the destination of the back reference, and append it
3299 to MCTX->STATE_LOG. */
3300 if (to_idx == cur_str)
3301 {
3302 /* The backreference did epsilon transit, we must re-check all the
3303 node in the current state. */
3304 re_node_set new_dests;
3305 reg_errcode_t err2, err3;
3306 next_node = dfa->edests[ent->node].elems[0];
3307 if (re_node_set_contains (cur_nodes, next_node))
3308 continue;
3309 err = re_node_set_init_1 (&new_dests, next_node);
3310 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3311 err3 = re_node_set_merge (cur_nodes, &new_dests);
3312 re_node_set_free (&new_dests);
3313 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3314 || err3 != REG_NOERROR, 0))
3315 {
3316 err = (err != REG_NOERROR ? err
3317 : (err2 != REG_NOERROR ? err2 : err3));
3318 return err;
3319 }
3320 /* TODO: It is still inefficient... */
3321 goto restart;
3322 }
3323 else
3324 {
3325 re_node_set union_set;
3326 next_node = dfa->nexts[ent->node];
3327 if (mctx->state_log[to_idx])
3328 {
3329 bool ok;
3330 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3331 next_node))
3332 continue;
3333 err = re_node_set_init_copy (&union_set,
3334 &mctx->state_log[to_idx]->nodes);
3335 ok = re_node_set_insert (&union_set, next_node);
3336 if (BE (err != REG_NOERROR || ! ok, 0))
3337 {
3338 re_node_set_free (&union_set);
3339 err = err != REG_NOERROR ? err : REG_ESPACE;
3340 return err;
3341 }
3342 }
3343 else
3344 {
3345 err = re_node_set_init_1 (&union_set, next_node);
3346 if (BE (err != REG_NOERROR, 0))
3347 return err;
3348 }
3349 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3350 re_node_set_free (&union_set);
3351 if (BE (mctx->state_log[to_idx] == NULL
3352 && err != REG_NOERROR, 0))
3353 return err;
3354 }
3355 }
3356 while (ent++->more);
3357 return REG_NOERROR;
3358}
3359
3360/* Build transition table for the state.
3361 Return true if successful. */
3362
3363static bool
3364internal_function
3365build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3366{
3367 reg_errcode_t err;
3368 Idx i, j;
3369 int ch;
3370 bool need_word_trtable = false;
3371 bitset_word_t elem, mask;
3372 bool dests_node_malloced = false;
3373 bool dest_states_malloced = false;
3374 Idx ndests; /* Number of the destination states from 'state'. */
3375 re_dfastate_t **trtable;
3376 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3377 re_node_set follows, *dests_node;
3378 bitset_t *dests_ch;
3379 bitset_t acceptable;
3380
3381 struct dests_alloc
3382 {
3383 re_node_set dests_node[SBC_MAX];
3384 bitset_t dests_ch[SBC_MAX];
3385 } *dests_alloc;
3386
3387 /* We build DFA states which corresponds to the destination nodes
3388 from 'state'. 'dests_node[i]' represents the nodes which i-th
3389 destination state contains, and 'dests_ch[i]' represents the
3390 characters which i-th destination state accepts. */
3391 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3392 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3393 else
3394 {
3395 dests_alloc = re_malloc (struct dests_alloc, 1);
3396 if (BE (dests_alloc == NULL, 0))
3397 return false;
3398 dests_node_malloced = true;
3399 }
3400 dests_node = dests_alloc->dests_node;
3401 dests_ch = dests_alloc->dests_ch;
3402
3403 /* Initialize transition table. */
3404 state->word_trtable = state->trtable = NULL;
3405
3406 /* At first, group all nodes belonging to 'state' into several
3407 destinations. */
3408 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3409 if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3410 {
3411 if (dests_node_malloced)
3412 free (dests_alloc);
3413 /* Return false in case of an error, true otherwise. */
3414 if (ndests == 0)
3415 {
3416 state->trtable = (re_dfastate_t **)
3417 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3418 if (BE (state->trtable == NULL, 0))
3419 return false;
3420 return true;
3421 }
3422 return false;
3423 }
3424
3425 err = re_node_set_alloc (&follows, ndests + 1);
3426 if (BE (err != REG_NOERROR, 0))
3427 goto out_free;
3428
3429 /* Avoid arithmetic overflow in size calculation. */
3430 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3431 / (3 * sizeof (re_dfastate_t *)))
3432 < ndests),
3433 0))
3434 goto out_free;
3435
3436 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3437 + ndests * 3 * sizeof (re_dfastate_t *)))
3438 dest_states = (re_dfastate_t **)
3439 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3440 else
3441 {
3442 dest_states = (re_dfastate_t **)
3443 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3444 if (BE (dest_states == NULL, 0))
3445 {
3446out_free:
3447 if (dest_states_malloced)
3448 free (dest_states);
3449 re_node_set_free (&follows);
3450 for (i = 0; i < ndests; ++i)
3451 re_node_set_free (dests_node + i);
3452 if (dests_node_malloced)
3453 free (dests_alloc);
3454 return false;
3455 }
3456 dest_states_malloced = true;
3457 }
3458 dest_states_word = dest_states + ndests;
3459 dest_states_nl = dest_states_word + ndests;
3460 bitset_empty (acceptable);
3461
3462 /* Then build the states for all destinations. */
3463 for (i = 0; i < ndests; ++i)
3464 {
3465 Idx next_node;
3466 re_node_set_empty (&follows);
3467 /* Merge the follows of this destination states. */
3468 for (j = 0; j < dests_node[i].nelem; ++j)
3469 {
3470 next_node = dfa->nexts[dests_node[i].elems[j]];
3471 if (next_node != REG_MISSING)
3472 {
3473 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3474 if (BE (err != REG_NOERROR, 0))
3475 goto out_free;
3476 }
3477 }
3478 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3479 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3480 goto out_free;
3481 /* If the new state has context constraint,
3482 build appropriate states for these contexts. */
3483 if (dest_states[i]->has_constraint)
3484 {
3485 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3486 CONTEXT_WORD);
3487 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3488 goto out_free;
3489
3490 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3491 need_word_trtable = true;
3492
3493 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3494 CONTEXT_NEWLINE);
3495 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3496 goto out_free;
3497 }
3498 else
3499 {
3500 dest_states_word[i] = dest_states[i];
3501 dest_states_nl[i] = dest_states[i];
3502 }
3503 bitset_merge (acceptable, dests_ch[i]);
3504 }
3505
3506 if (!BE (need_word_trtable, 0))
3507 {
3508 /* We don't care about whether the following character is a word
3509 character, or we are in a single-byte character set so we can
3510 discern by looking at the character code: allocate a
3511 256-entry transition table. */
3512 trtable = state->trtable =
3513 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3514 if (BE (trtable == NULL, 0))
3515 goto out_free;
3516
3517 /* For all characters ch...: */
3518 for (i = 0; i < BITSET_WORDS; ++i)
3519 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3520 elem;
3521 mask <<= 1, elem >>= 1, ++ch)
3522 if (BE (elem & 1, 0))
3523 {
3524 /* There must be exactly one destination which accepts
3525 character ch. See group_nodes_into_DFAstates. */
3526 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3527 ;
3528
3529 /* j-th destination accepts the word character ch. */
3530 if (dfa->word_char[i] & mask)
3531 trtable[ch] = dest_states_word[j];
3532 else
3533 trtable[ch] = dest_states[j];
3534 }
3535 }
3536 else
3537 {
3538 /* We care about whether the following character is a word
3539 character, and we are in a multi-byte character set: discern
3540 by looking at the character code: build two 256-entry
3541 transition tables, one starting at trtable[0] and one
3542 starting at trtable[SBC_MAX]. */
3543 trtable = state->word_trtable =
3544 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3545 if (BE (trtable == NULL, 0))
3546 goto out_free;
3547
3548 /* For all characters ch...: */
3549 for (i = 0; i < BITSET_WORDS; ++i)
3550 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3551 elem;
3552 mask <<= 1, elem >>= 1, ++ch)
3553 if (BE (elem & 1, 0))
3554 {
3555 /* There must be exactly one destination which accepts
3556 character ch. See group_nodes_into_DFAstates. */
3557 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3558 ;
3559
3560 /* j-th destination accepts the word character ch. */
3561 trtable[ch] = dest_states[j];
3562 trtable[ch + SBC_MAX] = dest_states_word[j];
3563 }
3564 }
3565
3566 /* new line */
3567 if (bitset_contain (acceptable, NEWLINE_CHAR))
3568 {
3569 /* The current state accepts newline character. */
3570 for (j = 0; j < ndests; ++j)
3571 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3572 {
3573 /* k-th destination accepts newline character. */
3574 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3575 if (need_word_trtable)
3576 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3577 /* There must be only one destination which accepts
3578 newline. See group_nodes_into_DFAstates. */
3579 break;
3580 }
3581 }
3582
3583 if (dest_states_malloced)
3584 free (dest_states);
3585
3586 re_node_set_free (&follows);
3587 for (i = 0; i < ndests; ++i)
3588 re_node_set_free (dests_node + i);
3589
3590 if (dests_node_malloced)
3591 free (dests_alloc);
3592
3593 return true;
3594}
3595
3596/* Group all nodes belonging to STATE into several destinations.
3597 Then for all destinations, set the nodes belonging to the destination
3598 to DESTS_NODE[i] and set the characters accepted by the destination
3599 to DEST_CH[i]. This function return the number of destinations. */
3600
3601static Idx
3602internal_function
3603group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3604 re_node_set *dests_node, bitset_t *dests_ch)
3605{
3606 reg_errcode_t err;
3607 bool ok;
3608 Idx i, j, k;
3609 Idx ndests; /* Number of the destinations from 'state'. */
3610 bitset_t accepts; /* Characters a node can accept. */
3611 const re_node_set *cur_nodes = &state->nodes;
3612 bitset_empty (accepts);
3613 ndests = 0;
3614
3615 /* For all the nodes belonging to 'state', */
3616 for (i = 0; i < cur_nodes->nelem; ++i)
3617 {
3618 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3619 re_token_type_t type = node->type;
3620 unsigned int constraint = node->constraint;
3621
3622 /* Enumerate all single byte character this node can accept. */
3623 if (type == CHARACTER)
3624 bitset_set (accepts, node->opr.c);
3625 else if (type == SIMPLE_BRACKET)
3626 {
3627 bitset_merge (accepts, node->opr.sbcset);
3628 }
3629 else if (type == OP_PERIOD)
3630 {
3631#ifdef RE_ENABLE_I18N
3632 if (dfa->mb_cur_max > 1)
3633 bitset_merge (accepts, dfa->sb_char);
3634 else
3635#endif
3636 bitset_set_all (accepts);
3637 if (!(dfa->syntax & RE_DOT_NEWLINE))
3638 bitset_clear (accepts, '\n');
3639 if (dfa->syntax & RE_DOT_NOT_NULL)
3640 bitset_clear (accepts, '\0');
3641 }
3642#ifdef RE_ENABLE_I18N
3643 else if (type == OP_UTF8_PERIOD)
3644 {
3645 if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3646 memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3647 else
3648 bitset_merge (accepts, utf8_sb_map);
3649 if (!(dfa->syntax & RE_DOT_NEWLINE))
3650 bitset_clear (accepts, '\n');
3651 if (dfa->syntax & RE_DOT_NOT_NULL)
3652 bitset_clear (accepts, '\0');
3653 }
3654#endif
3655 else
3656 continue;
3657
3658 /* Check the 'accepts' and sift the characters which are not
3659 match it the context. */
3660 if (constraint)
3661 {
3662 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3663 {
3664 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3665 bitset_empty (accepts);
3666 if (accepts_newline)
3667 bitset_set (accepts, NEWLINE_CHAR);
3668 else
3669 continue;
3670 }
3671 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3672 {
3673 bitset_empty (accepts);
3674 continue;
3675 }
3676
3677 if (constraint & NEXT_WORD_CONSTRAINT)
3678 {
3679 bitset_word_t any_set = 0;
3680 if (type == CHARACTER && !node->word_char)
3681 {
3682 bitset_empty (accepts);
3683 continue;
3684 }
3685#ifdef RE_ENABLE_I18N
3686 if (dfa->mb_cur_max > 1)
3687 for (j = 0; j < BITSET_WORDS; ++j)
3688 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3689 else
3690#endif
3691 for (j = 0; j < BITSET_WORDS; ++j)
3692 any_set |= (accepts[j] &= dfa->word_char[j]);
3693 if (!any_set)
3694 continue;
3695 }
3696 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3697 {
3698 bitset_word_t any_set = 0;
3699 if (type == CHARACTER && node->word_char)
3700 {
3701 bitset_empty (accepts);
3702 continue;
3703 }
3704#ifdef RE_ENABLE_I18N
3705 if (dfa->mb_cur_max > 1)
3706 for (j = 0; j < BITSET_WORDS; ++j)
3707 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3708 else
3709#endif
3710 for (j = 0; j < BITSET_WORDS; ++j)
3711 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3712 if (!any_set)
3713 continue;
3714 }
3715 }
3716
3717 /* Then divide 'accepts' into DFA states, or create a new
3718 state. Above, we make sure that accepts is not empty. */
3719 for (j = 0; j < ndests; ++j)
3720 {
3721 bitset_t intersec; /* Intersection sets, see below. */
3722 bitset_t remains;
3723 /* Flags, see below. */
3724 bitset_word_t has_intersec, not_subset, not_consumed;
3725
3726 /* Optimization, skip if this state doesn't accept the character. */
3727 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3728 continue;
3729
3730 /* Enumerate the intersection set of this state and 'accepts'. */
3731 has_intersec = 0;
3732 for (k = 0; k < BITSET_WORDS; ++k)
3733 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3734 /* And skip if the intersection set is empty. */
3735 if (!has_intersec)
3736 continue;
3737
3738 /* Then check if this state is a subset of 'accepts'. */
3739 not_subset = not_consumed = 0;
3740 for (k = 0; k < BITSET_WORDS; ++k)
3741 {
3742 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3743 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3744 }
3745
3746 /* If this state isn't a subset of 'accepts', create a
3747 new group state, which has the 'remains'. */
3748 if (not_subset)
3749 {
3750 bitset_copy (dests_ch[ndests], remains);
3751 bitset_copy (dests_ch[j], intersec);
3752 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3753 if (BE (err != REG_NOERROR, 0))
3754 goto error_return;
3755 ++ndests;
3756 }
3757
3758 /* Put the position in the current group. */
3759 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3760 if (BE (! ok, 0))
3761 goto error_return;
3762
3763 /* If all characters are consumed, go to next node. */
3764 if (!not_consumed)
3765 break;
3766 }
3767 /* Some characters remain, create a new group. */
3768 if (j == ndests)
3769 {
3770 bitset_copy (dests_ch[ndests], accepts);
3771 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3772 if (BE (err != REG_NOERROR, 0))
3773 goto error_return;
3774 ++ndests;
3775 bitset_empty (accepts);
3776 }
3777 }
3778 return ndests;
3779 error_return:
3780 for (j = 0; j < ndests; ++j)
3781 re_node_set_free (dests_node + j);
3782 return REG_MISSING;
3783}
3784
3785#ifdef RE_ENABLE_I18N
3786/* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3787 Return the number of the bytes the node accepts.
3788 STR_IDX is the current index of the input string.
3789
3790 This function handles the nodes which can accept one character, or
3791 one collating element like '.', '[a-z]', opposite to the other nodes
3792 can only accept one byte. */
3793
3794static int
3795internal_function
3796check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3797 const re_string_t *input, Idx str_idx)
3798{
3799 const re_token_t *node = dfa->nodes + node_idx;
3800 int char_len, elem_len;
3801 Idx i;
3802
3803 if (BE (node->type == OP_UTF8_PERIOD, 0))
3804 {
3805 unsigned char c = re_string_byte_at (input, str_idx), d;
3806 if (BE (c < 0xc2, 1))
3807 return 0;
3808
3809 if (str_idx + 2 > input->len)
3810 return 0;
3811
3812 d = re_string_byte_at (input, str_idx + 1);
3813 if (c < 0xe0)
3814 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3815 else if (c < 0xf0)
3816 {
3817 char_len = 3;
3818 if (c == 0xe0 && d < 0xa0)
3819 return 0;
3820 }
3821 else if (c < 0xf8)
3822 {
3823 char_len = 4;
3824 if (c == 0xf0 && d < 0x90)
3825 return 0;
3826 }
3827 else if (c < 0xfc)
3828 {
3829 char_len = 5;
3830 if (c == 0xf8 && d < 0x88)
3831 return 0;
3832 }
3833 else if (c < 0xfe)
3834 {
3835 char_len = 6;
3836 if (c == 0xfc && d < 0x84)
3837 return 0;
3838 }
3839 else
3840 return 0;
3841
3842 if (str_idx + char_len > input->len)
3843 return 0;
3844
3845 for (i = 1; i < char_len; ++i)
3846 {
3847 d = re_string_byte_at (input, str_idx + i);
3848 if (d < 0x80 || d > 0xbf)
3849 return 0;
3850 }
3851 return char_len;
3852 }
3853
3854 char_len = re_string_char_size_at (input, str_idx);
3855 if (node->type == OP_PERIOD)
3856 {
3857 if (char_len <= 1)
3858 return 0;
3859 /* FIXME: I don't think this if is needed, as both '\n'
3860 and '\0' are char_len == 1. */
3861 /* '.' accepts any one character except the following two cases. */
3862 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3863 re_string_byte_at (input, str_idx) == '\n') ||
3864 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3865 re_string_byte_at (input, str_idx) == '\0'))
3866 return 0;
3867 return char_len;
3868 }
3869
3870 elem_len = re_string_elem_size_at (input, str_idx);
3871 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3872 return 0;
3873
3874 if (node->type == COMPLEX_BRACKET)
3875 {
3876 const re_charset_t *cset = node->opr.mbcset;
3877# ifdef _LIBC
3878 const unsigned char *pin
3879 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3880 Idx j;
3881 uint32_t nrules;
3882# endif /* _LIBC */
3883 int match_len = 0;
3884 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3885 ? re_string_wchar_at (input, str_idx) : 0);
3886
3887 /* match with multibyte character? */
3888 for (i = 0; i < cset->nmbchars; ++i)
3889 if (wc == cset->mbchars[i])
3890 {
3891 match_len = char_len;
3892 goto check_node_accept_bytes_match;
3893 }
3894 /* match with character_class? */
3895 for (i = 0; i < cset->nchar_classes; ++i)
3896 {
3897 wctype_t wt = cset->char_classes[i];
3898 if (__iswctype (wc, wt))
3899 {
3900 match_len = char_len;
3901 goto check_node_accept_bytes_match;
3902 }
3903 }
3904
3905# ifdef _LIBC
3906 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3907 if (nrules != 0)
3908 {
3909 unsigned int in_collseq = 0;
3910 const int32_t *table, *indirect;
3911 const unsigned char *weights, *extra;
3912 const char *collseqwc;
3913 /* This #include defines a local function! */
3914# include <locale/weight.h>
3915
3916 /* match with collating_symbol? */
3917 if (cset->ncoll_syms)
3918 extra = (const unsigned char *)
3919 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3920 for (i = 0; i < cset->ncoll_syms; ++i)
3921 {
3922 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3923 /* Compare the length of input collating element and
3924 the length of current collating element. */
3925 if (*coll_sym != elem_len)
3926 continue;
3927 /* Compare each bytes. */
3928 for (j = 0; j < *coll_sym; j++)
3929 if (pin[j] != coll_sym[1 + j])
3930 break;
3931 if (j == *coll_sym)
3932 {
3933 /* Match if every bytes is equal. */
3934 match_len = j;
3935 goto check_node_accept_bytes_match;
3936 }
3937 }
3938
3939 if (cset->nranges)
3940 {
3941 if (elem_len <= char_len)
3942 {
3943 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3944 in_collseq = __collseq_table_lookup (collseqwc, wc);
3945 }
3946 else
3947 in_collseq = find_collation_sequence_value (pin, elem_len);
3948 }
3949 /* match with range expression? */
3950 for (i = 0; i < cset->nranges; ++i)
3951 if (cset->range_starts[i] <= in_collseq
3952 && in_collseq <= cset->range_ends[i])
3953 {
3954 match_len = elem_len;
3955 goto check_node_accept_bytes_match;
3956 }
3957
3958 /* match with equivalence_class? */
3959 if (cset->nequiv_classes)
3960 {
3961 const unsigned char *cp = pin;
3962 table = (const int32_t *)
3963 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3964 weights = (const unsigned char *)
3965 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3966 extra = (const unsigned char *)
3967 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3968 indirect = (const int32_t *)
3969 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3970 int32_t idx = findidx (&cp, elem_len);
3971 if (idx > 0)
3972 for (i = 0; i < cset->nequiv_classes; ++i)
3973 {
3974 int32_t equiv_class_idx = cset->equiv_classes[i];
3975 size_t weight_len = weights[idx & 0xffffff];
3976 if (weight_len == weights[equiv_class_idx & 0xffffff]
3977 && (idx >> 24) == (equiv_class_idx >> 24))
3978 {
3979 Idx cnt = 0;
3980
3981 idx &= 0xffffff;
3982 equiv_class_idx &= 0xffffff;
3983
3984 while (cnt <= weight_len
3985 && (weights[equiv_class_idx + 1 + cnt]
3986 == weights[idx + 1 + cnt]))
3987 ++cnt;
3988 if (cnt > weight_len)
3989 {
3990 match_len = elem_len;
3991 goto check_node_accept_bytes_match;
3992 }
3993 }
3994 }
3995 }
3996 }
3997 else
3998# endif /* _LIBC */
3999 {
4000 /* match with range expression? */
4001#if __GNUC__ >= 2 && ! (__STDC_VERSION__ < 199901L && defined __STRICT_ANSI__)
4002 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
4003#else
4004 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
4005 cmp_buf[2] = wc;
4006#endif
4007 for (i = 0; i < cset->nranges; ++i)
4008 {
4009 cmp_buf[0] = cset->range_starts[i];
4010 cmp_buf[4] = cset->range_ends[i];
4011 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
4012 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
4013 {
4014 match_len = char_len;
4015 goto check_node_accept_bytes_match;
4016 }
4017 }
4018 }
4019 check_node_accept_bytes_match:
4020 if (!cset->non_match)
4021 return match_len;
4022 else
4023 {
4024 if (match_len > 0)
4025 return 0;
4026 else
4027 return (elem_len > char_len) ? elem_len : char_len;
4028 }
4029 }
4030 return 0;
4031}
4032
4033# ifdef _LIBC
4034static unsigned int
4035internal_function
4036find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
4037{
4038 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4039 if (nrules == 0)
4040 {
4041 if (mbs_len == 1)
4042 {
4043 /* No valid character. Match it as a single byte character. */
4044 const unsigned char *collseq = (const unsigned char *)
4045 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4046 return collseq[mbs[0]];
4047 }
4048 return UINT_MAX;
4049 }
4050 else
4051 {
4052 int32_t idx;
4053 const unsigned char *extra = (const unsigned char *)
4054 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4055 int32_t extrasize = (const unsigned char *)
4056 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4057
4058 for (idx = 0; idx < extrasize;)
4059 {
4060 int mbs_cnt;
4061 bool found = false;
4062 int32_t elem_mbs_len;
4063 /* Skip the name of collating element name. */
4064 idx = idx + extra[idx] + 1;
4065 elem_mbs_len = extra[idx++];
4066 if (mbs_len == elem_mbs_len)
4067 {
4068 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4069 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4070 break;
4071 if (mbs_cnt == elem_mbs_len)
4072 /* Found the entry. */
4073 found = true;
4074 }
4075 /* Skip the byte sequence of the collating element. */
4076 idx += elem_mbs_len;
4077 /* Adjust for the alignment. */
4078 idx = (idx + 3) & ~3;
4079 /* Skip the collation sequence value. */
4080 idx += sizeof (uint32_t);
4081 /* Skip the wide char sequence of the collating element. */
4082 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4083 /* If we found the entry, return the sequence value. */
4084 if (found)
4085 return *(uint32_t *) (extra + idx);
4086 /* Skip the collation sequence value. */
4087 idx += sizeof (uint32_t);
4088 }
4089 return UINT_MAX;
4090 }
4091}
4092# endif /* _LIBC */
4093#endif /* RE_ENABLE_I18N */
4094
4095/* Check whether the node accepts the byte which is IDX-th
4096 byte of the INPUT. */
4097
4098static bool
4099internal_function
4100check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4101 Idx idx)
4102{
4103 unsigned char ch;
4104 ch = re_string_byte_at (&mctx->input, idx);
4105 switch (node->type)
4106 {
4107 case CHARACTER:
4108 if (node->opr.c != ch)
4109 return false;
4110 break;
4111
4112 case SIMPLE_BRACKET:
4113 if (!bitset_contain (node->opr.sbcset, ch))
4114 return false;
4115 break;
4116
4117#ifdef RE_ENABLE_I18N
4118 case OP_UTF8_PERIOD:
4119 if (ch >= ASCII_CHARS)
4120 return false;
4121 /* FALLTHROUGH */
4122#endif
4123 case OP_PERIOD:
4124 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4125 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4126 return false;
4127 break;
4128
4129 default:
4130 return false;
4131 }
4132
4133 if (node->constraint)
4134 {
4135 /* The node has constraints. Check whether the current context
4136 satisfies the constraints. */
4137 unsigned int context = re_string_context_at (&mctx->input, idx,
4138 mctx->eflags);
4139 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4140 return false;
4141 }
4142
4143 return true;
4144}
4145
4146/* Extend the buffers, if the buffers have run out. */
4147
4148static reg_errcode_t
4149internal_function __attribute_warn_unused_result__
4150extend_buffers (re_match_context_t *mctx)
4151{
4152 reg_errcode_t ret;
4153 re_string_t *pstr = &mctx->input;
4154
4155 /* Avoid overflow. */
4156 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4157 <= pstr->bufs_len, 0))
4158 return REG_ESPACE;
4159
4160 /* Double the lengths of the buffers. */
4161 ret = re_string_realloc_buffers (pstr, MIN (pstr->len, pstr->bufs_len * 2));
4162 if (BE (ret != REG_NOERROR, 0))
4163 return ret;
4164
4165 if (mctx->state_log != NULL)
4166 {
4167 /* And double the length of state_log. */
4168 /* XXX We have no indication of the size of this buffer. If this
4169 allocation fail we have no indication that the state_log array
4170 does not have the right size. */
4171 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4172 pstr->bufs_len + 1);
4173 if (BE (new_array == NULL, 0))
4174 return REG_ESPACE;
4175 mctx->state_log = new_array;
4176 }
4177
4178 /* Then reconstruct the buffers. */
4179 if (pstr->icase)
4180 {
4181#ifdef RE_ENABLE_I18N
4182 if (pstr->mb_cur_max > 1)
4183 {
4184 ret = build_wcs_upper_buffer (pstr);
4185 if (BE (ret != REG_NOERROR, 0))
4186 return ret;
4187 }
4188 else
4189#endif /* RE_ENABLE_I18N */
4190 build_upper_buffer (pstr);
4191 }
4192 else
4193 {
4194#ifdef RE_ENABLE_I18N
4195 if (pstr->mb_cur_max > 1)
4196 build_wcs_buffer (pstr);
4197 else
4198#endif /* RE_ENABLE_I18N */
4199 {
4200 if (pstr->trans != NULL)
4201 re_string_translate_buffer (pstr);
4202 }
4203 }
4204 return REG_NOERROR;
4205}
4206
4207
4208
4209/* Functions for matching context. */
4210
4211/* Initialize MCTX. */
4212
4213static reg_errcode_t
4214internal_function __attribute_warn_unused_result__
4215match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4216{
4217 mctx->eflags = eflags;
4218 mctx->match_last = REG_MISSING;
4219 if (n > 0)
4220 {
4221 /* Avoid overflow. */
4222 size_t max_object_size =
4223 MAX (sizeof (struct re_backref_cache_entry),
4224 sizeof (re_sub_match_top_t *));
4225 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0))
4226 return REG_ESPACE;
4227
4228 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4229 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4230 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4231 return REG_ESPACE;
4232 }
4233 /* Already zero-ed by the caller.
4234 else
4235 mctx->bkref_ents = NULL;
4236 mctx->nbkref_ents = 0;
4237 mctx->nsub_tops = 0; */
4238 mctx->abkref_ents = n;
4239 mctx->max_mb_elem_len = 1;
4240 mctx->asub_tops = n;
4241 return REG_NOERROR;
4242}
4243
4244/* Clean the entries which depend on the current input in MCTX.
4245 This function must be invoked when the matcher changes the start index
4246 of the input, or changes the input string. */
4247
4248static void
4249internal_function
4250match_ctx_clean (re_match_context_t *mctx)
4251{
4252 Idx st_idx;
4253 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4254 {
4255 Idx sl_idx;
4256 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4257 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4258 {
4259 re_sub_match_last_t *last = top->lasts[sl_idx];
4260 re_free (last->path.array);
4261 re_free (last);
4262 }
4263 re_free (top->lasts);
4264 if (top->path)
4265 {
4266 re_free (top->path->array);
4267 re_free (top->path);
4268 }
4269 free (top);
4270 }
4271
4272 mctx->nsub_tops = 0;
4273 mctx->nbkref_ents = 0;
4274}
4275
4276/* Free all the memory associated with MCTX. */
4277
4278static void
4279internal_function
4280match_ctx_free (re_match_context_t *mctx)
4281{
4282 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4283 match_ctx_clean (mctx);
4284 re_free (mctx->sub_tops);
4285 re_free (mctx->bkref_ents);
4286}
4287
4288/* Add a new backreference entry to MCTX.
4289 Note that we assume that caller never call this function with duplicate
4290 entry, and call with STR_IDX which isn't smaller than any existing entry.
4291*/
4292
4293static reg_errcode_t
4294internal_function __attribute_warn_unused_result__
4295match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4296 Idx to)
4297{
4298 if (mctx->nbkref_ents >= mctx->abkref_ents)
4299 {
4300 struct re_backref_cache_entry* new_entry;
4301 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4302 mctx->abkref_ents * 2);
4303 if (BE (new_entry == NULL, 0))
4304 {
4305 re_free (mctx->bkref_ents);
4306 return REG_ESPACE;
4307 }
4308 mctx->bkref_ents = new_entry;
4309 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4310 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4311 mctx->abkref_ents *= 2;
4312 }
4313 if (mctx->nbkref_ents > 0
4314 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4315 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4316
4317 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4318 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4319 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4320 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4321
4322 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4323 If bit N is clear, means that this entry won't epsilon-transition to
4324 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4325 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4326 such node.
4327
4328 A backreference does not epsilon-transition unless it is empty, so set
4329 to all zeros if FROM != TO. */
4330 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4331 = (from == to ? -1 : 0);
4332
4333 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4334 if (mctx->max_mb_elem_len < to - from)
4335 mctx->max_mb_elem_len = to - from;
4336 return REG_NOERROR;
4337}
4338
4339/* Return the first entry with the same str_idx, or REG_MISSING if none is
4340 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4341
4342static Idx
4343internal_function
4344search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4345{
4346 Idx left, right, mid, last;
4347 last = right = mctx->nbkref_ents;
4348 for (left = 0; left < right;)
4349 {
4350 mid = (left + right) / 2;
4351 if (mctx->bkref_ents[mid].str_idx < str_idx)
4352 left = mid + 1;
4353 else
4354 right = mid;
4355 }
4356 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4357 return left;
4358 else
4359 return REG_MISSING;
4360}
4361
4362/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4363 at STR_IDX. */
4364
4365static reg_errcode_t
4366internal_function __attribute_warn_unused_result__
4367match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4368{
4369#ifdef DEBUG
4370 assert (mctx->sub_tops != NULL);
4371 assert (mctx->asub_tops > 0);
4372#endif
4373 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4374 {
4375 Idx new_asub_tops = mctx->asub_tops * 2;
4376 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4377 re_sub_match_top_t *,
4378 new_asub_tops);
4379 if (BE (new_array == NULL, 0))
4380 return REG_ESPACE;
4381 mctx->sub_tops = new_array;
4382 mctx->asub_tops = new_asub_tops;
4383 }
4384 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4385 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4386 return REG_ESPACE;
4387 mctx->sub_tops[mctx->nsub_tops]->node = node;
4388 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4389 return REG_NOERROR;
4390}
4391
4392/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4393 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4394
4395static re_sub_match_last_t *
4396internal_function
4397match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4398{
4399 re_sub_match_last_t *new_entry;
4400 if (BE (subtop->nlasts == subtop->alasts, 0))
4401 {
4402 Idx new_alasts = 2 * subtop->alasts + 1;
4403 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4404 re_sub_match_last_t *,
4405 new_alasts);
4406 if (BE (new_array == NULL, 0))
4407 return NULL;
4408 subtop->lasts = new_array;
4409 subtop->alasts = new_alasts;
4410 }
4411 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4412 if (BE (new_entry != NULL, 1))
4413 {
4414 subtop->lasts[subtop->nlasts] = new_entry;
4415 new_entry->node = node;
4416 new_entry->str_idx = str_idx;
4417 ++subtop->nlasts;
4418 }
4419 return new_entry;
4420}
4421
4422static void
4423internal_function
4424sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4425 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4426{
4427 sctx->sifted_states = sifted_sts;
4428 sctx->limited_states = limited_sts;
4429 sctx->last_node = last_node;
4430 sctx->last_str_idx = last_str_idx;
4431 re_node_set_init_empty (&sctx->limits);
4432}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette