VirtualBox

source: kBuild/vendor/grep/current/src/dfasearch.c@ 3530

Last change on this file since 3530 was 3529, checked in by bird, 3 years ago

Imported grep 3.7 from grep-3.7.tar.gz (sha256: c22b0cf2d4f6bbe599c902387e8058990e1eee99aef333a203829e5fd3dbb342), applying minimal auto-props.

  • Property svn:eol-style set to native
File size: 19.1 KB
Line 
1/* dfasearch.c - searching subroutines using dfa and regex for grep.
2 Copyright 1992, 1998, 2000, 2007, 2009-2021 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
18
19/* Written August 1992 by Mike Haertel. */
20
21#include <config.h>
22#include "intprops.h"
23#include "search.h"
24#include "die.h"
25#include <error.h>
26
27struct dfa_comp
28{
29 /* KWset compiled pattern. For GEAcompile, we compile
30 a list of strings, at least one of which is known to occur in
31 any string matching the regexp. */
32 kwset_t kwset;
33
34 /* DFA compiled regexp. */
35 struct dfa *dfa;
36
37 /* Regex compiled regexps. */
38 struct re_pattern_buffer *patterns;
39 size_t pcount;
40 struct re_registers regs;
41
42 /* Number of compiled fixed strings known to exactly match the regexp.
43 If kwsexec returns < kwset_exact_matches, then we don't need to
44 call the regexp matcher at all. */
45 ptrdiff_t kwset_exact_matches;
46
47 bool begline;
48};
49
50void
51dfaerror (char const *mesg)
52{
53 die (EXIT_TROUBLE, 0, "%s", mesg);
54}
55
56/* For now, the sole dfawarn-eliciting condition (use of a regexp
57 like '[:lower:]') is unequivocally an error, so treat it as such,
58 when possible. */
59void
60dfawarn (char const *mesg)
61{
62 if (!getenv ("POSIXLY_CORRECT"))
63 dfaerror (mesg);
64}
65
66/* If the DFA turns out to have some set of fixed strings one of
67 which must occur in the match, then we build a kwset matcher
68 to find those strings, and thus quickly filter out impossible
69 matches. */
70static void
71kwsmusts (struct dfa_comp *dc)
72{
73 struct dfamust *dm = dfamust (dc->dfa);
74 if (!dm)
75 return;
76 dc->kwset = kwsinit (false);
77 if (dm->exact)
78 {
79 /* Prepare a substring whose presence implies a match.
80 The kwset matcher will return the index of the matching
81 string that it chooses. */
82 ++dc->kwset_exact_matches;
83 ptrdiff_t old_len = strlen (dm->must);
84 ptrdiff_t new_len = old_len + dm->begline + dm->endline;
85 char *must = xmalloc (new_len);
86 char *mp = must;
87 *mp = eolbyte;
88 mp += dm->begline;
89 dc->begline |= dm->begline;
90 memcpy (mp, dm->must, old_len);
91 if (dm->endline)
92 mp[old_len] = eolbyte;
93 kwsincr (dc->kwset, must, new_len);
94 free (must);
95 }
96 else
97 {
98 /* Otherwise, filtering with this substring should help reduce the
99 search space, but we'll still have to use the regexp matcher. */
100 kwsincr (dc->kwset, dm->must, strlen (dm->must));
101 }
102 kwsprep (dc->kwset);
103 dfamustfree (dm);
104}
105
106/* Return true if KEYS, of length LEN, might contain a back-reference.
107 Return false if KEYS cannot contain a back-reference.
108 BS_SAFE is true of encodings where a backslash cannot appear as the
109 last byte of a multibyte character. */
110static bool _GL_ATTRIBUTE_PURE
111possible_backrefs_in_pattern (char const *keys, ptrdiff_t len, bool bs_safe)
112{
113 /* Normally a backslash, but in an unsafe encoding this is a non-char
114 value so that the comparison below always fails, because if there
115 are two adjacent '\' bytes, the first might be the last byte of a
116 multibyte character. */
117 int second_backslash = bs_safe ? '\\' : CHAR_MAX + 1;
118
119 /* This code can return true even if KEYS lacks a back-reference, for
120 patterns like [\2], or for encodings where '\' appears as the last
121 byte of a multibyte character. However, false alarms should be
122 rare and do not affect correctness. */
123
124 /* Do not look for a backslash in the pattern's last byte, since it
125 can't be part of a back-reference and this streamlines the code. */
126 len--;
127
128 if (0 <= len)
129 {
130 char const *lim = keys + len;
131 for (char const *p = keys; (p = memchr (p, '\\', lim - p)); p++)
132 {
133 if ('1' <= p[1] && p[1] <= '9')
134 return true;
135 if (p[1] == second_backslash)
136 {
137 p++;
138 if (p == lim)
139 break;
140 }
141 }
142 }
143 return false;
144}
145
146static bool
147regex_compile (struct dfa_comp *dc, char const *p, ptrdiff_t len,
148 ptrdiff_t pcount, ptrdiff_t lineno, reg_syntax_t syntax_bits,
149 bool syntax_only)
150{
151 struct re_pattern_buffer pat0;
152 struct re_pattern_buffer *pat = syntax_only ? &pat0 : &dc->patterns[pcount];
153 pat->buffer = NULL;
154 pat->allocated = 0;
155
156 /* Do not use a fastmap with -i, to work around glibc Bug#20381. */
157 pat->fastmap = (syntax_only | match_icase) ? NULL : xmalloc (UCHAR_MAX + 1);
158
159 pat->translate = NULL;
160
161 if (syntax_only)
162 re_set_syntax (syntax_bits | RE_NO_SUB);
163 else
164 re_set_syntax (syntax_bits);
165
166 char const *err = re_compile_pattern (p, len, pat);
167 if (!err)
168 return true;
169
170 /* Emit a filename:lineno: prefix for patterns taken from files. */
171 size_t pat_lineno;
172 char const *pat_filename
173 = lineno < 0 ? "" : pattern_file_name (lineno, &pat_lineno);
174
175 if (*pat_filename == '\0')
176 error (0, 0, "%s", err);
177 else
178 error (0, 0, "%s:%zu: %s", pat_filename, pat_lineno, err);
179
180 return false;
181}
182
183/* Compile PATTERN, containing SIZE bytes that are followed by '\n'.
184 SYNTAX_BITS specifies whether PATTERN uses style -G, -E, or -A.
185 Return a description of the compiled pattern. */
186
187void *
188GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits,
189 bool exact)
190{
191 char *motif;
192 struct dfa_comp *dc = xcalloc (1, sizeof (*dc));
193
194 dc->dfa = dfaalloc ();
195
196 if (match_icase)
197 syntax_bits |= RE_ICASE;
198 int dfaopts = eolbyte ? 0 : DFA_EOL_NUL;
199 dfasyntax (dc->dfa, &localeinfo, syntax_bits, dfaopts);
200 bool bs_safe = !localeinfo.multibyte | localeinfo.using_utf8;
201
202 /* For GNU regex, pass the patterns separately to detect errors like
203 "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
204 this should be a syntax error. The same for backref, where the
205 backref should be local to each pattern. */
206 char const *p = pattern;
207 char const *patlim = pattern + size;
208 bool compilation_failed = false;
209
210 dc->patterns = xmalloc (sizeof *dc->patterns);
211 dc->patterns++;
212 dc->pcount = 0;
213 size_t palloc = 1;
214
215 char const *prev = pattern;
216
217 /* Buffer containing back-reference-free patterns. */
218 char *buf = NULL;
219 ptrdiff_t buflen = 0;
220 size_t bufalloc = 0;
221
222 ptrdiff_t lineno = 0;
223
224 do
225 {
226 char const *sep = rawmemchr (p, '\n');
227 ptrdiff_t len = sep - p;
228
229 bool backref = possible_backrefs_in_pattern (p, len, bs_safe);
230
231 if (backref && prev < p)
232 {
233 ptrdiff_t prevlen = p - prev;
234 while (bufalloc < buflen + prevlen)
235 buf = x2realloc (buf, &bufalloc);
236 memcpy (buf + buflen, prev, prevlen);
237 buflen += prevlen;
238 }
239
240 /* Ensure room for at least two more patterns. The extra one is
241 for the regex_compile that may be executed after this loop
242 exits, and its (unused) slot is patterns[-1] until then. */
243 while (palloc <= dc->pcount + 1)
244 {
245 dc->patterns = x2nrealloc (dc->patterns - 1, &palloc,
246 sizeof *dc->patterns);
247 dc->patterns++;
248 }
249
250 re_set_syntax (syntax_bits);
251
252 if (!regex_compile (dc, p, len, dc->pcount, lineno, syntax_bits,
253 !backref))
254 compilation_failed = true;
255
256 p = sep + 1;
257 lineno++;
258
259 if (backref)
260 {
261 dc->pcount++;
262 prev = p;
263 }
264 }
265 while (p <= patlim);
266
267 if (compilation_failed)
268 exit (EXIT_TROUBLE);
269
270 if (prev <= patlim)
271 {
272 if (pattern < prev)
273 {
274 ptrdiff_t prevlen = patlim - prev;
275 buf = xrealloc (buf, buflen + prevlen);
276 memcpy (buf + buflen, prev, prevlen);
277 buflen += prevlen;
278 }
279 else
280 {
281 buf = pattern;
282 buflen = size;
283 }
284 }
285
286 /* In the match_words and match_lines cases, we use a different pattern
287 for the DFA matcher that will quickly throw out cases that won't work.
288 Then if DFA succeeds we do some hairy stuff using the regex matcher
289 to decide whether the match should really count. */
290 if (match_words || match_lines)
291 {
292 static char const line_beg_no_bk[] = "^(";
293 static char const line_end_no_bk[] = ")$";
294 static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
295 static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
296 static char const line_beg_bk[] = "^\\(";
297 static char const line_end_bk[] = "\\)$";
298 static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
299 static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
300 int bk = !(syntax_bits & RE_NO_BK_PARENS);
301 char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
302
303 strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
304 : (bk ? word_beg_bk : word_beg_no_bk));
305 size_t total = strlen (n);
306 memcpy (n + total, pattern, size);
307 total += size;
308 strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
309 : (bk ? word_end_bk : word_end_no_bk));
310 total += strlen (n + total);
311 pattern = motif = n;
312 size = total;
313 }
314 else
315 motif = NULL;
316
317 dfaparse (pattern, size, dc->dfa);
318 kwsmusts (dc);
319 dfacomp (NULL, 0, dc->dfa, 1);
320
321 if (buf != NULL)
322 {
323 if (exact || !dfasupported (dc->dfa))
324 {
325 dc->patterns--;
326 dc->pcount++;
327
328 if (!regex_compile (dc, buf, buflen, 0, -1, syntax_bits, false))
329 abort ();
330 }
331
332 if (buf != pattern)
333 free (buf);
334 }
335
336 free (motif);
337
338 return dc;
339}
340
341size_t
342EGexecute (void *vdc, char const *buf, size_t size, size_t *match_size,
343 char const *start_ptr)
344{
345 char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start;
346 char eol = eolbyte;
347 regoff_t start;
348 size_t len, best_len;
349 struct kwsmatch kwsm;
350 size_t i;
351 struct dfa_comp *dc = vdc;
352 struct dfa *superset = dfasuperset (dc->dfa);
353 bool dfafast = dfaisfast (dc->dfa);
354
355 mb_start = buf;
356 buflim = buf + size;
357
358 for (beg = end = buf; end < buflim; beg = end)
359 {
360 end = buflim;
361
362 if (!start_ptr)
363 {
364 char const *next_beg, *dfa_beg = beg;
365 ptrdiff_t count = 0;
366 bool exact_kwset_match = false;
367 bool backref = false;
368
369 /* Try matching with KWset, if it's defined. */
370 if (dc->kwset)
371 {
372 char const *prev_beg;
373
374 /* Find a possible match using the KWset matcher. */
375 ptrdiff_t offset = kwsexec (dc->kwset, beg - dc->begline,
376 buflim - beg + dc->begline,
377 &kwsm, true);
378 if (offset < 0)
379 return offset;
380 match = beg + offset;
381 prev_beg = beg;
382
383 /* Narrow down to the line containing the possible match. */
384 beg = memrchr (buf, eol, match - buf);
385 beg = beg ? beg + 1 : buf;
386 dfa_beg = beg;
387
388 /* Determine the end pointer to give the DFA next. Typically
389 this is after the first newline after MATCH; but if the KWset
390 match is not exact, the DFA is fast, and the offset from
391 PREV_BEG is less than 64 or (MATCH - PREV_BEG), this is the
392 greater of the latter two values; this temporarily prefers
393 the DFA to KWset. */
394 exact_kwset_match = kwsm.index < dc->kwset_exact_matches;
395 if (exact_kwset_match || !dfafast
396 || MAX (16, match - beg) < (match - prev_beg) >> 2)
397 {
398 end = rawmemchr (match, eol);
399 end++;
400 }
401 else if (MAX (16, match - beg) < (buflim - prev_beg) >> 2)
402 {
403 end = rawmemchr (prev_beg + 4 * MAX (16, match - beg), eol);
404 end++;
405 }
406 else
407 end = buflim;
408
409 if (exact_kwset_match)
410 {
411 if (!localeinfo.multibyte | localeinfo.using_utf8)
412 goto success;
413 if (mb_start < beg)
414 mb_start = beg;
415 if (mb_goback (&mb_start, NULL, match, buflim) == 0)
416 goto success;
417 /* The matched line starts in the middle of a multibyte
418 character. Perform the DFA search starting from the
419 beginning of the next character. */
420 dfa_beg = mb_start;
421 }
422 }
423
424 /* Try matching with the superset of DFA, if it's defined. */
425 if (superset && !exact_kwset_match)
426 {
427 /* Keep using the superset while it reports multiline
428 potential matches; this is more likely to be fast
429 than falling back to KWset would be. */
430 next_beg = dfaexec (superset, dfa_beg, (char *) end, 0,
431 &count, NULL);
432 if (next_beg == NULL || next_beg == end)
433 continue;
434
435 /* Narrow down to the line we've found. */
436 if (count != 0)
437 {
438 beg = memrchr (buf, eol, next_beg - buf);
439 beg++;
440 dfa_beg = beg;
441 }
442 end = rawmemchr (next_beg, eol);
443 end++;
444
445 count = 0;
446 }
447
448 /* Try matching with DFA. */
449 next_beg = dfaexec (dc->dfa, dfa_beg, (char *) end, 0, &count,
450 &backref);
451
452 /* If there's no match, or if we've matched the sentinel,
453 we're done. */
454 if (next_beg == NULL || next_beg == end)
455 continue;
456
457 /* Narrow down to the line we've found. */
458 if (count != 0)
459 {
460 beg = memrchr (buf, eol, next_beg - buf);
461 beg++;
462 }
463 end = rawmemchr (next_beg, eol);
464 end++;
465
466 /* Successful, no back-references encountered! */
467 if (!backref)
468 goto success;
469 ptr = beg;
470 }
471 else
472 {
473 /* We are looking for the leftmost (then longest) exact match.
474 We will go through the outer loop only once. */
475 ptr = start_ptr;
476 }
477
478 /* If the "line" is longer than the maximum regexp offset,
479 die as if we've run out of memory. */
480 if (TYPE_MAXIMUM (regoff_t) < end - beg - 1)
481 xalloc_die ();
482
483 /* Run the possible match through Regex. */
484 best_match = end;
485 best_len = 0;
486 for (i = 0; i < dc->pcount; i++)
487 {
488 dc->patterns[i].not_eol = 0;
489 dc->patterns[i].newline_anchor = eolbyte == '\n';
490 start = re_search (&dc->patterns[i], beg, end - beg - 1,
491 ptr - beg, end - ptr - 1, &dc->regs);
492 if (start < -1)
493 xalloc_die ();
494 else if (0 <= start)
495 {
496 len = dc->regs.end[0] - start;
497 match = beg + start;
498 if (match > best_match)
499 continue;
500 if (start_ptr && !match_words)
501 goto assess_pattern_match;
502 if ((!match_lines && !match_words)
503 || (match_lines && len == end - ptr - 1))
504 {
505 match = ptr;
506 len = end - ptr;
507 goto assess_pattern_match;
508 }
509 /* If -w and not -x, check whether the match aligns with
510 word boundaries. Do this iteratively because:
511 (a) the line may contain more than one occurrence of the
512 pattern, and
513 (b) Several alternatives in the pattern might be valid at a
514 given point, and we may need to consider a shorter one to
515 find a word boundary. */
516 if (!match_lines && match_words)
517 while (match <= best_match)
518 {
519 regoff_t shorter_len = 0;
520 if (! wordchar_next (match + len, end - 1)
521 && ! wordchar_prev (beg, match, end - 1))
522 goto assess_pattern_match;
523 if (len > 0)
524 {
525 /* Try a shorter length anchored at the same place. */
526 --len;
527 dc->patterns[i].not_eol = 1;
528 shorter_len = re_match (&dc->patterns[i], beg,
529 match + len - ptr, match - beg,
530 &dc->regs);
531 if (shorter_len < -1)
532 xalloc_die ();
533 }
534 if (0 < shorter_len)
535 len = shorter_len;
536 else
537 {
538 /* Try looking further on. */
539 if (match == end - 1)
540 break;
541 match++;
542 dc->patterns[i].not_eol = 0;
543 start = re_search (&dc->patterns[i], beg, end - beg - 1,
544 match - beg, end - match - 1,
545 &dc->regs);
546 if (start < 0)
547 {
548 if (start < -1)
549 xalloc_die ();
550 break;
551 }
552 len = dc->regs.end[0] - start;
553 match = beg + start;
554 }
555 } /* while (match <= best_match) */
556 continue;
557 assess_pattern_match:
558 if (!start_ptr)
559 {
560 /* Good enough for a non-exact match.
561 No need to look at further patterns, if any. */
562 goto success;
563 }
564 if (match < best_match || (match == best_match && len > best_len))
565 {
566 /* Best exact match: leftmost, then longest. */
567 best_match = match;
568 best_len = len;
569 }
570 } /* if re_search >= 0 */
571 } /* for Regex patterns. */
572 if (best_match < end)
573 {
574 /* We have found an exact match. We were just
575 waiting for the best one (leftmost then longest). */
576 beg = best_match;
577 len = best_len;
578 goto success_in_len;
579 }
580 } /* for (beg = end ..) */
581
582 return -1;
583
584 success:
585 len = end - beg;
586 success_in_len:;
587 size_t off = beg - buf;
588 *match_size = len;
589 return off;
590}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette