/[pkgs]/devel/grep/grep-2.5.3-egf-speedup.patch
ViewVC logotype

Contents of /devel/grep/grep-2.5.3-egf-speedup.patch

Parent Directory | Revision Log


Revision 1.1 - (show annotations) (download) (as text)
Sun Nov 23 18:34:10 2008 UTC (12 months ago) by lkundrak
Branch: MAIN
CVS Tags: F-12-split, F-11-split, grep-2_5_3-5_fc12, grep-2_5_3-3_fc11, grep-2_5_3-2_fc11, grep-2_5_3-6_fc12, grep-2_5_3-4_fc11, grep-2_5_3-1_fc11, HEAD
File MIME type: text/x-patch
* Thu Nov 20 2008 Lubomir Rintel <lkundrak@v3.sk> 2.5.3-1
- Update to latest upstream version
- Drop upstreamed patches
- Add a couple of regression tests
- Temporarily disable tests
- Minor cleanup
1 From aac37e1939632dbc7d2ade6f991af7ce103b0cba Mon Sep 17 00:00:00 2001
2 From: Tim Waugh <twaugh@redhat.com>
3 Date: Sun, 23 Nov 2008 17:30:59 +0100
4 Subject: [PATCH] EGF Speedup
5
6 The full story behind this patch is that grep-2.5.1a does not handle UTF-8 gracefully at all. The basic plan for handling UTF-8 in 2.5.1a is:
7
8 * whenever a buffer is parsed, go through the entire buffer deciding how many bytes make up each character
9 * use this information when necessary
10
11 This patch changes that to:
12
13 * when information about how many bytes make up a character is needed, work it out on demand
14
15 On the face of it, this is a small, obvious improvement. It is actually much better than that, because the original scheme would calculate character lengths several times for each buffer: in fact, one full pass for every single potential match!
16
17 For a full discussion of this patch, as well as dfa-optional, including benchmarking results, see the mailing list.
18
19 Upstream ticket: https://savannah.gnu.org/patch/?3803
20 Debian: 64-egf-speedup.patch
21 Debian: 66-match_icase.patch
22 ---
23 lib/posix/regex.h | 4 +
24 src/search.c | 652 +++++++++++++++++++++++++++++++++++++++++++----------
25 2 files changed, 535 insertions(+), 121 deletions(-)
26
27 diff --git a/lib/posix/regex.h b/lib/posix/regex.h
28 index f4c4150..98df2cb 100644
29 --- a/lib/posix/regex.h
30 +++ b/lib/posix/regex.h
31 @@ -165,6 +165,10 @@ typedef unsigned long int reg_syntax_t;
32 treated as 'a\{1'. */
33 #define RE_INVALID_INTERVAL_ORD (RE_DEBUG << 1)
34
35 +/* If this bit is set, then ignore case when matching.
36 + If not set, then case is significant. */
37 +#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1)
38 +
39 /* This global variable defines the particular regexp syntax to use (for
40 some interfaces). When a regexp is compiled, the syntax used is
41 stored in the pattern buffer, so changing this does not affect
42 diff --git a/src/search.c b/src/search.c
43 index 7f5f187..9691fb8 100644
44 --- a/src/search.c
45 +++ b/src/search.c
46 @@ -18,10 +18,15 @@
47
48 /* Written August 1992 by Mike Haertel. */
49
50 +#ifndef _GNU_SOURCE
51 +# define _GNU_SOURCE 1
52 +#endif
53 #ifdef HAVE_CONFIG_H
54 # include <config.h>
55 #endif
56
57 +#include <assert.h>
58 +
59 #include <sys/types.h>
60
61 #include "mbsupport.h"
62 @@ -43,6 +48,9 @@
63 #ifdef HAVE_LIBPCRE
64 # include <pcre.h>
65 #endif
66 +#ifdef HAVE_LANGINFO_CODESET
67 +# include <langinfo.h>
68 +#endif
69
70 #define NCHAR (UCHAR_MAX + 1)
71
72 @@ -68,6 +76,19 @@ kwsinit (void)
73 error (2, 0, _("memory exhausted"));
74 }
75
76 +/* UTF-8 encoding allows some optimizations that we can't otherwise
77 + assume in a multibyte encoding. */
78 +static int using_utf8;
79 +
80 +void
81 +check_utf8 (void)
82 +{
83 +#ifdef HAVE_LANGINFO_CODESET
84 + if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
85 + using_utf8 = 1;
86 +#endif
87 +}
88 +
89 #ifndef FGREP_PROGRAM
90 /* DFA compiled regexp. */
91 static struct dfa dfa;
92 @@ -134,49 +155,6 @@ kwsmusts (void)
93 }
94 #endif /* !FGREP_PROGRAM */
95
96 -#ifdef MBS_SUPPORT
97 -/* This function allocate the array which correspond to "buf".
98 - Then this check multibyte string and mark on the positions which
99 - are not single byte character nor the first byte of a multibyte
100 - character. Caller must free the array. */
101 -static char*
102 -check_multibyte_string(char const *buf, size_t size)
103 -{
104 - char *mb_properties = xmalloc(size);
105 - mbstate_t cur_state;
106 - wchar_t wc;
107 - int i;
108 -
109 - memset(&cur_state, 0, sizeof(mbstate_t));
110 - memset(mb_properties, 0, sizeof(char)*size);
111 -
112 - for (i = 0; i < size ;)
113 - {
114 - size_t mbclen;
115 - mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state);
116 -
117 - if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
118 - {
119 - /* An invalid sequence, or a truncated multibyte character.
120 - We treat it as a single byte character. */
121 - mbclen = 1;
122 - }
123 - else if (match_icase)
124 - {
125 - if (iswupper((wint_t)wc))
126 - {
127 - wc = towlower((wint_t)wc);
128 - wcrtomb(buf + i, wc, &cur_state);
129 - }
130 - }
131 - mb_properties[i] = mbclen;
132 - i += mbclen;
133 - }
134 -
135 - return mb_properties;
136 -}
137 -#endif /* MBS_SUPPORT */
138 -
139 #if defined(GREP_PROGRAM) || defined(EGREP_PROGRAM)
140 #ifdef EGREP_PROGRAM
141 COMPILE_FCT(Ecompile)
142 @@ -193,10 +171,9 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
143 size_t total = size;
144 char const *motif = pattern;
145
146 -#if 0
147 + check_utf8 ();
148 if (match_icase)
149 syntax_bits |= RE_ICASE;
150 -#endif
151 re_set_syntax (syntax_bits);
152 dfasyntax (syntax_bits, match_icase, eolbyte);
153
154 @@ -303,20 +280,9 @@ EXECUTE_FCT(EGexecute)
155 struct kwsmatch kwsm;
156 size_t i, ret_val;
157 #ifdef MBS_SUPPORT
158 - char *mb_properties = NULL;
159 - if (MB_CUR_MAX > 1)
160 - {
161 - if (match_icase)
162 - {
163 - char *case_buf = xmalloc(size);
164 - memcpy(case_buf, buf, size);
165 - if (start_ptr)
166 - start_ptr = case_buf + (start_ptr - buf);
167 - buf = case_buf;
168 - }
169 - if (kwset)
170 - mb_properties = check_multibyte_string(buf, size);
171 - }
172 + int mb_cur_max = MB_CUR_MAX;
173 + mbstate_t mbs;
174 + memset (&mbs, '\0', sizeof (mbstate_t));
175 #endif /* MBS_SUPPORT */
176
177 buflim = buf + size;
178 @@ -329,21 +295,63 @@ EXECUTE_FCT(EGexecute)
179 if (kwset)
180 {
181 /* Find a possible match using the KWset matcher. */
182 - size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
183 +#ifdef MBS_SUPPORT
184 + size_t bytes_left = 0;
185 +#endif /* MBS_SUPPORT */
186 + size_t offset;
187 +#ifdef MBS_SUPPORT
188 + /* kwsexec doesn't work with match_icase and multibyte input. */
189 + if (match_icase && mb_cur_max > 1)
190 + /* Avoid kwset */
191 + offset = 0;
192 + else
193 +#endif /* MBS_SUPPORT */
194 + offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
195 if (offset == (size_t) -1)
196 - goto failure;
197 + return (size_t)-1;
198 +#ifdef MBS_SUPPORT
199 + if (mb_cur_max > 1 && !using_utf8)
200 + {
201 + bytes_left = offset;
202 + while (bytes_left)
203 + {
204 + size_t mlen = mbrlen (beg, bytes_left, &mbs);
205 + if (mlen == (size_t) -1 || mlen == 0)
206 + {
207 + /* Incomplete character: treat as single-byte. */
208 + memset (&mbs, '\0', sizeof (mbstate_t));
209 + beg++;
210 + bytes_left--;
211 + continue;
212 + }
213 +
214 + if (mlen == (size_t) -2)
215 + /* Offset points inside multibyte character:
216 + * no good. */
217 + break;
218 +
219 + beg += mlen;
220 + bytes_left -= mlen;
221 + }
222 + }
223 + else
224 +#endif /* MBS_SUPPORT */
225 beg += offset;
226 /* Narrow down to the line containing the candidate, and
227 run it through DFA. */
228 end = memchr(beg, eol, buflim - beg);
229 end++;
230 #ifdef MBS_SUPPORT
231 - if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
232 + if (mb_cur_max > 1 && bytes_left)
233 continue;
234 #endif
235 while (beg > buf && beg[-1] != eol)
236 --beg;
237 - if (kwsm.index < kwset_exact_matches)
238 + if (
239 +#ifdef MBS_SUPPORT
240 + !(match_icase && mb_cur_max > 1) &&
241 +#endif /* MBS_SUPPORT */
242 + (kwsm.index < kwset_exact_matches))
243 goto success;
244 if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
245 continue;
246 @@ -351,13 +359,47 @@ EXECUTE_FCT(EGexecute)
247 else
248 {
249 /* No good fixed strings; start with DFA. */
250 +#ifdef MBS_SUPPORT
251 + size_t bytes_left = 0;
252 +#endif /* MBS_SUPPORT */
253 size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
254 if (offset == (size_t) -1)
255 break;
256 /* Narrow down to the line we've found. */
257 +#ifdef MBS_SUPPORT
258 + if (mb_cur_max > 1 && !using_utf8)
259 + {
260 + bytes_left = offset;
261 + while (bytes_left)
262 + {
263 + size_t mlen = mbrlen (beg, bytes_left, &mbs);
264 + if (mlen == (size_t) -1 || mlen == 0)
265 + {
266 + /* Incomplete character: treat as single-byte. */
267 + memset (&mbs, '\0', sizeof (mbstate_t));
268 + beg++;
269 + bytes_left--;
270 + continue;
271 + }
272 +
273 + if (mlen == (size_t) -2)
274 + /* Offset points inside multibyte character:
275 + * no good. */
276 + break;
277 +
278 + beg += mlen;
279 + bytes_left -= mlen;
280 + }
281 + }
282 + else
283 +#endif /* MBS_SUPPORT */
284 beg += offset;
285 end = memchr (beg, eol, buflim - beg);
286 end++;
287 +#ifdef MBS_SUPPORT
288 + if (mb_cur_max > 1 && bytes_left)
289 + continue;
290 +#endif /* MBS_SUPPORT */
291 while (beg > buf && beg[-1] != eol)
292 --beg;
293 }
294 @@ -475,24 +517,144 @@ EXECUTE_FCT(EGexecute)
295 *match_size = len;
296 ret_val = beg - buf;
297 out:
298 -#ifdef MBS_SUPPORT
299 - if (MB_CUR_MAX > 1)
300 - {
301 - if (match_icase)
302 - free((char*)buf);
303 - if (mb_properties)
304 - free(mb_properties);
305 - }
306 -#endif /* MBS_SUPPORT */
307 return ret_val;
308 }
309 #endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */
310
311 +#ifdef MBS_SUPPORT
312 +static int f_i_multibyte; /* whether we're using the new -Fi MB method */
313 +static struct
314 +{
315 + wchar_t **patterns;
316 + size_t count, maxlen;
317 + unsigned char *match;
318 +} Fimb;
319 +#endif
320 +
321 #if defined(GREP_PROGRAM) || defined(FGREP_PROGRAM)
322 COMPILE_FCT(Fcompile)
323 {
324 + int mb_cur_max = MB_CUR_MAX;
325 char const *beg, *lim, *err;
326
327 + check_utf8 ();
328 +#ifdef MBS_SUPPORT
329 + /* Support -F -i for UTF-8 input. */
330 + if (match_icase && mb_cur_max > 1)
331 + {
332 + mbstate_t mbs;
333 + wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t));
334 + const char *patternend = pattern;
335 + size_t wcsize;
336 + kwset_t fimb_kwset = NULL;
337 + char *starts = NULL;
338 + wchar_t *wcbeg, *wclim;
339 + size_t allocated = 0;
340 +
341 + memset (&mbs, '\0', sizeof (mbs));
342 +# ifdef __GNU_LIBRARY__
343 + wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs);
344 + if (patternend != pattern + size)
345 + wcsize = (size_t) -1;
346 +# else
347 + {
348 + char *patterncopy = xmalloc (size + 1);
349 +
350 + memcpy (patterncopy, pattern, size);
351 + patterncopy[size] = '\0';
352 + patternend = patterncopy;
353 + wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs);
354 + if (patternend != patterncopy + size)
355 + wcsize = (size_t) -1;
356 + free (patterncopy);
357 + }
358 +# endif
359 + if (wcsize + 2 <= 2)
360 + {
361 +fimb_fail:
362 + free (wcpattern);
363 + free (starts);
364 + if (fimb_kwset)
365 + kwsfree (fimb_kwset);
366 + free (Fimb.patterns);
367 + Fimb.patterns = NULL;
368 + }
369 + else
370 + {
371 + if (!(fimb_kwset = kwsalloc (NULL)))
372 + error (2, 0, _("memory exhausted"));
373 +
374 + starts = xmalloc (mb_cur_max * 3);
375 + wcbeg = wcpattern;
376 + do
377 + {
378 + int i;
379 + size_t wclen;
380 +
381 + if (Fimb.count >= allocated)
382 + {
383 + if (allocated == 0)
384 + allocated = 128;
385 + else
386 + allocated *= 2;
387 + Fimb.patterns = xrealloc (Fimb.patterns,
388 + sizeof (wchar_t *) * allocated);
389 + }
390 + Fimb.patterns[Fimb.count++] = wcbeg;
391 + for (wclim = wcbeg;
392 + wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim)
393 + *wclim = towlower (*wclim);
394 + *wclim = L'\0';
395 + wclen = wclim - wcbeg;
396 + if (wclen > Fimb.maxlen)
397 + Fimb.maxlen = wclen;
398 + if (wclen > 3)
399 + wclen = 3;
400 + if (wclen == 0)
401 + {
402 + if ((err = kwsincr (fimb_kwset, "", 0)) != 0)
403 + error (2, 0, err);
404 + }
405 + else
406 + for (i = 0; i < (1 << wclen); i++)
407 + {
408 + char *p = starts;
409 + int j, k;
410 +
411 + for (j = 0; j < wclen; ++j)
412 + {
413 + wchar_t wc = wcbeg[j];
414 + if (i & (1 << j))
415 + {
416 + wc = towupper (wc);
417 + if (wc == wcbeg[j])
418 + continue;
419 + }
420 + k = wctomb (p, wc);
421 + if (k <= 0)
422 + goto fimb_fail;
423 + p += k;
424 + }
425 + if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0)
426 + error (2, 0, err);
427 + }
428 + if (wclim < wcpattern + wcsize)
429 + ++wclim;
430 + wcbeg = wclim;
431 + }
432 + while (wcbeg < wcpattern + wcsize);
433 + f_i_multibyte = 1;
434 + kwset = fimb_kwset;
435 + free (starts);
436 + Fimb.match = xmalloc (Fimb.count);
437 + if ((err = kwsprep (kwset)) != 0)
438 + error (2, 0, err);
439 + return;
440 + }
441 + }
442 +#endif /* MBS_SUPPORT */
443 +
444 +
445 kwsinit ();
446 beg = pattern;
447 do
448 @@ -511,6 +673,76 @@ COMPILE_FCT(Fcompile)
449 error (2, 0, err);
450 }
451
452 +#ifdef MBS_SUPPORT
453 +static int
454 +Fimbexec (const char *buf, size_t size, size_t *plen, int exact)
455 +{
456 + size_t len, letter, i;
457 + int ret = -1;
458 + mbstate_t mbs;
459 + wchar_t wc;
460 + int patterns_left;
461 +
462 + assert (match_icase && f_i_multibyte == 1);
463 + assert (MB_CUR_MAX > 1);
464 +
465 + memset (&mbs, '\0', sizeof (mbs));
466 + memset (Fimb.match, '\1', Fimb.count);
467 + letter = len = 0;
468 + patterns_left = 1;
469 + while (patterns_left && len <= size)
470 + {
471 + size_t c;
472 +
473 + patterns_left = 0;
474 + if (len < size)
475 + {
476 + c = mbrtowc (&wc, buf + len, size - len, &mbs);
477 + if (c + 2 <= 2)
478 + return ret;
479 +
480 + wc = towlower (wc);
481 + }
482 + else
483 + {
484 + c = 1;
485 + wc = L'\0';
486 + }
487 +
488 + for (i = 0; i < Fimb.count; i++)
489 + {
490 + if (Fimb.match[i])
491 + {
492 + if (Fimb.patterns[i][letter] == L'\0')
493 + {
494 + /* Found a match. */
495 + *plen = len;
496 + if (!exact && !match_words)
497 + return 0;
498 + else
499 + {
500 + /* For -w or exact look for longest match. */
501 + ret = 0;
502 + Fimb.match[i] = '\0';
503 + continue;
504 + }
505 + }
506 +
507 + if (Fimb.patterns[i][letter] == wc)
508 + patterns_left = 1;
509 + else
510 + Fimb.match[i] = '\0';
511 + }
512 + }
513 +
514 + len += c;
515 + letter++;
516 + }
517 +
518 + return ret;
519 +}
520 +#endif /* MBS_SUPPORT */
521 +
522 EXECUTE_FCT(Fexecute)
523 {
524 register char const *beg, *try, *end;
525 @@ -519,69 +751,256 @@ EXECUTE_FCT(Fexecute)
526 struct kwsmatch kwsmatch;
527 size_t ret_val;
528 #ifdef MBS_SUPPORT
529 - char *mb_properties = NULL;
530 - if (MB_CUR_MAX > 1)
531 - {
532 - if (match_icase)
533 - {
534 - char *case_buf = xmalloc(size);
535 - memcpy(case_buf, buf, size);
536 - if (start_ptr)
537 - start_ptr = case_buf + (start_ptr - buf);
538 - buf = case_buf;
539 - }
540 - mb_properties = check_multibyte_string(buf, size);
541 - }
542 + int mb_cur_max = MB_CUR_MAX;
543 + mbstate_t mbs;
544 + memset (&mbs, '\0', sizeof (mbstate_t));
545 + const char *last_char = NULL;
546 #endif /* MBS_SUPPORT */
547
548 for (beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
549 {
550 size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
551 if (offset == (size_t) -1)
552 - goto failure;
553 + return offset;
554 #ifdef MBS_SUPPORT
555 - if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
556 - continue; /* It is a part of multibyte character. */
557 + if (mb_cur_max > 1 && !using_utf8)
558 + {
559 + size_t bytes_left = offset;
560 + while (bytes_left)
561 + {
562 + size_t mlen = mbrlen (beg, bytes_left, &mbs);
563 +
564 + last_char = beg;
565 + if (mlen == (size_t) -1 || mlen == 0)
566 + {
567 + /* Incomplete character: treat as single-byte. */
568 + memset (&mbs, '\0', sizeof (mbstate_t));
569 + beg++;
570 + bytes_left--;
571 + continue;
572 + }
573 +
574 + if (mlen == (size_t) -2)
575 + /* Offset points inside multibyte character: no good. */
576 + break;
577 +
578 + beg += mlen;
579 + bytes_left -= mlen;
580 + }
581 +
582 + if (bytes_left)
583 + continue;
584 + }
585 + else
586 #endif /* MBS_SUPPORT */
587 beg += offset;
588 +#ifdef MBS_SUPPORT
589 + /* For f_i_multibyte, the string at beg now matches first 3 chars of
590 + one of the search strings (less if there are shorter search strings).
591 + See if this is a real match. */
592 + if (f_i_multibyte
593 + && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], start_ptr == NULL))
594 + goto next_char;
595 +#endif /* MBS_SUPPORT */
596 len = kwsmatch.size[0];
597 if (start_ptr && !match_words)
598 goto success_in_beg_and_len;
599 if (match_lines)
600 {
601 if (beg > buf && beg[-1] != eol)
602 - continue;
603 + goto next_char;
604 if (beg + len < buf + size && beg[len] != eol)
605 - continue;
606 + goto next_char;
607 goto success;
608 }
609 else if (match_words)
610 - for (try = beg; len; )
611 - {
612 - if (try > buf && WCHAR((unsigned char) try[-1]))
613 - break;
614 - if (try + len < buf + size && WCHAR((unsigned char) try[len]))
615 - {
616 - offset = kwsexec (kwset, beg, --len, &kwsmatch);
617 - if (offset == (size_t) -1)
618 - break;
619 - try = beg + offset;
620 - len = kwsmatch.size[0];
621 - }
622 - else if (!start_ptr)
623 - goto success;
624 - else
625 - goto success_in_beg_and_len;
626 - } /* for (try) */
627 - else
628 + {
629 + while (len)
630 + {
631 + int word_match = 0;
632 + if (beg > buf)
633 + {
634 +#ifdef MBS_SUPPORT
635 + if (mb_cur_max > 1)
636 + {
637 + const char *s;
638 + int mr;
639 + wchar_t pwc;
640 +
641 + if (using_utf8)
642 + {
643 + s = beg - 1;
644 + while (s > buf
645 + && (unsigned char) *s >= 0x80
646 + && (unsigned char) *s <= 0xbf)
647 + --s;
648 + }
649 + else
650 + s = last_char;
651 + mr = mbtowc (&pwc, s, beg - s);
652 + if (mr <= 0)
653 + memset (&mbs, '\0', sizeof (mbstate_t));
654 + else if ((iswalnum (pwc) || pwc == L'_')
655 + && mr == (int) (beg - s))
656 + goto next_char;
657 + }
658 + else
659 +#endif /* MBS_SUPPORT */
660 + if (WCHAR ((unsigned char) beg[-1]))
661 + goto next_char;
662 + }
663 +#ifdef MBS_SUPPORT
664 + if (mb_cur_max > 1)
665 + {
666 + wchar_t nwc;
667 + int mr;
668 +
669 + mr = mbtowc (&nwc, beg + len, buf + size - beg - len);
670 + if (mr <= 0)
671 + {
672 + memset (&mbs, '\0', sizeof (mbstate_t));
673 + word_match = 1;
674 + }
675 + else if (!iswalnum (nwc) && nwc != L'_')
676 + word_match = 1;
677 + }
678 + else
679 +#endif /* MBS_SUPPORT */
680 + if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len]))
681 + word_match = 1;
682 + if (word_match)
683 + {
684 + if (start_ptr == NULL)
685 + /* Returns the whole line now we know there's a word match. */
686 + goto success;
687 + else {
688 + /* Returns just this word match. */
689 + *match_size = len;
690 + return beg - buf;
691 + }
692 + }
693 + if (len > 0)
694 + {
695 + /* Try a shorter length anchored at the same place. */
696 + --len;
697 + offset = kwsexec (kwset, beg, len, &kwsmatch);
698 +
699 + if (offset == -1)
700 + goto next_char; /* Try a different anchor. */
701 +#ifdef MBS_SUPPORT
702 +
703 + if (mb_cur_max > 1 && !using_utf8)
704 + {
705 + size_t bytes_left = offset;
706 + while (bytes_left)
707 + {
708 + size_t mlen = mbrlen (beg, bytes_left, &mbs);
709 +
710 + last_char = beg;
711 + if (mlen == (size_t) -1 || mlen == 0)
712 + {
713 + /* Incomplete character: treat as single-byte. */
714 + memset (&mbs, '\0', sizeof (mbstate_t));
715 + beg++;
716 + bytes_left--;
717 + continue;
718 + }
719 +
720 + if (mlen == (size_t) -2)
721 + {
722 + /* Offset points inside multibyte character:
723 + * no good. */
724 + break;
725 + }
726 +
727 + beg += mlen;
728 + bytes_left -= mlen;
729 + }
730 +
731 + if (bytes_left)
732 + {
733 + memset (&mbs, '\0', sizeof (mbstate_t));
734 + goto next_char; /* Try a different anchor. */
735 + }
736 + }
737 + else
738 +#endif /* MBS_SUPPORT */
739 + beg += offset;
740 +#ifdef MBS_SUPPORT
741 + /* The string at beg now matches first 3 chars of one of
742 + the search strings (less if there are shorter search
743 + strings). See if this is a real match. */
744 + if (f_i_multibyte
745 + && Fimbexec (beg, len - offset, &kwsmatch.size[0],
746 + start_ptr == NULL))
747 + goto next_char;
748 +#endif /* MBS_SUPPORT */
749 + len = kwsmatch.size[0];
750 + }
751 + }
752 + }
753 + else
754 goto success;
755 - } /* for (beg in buf) */
756 +next_char:;
757 +#ifdef MBS_SUPPORT
758 + /* Advance to next character. For MB_CUR_MAX == 1 case this is handled
759 + by ++beg above. */
760 + if (mb_cur_max > 1)
761 + {
762 + if (using_utf8)
763 + {
764 + unsigned char c = *beg;
765 + if (c >= 0xc2)
766 + {
767 + if (c < 0xe0)
768 + ++beg;
769 + else if (c < 0xf0)
770 + beg += 2;
771 + else if (c < 0xf8)
772 + beg += 3;
773 + else if (c < 0xfc)
774 + beg += 4;
775 + else if (c < 0xfe)
776 + beg += 5;
777 + }
778 + }
779 + else
780 + {
781 + size_t l = mbrlen (beg, buf + size - beg, &mbs);
782
783 - failure:
784 - ret_val = -1;
785 - goto out;
786 + last_char = beg;
787 + if (l + 2 >= 2)
788 + beg += l - 1;
789 + else
790 + memset (&mbs, '\0', sizeof (mbstate_t));
791 + }
792 + }
793 +#endif /* MBS_SUPPORT */
794 + }
795 +
796 + return -1;
797
798 success:
799 +#ifdef MBS_SUPPORT
800 + if (mb_cur_max > 1 && !using_utf8)
801 + {
802 + end = beg + len;
803 + while (end < buf + size)
804 + {
805 + size_t mlen = mbrlen (end, buf + size - end, &mbs);
806 + if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
807 + {
808 + memset (&mbs, '\0', sizeof (mbstate_t));
809 + mlen = 1;
810 + }
811 + if (mlen == 1 && *end == eol)
812 + break;
813 +
814 + end += mlen;
815 + }
816 + }
817 + else
818 + #endif /* MBS_SUPPORT */
819 end = memchr (beg + len, eol, (buf + size) - (beg + len));
820 end++;
821 while (buf < beg && beg[-1] != eol)
822 @@ -591,15 +1010,6 @@ EXECUTE_FCT(Fexecute)
823 *match_size = len;
824 ret_val = beg - buf;
825 out:
826 -#ifdef MBS_SUPPORT
827 - if (MB_CUR_MAX > 1)
828 - {
829 - if (match_icase)
830 - free((char*)buf);
831 - if (mb_properties)
832 - free(mb_properties);
833 - }
834 -#endif /* MBS_SUPPORT */
835 return ret_val;
836 }
837 #endif /* defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) */
838 --
839 1.5.5.1
840

admin@fedoraproject.org
ViewVC Help
Powered by ViewVC 1.1.2