From c54159d44ceaba26ceda9fea1804f0de122a8f30 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 5 Sep 2016 17:06:29 -0400 Subject: Make locale-dependent regex character classes work for large char codes. Previously, we failed to recognize Unicode characters above U+7FF as being members of locale-dependent character classes such as [[:alpha:]]. (Actually, the same problem occurs for large pg_wchar values in any multibyte encoding, but UTF8 is the only case people have actually complained about.) It's impractical to get Spencer's original code to handle character classes or ranges containing many thousands of characters, because it insists on considering each member character individually at regex compile time, whether or not the character will ever be of interest at run time. To fix, choose a cutoff point MAX_SIMPLE_CHR below which we process characters individually as before, and deal with entire ranges or classes as single entities above that. We can actually make things cheaper than before for chars below the cutoff, because the color map can now be a simple linear array for those chars, rather than the multilevel tree structure Spencer designed. It's more expensive than before for chars above the cutoff, because we must do a binary search in a list of high chars and char ranges used in the regex pattern, plus call iswalpha() and friends for each locale-dependent character class used in the pattern. However, multibyte encodings are normally designed to give smaller codes to popular characters, so that we can expect that the slow path will be taken relatively infrequently. In any case, the speed penalty appears minor except when we have to apply iswalpha() etc. to high character codes at runtime --- and the previous coding gave wrong answers for those cases, so whether it was faster is moot. Tom Lane, reviewed by Heikki Linnakangas Discussion: <15563.1471913698@sss.pgh.pa.us> --- src/include/regex/regcustom.h | 10 +++ src/include/regex/regex.h | 1 - src/include/regex/regguts.h | 141 ++++++++++++++++++++---------------------- 3 files changed, 76 insertions(+), 76 deletions(-) (limited to 'src/include/regex') diff --git a/src/include/regex/regcustom.h b/src/include/regex/regcustom.h index 459851a7f6..04a1893c80 100644 --- a/src/include/regex/regcustom.h +++ b/src/include/regex/regcustom.h @@ -77,6 +77,16 @@ typedef unsigned uchr; /* unsigned type that will hold a chr */ */ #define CHR_IS_IN_RANGE(c) ((c) <= CHR_MAX) +/* + * MAX_SIMPLE_CHR is the cutoff between "simple" and "complicated" processing + * in the color map logic. It should usually be chosen high enough to ensure + * that all common characters are <= MAX_SIMPLE_CHR. However, very large + * values will be counterproductive since they cause more regex setup time. + * Also, small values can be helpful for testing the high-color-map logic + * with plain old ASCII input. + */ +#define MAX_SIMPLE_CHR 0x7FF /* suitable value for Unicode */ + /* functions operating on chr */ #define iscalnum(x) pg_wc_isalnum(x) #define iscalpha(x) pg_wc_isalpha(x) diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h index 2f89dc9326..cc73db2547 100644 --- a/src/include/regex/regex.h +++ b/src/include/regex/regex.h @@ -172,6 +172,5 @@ extern int pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t * extern int pg_regprefix(regex_t *, pg_wchar **, size_t *); extern void pg_regfree(regex_t *); extern size_t pg_regerror(int, const regex_t *, char *, size_t); -extern void pg_set_regex_collation(Oid collation); #endif /* _REGEX_H_ */ diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index b0aa641cc4..69816f1449 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -127,23 +127,6 @@ #define ISBSET(uv, sn) ((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS))) - -/* - * We dissect a chr into byts for colormap table indexing. Here we define - * a byt, which will be the same as a byte on most machines... The exact - * size of a byt is not critical, but about 8 bits is good, and extraction - * of 8-bit chunks is sometimes especially fast. - */ -#ifndef BYTBITS -#define BYTBITS 8 /* bits in a byt */ -#endif -#define BYTTAB (1<flags & FREECOL) - union tree *block; /* block of solid color, if any */ }; /* * The color map itself * - * Much of the data in the colormap struct is only used at compile time. - * However, the bulk of the space usage is in the "tree" structure, so it's - * not clear that there's much point in converting the rest to a more compact - * form when compilation is finished. + * This struct holds both data used only at compile time, and the chr to + * color mapping information, used at both compile and run time. The latter + * is the bulk of the space, so it's not really worth separating out the + * compile-only portion. + * + * Ideally, the mapping data would just be an array of colors indexed by + * chr codes; but for large character sets that's impractical. Fortunately, + * common characters have smaller codes, so we can use a simple array for chr + * codes up to MAX_SIMPLE_CHR, and do something more complex for codes above + * that, without much loss of performance. The "something more complex" is a + * 2-D array of color entries, where row indexes correspond to individual chrs + * or chr ranges that have been mentioned in the regex (with row zero + * representing all other chrs), and column indexes correspond to different + * sets of locale-dependent character classes such as "isalpha". The + * classbits[k] entry is zero if we do not care about the k'th character class + * in this regex, and otherwise it is the bit to be OR'd into the column index + * if the character in question is a member of that class. We find the color + * of a high-valued chr by identifying which colormaprange it is in to get + * the row index (use row zero if it's in none of them), identifying which of + * the interesting cclasses it's in to get the column index, and then indexing + * into the 2-D hicolormap array. + * + * The colormapranges are required to be nonempty, nonoverlapping, and to + * appear in increasing chr-value order. */ + +#define NUM_CCLASSES 13 /* must match data in regc_locale.c */ + +typedef struct colormaprange +{ + chr cmin; /* range represents cmin..cmax inclusive */ + chr cmax; + int rownum; /* row index in hicolormap array (>= 1) */ +} colormaprange; + struct colormap { int magic; @@ -233,27 +213,27 @@ struct colormap color free; /* beginning of free chain (if non-0) */ struct colordesc *cd; /* pointer to array of colordescs */ #define CDEND(cm) (&(cm)->cd[(cm)->max + 1]) + + /* mapping data for chrs <= MAX_SIMPLE_CHR: */ + color *locolormap; /* simple array indexed by chr code */ + + /* mapping data for chrs > MAX_SIMPLE_CHR: */ + int classbits[NUM_CCLASSES]; /* see comment above */ + int numcmranges; /* number of colormapranges */ + colormaprange *cmranges; /* ranges of high chrs */ + color *hicolormap; /* 2-D array of color entries */ + int maxarrayrows; /* number of array rows allocated */ + int hiarrayrows; /* number of array rows in use */ + int hiarraycols; /* number of array columns (2^N) */ + /* If we need up to NINLINECDS, we store them here to save a malloc */ -#define NINLINECDS ((size_t)10) +#define NINLINECDS ((size_t) 10) struct colordesc cdspace[NINLINECDS]; - union tree tree[NBYTS]; /* tree top, plus lower-level fill blocks */ }; -/* optimization magic to do fast chr->color mapping */ -#define B0(c) ((c) & BYTMASK) -#define B1(c) (((c)>>BYTBITS) & BYTMASK) -#define B2(c) (((c)>>(2*BYTBITS)) & BYTMASK) -#define B3(c) (((c)>>(3*BYTBITS)) & BYTMASK) -#if NBYTS == 1 -#define GETCOLOR(cm, c) ((cm)->tree->tcolor[B0(c)]) -#endif -/* beware, for NBYTS>1, GETCOLOR() is unsafe -- 2nd arg used repeatedly */ -#if NBYTS == 2 -#define GETCOLOR(cm, c) ((cm)->tree->tptr[B1(c)]->tcolor[B0(c)]) -#endif -#if NBYTS == 4 -#define GETCOLOR(cm, c) ((cm)->tree->tptr[B3(c)]->tptr[B2(c)]->tptr[B1(c)]->tcolor[B0(c)]) -#endif +/* fetch color for chr; beware of multiple evaluation of c argument */ +#define GETCOLOR(cm, c) \ + ((c) <= MAX_SIMPLE_CHR ? (cm)->locolormap[(c) - CHR_MIN] : pg_reg_getcolor(cm, c)) /* @@ -264,6 +244,11 @@ struct colormap * Representation of a set of characters. chrs[] represents individual * code points, ranges[] represents ranges in the form min..max inclusive. * + * If the cvec represents a locale-specific character class, eg [[:alpha:]], + * then the chrs[] and ranges[] arrays contain only members of that class + * up to MAX_SIMPLE_CHR (inclusive). cclasscode is set to regc_locale.c's + * code for the class, rather than being -1 as it is in an ordinary cvec. + * * Note that in cvecs gotten from newcvec() and intended to be freed by * freecvec(), both arrays of chrs are after the end of the struct, not * separately malloc'd; so chrspace and rangespace are effectively immutable. @@ -276,6 +261,7 @@ struct cvec int nranges; /* number of ranges (chr pairs) */ int rangespace; /* number of ranges allocated in ranges[] */ chr *ranges; /* pointer to vector of chr pairs */ + int cclasscode; /* value of "enum classes", or -1 */ }; @@ -489,3 +475,8 @@ struct guts int nlacons; /* size of lacons[]; note that only slots * numbered 1 .. nlacons-1 are used */ }; + + +/* prototypes for functions that are exported from regcomp.c to regexec.c */ +extern void pg_set_regex_collation(Oid collation); +extern color pg_reg_getcolor(struct colormap * cm, chr c); -- cgit v1.2.3