| author | Tom Lane | 2019-01-10 00:47:38 +0000 |
|---|---|---|
| committer | Tom Lane | 2019-01-10 00:47:46 +0000 |
| commit | c64d0cd5ce24a344798534f1bc5827a9199b7a6e (patch) | |
| tree | 3968456d54c3f18d07976e5a139ca60589a8fbf0 /src/common/kwlookup.c | |
| parent | 5d59a6c5eaff4a58322683e450e76a11d943d322 (diff) | |
Use perfect hashing, instead of binary search, for keyword lookup.
We've been speculating for a long time that hash-based keyword lookup
ought to be faster than binary search, but up to now we hadn't found
a suitable tool for generating the hash function. Joerg Sonnenberger
provided the inspiration, and sample code, to show us that rolling our
own generator wasn't a ridiculous idea. Hence, do that.
The method used here requires a lookup table of approximately 4 bytes
per keyword, but that's less than what we saved in the predecessor commit
afb0d0712, so it's not a big problem. The time savings is indeed
significant: preliminary testing suggests that the total time for raw
parsing (flex + bison phases) drops by ~20%.
Patch by me, but it owes its existence to Joerg Sonnenberger;
thanks also to John Naylor for review.
Discussion: https://postgr.es/m/20190103163340.GA15803@britannica.bec.de
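The technique the message describes is easiest to see in miniature: a collision-free ("perfect") hash names at most one candidate keyword, so the lookup is hash, index, then a single case-insensitive verification, with no search loop at all. The sketch below is a toy stand-in, not PostgreSQL's generated code: the keyword set, the hand-picked hash, the slot table, and all names are invented for illustration.

```c
/*
 * Toy illustration of hash-then-verify keyword lookup.  The hash function,
 * slot table, and names are hypothetical stand-ins for the generated code.
 */
#include <stdio.h>
#include <string.h>

static const char *const toy_keywords[] = {"and", "not", "or", "select"};

/* hash slot -> keyword index, -1 = empty; sized so the toy hash is collision-free */
static const int toy_hash_slots[11] = {1, -1, 2, -1, -1, 0, -1, -1, 3, -1, -1};

/* Case-insensitive toy hash: collision-free over the four keywords above. */
static int
toy_hash(const char *str, size_t len)
{
	unsigned char c = (unsigned char) str[0];

	if (c >= 'A' && c <= 'Z')
		c += 'a' - 'A';
	return (int) ((len * c) % 11);
}

/* Return keyword index, or -1 if str is not a keyword (case-insensitive). */
static int
toy_keyword_lookup(const char *str)
{
	size_t		len = strlen(str);
	int			kwnum;
	const char *kw;

	if (len == 0 || len > 6)	/* longest toy keyword is "select" */
		return -1;
	kwnum = toy_hash_slots[toy_hash(str, len)];
	if (kwnum < 0)
		return -1;
	/* The hash names exactly one candidate; verify it character by character. */
	kw = toy_keywords[kwnum];
	for (; *str != '\0'; str++, kw++)
	{
		char		ch = *str;

		if (ch >= 'A' && ch <= 'Z')
			ch += 'a' - 'A';
		if (ch != *kw)
			return -1;
	}
	return (*kw == '\0') ? kwnum : -1;
}

int
main(void)
{
	printf("%d %d %d\n",
		   toy_keyword_lookup("SELECT"),	/* 3 */
		   toy_keyword_lookup("Or"),		/* 2 */
		   toy_keyword_lookup("orange"));	/* -1 */
	return 0;
}
```

In the committed code the hash function and its table are machine-generated from the keyword list (costing roughly 4 bytes of table per keyword, per the message above); the verification pass is what makes a stray hash hit on a non-keyword harmless.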
Diffstat (limited to 'src/common/kwlookup.c')
| -rw-r--r-- | src/common/kwlookup.c | 73 |
1 file changed, 32 insertions(+), 41 deletions(-)
diff --git a/src/common/kwlookup.c b/src/common/kwlookup.c
index d72842e759..6545480b5c 100644
--- a/src/common/kwlookup.c
+++ b/src/common/kwlookup.c
@@ -35,60 +35,51 @@
  * receive a different case-normalization mapping.
  */
 int
-ScanKeywordLookup(const char *text,
+ScanKeywordLookup(const char *str,
 				  const ScanKeywordList *keywords)
 {
-	int			len,
-				i;
-	char		word[NAMEDATALEN];
-	const char *kw_string;
-	const uint16 *kw_offsets;
-	const uint16 *low;
-	const uint16 *high;
-
-	len = strlen(text);
+	size_t		len;
+	int			h;
+	const char *kw;
 
+	/*
+	 * Reject immediately if too long to be any keyword.  This saves useless
+	 * hashing and downcasing work on long strings.
+	 */
+	len = strlen(str);
 	if (len > keywords->max_kw_len)
-		return -1;				/* too long to be any keyword */
-
-	/* We assume all keywords are shorter than NAMEDATALEN. */
-	Assert(len < NAMEDATALEN);
+		return -1;
 
 	/*
-	 * Apply an ASCII-only downcasing.  We must not use tolower() since it may
-	 * produce the wrong translation in some locales (eg, Turkish).
+	 * Compute the hash function.  We assume it was generated to produce
+	 * case-insensitive results.  Since it's a perfect hash, we need only
+	 * match to the specific keyword it identifies.
 	 */
-	for (i = 0; i < len; i++)
-	{
-		char		ch = text[i];
+	h = keywords->hash(str, len);
 
-		if (ch >= 'A' && ch <= 'Z')
-			ch += 'a' - 'A';
-		word[i] = ch;
-	}
-	word[len] = '\0';
+	/* An out-of-range result implies no match */
+	if (h < 0 || h >= keywords->num_keywords)
+		return -1;
 
 	/*
-	 * Now do a binary search using plain strcmp() comparison.
+	 * Compare character-by-character to see if we have a match, applying an
+	 * ASCII-only downcasing to the input characters.  We must not use
+	 * tolower() since it may produce the wrong translation in some locales
+	 * (eg, Turkish).
 	 */
-	kw_string = keywords->kw_string;
-	kw_offsets = keywords->kw_offsets;
-	low = kw_offsets;
-	high = kw_offsets + (keywords->num_keywords - 1);
-	while (low <= high)
+	kw = GetScanKeyword(h, keywords);
+	while (*str != '\0')
 	{
-		const uint16 *middle;
-		int			difference;
+		char		ch = *str++;
 
-		middle = low + (high - low) / 2;
-		difference = strcmp(kw_string + *middle, word);
-		if (difference == 0)
-			return middle - kw_offsets;
-		else if (difference < 0)
-			low = middle + 1;
-		else
-			high = middle - 1;
+		if (ch >= 'A' && ch <= 'Z')
+			ch += 'a' - 'A';
+		if (ch != *kw++)
+			return -1;
 	}
+	if (*kw != '\0')
+		return -1;
 
-	return -1;
+	/* Success! */
+	return h;
 }
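For context, here is a hedged sketch of how a caller might use the new interface; it is not code from the tree. It assumes the core keyword list ScanKeywords and the parallel ScanKeywordCategories array declared in common/keywords.h after this refactoring series; the helper itself and its framing are hypothetical.

```c
/*
 * Hypothetical helper (illustration only): classify an identifier against
 * the core SQL keyword list.  ScanKeywordLookup() returns a keyword number
 * (0..N-1) or -1; the number indexes parallel metadata arrays.
 */
#include "postgres.h"			/* backend context assumed */

#include "common/keywords.h"	/* ScanKeywords, ScanKeywordCategories */

static int
keyword_category_of(const char *ident)
{
	int			kwnum = ScanKeywordLookup(ident, &ScanKeywords);

	/* -1 means "not a keyword"; otherwise look up its category */
	return (kwnum >= 0) ? ScanKeywordCategories[kwnum] : -1;
}
```

Callers that need the keyword's spelling can fetch it with GetScanKeyword(kwnum, &ScanKeywords), the same accessor the new lookup uses internally.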
