Avoid repeated creation/freeing of per-subre DFAs during regex search.
author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 24 Feb 2012 19:56:35 +0000 (14:56 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 24 Feb 2012 23:40:30 +0000 (18:40 -0500)
In nested sub-regex trees, lower-level nodes created DFAs and then
destroyed them again before exiting, which is a bit dumb considering that
the recursive search is likely to call those nodes again later.  Instead
cache each created DFA until the end of pg_regexec().  This is basically a
space for time tradeoff, in that it might increase the maximum memory
usage.  However, in most regex patterns there are not all that many subre
nodes, so not that many DFAs --- and in any case, the peak usage occurs
when reaching the bottom recursion level, and except for alternation cases
that's going to be the same anyway.

src/backend/regex/regexec.c
src/include/regex/regguts.h

index 55f0c18d14f6af48432638fd95f72dcc27fb0081..d3e850a86994cadd2984fbd24140dfaea4985478 100644 (file)
@@ -112,6 +112,7 @@ struct vars
        chr                *search_start;       /* search start of string */
        chr                *stop;                       /* just past end of string */
        int                     err;                    /* error code if any (0 none) */
+       struct dfa **subdfas;           /* per-subre DFAs */
        struct smalldfa dfa1;
        struct smalldfa dfa2;
 };
@@ -130,6 +131,7 @@ struct vars
  * forward declarations
  */
 /* === regexec.c === */
+static struct dfa *getsubdfa(struct vars *, struct subre *);
 static int     find(struct vars *, struct cnfa *, struct colormap *);
 static int     cfind(struct vars *, struct cnfa *, struct colormap *);
 static int     cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
@@ -180,11 +182,15 @@ pg_regexec(regex_t *re,
        register struct vars *v = &var;
        int                     st;
        size_t          n;
+       size_t          i;
        int                     backref;
 
 #define  LOCALMAT       20
        regmatch_t      mat[LOCALMAT];
 
+#define  LOCALDFAS      40
+       struct dfa *subdfas[LOCALDFAS];
+
        /* sanity checks */
        if (re == NULL || string == NULL || re->re_magic != REMAGIC)
                return REG_INVARG;
@@ -225,6 +231,20 @@ pg_regexec(regex_t *re,
        v->search_start = (chr *) string + search_start;
        v->stop = (chr *) string + len;
        v->err = 0;
+       assert(v->g->ntree >= 0);
+       n = (size_t) v->g->ntree;
+       if (n <= LOCALDFAS)
+               v->subdfas = subdfas;
+       else
+               v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
+       if (v->subdfas == NULL)
+       {
+               if (v->pmatch != pmatch && v->pmatch != mat)
+                       FREE(v->pmatch);
+               return REG_ESPACE;
+       }
+       for (i = 0; i < n; i++)
+               v->subdfas[i] = NULL;
 
        /* do it */
        assert(v->g->tree != NULL);
@@ -244,9 +264,36 @@ pg_regexec(regex_t *re,
        /* clean up */
        if (v->pmatch != pmatch && v->pmatch != mat)
                FREE(v->pmatch);
+       for (i = 0; i < n; i++)
+       {
+               if (v->subdfas[i] != NULL)
+                       freedfa(v->subdfas[i]);
+       }
+       if (v->subdfas != subdfas)
+               FREE(v->subdfas);
+
        return st;
 }
 
+/*
+ * getsubdfa - create or re-fetch the DFA for a subre node
+ *
+ * We only need to create the DFA once per overall regex execution.
+ * The DFA will be freed by the cleanup step in pg_regexec().
+ */
+static struct dfa *
+getsubdfa(struct vars * v,
+                 struct subre * t)
+{
+       if (v->subdfas[t->id] == NULL)
+       {
+               v->subdfas[t->id] = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
+               if (ISERR())
+                       return NULL;
+       }
+       return v->subdfas[t->id];
+}
+
 /*
  * find - find a match for the main NFA (no-complications case)
  */
@@ -578,15 +625,10 @@ condissect(struct vars * v,
        assert(t->left != NULL && t->left->cnfa.nstates > 0);
        assert(t->right != NULL && t->right->cnfa.nstates > 0);
 
-       d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
+       d = getsubdfa(v, t->left);
+       NOERR();
+       d2 = getsubdfa(v, t->right);
        NOERR();
-       d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2);
-       if (ISERR())
-       {
-               assert(d2 == NULL);
-               freedfa(d);
-               return v->err;
-       }
 
        /* pick a tentative midpoint */
        if (shorter)
@@ -595,11 +637,7 @@ condissect(struct vars * v,
        else
                mid = longest(v, d, begin, end, (int *) NULL);
        if (mid == NULL)
-       {
-               freedfa(d);
-               freedfa(d2);
                return REG_ASSERT;
-       }
        MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
 
        /* iterate until satisfaction or failure */
@@ -610,8 +648,6 @@ condissect(struct vars * v,
                {
                        /* all possibilities exhausted! */
                        MDEBUG(("no midpoint!\n"));
-                       freedfa(d);
-                       freedfa(d2);
                        return REG_ASSERT;
                }
                if (shorter)
@@ -623,8 +659,6 @@ condissect(struct vars * v,
                {
                        /* failed to find a new one! */
                        MDEBUG(("failed midpoint!\n"));
-                       freedfa(d);
-                       freedfa(d2);
                        return REG_ASSERT;
                }
                MDEBUG(("new midpoint %ld\n", LOFF(mid)));
@@ -632,8 +666,6 @@ condissect(struct vars * v,
 
        /* satisfaction */
        MDEBUG(("successful\n"));
-       freedfa(d);
-       freedfa(d2);
        i = dissect(v, t->left, begin, mid);
        if (i != REG_OKAY)
                return i;
@@ -659,16 +691,13 @@ altdissect(struct vars * v,
        {
                MDEBUG(("trying %dth\n", i));
                assert(t->left != NULL && t->left->cnfa.nstates > 0);
-               d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
-               if (ISERR())
-                       return v->err;
+               d = getsubdfa(v, t->left);
+               NOERR();
                if (longest(v, d, begin, end, (int *) NULL) == end)
                {
                        MDEBUG(("success\n"));
-                       freedfa(d);
                        return dissect(v, t->left, begin, end);
                }
-               freedfa(d);
        }
        return REG_ASSERT;                      /* none of them matched?!? */
 }
@@ -731,7 +760,7 @@ iterdissect(struct vars * v,
                return REG_ESPACE;
        endpts[0] = begin;
 
-       d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+       d = getsubdfa(v, t->left);
        if (ISERR())
        {
                FREE(endpts);
@@ -814,7 +843,6 @@ iterdissect(struct vars * v,
                        if (er == REG_NOMATCH)
                                break;
                        /* oops, something failed */
-                       freedfa(d);
                        FREE(endpts);
                        return er;
                }
@@ -823,7 +851,6 @@ iterdissect(struct vars * v,
                {
                        /* satisfaction */
                        MDEBUG(("%d successful\n", t->id));
-                       freedfa(d);
                        FREE(endpts);
                        return REG_OKAY;
                }
@@ -856,7 +883,6 @@ backtrack:
 
        /* all possibilities exhausted - shouldn't happen in uncomplicated mode */
        MDEBUG(("%d failed\n", t->id));
-       freedfa(d);
        FREE(endpts);
        return REG_ASSERT;
 }
@@ -917,7 +943,7 @@ reviterdissect(struct vars * v,
                return REG_ESPACE;
        endpts[0] = begin;
 
-       d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+       d = getsubdfa(v, t->left);
        if (ISERR())
        {
                FREE(endpts);
@@ -1002,7 +1028,6 @@ reviterdissect(struct vars * v,
                        if (er == REG_NOMATCH)
                                break;
                        /* oops, something failed */
-                       freedfa(d);
                        FREE(endpts);
                        return er;
                }
@@ -1011,7 +1036,6 @@ reviterdissect(struct vars * v,
                {
                        /* satisfaction */
                        MDEBUG(("%d successful\n", t->id));
-                       freedfa(d);
                        FREE(endpts);
                        return REG_OKAY;
                }
@@ -1037,7 +1061,6 @@ backtrack:
 
        /* all possibilities exhausted - shouldn't happen in uncomplicated mode */
        MDEBUG(("%d failed\n", t->id));
-       freedfa(d);
        FREE(endpts);
        return REG_ASSERT;
 }
@@ -1106,25 +1129,16 @@ ccondissect(struct vars * v,
        if (t->left->flags & SHORTER)           /* reverse scan */
                return crevdissect(v, t, begin, end);
 
-       d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
-       if (ISERR())
-               return v->err;
-       d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
-       if (ISERR())
-       {
-               freedfa(d);
-               return v->err;
-       }
+       d = getsubdfa(v, t->left);
+       NOERR();
+       d2 = getsubdfa(v, t->right);
+       NOERR();
        MDEBUG(("cconcat %d\n", t->id));
 
        /* pick a tentative midpoint */
        mid = longest(v, d, begin, end, (int *) NULL);
        if (mid == NULL)
-       {
-               freedfa(d);
-               freedfa(d2);
                return REG_NOMATCH;
-       }
        MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
 
        /* iterate until satisfaction or failure */
@@ -1141,17 +1155,11 @@ ccondissect(struct vars * v,
                                {
                                        /* satisfaction */
                                        MDEBUG(("successful\n"));
-                                       freedfa(d);
-                                       freedfa(d2);
                                        return REG_OKAY;
                                }
                        }
                        if (er != REG_OKAY && er != REG_NOMATCH)
-                       {
-                               freedfa(d);
-                               freedfa(d2);
                                return er;
-                       }
                }
 
                /* that midpoint didn't work, find a new one */
@@ -1159,8 +1167,6 @@ ccondissect(struct vars * v,
                {
                        /* all possibilities exhausted */
                        MDEBUG(("%d no midpoint\n", t->id));
-                       freedfa(d);
-                       freedfa(d2);
                        return REG_NOMATCH;
                }
                mid = longest(v, d, begin, mid - 1, (int *) NULL);
@@ -1168,8 +1174,6 @@ ccondissect(struct vars * v,
                {
                        /* failed to find a new one */
                        MDEBUG(("%d failed midpoint\n", t->id));
-                       freedfa(d);
-                       freedfa(d2);
                        return REG_NOMATCH;
                }
                MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
@@ -1201,25 +1205,16 @@ crevdissect(struct vars * v,
        assert(t->left->flags & SHORTER);
 
        /* concatenation -- need to split the substring between parts */
-       d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
-       if (ISERR())
-               return v->err;
-       d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
-       if (ISERR())
-       {
-               freedfa(d);
-               return v->err;
-       }
+       d = getsubdfa(v, t->left);
+       NOERR();
+       d2 = getsubdfa(v, t->right);
+       NOERR();
        MDEBUG(("crev %d\n", t->id));
 
        /* pick a tentative midpoint */
        mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
        if (mid == NULL)
-       {
-               freedfa(d);
-               freedfa(d2);
                return REG_NOMATCH;
-       }
        MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
 
        /* iterate until satisfaction or failure */
@@ -1236,17 +1231,11 @@ crevdissect(struct vars * v,
                                {
                                        /* satisfaction */
                                        MDEBUG(("successful\n"));
-                                       freedfa(d);
-                                       freedfa(d2);
                                        return REG_OKAY;
                                }
                        }
                        if (er != REG_OKAY && er != REG_NOMATCH)
-                       {
-                               freedfa(d);
-                               freedfa(d2);
                                return er;
-                       }
                }
 
                /* that midpoint didn't work, find a new one */
@@ -1254,8 +1243,6 @@ crevdissect(struct vars * v,
                {
                        /* all possibilities exhausted */
                        MDEBUG(("%d no midpoint\n", t->id));
-                       freedfa(d);
-                       freedfa(d2);
                        return REG_NOMATCH;
                }
                mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL);
@@ -1263,8 +1250,6 @@ crevdissect(struct vars * v,
                {
                        /* failed to find a new one */
                        MDEBUG(("%d failed midpoint\n", t->id));
-                       freedfa(d);
-                       freedfa(d2);
                        return REG_NOMATCH;
                }
                MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
@@ -1377,15 +1362,10 @@ caltdissect(struct vars * v,
 
        MDEBUG(("calt n%d\n", t->id));
 
-       d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
-       if (ISERR())
-               return v->err;
+       d = getsubdfa(v, t->left);
+       NOERR();
        if (longest(v, d, begin, end, (int *) NULL) != end)
-       {
-               freedfa(d);
                return caltdissect(v, t->right, begin, end);
-       }
-       freedfa(d);
        MDEBUG(("calt matched\n"));
 
        er = cdissect(v, t->left, begin, end);
@@ -1453,7 +1433,7 @@ citerdissect(struct vars * v,
                return REG_ESPACE;
        endpts[0] = begin;
 
-       d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+       d = getsubdfa(v, t->left);
        if (ISERR())
        {
                FREE(endpts);
@@ -1537,7 +1517,6 @@ citerdissect(struct vars * v,
                        if (er == REG_NOMATCH)
                                break;
                        /* oops, something failed */
-                       freedfa(d);
                        FREE(endpts);
                        return er;
                }
@@ -1546,7 +1525,6 @@ citerdissect(struct vars * v,
                {
                        /* satisfaction */
                        MDEBUG(("%d successful\n", t->id));
-                       freedfa(d);
                        FREE(endpts);
                        return REG_OKAY;
                }
@@ -1579,7 +1557,6 @@ backtrack:
 
        /* all possibilities exhausted */
        MDEBUG(("%d failed\n", t->id));
-       freedfa(d);
        FREE(endpts);
        return REG_NOMATCH;
 }
@@ -1640,7 +1617,7 @@ creviterdissect(struct vars * v,
                return REG_ESPACE;
        endpts[0] = begin;
 
-       d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+       d = getsubdfa(v, t->left);
        if (ISERR())
        {
                FREE(endpts);
@@ -1726,7 +1703,6 @@ creviterdissect(struct vars * v,
                        if (er == REG_NOMATCH)
                                break;
                        /* oops, something failed */
-                       freedfa(d);
                        FREE(endpts);
                        return er;
                }
@@ -1735,7 +1711,6 @@ creviterdissect(struct vars * v,
                {
                        /* satisfaction */
                        MDEBUG(("%d successful\n", t->id));
-                       freedfa(d);
                        FREE(endpts);
                        return REG_OKAY;
                }
@@ -1761,7 +1736,6 @@ backtrack:
 
        /* all possibilities exhausted */
        MDEBUG(("%d failed\n", t->id));
-       freedfa(d);
        FREE(endpts);
        return REG_NOMATCH;
 }
index bc5419d98e78bf103a0fa54c87292e83c17eb2aa..65b8d178da86d244080560e9afd5499a4b500b24 100644 (file)
@@ -409,7 +409,7 @@ struct subre
 #define  PREF(f) ((f)&LOCAL)
 #define  PREF2(f1, f2)  ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
 #define  COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
-       short           id;                             /* ID of subre (1..ntree) */
+       short           id;                             /* ID of subre (1..ntree-1) */
        int                     subno;                  /* subexpression number (for 'b' and '(') */
        short           min;                    /* min repetitions for iteration or backref */
        short           max;                    /* max repetitions for iteration or backref */
@@ -446,7 +446,7 @@ struct guts
        size_t          nsub;                   /* copy of re_nsub */
        struct subre *tree;
        struct cnfa search;                     /* for fast preliminary search */
+       int                     ntree;                  /* number of subre's, plus one */
+       int                     ntree;                  /* number of subre's, less one */
        struct colormap cmap;
        int                     FUNCPTR(compare, (const chr *, const chr *, size_t));
        struct subre *lacons;           /* lookahead-constraint vector */