Extended statistics on expressions
author: Tomas Vondra <tomas.vondra@postgresql.org>
Fri, 26 Mar 2021 22:22:01 +0000 (23:22 +0100)
committer: Tomas Vondra <tomas.vondra@postgresql.org>
Fri, 26 Mar 2021 23:01:11 +0000 (00:01 +0100)
Allow defining extended statistics on expressions, not just on
simple column references.  With this commit, expressions are supported
by all existing extended statistics kinds, improving the same types of
estimates. A simple example may look like this:

  CREATE TABLE t (a int);
  CREATE STATISTICS s ON mod(a,10), mod(a,20) FROM t;
  ANALYZE t;

The collected statistics are useful e.g. to estimate queries with those
expressions in WHERE or GROUP BY clauses:

  SELECT * FROM t WHERE mod(a,10) = 0 AND mod(a,20) = 0;

  SELECT 1 FROM t GROUP BY mod(a,10), mod(a,20);

This introduces new internal statistics kind 'e' (expressions) which is
built automatically when the statistics object definition includes any
expressions. This represents single-expression statistics, as if there
was an expression index (but without the index maintenance overhead).
The statistics are stored in pg_statistic_ext_data as an array of
composite types, which is possible thanks to 79f6a942bd.

CREATE STATISTICS allows building statistics on a single expression, in
which case it's not possible to specify statistics kinds.

A new system view pg_stats_ext_exprs can be used to display expression
statistics, similarly to pg_stats and pg_stats_ext views.

ALTER TABLE ... ALTER COLUMN ... TYPE now treats extended statistics the
same way it treats indexes, i.e. it drops and recreates the statistics. This means
all statistics are reset, and we no longer try to preserve at least the
functional dependencies. This should not be a major issue in practice,
as the functional dependencies actually rely on per-column statistics,
which were always reset anyway.

Author: Tomas Vondra
Reviewed-by: Justin Pryzby, Dean Rasheed, Zhihong Yu
Discussion: https://postgr.es/m/ad7891d2-e90c-b446-9fe2-7419143847d7%40enterprisedb.com

43 files changed:
doc/src/sgml/catalogs.sgml
doc/src/sgml/ref/create_statistics.sgml
src/backend/catalog/Makefile
src/backend/catalog/system_views.sql
src/backend/commands/statscmds.c
src/backend/commands/tablecmds.c
src/backend/nodes/copyfuncs.c
src/backend/nodes/equalfuncs.c
src/backend/nodes/outfuncs.c
src/backend/optimizer/util/plancat.c
src/backend/parser/gram.y
src/backend/parser/parse_agg.c
src/backend/parser/parse_expr.c
src/backend/parser/parse_func.c
src/backend/parser/parse_utilcmd.c
src/backend/statistics/dependencies.c
src/backend/statistics/extended_stats.c
src/backend/statistics/mcv.c
src/backend/statistics/mvdistinct.c
src/backend/tcop/utility.c
src/backend/utils/adt/ruleutils.c
src/backend/utils/adt/selfuncs.c
src/bin/pg_dump/t/002_pg_dump.pl
src/bin/psql/describe.c
src/include/catalog/catversion.h
src/include/catalog/pg_proc.dat
src/include/catalog/pg_statistic_ext.h
src/include/catalog/pg_statistic_ext_data.h
src/include/commands/defrem.h
src/include/nodes/nodes.h
src/include/nodes/parsenodes.h
src/include/nodes/pathnodes.h
src/include/parser/parse_node.h
src/include/parser/parse_utilcmd.h
src/include/statistics/extended_stats_internal.h
src/include/statistics/statistics.h
src/include/utils/ruleutils.h
src/test/regress/expected/create_table_like.out
src/test/regress/expected/oidjoins.out
src/test/regress/expected/rules.out
src/test/regress/expected/stats_ext.out
src/test/regress/sql/create_table_like.sql
src/test/regress/sql/stats_ext.sql

index 0f8703af5a585a8865d26202e4fc42e341ecc00b..f103d914a62b90097248bce456d141693c294f32 100644 (file)
@@ -7385,8 +7385,22 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
        <literal>d</literal> for n-distinct statistics,
        <literal>f</literal> for functional dependency statistics, and
        <literal>m</literal> for most common values (MCV) list statistics
+       <literal>e</literal> for expression statistics
       </para></entry>
      </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>stxexprs</structfield> <type>pg_node_tree</type>
+      </para>
+      <para>
+       Expression trees (in <function>nodeToString()</function>
+       representation) for statistics object attributes that are not simple
+       column references.  This is a list with one element per expression.
+       Null if all statistics object attributes are simple references.
+      </para></entry>
+     </row>
+
     </tbody>
    </tgroup>
   </table>
@@ -7452,7 +7466,7 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
        (references <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>.<structfield>oid</structfield>)
       </para>
       <para>
-       Extended statistic object containing the definition for this data
+       Extended statistics object containing the definition for this data
       </para></entry>
      </row>
 
@@ -7484,6 +7498,15 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
        <structname>pg_mcv_list</structname> type
       </para></entry>
      </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>stxexprs</structfield> <type>pg_node_tree</type>
+      </para>
+      <para>
+       A list of any expressions covered by this statistics object.
+      </para></entry>
+     </row>
     </tbody>
    </tgroup>
   </table>
@@ -7637,6 +7660,16 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
        see <xref linkend="logical-replication-publication"/>.
       </para></entry>
      </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>stxdexpr</structfield> <type>pg_statistic[]</type>
+      </para>
+      <para>
+       Per-expression statistics, serialized as an array of
+       <structname>pg_statistic</structname> type
+      </para></entry>
+     </row>
     </tbody>
    </tgroup>
   </table>
@@ -9444,6 +9477,11 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
       <entry>extended planner statistics</entry>
      </row>
 
+     <row>
+      <entry><link linkend="view-pg-stats-ext-exprs"><structname>pg_stats_ext_exprs</structname></link></entry>
+      <entry>extended planner statistics for expressions</entry>
+     </row>
+
      <row>
       <entry><link linkend="view-pg-tables"><structname>pg_tables</structname></link></entry>
       <entry>tables</entry>
@@ -12696,10 +12734,19 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
        (references <link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.<structfield>attname</structfield>)
       </para>
       <para>
-       Name of the column described by this row
+       Names of the columns included in the extended statistics object
       </para></entry>
      </row>
 
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>exprs</structfield> <type>text[]</type>
+      </para>
+      <para>
+       Expressions included in the extended statistics object
+      </para></entry>
+      </row>
+
      <row>
       <entry role="catalog_table_entry"><para role="column_definition">
        <structfield>inherited</structfield> <type>bool</type>
@@ -12851,7 +12898,8 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
 
   <para>
    The view <structname>pg_stats_ext</structname> provides access to
-   the information stored in the <link
+   information about each extended statistics object in the database,
+   combining information stored in the <link
    linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>
    and <link linkend="catalog-pg-statistic-ext-data"><structname>pg_statistic_ext_data</structname></link>
    catalogs.  This view allows access only to rows of
@@ -12908,7 +12956,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
        (references <link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.<structfield>nspname</structfield>)
       </para>
       <para>
-       Name of schema containing extended statistic
+       Name of schema containing extended statistics object
       </para></entry>
      </row>
 
@@ -12918,7 +12966,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
        (references <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>.<structfield>stxname</structfield>)
       </para>
       <para>
-       Name of extended statistics
+       Name of extended statistics object
       </para></entry>
      </row>
 
@@ -12928,7 +12976,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
        (references <link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.<structfield>rolname</structfield>)
       </para>
       <para>
-       Owner of the extended statistics
+       Owner of the extended statistics object
       </para></entry>
      </row>
 
@@ -12938,7 +12986,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
        (references <link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.<structfield>attname</structfield>)
       </para>
       <para>
-       Names of the columns the extended statistics is defined on
+       Names of the columns the extended statistics object is defined on
       </para></entry>
      </row>
 
@@ -12947,7 +12995,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
        <structfield>kinds</structfield> <type>char[]</type>
       </para>
       <para>
-       Types of extended statistics enabled for this record
+       Types of extended statistics object enabled for this record
       </para></entry>
      </row>
 
@@ -13032,6 +13080,237 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
 
  </sect1>
 
+ <sect1 id="view-pg-stats-ext-exprs">
+  <title><structname>pg_stats_ext_exprs</structname></title>
+
+  <indexterm zone="view-pg-stats-ext-exprs">
+   <primary>pg_stats_ext_exprs</primary>
+  </indexterm>
+
+  <para>
+   The view <structname>pg_stats_ext_exprs</structname> provides access to
+   information about all expressions included in extended statistics objects,
+   combining information stored in the <link
+   linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>
+   and <link linkend="catalog-pg-statistic-ext-data"><structname>pg_statistic_ext_data</structname></link>
+   catalogs.  This view allows access only to rows of
+   <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link> and <link linkend="catalog-pg-statistic-ext-data"><structname>pg_statistic_ext_data</structname></link>
+   that correspond to tables the user has permission to read, and therefore
+   it is safe to allow public read access to this view.
+  </para>
+
+  <para>
+   <structname>pg_stats_ext_exprs</structname> is also designed to present
+   the information in a more readable format than the underlying catalogs
+   &mdash; at the cost that its schema must be extended whenever the structure
+   of statistics in <link linkend="catalog-pg-statistic"><structname>pg_statistic</structname></link> changes.
+  </para>
+
+  <table>
+   <title><structname>pg_stats_ext_exprs</structname> Columns</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>schemaname</structfield> <type>name</type>
+       (references <link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.<structfield>nspname</structfield>)
+      </para>
+      <para>
+       Name of schema containing table
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>tablename</structfield> <type>name</type>
+       (references <link linkend="catalog-pg-class"><structname>pg_class</structname></link>.<structfield>relname</structfield>)
+      </para>
+      <para>
+       Name of table the statistics object is defined on
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>statistics_schemaname</structfield> <type>name</type>
+       (references <link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.<structfield>nspname</structfield>)
+      </para>
+      <para>
+       Name of schema containing extended statistics object
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>statistics_name</structfield> <type>name</type>
+       (references <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>.<structfield>stxname</structfield>)
+      </para>
+      <para>
+       Name of extended statistics object
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>statistics_owner</structfield> <type>name</type>
+       (references <link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.<structfield>rolname</structfield>)
+      </para>
+      <para>
+       Owner of the extended statistics object
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>expr</structfield> <type>text</type>
+      </para>
+      <para>
+       Expression included in the extended statistics object
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>null_frac</structfield> <type>float4</type>
+      </para>
+      <para>
+       Fraction of expression entries that are null
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>avg_width</structfield> <type>int4</type>
+      </para>
+      <para>
+       Average width in bytes of expression's entries
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>n_distinct</structfield> <type>float4</type>
+      </para>
+      <para>
+       If greater than zero, the estimated number of distinct values in the
+       expression.  If less than zero, the negative of the number of distinct
+       values divided by the number of rows.  (The negated form is used when
+       <command>ANALYZE</command> believes that the number of distinct values is
+       likely to increase as the table grows; the positive form is used when
+       the expression seems to have a fixed number of possible values.)  For
+       example, -1 indicates a unique expression in which the number of distinct
+       values is the same as the number of rows.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>most_common_vals</structfield> <type>anyarray</type>
+      </para>
+      <para>
+       A list of the most common values in the expression. (Null if
+       no values seem to be more common than any others.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>most_common_freqs</structfield> <type>float4[]</type>
+      </para>
+      <para>
+       A list of the frequencies of the most common values,
+       i.e., number of occurrences of each divided by total number of rows.
+       (Null when <structfield>most_common_vals</structfield> is.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>histogram_bounds</structfield> <type>anyarray</type>
+      </para>
+      <para>
+       A list of values that divide the expression's values into groups of
+       approximately equal population.  The values in
+       <structfield>most_common_vals</structfield>, if present, are omitted from this
+       histogram calculation.  (This column is null if the expression data type
+       does not have a <literal>&lt;</literal> operator or if the
+       <structfield>most_common_vals</structfield> list accounts for the entire
+       population.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>correlation</structfield> <type>float4</type>
+      </para>
+      <para>
+       Statistical correlation between physical row ordering and
+       logical ordering of the expression values.  This ranges from -1 to +1.
+       When the value is near -1 or +1, an index scan on the expression will
+       be estimated to be cheaper than when it is near zero, due to reduction
+       of random access to the disk.  (This column is null if the expression's
+       data type does not have a <literal>&lt;</literal> operator.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>most_common_elems</structfield> <type>anyarray</type>
+      </para>
+      <para>
+       A list of non-null element values most often appearing within values of
+       the expression. (Null for scalar types.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>most_common_elem_freqs</structfield> <type>float4[]</type>
+      </para>
+      <para>
+       A list of the frequencies of the most common element values, i.e., the
+       fraction of rows containing at least one instance of the given value.
+       Two or three additional values follow the per-element frequencies;
+       these are the minimum and maximum of the preceding per-element
+       frequencies, and optionally the frequency of null elements.
+       (Null when <structfield>most_common_elems</structfield> is.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>elem_count_histogram</structfield> <type>float4[]</type>
+      </para>
+      <para>
+       A histogram of the counts of distinct non-null element values within the
+       values of the expression, followed by the average number of distinct
+       non-null elements.  (Null for scalar types.)
+      </para></entry>
+     </row>
+    </tbody>
+   </tgroup>
+  </table>
+
+  <para>
+   The maximum number of entries in the array fields can be controlled on a
+   column-by-column basis using the <link linkend="sql-altertable"><command>ALTER
+   TABLE SET STATISTICS</command></link> command, or globally by setting the
+   <xref linkend="guc-default-statistics-target"/> run-time parameter.
+  </para>
+
+ </sect1>
+
  <sect1 id="view-pg-tables">
   <title><structname>pg_tables</structname></title>
 
index 4363be50c3c4d5e42bb3a0d6902efbee19690372..988f4c573ff5e86c6248c0e6bb8458cfd0d65e6f 100644 (file)
@@ -21,9 +21,13 @@ PostgreSQL documentation
 
  <refsynopsisdiv>
 <synopsis>
+CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_name</replaceable>
+    ON ( <replaceable class="parameter">expression</replaceable> )
+    FROM <replaceable class="parameter">table_name</replaceable>
+
 CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_name</replaceable>
     [ ( <replaceable class="parameter">statistics_kind</replaceable> [, ... ] ) ]
-    ON <replaceable class="parameter">column_name</replaceable>, <replaceable class="parameter">column_name</replaceable> [, ...]
+    ON { <replaceable class="parameter">column_name</replaceable> | ( <replaceable class="parameter">expression</replaceable> ) }, { <replaceable class="parameter">column_name</replaceable> | ( <replaceable class="parameter">expression</replaceable> ) } [, ...]
     FROM <replaceable class="parameter">table_name</replaceable>
 </synopsis>
 
@@ -39,6 +43,19 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
    database and will be owned by the user issuing the command.
   </para>
 
+  <para>
+   The <command>CREATE STATISTICS</command> command has two basic forms. The
+   first form allows univariate statistics for a single expression to be
+   collected, providing benefits similar to an expression index without the
+   overhead of index maintenance.  This form does not allow the statistics
+   kind to be specified, since the various statistics kinds refer only to
+   multivariate statistics.  The second form of the command allows
+   multivariate statistics on multiple columns and/or expressions to be
+   collected, optionally specifying which statistics kinds to include.  This
+   form will also automatically cause univariate statistics to be collected on
+   any expressions included in the list.
+  </para>
+
   <para>
    If a schema name is given (for example, <literal>CREATE STATISTICS
    myschema.mystat ...</literal>) then the statistics object is created in the
@@ -79,14 +96,16 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
     <term><replaceable class="parameter">statistics_kind</replaceable></term>
     <listitem>
      <para>
-      A statistics kind to be computed in this statistics object.
+      A multivariate statistics kind to be computed in this statistics object.
       Currently supported kinds are
       <literal>ndistinct</literal>, which enables n-distinct statistics,
       <literal>dependencies</literal>, which enables functional
       dependency statistics, and <literal>mcv</literal> which enables
       most-common values lists.
       If this clause is omitted, all supported statistics kinds are
-      included in the statistics object.
+      included in the statistics object. Univariate expression statistics are
+      built automatically if the statistics definition includes any complex
+      expressions rather than just simple column references.
       For more information, see <xref linkend="planner-stats-extended"/>
       and <xref linkend="multivariate-statistics-examples"/>.
      </para>
@@ -98,8 +117,22 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
     <listitem>
      <para>
       The name of a table column to be covered by the computed statistics.
-      At least two column names must be given;  the order of the column names
-      is insignificant.
+      This is only allowed when building multivariate statistics.  At least
+      two column names or expressions must be specified, and their order is
+      not significant.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term><replaceable class="parameter">expression</replaceable></term>
+    <listitem>
+     <para>
+      An expression to be covered by the computed statistics.  This may be
+      used to build univariate statistics on a single expression, or as part
+      of a list of multiple column names and/or expressions to build
+      multivariate statistics.  In the latter case, separate univariate
+      statistics are built automatically for each expression in the list.
      </para>
     </listitem>
    </varlistentry>
@@ -125,6 +158,13 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
    reading it.  Once created, however, the ownership of the statistics
    object is independent of the underlying table(s).
   </para>
+
+  <para>
+   Expression statistics are per-expression and are similar to creating an
+   index on the expression, except that they avoid the overhead of index
+   maintenance. Expression statistics are built automatically for each
+   expression in the statistics object definition.
+  </para>
  </refsect1>
 
  <refsect1 id="sql-createstatistics-examples">
@@ -196,6 +236,72 @@ EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 2);
    in the table, allowing it to generate better estimates in both cases.
   </para>
 
+  <para>
+   Create table <structname>t3</structname> with a single timestamp column,
+   and run queries using expressions on that column.  Without extended
+   statistics, the planner has no information about the data distribution for
+   the expressions, and uses default estimates.  The planner also does not
+   realize that the value of the date truncated to the month is fully
+   determined by the value of the date truncated to the day. Then expression
+   and ndistinct statistics are built on those two expressions:
+
+<programlisting>
+CREATE TABLE t3 (
+    a   timestamp
+);
+
+INSERT INTO t3 SELECT i FROM generate_series('2020-01-01'::timestamp,
+                                             '2020-12-31'::timestamp,
+                                             '1 minute'::interval) s(i);
+
+ANALYZE t3;
+
+-- the number of matching rows will be drastically underestimated:
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
+
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
+                                 AND '2020-06-30'::timestamp;
+
+EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
+   FROM t3 GROUP BY 1, 2;
+
+-- build ndistinct statistics on the pair of expressions (per-expression
+-- statistics are built automatically)
+CREATE STATISTICS s3 (ndistinct) ON date_trunc('month', a), date_trunc('day', a) FROM t3;
+
+ANALYZE t3;
+
+-- now the row count estimates are more accurate:
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
+
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
+                                 AND '2020-06-30'::timestamp;
+
+EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
+   FROM t3 GROUP BY 1, 2;
+</programlisting>
+
+   Without expression and ndistinct statistics, the planner has no information
+   about the number of distinct values for the expressions, and has to rely
+   on default estimates. The equality and range conditions are assumed to have
+   0.5% selectivity, and the number of distinct values in the expression is
+   assumed to be the same as for the column (i.e. unique). This results in a
+   significant underestimate of the row count in the first two queries. Moreover,
+   the planner has no information about the relationship between the expressions,
+   so it assumes the two <literal>WHERE</literal> and <literal>GROUP BY</literal>
+   conditions are independent, and multiplies their selectivities together to
+   arrive at a severe overestimate of the group count in the aggregate query.
+   This is further exacerbated by the lack of accurate statistics for the
+   expressions, forcing the planner to use a default ndistinct estimate for the
+   expression derived from ndistinct for the column. With such statistics, the
+   planner recognizes that the conditions are correlated, and arrives at much
+   more accurate estimates.
+  </para>
+
  </refsect1>
 
  <refsect1>
index 70bc2123df7bf23111cf0ca289acdb8812910a9f..e36a9602c129abbe1dd1b0b106a1cb684165ace9 100644 (file)
@@ -49,15 +49,15 @@ include $(top_srcdir)/src/backend/common.mk
 
 # Note: the order of this list determines the order in which the catalog
 # header files are assembled into postgres.bki.  BKI_BOOTSTRAP catalogs
-# must appear first, and there are reputedly other, undocumented ordering
-# dependencies.
+# must appear first, and pg_statistic before pg_statistic_ext_data, and
+# there are reputedly other, undocumented ordering dependencies.
 CATALOG_HEADERS := \
        pg_proc.h pg_type.h pg_attribute.h pg_class.h \
        pg_attrdef.h pg_constraint.h pg_inherits.h pg_index.h pg_operator.h \
        pg_opfamily.h pg_opclass.h pg_am.h pg_amop.h pg_amproc.h \
        pg_language.h pg_largeobject_metadata.h pg_largeobject.h pg_aggregate.h \
-       pg_statistic_ext.h pg_statistic_ext_data.h \
-       pg_statistic.h pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \
+       pg_statistic.h pg_statistic_ext.h pg_statistic_ext_data.h \
+       pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \
        pg_cast.h pg_enum.h pg_namespace.h pg_conversion.h pg_depend.h \
        pg_database.h pg_db_role_setting.h pg_tablespace.h \
        pg_authid.h pg_auth_members.h pg_shdepend.h pg_shdescription.h \
index 0dca65dc7bb9fe920d60f84be73741bc6680662c..6483563204cfbd4242a492c2fdefd117205dbde9 100644 (file)
@@ -264,6 +264,7 @@ CREATE VIEW pg_stats_ext WITH (security_barrier) AS
                   JOIN pg_attribute a
                        ON (a.attrelid = s.stxrelid AND a.attnum = k)
            ) AS attnames,
+           pg_get_statisticsobjdef_expressions(s.oid) as exprs,
            s.stxkind AS kinds,
            sd.stxdndistinct AS n_distinct,
            sd.stxddependencies AS dependencies,
@@ -290,6 +291,74 @@ CREATE VIEW pg_stats_ext WITH (security_barrier) AS
                 WHERE NOT has_column_privilege(c.oid, a.attnum, 'select') )
     AND (c.relrowsecurity = false OR NOT row_security_active(c.oid));
 
+CREATE VIEW pg_stats_ext_exprs WITH (security_barrier) AS
+    SELECT cn.nspname AS schemaname,
+           c.relname AS tablename,
+           sn.nspname AS statistics_schemaname,
+           s.stxname AS statistics_name,
+           pg_get_userbyid(s.stxowner) AS statistics_owner,
+           stat.expr,
+           (stat.a).stanullfrac AS null_frac,
+           (stat.a).stawidth AS avg_width,
+           (stat.a).stadistinct AS n_distinct,
+           (CASE
+               WHEN (stat.a).stakind1 = 1 THEN (stat.a).stavalues1
+               WHEN (stat.a).stakind2 = 1 THEN (stat.a).stavalues2
+               WHEN (stat.a).stakind3 = 1 THEN (stat.a).stavalues3
+               WHEN (stat.a).stakind4 = 1 THEN (stat.a).stavalues4
+               WHEN (stat.a).stakind5 = 1 THEN (stat.a).stavalues5
+           END) AS most_common_vals,
+           (CASE
+               WHEN (stat.a).stakind1 = 1 THEN (stat.a).stanumbers1
+               WHEN (stat.a).stakind2 = 1 THEN (stat.a).stanumbers2
+               WHEN (stat.a).stakind3 = 1 THEN (stat.a).stanumbers3
+               WHEN (stat.a).stakind4 = 1 THEN (stat.a).stanumbers4
+               WHEN (stat.a).stakind5 = 1 THEN (stat.a).stanumbers5
+           END) AS most_common_freqs,
+           (CASE
+               WHEN (stat.a).stakind1 = 2 THEN (stat.a).stavalues1
+               WHEN (stat.a).stakind2 = 2 THEN (stat.a).stavalues2
+               WHEN (stat.a).stakind3 = 2 THEN (stat.a).stavalues3
+               WHEN (stat.a).stakind4 = 2 THEN (stat.a).stavalues4
+               WHEN (stat.a).stakind5 = 2 THEN (stat.a).stavalues5
+           END) AS histogram_bounds,
+           (CASE
+               WHEN (stat.a).stakind1 = 3 THEN (stat.a).stanumbers1[1]
+               WHEN (stat.a).stakind2 = 3 THEN (stat.a).stanumbers2[1]
+               WHEN (stat.a).stakind3 = 3 THEN (stat.a).stanumbers3[1]
+               WHEN (stat.a).stakind4 = 3 THEN (stat.a).stanumbers4[1]
+               WHEN (stat.a).stakind5 = 3 THEN (stat.a).stanumbers5[1]
+           END) correlation,
+           (CASE
+               WHEN (stat.a).stakind1 = 4 THEN (stat.a).stavalues1
+               WHEN (stat.a).stakind2 = 4 THEN (stat.a).stavalues2
+               WHEN (stat.a).stakind3 = 4 THEN (stat.a).stavalues3
+               WHEN (stat.a).stakind4 = 4 THEN (stat.a).stavalues4
+               WHEN (stat.a).stakind5 = 4 THEN (stat.a).stavalues5
+           END) AS most_common_elems,
+           (CASE
+               WHEN (stat.a).stakind1 = 4 THEN (stat.a).stanumbers1
+               WHEN (stat.a).stakind2 = 4 THEN (stat.a).stanumbers2
+               WHEN (stat.a).stakind3 = 4 THEN (stat.a).stanumbers3
+               WHEN (stat.a).stakind4 = 4 THEN (stat.a).stanumbers4
+               WHEN (stat.a).stakind5 = 4 THEN (stat.a).stanumbers5
+           END) AS most_common_elem_freqs,
+           (CASE
+               WHEN (stat.a).stakind1 = 5 THEN (stat.a).stanumbers1
+               WHEN (stat.a).stakind2 = 5 THEN (stat.a).stanumbers2
+               WHEN (stat.a).stakind3 = 5 THEN (stat.a).stanumbers3
+               WHEN (stat.a).stakind4 = 5 THEN (stat.a).stanumbers4
+               WHEN (stat.a).stakind5 = 5 THEN (stat.a).stanumbers5
+           END) AS elem_count_histogram
+    FROM pg_statistic_ext s JOIN pg_class c ON (c.oid = s.stxrelid)
+         LEFT JOIN pg_statistic_ext_data sd ON (s.oid = sd.stxoid)
+         LEFT JOIN pg_namespace cn ON (cn.oid = c.relnamespace)
+         LEFT JOIN pg_namespace sn ON (sn.oid = s.stxnamespace)
+         JOIN LATERAL (
+             SELECT unnest(pg_get_statisticsobjdef_expressions(s.oid)) AS expr,
+                    unnest(sd.stxdexpr)::pg_statistic AS a
+         ) stat ON (stat.expr IS NOT NULL);
+
 -- unprivileged users may read pg_statistic_ext but not pg_statistic_ext_data
 REVOKE ALL on pg_statistic_ext_data FROM public;
 
index 2bae205845992abbc87c4a7cddc4994c35ccb189..df4768952d5b8bd12c02fc25b7f1c018b110daa9 100644 (file)
@@ -29,6 +29,8 @@
 #include "commands/comment.h"
 #include "commands/defrem.h"
 #include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/optimizer.h"
 #include "statistics/statistics.h"
 #include "utils/builtins.h"
 #include "utils/fmgroids.h"
@@ -62,7 +64,8 @@ ObjectAddress
 CreateStatistics(CreateStatsStmt *stmt)
 {
        int16           attnums[STATS_MAX_DIMENSIONS];
-       int                     numcols = 0;
+       int                     nattnums = 0;
+       int                     numcols;
        char       *namestr;
        NameData        stxname;
        Oid                     statoid;
@@ -74,21 +77,25 @@ CreateStatistics(CreateStatsStmt *stmt)
        Datum           datavalues[Natts_pg_statistic_ext_data];
        bool            datanulls[Natts_pg_statistic_ext_data];
        int2vector *stxkeys;
+       List       *stxexprs = NIL;
+       Datum           exprsDatum;
        Relation        statrel;
        Relation        datarel;
        Relation        rel = NULL;
        Oid                     relid;
        ObjectAddress parentobject,
                                myself;
-       Datum           types[3];               /* one for each possible type of statistic */
+       Datum           types[4];               /* one for each possible type of statistic */
        int                     ntypes;
        ArrayType  *stxkind;
        bool            build_ndistinct;
        bool            build_dependencies;
        bool            build_mcv;
+       bool            build_expressions;
        bool            requested_type = false;
        int                     i;
        ListCell   *cell;
+       ListCell   *cell2;
 
        Assert(IsA(stmt, CreateStatsStmt));
 
@@ -190,101 +197,124 @@ CreateStatistics(CreateStatsStmt *stmt)
        }
 
        /*
-        * Currently, we only allow simple column references in the expression
-        * list.  That will change someday, and again the grammar already supports
-        * it so we have to enforce restrictions here.  For now, we can convert
-        * the expression list to a simple array of attnums.  While at it, enforce
-        * some constraints.
+        * Make sure no more than STATS_MAX_DIMENSIONS columns are used. There
+        * might be duplicates and so on, but we'll deal with those later.
+        */
+       numcols = list_length(stmt->exprs);
+       if (numcols > STATS_MAX_DIMENSIONS)
+               ereport(ERROR,
+                               (errcode(ERRCODE_TOO_MANY_COLUMNS),
+                                errmsg("cannot have more than %d columns in statistics",
+                                               STATS_MAX_DIMENSIONS)));
+
+       /*
+        * Convert the expression list to a simple array of attnums, but also keep
+        * a list of more complex expressions.  While at it, enforce some
+        * constraints.
+        *
+        * XXX We do only the bare minimum to tell simple column references from
+        * complex expressions - for example "(a)" will be treated as a complex
+        * expression. No matter how elaborate the check is, there'll always be a
+        * way around it, if the user is determined (consider e.g. "(a+0)"), so
+        * it's not worth protecting against it.
         */
        foreach(cell, stmt->exprs)
        {
                Node       *expr = (Node *) lfirst(cell);
-               ColumnRef  *cref;
-               char       *attname;
+               StatsElem  *selem;
                HeapTuple       atttuple;
                Form_pg_attribute attForm;
                TypeCacheEntry *type;
 
-               if (!IsA(expr, ColumnRef))
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                        errmsg("only simple column references are allowed in CREATE STATISTICS")));
-               cref = (ColumnRef *) expr;
-
-               if (list_length(cref->fields) != 1)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                        errmsg("only simple column references are allowed in CREATE STATISTICS")));
-               attname = strVal((Value *) linitial(cref->fields));
-
-               atttuple = SearchSysCacheAttName(relid, attname);
-               if (!HeapTupleIsValid(atttuple))
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_UNDEFINED_COLUMN),
-                                        errmsg("column \"%s\" does not exist",
-                                                       attname)));
-               attForm = (Form_pg_attribute) GETSTRUCT(atttuple);
-
-               /* Disallow use of system attributes in extended stats */
-               if (attForm->attnum <= 0)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                        errmsg("statistics creation on system columns is not supported")));
-
-               /* Disallow data types without a less-than operator */
-               type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR);
-               if (type->lt_opr == InvalidOid)
+               /*
+                * We should not get anything else than StatsElem, given the grammar.
+                * But let's keep it as a safety.
+                */
+               if (!IsA(expr, StatsElem))
                        ereport(ERROR,
                                        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                        errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
-                                                       attname, format_type_be(attForm->atttypid))));
+                                        errmsg("only simple column references and expressions are allowed in CREATE STATISTICS")));
 
-               /* Make sure no more than STATS_MAX_DIMENSIONS columns are used */
-               if (numcols >= STATS_MAX_DIMENSIONS)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_TOO_MANY_COLUMNS),
-                                        errmsg("cannot have more than %d columns in statistics",
-                                                       STATS_MAX_DIMENSIONS)));
+               selem = (StatsElem *) expr;
 
-               attnums[numcols] = attForm->attnum;
-               numcols++;
-               ReleaseSysCache(atttuple);
+               if (selem->name)                /* column reference */
+               {
+                       char       *attname;
+
+                       attname = selem->name;
+
+                       atttuple = SearchSysCacheAttName(relid, attname);
+                       if (!HeapTupleIsValid(atttuple))
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_UNDEFINED_COLUMN),
+                                                errmsg("column \"%s\" does not exist",
+                                                               attname)));
+                       attForm = (Form_pg_attribute) GETSTRUCT(atttuple);
+
+                       /* Disallow use of system attributes in extended stats */
+                       if (attForm->attnum <= 0)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                                errmsg("statistics creation on system columns is not supported")));
+
+                       /* Disallow data types without a less-than operator */
+                       type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR);
+                       if (type->lt_opr == InvalidOid)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                                errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
+                                                               attname, format_type_be(attForm->atttypid))));
+
+                       attnums[nattnums] = attForm->attnum;
+                       nattnums++;
+                       ReleaseSysCache(atttuple);
+               }
+               else                                    /* expression */
+               {
+                       Node       *expr = selem->expr;
+                       Oid                     atttype;
+
+                       Assert(expr != NULL);
+
+                       /*
+                        * Disallow data types without a less-than operator.
+                        *
+                        * We ignore this for statistics on a single expression, in which
+                        * case we'll build the regular statistics only (and that code can
+                        * deal with such data types).
+                        */
+                       if (list_length(stmt->exprs) > 1)
+                       {
+                               atttype = exprType(expr);
+                               type = lookup_type_cache(atttype, TYPECACHE_LT_OPR);
+                               if (type->lt_opr == InvalidOid)
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                                        errmsg("expression cannot be used in multivariate statistics because its type %s has no default btree operator class",
+                                                                       format_type_be(atttype))));
+                       }
+
+                       stxexprs = lappend(stxexprs, expr);
+               }
        }
 
        /*
-        * Check that at least two columns were specified in the statement. The
-        * upper bound was already checked in the loop above.
-        */
-       if (numcols < 2)
-               ereport(ERROR,
-                               (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
-                                errmsg("extended statistics require at least 2 columns")));
-
-       /*
-        * Sort the attnums, which makes detecting duplicates somewhat easier, and
-        * it does not hurt (it does not affect the efficiency, unlike for
-        * indexes, for example).
-        */
-       qsort(attnums, numcols, sizeof(int16), compare_int16);
-
-       /*
-        * Check for duplicates in the list of columns. The attnums are sorted so
-        * just check consecutive elements.
+        * Parse the statistics kinds.
+        *
+        * First, if the statistics object is defined on a single expression,
+        * check that no statistics kinds were specified (we don't allow that
+        * for the simple CREATE STATISTICS form).
         */
-       for (i = 1; i < numcols; i++)
+       if ((list_length(stmt->exprs) == 1) && (list_length(stxexprs) == 1))
        {
-               if (attnums[i] == attnums[i - 1])
+               /* statistics kinds must not be specified */
+               if (list_length(stmt->stat_types) > 0)
                        ereport(ERROR,
-                                       (errcode(ERRCODE_DUPLICATE_COLUMN),
-                                        errmsg("duplicate column name in statistics definition")));
+                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                        errmsg("when building statistics on a single expression, statistics kinds may not be specified")));
        }
 
-       /* Form an int2vector representation of the sorted column list */
-       stxkeys = buildint2vector(attnums, numcols);
-
-       /*
-        * Parse the statistics kinds.
-        */
+       /* OK, let's check that we recognize the statistics kinds. */
        build_ndistinct = false;
        build_dependencies = false;
        build_mcv = false;
@@ -313,14 +343,91 @@ CreateStatistics(CreateStatsStmt *stmt)
                                         errmsg("unrecognized statistics kind \"%s\"",
                                                        type)));
        }
-       /* If no statistic type was specified, build them all. */
-       if (!requested_type)
+
+       /*
+        * If no statistic type was specified, build them all (but only when the
+        * statistics is defined on more than one column/expression).
+        */
+       if ((!requested_type) && (numcols >= 2))
        {
                build_ndistinct = true;
                build_dependencies = true;
                build_mcv = true;
        }
 
+       /*
+        * When there are non-trivial expressions, build the expression stats
+        * automatically. This allows calculating good estimates for stats that
+        * consider per-clause estimates (e.g. functional dependencies).
+        */
+       build_expressions = (list_length(stxexprs) > 0);
+
+       /*
+        * Check that at least two columns were specified in the statement, or
+        * that we're building statistics on a single expression.
+        */
+       if ((numcols < 2) && (list_length(stxexprs) != 1))
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                                errmsg("extended statistics require at least 2 columns")));
+
+       /*
+        * Sort the attnums, which makes detecting duplicates somewhat easier, and
+        * it does not hurt (it does not matter for the contents, unlike for
+        * indexes, for example).
+        */
+       qsort(attnums, nattnums, sizeof(int16), compare_int16);
+
+       /*
+        * Check for duplicates in the list of columns. The attnums are sorted so
+        * just check consecutive elements.
+        */
+       for (i = 1; i < nattnums; i++)
+       {
+               if (attnums[i] == attnums[i - 1])
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_DUPLICATE_COLUMN),
+                                        errmsg("duplicate column name in statistics definition")));
+       }
+
+       /*
+        * Check for duplicate expressions. We do two loops, counting the
+        * occurrences of each expression. This is O(N^2) but we only allow a
+        * small number of expressions and it's not executed often.
+        *
+        * XXX We don't cross-check attributes and expressions, because it does
+        * not seem worth it. In principle we could check that expressions don't
+        * contain trivial attribute references like "(a)", but the reasoning is
+        * similar to why we don't bother with extracting columns from
+        * expressions. It's either expensive or very easy to defeat for a
+        * determined user, and there's no risk if we allow such statistics (the
+        * statistics object is useless, but harmless).
+        */
+       foreach(cell, stxexprs)
+       {
+               Node       *expr1 = (Node *) lfirst(cell);
+               int                     cnt = 0;
+
+               foreach(cell2, stxexprs)
+               {
+                       Node       *expr2 = (Node *) lfirst(cell2);
+
+                       if (equal(expr1, expr2))
+                               cnt += 1;
+               }
+
+               /* every expression should find at least itself */
+               Assert(cnt >= 1);
+
+               if (cnt > 1)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_DUPLICATE_COLUMN),
+                                        errmsg("duplicate expression in statistics definition")));
+       }
+
+       /* Form an int2vector representation of the sorted column list */
+       stxkeys = buildint2vector(attnums, nattnums);
+
        /* construct the char array of enabled statistic types */
        ntypes = 0;
        if (build_ndistinct)
@@ -329,9 +436,23 @@ CreateStatistics(CreateStatsStmt *stmt)
                types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES);
        if (build_mcv)
                types[ntypes++] = CharGetDatum(STATS_EXT_MCV);
+       if (build_expressions)
+               types[ntypes++] = CharGetDatum(STATS_EXT_EXPRESSIONS);
        Assert(ntypes > 0 && ntypes <= lengthof(types));
        stxkind = construct_array(types, ntypes, CHAROID, 1, true, TYPALIGN_CHAR);
 
+       /* convert the expressions (if any) to a text datum */
+       if (stxexprs != NIL)
+       {
+               char       *exprsString;
+
+               exprsString = nodeToString(stxexprs);
+               exprsDatum = CStringGetTextDatum(exprsString);
+               pfree(exprsString);
+       }
+       else
+               exprsDatum = (Datum) 0;
+
        statrel = table_open(StatisticExtRelationId, RowExclusiveLock);
 
        /*
@@ -351,6 +472,10 @@ CreateStatistics(CreateStatsStmt *stmt)
        values[Anum_pg_statistic_ext_stxkeys - 1] = PointerGetDatum(stxkeys);
        values[Anum_pg_statistic_ext_stxkind - 1] = PointerGetDatum(stxkind);
 
+       values[Anum_pg_statistic_ext_stxexprs - 1] = exprsDatum;
+       if (exprsDatum == (Datum) 0)
+               nulls[Anum_pg_statistic_ext_stxexprs - 1] = true;
+
        /* insert it into pg_statistic_ext */
        htup = heap_form_tuple(statrel->rd_att, values, nulls);
        CatalogTupleInsert(statrel, htup);
@@ -373,6 +498,7 @@ CreateStatistics(CreateStatsStmt *stmt)
        datanulls[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true;
        datanulls[Anum_pg_statistic_ext_data_stxddependencies - 1] = true;
        datanulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
+       datanulls[Anum_pg_statistic_ext_data_stxdexpr - 1] = true;
 
        /* insert it into pg_statistic_ext_data */
        htup = heap_form_tuple(datarel->rd_att, datavalues, datanulls);
@@ -396,12 +522,41 @@ CreateStatistics(CreateStatsStmt *stmt)
         */
        ObjectAddressSet(myself, StatisticExtRelationId, statoid);
 
-       for (i = 0; i < numcols; i++)
+       /* add dependencies for plain column references */
+       for (i = 0; i < nattnums; i++)
        {
                ObjectAddressSubSet(parentobject, RelationRelationId, relid, attnums[i]);
                recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO);
        }
 
+       /*
+        * If there are no dependencies on a column, give the statistics an auto
+        * dependency on the whole table.  In most cases, this will be redundant,
+        * but it might not be if the statistics expressions contain no Vars
+        * (which might seem strange, but is possible). This is consistent with what
+        * we do for indexes in index_create.
+        *
+        * XXX We intentionally don't consider the expressions before adding this
+        * dependency, because recordDependencyOnSingleRelExpr may not create any
+        * dependencies for whole-row Vars.
+        */
+       if (!nattnums)
+       {
+               ObjectAddressSet(parentobject, RelationRelationId, relid);
+               recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO);
+       }
+
+       /*
+        * Store dependencies on anything mentioned in statistics expressions,
+        * just like we do for index expressions.
+        */
+       if (stxexprs)
+               recordDependencyOnSingleRelExpr(&myself,
+                                                                               (Node *) stxexprs,
+                                                                               relid,
+                                                                               DEPENDENCY_NORMAL,
+                                                                               DEPENDENCY_AUTO, false, true);
+
        /*
         * Also add dependencies on namespace and owner.  These are required
         * because the stats object might have a different namespace and/or owner
@@ -582,87 +737,6 @@ RemoveStatisticsById(Oid statsOid)
        table_close(relation, RowExclusiveLock);
 }
 
-/*
- * Update a statistics object for ALTER COLUMN TYPE on a source column.
- *
- * This could throw an error if the type change can't be supported.
- * If it can be supported, but the stats must be recomputed, a likely choice
- * would be to set the relevant column(s) of the pg_statistic_ext_data tuple
- * to null until the next ANALYZE.  (Note that the type change hasn't actually
- * happened yet, so one option that's *not* on the table is to recompute
- * immediately.)
- *
- * For both ndistinct and functional-dependencies stats, the on-disk
- * representation is independent of the source column data types, and it is
- * plausible to assume that the old statistic values will still be good for
- * the new column contents.  (Obviously, if the ALTER COLUMN TYPE has a USING
- * expression that substantially alters the semantic meaning of the column
- * values, this assumption could fail.  But that seems like a corner case
- * that doesn't justify zapping the stats in common cases.)
- *
- * For MCV lists that's not the case, as those statistics store the datums
- * internally. In this case we simply reset the statistics value to NULL.
- *
- * Note that "type change" includes collation change, which means we can rely
- * on the MCV list being consistent with the collation info in pg_attribute
- * during estimation.
- */
-void
-UpdateStatisticsForTypeChange(Oid statsOid, Oid relationOid, int attnum,
-                                                         Oid oldColumnType, Oid newColumnType)
-{
-       HeapTuple       stup,
-                               oldtup;
-
-       Relation        rel;
-
-       Datum           values[Natts_pg_statistic_ext_data];
-       bool            nulls[Natts_pg_statistic_ext_data];
-       bool            replaces[Natts_pg_statistic_ext_data];
-
-       oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statsOid));
-       if (!HeapTupleIsValid(oldtup))
-               elog(ERROR, "cache lookup failed for statistics object %u", statsOid);
-
-       /*
-        * When none of the defined statistics types contain datum values from the
-        * table's columns then there's no need to reset the stats. Functional
-        * dependencies and ndistinct stats should still hold true.
-        */
-       if (!statext_is_kind_built(oldtup, STATS_EXT_MCV))
-       {
-               ReleaseSysCache(oldtup);
-               return;
-       }
-
-       /*
-        * OK, we need to reset some statistics. So let's build the new tuple,
-        * replacing the affected statistics types with NULL.
-        */
-       memset(nulls, 0, Natts_pg_statistic_ext_data * sizeof(bool));
-       memset(replaces, 0, Natts_pg_statistic_ext_data * sizeof(bool));
-       memset(values, 0, Natts_pg_statistic_ext_data * sizeof(Datum));
-
-       replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
-       nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
-
-       rel = table_open(StatisticExtDataRelationId, RowExclusiveLock);
-
-       /* replace the old tuple */
-       stup = heap_modify_tuple(oldtup,
-                                                        RelationGetDescr(rel),
-                                                        values,
-                                                        nulls,
-                                                        replaces);
-
-       ReleaseSysCache(oldtup);
-       CatalogTupleUpdate(rel, &stup->t_self, stup);
-
-       heap_freetuple(stup);
-
-       table_close(rel, RowExclusiveLock);
-}
-
 /*
  * Select a nonconflicting name for a new statistics.
  *
@@ -731,18 +805,27 @@ ChooseExtendedStatisticNameAddition(List *exprs)
        buf[0] = '\0';
        foreach(lc, exprs)
        {
-               ColumnRef  *cref = (ColumnRef *) lfirst(lc);
+               StatsElem  *selem = (StatsElem *) lfirst(lc);
                const char *name;
 
                /* It should be one of these, but just skip if it happens not to be */
-               if (!IsA(cref, ColumnRef))
+               if (!IsA(selem, StatsElem))
                        continue;
 
-               name = strVal((Value *) linitial(cref->fields));
+               name = selem->name;
 
                if (buflen > 0)
                        buf[buflen++] = '_';    /* insert _ between names */
 
+               /*
+                * We use fixed 'expr' for expressions, which have empty column names.
+                * For indexes this is handled in ChooseIndexColumnNames, but we have
+                * no such function for stats and it does not seem worth adding. If a
+                * better name is needed, the user can specify it explicitly.
+                */
+               if (!name)
+                       name = "expr";
+
                /*
                 * At this point we have buflen <= NAMEDATALEN.  name should be less
                 * than NAMEDATALEN already, but use strlcpy for paranoia.
@@ -754,3 +837,29 @@ ChooseExtendedStatisticNameAddition(List *exprs)
        }
        return pstrdup(buf);
 }
+
+/*
+ * StatisticsGetRelation: given a statistics object's OID, get the OID of
+ * the relation it is defined on.  Uses the system cache.
+ */
+Oid
+StatisticsGetRelation(Oid statId, bool missing_ok)
+{
+       HeapTuple       tuple;
+       Form_pg_statistic_ext stx;
+       Oid                     result;
+
+       tuple = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statId));
+       if (!HeapTupleIsValid(tuple))
+       {
+               if (missing_ok)
+                       return InvalidOid;      /* no such object, and caller allows that */
+               elog(ERROR, "cache lookup failed for statistics object %u", statId);
+       }
+       stx = (Form_pg_statistic_ext) GETSTRUCT(tuple);
+       Assert(stx->oid == statId);
+
+       result = stx->stxrelid;         /* copy out before releasing the cache entry */
+       ReleaseSysCache(tuple);
+       return result;
+}
index efac06f72c7465603a684813cd66c618cf5c4336..88a68a4697ad1d26a5430c9b0753b549c2c10abc 100644 (file)
@@ -41,6 +41,7 @@
 #include "catalog/pg_namespace.h"
 #include "catalog/pg_opclass.h"
 #include "catalog/pg_tablespace.h"
+#include "catalog/pg_statistic_ext.h"
 #include "catalog/pg_trigger.h"
 #include "catalog/pg_type.h"
 #include "catalog/storage.h"
@@ -188,6 +189,8 @@ typedef struct AlteredTableInfo
        List       *changedIndexDefs;   /* string definitions of same */
        char       *replicaIdentityIndex;       /* index to reset as REPLICA IDENTITY */
        char       *clusterOnIndex; /* index to use for CLUSTER */
+       List       *changedStatisticsOids;      /* OIDs of statistics to rebuild */
+       List       *changedStatisticsDefs;      /* string definitions of same */
 } AlteredTableInfo;
 
 /* Struct describing one new constraint to check in Phase 3 scan */
@@ -440,6 +443,8 @@ static ObjectAddress ATExecDropColumn(List **wqueue, Relation rel, const char *c
                                                                          ObjectAddresses *addrs);
 static ObjectAddress ATExecAddIndex(AlteredTableInfo *tab, Relation rel,
                                                                        IndexStmt *stmt, bool is_rebuild, LOCKMODE lockmode);
+static ObjectAddress ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
+                                                                                CreateStatsStmt *stmt, bool is_rebuild, LOCKMODE lockmode);
 static ObjectAddress ATExecAddConstraint(List **wqueue,
                                                                                 AlteredTableInfo *tab, Relation rel,
                                                                                 Constraint *newConstraint, bool recurse, bool is_readd,
@@ -496,6 +501,7 @@ static ObjectAddress ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel,
                                                                                   AlterTableCmd *cmd, LOCKMODE lockmode);
 static void RememberConstraintForRebuilding(Oid conoid, AlteredTableInfo *tab);
 static void RememberIndexForRebuilding(Oid indoid, AlteredTableInfo *tab);
+static void RememberStatisticsForRebuilding(Oid stxoid, AlteredTableInfo *tab);
 static void ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab,
                                                                   LOCKMODE lockmode);
 static void ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId,
@@ -4756,6 +4762,10 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab,
                        address = ATExecAddIndex(tab, rel, (IndexStmt *) cmd->def, true,
                                                                         lockmode);
                        break;
+               case AT_ReAddStatistics:        /* ADD STATISTICS */
+                       address = ATExecAddStatistics(tab, rel, (CreateStatsStmt *) cmd->def,
+                                                                                 true, lockmode);
+                       break;
                case AT_AddConstraint:  /* ADD CONSTRAINT */
                        /* Transform the command only during initial examination */
                        if (cur_pass == AT_PASS_ADD_CONSTR)
@@ -8283,6 +8293,29 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel,
        return address;
 }
 
+/*
+ * ALTER TABLE ADD STATISTICS
+ *
+ * There is no such command in the grammar, but we use this internally to add
+ * AT_ReAddStatistics subcommands to rebuild extended statistics after a table
+ * column type change.  Only stmt is used; the other arguments are ignored.
+ */
+static ObjectAddress
+ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
+                                       CreateStatsStmt *stmt, bool is_rebuild, LOCKMODE lockmode)
+{
+       ObjectAddress address;
+
+       Assert(IsA(stmt, CreateStatsStmt));
+
+       /* The CreateStatsStmt has already been through transformStatsStmt */
+       Assert(stmt->transformed);
+
+       address = CreateStatistics(stmt);
+
+       return address;
+}
+
 /*
  * ALTER TABLE ADD CONSTRAINT USING INDEX
  *
@@ -11830,9 +11863,7 @@ ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel,
                                 * Give the extended-stats machinery a chance to fix anything
                                 * that this column type change would break.
                                 */
-                               UpdateStatisticsForTypeChange(foundObject.objectId,
-                                                                                         RelationGetRelid(rel), attnum,
-                                                                                         attTup->atttypid, targettype);
+                               RememberStatisticsForRebuilding(foundObject.objectId, tab);
                                break;
 
                        case OCLASS_PROC:
@@ -12202,6 +12233,32 @@ RememberIndexForRebuilding(Oid indoid, AlteredTableInfo *tab)
        }
 }
 
+/*
+ * Subroutine for ATExecAlterColumnType: remember that a statistics object
+ * needs to be rebuilt (which we might already know).
+ */
+static void
+RememberStatisticsForRebuilding(Oid stxoid, AlteredTableInfo *tab)
+{
+       /*
+        * This de-duplication check is critical for two independent reasons: we
+        * mustn't try to recreate the same statistics object twice, and if the
+        * statistics object depends on more than one column whose type is to be
+        * altered, we must capture its definition string before applying any of
+        * the type changes. ruleutils.c will get confused if we ask again later.
+        */
+       if (!list_member_oid(tab->changedStatisticsOids, stxoid))
+       {
+               /* OK, capture the statistics object's existing definition string */
+               char       *defstring = pg_get_statisticsobjdef_string(stxoid);
+
+               tab->changedStatisticsOids = lappend_oid(tab->changedStatisticsOids,
+                                                                                                stxoid);
+               tab->changedStatisticsDefs = lappend(tab->changedStatisticsDefs,
+                                                                                        defstring);
+       }
+}
+
 /*
  * Cleanup after we've finished all the ALTER TYPE operations for a
  * particular relation.  We have to drop and recreate all the indexes
@@ -12306,6 +12363,22 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode)
                add_exact_object_address(&obj, objects);
        }
 
+       /* add dependencies for new statistics */
+       forboth(oid_item, tab->changedStatisticsOids,
+                       def_item, tab->changedStatisticsDefs)
+       {
+               Oid                     oldId = lfirst_oid(oid_item);
+               Oid                     relid;
+
+               relid = StatisticsGetRelation(oldId, false);
+               ATPostAlterTypeParse(oldId, relid, InvalidOid,
+                                                        (char *) lfirst(def_item),
+                                                        wqueue, lockmode, tab->rewrite);
+
+               ObjectAddressSet(obj, StatisticExtRelationId, oldId);
+               add_exact_object_address(&obj, objects);
+       }
+
        /*
         * Queue up command to restore replica identity index marking
         */
@@ -12354,9 +12427,9 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode)
 }
 
 /*
- * Parse the previously-saved definition string for a constraint or index
- * against the newly-established column data type(s), and queue up the
- * resulting command parsetrees for execution.
+ * Parse the previously-saved definition string for a constraint, index or
+ * statistics object against the newly-established column data type(s), and
+ * queue up the resulting command parsetrees for execution.
  *
  * This might fail if, for example, you have a WHERE clause that uses an
  * operator that's not available for the new column type.
@@ -12402,6 +12475,11 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd,
                        querytree_list = lappend(querytree_list, stmt);
                        querytree_list = list_concat(querytree_list, afterStmts);
                }
+               else if (IsA(stmt, CreateStatsStmt))
+                       querytree_list = lappend(querytree_list,
+                                                                        transformStatsStmt(oldRelId,
+                                                                                                               (CreateStatsStmt *) stmt,
+                                                                                                               cmd));
                else
                        querytree_list = lappend(querytree_list, stmt);
        }
@@ -12540,6 +12618,20 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd,
                                elog(ERROR, "unexpected statement subtype: %d",
                                         (int) stmt->subtype);
                }
+               else if (IsA(stm, CreateStatsStmt))
+               {
+                       CreateStatsStmt  *stmt = (CreateStatsStmt *) stm;
+                       AlterTableCmd *newcmd;
+
+                       /* keep the statistics object's comment */
+                       stmt->stxcomment = GetComment(oldId, StatisticExtRelationId, 0);
+
+                       newcmd = makeNode(AlterTableCmd);
+                       newcmd->subtype = AT_ReAddStatistics;
+                       newcmd->def = (Node *) stmt;
+                       tab->subcmds[AT_PASS_MISC] =
+                               lappend(tab->subcmds[AT_PASS_MISC], newcmd);
+               }
                else
                        elog(ERROR, "unexpected statement type: %d",
                                 (int) nodeTag(stm));
index 38b56231b7df2fb0890b17e5026d295578b5fbe5..d5b1ad4567044ecc762a414c0c607dddf69318d9 100644 (file)
@@ -2980,6 +2980,17 @@ _copyIndexElem(const IndexElem *from)
        return newnode;
 }
 
+static StatsElem *
+_copyStatsElem(const StatsElem *from)
+{
+       StatsElem  *newnode = makeNode(StatsElem);
+
+       COPY_STRING_FIELD(name);
+       COPY_NODE_FIELD(expr);
+
+       return newnode;
+}
+
 static ColumnDef *
 _copyColumnDef(const ColumnDef *from)
 {
@@ -5699,6 +5710,9 @@ copyObjectImpl(const void *from)
                case T_IndexElem:
                        retval = _copyIndexElem(from);
                        break;
+               case T_StatsElem:
+                       retval = _copyStatsElem(from);
+                       break;
                case T_ColumnDef:
                        retval = _copyColumnDef(from);
                        break;
index 3292dda34245084dc2b5e7a4fe7f9d8d713f6410..d46909bbc4f3a9606785d13a41810ccf57a050bb 100644 (file)
@@ -2596,6 +2596,16 @@ _equalIndexElem(const IndexElem *a, const IndexElem *b)
        return true;
 }
 
+
+static bool
+_equalStatsElem(const StatsElem *a, const StatsElem *b)
+{
+       COMPARE_STRING_FIELD(name);
+       COMPARE_NODE_FIELD(expr);
+
+       return true;
+}
+
 static bool
 _equalColumnDef(const ColumnDef *a, const ColumnDef *b)
 {
@@ -3724,6 +3734,9 @@ equal(const void *a, const void *b)
                case T_IndexElem:
                        retval = _equalIndexElem(a, b);
                        break;
+               case T_StatsElem:
+                       retval = _equalStatsElem(a, b);
+                       break;
                case T_ColumnDef:
                        retval = _equalColumnDef(a, b);
                        break;
index 9f7918c7e901d270272f941c4aa9bdc07ddeefd8..12561c475768035cc683b796fa8dbafcec0c99a2 100644 (file)
@@ -2943,6 +2943,15 @@ _outIndexElem(StringInfo str, const IndexElem *node)
        WRITE_ENUM_FIELD(nulls_ordering, SortByNulls);
 }
 
+static void
+_outStatsElem(StringInfo str, const StatsElem *node)
+{
+       WRITE_NODE_TYPE("STATSELEM");
+
+       WRITE_STRING_FIELD(name);
+       WRITE_NODE_FIELD(expr);
+}
+
 static void
 _outQuery(StringInfo str, const Query *node)
 {
@@ -4286,6 +4295,9 @@ outNode(StringInfo str, const void *obj)
                        case T_IndexElem:
                                _outIndexElem(str, obj);
                                break;
+                       case T_StatsElem:
+                               _outStatsElem(str, obj);
+                               break;
                        case T_Query:
                                _outQuery(str, obj);
                                break;
index 6c39bf893f886a92eaa1a58a79f16e91fdaf2506..0fa8875f0910d31b3b878d9a35bf0142e96b5043 100644 (file)
@@ -34,6 +34,7 @@
 #include "foreign/fdwapi.h"
 #include "miscadmin.h"
 #include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
 #include "nodes/supportnodes.h"
 #include "optimizer/clauses.h"
 #include "optimizer/cost.h"
@@ -1308,6 +1309,7 @@ get_relation_constraints(PlannerInfo *root,
 static List *
 get_relation_statistics(RelOptInfo *rel, Relation relation)
 {
+       Index           varno = rel->relid;
        List       *statoidlist;
        List       *stainfos = NIL;
        ListCell   *l;
@@ -1321,6 +1323,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
                HeapTuple       htup;
                HeapTuple       dtup;
                Bitmapset  *keys = NULL;
+               List       *exprs = NIL;
                int                     i;
 
                htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
@@ -1340,6 +1343,49 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
                for (i = 0; i < staForm->stxkeys.dim1; i++)
                        keys = bms_add_member(keys, staForm->stxkeys.values[i]);
 
+               /*
+                * Preprocess expressions (if any). We read the expressions, run them
+                * through eval_const_expressions, and fix the varnos.
+                */
+               {
+                       bool            isnull;
+                       Datum           datum;
+
+                       /* decode expression (if any) */
+                       datum = SysCacheGetAttr(STATEXTOID, htup,
+                                                                       Anum_pg_statistic_ext_stxexprs, &isnull);
+
+                       if (!isnull)
+                       {
+                               char       *exprsString;
+
+                               exprsString = TextDatumGetCString(datum);
+                               exprs = (List *) stringToNode(exprsString);
+                               pfree(exprsString);
+
+                               /*
+                                * Run the expressions through eval_const_expressions. This is
+                                * not just an optimization, but is necessary, because the
+                                * planner will be comparing them to similarly-processed qual
+                                * clauses, and may fail to detect valid matches without this.
+                                * We must not use canonicalize_qual, however, since these
+                                * aren't qual expressions.
+                                */
+                               exprs = (List *) eval_const_expressions(NULL, (Node *) exprs);
+
+                               /* May as well fix opfuncids too */
+                               fix_opfuncids((Node *) exprs);
+
+                               /*
+                                * Modify the expressions we just decoded so that they have the
+                                * correct varno for the parent relation, so that they match
+                                * up correctly against qual clauses.
+                                */
+                               if (varno != 1)
+                                       ChangeVarNodes((Node *) exprs, 1, varno, 0);
+                       }
+               }
+
                /* add one StatisticExtInfo for each kind built */
                if (statext_is_kind_built(dtup, STATS_EXT_NDISTINCT))
                {
@@ -1349,6 +1395,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
                        info->rel = rel;
                        info->kind = STATS_EXT_NDISTINCT;
                        info->keys = bms_copy(keys);
+                       info->exprs = exprs;
 
                        stainfos = lappend(stainfos, info);
                }
@@ -1361,6 +1408,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
                        info->rel = rel;
                        info->kind = STATS_EXT_DEPENDENCIES;
                        info->keys = bms_copy(keys);
+                       info->exprs = exprs;
 
                        stainfos = lappend(stainfos, info);
                }
@@ -1373,6 +1421,20 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
                        info->rel = rel;
                        info->kind = STATS_EXT_MCV;
                        info->keys = bms_copy(keys);
+                       info->exprs = exprs;
+
+                       stainfos = lappend(stainfos, info);
+               }
+
+               if (statext_is_kind_built(dtup, STATS_EXT_EXPRESSIONS))
+               {
+                       StatisticExtInfo *info = makeNode(StatisticExtInfo);
+
+                       info->statOid = statOid;
+                       info->rel = rel;
+                       info->kind = STATS_EXT_EXPRESSIONS;
+                       info->keys = bms_copy(keys);
+                       info->exprs = exprs;
 
                        stainfos = lappend(stainfos, info);
                }
index 2132cf4d828c6252351dab6abac6d2e94dee6565..7ff36bc84225aa5182ce7bb1e870348b7341cd55 100644 (file)
@@ -239,6 +239,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
        WindowDef                       *windef;
        JoinExpr                        *jexpr;
        IndexElem                       *ielem;
+       StatsElem                       *selem;
        Alias                           *alias;
        RangeVar                        *range;
        IntoClause                      *into;
@@ -405,7 +406,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
                                old_aggr_definition old_aggr_list
                                oper_argtypes RuleActionList RuleActionMulti
                                opt_column_list columnList opt_name_list
-                               sort_clause opt_sort_clause sortby_list index_params
+                               sort_clause opt_sort_clause sortby_list index_params stats_params
                                opt_include opt_c_include index_including_params
                                name_list role_list from_clause from_list opt_array_bounds
                                qualified_name_list any_name any_name_list type_name_list
@@ -512,6 +513,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 %type <list>   func_alias_clause
 %type <sortby> sortby
 %type <ielem>  index_elem index_elem_options
+%type <selem>  stats_param
 %type <node>   table_ref
 %type <jexpr>  joined_table
 %type <range>  relation_expr
@@ -4097,7 +4099,7 @@ ExistingIndex:   USING INDEX name                                 { $$ = $3; }
 
 CreateStatsStmt:
                        CREATE STATISTICS any_name
-                       opt_name_list ON expr_list FROM from_list
+                       opt_name_list ON stats_params FROM from_list
                                {
                                        CreateStatsStmt *n = makeNode(CreateStatsStmt);
                                        n->defnames = $3;
@@ -4109,7 +4111,7 @@ CreateStatsStmt:
                                        $$ = (Node *)n;
                                }
                        | CREATE STATISTICS IF_P NOT EXISTS any_name
-                       opt_name_list ON expr_list FROM from_list
+                       opt_name_list ON stats_params FROM from_list
                                {
                                        CreateStatsStmt *n = makeNode(CreateStatsStmt);
                                        n->defnames = $6;
@@ -4122,6 +4124,36 @@ CreateStatsStmt:
                                }
                        ;
 
+/*
+ * Statistics attributes can be either simple column references, or arbitrary
+ * expressions in parens.  For compatibility with index attributes permitted
+ * in CREATE INDEX, we allow an expression that's just a function call to be
+ * written without parens.
+ */
+
+stats_params:  stats_param                                                     { $$ = list_make1($1); }
+                       | stats_params ',' stats_param                  { $$ = lappend($1, $3); }
+               ;
+
+stats_param:   ColId
+                               {
+                                       $$ = makeNode(StatsElem);
+                                       $$->name = $1;
+                                       $$->expr = NULL;
+                               }
+                       | func_expr_windowless
+                               {
+                                       $$ = makeNode(StatsElem);
+                                       $$->name = NULL;
+                                       $$->expr = $1;
+                               }
+                       | '(' a_expr ')'
+                               {
+                                       $$ = makeNode(StatsElem);
+                                       $$->name = NULL;
+                                       $$->expr = $2;
+                               }
+               ;
 
 /*****************************************************************************
  *
index 7c3e01aa22b5a9e96b7d476138a50ae35d30a0c3..ceb0bf597d679aeefd93b942d86feee1935c2b81 100644 (file)
@@ -484,6 +484,13 @@ check_agglevels_and_constraints(ParseState *pstate, Node *expr)
                        else
                                err = _("grouping operations are not allowed in index predicates");
 
+                       break;
+               case EXPR_KIND_STATS_EXPRESSION:
+                       if (isAgg)
+                               err = _("aggregate functions are not allowed in statistics expressions");
+                       else
+                               err = _("grouping operations are not allowed in statistics expressions");
+
                        break;
                case EXPR_KIND_ALTER_COL_TRANSFORM:
                        if (isAgg)
@@ -910,6 +917,9 @@ transformWindowFuncCall(ParseState *pstate, WindowFunc *wfunc,
                case EXPR_KIND_INDEX_EXPRESSION:
                        err = _("window functions are not allowed in index expressions");
                        break;
+               case EXPR_KIND_STATS_EXPRESSION:
+                       err = _("window functions are not allowed in statistics expressions");
+                       break;
                case EXPR_KIND_INDEX_PREDICATE:
                        err = _("window functions are not allowed in index predicates");
                        break;
index f869e159d63f7d1a4a3c29c36692da43d0491a96..03373d551fcb9cfed7cd4562bee5a4391dd92374 100644 (file)
@@ -500,6 +500,7 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref)
                case EXPR_KIND_FUNCTION_DEFAULT:
                case EXPR_KIND_INDEX_EXPRESSION:
                case EXPR_KIND_INDEX_PREDICATE:
+               case EXPR_KIND_STATS_EXPRESSION:
                case EXPR_KIND_ALTER_COL_TRANSFORM:
                case EXPR_KIND_EXECUTE_PARAMETER:
                case EXPR_KIND_TRIGGER_WHEN:
@@ -1741,6 +1742,9 @@ transformSubLink(ParseState *pstate, SubLink *sublink)
                case EXPR_KIND_INDEX_PREDICATE:
                        err = _("cannot use subquery in index predicate");
                        break;
+               case EXPR_KIND_STATS_EXPRESSION:
+                       err = _("cannot use subquery in statistics expression");
+                       break;
                case EXPR_KIND_ALTER_COL_TRANSFORM:
                        err = _("cannot use subquery in transform expression");
                        break;
@@ -3030,6 +3034,8 @@ ParseExprKindName(ParseExprKind exprKind)
                        return "index expression";
                case EXPR_KIND_INDEX_PREDICATE:
                        return "index predicate";
+               case EXPR_KIND_STATS_EXPRESSION:
+                       return "statistics expression";
                case EXPR_KIND_ALTER_COL_TRANSFORM:
                        return "USING";
                case EXPR_KIND_EXECUTE_PARAMETER:
index 37cebc7d829cc658fad4553893cf75dcbc6f6082..debef1d14fba1adb0d50b6cd256fc76e1c0ce0e7 100644 (file)
@@ -2503,6 +2503,9 @@ check_srf_call_placement(ParseState *pstate, Node *last_srf, int location)
                case EXPR_KIND_INDEX_PREDICATE:
                        err = _("set-returning functions are not allowed in index predicates");
                        break;
+               case EXPR_KIND_STATS_EXPRESSION:
+                       err = _("set-returning functions are not allowed in statistics expressions");
+                       break;
                case EXPR_KIND_ALTER_COL_TRANSFORM:
                        err = _("set-returning functions are not allowed in transform expressions");
                        break;
index aa6c19adada7af8e1aeb592b890ff3f5a90c2add..b968c25dd69181411c47c0b7bf5d60288106a300 100644 (file)
@@ -1917,6 +1917,9 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
                        stat_types = lappend(stat_types, makeString("dependencies"));
                else if (enabled[i] == STATS_EXT_MCV)
                        stat_types = lappend(stat_types, makeString("mcv"));
+               else if (enabled[i] == STATS_EXT_EXPRESSIONS)
+                       /* expression stats are not exposed to users */
+                       continue;
                else
                        elog(ERROR, "unrecognized statistics kind %c", enabled[i]);
        }
@@ -1924,14 +1927,47 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
        /* Determine which columns the statistics are on */
        for (i = 0; i < statsrec->stxkeys.dim1; i++)
        {
-               ColumnRef  *cref = makeNode(ColumnRef);
+               StatsElem  *selem = makeNode(StatsElem);
                AttrNumber      attnum = statsrec->stxkeys.values[i];
 
-               cref->fields = list_make1(makeString(get_attname(heapRelid,
-                                                                                                                attnum, false)));
-               cref->location = -1;
+               selem->name = get_attname(heapRelid, attnum, false);
+               selem->expr = NULL;
 
-               def_names = lappend(def_names, cref);
+               def_names = lappend(def_names, selem);
+       }
+
+       /*
+        * Now handle expressions, if there are any. The order (with respect to
+        * regular attributes) does not really matter for extended stats, so we
+        * simply append them after simple column references.
+        *
+        * XXX Some places during build/estimation treat expressions as if they
+        * are before attributes, but for the CREATE command that's entirely
+        * irrelevant.
+        */
+       datum = SysCacheGetAttr(STATEXTOID, ht_stats,
+                                                       Anum_pg_statistic_ext_stxexprs, &isnull);
+
+       if (!isnull)
+       {
+               ListCell   *lc;
+               List       *exprs = NIL;
+               char       *exprsString;
+
+               exprsString = TextDatumGetCString(datum);
+               exprs = (List *) stringToNode(exprsString);
+
+               foreach(lc, exprs)
+               {
+                       StatsElem  *selem = makeNode(StatsElem);
+
+                       selem->name = NULL;
+                       selem->expr = (Node *) lfirst(lc);
+
+                       def_names = lappend(def_names, selem);
+               }
+
+               pfree(exprsString);
        }
 
        /* finally, build the output node */
@@ -1942,6 +1978,7 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
        stats->relations = list_make1(heapRel);
        stats->stxcomment = NULL;
        stats->if_not_exists = false;
+       stats->transformed = true;      /* don't need transformStatsStmt again */
 
        /* Clean up */
        ReleaseSysCache(ht_stats);
@@ -2866,6 +2903,84 @@ transformIndexStmt(Oid relid, IndexStmt *stmt, const char *queryString)
        return stmt;
 }
 
+/*
+ * transformStatsStmt - parse analysis for CREATE STATISTICS
+ *
+ * To avoid race conditions, it's important that this function rely only on
+ * the passed-in relid (and not on stmt->relations) to determine the target
+ * relation.  Returns stmt itself if already transformed, else a modified copy.
+ */
+CreateStatsStmt *
+transformStatsStmt(Oid relid, CreateStatsStmt *stmt, const char *queryString)
+{
+       ParseState *pstate;
+       ParseNamespaceItem *nsitem;
+       ListCell   *l;
+       Relation        rel;
+
+       /* Nothing to do if statement already transformed. */
+       if (stmt->transformed)
+               return stmt;
+
+       /*
+        * We must not scribble on the passed-in CreateStatsStmt, so copy it.
+        * (This is overkill, but easy.)
+        */
+       stmt = copyObject(stmt);
+
+       /* Set up pstate */
+       pstate = make_parsestate(NULL);
+       pstate->p_sourcetext = queryString;
+
+       /*
+        * Put the parent table into the rtable so that the expressions can refer
+        * to its fields without qualification.  Caller is responsible for locking
+        * relation, but we still need to open it.
+        */
+       rel = relation_open(relid, NoLock);
+       nsitem = addRangeTableEntryForRelation(pstate, rel,
+                                                                                  AccessShareLock,
+                                                                                  NULL, false, true);
+
+       /* no to join list, yes to namespaces */
+       addNSItemToQuery(pstate, nsitem, false, true, true);
+
+       /* take care of any expressions */
+       foreach(l, stmt->exprs)
+       {
+               StatsElem  *selem = (StatsElem *) lfirst(l);
+
+               if (selem->expr)
+               {
+                       /* Now do parse transformation of the expression */
+                       selem->expr = transformExpr(pstate, selem->expr,
+                                                                               EXPR_KIND_STATS_EXPRESSION);
+
+                       /* We have to fix its collations too */
+                       assign_expr_collations(pstate, selem->expr);
+               }
+       }
+
+       /*
+        * Check that only the base rel is mentioned.  (This should be dead code
+        * now that add_missing_from is history.)
+        */
+       if (list_length(pstate->p_rtable) != 1)
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+                                errmsg("statistics expressions can refer only to the table being referenced")));
+
+       free_parsestate(pstate);
+
+       /* Close relation */
+       table_close(rel, NoLock);
+
+       /* Mark statement as successfully transformed */
+       stmt->transformed = true;
+
+       return stmt;
+}
+
 
 /*
  * transformRuleStmt -
index eac92851651e81227b32f72b389a29336826f9c5..cf8a6d5f68bd5167aa1f9655702782318a874f91 100644 (file)
@@ -70,15 +70,15 @@ static void generate_dependencies(DependencyGenerator state);
 static DependencyGenerator DependencyGenerator_init(int n, int k);
 static void DependencyGenerator_free(DependencyGenerator state);
 static AttrNumber *DependencyGenerator_next(DependencyGenerator state);
-static double dependency_degree(int numrows, HeapTuple *rows, int k,
-                                                               AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs);
+static double dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency);
 static bool dependency_is_fully_matched(MVDependency *dependency,
                                                                                Bitmapset *attnums);
 static bool dependency_is_compatible_clause(Node *clause, Index relid,
                                                                                        AttrNumber *attnum);
+static bool dependency_is_compatible_expression(Node *clause, Index relid,
+                                                                                               List *statlist, Node **expr);
 static MVDependency *find_strongest_dependency(MVDependencies **dependencies,
-                                                                                          int ndependencies,
-                                                                                          Bitmapset *attnums);
+                                                                                          int ndependencies, Bitmapset *attnums);
 static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
                                                                                                 int varRelid, JoinType jointype,
                                                                                                 SpecialJoinInfo *sjinfo,
@@ -219,16 +219,13 @@ DependencyGenerator_next(DependencyGenerator state)
  * the last one.
  */
 static double
-dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
-                                 VacAttrStats **stats, Bitmapset *attrs)
+dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency)
 {
        int                     i,
                                nitems;
        MultiSortSupport mss;
        SortItem   *items;
-       AttrNumber *attnums;
        AttrNumber *attnums_dep;
-       int                     numattrs;
 
        /* counters valid within a group */
        int                     group_size = 0;
@@ -244,15 +241,12 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
        mss = multi_sort_init(k);
 
        /*
-        * Transform the attrs from bitmap to an array to make accessing the i-th
-        * member easier, and then construct a filtered version with only attnums
-        * referenced by the dependency we validate.
+        * Translate the array of indexes to regular attnums for the dependency (we
+        * will need this to identify the columns in StatsBuildData).
         */
-       attnums = build_attnums_array(attrs, &numattrs);
-
        attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
        for (i = 0; i < k; i++)
-               attnums_dep[i] = attnums[dependency[i]];
+               attnums_dep[i] = data->attnums[dependency[i]];
 
        /*
         * Verify the dependency (a,b,...)->z, using a rather simple algorithm:
@@ -270,7 +264,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
        /* prepare the sort function for the dimensions */
        for (i = 0; i < k; i++)
        {
-               VacAttrStats *colstat = stats[dependency[i]];
+               VacAttrStats *colstat = data->stats[dependency[i]];
                TypeCacheEntry *type;
 
                type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
@@ -289,8 +283,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
         * descriptor.  For now that assumption holds, but it might change in the
         * future for example if we support statistics on multiple tables.
         */
-       items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
-                                                          mss, k, attnums_dep);
+       items = build_sorted_items(data, &nitems, mss, k, attnums_dep);
 
        /*
         * Walk through the sorted array, split it into rows according to the
@@ -336,11 +329,10 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
                pfree(items);
 
        pfree(mss);
-       pfree(attnums);
        pfree(attnums_dep);
 
        /* Compute the 'degree of validity' as (supporting/total). */
-       return (n_supporting_rows * 1.0 / numrows);
+       return (n_supporting_rows * 1.0 / data->numrows);
 }
 
 /*
@@ -360,23 +352,15 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
  *        (c) -> b
  */
 MVDependencies *
-statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
-                                                  VacAttrStats **stats)
+statext_dependencies_build(StatsBuildData *data)
 {
        int                     i,
                                k;
-       int                     numattrs;
-       AttrNumber *attnums;
 
        /* result */
        MVDependencies *dependencies = NULL;
 
-       /*
-        * Transform the bms into an array, to make accessing i-th member easier.
-        */
-       attnums = build_attnums_array(attrs, &numattrs);
-
-       Assert(numattrs >= 2);
+       Assert(data->nattnums >= 2);
 
        /*
         * We'll try build functional dependencies starting from the smallest ones
@@ -384,12 +368,12 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
         * included in the statistics object.  We start from the smallest ones
         * because we want to be able to skip already implied ones.
         */
-       for (k = 2; k <= numattrs; k++)
+       for (k = 2; k <= data->nattnums; k++)
        {
                AttrNumber *dependency; /* array with k elements */
 
                /* prepare a DependencyGenerator of variation */
-               DependencyGenerator DependencyGenerator = DependencyGenerator_init(numattrs, k);
+               DependencyGenerator DependencyGenerator = DependencyGenerator_init(data->nattnums, k);
 
                /* generate all possible variations of k values (out of n) */
                while ((dependency = DependencyGenerator_next(DependencyGenerator)))
@@ -398,7 +382,7 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
                        MVDependency *d;
 
                        /* compute how valid the dependency seems */
-                       degree = dependency_degree(numrows, rows, k, dependency, stats, attrs);
+                       degree = dependency_degree(data, k, dependency);
 
                        /*
                         * if the dependency seems entirely invalid, don't store it
@@ -413,7 +397,7 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
                        d->degree = degree;
                        d->nattributes = k;
                        for (i = 0; i < k; i++)
-                               d->attributes[i] = attnums[dependency[i]];
+                               d->attributes[i] = data->attnums[dependency[i]];
 
                        /* initialize the list of dependencies */
                        if (dependencies == NULL)
@@ -747,6 +731,7 @@ static bool
 dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
 {
        Var                *var;
+       Node       *clause_expr;
 
        if (IsA(clause, RestrictInfo))
        {
@@ -774,9 +759,9 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
 
                /* Make sure non-selected argument is a pseudoconstant. */
                if (is_pseudo_constant_clause(lsecond(expr->args)))
-                       var = linitial(expr->args);
+                       clause_expr = linitial(expr->args);
                else if (is_pseudo_constant_clause(linitial(expr->args)))
-                       var = lsecond(expr->args);
+                       clause_expr = lsecond(expr->args);
                else
                        return false;
 
@@ -805,8 +790,8 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
                /*
                 * Reject ALL() variant, we only care about ANY/IN.
                 *
-                * FIXME Maybe we should check if all the values are the same, and
-                * allow ALL in that case? Doesn't seem very practical, though.
+                * XXX Maybe we should check if all the values are the same, and allow
+                * ALL in that case? Doesn't seem very practical, though.
                 */
                if (!expr->useOr)
                        return false;
@@ -822,7 +807,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
                if (!is_pseudo_constant_clause(lsecond(expr->args)))
                        return false;
 
-               var = linitial(expr->args);
+               clause_expr = linitial(expr->args);
 
                /*
                 * If it's not an "=" operator, just ignore the clause, as it's not
@@ -838,13 +823,13 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
        }
        else if (is_orclause(clause))
        {
-               BoolExpr   *expr = (BoolExpr *) clause;
+               BoolExpr   *bool_expr = (BoolExpr *) clause;
                ListCell   *lc;
 
                /* start with no attribute number */
                *attnum = InvalidAttrNumber;
 
-               foreach(lc, expr->args)
+               foreach(lc, bool_expr->args)
                {
                        AttrNumber      clause_attnum;
 
@@ -859,6 +844,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
                        if (*attnum == InvalidAttrNumber)
                                *attnum = clause_attnum;
 
+                       /* ensure all the variables are the same (same attnum) */
                        if (*attnum != clause_attnum)
                                return false;
                }
@@ -872,7 +858,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
                 * "NOT x" can be interpreted as "x = false", so get the argument and
                 * proceed with seeing if it's a suitable Var.
                 */
-               var = (Var *) get_notclausearg(clause);
+               clause_expr = (Node *) get_notclausearg(clause);
        }
        else
        {
@@ -880,20 +866,23 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
                 * A boolean expression "x" can be interpreted as "x = true", so
                 * proceed with seeing if it's a suitable Var.
                 */
-               var = (Var *) clause;
+               clause_expr = (Node *) clause;
        }
 
        /*
         * We may ignore any RelabelType node above the operand.  (There won't be
         * more than one, since eval_const_expressions has been applied already.)
         */
-       if (IsA(var, RelabelType))
-               var = (Var *) ((RelabelType *) var)->arg;
+       if (IsA(clause_expr, RelabelType))
+               clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
 
        /* We only support plain Vars for now */
-       if (!IsA(var, Var))
+       if (!IsA(clause_expr, Var))
                return false;
 
+       /* OK, we know we have a Var */
+       var = (Var *) clause_expr;
+
        /* Ensure Var is from the correct relation */
        if (var->varno != relid)
                return false;
@@ -1157,6 +1146,212 @@ clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
        return s1;
 }
 
+/*
+ * dependency_is_compatible_expression
+ *             Determines if the expression is compatible with functional dependencies
+ *
+ * Similar to dependency_is_compatible_clause, but doesn't require a simple
+ * Var; instead there must be a dependencies statistics object matching the
+ * expression. On success, *expr is set to the matching statistics expression.
+ */
+static bool
+dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr)
+{
+       List       *vars;
+       ListCell   *lc,
+                          *lc2;
+       Node       *clause_expr;
+
+       if (IsA(clause, RestrictInfo))
+       {
+               RestrictInfo *rinfo = (RestrictInfo *) clause;
+
+               /* Pseudoconstants are not interesting (they couldn't contain a Var) */
+               if (rinfo->pseudoconstant)
+                       return false;
+
+               /* Clauses referencing multiple, or no, varnos are incompatible */
+               if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
+                       return false;
+
+               clause = (Node *) rinfo->clause;
+       }
+
+       if (is_opclause(clause))
+       {
+               /* If it's an opclause, check for (expr = Const) or (Const = expr). */
+               OpExpr     *expr = (OpExpr *) clause;
+
+               /* Only expressions with two arguments are candidates. */
+               if (list_length(expr->args) != 2)
+                       return false;
+
+               /* Make sure non-selected argument is a pseudoconstant. */
+               if (is_pseudo_constant_clause(lsecond(expr->args)))
+                       clause_expr = linitial(expr->args);
+               else if (is_pseudo_constant_clause(linitial(expr->args)))
+                       clause_expr = lsecond(expr->args);
+               else
+                       return false;
+
+               /*
+                * If it's not an "=" operator, just ignore the clause, as it's not
+                * compatible with functional dependencies.
+                *
+                * This uses the function for estimating selectivity, not the operator
+                * directly (a bit awkward, but well ...).
+                *
+                * XXX this is pretty dubious; probably it'd be better to check btree
+                * or hash opclass membership, so as not to be fooled by custom
+                * selectivity functions, and to be more consistent with decisions
+                * elsewhere in the planner.
+                */
+               if (get_oprrest(expr->opno) != F_EQSEL)
+                       return false;
+
+               /* OK to proceed with checking the expression */
+       }
+       else if (IsA(clause, ScalarArrayOpExpr))
+       {
+               /* If it's a scalar array operator, check for (expr IN Const). */
+               ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
+
+               /*
+                * Reject ALL() variant, we only care about ANY/IN.
+                *
+                * XXX Maybe we should check if all the values are the same, and allow
+                * ALL in that case? Doesn't seem very practical, though.
+                */
+               if (!expr->useOr)
+                       return false;
+
+               /* Only expressions with two arguments are candidates. */
+               if (list_length(expr->args) != 2)
+                       return false;
+
+               /*
+                * We know it's always (expr IN Const), so we assume the expression is
+                * the first argument, and pseudoconstant is the second one.
+                */
+               if (!is_pseudo_constant_clause(lsecond(expr->args)))
+                       return false;
+
+               clause_expr = linitial(expr->args);
+
+               /*
+                * If it's not an "=" operator, just ignore the clause, as it's not
+                * compatible with functional dependencies. The operator is identified
+                * simply by looking at which function it uses to estimate
+                * selectivity. That's a bit strange, but it's what other similar
+                * places do.
+                */
+               if (get_oprrest(expr->opno) != F_EQSEL)
+                       return false;
+
+               /* OK to proceed with checking the expression */
+       }
+       else if (is_orclause(clause))
+       {
+               BoolExpr   *bool_expr = (BoolExpr *) clause;
+               ListCell   *lc;
+
+               /* start with no expression (we'll use the first match) */
+               *expr = NULL;
+
+               foreach(lc, bool_expr->args)
+               {
+                       Node       *or_expr = NULL;
+
+                       /*
+                        * If any of the arguments is an incompatible expression, treat
+                        * the whole OR clause as incompatible.
+                        */
+                       if (!dependency_is_compatible_expression((Node *) lfirst(lc), relid,
+                                                                                                        statlist, &or_expr))
+                               return false;
+
+                       if (*expr == NULL)
+                               *expr = or_expr;
+
+                       /* ensure all the expressions are the same */
+                       if (!equal(or_expr, *expr))
+                               return false;
+               }
+
+               /* the expression is already checked by the recursive call */
+               return true;
+       }
+       else if (is_notclause(clause))
+       {
+               /*
+                * "NOT x" can be interpreted as "x = false", so get the argument and
+                * proceed with seeing if it's a suitable expression.
+                */
+               clause_expr = (Node *) get_notclausearg(clause);
+       }
+       else
+       {
+               /*
+                * A boolean expression "x" can be interpreted as "x = true", so
+                * proceed with seeing if it's a suitable expression.
+                */
+               clause_expr = (Node *) clause;
+       }
+
+       /*
+        * We may ignore any RelabelType node above the operand.  (There won't be
+        * more than one, since eval_const_expressions has been applied already.)
+        */
+       if (IsA(clause_expr, RelabelType))
+               clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
+
+       vars = pull_var_clause(clause_expr, 0);
+
+       foreach(lc, vars)
+       {
+               Var                *var = (Var *) lfirst(lc);
+
+               /* Ensure Var is from the correct relation */
+               if (var->varno != relid)
+                       return false;
+
+               /* We also better ensure the Var is from the current level */
+               if (var->varlevelsup != 0)
+                       return false;
+
+               /* Also ignore system attributes (we don't allow stats on those) */
+               if (!AttrNumberIsForUserDefinedAttr(var->varattno))
+                       return false;
+       }
+
+       /*
+        * Check if we actually have a matching statistics object for the expression.
+        *
+        * XXX Maybe this is overkill. We'll eliminate the expressions later.
+        */
+       foreach(lc, statlist)
+       {
+               StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
+
+               /* ignore stats without dependencies */
+               if (info->kind != STATS_EXT_DEPENDENCIES)
+                       continue;
+
+               foreach(lc2, info->exprs)
+               {
+                       Node       *stat_expr = (Node *) lfirst(lc2);
+
+                       if (equal(clause_expr, stat_expr))
+                       {
+                               *expr = stat_expr;
+                               return true;
+                       }
+               }
+       }
+
+       return false;
+}
+
 /*
  * dependencies_clauselist_selectivity
  *             Return the estimated selectivity of (a subset of) the given clauses
@@ -1204,6 +1399,11 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
        MVDependency **dependencies;
        int                     ndependencies;
        int                     i;
+       AttrNumber      attnum_offset;
+
+       /* unique expressions */
+       Node      **unique_exprs;
+       int                     unique_exprs_cnt;
 
        /* check if there's any stats that might be useful for us. */
        if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
@@ -1212,6 +1412,15 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
        list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) *
                                                                                 list_length(clauses));
 
+       /*
+        * We allocate space as if every clause was a unique expression, although
+        * that's probably overkill. Some will be simple column references that
+        * we'll translate to attnums, and there might be duplicates. But it's
+        * easier and cheaper to just do one allocation than repalloc later.
+        */
+       unique_exprs = (Node **) palloc(sizeof(Node *) * list_length(clauses));
+       unique_exprs_cnt = 0;
+
        /*
         * Pre-process the clauses list to extract the attnums seen in each item.
         * We need to determine if there's any clauses which will be useful for
@@ -1222,29 +1431,127 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
         *
         * We also skip clauses that we already estimated using different types of
         * statistics (we treat them as incompatible).
+        *
+        * To handle expressions, we assign them negative attnums, as if it was a
+        * system attribute (this is fine, as we only allow extended stats on user
+        * attributes). And then we offset everything by the number of
+        * expressions, so that we can store the values in a bitmapset.
         */
        listidx = 0;
        foreach(l, clauses)
        {
                Node       *clause = (Node *) lfirst(l);
                AttrNumber      attnum;
+               Node       *expr = NULL;
+
+               /* ignore clause by default */
+               list_attnums[listidx] = InvalidAttrNumber;
 
-               if (!bms_is_member(listidx, *estimatedclauses) &&
-                       dependency_is_compatible_clause(clause, rel->relid, &attnum))
+               if (!bms_is_member(listidx, *estimatedclauses))
                {
-                       list_attnums[listidx] = attnum;
-                       clauses_attnums = bms_add_member(clauses_attnums, attnum);
+                       /*
+                        * If it's a simple column reference, just extract the attnum. If
+                        * it's an expression, assign a negative attnum as if it was a
+                        * system attribute.
+                        */
+                       if (dependency_is_compatible_clause(clause, rel->relid, &attnum))
+                       {
+                               list_attnums[listidx] = attnum;
+                       }
+                       else if (dependency_is_compatible_expression(clause, rel->relid,
+                                                                                                                rel->statlist,
+                                                                                                                &expr))
+                       {
+                               /* special attnum assigned to this expression */
+                               attnum = InvalidAttrNumber;
+
+                               Assert(expr != NULL);
+
+                               /* If the expression is duplicate, use the same attnum. */
+                               for (i = 0; i < unique_exprs_cnt; i++)
+                               {
+                                       if (equal(unique_exprs[i], expr))
+                                       {
+                                               /* negative attribute number to expression */
+                                               attnum = -(i + 1);
+                                               break;
+                                       }
+                               }
+
+                               /* not found in the list, so add it */
+                               if (attnum == InvalidAttrNumber)
+                               {
+                                       unique_exprs[unique_exprs_cnt++] = expr;
+
+                                       /* after incrementing the value, to get -1, -2, ... */
+                                       attnum = (-unique_exprs_cnt);
+                               }
+
+                               /* remember which attnum was assigned to this clause */
+                               list_attnums[listidx] = attnum;
+                       }
                }
-               else
-                       list_attnums[listidx] = InvalidAttrNumber;
 
                listidx++;
        }
 
+       Assert(listidx == list_length(clauses));
+
        /*
-        * If there's not at least two distinct attnums then reject the whole list
-        * of clauses. We must return 1.0 so the calling function's selectivity is
-        * unaffected.
+        * How much do we need to offset the attnums? If there are no expressions,
+        * then no offset is needed. Otherwise we need to offset enough for the
+        * lowest value (-unique_exprs_cnt) to become 1.
+        */
+       if (unique_exprs_cnt > 0)
+               attnum_offset = (unique_exprs_cnt + 1);
+       else
+               attnum_offset = 0;
+
+       /*
+        * Now that we know how many expressions there are, we can offset the
+        * values just enough to build the bitmapset.
+        */
+       for (i = 0; i < list_length(clauses); i++)
+       {
+               AttrNumber      attnum;
+
+               /* ignore incompatible or already estimated clauses */
+               if (list_attnums[i] == InvalidAttrNumber)
+                       continue;
+
+               /* make sure the attnum is in the expected range */
+               Assert(list_attnums[i] >= (-unique_exprs_cnt));
+               Assert(list_attnums[i] <= MaxHeapAttributeNumber);
+
+               /* make sure the attnum is positive (valid AttrNumber) */
+               attnum = list_attnums[i] + attnum_offset;
+
+               /*
+                * Either it's a regular attribute, or it's an expression, in which
+                * case we must not have seen it before (expressions are unique).
+                *
+                * XXX Check whether it's a regular attribute has to be done using the
+                * original attnum, while the second check has to use the value with
+                * an offset.
+                */
+               Assert(AttrNumberIsForUserDefinedAttr(list_attnums[i]) ||
+                          !bms_is_member(attnum, clauses_attnums));
+
+               /*
+                * Remember the offset attnum, both for attributes and expressions.
+                * We'll pass list_attnums to clauselist_apply_dependencies, which
+                * uses it to identify clauses in a bitmap. We could also pass the
+                * offset, but this is more convenient.
+                */
+               list_attnums[i] = attnum;
+
+               clauses_attnums = bms_add_member(clauses_attnums, attnum);
+       }
+
+       /*
+        * If there's not at least two distinct attnums and expressions, then
+        * reject the whole list of clauses. We must return 1.0 so the calling
+        * function's selectivity is unaffected.
         */
        if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
        {
@@ -1272,26 +1579,203 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
        foreach(l, rel->statlist)
        {
                StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
-               Bitmapset  *matched;
-               BMS_Membership membership;
+               int                     nmatched;
+               int                     nexprs;
+               int                     k;
+               MVDependencies *deps;
 
                /* skip statistics that are not of the correct type */
                if (stat->kind != STATS_EXT_DEPENDENCIES)
                        continue;
 
-               matched = bms_intersect(clauses_attnums, stat->keys);
-               membership = bms_membership(matched);
-               bms_free(matched);
+               /*
+                * Count matching attributes - we have to undo the attnum offsets. The
+                * input attribute numbers are not offset (expressions are not
+                * included in stat->keys, so it's not necessary). But we need to
+                * offset it before checking against clauses_attnums.
+                */
+               nmatched = 0;
+               k = -1;
+               while ((k = bms_next_member(stat->keys, k)) >= 0)
+               {
+                       AttrNumber      attnum = (AttrNumber) k;
 
-               /* skip objects matching fewer than two attributes from clauses */
-               if (membership != BMS_MULTIPLE)
+                       /* skip expressions */
+                       if (!AttrNumberIsForUserDefinedAttr(attnum))
+                               continue;
+
+                       /* apply the same offset as above */
+                       attnum += attnum_offset;
+
+                       if (bms_is_member(attnum, clauses_attnums))
+                               nmatched++;
+               }
+
+               /* count matching expressions */
+               nexprs = 0;
+               for (i = 0; i < unique_exprs_cnt; i++)
+               {
+                       ListCell   *lc;
+
+                       foreach(lc, stat->exprs)
+                       {
+                               Node       *stat_expr = (Node *) lfirst(lc);
+
+                               /* try to match it */
+                               if (equal(stat_expr, unique_exprs[i]))
+                                       nexprs++;
+                       }
+               }
+
+               /*
+                * Skip objects matching fewer than two attributes/expressions from
+                * clauses.
+                */
+               if (nmatched + nexprs < 2)
                        continue;
 
-               func_dependencies[nfunc_dependencies]
-                       = statext_dependencies_load(stat->statOid);
+               deps = statext_dependencies_load(stat->statOid);
+
+               /*
+                * The expressions may be represented by different attnums in the
+                * stats, we need to remap them to be consistent with the clauses.
+                * That will make the later steps (e.g. picking the strongest item and
+                * so on) much simpler and cheaper, because it won't need to care
+                * about the offset at all.
+                *
+                * When we're at it, we can ignore dependencies that are not fully
+                * matched by clauses (i.e. referencing attributes or expressions that
+                * are not in the clauses).
+                *
+                * We have to do this for all statistics, as long as there are any
+                * expressions - we need to shift the attnums in all dependencies.
+                *
+                * XXX Maybe we should do this always, because it also eliminates some
+                * of the dependencies early. It might be cheaper than having to walk
+                * the longer list in find_strongest_dependency later, especially as
+                * we need to do that repeatedly?
+                *
+                * XXX We have to do this even when there are no expressions in
+                * clauses, otherwise find_strongest_dependency may fail for stats
+                * with expressions (due to lookup of negative value in bitmap). So we
+                * need to at least filter out those dependencies. Maybe we could do
+                * it in a cheaper way (if there are no expr clauses, we can just
+                * discard all negative attnums without any lookups).
+                */
+               if (unique_exprs_cnt > 0 || stat->exprs != NIL)
+               {
+                       int                     ndeps = 0;
+
+                       for (i = 0; i < deps->ndeps; i++)
+                       {
+                               bool            skip = false;
+                               MVDependency *dep = deps->deps[i];
+                               int                     j;
+
+                               for (j = 0; j < dep->nattributes; j++)
+                               {
+                                       int                     idx;
+                                       Node       *expr;
+                                       int                     k;
+                                       AttrNumber      unique_attnum = InvalidAttrNumber;
+                                       AttrNumber      attnum;
+
+                                       /* undo the per-statistics offset */
+                                       attnum = dep->attributes[j];
+
+                                       /*
+                                        * For regular attributes we can simply check if it
+                                        * matches any clause. If there's no matching clause, we
+                                        * can just ignore it. We need to offset the attnum
+                                        * though.
+                                        */
+                                       if (AttrNumberIsForUserDefinedAttr(attnum))
+                                       {
+                                               dep->attributes[j] = attnum + attnum_offset;
+
+                                               if (!bms_is_member(dep->attributes[j], clauses_attnums))
+                                               {
+                                                       skip = true;
+                                                       break;
+                                               }
+
+                                               continue;
+                                       }
+
+                                       /*
+                                        * the attnum should be a valid system attnum (-1, -2,
+                                        * ...)
+                                        */
+                                       Assert(AttributeNumberIsValid(attnum));
+
+                                       /*
+                                        * For expressions, we need to do two translations. First
+                                        * we have to translate the negative attnum to index in
+                                        * the list of expressions (in the statistics object).
+                                        * Then we need to see if there's a matching clause. The
+                                        * index of the unique expression determines the attnum
+                                        * (and we offset it).
+                                        */
+                                       idx = -(1 + attnum);
+
+                                       /* Is the expression index valid? */
+                                       Assert((idx >= 0) && (idx < list_length(stat->exprs)));
+
+                                       expr = (Node *) list_nth(stat->exprs, idx);
+
+                                       /* try to find the expression in the unique list */
+                                       for (k = 0; k < unique_exprs_cnt; k++)
+                                       {
+                                               /*
+                                                * found a matching unique expression, use the attnum
+                                                * (derived from index of the unique expression)
+                                                */
+                                               if (equal(unique_exprs[k], expr))
+                                               {
+                                                       unique_attnum = -(k + 1) + attnum_offset;
+                                                       break;
+                                               }
+                                       }
+
+                                       /*
+                                        * Found no matching expression, so we can simply skip
+                                        * this dependency, because there's no chance it will be
+                                        * fully covered.
+                                        */
+                                       if (unique_attnum == InvalidAttrNumber)
+                                       {
+                                               skip = true;
+                                               break;
+                                       }
+
+                                       /* otherwise remap it to the new attnum */
+                                       dep->attributes[j] = unique_attnum;
+                               }
 
-               total_ndeps += func_dependencies[nfunc_dependencies]->ndeps;
-               nfunc_dependencies++;
+                               /* if found a matching dependency, keep it */
+                               if (!skip)
+                               {
+                                       /* maybe we've skipped something earlier, so move it */
+                                       if (ndeps != i)
+                                               deps->deps[ndeps] = deps->deps[i];
+
+                                       ndeps++;
+                               }
+                       }
+
+                       deps->ndeps = ndeps;
+               }
+
+               /*
+                * It's possible we've removed all dependencies, in which case we
+                * don't bother adding it to the list.
+                */
+               if (deps->ndeps > 0)
+               {
+                       func_dependencies[nfunc_dependencies] = deps;
+                       total_ndeps += deps->ndeps;
+                       nfunc_dependencies++;
+               }
        }
 
        /* if no matching stats could be found then we've nothing to do */
@@ -1300,6 +1784,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
                pfree(func_dependencies);
                bms_free(clauses_attnums);
                pfree(list_attnums);
+               pfree(unique_exprs);
                return 1.0;
        }
 
@@ -1347,6 +1832,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
        pfree(func_dependencies);
        bms_free(clauses_attnums);
        pfree(list_attnums);
+       pfree(unique_exprs);
 
        return s1;
 }
index 7808c6a09cac71d57ac437ef37c95d1a368894de..8c75690fce816234850f31848fe4ef155646374b 100644 (file)
@@ -24,6 +24,7 @@
 #include "catalog/pg_collation.h"
 #include "catalog/pg_statistic_ext.h"
 #include "catalog/pg_statistic_ext_data.h"
+#include "executor/executor.h"
 #include "commands/progress.h"
 #include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
 #include "statistics/statistics.h"
 #include "utils/acl.h"
 #include "utils/array.h"
+#include "utils/attoptcache.h"
 #include "utils/builtins.h"
+#include "utils/datum.h"
 #include "utils/fmgroids.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 #include "utils/selfuncs.h"
 #include "utils/syscache.h"
+#include "utils/typcache.h"
 
 /*
  * To avoid consuming too much memory during analysis and/or too much space
@@ -66,18 +70,38 @@ typedef struct StatExtEntry
        Bitmapset  *columns;            /* attribute numbers covered by the object */
        List       *types;                      /* 'char' list of enabled statistics kinds */
        int                     stattarget;             /* statistics target (-1 for default) */
+       List       *exprs;                      /* expressions */
 } StatExtEntry;
 
 
 static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid);
-static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
+static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs,
                                                                                        int nvacatts, VacAttrStats **vacatts);
-static void statext_store(Oid relid,
+static void statext_store(Oid statOid,
                                                  MVNDistinct *ndistinct, MVDependencies *dependencies,
-                                                 MCVList *mcv, VacAttrStats **stats);
+                                                 MCVList *mcv, Datum exprs, VacAttrStats **stats);
 static int     statext_compute_stattarget(int stattarget,
                                                                           int natts, VacAttrStats **stats);
 
+/* Information needed to analyze a single simple expression. */
+typedef struct AnlExprData
+{
+       Node       *expr;                       /* expression to analyze */
+       VacAttrStats *vacattrstat;      /* VacAttrStats used to analyze it */
+} AnlExprData;
+
+static void compute_expr_stats(Relation onerel, double totalrows,
+                                                          AnlExprData * exprdata, int nexprs,
+                                                          HeapTuple *rows, int numrows);
+static Datum serialize_expr_stats(AnlExprData * exprdata, int nexprs);
+static Datum expr_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
+static AnlExprData *build_expr_data(List *exprs, int stattarget);
+
+static StatsBuildData *make_build_data(Relation onerel, StatExtEntry *stat,
+                                                                          int numrows, HeapTuple *rows,
+                                                                          VacAttrStats **stats, int stattarget);
+
+
 /*
  * Compute requested extended stats, using the rows sampled for the plain
  * (single-column) stats.
@@ -92,21 +116,25 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
 {
        Relation        pg_stext;
        ListCell   *lc;
-       List       *stats;
+       List       *statslist;
        MemoryContext cxt;
        MemoryContext oldcxt;
        int64           ext_cnt;
 
+       /* Do nothing if there are no columns to analyze. */
+       if (!natts)
+               return;
+
        cxt = AllocSetContextCreate(CurrentMemoryContext,
                                                                "BuildRelationExtStatistics",
                                                                ALLOCSET_DEFAULT_SIZES);
        oldcxt = MemoryContextSwitchTo(cxt);
 
        pg_stext = table_open(StatisticExtRelationId, RowExclusiveLock);
-       stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
+       statslist = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
 
        /* report this phase */
-       if (stats != NIL)
+       if (statslist != NIL)
        {
                const int       index[] = {
                        PROGRESS_ANALYZE_PHASE,
@@ -114,28 +142,30 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
                };
                const int64 val[] = {
                        PROGRESS_ANALYZE_PHASE_COMPUTE_EXT_STATS,
-                       list_length(stats)
+                       list_length(statslist)
                };
 
                pgstat_progress_update_multi_param(2, index, val);
        }
 
        ext_cnt = 0;
-       foreach(lc, stats)
+       foreach(lc, statslist)
        {
                StatExtEntry *stat = (StatExtEntry *) lfirst(lc);
                MVNDistinct *ndistinct = NULL;
                MVDependencies *dependencies = NULL;
                MCVList    *mcv = NULL;
+               Datum           exprstats = (Datum) 0;
                VacAttrStats **stats;
                ListCell   *lc2;
                int                     stattarget;
+               StatsBuildData *data;
 
                /*
                 * Check if we can build these stats based on the column analyzed. If
                 * not, report this fact (except in autovacuum) and move on.
                 */
-               stats = lookup_var_attr_stats(onerel, stat->columns,
+               stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs,
                                                                          natts, vacattrstats);
                if (!stats)
                {
@@ -150,10 +180,6 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
                        continue;
                }
 
-               /* check allowed number of dimensions */
-               Assert(bms_num_members(stat->columns) >= 2 &&
-                          bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS);
-
                /* compute statistics target for this statistics */
                stattarget = statext_compute_stattarget(stat->stattarget,
                                                                                                bms_num_members(stat->columns),
@@ -167,28 +193,49 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
                if (stattarget == 0)
                        continue;
 
+               /* evaluate expressions (if the statistics has any) */
+               data = make_build_data(onerel, stat, numrows, rows, stats, stattarget);
+
                /* compute statistic of each requested type */
                foreach(lc2, stat->types)
                {
                        char            t = (char) lfirst_int(lc2);
 
                        if (t == STATS_EXT_NDISTINCT)
-                               ndistinct = statext_ndistinct_build(totalrows, numrows, rows,
-                                                                                                       stat->columns, stats);
+                               ndistinct = statext_ndistinct_build(totalrows, data);
                        else if (t == STATS_EXT_DEPENDENCIES)
-                               dependencies = statext_dependencies_build(numrows, rows,
-                                                                                                                 stat->columns, stats);
+                               dependencies = statext_dependencies_build(data);
                        else if (t == STATS_EXT_MCV)
-                               mcv = statext_mcv_build(numrows, rows, stat->columns, stats,
-                                                                               totalrows, stattarget);
+                               mcv = statext_mcv_build(data, totalrows, stattarget);
+                       else if (t == STATS_EXT_EXPRESSIONS)
+                       {
+                               AnlExprData *exprdata;
+                               int                     nexprs;
+
+                               /* should not happen, thanks to checks when defining stats */
+                               if (!stat->exprs)
+                                       elog(ERROR, "requested expression stats, but there are no expressions");
+
+                               exprdata = build_expr_data(stat->exprs, stattarget);
+                               nexprs = list_length(stat->exprs);
+
+                               compute_expr_stats(onerel, totalrows,
+                                                                  exprdata, nexprs,
+                                                                  rows, numrows);
+
+                               exprstats = serialize_expr_stats(exprdata, nexprs);
+                       }
                }
 
                /* store the statistics in the catalog */
-               statext_store(stat->statOid, ndistinct, dependencies, mcv, stats);
+               statext_store(stat->statOid, ndistinct, dependencies, mcv, exprstats, stats);
 
                /* for reporting progress */
                pgstat_progress_update_param(PROGRESS_ANALYZE_EXT_STATS_COMPUTED,
                                                                         ++ext_cnt);
+
+               /* free the build data (allocated as a single chunk) */
+               pfree(data);
        }
 
        table_close(pg_stext, RowExclusiveLock);
@@ -221,6 +268,10 @@ ComputeExtStatisticsRows(Relation onerel,
        MemoryContext oldcxt;
        int                     result = 0;
 
+       /* If there are no columns to analyze, just return 0. */
+       if (!natts)
+               return 0;
+
        cxt = AllocSetContextCreate(CurrentMemoryContext,
                                                                "ComputeExtStatisticsRows",
                                                                ALLOCSET_DEFAULT_SIZES);
@@ -241,7 +292,7 @@ ComputeExtStatisticsRows(Relation onerel,
                 * analyzed. If not, ignore it (don't report anything, we'll do that
                 * during the actual build BuildRelationExtStatistics).
                 */
-               stats = lookup_var_attr_stats(onerel, stat->columns,
+               stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs,
                                                                          natts, vacattrstats);
 
                if (!stats)
@@ -349,6 +400,10 @@ statext_is_kind_built(HeapTuple htup, char type)
                        attnum = Anum_pg_statistic_ext_data_stxdmcv;
                        break;
 
+               case STATS_EXT_EXPRESSIONS:
+                       attnum = Anum_pg_statistic_ext_data_stxdexpr;
+                       break;
+
                default:
                        elog(ERROR, "unexpected statistics type requested: %d", type);
        }
@@ -388,6 +443,7 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
                ArrayType  *arr;
                char       *enabled;
                Form_pg_statistic_ext staForm;
+               List       *exprs = NIL;
 
                entry = palloc0(sizeof(StatExtEntry));
                staForm = (Form_pg_statistic_ext) GETSTRUCT(htup);
@@ -415,10 +471,40 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
                {
                        Assert((enabled[i] == STATS_EXT_NDISTINCT) ||
                                   (enabled[i] == STATS_EXT_DEPENDENCIES) ||
-                                  (enabled[i] == STATS_EXT_MCV));
+                                  (enabled[i] == STATS_EXT_MCV) ||
+                                  (enabled[i] == STATS_EXT_EXPRESSIONS));
                        entry->types = lappend_int(entry->types, (int) enabled[i]);
                }
 
+               /* decode expression (if any) */
+               datum = SysCacheGetAttr(STATEXTOID, htup,
+                                                               Anum_pg_statistic_ext_stxexprs, &isnull);
+
+               if (!isnull)
+               {
+                       char       *exprsString;
+
+                       exprsString = TextDatumGetCString(datum);
+                       exprs = (List *) stringToNode(exprsString);
+
+                       pfree(exprsString);
+
+                       /*
+                        * Run the expressions through eval_const_expressions. This is not
+                        * just an optimization, but is necessary, because the planner
+                        * will be comparing them to similarly-processed qual clauses, and
+                        * may fail to detect valid matches without this.  We must not use
+                        * canonicalize_qual, however, since these aren't qual
+                        * expressions.
+                        */
+                       exprs = (List *) eval_const_expressions(NULL, (Node *) exprs);
+
+                       /* May as well fix opfuncids too */
+                       fix_opfuncids((Node *) exprs);
+               }
+
+               entry->exprs = exprs;
+
                result = lappend(result, entry);
        }
 
@@ -427,6 +513,187 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
        return result;
 }
 
+/*
+ * examine_attribute -- pre-analysis of a single statistics expression
+ *
+ * Builds a VacAttrStats for 'expr' around a faked pg_attribute entry and runs
+ * the type's typanalyze function (std_typanalyze if the type has none).
+ *
+ * Returns the initialized struct when the expression is analyzable, or NULL
+ * (after freeing everything allocated here) when it is not.
+ */
+static VacAttrStats *
+examine_attribute(Node *expr)
+{
+       VacAttrStats *stats;
+       HeapTuple       typtuple;
+       bool            analyzable;
+       int                     slot;
+
+       /* Start from a zeroed VacAttrStats. */
+       stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats));
+
+       /* There is no pg_attribute row for an expression, so fake a zeroed one. */
+       stats->attr = (Form_pg_attribute) palloc0(ATTRIBUTE_FIXED_PART_SIZE);
+       stats->attr->attstattarget = -1;
+
+       /*
+        * There is no column behind the expression, so type, typmod and collation
+        * are all taken from the expression tree itself.
+        */
+       stats->attrtypid = exprType(expr);
+       stats->attrtypmod = exprTypmod(expr);
+       stats->attrcollid = exprCollation(expr);
+
+       /*
+        * We don't actually analyze individual attributes, so no need to set the
+        * memory context.
+        */
+       stats->anl_context = NULL;
+       stats->tupattnum = InvalidAttrNumber;
+
+       /* Fetch a copy of the pg_type row for the expression's type. */
+       typtuple = SearchSysCacheCopy1(TYPEOID,
+                                                                  ObjectIdGetDatum(stats->attrtypid));
+       if (!HeapTupleIsValid(typtuple))
+               elog(ERROR, "cache lookup failed for type %u", stats->attrtypid);
+       stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple);
+
+       /*
+        * Every stavalues[] slot starts out described by the analyzed type; the
+        * typanalyze function called below may override these if it wants to
+        * store something else.
+        */
+       for (slot = 0; slot < STATISTIC_NUM_SLOTS; slot++)
+       {
+               stats->statypid[slot] = stats->attrtypid;
+               stats->statyplen[slot] = stats->attrtype->typlen;
+               stats->statypbyval[slot] = stats->attrtype->typbyval;
+               stats->statypalign[slot] = stats->attrtype->typalign;
+       }
+
+       /*
+        * Prefer the type-specific typanalyze function; fall back to
+        * std_typanalyze() when the type does not provide one.
+        */
+       if (!OidIsValid(stats->attrtype->typanalyze))
+               analyzable = std_typanalyze(stats);
+       else
+               analyzable = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze,
+                                                                                  PointerGetDatum(stats)));
+
+       /*
+        * The expression is analyzable only if typanalyze reported success and
+        * left a non-NULL compute_stats and a positive minrows.
+        */
+       if (analyzable && stats->compute_stats != NULL && stats->minrows > 0)
+               return stats;
+
+       /* Not analyzable: release everything allocated above. */
+       heap_freetuple(typtuple);
+       pfree(stats->attr);
+       pfree(stats);
+
+       return NULL;
+}
+
+/*
+ * examine_expression -- pre-analysis of a single expression
+ *
+ * Determine whether the expression is analyzable; if so, create and initialize
+ * a VacAttrStats struct for it (with 'stattarget' as the faked attstattarget).
+ * If not, release everything allocated here and return NULL.
+ */
+static VacAttrStats *
+examine_expression(Node *expr, int stattarget)
+{
+       HeapTuple       typtuple;
+       VacAttrStats *stats;
+       int                     i;
+       bool            ok;
+
+       Assert(expr != NULL);
+
+       /* Create the VacAttrStats struct. */
+       stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats));
+
+       /* When analyzing an expression, believe the expression tree's type. */
+       stats->attrtypid = exprType(expr);
+       stats->attrtypmod = exprTypmod(expr);
+
+       /*
+        * We don't allow collation to be specified in CREATE STATISTICS, so we
+        * have to use the collation specified for the expression. It's possible
+        * to specify the collation in the expression "(col COLLATE "en_US")" in
+        * which case exprCollation() does the right thing.
+        */
+       stats->attrcollid = exprCollation(expr);
+
+       /*
+        * We don't have any pg_attribute for expressions, so let's fake something
+        * reasonable into attstattarget, which is the only thing std_typanalyze
+        * needs.  Zero the fake attribute (as examine_attribute does) so that the
+        * fields we don't set below have defined contents.
+        */
+       stats->attr = (Form_pg_attribute) palloc0(ATTRIBUTE_FIXED_PART_SIZE);
+
+       /*
+        * We can't have statistics target specified for the expression, so we
+        * could use either the default_statistics_target, or the target computed
+        * for the extended statistics. The second option seems more reasonable.
+        */
+       stats->attr->attstattarget = stattarget;
+
+       /* initialize some basic fields */
+       stats->attr->attrelid = InvalidOid;
+       stats->attr->attnum = InvalidAttrNumber;
+       stats->attr->atttypid = stats->attrtypid;
+
+       typtuple = SearchSysCacheCopy1(TYPEOID,
+                                                                  ObjectIdGetDatum(stats->attrtypid));
+       if (!HeapTupleIsValid(typtuple))
+               elog(ERROR, "cache lookup failed for type %u", stats->attrtypid);
+
+       stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple);
+       stats->anl_context = CurrentMemoryContext;      /* XXX should be using
+                                                                                                * something else? */
+       stats->tupattnum = InvalidAttrNumber;
+
+       /*
+        * The fields describing the stats->stavalues[n] element types default to
+        * the type of the data being analyzed, but the type-specific typanalyze
+        * function can change them if it wants to store something else.
+        */
+       for (i = 0; i < STATISTIC_NUM_SLOTS; i++)
+       {
+               stats->statypid[i] = stats->attrtypid;
+               stats->statyplen[i] = stats->attrtype->typlen;
+               stats->statypbyval[i] = stats->attrtype->typbyval;
+               stats->statypalign[i] = stats->attrtype->typalign;
+       }
+
+       /*
+        * Call the type-specific typanalyze function.  If none is specified, use
+        * std_typanalyze().
+        */
+       if (OidIsValid(stats->attrtype->typanalyze))
+               ok = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze,
+                                                                                  PointerGetDatum(stats)));
+       else
+               ok = std_typanalyze(stats);
+
+       if (!ok || stats->compute_stats == NULL || stats->minrows <= 0)
+       {
+               /* not analyzable: free the type tuple copy, fake attr and struct */
+               heap_freetuple(typtuple);
+               pfree(stats->attr);
+               pfree(stats);
+               return NULL;
+       }
+
+       return stats;
+}
+
 /*
  * Using 'vacatts' of size 'nvacatts' as input data, return a newly built
  * VacAttrStats array which includes only the items corresponding to
@@ -435,15 +702,18 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
  * to the caller that the stats should not be built.
  */
 static VacAttrStats **
-lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
+lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs,
                                          int nvacatts, VacAttrStats **vacatts)
 {
        int                     i = 0;
        int                     x = -1;
+       int                     natts;
        VacAttrStats **stats;
+       ListCell   *lc;
+
+       natts = bms_num_members(attrs) + list_length(exprs);
 
-       stats = (VacAttrStats **)
-               palloc(bms_num_members(attrs) * sizeof(VacAttrStats *));
+       stats = (VacAttrStats **) palloc(natts * sizeof(VacAttrStats *));
 
        /* lookup VacAttrStats info for the requested columns (same attnum) */
        while ((x = bms_next_member(attrs, x)) >= 0)
@@ -480,6 +750,24 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
                i++;
        }
 
+       /* also add info for expressions */
+       foreach(lc, exprs)
+       {
+               Node       *expr = (Node *) lfirst(lc);
+
+               stats[i] = examine_attribute(expr);
+
+               /*
+                * XXX We need tuple descriptor later, and we just grab it from
+                * stats[0]->tupDesc (see e.g. statext_mcv_build). But as coded
+                * examine_attribute does not set that, so just grab it from the first
+                * vacatts element.
+                */
+               stats[i]->tupDesc = vacatts[0]->tupDesc;
+
+               i++;
+       }
+
        return stats;
 }
 
@@ -491,7 +779,7 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
 static void
 statext_store(Oid statOid,
                          MVNDistinct *ndistinct, MVDependencies *dependencies,
-                         MCVList *mcv, VacAttrStats **stats)
+                         MCVList *mcv, Datum exprs, VacAttrStats **stats)
 {
        Relation        pg_stextdata;
        HeapTuple       stup,
@@ -532,11 +820,17 @@ statext_store(Oid statOid,
                nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = (data == NULL);
                values[Anum_pg_statistic_ext_data_stxdmcv - 1] = PointerGetDatum(data);
        }
+       if (exprs != (Datum) 0)
+       {
+               nulls[Anum_pg_statistic_ext_data_stxdexpr - 1] = false;
+               values[Anum_pg_statistic_ext_data_stxdexpr - 1] = exprs;
+       }
 
        /* always replace the value (either by bytea or NULL) */
        replaces[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true;
        replaces[Anum_pg_statistic_ext_data_stxddependencies - 1] = true;
        replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
+       replaces[Anum_pg_statistic_ext_data_stxdexpr - 1] = true;
 
        /* there should already be a pg_statistic_ext_data tuple */
        oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statOid));
@@ -668,7 +962,7 @@ compare_datums_simple(Datum a, Datum b, SortSupport ssup)
  * is not necessary here (and when querying the bitmap).
  */
 AttrNumber *
-build_attnums_array(Bitmapset *attrs, int *numattrs)
+build_attnums_array(Bitmapset *attrs, int nexprs, int *numattrs)
 {
        int                     i,
                                j;
@@ -684,16 +978,19 @@ build_attnums_array(Bitmapset *attrs, int *numattrs)
        j = -1;
        while ((j = bms_next_member(attrs, j)) >= 0)
        {
+               AttrNumber      attnum = (j - nexprs);
+
                /*
                 * Make sure the bitmap contains only user-defined attributes. As
                 * bitmaps can't contain negative values, this can be violated in two
                 * ways. Firstly, the bitmap might contain 0 as a member, and secondly
                 * the integer value might be larger than MaxAttrNumber.
                 */
-               Assert(AttrNumberIsForUserDefinedAttr(j));
-               Assert(j <= MaxAttrNumber);
+               Assert(AttributeNumberIsValid(attnum));
+               Assert(attnum <= MaxAttrNumber);
+               Assert(attnum >= (-nexprs));
 
-               attnums[i++] = (AttrNumber) j;
+               attnums[i++] = (AttrNumber) attnum;
 
                /* protect against overflows */
                Assert(i <= num);
@@ -710,29 +1007,31 @@ build_attnums_array(Bitmapset *attrs, int *numattrs)
  * can simply pfree the return value to release all of it.
  */
 SortItem *
-build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
-                                  MultiSortSupport mss, int numattrs, AttrNumber *attnums)
+build_sorted_items(StatsBuildData *data, int *nitems,
+                                  MultiSortSupport mss,
+                                  int numattrs, AttrNumber *attnums)
 {
        int                     i,
                                j,
                                len,
-                               idx;
-       int                     nvalues = numrows * numattrs;
+                               nrows;
+       int                     nvalues = data->numrows * numattrs;
 
        SortItem   *items;
        Datum      *values;
        bool       *isnull;
        char       *ptr;
+       int                *typlen;
 
        /* Compute the total amount of memory we need (both items and values). */
-       len = numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool));
+       len = data->numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool));
 
        /* Allocate the memory and split it into the pieces. */
        ptr = palloc0(len);
 
        /* items to sort */
        items = (SortItem *) ptr;
-       ptr += numrows * sizeof(SortItem);
+       ptr += data->numrows * sizeof(SortItem);
 
        /* values and null flags */
        values = (Datum *) ptr;
@@ -745,21 +1044,47 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
        Assert((ptr - (char *) items) == len);
 
        /* fix the pointers to Datum and bool arrays */
-       idx = 0;
-       for (i = 0; i < numrows; i++)
+       nrows = 0;
+       for (i = 0; i < data->numrows; i++)
        {
-               bool            toowide = false;
+               items[nrows].values = &values[nrows * numattrs];
+               items[nrows].isnull = &isnull[nrows * numattrs];
 
-               items[idx].values = &values[idx * numattrs];
-               items[idx].isnull = &isnull[idx * numattrs];
+               nrows++;
+       }
+
+       /* build a local cache of typlen for all attributes */
+       typlen = (int *) palloc(sizeof(int) * data->nattnums);
+       for (i = 0; i < data->nattnums; i++)
+               typlen[i] = get_typlen(data->stats[i]->attrtypid);
+
+       nrows = 0;
+       for (i = 0; i < data->numrows; i++)