Extended statistics on expressions

author Tomas Vondra <tomas.vondra@postgresql.org>

Fri, 26 Mar 2021 22:22:01 +0000 (23:22 +0100)

committer Tomas Vondra <tomas.vondra@postgresql.org>

Fri, 26 Mar 2021 23:01:11 +0000 (00:01 +0100)
author Tomas Vondra <tomas.vondra@postgresql.org>
Fri, 26 Mar 2021 22:22:01 +0000 (23:22 +0100)
committer Tomas Vondra <tomas.vondra@postgresql.org>
Fri, 26 Mar 2021 23:01:11 +0000 (00:01 +0100)
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml

index 0f8703af5a585a8865d26202e4fc42e341ecc00b..f103d914a62b90097248bce456d141693c294f32 100644 (file)
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -7385,8 +7385,22 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
         <literal>d</literal> for n-distinct statistics,
         <literal>f</literal> for functional dependency statistics, and
         <literal>m</literal> for most common values (MCV) list statistics
+       as well as <literal>e</literal> for expression statistics
        </para></entry>
       </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>stxexprs</structfield> <type>pg_node_tree</type>
+      </para>
+      <para>
+       Expression trees (in <function>nodeToString()</function>
+       representation) for statistics object attributes that are not simple
+       column references.  This is a list with one element per expression.
+       Null if all statistics object attributes are simple references.
+      </para></entry>
+     </row>
+
      </tbody>
     </tgroup>
    </table>
@@ -7452,7 +7466,7 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
         (references <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>.<structfield>oid</structfield>)
        </para>
        <para>
-       Extended statistic object containing the definition for this data
+       Extended statistics object containing the definition for this data
        </para></entry>
       </row>
  
@@ -7484,6 +7498,15 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
         <structname>pg_mcv_list</structname> type
        </para></entry>
       </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>stxexprs</structfield> <type>pg_node_tree</type>
+      </para>
+      <para>
+       A list of any expressions covered by this statistics object.
+      </para></entry>
+     </row>
      </tbody>
     </tgroup>
    </table>
@@ -7637,6 +7660,16 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
         see <xref linkend="logical-replication-publication"/>.
        </para></entry>
       </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>stxdexpr</structfield> <type>pg_statistic[]</type>
+      </para>
+      <para>
+       Per-expression statistics, serialized as an array of
+       <structname>pg_statistic</structname> type
+      </para></entry>
+     </row>
      </tbody>
     </tgroup>
    </table>
@@ -9444,6 +9477,11 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
        <entry>extended planner statistics</entry>
       </row>
  
+     <row>
+      <entry><link linkend="view-pg-stats-ext-exprs"><structname>pg_stats_ext_exprs</structname></link></entry>
+      <entry>extended planner statistics for expressions</entry>
+     </row>
+
       <row>
        <entry><link linkend="view-pg-tables"><structname>pg_tables</structname></link></entry>
        <entry>tables</entry>
@@ -12696,10 +12734,19 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
         (references <link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.<structfield>attname</structfield>)
        </para>
        <para>
-       Name of the column described by this row
+       Names of the columns included in the extended statistics object
        </para></entry>
       </row>
  
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>exprs</structfield> <type>text[]</type>
+      </para>
+      <para>
+       Expressions included in the extended statistics object
+      </para></entry>
+     </row>
+
       <row>
        <entry role="catalog_table_entry"><para role="column_definition">
         <structfield>inherited</structfield> <type>bool</type>
@@ -12851,7 +12898,8 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
  
    <para>
     The view <structname>pg_stats_ext</structname> provides access to
-   the information stored in the <link
+   information about each extended statistics object in the database,
+   combining information stored in the <link
     linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>
     and <link linkend="catalog-pg-statistic-ext-data"><structname>pg_statistic_ext_data</structname></link>
     catalogs.  This view allows access only to rows of
@@ -12908,7 +12956,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
         (references <link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.<structfield>nspname</structfield>)
        </para>
        <para>
-       Name of schema containing extended statistic
+       Name of schema containing extended statistics object
        </para></entry>
       </row>
  
@@ -12918,7 +12966,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
         (references <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>.<structfield>stxname</structfield>)
        </para>
        <para>
-       Name of extended statistics
+       Name of extended statistics object
        </para></entry>
       </row>
  
@@ -12928,7 +12976,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
         (references <link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.<structfield>rolname</structfield>)
        </para>
        <para>
-       Owner of the extended statistics
+       Owner of the extended statistics object
        </para></entry>
       </row>
  
@@ -12938,7 +12986,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
         (references <link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.<structfield>attname</structfield>)
        </para>
        <para>
-       Names of the columns the extended statistics is defined on
+       Names of the columns the extended statistics object is defined on
        </para></entry>
       </row>
  
@@ -12947,7 +12995,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
         <structfield>kinds</structfield> <type>char[]</type>
        </para>
        <para>
-       Types of extended statistics enabled for this record
+       Types of extended statistics enabled for this statistics object
        </para></entry>
       </row>
  
@@ -13032,6 +13080,237 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
  
   </sect1>
  
+ <sect1 id="view-pg-stats-ext-exprs">
+  <title><structname>pg_stats_ext_exprs</structname></title>
+
+  <indexterm zone="view-pg-stats-ext-exprs">
+   <primary>pg_stats_ext_exprs</primary>
+  </indexterm>
+
+  <para>
+   The view <structname>pg_stats_ext_exprs</structname> provides access to
+   information about all expressions included in extended statistics objects,
+   combining information stored in the <link
+   linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>
+   and <link linkend="catalog-pg-statistic-ext-data"><structname>pg_statistic_ext_data</structname></link>
+   catalogs.  This view allows access only to rows of
+   <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link> and <link linkend="catalog-pg-statistic-ext-data"><structname>pg_statistic_ext_data</structname></link>
+   that correspond to tables the user has permission to read, and therefore
+   it is safe to allow public read access to this view.
+  </para>
+
+  <para>
+   <structname>pg_stats_ext_exprs</structname> is also designed to present
+   the information in a more readable format than the underlying catalogs
+   &mdash; at the cost that its schema must be extended whenever the structure
+   of statistics in <link linkend="catalog-pg-statistic"><structname>pg_statistic</structname></link> changes.
+  </para>
+
+  <table>
+   <title><structname>pg_stats_ext_exprs</structname> Columns</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>schemaname</structfield> <type>name</type>
+       (references <link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.<structfield>nspname</structfield>)
+      </para>
+      <para>
+       Name of schema containing table
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>tablename</structfield> <type>name</type>
+       (references <link linkend="catalog-pg-class"><structname>pg_class</structname></link>.<structfield>relname</structfield>)
+      </para>
+      <para>
+       Name of table the statistics object is defined on
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>statistics_schemaname</structfield> <type>name</type>
+       (references <link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.<structfield>nspname</structfield>)
+      </para>
+      <para>
+       Name of schema containing extended statistics object
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>statistics_name</structfield> <type>name</type>
+       (references <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>.<structfield>stxname</structfield>)
+      </para>
+      <para>
+       Name of extended statistics object
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>statistics_owner</structfield> <type>name</type>
+       (references <link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.<structfield>rolname</structfield>)
+      </para>
+      <para>
+       Owner of the extended statistics object
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>expr</structfield> <type>text</type>
+      </para>
+      <para>
+       Expression included in the extended statistics object
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>null_frac</structfield> <type>float4</type>
+      </para>
+      <para>
+       Fraction of expression entries that are null
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>avg_width</structfield> <type>int4</type>
+      </para>
+      <para>
+       Average width in bytes of expression's entries
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>n_distinct</structfield> <type>float4</type>
+      </para>
+      <para>
+       If greater than zero, the estimated number of distinct values in the
+       expression.  If less than zero, the negative of the number of distinct
+       values divided by the number of rows.  (The negated form is used when
+       <command>ANALYZE</command> believes that the number of distinct values is
+       likely to increase as the table grows; the positive form is used when
+       the expression seems to have a fixed number of possible values.)  For
+       example, -1 indicates a unique expression in which the number of distinct
+       values is the same as the number of rows.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>most_common_vals</structfield> <type>anyarray</type>
+      </para>
+      <para>
+       A list of the most common values in the expression. (Null if
+       no values seem to be more common than any others.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>most_common_freqs</structfield> <type>float4[]</type>
+      </para>
+      <para>
+       A list of the frequencies of the most common values,
+       i.e., number of occurrences of each divided by total number of rows.
+       (Null when <structfield>most_common_vals</structfield> is.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>histogram_bounds</structfield> <type>anyarray</type>
+      </para>
+      <para>
+       A list of values that divide the expression's values into groups of
+       approximately equal population.  The values in
+       <structfield>most_common_vals</structfield>, if present, are omitted from this
+       histogram calculation.  (This column is null if the expression's data type
+       does not have a <literal>&lt;</literal> operator or if the
+       <structfield>most_common_vals</structfield> list accounts for the entire
+       population.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>correlation</structfield> <type>float4</type>
+      </para>
+      <para>
+       Statistical correlation between physical row ordering and
+       logical ordering of the expression values.  This ranges from -1 to +1.
+       When the value is near -1 or +1, an index scan on the expression will
+       be estimated to be cheaper than when it is near zero, due to reduction
+       of random access to the disk.  (This column is null if the expression's
+       data type does not have a <literal>&lt;</literal> operator.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>most_common_elems</structfield> <type>anyarray</type>
+      </para>
+      <para>
+       A list of non-null element values most often appearing within values of
+       the expression. (Null for scalar types.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>most_common_elem_freqs</structfield> <type>float4[]</type>
+      </para>
+      <para>
+       A list of the frequencies of the most common element values, i.e., the
+       fraction of rows containing at least one instance of the given value.
+       Two or three additional values follow the per-element frequencies;
+       these are the minimum and maximum of the preceding per-element
+       frequencies, and optionally the frequency of null elements.
+       (Null when <structfield>most_common_elems</structfield> is.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>elem_count_histogram</structfield> <type>float4[]</type>
+      </para>
+      <para>
+       A histogram of the counts of distinct non-null element values within the
+       values of the expression, followed by the average number of distinct
+       non-null elements.  (Null for scalar types.)
+      </para></entry>
+     </row>
+    </tbody>
+   </tgroup>
+  </table>
+
+  <para>
+   The maximum number of entries in the array fields can be controlled on a
+   column-by-column basis using the <link linkend="sql-altertable"><command>ALTER
+   TABLE SET STATISTICS</command></link> command, or globally by setting the
+   <xref linkend="guc-default-statistics-target"/> run-time parameter.
+  </para>
+
+ </sect1>
+
   <sect1 id="view-pg-tables">
    <title><structname>pg_tables</structname></title>
  
diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml

index 4363be50c3c4d5e42bb3a0d6902efbee19690372..988f4c573ff5e86c6248c0e6bb8458cfd0d65e6f 100644 (file)
--- a/doc/src/sgml/ref/create_statistics.sgml
+++ b/doc/src/sgml/ref/create_statistics.sgml
@@ -21,9 +21,13 @@ PostgreSQL documentation
  
   <refsynopsisdiv>
  <synopsis>
+CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_name</replaceable>
+    ON ( <replaceable class="parameter">expression</replaceable> )
+    FROM <replaceable class="parameter">table_name</replaceable>
+
  CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_name</replaceable>
      [ ( <replaceable class="parameter">statistics_kind</replaceable> [, ... ] ) ]
-    ON <replaceable class="parameter">column_name</replaceable>, <replaceable class="parameter">column_name</replaceable> [, ...]
+    ON { <replaceable class="parameter">column_name</replaceable> | ( <replaceable class="parameter">expression</replaceable> ) }, { <replaceable class="parameter">column_name</replaceable> | ( <replaceable class="parameter">expression</replaceable> ) } [, ...]
      FROM <replaceable class="parameter">table_name</replaceable>
  </synopsis>
  
@@ -39,6 +43,19 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
     database and will be owned by the user issuing the command.
    </para>
  
+  <para>
+   The <command>CREATE STATISTICS</command> command has two basic forms. The
+   first form allows univariate statistics for a single expression to be
+   collected, providing benefits similar to an expression index without the
+   overhead of index maintenance.  This form does not allow the statistics
+   kind to be specified, since the various statistics kinds refer only to
+   multivariate statistics.  The second form of the command allows
+   multivariate statistics on multiple columns and/or expressions to be
+   collected, optionally specifying which statistics kinds to include.  This
+   form will also automatically cause univariate statistics to be collected on
+   any expressions included in the list.
+  </para>
+
    <para>
     If a schema name is given (for example, <literal>CREATE STATISTICS
     myschema.mystat ...</literal>) then the statistics object is created in the
@@ -79,14 +96,16 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
      <term><replaceable class="parameter">statistics_kind</replaceable></term>
      <listitem>
       <para>
-      A statistics kind to be computed in this statistics object.
+      A multivariate statistics kind to be computed in this statistics object.
        Currently supported kinds are
        <literal>ndistinct</literal>, which enables n-distinct statistics,
        <literal>dependencies</literal>, which enables functional
        dependency statistics, and <literal>mcv</literal> which enables
        most-common values lists.
        If this clause is omitted, all supported statistics kinds are
-      included in the statistics object.
+      included in the statistics object. Univariate expression statistics are
+      built automatically if the statistics definition includes any complex
+      expressions rather than just simple column references.
        For more information, see <xref linkend="planner-stats-extended"/>
        and <xref linkend="multivariate-statistics-examples"/>.
       </para>
@@ -98,8 +117,22 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
      <listitem>
       <para>
        The name of a table column to be covered by the computed statistics.
-      At least two column names must be given;  the order of the column names
-      is insignificant.
+      This is only allowed when building multivariate statistics.  At least
+      two column names or expressions must be specified, and their order is
+      not significant.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term><replaceable class="parameter">expression</replaceable></term>
+    <listitem>
+     <para>
+      An expression to be covered by the computed statistics.  This may be
+      used to build univariate statistics on a single expression, or as part
+      of a list of multiple column names and/or expressions to build
+      multivariate statistics.  In the latter case, separate univariate
+      statistics are built automatically for each expression in the list.
       </para>
      </listitem>
     </varlistentry>
@@ -125,6 +158,13 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
     reading it.  Once created, however, the ownership of the statistics
     object is independent of the underlying table(s).
    </para>
+
+  <para>
+   Expression statistics are collected per expression and provide the planner
+   with information similar to what an index on the expression would, without
+   the overhead of index maintenance. Expression statistics are built
+   automatically for each expression in the statistics object definition.
+  </para>
   </refsect1>
  
   <refsect1 id="sql-createstatistics-examples">
@@ -196,6 +236,72 @@ EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 2);
     in the table, allowing it to generate better estimates in both cases.
    </para>
  
+  <para>
+   Create table <structname>t3</structname> with a single timestamp column,
+   and run queries using expressions on that column.  Without extended
+   statistics, the planner has no information about the data distribution for
+   the expressions, and uses default estimates.  The planner also does not
+   realize that the value of the date truncated to the month is fully
+   determined by the value of the date truncated to the day. Then expression
+   and ndistinct statistics are built on those two expressions:
+
+<programlisting>
+CREATE TABLE t3 (
+    a   timestamp
+);
+
+INSERT INTO t3 SELECT i FROM generate_series('2020-01-01'::timestamp,
+                                             '2020-12-31'::timestamp,
+                                             '1 minute'::interval) s(i);
+
+ANALYZE t3;
+
+-- the number of matching rows will be drastically underestimated:
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
+
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
+                                 AND '2020-06-30'::timestamp;
+
+EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
+   FROM t3 GROUP BY 1, 2;
+
+-- build ndistinct statistics on the pair of expressions (per-expression
+-- statistics are built automatically)
+CREATE STATISTICS s3 (ndistinct) ON date_trunc('month', a), date_trunc('day', a) FROM t3;
+
+ANALYZE t3;
+
+-- now the row count estimates are more accurate:
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
+
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
+                                 AND '2020-06-30'::timestamp;
+
+EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
+   FROM t3 GROUP BY 1, 2;
+</programlisting>
+
+   Without expression and ndistinct statistics, the planner has no information
+   about the number of distinct values for the expressions, and has to rely
+   on default estimates. The equality and range conditions are assumed to have
+   0.5% selectivity, and the number of distinct values in the expression is
+   assumed to be the same as for the column (i.e. unique). This results in a
+   significant underestimate of the row count in the first two queries. Moreover,
+   the planner has no information about the relationship between the expressions,
+   so it assumes the two <literal>WHERE</literal> and <literal>GROUP BY</literal>
+   conditions are independent, and multiplies their selectivities together to
+   arrive at a severe overestimate of the group count in the aggregate query.
+   This is further exacerbated by the lack of accurate statistics for the
+   expressions, forcing the planner to use a default ndistinct estimate for the
+   expression derived from ndistinct for the column. With the expression and
+   ndistinct statistics in place, the planner recognizes that the conditions
+   are correlated, and arrives at much more accurate estimates.
+  </para>
+
   </refsect1>
  
   <refsect1>
diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile

index 70bc2123df7bf23111cf0ca289acdb8812910a9f..e36a9602c129abbe1dd1b0b106a1cb684165ace9 100644 (file)
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -49,15 +49,15 @@ include $(top_srcdir)/src/backend/common.mk
  
  # Note: the order of this list determines the order in which the catalog
  # header files are assembled into postgres.bki.  BKI_BOOTSTRAP catalogs
-# must appear first, and there are reputedly other, undocumented ordering
-# dependencies.
+# must appear first, and pg_statistic before pg_statistic_ext_data, and
+# there are reputedly other, undocumented ordering dependencies.
  CATALOG_HEADERS := \
         pg_proc.h pg_type.h pg_attribute.h pg_class.h \
         pg_attrdef.h pg_constraint.h pg_inherits.h pg_index.h pg_operator.h \
         pg_opfamily.h pg_opclass.h pg_am.h pg_amop.h pg_amproc.h \
         pg_language.h pg_largeobject_metadata.h pg_largeobject.h pg_aggregate.h \
-       pg_statistic_ext.h pg_statistic_ext_data.h \
-       pg_statistic.h pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \
+       pg_statistic.h pg_statistic_ext.h pg_statistic_ext_data.h \
+       pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \
         pg_cast.h pg_enum.h pg_namespace.h pg_conversion.h pg_depend.h \
         pg_database.h pg_db_role_setting.h pg_tablespace.h \
         pg_authid.h pg_auth_members.h pg_shdepend.h pg_shdescription.h \
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql

index 0dca65dc7bb9fe920d60f84be73741bc6680662c..6483563204cfbd4242a492c2fdefd117205dbde9 100644 (file)
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -264,6 +264,7 @@ CREATE VIEW pg_stats_ext WITH (security_barrier) AS
                    JOIN pg_attribute a
                         ON (a.attrelid = s.stxrelid AND a.attnum = k)
             ) AS attnames,
+           pg_get_statisticsobjdef_expressions(s.oid) as exprs,
             s.stxkind AS kinds,
             sd.stxdndistinct AS n_distinct,
             sd.stxddependencies AS dependencies,
@@ -290,6 +291,74 @@ CREATE VIEW pg_stats_ext WITH (security_barrier) AS
                  WHERE NOT has_column_privilege(c.oid, a.attnum, 'select') )
      AND (c.relrowsecurity = false OR NOT row_security_active(c.oid));
  
+-- pg_stats_ext_exprs presents one row per expression of each extended
+-- statistics object, pairing the expression text with its per-expression
+-- statistics (read from pg_statistic_ext_data.stxdexpr as pg_statistic rows).
+--
+-- Visibility is restricted for every row: the current user must be able to
+-- use the role that owns the table, and row-level security must not be
+-- active for the table.  An ownership test is used rather than per-column
+-- privileges because an expression may reference any column of the table.
+CREATE VIEW pg_stats_ext_exprs WITH (security_barrier) AS
+    SELECT cn.nspname AS schemaname,
+           c.relname AS tablename,
+           sn.nspname AS statistics_schemaname,
+           s.stxname AS statistics_name,
+           pg_get_userbyid(s.stxowner) AS statistics_owner,
+           stat.expr,
+           (stat.a).stanullfrac AS null_frac,
+           (stat.a).stawidth AS avg_width,
+           (stat.a).stadistinct AS n_distinct,
+           -- Each CASE below scans the five statistics slots for the slot
+           -- kind that carries the requested statistic and yields NULL when
+           -- no slot has that kind.
+           -- kind 1: most common values (stavalues) and frequencies (stanumbers)
+           (CASE
+               WHEN (stat.a).stakind1 = 1 THEN (stat.a).stavalues1
+               WHEN (stat.a).stakind2 = 1 THEN (stat.a).stavalues2
+               WHEN (stat.a).stakind3 = 1 THEN (stat.a).stavalues3
+               WHEN (stat.a).stakind4 = 1 THEN (stat.a).stavalues4
+               WHEN (stat.a).stakind5 = 1 THEN (stat.a).stavalues5
+           END) AS most_common_vals,
+           (CASE
+               WHEN (stat.a).stakind1 = 1 THEN (stat.a).stanumbers1
+               WHEN (stat.a).stakind2 = 1 THEN (stat.a).stanumbers2
+               WHEN (stat.a).stakind3 = 1 THEN (stat.a).stanumbers3
+               WHEN (stat.a).stakind4 = 1 THEN (stat.a).stanumbers4
+               WHEN (stat.a).stakind5 = 1 THEN (stat.a).stanumbers5
+           END) AS most_common_freqs,
+           -- kind 2: histogram bounds
+           (CASE
+               WHEN (stat.a).stakind1 = 2 THEN (stat.a).stavalues1
+               WHEN (stat.a).stakind2 = 2 THEN (stat.a).stavalues2
+               WHEN (stat.a).stakind3 = 2 THEN (stat.a).stavalues3
+               WHEN (stat.a).stakind4 = 2 THEN (stat.a).stavalues4
+               WHEN (stat.a).stakind5 = 2 THEN (stat.a).stavalues5
+           END) AS histogram_bounds,
+           -- kind 3: correlation, taken from the first stanumbers element
+           (CASE
+               WHEN (stat.a).stakind1 = 3 THEN (stat.a).stanumbers1[1]
+               WHEN (stat.a).stakind2 = 3 THEN (stat.a).stanumbers2[1]
+               WHEN (stat.a).stakind3 = 3 THEN (stat.a).stanumbers3[1]
+               WHEN (stat.a).stakind4 = 3 THEN (stat.a).stanumbers4[1]
+               WHEN (stat.a).stakind5 = 3 THEN (stat.a).stanumbers5[1]
+           END) AS correlation,
+           -- kind 4: most common elements (stavalues) and frequencies (stanumbers)
+           (CASE
+               WHEN (stat.a).stakind1 = 4 THEN (stat.a).stavalues1
+               WHEN (stat.a).stakind2 = 4 THEN (stat.a).stavalues2
+               WHEN (stat.a).stakind3 = 4 THEN (stat.a).stavalues3
+               WHEN (stat.a).stakind4 = 4 THEN (stat.a).stavalues4
+               WHEN (stat.a).stakind5 = 4 THEN (stat.a).stavalues5
+           END) AS most_common_elems,
+           (CASE
+               WHEN (stat.a).stakind1 = 4 THEN (stat.a).stanumbers1
+               WHEN (stat.a).stakind2 = 4 THEN (stat.a).stanumbers2
+               WHEN (stat.a).stakind3 = 4 THEN (stat.a).stanumbers3
+               WHEN (stat.a).stakind4 = 4 THEN (stat.a).stanumbers4
+               WHEN (stat.a).stakind5 = 4 THEN (stat.a).stanumbers5
+           END) AS most_common_elem_freqs,
+           -- kind 5: element count histogram
+           (CASE
+               WHEN (stat.a).stakind1 = 5 THEN (stat.a).stanumbers1
+               WHEN (stat.a).stakind2 = 5 THEN (stat.a).stanumbers2
+               WHEN (stat.a).stakind3 = 5 THEN (stat.a).stanumbers3
+               WHEN (stat.a).stakind4 = 5 THEN (stat.a).stanumbers4
+               WHEN (stat.a).stakind5 = 5 THEN (stat.a).stanumbers5
+           END) AS elem_count_histogram
+    FROM pg_statistic_ext s JOIN pg_class c ON (c.oid = s.stxrelid)
+         LEFT JOIN pg_statistic_ext_data sd ON (s.oid = sd.stxoid)
+         LEFT JOIN pg_namespace cn ON (cn.oid = c.relnamespace)
+         LEFT JOIN pg_namespace sn ON (sn.oid = s.stxnamespace)
+         -- Unnest both arrays side by side so that each expression lines up
+         -- with its statistics entry.  Rows without an expression are dropped.
+         JOIN LATERAL (
+             SELECT unnest(pg_get_statisticsobjdef_expressions(s.oid)) AS expr,
+                    unnest(sd.stxdexpr)::pg_statistic AS a
+         ) stat ON (stat.expr IS NOT NULL)
+    -- Without this filter any user could read value samples (MCV lists,
+    -- histogram bounds) computed from tables they have no access to.
+    WHERE pg_has_role(c.relowner, 'USAGE')
+    AND (c.relrowsecurity = false OR NOT row_security_active(c.oid));
+
  -- unprivileged users may read pg_statistic_ext but not pg_statistic_ext_data
  REVOKE ALL on pg_statistic_ext_data FROM public;
  
diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c

index 2bae205845992abbc87c4a7cddc4994c35ccb189..df4768952d5b8bd12c02fc25b7f1c018b110daa9 100644 (file)
--- a/src/backend/commands/statscmds.c
+++ b/src/backend/commands/statscmds.c
@@ -29,6 +29,8 @@
  #include "commands/comment.h"
  #include "commands/defrem.h"
  #include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/optimizer.h"
  #include "statistics/statistics.h"
  #include "utils/builtins.h"
  #include "utils/fmgroids.h"
@@ -62,7 +64,8 @@ ObjectAddress
  CreateStatistics(CreateStatsStmt *stmt)
  {
         int16           attnums[STATS_MAX_DIMENSIONS];
-       int                     numcols = 0;
+       int                     nattnums = 0;
+       int                     numcols;
         char       *namestr;
         NameData        stxname;
         Oid                     statoid;
@@ -74,21 +77,25 @@ CreateStatistics(CreateStatsStmt *stmt)
         Datum           datavalues[Natts_pg_statistic_ext_data];
         bool            datanulls[Natts_pg_statistic_ext_data];
         int2vector *stxkeys;
+       List       *stxexprs = NIL;
+       Datum           exprsDatum;
         Relation        statrel;
         Relation        datarel;
         Relation        rel = NULL;
         Oid                     relid;
         ObjectAddress parentobject,
                                 myself;
-       Datum           types[3];               /* one for each possible type of statistic */
+       Datum           types[4];               /* one for each possible type of statistic */
         int                     ntypes;
         ArrayType  *stxkind;
         bool            build_ndistinct;
         bool            build_dependencies;
         bool            build_mcv;
+       bool            build_expressions;
         bool            requested_type = false;
         int                     i;
         ListCell   *cell;
+       ListCell   *cell2;
  
         Assert(IsA(stmt, CreateStatsStmt));
  
@@ -190,101 +197,124 @@ CreateStatistics(CreateStatsStmt *stmt)
         }
  
         /*
-        * Currently, we only allow simple column references in the expression
-        * list.  That will change someday, and again the grammar already supports
-        * it so we have to enforce restrictions here.  For now, we can convert
-        * the expression list to a simple array of attnums.  While at it, enforce
-        * some constraints.
+        * Make sure no more than STATS_MAX_DIMENSIONS columns are used. There
+        * might be duplicates and so on, but we'll deal with those later.
+        */
+       numcols = list_length(stmt->exprs);
+       if (numcols > STATS_MAX_DIMENSIONS)
+               ereport(ERROR,
+                               (errcode(ERRCODE_TOO_MANY_COLUMNS),
+                                errmsg("cannot have more than %d columns in statistics",
+                                               STATS_MAX_DIMENSIONS)));
+
+       /*
+        * Convert the expression list to a simple array of attnums, but also keep
+        * a list of more complex expressions.  While at it, enforce some
+        * constraints.
+        *
+        * XXX We do only the bare minimum to separate simple attribute and
+        * complex expressions - for example "(a)" will be treated as a complex
+        * expression. No matter how elaborate the check is, there'll always be a
+        * way around it, if the user is determined (consider e.g. "(a+0)"), so
+        * it's not worth protecting against it.
          */
         foreach(cell, stmt->exprs)
         {
                 Node       *expr = (Node *) lfirst(cell);
-               ColumnRef  *cref;
-               char       *attname;
+               StatsElem  *selem;
                 HeapTuple       atttuple;
                 Form_pg_attribute attForm;
                 TypeCacheEntry *type;
  
-               if (!IsA(expr, ColumnRef))
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                        errmsg("only simple column references are allowed in CREATE STATISTICS")));
-               cref = (ColumnRef *) expr;
-
-               if (list_length(cref->fields) != 1)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                        errmsg("only simple column references are allowed in CREATE STATISTICS")));
-               attname = strVal((Value *) linitial(cref->fields));
-
-               atttuple = SearchSysCacheAttName(relid, attname);
-               if (!HeapTupleIsValid(atttuple))
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_UNDEFINED_COLUMN),
-                                        errmsg("column \"%s\" does not exist",
-                                                       attname)));
-               attForm = (Form_pg_attribute) GETSTRUCT(atttuple);
-
-               /* Disallow use of system attributes in extended stats */
-               if (attForm->attnum <= 0)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                        errmsg("statistics creation on system columns is not supported")));
-
-               /* Disallow data types without a less-than operator */
-               type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR);
-               if (type->lt_opr == InvalidOid)
+               /*
+                * We should not get anything other than StatsElem, given the grammar.
+                * But let's keep it as a safety.
+                */
+               if (!IsA(expr, StatsElem))
                         ereport(ERROR,
                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                        errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
-                                                       attname, format_type_be(attForm->atttypid))));
+                                        errmsg("only simple column references and expressions are allowed in CREATE STATISTICS")));
  
-               /* Make sure no more than STATS_MAX_DIMENSIONS columns are used */
-               if (numcols >= STATS_MAX_DIMENSIONS)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_TOO_MANY_COLUMNS),
-                                        errmsg("cannot have more than %d columns in statistics",
-                                                       STATS_MAX_DIMENSIONS)));
+               selem = (StatsElem *) expr;
  
-               attnums[numcols] = attForm->attnum;
-               numcols++;
-               ReleaseSysCache(atttuple);
+               if (selem->name)                /* column reference */
+               {
+                       char       *attname;
+
+                       attname = selem->name;
+
+                       atttuple = SearchSysCacheAttName(relid, attname);
+                       if (!HeapTupleIsValid(atttuple))
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_UNDEFINED_COLUMN),
+                                                errmsg("column \"%s\" does not exist",
+                                                               attname)));
+                       attForm = (Form_pg_attribute) GETSTRUCT(atttuple);
+
+                       /* Disallow use of system attributes in extended stats */
+                       if (attForm->attnum <= 0)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                                errmsg("statistics creation on system columns is not supported")));
+
+                       /* Disallow data types without a less-than operator */
+                       type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR);
+                       if (type->lt_opr == InvalidOid)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                                errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
+                                                               attname, format_type_be(attForm->atttypid))));
+
+                       attnums[nattnums] = attForm->attnum;
+                       nattnums++;
+                       ReleaseSysCache(atttuple);
+               }
+               else                                    /* expression */
+               {
+                       Node       *expr = selem->expr;
+                       Oid                     atttype;
+
+                       Assert(expr != NULL);
+
+                       /*
+                        * Disallow data types without a less-than operator.
+                        *
+                        * We ignore this for statistics on a single expression, in which
+                        * case we'll build the regular statistics only (and that code can
+                        * deal with such data types).
+                        */
+                       if (list_length(stmt->exprs) > 1)
+                       {
+                               atttype = exprType(expr);
+                               type = lookup_type_cache(atttype, TYPECACHE_LT_OPR);
+                               if (type->lt_opr == InvalidOid)
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                                        errmsg("expression cannot be used in multivariate statistics because its type %s has no default btree operator class",
+                                                                       format_type_be(atttype))));
+                       }
+
+                       stxexprs = lappend(stxexprs, expr);
+               }
         }
  
         /*
-        * Check that at least two columns were specified in the statement. The
-        * upper bound was already checked in the loop above.
-        */
-       if (numcols < 2)
-               ereport(ERROR,
-                               (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
-                                errmsg("extended statistics require at least 2 columns")));
-
-       /*
-        * Sort the attnums, which makes detecting duplicates somewhat easier, and
-        * it does not hurt (it does not affect the efficiency, unlike for
-        * indexes, for example).
-        */
-       qsort(attnums, numcols, sizeof(int16), compare_int16);
-
-       /*
-        * Check for duplicates in the list of columns. The attnums are sorted so
-        * just check consecutive elements.
+        * Parse the statistics kinds.
+        *
+        * First check that if this is the case with a single expression, there
+        * are no statistics kinds specified (we don't allow that for the simple
+        * CREATE STATISTICS form).
          */
-       for (i = 1; i < numcols; i++)
+       if ((list_length(stmt->exprs) == 1) && (list_length(stxexprs) == 1))
         {
-               if (attnums[i] == attnums[i - 1])
+               /* statistics kinds must not be specified */
+               if (list_length(stmt->stat_types) > 0)
                         ereport(ERROR,
-                                       (errcode(ERRCODE_DUPLICATE_COLUMN),
-                                        errmsg("duplicate column name in statistics definition")));
+                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                        errmsg("when building statistics on a single expression, statistics kinds may not be specified")));
         }
  
-       /* Form an int2vector representation of the sorted column list */
-       stxkeys = buildint2vector(attnums, numcols);
-
-       /*
-        * Parse the statistics kinds.
-        */
+       /* OK, let's check that we recognize the statistics kinds. */
         build_ndistinct = false;
         build_dependencies = false;
         build_mcv = false;
@@ -313,14 +343,91 @@ CreateStatistics(CreateStatsStmt *stmt)
                                          errmsg("unrecognized statistics kind \"%s\"",
                                                         type)));
         }
-       /* If no statistic type was specified, build them all. */
-       if (!requested_type)
+
+       /*
+        * If no statistic type was specified, build them all (but only when the
+        * statistics is defined on more than one column/expression).
+        */
+       if ((!requested_type) && (numcols >= 2))
         {
                 build_ndistinct = true;
                 build_dependencies = true;
                 build_mcv = true;
         }
  
+       /*
+        * When there are non-trivial expressions, build the expression stats
+        * automatically. This allows calculating good estimates for stats that
+        * consider per-clause estimates (e.g. functional dependencies).
+        */
+       build_expressions = (list_length(stxexprs) > 0);
+
+       /*
+        * Check that at least two columns were specified in the statement, or
+        * that we're building statistics on a single expression.
+        */
+       if ((numcols < 2) && (list_length(stxexprs) != 1))
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                                errmsg("extended statistics require at least 2 columns")));
+
+       /*
+        * Sort the attnums, which makes detecting duplicates somewhat easier, and
+        * it does not hurt (it does not matter for the contents, unlike for
+        * indexes, for example).
+        */
+       qsort(attnums, nattnums, sizeof(int16), compare_int16);
+
+       /*
+        * Check for duplicates in the list of columns. The attnums are sorted so
+        * just check consecutive elements.
+        */
+       for (i = 1; i < nattnums; i++)
+       {
+               if (attnums[i] == attnums[i - 1])
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_DUPLICATE_COLUMN),
+                                        errmsg("duplicate column name in statistics definition")));
+       }
+
+       /*
+        * Check for duplicate expressions. We do two loops, counting the
+        * occurrences of each expression. This is O(N^2) but we only allow a
+        * small number of expressions and it's not executed often.
+        *
+        * XXX We don't cross-check attributes and expressions, because it does
+        * not seem worth it. In principle we could check that expressions don't
+        * contain trivial attribute references like "(a)", but the reasoning is
+        * similar to why we don't bother with extracting columns from
+        * expressions. It's either expensive or very easy to defeat for a
+        * determined user, and there's no risk if we allow such statistics (the
+        * statistics is useless, but harmless).
+        */
+       foreach(cell, stxexprs)
+       {
+               Node       *expr1 = (Node *) lfirst(cell);
+               int                     cnt = 0;
+
+               foreach(cell2, stxexprs)
+               {
+                       Node       *expr2 = (Node *) lfirst(cell2);
+
+                       if (equal(expr1, expr2))
+                               cnt += 1;
+               }
+
+               /* every expression should find at least itself */
+               Assert(cnt >= 1);
+
+               if (cnt > 1)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_DUPLICATE_COLUMN),
+                                        errmsg("duplicate expression in statistics definition")));
+       }
+
+       /* Form an int2vector representation of the sorted column list */
+       stxkeys = buildint2vector(attnums, nattnums);
+
         /* construct the char array of enabled statistic types */
         ntypes = 0;
         if (build_ndistinct)
@@ -329,9 +436,23 @@ CreateStatistics(CreateStatsStmt *stmt)
                 types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES);
         if (build_mcv)
                 types[ntypes++] = CharGetDatum(STATS_EXT_MCV);
+       if (build_expressions)
+               types[ntypes++] = CharGetDatum(STATS_EXT_EXPRESSIONS);
         Assert(ntypes > 0 && ntypes <= lengthof(types));
         stxkind = construct_array(types, ntypes, CHAROID, 1, true, TYPALIGN_CHAR);
  
+       /* convert the expressions (if any) to a text datum */
+       if (stxexprs != NIL)
+       {
+               char       *exprsString;
+
+               exprsString = nodeToString(stxexprs);
+               exprsDatum = CStringGetTextDatum(exprsString);
+               pfree(exprsString);
+       }
+       else
+               exprsDatum = (Datum) 0;
+
         statrel = table_open(StatisticExtRelationId, RowExclusiveLock);
  
         /*
@@ -351,6 +472,10 @@ CreateStatistics(CreateStatsStmt *stmt)
         values[Anum_pg_statistic_ext_stxkeys - 1] = PointerGetDatum(stxkeys);
         values[Anum_pg_statistic_ext_stxkind - 1] = PointerGetDatum(stxkind);
  
+       values[Anum_pg_statistic_ext_stxexprs - 1] = exprsDatum;
+       if (exprsDatum == (Datum) 0)
+               nulls[Anum_pg_statistic_ext_stxexprs - 1] = true;
+
         /* insert it into pg_statistic_ext */
         htup = heap_form_tuple(statrel->rd_att, values, nulls);
         CatalogTupleInsert(statrel, htup);
@@ -373,6 +498,7 @@ CreateStatistics(CreateStatsStmt *stmt)
         datanulls[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true;
         datanulls[Anum_pg_statistic_ext_data_stxddependencies - 1] = true;
         datanulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
+       datanulls[Anum_pg_statistic_ext_data_stxdexpr - 1] = true;
  
         /* insert it into pg_statistic_ext_data */
         htup = heap_form_tuple(datarel->rd_att, datavalues, datanulls);
@@ -396,12 +522,41 @@ CreateStatistics(CreateStatsStmt *stmt)
          */
         ObjectAddressSet(myself, StatisticExtRelationId, statoid);
  
-       for (i = 0; i < numcols; i++)
+       /* add dependencies for plain column references */
+       for (i = 0; i < nattnums; i++)
         {
                 ObjectAddressSubSet(parentobject, RelationRelationId, relid, attnums[i]);
                 recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO);
         }
  
+       /*
+        * If there are no dependencies on a column, give the statistics an auto
+        * dependency on the whole table.  In most cases, this will be redundant,
+        * but it might not be if the statistics expressions contain no Vars
+        * (which might seem strange, but is possible). This is consistent with
+        * what we do for indexes in index_create.
+        *
+        * XXX We intentionally don't consider the expressions before adding this
+        * dependency, because recordDependencyOnSingleRelExpr may not create any
+        * dependencies for whole-row Vars.
+        */
+       if (!nattnums)
+       {
+               ObjectAddressSet(parentobject, RelationRelationId, relid);
+               recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO);
+       }
+
+       /*
+        * Store dependencies on anything mentioned in statistics expressions,
+        * just like we do for index expressions.
+        */
+       if (stxexprs)
+               recordDependencyOnSingleRelExpr(&myself,
+                                                                               (Node *) stxexprs,
+                                                                               relid,
+                                                                               DEPENDENCY_NORMAL,
+                                                                               DEPENDENCY_AUTO, false, true);
+
         /*
          * Also add dependencies on namespace and owner.  These are required
          * because the stats object might have a different namespace and/or owner
@@ -582,87 +737,6 @@ RemoveStatisticsById(Oid statsOid)
         table_close(relation, RowExclusiveLock);
  }
  
-/*
- * Update a statistics object for ALTER COLUMN TYPE on a source column.
- *
- * This could throw an error if the type change can't be supported.
- * If it can be supported, but the stats must be recomputed, a likely choice
- * would be to set the relevant column(s) of the pg_statistic_ext_data tuple
- * to null until the next ANALYZE.  (Note that the type change hasn't actually
- * happened yet, so one option that's *not* on the table is to recompute
- * immediately.)
- *
- * For both ndistinct and functional-dependencies stats, the on-disk
- * representation is independent of the source column data types, and it is
- * plausible to assume that the old statistic values will still be good for
- * the new column contents.  (Obviously, if the ALTER COLUMN TYPE has a USING
- * expression that substantially alters the semantic meaning of the column
- * values, this assumption could fail.  But that seems like a corner case
- * that doesn't justify zapping the stats in common cases.)
- *
- * For MCV lists that's not the case, as those statistics store the datums
- * internally. In this case we simply reset the statistics value to NULL.
- *
- * Note that "type change" includes collation change, which means we can rely
- * on the MCV list being consistent with the collation info in pg_attribute
- * during estimation.
- */
-void
-UpdateStatisticsForTypeChange(Oid statsOid, Oid relationOid, int attnum,
-                                                         Oid oldColumnType, Oid newColumnType)
-{
-       HeapTuple       stup,
-                               oldtup;
-
-       Relation        rel;
-
-       Datum           values[Natts_pg_statistic_ext_data];
-       bool            nulls[Natts_pg_statistic_ext_data];
-       bool            replaces[Natts_pg_statistic_ext_data];
-
-       oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statsOid));
-       if (!HeapTupleIsValid(oldtup))
-               elog(ERROR, "cache lookup failed for statistics object %u", statsOid);
-
-       /*
-        * When none of the defined statistics types contain datum values from the
-        * table's columns then there's no need to reset the stats. Functional
-        * dependencies and ndistinct stats should still hold true.
-        */
-       if (!statext_is_kind_built(oldtup, STATS_EXT_MCV))
-       {
-               ReleaseSysCache(oldtup);
-               return;
-       }
-
-       /*
-        * OK, we need to reset some statistics. So let's build the new tuple,
-        * replacing the affected statistics types with NULL.
-        */
-       memset(nulls, 0, Natts_pg_statistic_ext_data * sizeof(bool));
-       memset(replaces, 0, Natts_pg_statistic_ext_data * sizeof(bool));
-       memset(values, 0, Natts_pg_statistic_ext_data * sizeof(Datum));
-
-       replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
-       nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
-
-       rel = table_open(StatisticExtDataRelationId, RowExclusiveLock);
-
-       /* replace the old tuple */
-       stup = heap_modify_tuple(oldtup,
-                                                        RelationGetDescr(rel),
-                                                        values,
-                                                        nulls,
-                                                        replaces);
-
-       ReleaseSysCache(oldtup);
-       CatalogTupleUpdate(rel, &stup->t_self, stup);
-
-       heap_freetuple(stup);
-
-       table_close(rel, RowExclusiveLock);
-}
-
  /*
   * Select a nonconflicting name for a new statistics.
   *
@@ -731,18 +805,27 @@ ChooseExtendedStatisticNameAddition(List *exprs)
         buf[0] = '\0';
         foreach(lc, exprs)
         {
-               ColumnRef  *cref = (ColumnRef *) lfirst(lc);
+               StatsElem  *selem = (StatsElem *) lfirst(lc);
                 const char *name;
  
                 /* It should be one of these, but just skip if it happens not to be */
-               if (!IsA(cref, ColumnRef))
+               if (!IsA(selem, StatsElem))
                         continue;
  
-               name = strVal((Value *) linitial(cref->fields));
+               name = selem->name;
  
                 if (buflen > 0)
                         buf[buflen++] = '_';    /* insert _ between names */
  
+               /*
+                * We use fixed 'expr' for expressions, which have empty column names.
+                * For indexes this is handled in ChooseIndexColumnNames, but we have
+                * no such function for stats and it does not seem worth adding. If a
+                * better name is needed, the user can specify it explicitly.
+                */
+               if (!name)
+                       name = "expr";
+
                 /*
                  * At this point we have buflen <= NAMEDATALEN.  name should be less
                  * than NAMEDATALEN already, but use strlcpy for paranoia.
@@ -754,3 +837,29 @@ ChooseExtendedStatisticNameAddition(List *exprs)
         }
         return pstrdup(buf);
  }
+
+/*
+ * StatisticsGetRelation: given a statistics object's OID, get the OID of
+ * the relation it is defined on.  Uses the system cache.
+ */
+Oid
+StatisticsGetRelation(Oid statId, bool missing_ok)
+{
+       HeapTuple       tuple;
+       Form_pg_statistic_ext stx;
+       Oid                     result;
+
+       tuple = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statId));
+       if (!HeapTupleIsValid(tuple))
+       {
+               if (missing_ok)
+                       return InvalidOid;
+               elog(ERROR, "cache lookup failed for statistics object %u", statId);
+       }
+       stx = (Form_pg_statistic_ext) GETSTRUCT(tuple);
+       Assert(stx->oid == statId);
+
+       result = stx->stxrelid;
+       ReleaseSysCache(tuple);
+       return result;
+}
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c

index efac06f72c7465603a684813cd66c618cf5c4336..88a68a4697ad1d26a5430c9b0753b549c2c10abc 100644 (file)
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -41,6 +41,7 @@
  #include "catalog/pg_namespace.h"
  #include "catalog/pg_opclass.h"
  #include "catalog/pg_tablespace.h"
+#include "catalog/pg_statistic_ext.h"
  #include "catalog/pg_trigger.h"
  #include "catalog/pg_type.h"
  #include "catalog/storage.h"
@@ -188,6 +189,8 @@ typedef struct AlteredTableInfo
         List       *changedIndexDefs;   /* string definitions of same */
         char       *replicaIdentityIndex;       /* index to reset as REPLICA IDENTITY */
         char       *clusterOnIndex; /* index to use for CLUSTER */
+       List       *changedStatisticsOids;      /* OIDs of statistics to rebuild */
+       List       *changedStatisticsDefs;      /* string definitions of same */
  } AlteredTableInfo;
  
  /* Struct describing one new constraint to check in Phase 3 scan */
@@ -440,6 +443,8 @@ static ObjectAddress ATExecDropColumn(List **wqueue, Relation rel, const char *c
                                                                           ObjectAddresses *addrs);
  static ObjectAddress ATExecAddIndex(AlteredTableInfo *tab, Relation rel,
                                                                         IndexStmt *stmt, bool is_rebuild, LOCKMODE lockmode);
+static ObjectAddress ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
+                                                                                CreateStatsStmt *stmt, bool is_rebuild, LOCKMODE lockmode);
  static ObjectAddress ATExecAddConstraint(List **wqueue,
                                                                                  AlteredTableInfo *tab, Relation rel,
                                                                                  Constraint *newConstraint, bool recurse, bool is_readd,
@@ -496,6 +501,7 @@ static ObjectAddress ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel,
                                                                                    AlterTableCmd *cmd, LOCKMODE lockmode);
  static void RememberConstraintForRebuilding(Oid conoid, AlteredTableInfo *tab);
  static void RememberIndexForRebuilding(Oid indoid, AlteredTableInfo *tab);
+static void RememberStatisticsForRebuilding(Oid indoid, AlteredTableInfo *tab);
  static void ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab,
                                                                    LOCKMODE lockmode);
  static void ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId,
@@ -4756,6 +4762,10 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab,
                         address = ATExecAddIndex(tab, rel, (IndexStmt *) cmd->def, true,
                                                                          lockmode);
                         break;
+               case AT_ReAddStatistics:        /* ADD STATISTICS */
+                       address = ATExecAddStatistics(tab, rel, (CreateStatsStmt *) cmd->def,
+                                                                                 true, lockmode);
+                       break;
                 case AT_AddConstraint:  /* ADD CONSTRAINT */
                         /* Transform the command only during initial examination */
                         if (cur_pass == AT_PASS_ADD_CONSTR)
@@ -8283,6 +8293,29 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel,
         return address;
  }
  
+/*
+ * ALTER TABLE ADD STATISTICS
+ *
+ * There is no such command in the grammar, but this is used internally to run
+ * AT_ReAddStatistics subcommands, which rebuild extended statistics after a
+ * table column type change.
+ */
+static ObjectAddress
+ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
+                                       CreateStatsStmt *stmt, bool is_rebuild, LOCKMODE lockmode)
+{
+       ObjectAddress address;
+
+       Assert(IsA(stmt, CreateStatsStmt));
+
+       /* The CreateStatsStmt has already been through transformStatsStmt */
+       Assert(stmt->transformed);
+
+       address = CreateStatistics(stmt);
+
+       return address;
+}
+
  /*
   * ALTER TABLE ADD CONSTRAINT USING INDEX
   *
@@ -11830,9 +11863,7 @@ ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel,
                                  * Give the extended-stats machinery a chance to fix anything
                                  * that this column type change would break.
                                  */
-                               UpdateStatisticsForTypeChange(foundObject.objectId,
-                                                                                         RelationGetRelid(rel), attnum,
-                                                                                         attTup->atttypid, targettype);
+                               RememberStatisticsForRebuilding(foundObject.objectId, tab);
                                 break;
  
                         case OCLASS_PROC:
@@ -12202,6 +12233,32 @@ RememberIndexForRebuilding(Oid indoid, AlteredTableInfo *tab)
         }
  }
  
+/*
+ * Subroutine for ATExecAlterColumnType: remember that a statistics object
+ * needs to be rebuilt (which we might already know).
+ */
+static void
+RememberStatisticsForRebuilding(Oid stxoid, AlteredTableInfo *tab)
+{
+       /*
+        * This de-duplication check is critical for two independent reasons: we
+        * mustn't try to recreate the same statistics object twice, and if the
+        * statistics depends on more than one column whose type is to be altered,
+        * we must capture its definition string before applying any of the type
+        * changes. ruleutils.c will get confused if we ask again later.
+        */
+       if (!list_member_oid(tab->changedStatisticsOids, stxoid))
+       {
+               /* OK, capture the statistics object's existing definition string */
+               char       *defstring = pg_get_statisticsobjdef_string(stxoid);
+
+               tab->changedStatisticsOids = lappend_oid(tab->changedStatisticsOids,
+                                                                                                stxoid);
+               tab->changedStatisticsDefs = lappend(tab->changedStatisticsDefs,
+                                                                                        defstring);
+       }
+}
+
  /*
   * Cleanup after we've finished all the ALTER TYPE operations for a
   * particular relation.  We have to drop and recreate all the indexes
@@ -12306,6 +12363,22 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode)
                 add_exact_object_address(&obj, objects);
         }
  
+       /* add dependencies for new statistics */
+       forboth(oid_item, tab->changedStatisticsOids,
+                       def_item, tab->changedStatisticsDefs)
+       {
+               Oid                     oldId = lfirst_oid(oid_item);
+               Oid                     relid;
+
+               relid = StatisticsGetRelation(oldId, false);
+               ATPostAlterTypeParse(oldId, relid, InvalidOid,
+                                                        (char *) lfirst(def_item),
+                                                        wqueue, lockmode, tab->rewrite);
+
+               ObjectAddressSet(obj, StatisticExtRelationId, oldId);
+               add_exact_object_address(&obj, objects);
+       }
+
         /*
          * Queue up command to restore replica identity index marking
          */
@@ -12354,9 +12427,9 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode)
  }
  
  /*
- * Parse the previously-saved definition string for a constraint or index
- * against the newly-established column data type(s), and queue up the
- * resulting command parsetrees for execution.
+ * Parse the previously-saved definition string for a constraint, index or
+ * statistics object against the newly-established column data type(s), and
+ * queue up the resulting command parsetrees for execution.
   *
   * This might fail if, for example, you have a WHERE clause that uses an
   * operator that's not available for the new column type.
@@ -12402,6 +12475,11 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd,
                         querytree_list = lappend(querytree_list, stmt);
                         querytree_list = list_concat(querytree_list, afterStmts);
                 }
+               else if (IsA(stmt, CreateStatsStmt))
+                       querytree_list = lappend(querytree_list,
+                                                                        transformStatsStmt(oldRelId,
+                                                                                                               (CreateStatsStmt *) stmt,
+                                                                                                               cmd));
                 else
                         querytree_list = lappend(querytree_list, stmt);
         }
@@ -12540,6 +12618,20 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd,
                                 elog(ERROR, "unexpected statement subtype: %d",
                                          (int) stmt->subtype);
                 }
+               else if (IsA(stm, CreateStatsStmt))
+               {
+                       CreateStatsStmt  *stmt = (CreateStatsStmt *) stm;
+                       AlterTableCmd *newcmd;
+
+                       /* keep the statistics object's comment */
+                       stmt->stxcomment = GetComment(oldId, StatisticExtRelationId, 0);
+
+                       newcmd = makeNode(AlterTableCmd);
+                       newcmd->subtype = AT_ReAddStatistics;
+                       newcmd->def = (Node *) stmt;
+                       tab->subcmds[AT_PASS_MISC] =
+                               lappend(tab->subcmds[AT_PASS_MISC], newcmd);
+               }
                 else
                         elog(ERROR, "unexpected statement type: %d",
                                  (int) nodeTag(stm));
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c

index 38b56231b7df2fb0890b17e5026d295578b5fbe5..d5b1ad4567044ecc762a414c0c607dddf69318d9 100644 (file)
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -2980,6 +2980,17 @@ _copyIndexElem(const IndexElem *from)
         return newnode;
  }
  
+/*
+ * Copy a StatsElem node (one target of CREATE STATISTICS): both of its
+ * fields, name and expr, are copied into a freshly made node.
+ */
+static StatsElem *
+_copyStatsElem(const StatsElem *from)
+{
+       StatsElem  *newnode = makeNode(StatsElem);
+
+       COPY_STRING_FIELD(name);
+       COPY_NODE_FIELD(expr);
+
+       return newnode;
+}
+
  static ColumnDef *
  _copyColumnDef(const ColumnDef *from)
  {
@@ -5699,6 +5710,9 @@ copyObjectImpl(const void *from)
                 case T_IndexElem:
                         retval = _copyIndexElem(from);
                         break;
+               case T_StatsElem:
+                       retval = _copyStatsElem(from);
+                       break;
                 case T_ColumnDef:
                         retval = _copyColumnDef(from);
                         break;
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c

index 3292dda34245084dc2b5e7a4fe7f9d8d713f6410..d46909bbc4f3a9606785d13a41810ccf57a050bb 100644 (file)
--- a/src/backend/nodes/equalfuncs.c
+++ b/src/backend/nodes/equalfuncs.c
@@ -2596,6 +2596,16 @@ _equalIndexElem(const IndexElem *a, const IndexElem *b)
         return true;
  }
  
+
+/*
+ * Compare two StatsElem nodes field by field (name, then expr); the result
+ * is true only if neither comparison reports a difference.
+ */
+static bool
+_equalStatsElem(const StatsElem *a, const StatsElem *b)
+{
+       COMPARE_STRING_FIELD(name);
+       COMPARE_NODE_FIELD(expr);
+
+       return true;
+}
+
  static bool
  _equalColumnDef(const ColumnDef *a, const ColumnDef *b)
  {
@@ -3724,6 +3734,9 @@ equal(const void *a, const void *b)
                 case T_IndexElem:
                         retval = _equalIndexElem(a, b);
                         break;
+               case T_StatsElem:
+                       retval = _equalStatsElem(a, b);
+                       break;
                 case T_ColumnDef:
                         retval = _equalColumnDef(a, b);
                         break;
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c

index 9f7918c7e901d270272f941c4aa9bdc07ddeefd8..12561c475768035cc683b796fa8dbafcec0c99a2 100644 (file)
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -2943,6 +2943,15 @@ _outIndexElem(StringInfo str, const IndexElem *node)
         WRITE_ENUM_FIELD(nulls_ordering, SortByNulls);
  }
  
+/*
+ * Write a StatsElem node to str, labelled "STATSELEM", followed by its name
+ * and expr fields.
+ */
+static void
+_outStatsElem(StringInfo str, const StatsElem *node)
+{
+       WRITE_NODE_TYPE("STATSELEM");
+
+       WRITE_STRING_FIELD(name);
+       WRITE_NODE_FIELD(expr);
+}
+
  static void
  _outQuery(StringInfo str, const Query *node)
  {
@@ -4286,6 +4295,9 @@ outNode(StringInfo str, const void *obj)
                         case T_IndexElem:
                                 _outIndexElem(str, obj);
                                 break;
+                       case T_StatsElem:
+                               _outStatsElem(str, obj);
+                               break;
                         case T_Query:
                                 _outQuery(str, obj);
                                 break;
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c

index 6c39bf893f886a92eaa1a58a79f16e91fdaf2506..0fa8875f0910d31b3b878d9a35bf0142e96b5043 100644 (file)
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -34,6 +34,7 @@
  #include "foreign/fdwapi.h"
  #include "miscadmin.h"
  #include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
  #include "nodes/supportnodes.h"
  #include "optimizer/clauses.h"
  #include "optimizer/cost.h"
@@ -1308,6 +1309,7 @@ get_relation_constraints(PlannerInfo *root,
  static List *
  get_relation_statistics(RelOptInfo *rel, Relation relation)
  {
+       Index           varno = rel->relid;
         List       *statoidlist;
         List       *stainfos = NIL;
         ListCell   *l;
@@ -1321,6 +1323,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
                 HeapTuple       htup;
                 HeapTuple       dtup;
                 Bitmapset  *keys = NULL;
+               List       *exprs = NIL;
                 int                     i;
  
                 htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
@@ -1340,6 +1343,49 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
                 for (i = 0; i < staForm->stxkeys.dim1; i++)
                         keys = bms_add_member(keys, staForm->stxkeys.values[i]);
  
+               /*
+                * Preprocess expressions (if any). We read the expressions, run them
+                * through eval_const_expressions, and fix the varnos.
+                */
+               {
+                       bool            isnull;
+                       Datum           datum;
+
+                       /* decode expression (if any) */
+                       datum = SysCacheGetAttr(STATEXTOID, htup,
+                                                                       Anum_pg_statistic_ext_stxexprs, &isnull);
+
+                       if (!isnull)
+                       {
+                               char       *exprsString;
+
+                               exprsString = TextDatumGetCString(datum);
+                               exprs = (List *) stringToNode(exprsString);
+                               pfree(exprsString);
+
+                               /*
+                                * Run the expressions through eval_const_expressions. This is
+                                * not just an optimization, but is necessary, because the
+                                * planner will be comparing them to similarly-processed qual
+                                * clauses, and may fail to detect valid matches without this.
+                                * We must not use canonicalize_qual, however, since these
+                                * aren't qual expressions.
+                                */
+                               exprs = (List *) eval_const_expressions(NULL, (Node *) exprs);
+
+                               /* May as well fix opfuncids too */
+                               fix_opfuncids((Node *) exprs);
+
+                               /*
+                                * Modify the expression trees we just built to have the
+                                * correct varno for the parent relation, so that they match
+                                * up correctly against qual clauses.
+                                */
+                               if (varno != 1)
+                                       ChangeVarNodes((Node *) exprs, 1, varno, 0);
+                       }
+               }
+
                 /* add one StatisticExtInfo for each kind built */
                 if (statext_is_kind_built(dtup, STATS_EXT_NDISTINCT))
                 {
@@ -1349,6 +1395,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
                         info->rel = rel;
                         info->kind = STATS_EXT_NDISTINCT;
                         info->keys = bms_copy(keys);
+                       info->exprs = exprs;
  
                         stainfos = lappend(stainfos, info);
                 }
@@ -1361,6 +1408,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
                         info->rel = rel;
                         info->kind = STATS_EXT_DEPENDENCIES;
                         info->keys = bms_copy(keys);
+                       info->exprs = exprs;
  
                         stainfos = lappend(stainfos, info);
                 }
@@ -1373,6 +1421,20 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
                         info->rel = rel;
                         info->kind = STATS_EXT_MCV;
                         info->keys = bms_copy(keys);
+                       info->exprs = exprs;
+
+                       stainfos = lappend(stainfos, info);
+               }
+
+               if (statext_is_kind_built(dtup, STATS_EXT_EXPRESSIONS))
+               {
+                       StatisticExtInfo *info = makeNode(StatisticExtInfo);
+
+                       info->statOid = statOid;
+                       info->rel = rel;
+                       info->kind = STATS_EXT_EXPRESSIONS;
+                       info->keys = bms_copy(keys);
+                       info->exprs = exprs;
  
                         stainfos = lappend(stainfos, info);
                 }
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y

index 2132cf4d828c6252351dab6abac6d2e94dee6565..7ff36bc84225aa5182ce7bb1e870348b7341cd55 100644 (file)
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -239,6 +239,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
         WindowDef                       *windef;
         JoinExpr                        *jexpr;
         IndexElem                       *ielem;
+       StatsElem                       *selem;
         Alias                           *alias;
         RangeVar                        *range;
         IntoClause                      *into;
@@ -405,7 +406,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
                                 old_aggr_definition old_aggr_list
                                 oper_argtypes RuleActionList RuleActionMulti
                                 opt_column_list columnList opt_name_list
-                               sort_clause opt_sort_clause sortby_list index_params
+                               sort_clause opt_sort_clause sortby_list index_params stats_params
                                 opt_include opt_c_include index_including_params
                                 name_list role_list from_clause from_list opt_array_bounds
                                 qualified_name_list any_name any_name_list type_name_list
@@ -512,6 +513,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
  %type <list>   func_alias_clause
  %type <sortby> sortby
  %type <ielem>  index_elem index_elem_options
+%type <selem>  stats_param
  %type <node>   table_ref
  %type <jexpr>  joined_table
  %type <range>  relation_expr
@@ -4097,7 +4099,7 @@ ExistingIndex:   USING INDEX name                                 { $$ = $3; }
  
  CreateStatsStmt:
                         CREATE STATISTICS any_name
-                       opt_name_list ON expr_list FROM from_list
+                       opt_name_list ON stats_params FROM from_list
                                 {
                                         CreateStatsStmt *n = makeNode(CreateStatsStmt);
                                         n->defnames = $3;
@@ -4109,7 +4111,7 @@ CreateStatsStmt:
                                         $$ = (Node *)n;
                                 }
                         | CREATE STATISTICS IF_P NOT EXISTS any_name
-                       opt_name_list ON expr_list FROM from_list
+                       opt_name_list ON stats_params FROM from_list
                                 {
                                         CreateStatsStmt *n = makeNode(CreateStatsStmt);
                                         n->defnames = $6;
@@ -4122,6 +4124,36 @@ CreateStatsStmt:
                                 }
                         ;
  
+/*
+ * Statistics attributes can be either simple column references, or arbitrary
+ * expressions in parens.  For compatibility with index attributes permitted
+ * in CREATE INDEX, we allow an expression that's just a function call to be
+ * written without parens.
+ *
+ * Each attribute is returned as a StatsElem in which exactly one of "name"
+ * (simple column reference) and "expr" (expression) is set; stats_params
+ * collects them into a list in the order written.
+ */
+
+stats_params:  stats_param                                                     { $$ = list_make1($1); }
+                       | stats_params ',' stats_param                  { $$ = lappend($1, $3); }
+               ;
+
+stats_param:   ColId
+                               {
+                                       /* simple column reference: only the name is set */
+                                       $$ = makeNode(StatsElem);
+                                       $$->name = $1;
+                                       $$->expr = NULL;
+                               }
+                       | func_expr_windowless
+                               {
+                                       /* bare function call, accepted without parens */
+                                       $$ = makeNode(StatsElem);
+                                       $$->name = NULL;
+                                       $$->expr = $1;
+                               }
+                       | '(' a_expr ')'
+                               {
+                                       /* arbitrary expression, which must be parenthesized */
+                                       $$ = makeNode(StatsElem);
+                                       $$->name = NULL;
+                                       $$->expr = $2;
+                               }
+               ;
  
  /*****************************************************************************
   *
diff --git a/src/backend/parser/parse_agg.c b/src/backend/parser/parse_agg.c

index 7c3e01aa22b5a9e96b7d476138a50ae35d30a0c3..ceb0bf597d679aeefd93b942d86feee1935c2b81 100644 (file)
--- a/src/backend/parser/parse_agg.c
+++ b/src/backend/parser/parse_agg.c
@@ -484,6 +484,13 @@ check_agglevels_and_constraints(ParseState *pstate, Node *expr)
                         else
                                 err = _("grouping operations are not allowed in index predicates");
  
+                       break;
+               case EXPR_KIND_STATS_EXPRESSION:
+                       if (isAgg)
+                               err = _("aggregate functions are not allowed in statistics expressions");
+                       else
+                               err = _("grouping operations are not allowed in statistics expressions");
+
                         break;
                 case EXPR_KIND_ALTER_COL_TRANSFORM:
                         if (isAgg)
@@ -910,6 +917,9 @@ transformWindowFuncCall(ParseState *pstate, WindowFunc *wfunc,
                 case EXPR_KIND_INDEX_EXPRESSION:
                         err = _("window functions are not allowed in index expressions");
                         break;
+               case EXPR_KIND_STATS_EXPRESSION:
+                       err = _("window functions are not allowed in statistics expressions");
+                       break;
                 case EXPR_KIND_INDEX_PREDICATE:
                         err = _("window functions are not allowed in index predicates");
                         break;
diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c

index f869e159d63f7d1a4a3c29c36692da43d0491a96..03373d551fcb9cfed7cd4562bee5a4391dd92374 100644 (file)
--- a/src/backend/parser/parse_expr.c
+++ b/src/backend/parser/parse_expr.c
@@ -500,6 +500,7 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref)
                 case EXPR_KIND_FUNCTION_DEFAULT:
                 case EXPR_KIND_INDEX_EXPRESSION:
                 case EXPR_KIND_INDEX_PREDICATE:
+               case EXPR_KIND_STATS_EXPRESSION:
                 case EXPR_KIND_ALTER_COL_TRANSFORM:
                 case EXPR_KIND_EXECUTE_PARAMETER:
                 case EXPR_KIND_TRIGGER_WHEN:
@@ -1741,6 +1742,9 @@ transformSubLink(ParseState *pstate, SubLink *sublink)
                 case EXPR_KIND_INDEX_PREDICATE:
                         err = _("cannot use subquery in index predicate");
                         break;
+               case EXPR_KIND_STATS_EXPRESSION:
+                       err = _("cannot use subquery in statistics expression");
+                       break;
                 case EXPR_KIND_ALTER_COL_TRANSFORM:
                         err = _("cannot use subquery in transform expression");
                         break;
@@ -3030,6 +3034,8 @@ ParseExprKindName(ParseExprKind exprKind)
                         return "index expression";
                 case EXPR_KIND_INDEX_PREDICATE:
                         return "index predicate";
+               case EXPR_KIND_STATS_EXPRESSION:
+                       return "statistics expression";
                 case EXPR_KIND_ALTER_COL_TRANSFORM:
                         return "USING";
                 case EXPR_KIND_EXECUTE_PARAMETER:
diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c

index 37cebc7d829cc658fad4553893cf75dcbc6f6082..debef1d14fba1adb0d50b6cd256fc76e1c0ce0e7 100644 (file)
--- a/src/backend/parser/parse_func.c
+++ b/src/backend/parser/parse_func.c
@@ -2503,6 +2503,9 @@ check_srf_call_placement(ParseState *pstate, Node *last_srf, int location)
                 case EXPR_KIND_INDEX_PREDICATE:
                         err = _("set-returning functions are not allowed in index predicates");
                         break;
+               case EXPR_KIND_STATS_EXPRESSION:
+                       err = _("set-returning functions are not allowed in statistics expressions");
+                       break;
                 case EXPR_KIND_ALTER_COL_TRANSFORM:
                         err = _("set-returning functions are not allowed in transform expressions");
                         break;
diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c

index aa6c19adada7af8e1aeb592b890ff3f5a90c2add..b968c25dd69181411c47c0b7bf5d60288106a300 100644 (file)
--- a/src/backend/parser/parse_utilcmd.c
+++ b/src/backend/parser/parse_utilcmd.c
@@ -1917,6 +1917,9 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
                         stat_types = lappend(stat_types, makeString("dependencies"));
                 else if (enabled[i] == STATS_EXT_MCV)
                         stat_types = lappend(stat_types, makeString("mcv"));
+               else if (enabled[i] == STATS_EXT_EXPRESSIONS)
+                       /* expression stats are not exposed to users */
+                       continue;
                 else
                         elog(ERROR, "unrecognized statistics kind %c", enabled[i]);
         }
@@ -1924,14 +1927,47 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
         /* Determine which columns the statistics are on */
         for (i = 0; i < statsrec->stxkeys.dim1; i++)
         {
-               ColumnRef  *cref = makeNode(ColumnRef);
+               StatsElem  *selem = makeNode(StatsElem);
                 AttrNumber      attnum = statsrec->stxkeys.values[i];
  
-               cref->fields = list_make1(makeString(get_attname(heapRelid,
-                                                                                                                attnum, false)));
-               cref->location = -1;
+               selem->name = get_attname(heapRelid, attnum, false);
+               selem->expr = NULL;
  
-               def_names = lappend(def_names, cref);
+               def_names = lappend(def_names, selem);
+       }
+
+       /*
+        * Now handle expressions, if there are any. The order (with respect to
+        * regular attributes) does not really matter for extended stats, so we
+        * simply append them after simple column references.
+        *
+        * XXX Some places during build/estimation treat expressions as if they
+        * are before attributes, but for the CREATE command that's entirely
+        * irrelevant.
+        */
+       datum = SysCacheGetAttr(STATEXTOID, ht_stats,
+                                                       Anum_pg_statistic_ext_stxexprs, &isnull);
+
+       if (!isnull)
+       {
+               ListCell   *lc;
+               List       *exprs = NIL;
+               char       *exprsString;
+
+               exprsString = TextDatumGetCString(datum);
+               exprs = (List *) stringToNode(exprsString);
+
+               foreach(lc, exprs)
+               {
+                       StatsElem  *selem = makeNode(StatsElem);
+
+                       selem->name = NULL;
+                       selem->expr = (Node *) lfirst(lc);
+
+                       def_names = lappend(def_names, selem);
+               }
+
+               pfree(exprsString);
         }
  
         /* finally, build the output node */
@@ -1942,6 +1978,7 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
         stats->relations = list_make1(heapRel);
         stats->stxcomment = NULL;
         stats->if_not_exists = false;
+       stats->transformed = true;      /* don't need transformStatsStmt again */
  
         /* Clean up */
         ReleaseSysCache(ht_stats);
@@ -2866,6 +2903,84 @@ transformIndexStmt(Oid relid, IndexStmt *stmt, const char *queryString)
         return stmt;
  }
  
+/*
+ * transformStatsStmt - parse analysis for CREATE STATISTICS
+ *
+ * Runs parse transformation and collation assignment on every expression
+ * listed in stmt->exprs (StatsElem entries whose expr is set; entries that
+ * are plain column names are left alone).
+ *
+ * relid is the OID of the table the statistics are defined on; the caller
+ * must already hold a suitable lock on it.  queryString is the source text,
+ * made available to the parse state as p_sourcetext.
+ *
+ * Returns a transformed copy of the statement with "transformed" set; the
+ * passed-in statement is not modified.  If the statement is already marked
+ * as transformed, it is returned as-is.
+ *
+ * To avoid race conditions, it's important that this function rely only on
+ * the passed-in relid (and not on stmt->relations) to determine the target
+ * relation.
+ */
+CreateStatsStmt *
+transformStatsStmt(Oid relid, CreateStatsStmt *stmt, const char *queryString)
+{
+       ParseState *pstate;
+       ParseNamespaceItem *nsitem;
+       ListCell   *l;
+       Relation        rel;
+
+       /* Nothing to do if statement already transformed. */
+       if (stmt->transformed)
+               return stmt;
+
+       /*
+        * We must not scribble on the passed-in CreateStatsStmt, so copy it.
+        * (This is overkill, but easy.)
+        */
+       stmt = copyObject(stmt);
+
+       /* Set up pstate */
+       pstate = make_parsestate(NULL);
+       pstate->p_sourcetext = queryString;
+
+       /*
+        * Put the parent table into the rtable so that the expressions can refer
+        * to its fields without qualification.  Caller is responsible for locking
+        * relation, but we still need to open it.
+        */
+       rel = relation_open(relid, NoLock);
+       nsitem = addRangeTableEntryForRelation(pstate, rel,
+                                                                                  AccessShareLock,
+                                                                                  NULL, false, true);
+
+       /* no to join list, yes to namespaces */
+       addNSItemToQuery(pstate, nsitem, false, true, true);
+
+       /* take care of any expressions */
+       foreach(l, stmt->exprs)
+       {
+               StatsElem  *selem = (StatsElem *) lfirst(l);
+
+               /* plain column references carry no expression to transform */
+               if (selem->expr)
+               {
+                       /* Now do parse transformation of the expression */
+                       selem->expr = transformExpr(pstate, selem->expr,
+                                                                               EXPR_KIND_STATS_EXPRESSION);
+
+                       /* We have to fix its collations too */
+                       assign_expr_collations(pstate, selem->expr);
+               }
+       }
+
+       /*
+        * Check that only the base rel is mentioned.  (This should be dead code
+        * now that add_missing_from is history.)
+        */
+       if (list_length(pstate->p_rtable) != 1)
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+                                errmsg("statistics expressions can refer only to the table being referenced")));
+
+       free_parsestate(pstate);
+
+       /* Close relation, without releasing the caller's lock */
+       table_close(rel, NoLock);
+
+       /* Mark statement as successfully transformed */
+       stmt->transformed = true;
+
+       return stmt;
+}
+
  
  /*
   * transformRuleStmt -
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c

index eac92851651e81227b32f72b389a29336826f9c5..cf8a6d5f68bd5167aa1f9655702782318a874f91 100644 (file)
--- a/src/backend/statistics/dependencies.c
+++ b/src/backend/statistics/dependencies.c
@@ -70,15 +70,15 @@ static void generate_dependencies(DependencyGenerator state);
  static DependencyGenerator DependencyGenerator_init(int n, int k);
  static void DependencyGenerator_free(DependencyGenerator state);
  static AttrNumber *DependencyGenerator_next(DependencyGenerator state);
-static double dependency_degree(int numrows, HeapTuple *rows, int k,
-                                                               AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs);
+static double dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency);
  static bool dependency_is_fully_matched(MVDependency *dependency,
                                                                                 Bitmapset *attnums);
  static bool dependency_is_compatible_clause(Node *clause, Index relid,
                                                                                         AttrNumber *attnum);
+static bool dependency_is_compatible_expression(Node *clause, Index relid,
+                                                                                               List *statlist, Node **expr);
  static MVDependency *find_strongest_dependency(MVDependencies **dependencies,
-                                                                                          int ndependencies,
-                                                                                          Bitmapset *attnums);
+                                                                                          int ndependencies, Bitmapset *attnums);
  static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
                                                                                                  int varRelid, JoinType jointype,
                                                                                                  SpecialJoinInfo *sjinfo,
@@ -219,16 +219,13 @@ DependencyGenerator_next(DependencyGenerator state)
   * the last one.
   */
  static double
-dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
-                                 VacAttrStats **stats, Bitmapset *attrs)
+dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency)
  {
         int                     i,
                                 nitems;
         MultiSortSupport mss;
         SortItem   *items;
-       AttrNumber *attnums;
         AttrNumber *attnums_dep;
-       int                     numattrs;
  
         /* counters valid within a group */
         int                     group_size = 0;
@@ -244,15 +241,12 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
         mss = multi_sort_init(k);
  
         /*
-        * Transform the attrs from bitmap to an array to make accessing the i-th
-        * member easier, and then construct a filtered version with only attnums
-        * referenced by the dependency we validate.
+        * Translate the array of indexes to regular attnums for the dependency (we
+        * will need this to identify the columns in StatsBuildData).
          */
-       attnums = build_attnums_array(attrs, &numattrs);
-
         attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
         for (i = 0; i < k; i++)
-               attnums_dep[i] = attnums[dependency[i]];
+               attnums_dep[i] = data->attnums[dependency[i]];
  
         /*
          * Verify the dependency (a,b,...)->z, using a rather simple algorithm:
@@ -270,7 +264,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
         /* prepare the sort function for the dimensions */
         for (i = 0; i < k; i++)
         {
-               VacAttrStats *colstat = stats[dependency[i]];
+               VacAttrStats *colstat = data->stats[dependency[i]];
                 TypeCacheEntry *type;
  
                 type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
@@ -289,8 +283,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
          * descriptor.  For now that assumption holds, but it might change in the
          * future for example if we support statistics on multiple tables.
          */
-       items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
-                                                          mss, k, attnums_dep);
+       items = build_sorted_items(data, &nitems, mss, k, attnums_dep);
  
         /*
          * Walk through the sorted array, split it into rows according to the
@@ -336,11 +329,10 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
                 pfree(items);
  
         pfree(mss);
-       pfree(attnums);
         pfree(attnums_dep);
  
         /* Compute the 'degree of validity' as (supporting/total). */
-       return (n_supporting_rows * 1.0 / numrows);
+       return (n_supporting_rows * 1.0 / data->numrows);
  }
  
  /*
@@ -360,23 +352,15 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
   *        (c) -> b
   */
  MVDependencies *
-statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
-                                                  VacAttrStats **stats)
+statext_dependencies_build(StatsBuildData *data)
  {
         int                     i,
                                 k;
-       int                     numattrs;
-       AttrNumber *attnums;
  
         /* result */
         MVDependencies *dependencies = NULL;
  
-       /*
-        * Transform the bms into an array, to make accessing i-th member easier.
-        */
-       attnums = build_attnums_array(attrs, &numattrs);
-
-       Assert(numattrs >= 2);
+       Assert(data->nattnums >= 2);
  
         /*
          * We'll try build functional dependencies starting from the smallest ones
@@ -384,12 +368,12 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
          * included in the statistics object.  We start from the smallest ones
          * because we want to be able to skip already implied ones.
          */
-       for (k = 2; k <= numattrs; k++)
+       for (k = 2; k <= data->nattnums; k++)
         {
                 AttrNumber *dependency; /* array with k elements */
  
                 /* prepare a DependencyGenerator of variation */
-               DependencyGenerator DependencyGenerator = DependencyGenerator_init(numattrs, k);
+               DependencyGenerator DependencyGenerator = DependencyGenerator_init(data->nattnums, k);
  
                 /* generate all possible variations of k values (out of n) */
                 while ((dependency = DependencyGenerator_next(DependencyGenerator)))
@@ -398,7 +382,7 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
                         MVDependency *d;
  
                         /* compute how valid the dependency seems */
-                       degree = dependency_degree(numrows, rows, k, dependency, stats, attrs);
+                       degree = dependency_degree(data, k, dependency);
  
                         /*
                          * if the dependency seems entirely invalid, don't store it
@@ -413,7 +397,7 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
                         d->degree = degree;
                         d->nattributes = k;
                         for (i = 0; i < k; i++)
-                               d->attributes[i] = attnums[dependency[i]];
+                               d->attributes[i] = data->attnums[dependency[i]];
  
                         /* initialize the list of dependencies */
                         if (dependencies == NULL)
@@ -747,6 +731,7 @@ static bool
  dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
  {
         Var                *var;
+       Node       *clause_expr;
  
         if (IsA(clause, RestrictInfo))
         {
@@ -774,9 +759,9 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
  
                 /* Make sure non-selected argument is a pseudoconstant. */
                 if (is_pseudo_constant_clause(lsecond(expr->args)))
-                       var = linitial(expr->args);
+                       clause_expr = linitial(expr->args);
                 else if (is_pseudo_constant_clause(linitial(expr->args)))
-                       var = lsecond(expr->args);
+                       clause_expr = lsecond(expr->args);
                 else
                         return false;
  
@@ -805,8 +790,8 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
                 /*
                  * Reject ALL() variant, we only care about ANY/IN.
                  *
-                * FIXME Maybe we should check if all the values are the same, and
-                * allow ALL in that case? Doesn't seem very practical, though.
+                * XXX Maybe we should check if all the values are the same, and allow
+                * ALL in that case? Doesn't seem very practical, though.
                  */
                 if (!expr->useOr)
                         return false;
@@ -822,7 +807,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
                 if (!is_pseudo_constant_clause(lsecond(expr->args)))
                         return false;
  
-               var = linitial(expr->args);
+               clause_expr = linitial(expr->args);
  
                 /*
                  * If it's not an "=" operator, just ignore the clause, as it's not
@@ -838,13 +823,13 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
         }
         else if (is_orclause(clause))
         {
-               BoolExpr   *expr = (BoolExpr *) clause;
+               BoolExpr   *bool_expr = (BoolExpr *) clause;
                 ListCell   *lc;
  
                 /* start with no attribute number */
                 *attnum = InvalidAttrNumber;
  
-               foreach(lc, expr->args)
+               foreach(lc, bool_expr->args)
                 {
                         AttrNumber      clause_attnum;
  
@@ -859,6 +844,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
                         if (*attnum == InvalidAttrNumber)
                                 *attnum = clause_attnum;
  
+                       /* ensure all the variables are the same (same attnum) */
                         if (*attnum != clause_attnum)
                                 return false;
                 }
@@ -872,7 +858,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
                  * "NOT x" can be interpreted as "x = false", so get the argument and
                  * proceed with seeing if it's a suitable Var.
                  */
-               var = (Var *) get_notclausearg(clause);
+               clause_expr = (Node *) get_notclausearg(clause);
         }
         else
         {
@@ -880,20 +866,23 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
                  * A boolean expression "x" can be interpreted as "x = true", so
                  * proceed with seeing if it's a suitable Var.
                  */
-               var = (Var *) clause;
+               clause_expr = (Node *) clause;
         }
  
         /*
          * We may ignore any RelabelType node above the operand.  (There won't be
          * more than one, since eval_const_expressions has been applied already.)
          */
-       if (IsA(var, RelabelType))
-               var = (Var *) ((RelabelType *) var)->arg;
+       if (IsA(clause_expr, RelabelType))
+               clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
  
         /* We only support plain Vars for now */
-       if (!IsA(var, Var))
+       if (!IsA(clause_expr, Var))
                 return false;
  
+       /* OK, we know we have a Var */
+       var = (Var *) clause_expr;
+
         /* Ensure Var is from the correct relation */
         if (var->varno != relid)
                 return false;
@@ -1157,6 +1146,212 @@ clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
         return s1;
  }
  
+/*
+ * dependency_is_compatible_expression
+ *             Determines if the expression is compatible with functional dependencies
+ *
+ * Similar to dependency_is_compatible_clause, but doesn't enforce that the
+ * expression is a simple Var. OTOH we check that there's at least one
+ * statistics object matching the expression.
+ */
+static bool
+dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr)
+{
+       List       *vars;
+       ListCell   *lc,
+                          *lc2;
+       Node       *clause_expr;
+
+       if (IsA(clause, RestrictInfo))
+       {
+               RestrictInfo *rinfo = (RestrictInfo *) clause;
+
+               /* Pseudoconstants are not interesting (they couldn't contain a Var) */
+               if (rinfo->pseudoconstant)
+                       return false;
+
+               /* Clauses referencing multiple, or no, varnos are incompatible */
+               if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
+                       return false;
+
+               clause = (Node *) rinfo->clause;
+       }
+
+       if (is_opclause(clause))
+       {
+               /* If it's an opclause, check for Var = Const or Const = Var. */
+               OpExpr     *expr = (OpExpr *) clause;
+
+               /* Only expressions with two arguments are candidates. */
+               if (list_length(expr->args) != 2)
+                       return false;
+
+               /* Make sure non-selected argument is a pseudoconstant. */
+               if (is_pseudo_constant_clause(lsecond(expr->args)))
+                       clause_expr = linitial(expr->args);
+               else if (is_pseudo_constant_clause(linitial(expr->args)))
+                       clause_expr = lsecond(expr->args);
+               else
+                       return false;
+
+               /*
+                * If it's not an "=" operator, just ignore the clause, as it's not
+                * compatible with functional dependencies.
+                *
+                * This uses the function for estimating selectivity, not the operator
+                * directly (a bit awkward, but well ...).
+                *
+                * XXX this is pretty dubious; probably it'd be better to check btree
+                * or hash opclass membership, so as not to be fooled by custom
+                * selectivity functions, and to be more consistent with decisions
+                * elsewhere in the planner.
+                */
+               if (get_oprrest(expr->opno) != F_EQSEL)
+                       return false;
+
+               /* OK to proceed with checking "clause_expr" */
+       }
+       else if (IsA(clause, ScalarArrayOpExpr))
+       {
+               /* If it's a scalar array operator, check for Var IN Const. */
+               ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
+
+               /*
+                * Reject ALL() variant, we only care about ANY/IN.
+                *
+                * XXX Maybe we should check if all the values are the same, and allow
+                * ALL in that case? Doesn't seem very practical, though.
+                */
+               if (!expr->useOr)
+                       return false;
+
+               /* Only expressions with two arguments are candidates. */
+               if (list_length(expr->args) != 2)
+                       return false;
+
+               /*
+                * We know it's always (Var IN Const), so we assume the var is the
+                * first argument, and pseudoconstant is the second one.
+                */
+               if (!is_pseudo_constant_clause(lsecond(expr->args)))
+                       return false;
+
+               clause_expr = linitial(expr->args);
+
+               /*
+                * If it's not an "=" operator, just ignore the clause, as it's not
+                * compatible with functional dependencies. The operator is identified
+                * simply by looking at which function it uses to estimate
+                * selectivity. That's a bit strange, but it's what other similar
+                * places do.
+                */
+               if (get_oprrest(expr->opno) != F_EQSEL)
+                       return false;
+
+               /* OK to proceed with checking "clause_expr" */
+       }
+       else if (is_orclause(clause))
+       {
+               BoolExpr   *bool_expr = (BoolExpr *) clause;
+               ListCell   *lc;
+
+               /* start with no expression (we'll use the first match) */
+               *expr = NULL;
+
+               foreach(lc, bool_expr->args)
+               {
+                       Node       *or_expr = NULL;
+
+                       /*
+                        * If we find an incompatible expression among the arguments,
+                        * treat the whole expression as incompatible.
+                        */
+                       if (!dependency_is_compatible_expression((Node *) lfirst(lc), relid,
+                                                                                                        statlist, &or_expr))
+                               return false;
+
+                       if (*expr == NULL)
+                               *expr = or_expr;
+
+                       /* ensure all the expressions are the same */
+                       if (!equal(or_expr, *expr))
+                               return false;
+               }
+
+               /* the expression is already checked by the recursive call */
+               return true;
+       }
+       else if (is_notclause(clause))
+       {
+               /*
+                * "NOT x" can be interpreted as "x = false", so get the argument and
+                * proceed with seeing if it's a suitable Var.
+                */
+               clause_expr = (Node *) get_notclausearg(clause);
+       }
+       else
+       {
+               /*
+                * A boolean expression "x" can be interpreted as "x = true", so
+                * proceed with seeing if it's a suitable Var.
+                */
+               clause_expr = (Node *) clause;
+       }
+
+       /*
+        * We may ignore any RelabelType node above the operand.  (There won't be
+        * more than one, since eval_const_expressions has been applied already.)
+        */
+       if (IsA(clause_expr, RelabelType))
+               clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
+
+       vars = pull_var_clause(clause_expr, 0);
+
+       foreach(lc, vars)
+       {
+               Var                *var = (Var *) lfirst(lc);
+
+               /* Ensure Var is from the correct relation */
+               if (var->varno != relid)
+                       return false;
+
+               /* We also better ensure the Var is from the current level */
+               if (var->varlevelsup != 0)
+                       return false;
+
+               /* Also ignore system attributes (we don't allow stats on those) */
+               if (!AttrNumberIsForUserDefinedAttr(var->varattno))
+                       return false;
+       }
+
+       /*
+        * Check if we actually have matching statistics for the expression.
+        *
+        * XXX Maybe this is overkill. We'll eliminate the expressions later.
+        */
+       foreach(lc, statlist)
+       {
+               StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
+
+               /* ignore stats without dependencies */
+               if (info->kind != STATS_EXT_DEPENDENCIES)
+                       continue;
+
+               foreach(lc2, info->exprs)
+               {
+                       Node       *stat_expr = (Node *) lfirst(lc2);
+
+                       if (equal(clause_expr, stat_expr))
+                       {
+                               *expr = stat_expr;
+                               return true;
+                       }
+               }
+       }
+
+       return false;
+}
+
  /*
   * dependencies_clauselist_selectivity
   *             Return the estimated selectivity of (a subset of) the given clauses
@@ -1204,6 +1399,11 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
         MVDependency **dependencies;
         int                     ndependencies;
         int                     i;
+       AttrNumber      attnum_offset;
+
+       /* unique expressions */
+       Node      **unique_exprs;
+       int                     unique_exprs_cnt;
  
         /* check if there's any stats that might be useful for us. */
         if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
@@ -1212,6 +1412,15 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
         list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) *
                                                                                  list_length(clauses));
  
+       /*
+        * We allocate space as if every clause was a unique expression, although
+        * that's probably overkill. Some will be simple column references that
+        * we'll translate to attnums, and there might be duplicates. But it's
+        * easier and cheaper to just do one allocation than repalloc later.
+        */
+       unique_exprs = (Node **) palloc(sizeof(Node *) * list_length(clauses));
+       unique_exprs_cnt = 0;
+
         /*
          * Pre-process the clauses list to extract the attnums seen in each item.
          * We need to determine if there's any clauses which will be useful for
@@ -1222,29 +1431,127 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
          *
          * We also skip clauses that we already estimated using different types of
          * statistics (we treat them as incompatible).
+        *
+        * To handle expressions, we assign them negative attnums, as if it was a
+        * system attribute (this is fine, as we only allow extended stats on user
+        * attributes). And then we offset everything by the number of
+        * expressions, so that we can store the values in a bitmapset.
          */
         listidx = 0;
         foreach(l, clauses)
         {
                 Node       *clause = (Node *) lfirst(l);
                 AttrNumber      attnum;
+               Node       *expr = NULL;
+
+               /* ignore clause by default */
+               list_attnums[listidx] = InvalidAttrNumber;
  
-               if (!bms_is_member(listidx, *estimatedclauses) &&
-                       dependency_is_compatible_clause(clause, rel->relid, &attnum))
+               if (!bms_is_member(listidx, *estimatedclauses))
                 {
-                       list_attnums[listidx] = attnum;
-                       clauses_attnums = bms_add_member(clauses_attnums, attnum);
+                       /*
+                        * If it's a simple column reference, just extract the attnum. If
+                        * it's an expression, assign a negative attnum as if it was a
+                        * system attribute.
+                        */
+                       if (dependency_is_compatible_clause(clause, rel->relid, &attnum))
+                       {
+                               list_attnums[listidx] = attnum;
+                       }
+                       else if (dependency_is_compatible_expression(clause, rel->relid,
+                                                                                                                rel->statlist,
+                                                                                                                &expr))
+                       {
+                               /* special attnum assigned to this expression */
+                               attnum = InvalidAttrNumber;
+
+                               Assert(expr != NULL);
+
+                               /* If the expression is duplicate, use the same attnum. */
+                               for (i = 0; i < unique_exprs_cnt; i++)
+                               {
+                                       if (equal(unique_exprs[i], expr))
+                                       {
+                                               /* reuse negative attnum of the duplicate */
+                                               attnum = -(i + 1);
+                                               break;
+                                       }
+                               }
+
+                               /* not found in the list, so add it */
+                               if (attnum == InvalidAttrNumber)
+                               {
+                                       unique_exprs[unique_exprs_cnt++] = expr;
+
+                                       /* after incrementing the value, to get -1, -2, ... */
+                                       attnum = (-unique_exprs_cnt);
+                               }
+
+                               /* remember which attnum was assigned to this clause */
+                               list_attnums[listidx] = attnum;
+                       }
                 }
-               else
-                       list_attnums[listidx] = InvalidAttrNumber;
  
                 listidx++;
         }
  
+       Assert(listidx == list_length(clauses));
+
         /*
-        * If there's not at least two distinct attnums then reject the whole list
-        * of clauses. We must return 1.0 so the calling function's selectivity is
-        * unaffected.
+        * How much do we need to offset the attnums? If there are no
+        * expressions, then no offset is needed. Otherwise we need to offset
+        * enough for the lowest value (-unique_exprs_cnt) to become 1.
+        */
+       if (unique_exprs_cnt > 0)
+               attnum_offset = (unique_exprs_cnt + 1);
+       else
+               attnum_offset = 0;
+
+       /*
+        * Now that we know how many expressions there are, we can offset the
+        * values just enough to build the bitmapset.
+        */
+       for (i = 0; i < list_length(clauses); i++)
+       {
+               AttrNumber      attnum;
+
+               /* ignore incompatible or already estimated clauses */
+               if (list_attnums[i] == InvalidAttrNumber)
+                       continue;
+
+               /* make sure the attnum is in the expected range */
+               Assert(list_attnums[i] >= (-unique_exprs_cnt));
+               Assert(list_attnums[i] <= MaxHeapAttributeNumber);
+
+               /* make sure the attnum is positive (valid AttrNumber) */
+               attnum = list_attnums[i] + attnum_offset;
+
+               /*
+                * Either it's a regular attribute, or it's an expression, in which
+                * case we must not have seen it before (expressions are unique).
+                *
+                * XXX The check whether it's a regular attribute has to be done using
+                * the original attnum, while the second check has to use the value
+                * with an offset.
+                */
+               Assert(AttrNumberIsForUserDefinedAttr(list_attnums[i]) ||
+                          !bms_is_member(attnum, clauses_attnums));
+
+               /*
+                * Remember the offset attnum, both for attributes and expressions.
+                * We'll pass list_attnums to clauselist_apply_dependencies, which
+                * uses it to identify clauses in a bitmap. We could also pass the
+                * offset, but this is more convenient.
+                */
+               list_attnums[i] = attnum;
+
+               clauses_attnums = bms_add_member(clauses_attnums, attnum);
+       }
+
+       /*
+        * If there's not at least two distinct attnums and expressions, then
+        * reject the whole list of clauses. We must return 1.0 so the calling
+        * function's selectivity is unaffected.
          */
         if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
         {
@@ -1272,26 +1579,203 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
         foreach(l, rel->statlist)
         {
                 StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
-               Bitmapset  *matched;
-               BMS_Membership membership;
+               int                     nmatched;
+               int                     nexprs;
+               int                     k;
+               MVDependencies *deps;
  
                 /* skip statistics that are not of the correct type */
                 if (stat->kind != STATS_EXT_DEPENDENCIES)
                         continue;
  
-               matched = bms_intersect(clauses_attnums, stat->keys);
-               membership = bms_membership(matched);
-               bms_free(matched);
+               /*
+                * Count matching attributes. The attribute numbers in stat->keys are
+                * not offset (expressions are not included in stat->keys, so it's not
+                * necessary), while clauses_attnums holds offset values. So we need
+                * to offset each key before checking it against clauses_attnums.
+                */
+               nmatched = 0;
+               k = -1;
+               while ((k = bms_next_member(stat->keys, k)) >= 0)
+               {
+                       AttrNumber      attnum = (AttrNumber) k;
  
-               /* skip objects matching fewer than two attributes from clauses */
-               if (membership != BMS_MULTIPLE)
+                       /* skip expressions */
+                       if (!AttrNumberIsForUserDefinedAttr(attnum))
+                               continue;
+
+                       /* apply the same offset as above */
+                       attnum += attnum_offset;
+
+                       if (bms_is_member(attnum, clauses_attnums))
+                               nmatched++;
+               }
+
+               /* count matching expressions */
+               nexprs = 0;
+               for (i = 0; i < unique_exprs_cnt; i++)
+               {
+                       ListCell   *lc;
+
+                       foreach(lc, stat->exprs)
+                       {
+                               Node       *stat_expr = (Node *) lfirst(lc);
+
+                               /* try to match it */
+                               if (equal(stat_expr, unique_exprs[i]))
+                                       nexprs++;
+                       }
+               }
+
+               /*
+                * Skip objects matching fewer than two attributes/expressions from
+                * clauses.
+                */
+               if (nmatched + nexprs < 2)
                         continue;
  
-               func_dependencies[nfunc_dependencies]
-                       = statext_dependencies_load(stat->statOid);
+               deps = statext_dependencies_load(stat->statOid);
+
+               /*
+                * The expressions may be represented by different attnums in the
+                * stats, we need to remap them to be consistent with the clauses.
+                * That will make the later steps (e.g. picking the strongest item and
+                * so on) much simpler and cheaper, because it won't need to care
+                * about the offset at all.
+                *
+                * While we're at it, we can ignore dependencies that are not fully
+                * matched by clauses (i.e. referencing attributes or expressions that
+                * are not in the clauses).
+                *
+                * We have to do this for all statistics, as long as there are any
+                * expressions - we need to shift the attnums in all dependencies.
+                *
+                * XXX Maybe we should do this always, because it also eliminates some
+                * of the dependencies early. It might be cheaper than having to walk
+                * the longer list in find_strongest_dependency later, especially as
+                * we need to do that repeatedly?
+                *
+                * XXX We have to do this even when there are no expressions in
+                * clauses, otherwise find_strongest_dependency may fail for stats
+                * with expressions (due to lookup of negative value in bitmap). So we
+                * need to at least filter out those dependencies. Maybe we could do
+                * it in a cheaper way (if there are no expr clauses, we can just
+                * discard all negative attnums without any lookups).
+                */
+               if (unique_exprs_cnt > 0 || stat->exprs != NIL)
+               {
+                       int                     ndeps = 0;
+
+                       for (i = 0; i < deps->ndeps; i++)
+                       {
+                               bool            skip = false;
+                               MVDependency *dep = deps->deps[i];
+                               int                     j;
+
+                               for (j = 0; j < dep->nattributes; j++)
+                               {
+                                       int                     idx;
+                                       Node       *expr;
+                                       int                     k;
+                                       AttrNumber      unique_attnum = InvalidAttrNumber;
+                                       AttrNumber      attnum;
+
+                                       /* attnum as stored in the statistics (not offset yet) */
+                                       attnum = dep->attributes[j];
+
+                                       /*
+                                        * For regular attributes we can simply check if it
+                                        * matches any clause. If there's no matching clause, we
+                                        * can just ignore it. We need to offset the attnum
+                                        * though.
+                                        */
+                                       if (AttrNumberIsForUserDefinedAttr(attnum))
+                                       {
+                                               dep->attributes[j] = attnum + attnum_offset;
+
+                                               if (!bms_is_member(dep->attributes[j], clauses_attnums))
+                                               {
+                                                       skip = true;
+                                                       break;
+                                               }
+
+                                               continue;
+                                       }
+
+                                       /*
+                                        * the attnum should be a valid system attnum (-1, -2,
+                                        * ...)
+                                        */
+                                       Assert(AttributeNumberIsValid(attnum));
+
+                                       /*
+                                        * For expressions, we need to do two translations. First
+                                        * we have to translate the negative attnum to index in
+                                        * the list of expressions (in the statistics object).
+                                        * Then we need to see if there's a matching clause. The
+                                        * index of the unique expression determines the attnum
+                                        * (and we offset it).
+                                        */
+                                       idx = -(1 + attnum);
+
+                                       /* Is the expression index valid? */
+                                       Assert((idx >= 0) && (idx < list_length(stat->exprs)));
+
+                                       expr = (Node *) list_nth(stat->exprs, idx);
+
+                                       /* try to find the expression in the unique list */
+                                       for (k = 0; k < unique_exprs_cnt; k++)
+                                       {
+                                               /*
+                                                * found a matching unique expression, use the attnum
+                                                * (derived from index of the unique expression)
+                                                */
+                                               if (equal(unique_exprs[k], expr))
+                                               {
+                                                       unique_attnum = -(k + 1) + attnum_offset;
+                                                       break;
+                                               }
+                                       }
+
+                                       /*
+                                        * Found no matching expression, so we can simply skip
+                                        * this dependency, because there's no chance it will be
+                                        * fully covered.
+                                        */
+                                       if (unique_attnum == InvalidAttrNumber)
+                                       {
+                                               skip = true;
+                                               break;
+                                       }
+
+                                       /* otherwise remap it to the new attnum */
+                                       dep->attributes[j] = unique_attnum;
+                               }
  
-               total_ndeps += func_dependencies[nfunc_dependencies]->ndeps;
-               nfunc_dependencies++;
+                               /* if found a matching dependency, keep it */
+                               if (!skip)
+                               {
+                                       /* maybe we've skipped something earlier, so move it */
+                                       if (ndeps != i)
+                                               deps->deps[ndeps] = deps->deps[i];
+
+                                       ndeps++;
+                               }
+                       }
+
+                       deps->ndeps = ndeps;
+               }
+
+               /*
+                * It's possible we've removed all dependencies, in which case we
+                * don't bother adding it to the list.
+                */
+               if (deps->ndeps > 0)
+               {
+                       func_dependencies[nfunc_dependencies] = deps;
+                       total_ndeps += deps->ndeps;
+                       nfunc_dependencies++;
+               }
         }
  
         /* if no matching stats could be found then we've nothing to do */
@@ -1300,6 +1784,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
                 pfree(func_dependencies);
                 bms_free(clauses_attnums);
                 pfree(list_attnums);
+               pfree(unique_exprs);
                 return 1.0;
         }
  
@@ -1347,6 +1832,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
         pfree(func_dependencies);
         bms_free(clauses_attnums);
         pfree(list_attnums);
+       pfree(unique_exprs);
  
         return s1;
  }
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c

index 7808c6a09cac71d57ac437ef37c95d1a368894de..8c75690fce816234850f31848fe4ef155646374b 100644 (file)
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -24,6 +24,7 @@
  #include "catalog/pg_collation.h"
  #include "catalog/pg_statistic_ext.h"
  #include "catalog/pg_statistic_ext_data.h"
+#include "executor/executor.h"
  #include "commands/progress.h"
  #include "miscadmin.h"
  #include "nodes/nodeFuncs.h"
@@ -35,13 +36,16 @@
  #include "statistics/statistics.h"
  #include "utils/acl.h"
  #include "utils/array.h"
+#include "utils/attoptcache.h"
  #include "utils/builtins.h"
+#include "utils/datum.h"
  #include "utils/fmgroids.h"
  #include "utils/lsyscache.h"
  #include "utils/memutils.h"
  #include "utils/rel.h"
  #include "utils/selfuncs.h"
  #include "utils/syscache.h"
+#include "utils/typcache.h"
  
  /*
   * To avoid consuming too much memory during analysis and/or too much space
@@ -66,18 +70,38 @@ typedef struct StatExtEntry
         Bitmapset  *columns;            /* attribute numbers covered by the object */
         List       *types;                      /* 'char' list of enabled statistics kinds */
         int                     stattarget;             /* statistics target (-1 for default) */
+       List       *exprs;                      /* expressions */
  } StatExtEntry;
  
  
  static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid);
-static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
+static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs,
                                                                                         int nvacatts, VacAttrStats **vacatts);
-static void statext_store(Oid relid,
+static void statext_store(Oid statOid,
                                                   MVNDistinct *ndistinct, MVDependencies *dependencies,
-                                                 MCVList *mcv, VacAttrStats **stats);
+                                                 MCVList *mcv, Datum exprs, VacAttrStats **stats);
  static int     statext_compute_stattarget(int stattarget,
                                                                            int natts, VacAttrStats **stats);
  
+/*
+ * Information needed to analyze a single simple expression.
+ *
+ * Couples an expression tree with the VacAttrStats used while analyzing it.
+ * build_expr_data() returns these, and compute_expr_stats() and
+ * serialize_expr_stats() receive them together with an element count (see
+ * the prototypes below).
+ */
+typedef struct AnlExprData
+{
+       Node       *expr;                       /* expression to analyze */
+       VacAttrStats *vacattrstat;      /* statistics attrs to analyze */
+} AnlExprData;
+
+static void compute_expr_stats(Relation onerel, double totalrows,
+                                                          AnlExprData * exprdata, int nexprs,
+                                                          HeapTuple *rows, int numrows);
+static Datum serialize_expr_stats(AnlExprData * exprdata, int nexprs);
+static Datum expr_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
+static AnlExprData *build_expr_data(List *exprs, int stattarget);
+
+static StatsBuildData *make_build_data(Relation onerel, StatExtEntry *stat,
+                                                                          int numrows, HeapTuple *rows,
+                                                                          VacAttrStats **stats, int stattarget);
+
+
  /*
   * Compute requested extended stats, using the rows sampled for the plain
   * (single-column) stats.
@@ -92,21 +116,25 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
  {
         Relation        pg_stext;
         ListCell   *lc;
-       List       *stats;
+       List       *statslist;
         MemoryContext cxt;
         MemoryContext oldcxt;
         int64           ext_cnt;
  
+       /* Do nothing if there are no columns to analyze. */
+       if (!natts)
+               return;
+
         cxt = AllocSetContextCreate(CurrentMemoryContext,
                                                                 "BuildRelationExtStatistics",
                                                                 ALLOCSET_DEFAULT_SIZES);
         oldcxt = MemoryContextSwitchTo(cxt);
  
         pg_stext = table_open(StatisticExtRelationId, RowExclusiveLock);
-       stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
+       statslist = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
  
         /* report this phase */
-       if (stats != NIL)
+       if (statslist != NIL)
         {
                 const int       index[] = {
                         PROGRESS_ANALYZE_PHASE,
@@ -114,28 +142,30 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
                 };
                 const int64 val[] = {
                         PROGRESS_ANALYZE_PHASE_COMPUTE_EXT_STATS,
-                       list_length(stats)
+                       list_length(statslist)
                 };
  
                 pgstat_progress_update_multi_param(2, index, val);
         }
  
         ext_cnt = 0;
-       foreach(lc, stats)
+       foreach(lc, statslist)
         {
                 StatExtEntry *stat = (StatExtEntry *) lfirst(lc);
                 MVNDistinct *ndistinct = NULL;
                 MVDependencies *dependencies = NULL;
                 MCVList    *mcv = NULL;
+               Datum           exprstats = (Datum) 0;
                 VacAttrStats **stats;
                 ListCell   *lc2;
                 int                     stattarget;
+               StatsBuildData *data;
  
                 /*
                  * Check if we can build these stats based on the column analyzed. If
                  * not, report this fact (except in autovacuum) and move on.
                  */
-               stats = lookup_var_attr_stats(onerel, stat->columns,
+               stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs,
                                                                           natts, vacattrstats);
                 if (!stats)
                 {
@@ -150,10 +180,6 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
                         continue;
                 }
  
-               /* check allowed number of dimensions */
-               Assert(bms_num_members(stat->columns) >= 2 &&
-                          bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS);
-
                 /* compute statistics target for this statistics */
                 stattarget = statext_compute_stattarget(stat->stattarget,
                                                                                                 bms_num_members(stat->columns),
@@ -167,28 +193,49 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
                 if (stattarget == 0)
                         continue;
  
+               /* evaluate expressions (if the statistics has any) */
+               data = make_build_data(onerel, stat, numrows, rows, stats, stattarget);
+
                 /* compute statistic of each requested type */
                 foreach(lc2, stat->types)
                 {
                         char            t = (char) lfirst_int(lc2);
  
                         if (t == STATS_EXT_NDISTINCT)
-                               ndistinct = statext_ndistinct_build(totalrows, numrows, rows,
-                                                                                                       stat->columns, stats);
+                               ndistinct = statext_ndistinct_build(totalrows, data);
                         else if (t == STATS_EXT_DEPENDENCIES)
-                               dependencies = statext_dependencies_build(numrows, rows,
-                                                                                                                 stat->columns, stats);
+                               dependencies = statext_dependencies_build(data);
                         else if (t == STATS_EXT_MCV)
-                               mcv = statext_mcv_build(numrows, rows, stat->columns, stats,
-                                                                               totalrows, stattarget);
+                               mcv = statext_mcv_build(data, totalrows, stattarget);
+                       else if (t == STATS_EXT_EXPRESSIONS)
+                       {
+                               AnlExprData *exprdata;
+                               int                     nexprs;
+
+                               /* should not happen, thanks to checks when defining stats */
+                               if (!stat->exprs)
+                                       elog(ERROR, "requested expression stats, but there are no expressions");
+
+                               exprdata = build_expr_data(stat->exprs, stattarget);
+                               nexprs = list_length(stat->exprs);
+
+                               compute_expr_stats(onerel, totalrows,
+                                                                  exprdata, nexprs,
+                                                                  rows, numrows);
+
+                               exprstats = serialize_expr_stats(exprdata, nexprs);
+                       }
                 }
  
                 /* store the statistics in the catalog */
-               statext_store(stat->statOid, ndistinct, dependencies, mcv, stats);
+               statext_store(stat->statOid, ndistinct, dependencies, mcv, exprstats, stats);
  
                 /* for reporting progress */
                 pgstat_progress_update_param(PROGRESS_ANALYZE_EXT_STATS_COMPUTED,
                                                                          ++ext_cnt);
+
+               /* free the build data (allocated as a single chunk) */
+               pfree(data);
         }
  
         table_close(pg_stext, RowExclusiveLock);
@@ -221,6 +268,10 @@ ComputeExtStatisticsRows(Relation onerel,
         MemoryContext oldcxt;
         int                     result = 0;
  
+       /* If there are no columns to analyze, just return 0. */
+       if (!natts)
+               return 0;
+
         cxt = AllocSetContextCreate(CurrentMemoryContext,
                                                                 "ComputeExtStatisticsRows",
                                                                 ALLOCSET_DEFAULT_SIZES);
@@ -241,7 +292,7 @@ ComputeExtStatisticsRows(Relation onerel,
                  * analyzed. If not, ignore it (don't report anything, we'll do that
                  * during the actual build BuildRelationExtStatistics).
                  */
-               stats = lookup_var_attr_stats(onerel, stat->columns,
+               stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs,
                                                                           natts, vacattrstats);
  
                 if (!stats)
@@ -349,6 +400,10 @@ statext_is_kind_built(HeapTuple htup, char type)
                         attnum = Anum_pg_statistic_ext_data_stxdmcv;
                         break;
  
+               case STATS_EXT_EXPRESSIONS:
+                       attnum = Anum_pg_statistic_ext_data_stxdexpr;
+                       break;
+
                 default:
                         elog(ERROR, "unexpected statistics type requested: %d", type);
         }
@@ -388,6 +443,7 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
                 ArrayType  *arr;
                 char       *enabled;
                 Form_pg_statistic_ext staForm;
+               List       *exprs = NIL;
  
                 entry = palloc0(sizeof(StatExtEntry));
                 staForm = (Form_pg_statistic_ext) GETSTRUCT(htup);
@@ -415,10 +471,40 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
                 {
                         Assert((enabled[i] == STATS_EXT_NDISTINCT) ||
                                    (enabled[i] == STATS_EXT_DEPENDENCIES) ||
-                                  (enabled[i] == STATS_EXT_MCV));
+                                  (enabled[i] == STATS_EXT_MCV) ||
+                                  (enabled[i] == STATS_EXT_EXPRESSIONS));
                         entry->types = lappend_int(entry->types, (int) enabled[i]);
                 }
  
+               /* decode expression (if any) */
+               datum = SysCacheGetAttr(STATEXTOID, htup,
+                                                               Anum_pg_statistic_ext_stxexprs, &isnull);
+
+               if (!isnull)
+               {
+                       char       *exprsString;
+
+                       exprsString = TextDatumGetCString(datum);
+                       exprs = (List *) stringToNode(exprsString);
+
+                       pfree(exprsString);
+
+                       /*
+                        * Run the expressions through eval_const_expressions. This is not
+                        * just an optimization, but is necessary, because the planner
+                        * will be comparing them to similarly-processed qual clauses, and
+                        * may fail to detect valid matches without this.  We must not use
+                        * canonicalize_qual, however, since these aren't qual
+                        * expressions.
+                        */
+                       exprs = (List *) eval_const_expressions(NULL, (Node *) exprs);
+
+                       /* May as well fix opfuncids too */
+                       fix_opfuncids((Node *) exprs);
+               }
+
+               entry->exprs = exprs;
+
                 result = lappend(result, entry);
         }
  
@@ -427,6 +513,187 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
         return result;
  }
  
+/*
+ * examine_attribute -- pre-analysis of a single statistics expression
+ *
+ * Set up a VacAttrStats for 'expr', taking the data type, typmod and
+ * collation from the expression tree, and let the type's typanalyze routine
+ * (or std_typanalyze when the type has none) decide whether the expression
+ * can be analyzed.
+ *
+ * Returns the initialized struct, or NULL if the expression turns out not
+ * to be analyzable (in which case everything allocated here is released).
+ */
+static VacAttrStats *
+examine_attribute(Node *expr)
+{
+       VacAttrStats *stats;
+       HeapTuple       typtuple;
+       Form_pg_type typform;
+       bool            analyzable;
+       int                     slot;
+
+       /*
+        * Start from a zeroed struct.  There is no pg_attribute row behind an
+        * expression, so attach a zeroed placeholder that only carries the
+        * default statistics target.
+        */
+       stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats));
+       stats->attr = (Form_pg_attribute) palloc0(ATTRIBUTE_FIXED_PART_SIZE);
+       stats->attr->attstattarget = -1;
+
+       /*
+        * Believe the expression tree for type, typmod and collation.  It's not
+        * clear whether anyone will care about the typmod, but we store it too
+        * just in case.
+        */
+       stats->attrtypid = exprType(expr);
+       stats->attrtypmod = exprTypmod(expr);
+       stats->attrcollid = exprCollation(expr);
+
+       /* Take a private copy of the pg_type row; stats->attrtype points into it. */
+       typtuple = SearchSysCacheCopy1(TYPEOID,
+                                                                  ObjectIdGetDatum(stats->attrtypid));
+       if (!HeapTupleIsValid(typtuple))
+               elog(ERROR, "cache lookup failed for type %u", stats->attrtypid);
+       typform = (Form_pg_type) GETSTRUCT(typtuple);
+       stats->attrtype = typform;
+
+       /*
+        * We don't actually analyze individual attributes, so no need to set the
+        * memory context.
+        */
+       stats->anl_context = NULL;
+       stats->tupattnum = InvalidAttrNumber;
+
+       /*
+        * Each stats->stavalues[n] slot defaults to the type of the data being
+        * analyzed; the type-specific typanalyze function can change that if it
+        * wants to store something else.
+        */
+       for (slot = 0; slot < STATISTIC_NUM_SLOTS; slot++)
+       {
+               stats->statypid[slot] = stats->attrtypid;
+               stats->statyplen[slot] = typform->typlen;
+               stats->statypbyval[slot] = typform->typbyval;
+               stats->statypalign[slot] = typform->typalign;
+       }
+
+       /* Prefer the type-specific typanalyze function, else std_typanalyze(). */
+       analyzable = OidIsValid(typform->typanalyze)
+               ? DatumGetBool(OidFunctionCall1(typform->typanalyze,
+                                                                               PointerGetDatum(stats)))
+               : std_typanalyze(stats);
+
+       /* Usable result: hand it back (the copied pg_type row stays with it). */
+       if (analyzable && stats->compute_stats != NULL && stats->minrows > 0)
+               return stats;
+
+       /* Not analyzable: release what we allocated and report that. */
+       heap_freetuple(typtuple);
+       pfree(stats->attr);
+       pfree(stats);
+       return NULL;
+}
+
+
+/*
+ * examine_expression -- pre-analysis of a single expression
+ *
+ * Determine whether the expression is analyzable; if so, create and initialize
+ * a VacAttrStats struct for it.  If not, return NULL.
+ *
+ * 'expr' must not be NULL.  'stattarget' is stored as the attstattarget of
+ * the fake pg_attribute attached to the result, which is where the
+ * typanalyze function looks for it.
+ *
+ * When the expression is not analyzable, everything allocated here (the
+ * struct, the fake attribute and the copied pg_type tuple) is released
+ * before returning NULL.
+ */
+static VacAttrStats *
+examine_expression(Node *expr, int stattarget)
+{
+       HeapTuple       typtuple;
+       VacAttrStats *stats;
+       int                     i;
+       bool            ok;
+
+       Assert(expr != NULL);
+
+       /*
+        * Create the VacAttrStats struct.
+        */
+       stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats));
+
+       /*
+        * When analyzing an expression, believe the expression tree's type.
+        */
+       stats->attrtypid = exprType(expr);
+       stats->attrtypmod = exprTypmod(expr);
+
+       /*
+        * We don't allow collation to be specified in CREATE STATISTICS, so we
+        * have to use the collation specified for the expression. It's possible
+        * to specify the collation in the expression "(col COLLATE "en_US")" in
+        * which case exprCollation() does the right thing.
+        */
+       stats->attrcollid = exprCollation(expr);
+
+       /*
+        * We don't have any pg_attribute for expressions, so let's fake something
+        * reasonable into attstattarget, which is the only thing std_typanalyze
+        * needs.  Zero the whole struct (as examine_attribute does) so that the
+        * fields we don't set explicitly are not left holding garbage.
+        */
+       stats->attr = (Form_pg_attribute) palloc0(ATTRIBUTE_FIXED_PART_SIZE);
+
+       /*
+        * We can't have statistics target specified for the expression, so we
+        * could use either the default_statistics_target, or the target computed
+        * for the extended statistics. The second option seems more reasonable.
+        */
+       stats->attr->attstattarget = stattarget;
+
+       /* initialize some basic fields */
+       stats->attr->attrelid = InvalidOid;
+       stats->attr->attnum = InvalidAttrNumber;
+       stats->attr->atttypid = stats->attrtypid;
+
+       typtuple = SearchSysCacheCopy1(TYPEOID,
+                                                                  ObjectIdGetDatum(stats->attrtypid));
+       if (!HeapTupleIsValid(typtuple))
+               elog(ERROR, "cache lookup failed for type %u", stats->attrtypid);
+
+       /* attrtype points into the copied tuple, so it must outlive 'stats' */
+       stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple);
+       stats->anl_context = CurrentMemoryContext;      /* XXX should be using
+                                                                                                * something else? */
+       stats->tupattnum = InvalidAttrNumber;
+
+       /*
+        * The fields describing the stats->stavalues[n] element types default to
+        * the type of the data being analyzed, but the type-specific typanalyze
+        * function can change them if it wants to store something else.
+        */
+       for (i = 0; i < STATISTIC_NUM_SLOTS; i++)
+       {
+               stats->statypid[i] = stats->attrtypid;
+               stats->statyplen[i] = stats->attrtype->typlen;
+               stats->statypbyval[i] = stats->attrtype->typbyval;
+               stats->statypalign[i] = stats->attrtype->typalign;
+       }
+
+       /*
+        * Call the type-specific typanalyze function.  If none is specified, use
+        * std_typanalyze().
+        */
+       if (OidIsValid(stats->attrtype->typanalyze))
+               ok = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze,
+                                                                                  PointerGetDatum(stats)));
+       else
+               ok = std_typanalyze(stats);
+
+       if (!ok || stats->compute_stats == NULL || stats->minrows <= 0)
+       {
+               /* not analyzable: release everything, including the fake attribute */
+               heap_freetuple(typtuple);
+               pfree(stats->attr);
+               pfree(stats);
+               return NULL;
+       }
+
+       return stats;
+}
+
  /*
   * Using 'vacatts' of size 'nvacatts' as input data, return a newly built
   * VacAttrStats array which includes only the items corresponding to
@@ -435,15 +702,18 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
   * to the caller that the stats should not be built.
   */
  static VacAttrStats **
-lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
+lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs,
                                           int nvacatts, VacAttrStats **vacatts)
  {
         int                     i = 0;
         int                     x = -1;
+       int                     natts;
         VacAttrStats **stats;
+       ListCell   *lc;
+
+       natts = bms_num_members(attrs) + list_length(exprs);
  
-       stats = (VacAttrStats **)
-               palloc(bms_num_members(attrs) * sizeof(VacAttrStats *));
+       stats = (VacAttrStats **) palloc(natts * sizeof(VacAttrStats *));
  
         /* lookup VacAttrStats info for the requested columns (same attnum) */
         while ((x = bms_next_member(attrs, x)) >= 0)
@@ -480,6 +750,24 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
                 i++;
         }
  
+       /* also add info for expressions */
+       foreach(lc, exprs)
+       {
+               Node       *expr = (Node *) lfirst(lc);
+
+               stats[i] = examine_attribute(expr);
+
+               /*
+                * examine_attribute returns NULL when the expression is not
+                * analyzable.  We cannot build the statistics object without it, so
+                * give up and return NULL, which tells the caller that the stats
+                * should not be built.
+                */
+               if (stats[i] == NULL)
+               {
+                       pfree(stats);
+                       return NULL;
+               }
+
+               /*
+                * XXX We need tuple descriptor later, and we just grab it from
+                * stats[0]->tupDesc (see e.g. statext_mcv_build). But as coded
+                * examine_attribute does not set that, so just grab it from the first
+                * vacatts element.
+                */
+               stats[i]->tupDesc = vacatts[0]->tupDesc;
+
+               i++;
+       }
+
         return stats;
  }
  
@@ -491,7 +779,7 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
  static void
  statext_store(Oid statOid,
                           MVNDistinct *ndistinct, MVDependencies *dependencies,
-                         MCVList *mcv, VacAttrStats **stats)
+                         MCVList *mcv, Datum exprs, VacAttrStats **stats)
  {
         Relation        pg_stextdata;
         HeapTuple       stup,
@@ -532,11 +820,17 @@ statext_store(Oid statOid,
                 nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = (data == NULL);
                 values[Anum_pg_statistic_ext_data_stxdmcv - 1] = PointerGetDatum(data);
         }
+       if (exprs != (Datum) 0)
+       {
+               nulls[Anum_pg_statistic_ext_data_stxdexpr - 1] = false;
+               values[Anum_pg_statistic_ext_data_stxdexpr - 1] = exprs;
+       }
  
         /* always replace the value (either by bytea or NULL) */
         replaces[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true;
         replaces[Anum_pg_statistic_ext_data_stxddependencies - 1] = true;
         replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
+       replaces[Anum_pg_statistic_ext_data_stxdexpr - 1] = true;
  
         /* there should already be a pg_statistic_ext_data tuple */
         oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statOid));
@@ -668,7 +962,7 @@ compare_datums_simple(Datum a, Datum b, SortSupport ssup)
   * is not necessary here (and when querying the bitmap).
   */
  AttrNumber *
-build_attnums_array(Bitmapset *attrs, int *numattrs)
+build_attnums_array(Bitmapset *attrs, int nexprs, int *numattrs)
  {
         int                     i,
                                 j;
@@ -684,16 +978,19 @@ build_attnums_array(Bitmapset *attrs, int *numattrs)
         j = -1;
         while ((j = bms_next_member(attrs, j)) >= 0)
         {
+               AttrNumber      attnum = (j - nexprs);
+
                 /*
                  * Make sure the bitmap contains only user-defined attributes. As
                  * bitmaps can't contain negative values, this can be violated in two
                  * ways. Firstly, the bitmap might contain 0 as a member, and secondly
                  * the integer value might be larger than MaxAttrNumber.
                  */
-               Assert(AttrNumberIsForUserDefinedAttr(j));
-               Assert(j <= MaxAttrNumber);
+               Assert(AttributeNumberIsValid(attnum));
+               Assert(attnum <= MaxAttrNumber);
+               Assert(attnum >= (-nexprs));
  
-               attnums[i++] = (AttrNumber) j;
+               attnums[i++] = (AttrNumber) attnum;
  
                 /* protect against overflows */
                 Assert(i <= num);
@@ -710,29 +1007,31 @@ build_attnums_array(Bitmapset *attrs, int *numattrs)
   * can simply pfree the return value to release all of it.
   */
  SortItem *
-build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
-                                  MultiSortSupport mss, int numattrs, AttrNumber *attnums)
+build_sorted_items(StatsBuildData *data, int *nitems,
+                                  MultiSortSupport mss,
+                                  int numattrs, AttrNumber *attnums)
  {
         int                     i,
                                 j,
                                 len,
-                               idx;
-       int                     nvalues = numrows * numattrs;
+                               nrows;
+       int                     nvalues = data->numrows * numattrs;
  
         SortItem   *items;
         Datum      *values;
         bool       *isnull;
         char       *ptr;
+       int                *typlen;
  
         /* Compute the total amount of memory we need (both items and values). */
-       len = numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool));
+       len = data->numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool));
  
         /* Allocate the memory and split it into the pieces. */
         ptr = palloc0(len);
  
         /* items to sort */
         items = (SortItem *) ptr;
-       ptr += numrows * sizeof(SortItem);
+       ptr += data->numrows * sizeof(SortItem);
  
         /* values and null flags */
         values = (Datum *) ptr;
@@ -745,21 +1044,47 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
         Assert((ptr - (char *) items) == len);
  
         /* fix the pointers to Datum and bool arrays */
-       idx = 0;
-       for (i = 0; i < numrows; i++)
+       nrows = 0;
+       for (i = 0; i < data->numrows; i++)
         {
-               bool            toowide = false;
+               items[nrows].values = &values[nrows * numattrs];
+               items[nrows].isnull = &isnull[nrows * numattrs];
  
-               items[idx].values = &values[idx * numattrs];
-               items[idx].isnull = &isnull[idx * numattrs];
+               nrows++;
+       }
+
+       /* build a local cache of typlen for all attributes */
+       typlen = (int *) palloc(sizeof(int) * data->nattnums);
+       for (i = 0; i < data->nattnums; i++)
+               typlen[i] = get_typlen(data->stats[i]->attrtypid);
+
+       nrows = 0;
+       for (i = 0; i < data->numrows; i++)
author	Tomas Vondra <tomas.vondra@postgresql.org>
	Fri, 26 Mar 2021 22:22:01 +0000 (23:22 +0100)
committer	Tomas Vondra <tomas.vondra@postgresql.org>
	Fri, 26 Mar 2021 23:01:11 +0000 (00:01 +0100)
doc/src/sgml/catalogs.sgml		patch \| blob \| blame \| history
doc/src/sgml/ref/create_statistics.sgml		patch \| blob \| blame \| history
src/backend/catalog/Makefile		patch \| blob \| blame \| history
src/backend/catalog/system_views.sql		patch \| blob \| blame \| history
src/backend/commands/statscmds.c		patch \| blob \| blame \| history
src/backend/commands/tablecmds.c		patch \| blob \| blame \| history
src/backend/nodes/copyfuncs.c		patch \| blob \| blame \| history
src/backend/nodes/equalfuncs.c		patch \| blob \| blame \| history
src/backend/nodes/outfuncs.c		patch \| blob \| blame \| history
src/backend/optimizer/util/plancat.c		patch \| blob \| blame \| history
src/backend/parser/gram.y		patch \| blob \| blame \| history
src/backend/parser/parse_agg.c		patch \| blob \| blame \| history
src/backend/parser/parse_expr.c		patch \| blob \| blame \| history
src/backend/parser/parse_func.c		patch \| blob \| blame \| history
src/backend/parser/parse_utilcmd.c		patch \| blob \| blame \| history
src/backend/statistics/dependencies.c		patch \| blob \| blame \| history
src/backend/statistics/extended_stats.c		patch \| blob \| blame \| history
src/backend/statistics/mcv.c		patch \| blob \| blame \| history
src/backend/statistics/mvdistinct.c		patch \| blob \| blame \| history
src/backend/tcop/utility.c		patch \| blob \| blame \| history
src/backend/utils/adt/ruleutils.c		patch \| blob \| blame \| history
src/backend/utils/adt/selfuncs.c		patch \| blob \| blame \| history
src/bin/pg_dump/t/002_pg_dump.pl		patch \| blob \| blame \| history
src/bin/psql/describe.c		patch \| blob \| blame \| history
src/include/catalog/catversion.h		patch \| blob \| blame \| history
src/include/catalog/pg_proc.dat		patch \| blob \| blame \| history
src/include/catalog/pg_statistic_ext.h		patch \| blob \| blame \| history
src/include/catalog/pg_statistic_ext_data.h		patch \| blob \| blame \| history
src/include/commands/defrem.h		patch \| blob \| blame \| history
src/include/nodes/nodes.h		patch \| blob \| blame \| history
src/include/nodes/parsenodes.h		patch \| blob \| blame \| history
src/include/nodes/pathnodes.h		patch \| blob \| blame \| history
src/include/parser/parse_node.h		patch \| blob \| blame \| history
src/include/parser/parse_utilcmd.h		patch \| blob \| blame \| history
src/include/statistics/extended_stats_internal.h		patch \| blob \| blame \| history
src/include/statistics/statistics.h		patch \| blob \| blame \| history
src/include/utils/ruleutils.h		patch \| blob \| blame \| history
src/test/regress/expected/create_table_like.out		patch \| blob \| blame \| history
src/test/regress/expected/oidjoins.out		patch \| blob \| blame \| history
src/test/regress/expected/rules.out		patch \| blob \| blame \| history
src/test/regress/expected/stats_ext.out		patch \| blob \| blame \| history
src/test/regress/sql/create_table_like.sql		patch \| blob \| blame \| history
src/test/regress/sql/stats_ext.sql		patch \| blob \| blame \| history