Add array_sample() and array_shuffle() functions.
author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 7 Apr 2023 15:47:07 +0000 (11:47 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 7 Apr 2023 15:47:07 +0000 (11:47 -0400)
These are useful in Monte Carlo applications.

Martin Kalcher, reviewed/adjusted by Daniel Gustafsson and myself

Discussion: https://postgr.es/m/9d160a44-7675-51e8-60cf-6d64b76db831@aboutsource.net

doc/src/sgml/func.sgml
src/backend/utils/adt/array_userfuncs.c
src/include/catalog/catversion.h
src/include/catalog/pg_proc.dat
src/test/regress/expected/arrays.out
src/test/regress/sql/arrays.sql

index dc44a74eb2519ddcb25e6bef1cad8b6ac717b393..4211d31f307c7485b615b4f8d6e25c42f130b20c 100644 (file)
@@ -16053,7 +16053,7 @@ SELECT js,
   js IS JSON ARRAY "array?"
 FROM (VALUES
       ('123'), ('"abc"'), ('{"a": "b"}'), ('[1,2]'),('abc')) foo(js);
-     js     | json? | scalar? | object? | array? 
+     js     | json? | scalar? | object? | array?
 ------------+-------+---------+---------+--------
  123        | t     | t       | f       | f
  "abc"      | t     | t       | f       | f
@@ -18777,6 +18777,48 @@ SELECT NULLIF(value, '(none)') ...
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>array_sample</primary>
+        </indexterm>
+        <function>array_sample</function> ( <parameter>array</parameter> <type>anyarray</type>, <parameter>n</parameter> <type>integer</type> )
+        <returnvalue>anyarray</returnvalue>
+       </para>
+       <para>
+        Returns an array of <parameter>n</parameter> items randomly selected
+        from <parameter>array</parameter>.  <parameter>n</parameter> may not
+        exceed the length of <parameter>array</parameter>'s first dimension.
+        If <parameter>array</parameter> is multi-dimensional,
+        an <quote>item</quote> is a slice having a given first subscript.
+       </para>
+       <para>
+        <literal>array_sample(ARRAY[1,2,3,4,5,6], 3)</literal>
+        <returnvalue>{2,6,1}</returnvalue>
+       </para>
+       <para>
+        <literal>array_sample(ARRAY[[1,2],[3,4],[5,6]], 2)</literal>
+        <returnvalue>{{5,6},{1,2}}</returnvalue>
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>array_shuffle</primary>
+        </indexterm>
+        <function>array_shuffle</function> ( <type>anyarray</type> )
+        <returnvalue>anyarray</returnvalue>
+       </para>
+       <para>
+        Randomly shuffles the first dimension of the array.
+       </para>
+       <para>
+        <literal>array_shuffle(ARRAY[[1,2],[3,4],[5,6]])</literal>
+        <returnvalue>{{5,6},{1,2},{3,4}}</returnvalue>
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm id="function-array-to-string">
index 80750191d8d2d79870c8d242c000bb61dd912c94..33e2b98307d1ba03a907c8fe9287fb79a80a723b 100644 (file)
@@ -15,6 +15,7 @@
 #include "catalog/pg_type.h"
 #include "libpq/pqformat.h"
 #include "common/int.h"
+#include "common/pg_prng.h"
 #include "port/pg_bitutils.h"
 #include "utils/array.h"
 #include "utils/datum.h"
@@ -1525,3 +1526,168 @@ array_positions(PG_FUNCTION_ARGS)
 
        PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
 }
+
+/*
+ * array_shuffle_n
+ *             Return a copy of array with n randomly chosen items.
+ *
+ * n must not exceed the first dimension's size (only Asserted here).  If the
+ * array is empty or n < 1, an empty array is returned.  The first dimension's
+ * lower bound is kept if keep_lb, else set to 1; other dimensions are kept.
+ *
+ * NOTE: it would be cleaner to look up the elmlen/elmbyval/elmalign info
+ * from the system catalogs, given only the elmtyp. However, the caller is
+ * in a better position to cache this info across multiple calls.
+ */
+static ArrayType *
+array_shuffle_n(ArrayType *array, int n, bool keep_lb,
+                               Oid elmtyp, TypeCacheEntry *typentry)
+{
+       ArrayType  *result;
+       int                     ndim,
+                          *dims,
+                          *lbs,
+                               nelm,
+                               nitem,
+                               rdims[MAXDIM],
+                               rlbs[MAXDIM];
+       int16           elmlen;
+       bool            elmbyval;
+       char            elmalign;
+       Datum      *elms,           /* all element datums; shuffled in place */
+                          *ielms;          /* current item's position in elms */
+       bool       *nuls,           /* null flags, parallel to elms */
+                          *inuls;          /* current item's position in nuls */
+
+       ndim = ARR_NDIM(array);
+       dims = ARR_DIMS(array);
+       lbs = ARR_LBOUND(array);
+
+       elmlen = typentry->typlen;
+       elmbyval = typentry->typbyval;
+       elmalign = typentry->typalign;
+
+       /* If the input array is empty or no items are wanted, exit fast */
+       if (ndim < 1 || dims[0] < 1 || n < 1)
+               return construct_empty_array(elmtyp);
+
+       deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign,
+                                         &elms, &nuls, &nelm);
+
+       nitem = dims[0];                        /* total number of items */
+       nelm /= nitem;                          /* number of elements per item */
+
+       Assert(n <= nitem);                     /* else it's caller error */
+
+       /*
+        * Shuffle array using Fisher-Yates algorithm.  Each iteration swaps the
+        * current item (nelm datums starting at ielms) with an item chosen at
+        * random from position i onward (nelm datums starting at jelms).  We can
+        * stop once we've done n iterations; then first n items are the result.
+        */
+       ielms = elms;
+       inuls = nuls;
+       for (int i = 0; i < n; i++)
+       {
+               int                     j = (int) pg_prng_uint64_range(&pg_global_prng_state, i, nitem - 1) * nelm;
+               Datum      *jelms = elms + j;
+               bool       *jnuls = nuls + j;
+
+               /* Swap current and chosen items; this advances ielms/inuls to next item */
+               for (int k = 0; k < nelm; k++)
+               {
+                       Datum           elm = *ielms;
+                       bool            nul = *inuls;
+
+                       *ielms++ = *jelms;
+                       *inuls++ = *jnuls;
+                       *jelms++ = elm;
+                       *jnuls++ = nul;
+               }
+       }
+
+       /* Result has the input's dimensions, except the first holds n items */
+       memcpy(rdims, dims, ndim * sizeof(int));
+       memcpy(rlbs, lbs, ndim * sizeof(int));
+       rdims[0] = n;
+       if (!keep_lb)
+               rlbs[0] = 1;
+
+       result = construct_md_array(elms, nuls, ndim, rdims, rlbs,
+                                                               elmtyp, elmlen, elmbyval, elmalign);
+
+       pfree(elms);
+       pfree(nuls);
+
+       return result;
+}
+
+/*
+ * array_shuffle
+ *
+ * Returns an array with the same dimensions and lower bounds as the input
+ * array, with its first-dimension items in random order.
+ */
+Datum
+array_shuffle(PG_FUNCTION_ARGS)
+{
+       ArrayType  *array = PG_GETARG_ARRAYTYPE_P(0);
+       ArrayType  *result;
+       Oid                     elmtyp;
+       TypeCacheEntry *typentry;
+
+       /*
+        * There is no point in shuffling empty arrays or arrays with fewer than
+        * two items, so return the input array itself in those cases.
+        */
+       if (ARR_NDIM(array) < 1 || ARR_DIMS(array)[0] < 2)
+               PG_RETURN_ARRAYTYPE_P(array);
+
+       elmtyp = ARR_ELEMTYPE(array);
+       typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
+       if (typentry == NULL || typentry->type_id != elmtyp)
+       {                       /* nothing cached in fn_extra for this element type yet */
+               typentry = lookup_type_cache(elmtyp, 0);
+               fcinfo->flinfo->fn_extra = (void *) typentry;
+       }
+
+       result = array_shuffle_n(array, ARR_DIMS(array)[0], true, elmtyp, typentry);
+
+       PG_RETURN_ARRAYTYPE_P(result);
+}
+
+/*
+ * array_sample
+ *
+ * Returns an array of n randomly chosen first-dimension items from the input
+ * array, renumbered from 1.  Errors out unless 0 <= n <= number of items.
+ */
+Datum
+array_sample(PG_FUNCTION_ARGS)
+{
+       ArrayType  *array = PG_GETARG_ARRAYTYPE_P(0);
+       int                     n = PG_GETARG_INT32(1);
+       ArrayType  *result;
+       Oid                     elmtyp;
+       TypeCacheEntry *typentry;
+       int                     nitem;          /* size of first dimension, or 0 if none */
+
+       nitem = (ARR_NDIM(array) < 1) ? 0 : ARR_DIMS(array)[0];
+
+       if (n < 0 || n > nitem)
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                errmsg("sample size must be between 0 and %d", nitem)));
+
+       elmtyp = ARR_ELEMTYPE(array);
+       typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
+       if (typentry == NULL || typentry->type_id != elmtyp)
+       {                       /* nothing cached in fn_extra for this element type yet */
+               typentry = lookup_type_cache(elmtyp, 0);
+               fcinfo->flinfo->fn_extra = (void *) typentry;
+       }
+
+       result = array_shuffle_n(array, n, false, elmtyp, typentry);
+
+       PG_RETURN_ARRAYTYPE_P(result);
+}
index af134d2f672ba5ddec9cca9c0a022e2e1152722d..42e881fafbded1d85cff1e19d4db4a9abde3873b 100644 (file)
@@ -57,6 +57,6 @@
  */
 
 /*                                                     yyyymmddN */
-#define CATALOG_VERSION_NO     202304051
+#define CATALOG_VERSION_NO     202304071
 
 #endif
index f9f264220153aaa78e68fb81599871f898da2191..f64bc68276a5223be84e55c1d4d9333ff3c9ac9d 100644 (file)
 { oid => '6172', descr => 'remove last N elements of array',
   proname => 'trim_array', prorettype => 'anyarray',
   proargtypes => 'anyarray int4', prosrc => 'trim_array' },
+{ oid => '8464', descr => 'shuffle array',
+  proname => 'array_shuffle', provolatile => 'v', prorettype => 'anyarray',
+  proargtypes => 'anyarray', prosrc => 'array_shuffle' },
+{ oid => '8465', descr => 'take samples from array',
+  proname => 'array_sample', provolatile => 'v', prorettype => 'anyarray',
+  proargtypes => 'anyarray int4', prosrc => 'array_sample' },
 { oid => '3816', descr => 'array typanalyze',
   proname => 'array_typanalyze', provolatile => 's', prorettype => 'bool',
   proargtypes => 'internal', prosrc => 'array_typanalyze' },
index bfaf12518733c5b8efd250e07b7590a848bfbc70..70643914688408f3bc93d9981a2ab301a244f716 100644 (file)
@@ -2472,3 +2472,57 @@ SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
 ERROR:  number of elements to trim must be between 0 and 3
 SELECT trim_array(ARRAY[]::int[], 1); -- fail
 ERROR:  number of elements to trim must be between 0 and 0
+-- array_shuffle
+SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) <@ '{1,2,3,4,5,6}';
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) @> '{1,2,3,4,5,6}';
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT array_dims(array_shuffle('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[]));
+ array_dims  
+-------------
+ [-1:2][2:3]
+(1 row)
+
+SELECT array_dims(array_shuffle('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[]));
+   array_dims    
+-----------------
+ [1:3][1:2][1:2]
+(1 row)
+
+-- array_sample
+SELECT array_sample('{1,2,3,4,5,6}'::int[], 3) <@ '{1,2,3,4,5,6}';
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT array_length(array_sample('{1,2,3,4,5,6}'::int[], 3), 1);
+ array_length 
+--------------
+            3
+(1 row)
+
+SELECT array_dims(array_sample('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[], 3));
+ array_dims 
+------------
+ [1:3][2:3]
+(1 row)
+
+SELECT array_dims(array_sample('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[], 2));
+   array_dims    
+-----------------
+ [1:2][1:2][1:2]
+(1 row)
+
+SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail
+ERROR:  sample size must be between 0 and 6
+SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail
+ERROR:  sample size must be between 0 and 6
index 094937ba639afaadc3eb20987b5ca1fc8c3be33c..f1375621e0ca1a41d83b3d97bdb6aeb6feabf903 100644 (file)
@@ -761,3 +761,17 @@ FROM
 SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail
 SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
 SELECT trim_array(ARRAY[]::int[], 1); -- fail
+
+-- array_shuffle
+SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) <@ '{1,2,3,4,5,6}';
+SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) @> '{1,2,3,4,5,6}';
+SELECT array_dims(array_shuffle('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[]));
+SELECT array_dims(array_shuffle('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[]));
+
+-- array_sample
+SELECT array_sample('{1,2,3,4,5,6}'::int[], 3) <@ '{1,2,3,4,5,6}';
+SELECT array_length(array_sample('{1,2,3,4,5,6}'::int[], 3), 1);
+SELECT array_dims(array_sample('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[], 3));
+SELECT array_dims(array_sample('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[], 2));
+SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail
+SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail