Fix two issues in TOAST decompression.

author Tom Lane <tgl@sss.pgh.pa.us>

Sun, 1 Nov 2020 23:38:42 +0000 (18:38 -0500)

committer Tom Lane <tgl@sss.pgh.pa.us>

Sun, 1 Nov 2020 23:38:42 +0000 (18:38 -0500)
author Tom Lane <tgl@sss.pgh.pa.us>
Sun, 1 Nov 2020 23:38:42 +0000 (18:38 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sun, 1 Nov 2020 23:38:42 +0000 (18:38 -0500)
diff --git a/src/common/pg_lzcompress.c b/src/common/pg_lzcompress.c

index d24d4803a9839fff48c36d46aadc5854fd5fad66..79747767ce08a0144b85d3496d8def6b74071825 100644 (file)
--- a/src/common/pg_lzcompress.c
+++ b/src/common/pg_lzcompress.c
@@ -710,7 +710,6 @@ pglz_decompress(const char *source, int32 slen, char *dest,
  
                 for (ctrlc = 0; ctrlc < 8 && sp < srcend && dp < destend; ctrlc++)
                 {
-
                         if (ctrl & 1)
                         {
                                 /*
@@ -732,41 +731,62 @@ pglz_decompress(const char *source, int32 slen, char *dest,
                                         len += *sp++;
  
                                 /*
-                                * Now we copy the bytes specified by the tag from OUTPUT to
-                                * OUTPUT (copy len bytes from dp - off to dp). The copied
-                                * areas could overlap, to preven possible uncertainty, we
-                                * copy only non-overlapping regions.
+                                * Check for corrupt data: if we fell off the end of the
+                                * source, or if we obtained off = 0, we have problems.  (We
+                                * must check this, else we risk an infinite loop below in the
+                                * face of corrupt data.)
+                                */
+                               if (sp > srcend || off == 0)
+                                       break;
+
+                               /*
+                                * Don't emit more data than requested.
                                  */
                                 len = Min(len, destend - dp);
+
+                               /*
+                                * Now we copy the bytes specified by the tag from OUTPUT to
+                                * OUTPUT (copy len bytes from dp - off to dp).  The copied
+                                * areas could overlap, so to avoid undefined behavior in
+                                * memcpy(), be careful to copy only non-overlapping regions.
+                                *
+                                * Note that we cannot use memmove() instead, since while its
+                                * behavior is well-defined, it's also not what we want.
+                                */
                                 while (off < len)
                                 {
-                                       /*---------
-                                        * When offset is smaller than length - source and
-                                        * destination regions overlap. memmove() is resolving
-                                        * this overlap in an incompatible way with pglz. Thus we
-                                        * resort to memcpy()-ing non-overlapping regions.
-                                        *
-                                        * Consider input: 112341234123412341234
-                                        * At byte 5       here ^ we have match with length 16 and
-                                        * offset 4.       11234M(len=16, off=4)
-                                        * We are decoding first period of match and rewrite match
-                                        *                 112341234M(len=12, off=8)
-                                        *
-                                        * The same match is now at position 9, it points to the
-                                        * same start byte of output, but from another position:
-                                        * the offset is doubled.
-                                        *
-                                        * We iterate through this offset growth until we can
-                                        * proceed to usual memcpy(). If we would try to decode
-                                        * the match at byte 5 (len=16, off=4) by memmove() we
-                                        * would issue memmove(5, 1, 16) which would produce
-                                        * 112341234XXXXXXXXXXXX, where series of X is 12
-                                        * undefined bytes, that were at bytes [5:17].
-                                        *---------
+                                       /*
+                                        * We can safely copy "off" bytes since that clearly
+                                        * results in non-overlapping source and destination.
                                          */
                                         memcpy(dp, dp - off, off);
                                         len -= off;
                                         dp += off;
+
+                                       /*----------
+                                        * This bit is less obvious: we can double "off" after
+                                        * each such step.  Consider this raw input:
+                                        *              112341234123412341234
+                                        * This will be encoded as 5 literal bytes "11234" and
+                                        * then a match tag with length 16 and offset 4.  After
+                                        * memcpy'ing the first 4 bytes, we will have emitted
+                                        *              112341234
+                                        * so we can double "off" to 8, then after the next step
+                                        * we have emitted
+                                        *              11234123412341234
+                                        * Then we can double "off" again, after which it is more
+                                        * than the remaining "len" so we fall out of this loop
+                                        * and finish with a non-overlapping copy of the
+                                        * remainder.  In general, a match tag with off < len
+                                        * implies that the decoded data has a repeat length of
+                                        * "off".  We can handle 1, 2, 4, etc repetitions of the
+                                        * repeated string per memcpy until we get to a situation
+                                        * where the final copy step is non-overlapping.
+                                        *
+                                        * (Another way to understand this is that we are keeping
+                                        * the copy source point dp - off the same throughout.)
+                                        *----------
+                                        */
                                         off += off;
                                 }
                                 memcpy(dp, dp - off, len);
@@ -820,21 +840,32 @@ pglz_decompress(const char *source, int32 slen, char *dest,
  int32
  pglz_maximum_compressed_size(int32 rawsize, int32 total_compressed_size)
  {
-       int32           compressed_size;
+       int64           compressed_size;
  
         /*
-        * pglz uses one control bit per byte, so we need (rawsize * 9) bits. We
-        * care about bytes though, so we add 7 to make sure we include the last
-        * incomplete byte (integer division rounds down).
+        * pglz uses one control bit per byte, so if the entire desired prefix is
+        * represented as literal bytes, we'll need (rawsize * 9) bits.  We care
+        * about bytes though, so be sure to round up not down.
          *
-        * XXX Use int64 to prevent overflow during calculation.
+        * Use int64 here to prevent overflow during calculation.
+        */
+       compressed_size = ((int64) rawsize * 9 + 7) / 8;
+
+       /*
+        * The above fails to account for a corner case: we could have compressed
+        * data that starts with N-1 or N-2 literal bytes and then has a match tag
+        * of 2 or 3 bytes.  It's therefore possible that we need to fetch 1 or 2
+        * more bytes in order to have the whole match tag.  (Match tags earlier
+        * in the compressed data don't cause a problem, since they should
+        * represent more decompressed bytes than they occupy themselves.)
          */
-       compressed_size = (int32) ((int64) rawsize * 9 + 7) / 8;
+       compressed_size += 2;
  
         /*
          * Maximum compressed size can't be larger than total compressed size.
+        * (This also ensures that our result fits in int32.)
          */
         compressed_size = Min(compressed_size, total_compressed_size);
  
-       return compressed_size;
+       return (int32) compressed_size;
  }
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 1 Nov 2020 23:38:42 +0000 (18:38 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 1 Nov 2020 23:38:42 +0000 (18:38 -0500)