Add base32hex support to encode() and decode() functions.

This adds support for base32hex encoding and decoding, as defined in RFC 4648 Section 7. Unlike standard base32, base32hex uses the extended hex alphabet (0-9, A-V) which preserves the lexicographical order of the encoded data. This is particularly useful for representing UUIDv7 values in a compact string format while maintaining their time-ordered sort property. The encode() function produces output padded with '=', while decode() accepts both padded and unpadded input. Following the behavior of other encoding types, decoding is case-insensitive.

Suggested-by: Sergey Prokhorenko <sergeyprokhorenko@yahoo.com.au>
Author: Andrey Borodin <x4mmm@yandex-team.ru>
Co-authored-by: Aleksander Alekseev <aleksander@tigerdata.com>
Reviewed-by: Masahiko Sawada <sawada.mshk@gmail.com>
Reviewed-by: Илья Чердаков <i.cherdakov.pg@gmail.com>
Reviewed-by: Chengxi Sun <chengxisun92@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/CAJ7c6TOramr1UTLcyB128LWMqita1Y7%3Darq3KHaU%3Dqikf5yKOQ%40mail.gmail.com
2026-05-16 11:29:49 -04:00 · 2026-03-25 11:35:19 -07:00 · 2026-03-25 11:35:19 -07:00 · 497c1170cb
commit 497c1170cb
parent c8b4a3ec08
6 changed files with 420 additions and 13 deletions
--- a/doc/src/sgml/func/func-binarystring.sgml
+++ b/doc/src/sgml/func/func-binarystring.sgml
@ -727,6 +727,7 @@
      <para>
       Encodes binary data into a textual representation; supported
       <parameter>format</parameter> values are:
+       <link linkend="encode-format-base32hex"><literal>base32hex</literal></link>,
       <link linkend="encode-format-base64"><literal>base64</literal></link>,
       <link linkend="encode-format-base64url"><literal>base64url</literal></link>,
       <link linkend="encode-format-escape"><literal>escape</literal></link>,
@ -766,6 +767,32 @@
   functions support the following textual formats:

   <variablelist>
+    <varlistentry id="encode-format-base32hex">
+     <term>base32hex
+      <indexterm>
+       <primary>base32hex format</primary>
+      </indexterm></term>
+     <listitem>
+      <para>
+       The <literal>base32hex</literal> format is that of
+       <ulink url="https://datatracker.ietf.org/doc/html/rfc4648#section-7">
+       RFC 4648 Section 7</ulink>.  It uses the extended hex alphabet
+       (<literal>0</literal>-<literal>9</literal> and
+       <literal>A</literal>-<literal>V</literal>) which preserves the lexicographical
+       sort order of the encoded data. The <function>encode</function> function
+       produces output padded with <literal>'='</literal>, while <function>decode</function>
+       accepts both padded and unpadded input. Decoding is case-insensitive and ignores
+       whitespace characters.
+      </para>
+      <para>
+       This format is useful for encoding UUIDs in a compact, sortable format:
+       <literal>rtrim(encode(uuid_value::bytea, 'base32hex'), '=')</literal>
+       produces a 26-character string compared to the standard 36-character
+       UUID representation.
+      </para>
+     </listitem>
+    </varlistentry>
+
    <varlistentry id="encode-format-base64">
     <term>base64
     <indexterm>
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@ -65,8 +65,8 @@ binary_encode(PG_FUNCTION_ARGS)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("unrecognized encoding: \"%s\"", namebuf),
-				 errhint("Valid encodings are \"%s\", \"%s\", \"%s\", and \"%s\".",
-						 "base64", "base64url", "escape", "hex")));
+				 errhint("Valid encodings are \"%s\", \"%s\", \"%s\", \"%s\", and \"%s\".",
+						 "base32hex", "base64", "base64url", "escape", "hex")));

 	dataptr = VARDATA_ANY(data);
 	datalen = VARSIZE_ANY_EXHDR(data);
@ -115,8 +115,8 @@ binary_decode(PG_FUNCTION_ARGS)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("unrecognized encoding: \"%s\"", namebuf),
-				 errhint("Valid encodings are \"%s\", \"%s\", \"%s\", and \"%s\".",
-						 "base64", "base64url", "escape", "hex")));
+				 errhint("Valid encodings are \"%s\", \"%s\", \"%s\", \"%s\", and \"%s\".",
+						 "base32hex", "base64", "base64url", "escape", "hex")));

 	dataptr = VARDATA_ANY(data);
 	datalen = VARSIZE_ANY_EXHDR(data);
@ -825,6 +825,153 @@ esc_dec_len(const char *src, size_t srclen)
 	return len;
 }

+/*
+ * BASE32HEX
+ *
+ * base32hex_table maps a 5-bit value (0-31) to its output character:
+ * '0'-'9' for 0-9, then uppercase 'A'-'V' for 10-31.
+ *
+ * b32hexlookup is the reverse mapping, indexed by ASCII code (0-127).  It
+ * holds the 5-bit value of each alphabet character and -1 for every other
+ * code.  Upper- and lower-case letters carry the same values, so lookups
+ * through this table are case-insensitive.  The table has no entries for
+ * codes 128 and above.
+ */
+
+static const char base32hex_table[] = "0123456789ABCDEFGHIJKLMNOPQRSTUV";
+
+static const int8 b32hexlookup[128] = {
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,	/* 0x00-0x0F */
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,	/* 0x10-0x1F */
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,	/* 0x20-0x2F */
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,	/* '0'-'9', then ':'-'?' */
+	-1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,	/* '@', 'A'-'O' */
+	25, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, -1,	/* 'P'-'V', then 'W'-'_' */
+	-1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,	/* '`', 'a'-'o' */
+	25, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, -1,	/* 'p'-'v', then 'w'-0x7F */
+};
+
+/*
+ * base32hex_enc_len
+ *		Number of characters in the base32hex encoding of srclen bytes.
+ *
+ * Input is taken in groups of 5 bytes, and every group -- including a
+ * partial final one -- occupies 8 output characters once padded.  The
+ * arithmetic is done in uint64.  src is not examined.
+ */
+static uint64
+base32hex_enc_len(const char *src, size_t srclen)
+{
+	uint64		ngroups = ((uint64) srclen + 4) / 5;
+
+	return ngroups * 8;
+}
+
+/*
+ * base32hex_dec_len
+ *		Upper bound on the number of bytes obtained by decoding srclen
+ *		characters of base32hex text.
+ *
+ * Each input character contributes at most 5 bits, and only whole bytes are
+ * ever produced, so the bound is the total bit count divided by 8, rounded
+ * down.  src is not examined.
+ */
+static uint64
+base32hex_dec_len(const char *src, size_t srclen)
+{
+	uint64		nbits = (uint64) srclen * 5;
+
+	return nbits / 8;
+}
+
+/*
+ * base32hex_encode
+ *		Encode srclen bytes at src as base32hex text, written to dst.
+ *
+ * The output is padded with '=' to a multiple of 8 characters and is not
+ * NUL-terminated.  Returns the number of characters written, which equals
+ * base32hex_enc_len() for the same srclen.
+ */
+static uint64
+base32hex_encode(const char *src, size_t srclen, char *dst)
+{
+	const unsigned char *in = (const unsigned char *) src;
+	char	   *out = dst;
+	uint32		acc = 0;		/* bits read but not yet written out */
+	int			nbits = 0;		/* number of valid low-order bits in acc */
+	size_t		remaining;
+
+	for (remaining = srclen; remaining > 0; remaining--)
+	{
+		/* Shift the next input byte into the accumulator. */
+		acc = (acc << 8) | *in++;
+		nbits += 8;
+
+		/*
+		 * A fresh byte always completes at least one 5-bit group.  Emit
+		 * groups from the most significant end and drop them from acc.
+		 */
+		do
+		{
+			nbits -= 5;
+			*out++ = base32hex_table[(acc >> nbits) & 0x1F];
+			acc &= (1U << nbits) - 1;
+		} while (nbits >= 5);
+	}
+
+	/* Left-align the 1-4 leftover bits, if any, into a final character. */
+	if (nbits > 0)
+		*out++ = base32hex_table[(acc << (5 - nbits)) & 0x1F];
+
+	/* Pad with '=' up to the next multiple of 8 characters (RFC 4648). */
+	while ((out - dst) % 8 != 0)
+		*out++ = '=';
+
+	return (uint64) (out - dst);
+}
+
+/*
+ * base32hex_decode
+ *		Decode srclen bytes of base32hex text at src into binary data at dst.
+ *
+ * Space, tab, newline and carriage return are skipped wherever they occur.
+ * Letters are accepted in either case.  Trailing '=' padding is optional;
+ * when present, the first '=' must sit at offset 2, 4, 5 or 7 of its
+ * 8-character group, and nothing but '=' or whitespace may follow it.  Bits
+ * left over at the end of the input that do not fill a whole byte are
+ * discarded.
+ *
+ * Returns the number of bytes stored in dst.  Reports an error for a
+ * misplaced '=' and for any character outside the alphabet.
+ */
+static uint64
+base32hex_decode(const char *src, size_t srclen, char *dst)
+{
+	const char *const limit = src + srclen;
+	const char *cur;
+	char	   *out = dst;
+	uint32		acc = 0;		/* decoded bits not yet written out */
+	int			nbits = 0;		/* number of valid low-order bits in acc */
+	int			group_pos = 0;	/* offset within current 8-character group */
+	bool		padded = false; /* set once the first '=' has been seen */
+
+	for (cur = src; cur < limit; cur++)
+	{
+		unsigned char ch = (unsigned char) *cur;
+		int			digit;
+
+		/* Whitespace carries no data and does not advance group_pos. */
+		if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')
+			continue;
+
+		if (ch == '=')
+		{
+			/*
+			 * Only the first '=' is position-checked.  A final group holding
+			 * 1, 2, 3 or 4 decoded bytes consists of 2, 4, 5 or 7 data
+			 * characters, so those are the only offsets at which padding may
+			 * begin.
+			 */
+			if (!padded)
+			{
+				switch (group_pos)
+				{
+					case 2:
+					case 4:
+					case 5:
+					case 7:
+						break;
+					default:
+						ereport(ERROR,
+								(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+								 errmsg("unexpected \"=\" while decoding base32hex sequence")));
+						break;
+				}
+				padded = true;
+			}
+			group_pos++;
+			continue;
+		}
+
+		/*
+		 * Anything else must be an alphabet character (0-9, A-V in either
+		 * case), and none may appear once padding has started.  Bytes with
+		 * the high bit set are rejected without consulting the 128-entry
+		 * lookup table.
+		 */
+		digit = (ch < 128) ? b32hexlookup[ch] : -1;
+		if (padded || digit < 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("invalid symbol \"%.*s\" found while decoding base32hex sequence",
+							pg_mblen_range(cur, limit), cur)));
+
+		/* Append the 5 new bits and advance within the group, wrapping at 8. */
+		acc = (acc << 5) | (uint32) digit;
+		nbits += 5;
+		if (++group_pos == 8)
+			group_pos = 0;
+
+		/*
+		 * Fewer than 8 bits were pending before this character, so at most
+		 * one whole byte can be ready now.  Emit it and keep the remainder.
+		 */
+		if (nbits >= 8)
+		{
+			nbits -= 8;
+			*out++ = (unsigned char) (acc >> nbits);
+			acc &= (1U << nbits) - 1;
+		}
+	}
+
+	return (uint64) (out - dst);
+}
+
 /*
 * Common
 */
@ -854,6 +1001,12 @@ static const struct
 			pg_base64url_enc_len, pg_base64url_dec_len, pg_base64url_encode, pg_base64url_decode
 		}
 	},
+	{
+		"base32hex",
+		{
+			base32hex_enc_len, base32hex_dec_len, base32hex_encode, base32hex_decode
+		}
+	},
 	{
 		"escape",
 		{
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@ -2600,14 +2600,170 @@ SELECT decode(encode('\x1234567890abcdef00', 'escape'), 'escape');
 -- report an error with a hint listing valid encodings when an invalid encoding is specified
 SELECT encode('\x01'::bytea, 'invalid');  -- error
 ERROR:  unrecognized encoding: "invalid"
-HINT:  Valid encodings are "base64", "base64url", "escape", and "hex".
+HINT:  Valid encodings are "base32hex", "base64", "base64url", "escape", and "hex".
 SELECT decode('00', 'invalid');           -- error
 ERROR:  unrecognized encoding: "invalid"
-HINT:  Valid encodings are "base64", "base64url", "escape", and "hex".
+HINT:  Valid encodings are "base32hex", "base64", "base64url", "escape", and "hex".
+--
+-- base32hex encoding/decoding
+--
+SET bytea_output TO hex;
+SELECT encode('', 'base32hex');  -- ''
+ encode 
+--------
+ 
+(1 row)
+
+SELECT encode('\x11', 'base32hex');  -- '24======'
+  encode  
+----------
+ 24======
+(1 row)
+
+SELECT encode('\x1122', 'base32hex');  -- '24H0===='
+  encode  
+----------
+ 24H0====
+(1 row)
+
+SELECT encode('\x112233', 'base32hex');  -- '24H36==='
+  encode  
+----------
+ 24H36===
+(1 row)
+
+SELECT encode('\x11223344', 'base32hex');  -- '24H36H0='
+  encode  
+----------
+ 24H36H0=
+(1 row)
+
+SELECT encode('\x1122334455', 'base32hex');  -- '24H36H2L'
+  encode  
+----------
+ 24H36H2L
+(1 row)
+
+SELECT encode('\x112233445566', 'base32hex');  -- '24H36H2LCO======'
+      encode      
+------------------
+ 24H36H2LCO======
+(1 row)
+
+SELECT decode('', 'base32hex');  -- ''
+ decode 
+--------
+ \x
+(1 row)
+
+SELECT decode('24======', 'base32hex');  -- \x11
+ decode 
+--------
+ \x11
+(1 row)
+
+SELECT decode('24H0====', 'base32hex');  -- \x1122
+ decode 
+--------
+ \x1122
+(1 row)
+
+SELECT decode('24H36===', 'base32hex');  -- \x112233
+  decode  
+----------
+ \x112233
+(1 row)
+
+SELECT decode('24H36H0=', 'base32hex');  -- \x11223344
+   decode   
+------------
+ \x11223344
+(1 row)
+
+SELECT decode('24H36H2L', 'base32hex');  -- \x1122334455
+    decode    
+--------------
+ \x1122334455
+(1 row)
+
+SELECT decode('24H36H2LCO======', 'base32hex');  -- \x112233445566
+     decode     
+----------------
+ \x112233445566
+(1 row)
+
+SELECT decode('24h36h2lco', 'base32hex');  -- OK, the encoding is case-insensitive
+     decode     
+----------------
+ \x112233445566
+(1 row)
+
+-- Tests for decoding unpadded base32hex strings. Padding '=' are optional.
+SELECT decode('24', 'base32hex');
+ decode 
+--------
+ \x11
+(1 row)
+
+SELECT decode('24H', 'base32hex');
+ decode 
+--------
+ \x11
+(1 row)
+
+SELECT decode('24H36', 'base32hex');
+  decode  
+----------
+ \x112233
+(1 row)
+
+SELECT decode('24H36H0', 'base32hex');
+   decode   
+------------
+ \x11223344
+(1 row)
+
+SELECT decode('2', 'base32hex'); -- \x, 5 bits isn't enough for a byte, so nothing is emitted
+ decode 
+--------
+ \x
+(1 row)
+
+SELECT decode('11=', 'base32hex');  -- OK, non-zero padding bits are accepted (consistent with base64)
+ decode 
+--------
+ \x08
+(1 row)
+
+SELECT decode('2=', 'base32hex'); -- error
+ERROR:  unexpected "=" while decoding base32hex sequence
+SELECT decode('=', 'base32hex');  -- error
+ERROR:  unexpected "=" while decoding base32hex sequence
+SELECT decode('W', 'base32hex');  -- error
+ERROR:  invalid symbol "W" found while decoding base32hex sequence
+SELECT decode('24H36H0=24', 'base32hex'); -- error
+ERROR:  invalid symbol "2" found while decoding base32hex sequence
+-- Check round-trip capability of base32hex encoding for multiple random UUIDs.
+DO $$
+DECLARE
+  v1 uuid;
+  v2 uuid;
+BEGIN
+  FOR i IN 1..10 LOOP
+    v1 := gen_random_uuid();
+    v2 := decode(encode(v1::bytea, 'base32hex'), 'base32hex')::uuid;
+
+    IF v1 != v2 THEN
+      RAISE EXCEPTION 'base32hex encoding round-trip failed, expected % got %', v1, v2;
+    END IF;
+  END LOOP;
+  RAISE NOTICE 'OK';
+END;
+$$;
+NOTICE:  OK
 --
 -- base64url encoding/decoding
 --
-SET bytea_output TO hex;
 -- Simple encoding/decoding
 SELECT encode('\x69b73eff', 'base64url');  -- abc-_w
 encode 
--- a/src/test/regress/expected/uuid.out
+++ b/src/test/regress/expected/uuid.out
@ -13,7 +13,8 @@ CREATE TABLE guid2
 CREATE TABLE guid3
 (
 	id SERIAL,
-	guid_field UUID
+	guid_field UUID,
+	guid_encoded text GENERATED ALWAYS AS (encode(guid_field::bytea, 'base32hex')) STORED
 );
 -- inserting invalid data tests
 -- too long
@ -226,11 +227,20 @@ SELECT count(DISTINCT guid_field) FROM guid1;
 (1 row)

 -- test sortability of v7
+INSERT INTO guid3 (guid_field) VALUES ('00000000-0000-0000-0000-000000000000'::uuid);
 INSERT INTO guid3 (guid_field) SELECT uuidv7() FROM generate_series(1, 10);
+INSERT INTO guid3 (guid_field) VALUES ('ffffffff-ffff-ffff-ffff-ffffffffffff'::uuid);
 SELECT array_agg(id ORDER BY guid_field) FROM guid3;
-       array_agg        
------------------------
- {1,2,3,4,5,6,7,8,9,10}
+          array_agg           
+------------------------------
+ {1,2,3,4,5,6,7,8,9,10,11,12}
+(1 row)
+
+-- make sure base32hex encoding works with UUIDs and preserves ordering
+SELECT array_agg(id ORDER BY guid_encoded) FROM guid3;
+          array_agg           
+------------------------------
+ {1,2,3,4,5,6,7,8,9,10,11,12}
 (1 row)

 -- Check the timestamp offsets for v7.
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@ -835,10 +835,65 @@ SELECT encode('\x01'::bytea, 'invalid');  -- error
 SELECT decode('00', 'invalid');           -- error

 --
-- base64url encoding/decoding
+-- base32hex encoding/decoding
 --
 SET bytea_output TO hex;

+SELECT encode('', 'base32hex');  -- ''
+SELECT encode('\x11', 'base32hex');  -- '24======'
+SELECT encode('\x1122', 'base32hex');  -- '24H0===='
+SELECT encode('\x112233', 'base32hex');  -- '24H36==='
+SELECT encode('\x11223344', 'base32hex');  -- '24H36H0='
+SELECT encode('\x1122334455', 'base32hex');  -- '24H36H2L'
+SELECT encode('\x112233445566', 'base32hex');  -- '24H36H2LCO======'
+
+SELECT decode('', 'base32hex');  -- ''
+SELECT decode('24======', 'base32hex');  -- \x11
+SELECT decode('24H0====', 'base32hex');  -- \x1122
+SELECT decode('24H36===', 'base32hex');  -- \x112233
+SELECT decode('24H36H0=', 'base32hex');  -- \x11223344
+SELECT decode('24H36H2L', 'base32hex');  -- \x1122334455
+SELECT decode('24H36H2LCO======', 'base32hex');  -- \x112233445566
+
+SELECT decode('24h36h2lco', 'base32hex');  -- OK, the encoding is case-insensitive
+
+-- Tests for decoding unpadded base32hex strings. Padding '=' are optional.
+SELECT decode('24', 'base32hex');
+SELECT decode('24H', 'base32hex');
+SELECT decode('24H36', 'base32hex');
+SELECT decode('24H36H0', 'base32hex');
+
+SELECT decode('2', 'base32hex'); -- \x, 5 bits isn't enough for a byte, so nothing is emitted
+SELECT decode('11=', 'base32hex');  -- OK, non-zero padding bits are accepted (consistent with base64)
+
+SELECT decode('2=', 'base32hex'); -- error
+SELECT decode('=', 'base32hex');  -- error
+SELECT decode('W', 'base32hex');  -- error
+SELECT decode('24H36H0=24', 'base32hex'); -- error
+
+-- Check round-trip capability of base32hex encoding for multiple random UUIDs.
+DO $$
+DECLARE
+  v1 uuid;
+  v2 uuid;
+BEGIN
+  FOR i IN 1..10 LOOP
+    v1 := gen_random_uuid();
+    v2 := decode(encode(v1::bytea, 'base32hex'), 'base32hex')::uuid;
+
+    IF v1 != v2 THEN
+      RAISE EXCEPTION 'base32hex encoding round-trip failed, expected % got %', v1, v2;
+    END IF;
+  END LOOP;
+  RAISE NOTICE 'OK';
+END;
+$$;
+
+
+--
+-- base64url encoding/decoding
+--
+
 -- Simple encoding/decoding
 SELECT encode('\x69b73eff', 'base64url');  -- abc-_w
 SELECT decode('abc-_w', 'base64url');      -- \x69b73eff
--- a/src/test/regress/sql/uuid.sql
+++ b/src/test/regress/sql/uuid.sql
@ -13,7 +13,8 @@ CREATE TABLE guid2
 CREATE TABLE guid3
 (
 	id SERIAL,
-	guid_field UUID
+	guid_field UUID,
+	guid_encoded text GENERATED ALWAYS AS (encode(guid_field::bytea, 'base32hex')) STORED
 );

 -- inserting invalid data tests
@ -116,9 +117,14 @@ INSERT INTO guid1 (guid_field) VALUES (uuidv7(INTERVAL '1 day'));
 SELECT count(DISTINCT guid_field) FROM guid1;

 -- test sortability of v7
+INSERT INTO guid3 (guid_field) VALUES ('00000000-0000-0000-0000-000000000000'::uuid);
 INSERT INTO guid3 (guid_field) SELECT uuidv7() FROM generate_series(1, 10);
+INSERT INTO guid3 (guid_field) VALUES ('ffffffff-ffff-ffff-ffff-ffffffffffff'::uuid);
 SELECT array_agg(id ORDER BY guid_field) FROM guid3;

+-- make sure base32hex encoding works with UUIDs and preserves ordering
+SELECT array_agg(id ORDER BY guid_encoded) FROM guid3;
+
 -- Check the timestamp offsets for v7.
 --
 -- generate UUIDv7 values with timestamps ranging from 1970 (the Unix epoch year)