Add base32hex support to encode() and decode() functions.

This adds support for base32hex encoding and decoding, as defined in
RFC 4648 Section 7. Unlike standard base32, base32hex uses the
extended hex alphabet (0-9, A-V) which preserves the lexicographical
order of the encoded data.

This is particularly useful for representing UUIDv7 values in a
compact string format while maintaining their time-ordered sort
property.

The encode() function produces output padded with '=', while decode()
accepts both padded and unpadded input. Following the behavior of
other encoding types, decoding is case-insensitive.

Suggested-by: Sergey Prokhorenko <sergeyprokhorenko@yahoo.com.au>
Author: Andrey Borodin <x4mmm@yandex-team.ru>
Co-authored-by: Aleksander Alekseev <aleksander@tigerdata.com>
Reviewed-by: Masahiko Sawada <sawada.mshk@gmail.com>
Reviewed-by: Илья Чердаков <i.cherdakov.pg@gmail.com>
Reviewed-by: Chengxi Sun <chengxisun92@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/CAJ7c6TOramr1UTLcyB128LWMqita1Y7%3Darq3KHaU%3Dqikf5yKOQ%40mail.gmail.com
This commit is contained in:
Masahiko Sawada 2026-03-25 11:35:19 -07:00
parent c8b4a3ec08
commit 497c1170cb
6 changed files with 420 additions and 13 deletions

View file

@ -727,6 +727,7 @@
<para>
Encodes binary data into a textual representation; supported
<parameter>format</parameter> values are:
<link linkend="encode-format-base32hex"><literal>base32hex</literal></link>,
<link linkend="encode-format-base64"><literal>base64</literal></link>,
<link linkend="encode-format-base64url"><literal>base64url</literal></link>,
<link linkend="encode-format-escape"><literal>escape</literal></link>,
@ -766,6 +767,32 @@
functions support the following textual formats:
<variablelist>
<varlistentry id="encode-format-base32hex">
<term>base32hex
<indexterm>
<primary>base32hex format</primary>
</indexterm></term>
<listitem>
<para>
The <literal>base32hex</literal> format is that of
<ulink url="https://datatracker.ietf.org/doc/html/rfc4648#section-7">
RFC 4648 Section 7</ulink>. It uses the extended hex alphabet
(<literal>0</literal>-<literal>9</literal> and
<literal>A</literal>-<literal>V</literal>), so that the encoded text sorts in the
same lexicographical order as the original binary data. The <function>encode</function> function
produces output padded with <literal>'='</literal>, while <function>decode</function>
accepts both padded and unpadded input. Decoding is case-insensitive and ignores
whitespace characters.
</para>
<para>
This format is useful for encoding UUIDs in a compact, sortable format:
<literal>rtrim(encode(uuid_value::bytea, 'base32hex'), '=')</literal>
produces a 26-character string compared to the standard 36-character
UUID representation.
</para>
</listitem>
</varlistentry>
<varlistentry id="encode-format-base64">
<term>base64
<indexterm>

View file

@ -65,8 +65,8 @@ binary_encode(PG_FUNCTION_ARGS)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized encoding: \"%s\"", namebuf),
errhint("Valid encodings are \"%s\", \"%s\", \"%s\", and \"%s\".",
"base64", "base64url", "escape", "hex")));
errhint("Valid encodings are \"%s\", \"%s\", \"%s\", \"%s\", and \"%s\".",
"base32hex", "base64", "base64url", "escape", "hex")));
dataptr = VARDATA_ANY(data);
datalen = VARSIZE_ANY_EXHDR(data);
@ -115,8 +115,8 @@ binary_decode(PG_FUNCTION_ARGS)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized encoding: \"%s\"", namebuf),
errhint("Valid encodings are \"%s\", \"%s\", \"%s\", and \"%s\".",
"base64", "base64url", "escape", "hex")));
errhint("Valid encodings are \"%s\", \"%s\", \"%s\", \"%s\", and \"%s\".",
"base32hex", "base64", "base64url", "escape", "hex")));
dataptr = VARDATA_ANY(data);
datalen = VARSIZE_ANY_EXHDR(data);
@ -825,6 +825,153 @@ esc_dec_len(const char *src, size_t srclen)
return len;
}
/*
* BASE32HEX
*/
/*
 * Encoding alphabet: the character stored at index i represents the 5-bit
 * value i, i.e. '0'-'9' stand for 0-9 and uppercase 'A'-'V' stand for 10-31.
 */
static const char base32hex_table[] = "0123456789ABCDEFGHIJKLMNOPQRSTUV";
/*
 * Decoding table, indexed by 7-bit ASCII code.  Each entry is the 5-bit value
 * of that character, or -1 if the character is not part of the alphabet.
 * Upper- and lower-case letters map to the same values, which is what makes
 * decoding case-insensitive.  The table only has 128 entries, so the index
 * must be checked to be below 128 before it is used.
 */
static const int8 b32hexlookup[128] = {
/* 0x00 - 0x0F: not in the alphabet */
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
/* 0x10 - 0x1F: not in the alphabet */
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
/* 0x20 - 0x2F: not in the alphabet */
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
/* 0x30 - 0x3F: '0'-'9' map to 0-9 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
/* 0x40 - 0x4F: 'A'-'O' map to 10-24 */
-1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
/* 0x50 - 0x5F: 'P'-'V' map to 25-31 */
25, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, -1,
/* 0x60 - 0x6F: 'a'-'o' map to 10-24 */
-1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
/* 0x70 - 0x7F: 'p'-'v' map to 25-31 */
25, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};
static uint64
base32hex_enc_len(const char *src, size_t srclen)
{
/* 5 bytes encode to 8 characters, round up to multiple of 8 for padding */
return ((uint64) srclen + 4) / 5 * 8;
}
static uint64
base32hex_dec_len(const char *src, size_t srclen)
{
/* Each 8 characters of input produces at most 5 bytes of output */
return ((uint64) srclen * 5) / 8;
}
/*
 * Encode srclen bytes starting at src as base32hex text in dst.
 *
 * The output is padded with '=' up to a multiple of 8 characters and is not
 * NUL-terminated.  Returns the number of characters written to dst.
 */
static uint64
base32hex_encode(const char *src, size_t srclen, char *dst)
{
	const unsigned char *in = (const unsigned char *) src;
	size_t		remaining;
	uint32		acc = 0;		/* pending bits, right-aligned */
	int			nbits = 0;		/* number of pending bits held in acc */
	uint64		outlen = 0;

	for (remaining = srclen; remaining > 0; remaining--)
	{
		/* Append the next input byte below the bits already pending */
		acc = (acc << 8) | *in++;
		nbits += 8;

		/* Emit one symbol for every complete 5-bit chunk */
		while (nbits >= 5)
		{
			nbits -= 5;
			dst[outlen++] = base32hex_table[(acc >> nbits) & 0x1F];

			/* Keep only the bits that have not been emitted yet */
			acc &= ((1U << nbits) - 1);
		}
	}

	/* Left-align any leftover bits in a final symbol, zero-filling the rest */
	if (nbits > 0)
		dst[outlen++] = base32hex_table[(acc << (5 - nbits)) & 0x1F];

	/* Pad with '=' to a multiple of 8 characters (per RFC 4648) */
	for (; outlen % 8 != 0; outlen++)
		dst[outlen] = '=';

	return outlen;
}
static uint64
base32hex_decode(const char *src, size_t srclen, char *dst)
{
const char *srcend = src + srclen,
*s = src;
uint32 bits_buffer = 0;
int bits_in_buffer = 0;
uint64 output_pos = 0;
int pos = 0; /* position within 8-character group (0-7) */
bool end = false; /* have we seen padding? */
while (s < srcend)
{
char c = *s++;
int val;
/* Skip whitespace */
if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
continue;
if (c == '=')
{
/*
* The first padding is only valid at positions 2, 4, 5, or 7
* within an 8-character group (corresponding to 1, 2, 3, or 4
* input bytes). We only check the position for the first '='
* character.
*/
if (!end)
{
if (pos != 2 && pos != 4 && pos != 5 && pos != 7)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unexpected \"=\" while decoding base32hex sequence")));
end = true;
}
pos++;
continue;
}
/* No data characters allowed after padding */
if (end)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid symbol \"%.*s\" found while decoding base32hex sequence",
pg_mblen_range(s - 1, srcend), s - 1)));
/* Decode base32hex character (0-9, A-V, case-insensitive) */
val = -1;
if ((unsigned char) c < 128)
val = b32hexlookup[(unsigned char) c];
if (val < 0)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid symbol \"%.*s\" found while decoding base32hex sequence",
pg_mblen_range(s - 1, srcend), s - 1)));
/* Add 5 bits to buffer */
bits_buffer = (bits_buffer << 5) | val;
bits_in_buffer += 5;
pos++;
/* Extract 8-bit bytes when we have enough bits */
while (bits_in_buffer >= 8)
{
bits_in_buffer -= 8;
dst[output_pos++] = (unsigned char) (bits_buffer >> bits_in_buffer);
/* Clear the extracted bits */
bits_buffer &= ((1U << bits_in_buffer) - 1);
}
/* Reset position after each complete 8-character group */
if (pos == 8)
pos = 0;
}
return output_pos;
}
/*
* Common
*/
@ -854,6 +1001,12 @@ static const struct
pg_base64url_enc_len, pg_base64url_dec_len, pg_base64url_encode, pg_base64url_decode
}
},
{
"base32hex",
{
base32hex_enc_len, base32hex_dec_len, base32hex_encode, base32hex_decode
}
},
{
"escape",
{

View file

@ -2600,14 +2600,170 @@ SELECT decode(encode('\x1234567890abcdef00', 'escape'), 'escape');
-- report an error with a hint listing valid encodings when an invalid encoding is specified
SELECT encode('\x01'::bytea, 'invalid'); -- error
ERROR: unrecognized encoding: "invalid"
HINT: Valid encodings are "base64", "base64url", "escape", and "hex".
HINT: Valid encodings are "base32hex", "base64", "base64url", "escape", and "hex".
SELECT decode('00', 'invalid'); -- error
ERROR: unrecognized encoding: "invalid"
HINT: Valid encodings are "base64", "base64url", "escape", and "hex".
HINT: Valid encodings are "base32hex", "base64", "base64url", "escape", and "hex".
--
-- base32hex encoding/decoding
--
SET bytea_output TO hex;
SELECT encode('', 'base32hex'); -- ''
encode
--------
(1 row)
SELECT encode('\x11', 'base32hex'); -- '24======'
encode
----------
24======
(1 row)
SELECT encode('\x1122', 'base32hex'); -- '24H0===='
encode
----------
24H0====
(1 row)
SELECT encode('\x112233', 'base32hex'); -- '24H36==='
encode
----------
24H36===
(1 row)
SELECT encode('\x11223344', 'base32hex'); -- '24H36H0='
encode
----------
24H36H0=
(1 row)
SELECT encode('\x1122334455', 'base32hex'); -- '24H36H2L'
encode
----------
24H36H2L
(1 row)
SELECT encode('\x112233445566', 'base32hex'); -- '24H36H2LCO======'
encode
------------------
24H36H2LCO======
(1 row)
SELECT decode('', 'base32hex'); -- ''
decode
--------
\x
(1 row)
SELECT decode('24======', 'base32hex'); -- \x11
decode
--------
\x11
(1 row)
SELECT decode('24H0====', 'base32hex'); -- \x1122
decode
--------
\x1122
(1 row)
SELECT decode('24H36===', 'base32hex'); -- \x112233
decode
----------
\x112233
(1 row)
SELECT decode('24H36H0=', 'base32hex'); -- \x11223344
decode
------------
\x11223344
(1 row)
SELECT decode('24H36H2L', 'base32hex'); -- \x1122334455
decode
--------------
\x1122334455
(1 row)
SELECT decode('24H36H2LCO======', 'base32hex'); -- \x112233445566
decode
----------------
\x112233445566
(1 row)
SELECT decode('24h36h2lco', 'base32hex'); -- OK, the encoding is case-insensitive
decode
----------------
\x112233445566
(1 row)
-- Tests for decoding unpadded base32hex strings. Padding '=' are optional.
SELECT decode('24', 'base32hex');
decode
--------
\x11
(1 row)
SELECT decode('24H', 'base32hex');
decode
--------
\x11
(1 row)
SELECT decode('24H36', 'base32hex');
decode
----------
\x112233
(1 row)
SELECT decode('24H36H0', 'base32hex');
decode
------------
\x11223344
(1 row)
SELECT decode('2', 'base32hex'); -- \x, 5 bits isn't enough for a byte, so nothing is emitted
decode
--------
\x
(1 row)
SELECT decode('11=', 'base32hex'); -- OK, non-zero padding bits are accepted (consistent with base64)
decode
--------
\x08
(1 row)
SELECT decode('2=', 'base32hex'); -- error
ERROR: unexpected "=" while decoding base32hex sequence
SELECT decode('=', 'base32hex'); -- error
ERROR: unexpected "=" while decoding base32hex sequence
SELECT decode('W', 'base32hex'); -- error
ERROR: invalid symbol "W" found while decoding base32hex sequence
SELECT decode('24H36H0=24', 'base32hex'); -- error
ERROR: invalid symbol "2" found while decoding base32hex sequence
-- Check round-trip capability of base32hex encoding for multiple random UUIDs.
DO $$
DECLARE
v1 uuid;
v2 uuid;
BEGIN
FOR i IN 1..10 LOOP
v1 := gen_random_uuid();
v2 := decode(encode(v1::bytea, 'base32hex'), 'base32hex')::uuid;
IF v1 != v2 THEN
RAISE EXCEPTION 'base32hex encoding round-trip failed, expected % got %', v1, v2;
END IF;
END LOOP;
RAISE NOTICE 'OK';
END;
$$;
NOTICE: OK
--
-- base64url encoding/decoding
--
SET bytea_output TO hex;
-- Simple encoding/decoding
SELECT encode('\x69b73eff', 'base64url'); -- abc-_w
encode

View file

@ -13,7 +13,8 @@ CREATE TABLE guid2
CREATE TABLE guid3
(
id SERIAL,
guid_field UUID
guid_field UUID,
guid_encoded text GENERATED ALWAYS AS (encode(guid_field::bytea, 'base32hex')) STORED
);
-- inserting invalid data tests
-- too long
@ -226,11 +227,20 @@ SELECT count(DISTINCT guid_field) FROM guid1;
(1 row)
-- test sortability of v7
INSERT INTO guid3 (guid_field) VALUES ('00000000-0000-0000-0000-000000000000'::uuid);
INSERT INTO guid3 (guid_field) SELECT uuidv7() FROM generate_series(1, 10);
INSERT INTO guid3 (guid_field) VALUES ('ffffffff-ffff-ffff-ffff-ffffffffffff'::uuid);
SELECT array_agg(id ORDER BY guid_field) FROM guid3;
array_agg
------------------------
{1,2,3,4,5,6,7,8,9,10}
array_agg
------------------------------
{1,2,3,4,5,6,7,8,9,10,11,12}
(1 row)
-- make sure base32hex encoding works with UUIDs and preserves ordering
SELECT array_agg(id ORDER BY guid_encoded) FROM guid3;
array_agg
------------------------------
{1,2,3,4,5,6,7,8,9,10,11,12}
(1 row)
-- Check the timestamp offsets for v7.

View file

@ -835,10 +835,65 @@ SELECT encode('\x01'::bytea, 'invalid'); -- error
SELECT decode('00', 'invalid'); -- error
--
-- base64url encoding/decoding
-- base32hex encoding/decoding
--
SET bytea_output TO hex;
SELECT encode('', 'base32hex'); -- ''
SELECT encode('\x11', 'base32hex'); -- '24======'
SELECT encode('\x1122', 'base32hex'); -- '24H0===='
SELECT encode('\x112233', 'base32hex'); -- '24H36==='
SELECT encode('\x11223344', 'base32hex'); -- '24H36H0='
SELECT encode('\x1122334455', 'base32hex'); -- '24H36H2L'
SELECT encode('\x112233445566', 'base32hex'); -- '24H36H2LCO======'
SELECT decode('', 'base32hex'); -- ''
SELECT decode('24======', 'base32hex'); -- \x11
SELECT decode('24H0====', 'base32hex'); -- \x1122
SELECT decode('24H36===', 'base32hex'); -- \x112233
SELECT decode('24H36H0=', 'base32hex'); -- \x11223344
SELECT decode('24H36H2L', 'base32hex'); -- \x1122334455
SELECT decode('24H36H2LCO======', 'base32hex'); -- \x112233445566
SELECT decode('24h36h2lco', 'base32hex'); -- OK, the encoding is case-insensitive
-- Tests for decoding unpadded base32hex strings. Padding '=' are optional.
SELECT decode('24', 'base32hex');
SELECT decode('24H', 'base32hex');
SELECT decode('24H36', 'base32hex');
SELECT decode('24H36H0', 'base32hex');
SELECT decode('2', 'base32hex'); -- \x, 5 bits isn't enough for a byte, so nothing is emitted
SELECT decode('11=', 'base32hex'); -- OK, non-zero padding bits are accepted (consistent with base64)
SELECT decode('2=', 'base32hex'); -- error
SELECT decode('=', 'base32hex'); -- error
SELECT decode('W', 'base32hex'); -- error
SELECT decode('24H36H0=24', 'base32hex'); -- error
-- Check round-trip capability of base32hex encoding for multiple random UUIDs.
DO $$
DECLARE
v1 uuid;
v2 uuid;
BEGIN
FOR i IN 1..10 LOOP
v1 := gen_random_uuid();
v2 := decode(encode(v1::bytea, 'base32hex'), 'base32hex')::uuid;
IF v1 != v2 THEN
RAISE EXCEPTION 'base32hex encoding round-trip failed, expected % got %', v1, v2;
END IF;
END LOOP;
RAISE NOTICE 'OK';
END;
$$;
--
-- base64url encoding/decoding
--
-- Simple encoding/decoding
SELECT encode('\x69b73eff', 'base64url'); -- abc-_w
SELECT decode('abc-_w', 'base64url'); -- \x69b73eff

View file

@ -13,7 +13,8 @@ CREATE TABLE guid2
CREATE TABLE guid3
(
id SERIAL,
guid_field UUID
guid_field UUID,
guid_encoded text GENERATED ALWAYS AS (encode(guid_field::bytea, 'base32hex')) STORED
);
-- inserting invalid data tests
@ -116,9 +117,14 @@ INSERT INTO guid1 (guid_field) VALUES (uuidv7(INTERVAL '1 day'));
SELECT count(DISTINCT guid_field) FROM guid1;
-- test sortability of v7
INSERT INTO guid3 (guid_field) VALUES ('00000000-0000-0000-0000-000000000000'::uuid);
INSERT INTO guid3 (guid_field) SELECT uuidv7() FROM generate_series(1, 10);
INSERT INTO guid3 (guid_field) VALUES ('ffffffff-ffff-ffff-ffff-ffffffffffff'::uuid);
SELECT array_agg(id ORDER BY guid_field) FROM guid3;
-- make sure base32hex encoding works with UUIDs and preserves ordering
SELECT array_agg(id ORDER BY guid_encoded) FROM guid3;
-- Check the timestamp offsets for v7.
--
-- generate UUIDv7 values with timestamps ranging from 1970 (the Unix epoch year)