diff --git a/doc/src/sgml/func/func-binarystring.sgml b/doc/src/sgml/func/func-binarystring.sgml index b256381e01f..0aaf9bc68f1 100644 --- a/doc/src/sgml/func/func-binarystring.sgml +++ b/doc/src/sgml/func/func-binarystring.sgml @@ -727,6 +727,7 @@ Encodes binary data into a textual representation; supported format values are: + base32hex, base64, base64url, escape, @@ -766,6 +767,32 @@ functions support the following textual formats: + + base32hex + + base32hex format + + + + The base32hex format is that of + + RFC 4648 Section 7. It uses the extended hex alphabet + (0-9 and + A-V) which preserves the lexicographical + sort order of the encoded data. The encode function + produces output padded with '=', while decode + accepts both padded and unpadded input. Decoding is case-insensitive and ignores + whitespace characters. + + + This format is useful for encoding UUIDs in a compact, sortable format: + rtrim(encode(uuid_value::bytea, 'base32hex'), '=') + produces a 26-character string compared to the standard 36-character + UUID representation. + + + + base64 diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c index f5f835e944a..9ea3ddb49ec 100644 --- a/src/backend/utils/adt/encode.c +++ b/src/backend/utils/adt/encode.c @@ -65,8 +65,8 @@ binary_encode(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized encoding: \"%s\"", namebuf), - errhint("Valid encodings are \"%s\", \"%s\", \"%s\", and \"%s\".", - "base64", "base64url", "escape", "hex"))); + errhint("Valid encodings are \"%s\", \"%s\", \"%s\", \"%s\", and \"%s\".", + "base32hex", "base64", "base64url", "escape", "hex"))); dataptr = VARDATA_ANY(data); datalen = VARSIZE_ANY_EXHDR(data); @@ -115,8 +115,8 @@ binary_decode(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized encoding: \"%s\"", namebuf), - errhint("Valid encodings are \"%s\", \"%s\", \"%s\", and \"%s\".", - "base64", "base64url", "escape", "hex"))); + errhint("Valid encodings are \"%s\", \"%s\", \"%s\", \"%s\", and \"%s\".", + "base32hex", "base64", "base64url", "escape", "hex"))); dataptr = VARDATA_ANY(data); datalen = VARSIZE_ANY_EXHDR(data); @@ -825,6 +825,153 @@ esc_dec_len(const char *src, size_t srclen) return len; } +/* + * BASE32HEX + */ + +static const char base32hex_table[] = "0123456789ABCDEFGHIJKLMNOPQRSTUV"; + +static const int8 b32hexlookup[128] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, -1, +}; + +static uint64 +base32hex_enc_len(const char *src, size_t srclen) +{ + /* 5 bytes encode to 8 characters, round up to multiple of 8 for padding */ + return ((uint64) srclen + 4) / 5 * 8; +} + +static uint64 +base32hex_dec_len(const char *src, size_t srclen) +{ + /* Each 8 characters of input produces at most 5 bytes of output */ + return ((uint64) srclen * 5) / 8; +} + +static uint64 +base32hex_encode(const char *src, size_t srclen, char *dst) +{ + const unsigned char *data = (const unsigned char *) src; + uint32 bits_buffer = 0; + int bits_in_buffer = 0; + uint64 output_pos = 0; + size_t i; + + for (i = 0; i < srclen; i++) + { + /* Add 8 bits to the buffer */ + bits_buffer = (bits_buffer << 8) | data[i]; + bits_in_buffer += 8; + + /* Extract 5-bit chunks while we have enough bits */ + while (bits_in_buffer >= 5) + { + bits_in_buffer -= 5; + /* Extract top 5 bits */ + dst[output_pos++] = base32hex_table[(bits_buffer >> bits_in_buffer) & 0x1F]; + /* Clear the extracted bits by masking */ + bits_buffer &= ((1U << bits_in_buffer) - 1); + } + } + + /* Handle remaining bits (if any) */ + if (bits_in_buffer > 0) + dst[output_pos++] = base32hex_table[(bits_buffer << (5 - bits_in_buffer)) & 0x1F]; + + /* Add padding to make length a multiple of 8 (per RFC 4648) */ + while (output_pos % 8 != 0) + dst[output_pos++] = '='; + + return output_pos; +} + +static uint64 +base32hex_decode(const char *src, size_t srclen, char *dst) +{ + const char *srcend = src + srclen, + *s = src; + uint32 bits_buffer = 0; + int bits_in_buffer = 0; + uint64 output_pos = 0; + int pos = 0; /* position within 8-character group (0-7) */ + bool end = false; /* have we seen padding? */ + + while (s < srcend) + { + char c = *s++; + int val; + + /* Skip whitespace */ + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') + continue; + + if (c == '=') + { + /* + * The first padding is only valid at positions 2, 4, 5, or 7 + * within an 8-character group (corresponding to 1, 2, 3, or 4 + * input bytes). We only check the position for the first '=' + * character. + */ + if (!end) + { + if (pos != 2 && pos != 4 && pos != 5 && pos != 7) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unexpected \"=\" while decoding base32hex sequence"))); + end = true; + } + pos++; + continue; + } + + /* No data characters allowed after padding */ + if (end) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid symbol \"%.*s\" found while decoding base32hex sequence", + pg_mblen_range(s - 1, srcend), s - 1))); + + /* Decode base32hex character (0-9, A-V, case-insensitive) */ + val = -1; + if ((unsigned char) c < 128) + val = b32hexlookup[(unsigned char) c]; + if (val < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid symbol \"%.*s\" found while decoding base32hex sequence", + pg_mblen_range(s - 1, srcend), s - 1))); + + /* Add 5 bits to buffer */ + bits_buffer = (bits_buffer << 5) | val; + bits_in_buffer += 5; + pos++; + + /* Extract 8-bit bytes when we have enough bits */ + while (bits_in_buffer >= 8) + { + bits_in_buffer -= 8; + dst[output_pos++] = (unsigned char) (bits_buffer >> bits_in_buffer); + /* Clear the extracted bits */ + bits_buffer &= ((1U << bits_in_buffer) - 1); + } + + /* Reset position after each complete 8-character group */ + if (pos == 8) + pos = 0; + } + + return output_pos; +} + /* * Common */ @@ -854,6 +1001,12 @@ static const struct pg_base64url_enc_len, pg_base64url_dec_len, pg_base64url_encode, pg_base64url_decode } }, + { + "base32hex", + { + base32hex_enc_len, base32hex_dec_len, base32hex_encode, base32hex_decode + } + }, { "escape", { diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index f38688b5c37..a49b75fa1f9 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -2600,14 +2600,170 @@ SELECT decode(encode('\x1234567890abcdef00', 'escape'), 'escape'); -- report an error with a hint listing valid encodings when an invalid encoding is specified SELECT encode('\x01'::bytea, 'invalid'); -- error ERROR: unrecognized encoding: "invalid" -HINT: Valid encodings are "base64", "base64url", "escape", and "hex". +HINT: Valid encodings are "base32hex", "base64", "base64url", "escape", and "hex". SELECT decode('00', 'invalid'); -- error ERROR: unrecognized encoding: "invalid" -HINT: Valid encodings are "base64", "base64url", "escape", and "hex". +HINT: Valid encodings are "base32hex", "base64", "base64url", "escape", and "hex". +-- +-- base32hex encoding/decoding +-- +SET bytea_output TO hex; +SELECT encode('', 'base32hex'); -- '' + encode +-------- + +(1 row) + +SELECT encode('\x11', 'base32hex'); -- '24======' + encode +---------- + 24====== +(1 row) + +SELECT encode('\x1122', 'base32hex'); -- '24H0====' + encode +---------- + 24H0==== +(1 row) + +SELECT encode('\x112233', 'base32hex'); -- '24H36===' + encode +---------- + 24H36=== +(1 row) + +SELECT encode('\x11223344', 'base32hex'); -- '24H36H0=' + encode +---------- + 24H36H0= +(1 row) + +SELECT encode('\x1122334455', 'base32hex'); -- '24H36H2L' + encode +---------- + 24H36H2L +(1 row) + +SELECT encode('\x112233445566', 'base32hex'); -- '24H36H2LCO======' + encode +------------------ + 24H36H2LCO====== +(1 row) + +SELECT decode('', 'base32hex'); -- '' + decode +-------- + \x +(1 row) + +SELECT decode('24======', 'base32hex'); -- \x11 + decode +-------- + \x11 +(1 row) + +SELECT decode('24H0====', 'base32hex'); -- \x1122 + decode +-------- + \x1122 +(1 row) + +SELECT decode('24H36===', 'base32hex'); -- \x112233 + decode +---------- + \x112233 +(1 row) + +SELECT decode('24H36H0=', 'base32hex'); -- \x11223344 + decode +------------ + \x11223344 +(1 row) + +SELECT decode('24H36H2L', 'base32hex'); -- \x1122334455 + decode +-------------- + \x1122334455 +(1 row) + +SELECT decode('24H36H2LCO======', 'base32hex'); -- \x112233445566 + decode +---------------- + \x112233445566 +(1 row) + +SELECT decode('24h36h2lco', 'base32hex'); -- OK, the encoding is case-insensitive + decode +---------------- + \x112233445566 +(1 row) + +-- Tests for decoding unpadded base32hex strings. Padding '=' are optional. +SELECT decode('24', 'base32hex'); + decode +-------- + \x11 +(1 row) + +SELECT decode('24H', 'base32hex'); + decode +-------- + \x11 +(1 row) + +SELECT decode('24H36', 'base32hex'); + decode +---------- + \x112233 +(1 row) + +SELECT decode('24H36H0', 'base32hex'); + decode +------------ + \x11223344 +(1 row) + +SELECT decode('2', 'base32hex'); -- \x, 5 bits isn't enough for a byte, so nothing is emitted + decode +-------- + \x +(1 row) + +SELECT decode('11=', 'base32hex'); -- OK, non-zero padding bits are accepted (consistent with base64) + decode +-------- + \x08 +(1 row) + +SELECT decode('2=', 'base32hex'); -- error +ERROR: unexpected "=" while decoding base32hex sequence +SELECT decode('=', 'base32hex'); -- error +ERROR: unexpected "=" while decoding base32hex sequence +SELECT decode('W', 'base32hex'); -- error +ERROR: invalid symbol "W" found while decoding base32hex sequence +SELECT decode('24H36H0=24', 'base32hex'); -- error +ERROR: invalid symbol "2" found while decoding base32hex sequence +-- Check round-trip capability of base32hex encoding for multiple random UUIDs. +DO $$ +DECLARE + v1 uuid; + v2 uuid; +BEGIN + FOR i IN 1..10 LOOP + v1 := gen_random_uuid(); + v2 := decode(encode(v1::bytea, 'base32hex'), 'base32hex')::uuid; + + IF v1 != v2 THEN + RAISE EXCEPTION 'base32hex encoding round-trip failed, expected % got %', v1, v2; + END IF; + END LOOP; + RAISE NOTICE 'OK'; +END; +$$; +NOTICE: OK -- -- base64url encoding/decoding -- -SET bytea_output TO hex; -- Simple encoding/decoding SELECT encode('\x69b73eff', 'base64url'); -- abc-_w encode diff --git a/src/test/regress/expected/uuid.out b/src/test/regress/expected/uuid.out index d157ef7d0b3..142c529e693 100644 --- a/src/test/regress/expected/uuid.out +++ b/src/test/regress/expected/uuid.out @@ -13,7 +13,8 @@ CREATE TABLE guid2 CREATE TABLE guid3 ( id SERIAL, - guid_field UUID + guid_field UUID, + guid_encoded text GENERATED ALWAYS AS (encode(guid_field::bytea, 'base32hex')) STORED ); -- inserting invalid data tests -- too long @@ -226,11 +227,20 @@ SELECT count(DISTINCT guid_field) FROM guid1; (1 row) -- test sortability of v7 +INSERT INTO guid3 (guid_field) VALUES ('00000000-0000-0000-0000-000000000000'::uuid); INSERT INTO guid3 (guid_field) SELECT uuidv7() FROM generate_series(1, 10); +INSERT INTO guid3 (guid_field) VALUES ('ffffffff-ffff-ffff-ffff-ffffffffffff'::uuid); SELECT array_agg(id ORDER BY guid_field) FROM guid3; - array_agg ------------------------- - {1,2,3,4,5,6,7,8,9,10} + array_agg +------------------------------ + {1,2,3,4,5,6,7,8,9,10,11,12} +(1 row) + +-- make sure base32hex encoding works with UUIDs and preserves ordering +SELECT array_agg(id ORDER BY guid_encoded) FROM guid3; + array_agg +------------------------------ + {1,2,3,4,5,6,7,8,9,10,11,12} (1 row) -- Check the timestamp offsets for v7. diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index d8a09737668..5ae0e7da31a 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -835,10 +835,65 @@ SELECT encode('\x01'::bytea, 'invalid'); -- error SELECT decode('00', 'invalid'); -- error -- --- base64url encoding/decoding +-- base32hex encoding/decoding -- SET bytea_output TO hex; +SELECT encode('', 'base32hex'); -- '' +SELECT encode('\x11', 'base32hex'); -- '24======' +SELECT encode('\x1122', 'base32hex'); -- '24H0====' +SELECT encode('\x112233', 'base32hex'); -- '24H36===' +SELECT encode('\x11223344', 'base32hex'); -- '24H36H0=' +SELECT encode('\x1122334455', 'base32hex'); -- '24H36H2L' +SELECT encode('\x112233445566', 'base32hex'); -- '24H36H2LCO======' + +SELECT decode('', 'base32hex'); -- '' +SELECT decode('24======', 'base32hex'); -- \x11 +SELECT decode('24H0====', 'base32hex'); -- \x1122 +SELECT decode('24H36===', 'base32hex'); -- \x112233 +SELECT decode('24H36H0=', 'base32hex'); -- \x11223344 +SELECT decode('24H36H2L', 'base32hex'); -- \x1122334455 +SELECT decode('24H36H2LCO======', 'base32hex'); -- \x112233445566 + +SELECT decode('24h36h2lco', 'base32hex'); -- OK, the encoding is case-insensitive + +-- Tests for decoding unpadded base32hex strings. Padding '=' are optional. +SELECT decode('24', 'base32hex'); +SELECT decode('24H', 'base32hex'); +SELECT decode('24H36', 'base32hex'); +SELECT decode('24H36H0', 'base32hex'); + +SELECT decode('2', 'base32hex'); -- \x, 5 bits isn't enough for a byte, so nothing is emitted +SELECT decode('11=', 'base32hex'); -- OK, non-zero padding bits are accepted (consistent with base64) + +SELECT decode('2=', 'base32hex'); -- error +SELECT decode('=', 'base32hex'); -- error +SELECT decode('W', 'base32hex'); -- error +SELECT decode('24H36H0=24', 'base32hex'); -- error + +-- Check round-trip capability of base32hex encoding for multiple random UUIDs. +DO $$ +DECLARE + v1 uuid; + v2 uuid; +BEGIN + FOR i IN 1..10 LOOP + v1 := gen_random_uuid(); + v2 := decode(encode(v1::bytea, 'base32hex'), 'base32hex')::uuid; + + IF v1 != v2 THEN + RAISE EXCEPTION 'base32hex encoding round-trip failed, expected % got %', v1, v2; + END IF; + END LOOP; + RAISE NOTICE 'OK'; +END; +$$; + + +-- +-- base64url encoding/decoding +-- + -- Simple encoding/decoding SELECT encode('\x69b73eff', 'base64url'); -- abc-_w SELECT decode('abc-_w', 'base64url'); -- \x69b73eff diff --git a/src/test/regress/sql/uuid.sql b/src/test/regress/sql/uuid.sql index f512f4dea1d..f2ff00f5ddd 100644 --- a/src/test/regress/sql/uuid.sql +++ b/src/test/regress/sql/uuid.sql @@ -13,7 +13,8 @@ CREATE TABLE guid2 CREATE TABLE guid3 ( id SERIAL, - guid_field UUID + guid_field UUID, + guid_encoded text GENERATED ALWAYS AS (encode(guid_field::bytea, 'base32hex')) STORED ); -- inserting invalid data tests @@ -116,9 +117,14 @@ INSERT INTO guid1 (guid_field) VALUES (uuidv7(INTERVAL '1 day')); SELECT count(DISTINCT guid_field) FROM guid1; -- test sortability of v7 +INSERT INTO guid3 (guid_field) VALUES ('00000000-0000-0000-0000-000000000000'::uuid); INSERT INTO guid3 (guid_field) SELECT uuidv7() FROM generate_series(1, 10); +INSERT INTO guid3 (guid_field) VALUES ('ffffffff-ffff-ffff-ffff-ffffffffffff'::uuid); SELECT array_agg(id ORDER BY guid_field) FROM guid3; +-- make sure base32hex encoding works with UUIDs and preserves ordering +SELECT array_agg(id ORDER BY guid_encoded) FROM guid3; + -- Check the timestamp offsets for v7. -- -- generate UUIDv7 values with timestamps ranging from 1970 (the Unix epoch year)