Code coverage for most pg_mblen* calls.

A security patch changed them today, so close the coverage gap now.
Test that buffer overrun is avoided when pg_mblen*() requires more
than the number of bytes remaining.

This does not cover the calls in dict_thesaurus.c or in dict_synonym.c.
That code is straightforward.  To change that code's input, one must
have access to modify installed OS files, so low-privilege users are not
a threat.  Testing this would likewise require changing installed
share/postgresql/tsearch_data, which was enough of an obstacle to not
bother.

Security: CVE-2026-2006
Backpatch-through: 14
Co-authored-by: Thomas Munro <thomas.munro@gmail.com>
Co-authored-by: Noah Misch <noah@leadboat.com>
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
This commit is contained in:
Thomas Munro 2026-01-12 10:20:06 +13:00
parent d837fb0292
commit 4c08960d97
15 changed files with 885 additions and 2 deletions

View file

@ -14,7 +14,7 @@ DATA = pg_trgm--1.5--1.6.sql pg_trgm--1.4--1.5.sql pg_trgm--1.3--1.4.sql \
pg_trgm--1.0--1.1.sql
PGFILEDESC = "pg_trgm - trigram matching"
REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm
REGRESS = pg_trgm pg_utf8_trgm pg_word_trgm pg_strict_word_trgm
ifdef USE_PGXS
PG_CONFIG = pg_config

View file

@ -0,0 +1,50 @@
Mathematics
数学
गणित
Matemáticas
رياضيات
Mathématiques
গণিত
Matemática
Математика
ریاضی
Matematika
Mathematik
数学
Mathematics
गणित
గణితం
Matematik
கணிதம்
數學
Toán học
Matematika
数学
수학
ریاضی
Lissafi
Hisabati
Matematika
Matematica
ریاضی
ಗಣಿತ
ગણિત
คณิตศาสตร์
ሂሳብ
गणित
ਗਣਿਤ
數學
数学
Iṣiro
數學
သင်္ချာ
Herrega
رياضي
गणित
Математика
Matematyka
ഗണിതം
Matematika
رياضي
Matematika
Matematică

View file

@ -0,0 +1,8 @@
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
\if :skip_test
\quit
\endif
-- Index 50 translations of the word "Mathematics"
CREATE TEMP TABLE mb (s text);
\copy mb from 'data/trgm_utf8.data'
CREATE INDEX ON mb USING gist(s gist_trgm_ops);

View file

@ -0,0 +1,3 @@
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
\if :skip_test
\quit

View file

@ -39,6 +39,7 @@ tests += {
'regress': {
'sql': [
'pg_trgm',
'pg_utf8_trgm',
'pg_word_trgm',
'pg_strict_word_trgm',
],

View file

@ -0,0 +1,9 @@
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
\if :skip_test
\quit
\endif
-- Index 50 translations of the word "Mathematics"
CREATE TEMP TABLE mb (s text);
\copy mb from 'data/trgm_utf8.data'
CREATE INDEX ON mb USING gist(s gist_trgm_ops);

View file

@ -3752,6 +3752,12 @@ deconstruct_array_builtin(ArrayType *array,
elmalign = TYPALIGN_SHORT;
break;
case INT4OID:
elmlen = sizeof(int32);
elmbyval = true;
elmalign = TYPALIGN_INT;
break;
case OIDOID:
elmlen = sizeof(Oid);
elmbyval = true;

View file

@ -0,0 +1,401 @@
/* skip test if not UTF8 server encoding */
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
\if :skip_test
\quit
\endif
\getenv libdir PG_LIBDIR
\getenv dlsuffix PG_DLSUFFIX
\set regresslib :libdir '/regress' :dlsuffix
CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text
AS :'regresslib' LANGUAGE C STRICT;
CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea
AS :'regresslib' LANGUAGE C STRICT;
CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int
AS :'regresslib' LANGUAGE C STRICT;
CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[]
AS :'regresslib' LANGUAGE C STRICT;
CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text
AS :'regresslib' LANGUAGE C STRICT;
CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean
AS :'regresslib' LANGUAGE C STRICT;
CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text);
INSERT INTO regress_encoding
VALUES ('café',
'caf' || test_bytea_to_text('\xc3'),
'café' || test_bytea_to_text('\x00') || 'dcba',
'caf' || test_bytea_to_text('\xc300') || 'dcba');
SELECT good, truncated, with_nul FROM regress_encoding;
good | truncated | with_nul
------+-----------+----------
café | caf | café
(1 row)
SELECT length(good) FROM regress_encoding;
length
--------
4
(1 row)
SELECT substring(good, 3, 1) FROM regress_encoding;
substring
-----------
f
(1 row)
SELECT substring(good, 4, 1) FROM regress_encoding;
substring
-----------
é
(1 row)
SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding;
regexp_replace
----------------
é
(1 row)
SELECT reverse(good) FROM regress_encoding;
reverse
---------
éfac
(1 row)
-- invalid short mb character = error
SELECT length(truncated) FROM regress_encoding;
ERROR: invalid byte sequence for encoding "UTF8": 0xc3
SELECT substring(truncated, 1, 1) FROM regress_encoding;
ERROR: invalid byte sequence for encoding "UTF8": 0xc3
SELECT reverse(truncated) FROM regress_encoding;
ERROR: invalid byte sequence for encoding "UTF8": 0xc3
-- invalid short mb character = silently dropped
SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding;
regexp_replace
----------------
caf
(1 row)
-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string
-- contains NUL at a character boundary position, some functions treat it as a
-- character while others treat it as a terminator, as implementation details.
-- NUL = terminator
SELECT length(with_nul) FROM regress_encoding;
length
--------
4
(1 row)
SELECT substring(with_nul, 3, 1) FROM regress_encoding;
substring
-----------
f
(1 row)
SELECT substring(with_nul, 4, 1) FROM regress_encoding;
substring
-----------
é
(1 row)
SELECT substring(with_nul, 5, 1) FROM regress_encoding;
substring
-----------
(1 row)
SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding;
convert_to
------------
\x
(1 row)
SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding;
regexp_replace
----------------
é
(1 row)
-- NUL = character
SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding;
with_nul | reverse | reverse
----------+---------+---------
café | abcd | café
(1 row)
-- If a corrupted string contains NUL in the tail bytes of a multibyte
-- character (invalid in all encodings), it is considered part of the
-- character for length purposes. An error will only be raised in code paths
-- that convert or verify encodings.
SELECT length(truncated_with_nul) FROM regress_encoding;
length
--------
8
(1 row)
SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding;
substring
-----------
f
(1 row)
SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding;
substring
-----------
(1 row)
SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding;
ERROR: invalid byte sequence for encoding "UTF8": 0xc3 0x00
SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding;
substring
-----------
d
(1 row)
SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding;
?column?
----------
t
(1 row)
SELECT reverse(truncated_with_nul) FROM regress_encoding;
reverse
---------
abcd
(1 row)
-- unbounded: sequence would overrun the string!
SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3)
FROM regress_encoding;
test_mblen_func
-----------------
2
(1 row)
-- condition detected when using the length/range variants
SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3)
FROM regress_encoding;
ERROR: invalid byte sequence for encoding "UTF8": 0xc3
SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3)
FROM regress_encoding;
ERROR: invalid byte sequence for encoding "UTF8": 0xc3
-- unbounded: sequence would overrun the string, if the terminator were really
-- the end of it
SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3)
FROM regress_encoding;
test_mblen_func
-----------------
2
(1 row)
SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3)
FROM regress_encoding;
test_mblen_func
-----------------
2
(1 row)
-- condition detected when using the cstr variants
SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3)
FROM regress_encoding;
ERROR: invalid byte sequence for encoding "UTF8": 0xc3
DROP TABLE regress_encoding;
-- mb<->wchar conversions
CREATE FUNCTION test_encoding(encoding text, description text, input bytea)
RETURNS VOID LANGUAGE plpgsql AS
$$
DECLARE
prefix text;
len int;
wchars int[];
round_trip bytea;
result text;
BEGIN
prefix := rpad(encoding || ' ' || description || ':', 28);
-- XXX could also test validation, length functions and include client
-- only encodings with these test cases
IF test_valid_server_encoding(encoding) THEN
wchars := test_text_to_wchars(encoding, test_bytea_to_text(input));
round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars));
if input = round_trip then
result := 'OK';
elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then
result := 'truncated';
else
result := 'failed';
end if;
RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result;
END IF;
END;
$$;
-- No validation is done on the encoding itself, just the length to avoid
-- overruns, so some of the byte sequences below are bogus. They cover
-- all code branches, server encodings only for now.
CREATE TABLE encoding_tests (encoding text, description text, input bytea);
INSERT INTO encoding_tests VALUES
-- LATIN1, other single-byte encodings
('LATIN1', 'ASCII', 'a'),
('LATIN1', 'extended', '\xe9'),
-- EUC_JP, EUC_JIS_2004, EUR_KR (for the purposes of wchar conversion):
-- 2 8e (CS2, not used by EUR_KR but arbitrarily considered to have EUC_JP length)
-- 3 8f (CS3, not used by EUR_KR but arbitrarily considered to have EUC_JP length)
-- 2 80..ff (CS1)
('EUC_JP', 'ASCII', 'a'),
('EUC_JP', 'CS1, short', '\x80'),
('EUC_JP', 'CS1', '\x8002'),
('EUC_JP', 'CS2, short', '\x8e'),
('EUC_JP', 'CS2', '\x8e02'),
('EUC_JP', 'CS3, short', '\x8f'),
('EUC_JP', 'CS3, short', '\x8f02'),
('EUC_JP', 'CS3', '\x8f0203'),
-- EUC_CN
-- 3 8e (CS2, not used but arbitrarily considered to have length 3)
-- 3 8f (CS3, not used but arbitrarily considered to have length 3)
-- 2 80..ff (CS1)
('EUC_CN', 'ASCII', 'a'),
('EUC_CN', 'CS1, short', '\x80'),
('EUC_CN', 'CS1', '\x8002'),
('EUC_CN', 'CS2, short', '\x8e'),
('EUC_CN', 'CS2, short', '\x8e02'),
('EUC_CN', 'CS2', '\x8e0203'),
('EUC_CN', 'CS3, short', '\x8f'),
('EUC_CN', 'CS3, short', '\x8f02'),
('EUC_CN', 'CS3', '\x8f0203'),
-- EUC_TW:
-- 4 8e (CS2)
-- 3 8f (CS3, not used but arbitrarily considered to have length 3)
-- 2 80..ff (CS1)
('EUC_TW', 'ASCII', 'a'),
('EUC_TW', 'CS1, short', '\x80'),
('EUC_TW', 'CS1', '\x8002'),
('EUC_TW', 'CS2, short', '\x8e'),
('EUC_TW', 'CS2, short', '\x8e02'),
('EUC_TW', 'CS2, short', '\x8e0203'),
('EUC_TW', 'CS2', '\x8e020304'),
('EUC_TW', 'CS3, short', '\x8f'),
('EUC_TW', 'CS3, short', '\x8f02'),
('EUC_TW', 'CS3', '\x8f0203'),
-- UTF8
-- 2 c0..df
-- 3 e0..ef
-- 4 f0..f7 (but maximum real codepoint U+10ffff has f4)
-- 5 f8..fb (not supported)
-- 6 fc..fd (not supported)
('UTF8', 'ASCII', 'a'),
('UTF8', '2 byte, short', '\xdf'),
('UTF8', '2 byte', '\xdf82'),
('UTF8', '3 byte, short', '\xef'),
('UTF8', '3 byte, short', '\xef82'),
('UTF8', '3 byte', '\xef8283'),
('UTF8', '4 byte, short', '\xf7'),
('UTF8', '4 byte, short', '\xf782'),
('UTF8', '4 byte, short', '\xf78283'),
('UTF8', '4 byte', '\xf7828384'),
('UTF8', '5 byte, unsupported', '\xfb'),
('UTF8', '5 byte, unsupported', '\xfb82'),
('UTF8', '5 byte, unsupported', '\xfb8283'),
('UTF8', '5 byte, unsupported', '\xfb828384'),
('UTF8', '5 byte, unsupported', '\xfb82838485'),
('UTF8', '6 byte, unsupported', '\xfd'),
('UTF8', '6 byte, unsupported', '\xfd82'),
('UTF8', '6 byte, unsupported', '\xfd8283'),
('UTF8', '6 byte, unsupported', '\xfd828384'),
('UTF8', '6 byte, unsupported', '\xfd82838485'),
('UTF8', '6 byte, unsupported', '\xfd8283848586'),
-- MULE_INTERNAL
-- 2 81..8d LC1
-- 3 90..99 LC2
('MULE_INTERNAL', 'ASCII', 'a'),
('MULE_INTERNAL', 'LC1, short', '\x81'),
('MULE_INTERNAL', 'LC1', '\x8182'),
('MULE_INTERNAL', 'LC2, short', '\x90'),
('MULE_INTERNAL', 'LC2, short', '\x9082'),
('MULE_INTERNAL', 'LC2', '\x908283');
SELECT COUNT(test_encoding(encoding, description, input)) > 0
FROM encoding_tests;
NOTICE: LATIN1 ASCII: \x61 -> {97} -> \x61 = OK
NOTICE: LATIN1 extended: \xe9 -> {233} -> \xe9 = OK
NOTICE: EUC_JP ASCII: \x61 -> {97} -> \x61 = OK
NOTICE: EUC_JP CS1, short: \x80 -> {} -> \x = truncated
NOTICE: EUC_JP CS1: \x8002 -> {32770} -> \x8002 = OK
NOTICE: EUC_JP CS2, short: \x8e -> {} -> \x = truncated
NOTICE: EUC_JP CS2: \x8e02 -> {36354} -> \x8e02 = OK
NOTICE: EUC_JP CS3, short: \x8f -> {} -> \x = truncated
NOTICE: EUC_JP CS3, short: \x8f02 -> {} -> \x = truncated
NOTICE: EUC_JP CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK
NOTICE: EUC_CN ASCII: \x61 -> {97} -> \x61 = OK
NOTICE: EUC_CN CS1, short: \x80 -> {} -> \x = truncated
NOTICE: EUC_CN CS1: \x8002 -> {32770} -> \x8002 = OK
NOTICE: EUC_CN CS2, short: \x8e -> {} -> \x = truncated
NOTICE: EUC_CN CS2, short: \x8e02 -> {} -> \x = truncated
NOTICE: EUC_CN CS2: \x8e0203 -> {9306627} -> \x8e0203 = OK
NOTICE: EUC_CN CS3, short: \x8f -> {} -> \x = truncated
NOTICE: EUC_CN CS3, short: \x8f02 -> {} -> \x = truncated
NOTICE: EUC_CN CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK
NOTICE: EUC_TW ASCII: \x61 -> {97} -> \x61 = OK
NOTICE: EUC_TW CS1, short: \x80 -> {} -> \x = truncated
NOTICE: EUC_TW CS1: \x8002 -> {32770} -> \x8002 = OK
NOTICE: EUC_TW CS2, short: \x8e -> {} -> \x = truncated
NOTICE: EUC_TW CS2, short: \x8e02 -> {} -> \x = truncated
NOTICE: EUC_TW CS2, short: \x8e0203 -> {} -> \x = truncated
NOTICE: EUC_TW CS2: \x8e020304 -> {-1912470780} -> \x8e020304 = OK
NOTICE: EUC_TW CS3, short: \x8f -> {} -> \x = truncated
NOTICE: EUC_TW CS3, short: \x8f02 -> {} -> \x = truncated
NOTICE: EUC_TW CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK
NOTICE: UTF8 ASCII: \x61 -> {97} -> \x61 = OK
NOTICE: UTF8 2 byte, short: \xdf -> {} -> \x = truncated
NOTICE: UTF8 2 byte: \xdf82 -> {1986} -> \xdf82 = OK
NOTICE: UTF8 3 byte, short: \xef -> {} -> \x = truncated
NOTICE: UTF8 3 byte, short: \xef82 -> {} -> \x = truncated
NOTICE: UTF8 3 byte: \xef8283 -> {61571} -> \xef8283 = OK
NOTICE: UTF8 4 byte, short: \xf7 -> {} -> \x = truncated
NOTICE: UTF8 4 byte, short: \xf782 -> {} -> \x = truncated
NOTICE: UTF8 4 byte, short: \xf78283 -> {} -> \x = truncated
NOTICE: UTF8 4 byte: \xf7828384 -> {1843396} -> \xf7828384 = OK
NOTICE: UTF8 5 byte, unsupported: \xfb -> {251} -> \xc3bb = failed
NOTICE: UTF8 5 byte, unsupported: \xfb82 -> {251,130} -> \xc3bbc282 = failed
NOTICE: UTF8 5 byte, unsupported: \xfb8283 -> {251,130,131} -> \xc3bbc282c283 = failed
NOTICE: UTF8 5 byte, unsupported: \xfb828384 -> {251,130,131,132} -> \xc3bbc282c283c284 = failed
NOTICE: UTF8 5 byte, unsupported: \xfb82838485 -> {251,130,131,132,133} -> \xc3bbc282c283c284c285 = failed
NOTICE: UTF8 6 byte, unsupported: \xfd -> {253} -> \xc3bd = failed
NOTICE: UTF8 6 byte, unsupported: \xfd82 -> {253,130} -> \xc3bdc282 = failed
NOTICE: UTF8 6 byte, unsupported: \xfd8283 -> {253,130,131} -> \xc3bdc282c283 = failed
NOTICE: UTF8 6 byte, unsupported: \xfd828384 -> {253,130,131,132} -> \xc3bdc282c283c284 = failed
NOTICE: UTF8 6 byte, unsupported: \xfd82838485 -> {253,130,131,132,133} -> \xc3bdc282c283c284c285 = failed
NOTICE: UTF8 6 byte, unsupported: \xfd8283848586 -> {253,130,131,132,133,134} -> \xc3bdc282c283c284c285c286 = failed
NOTICE: MULE_INTERNAL ASCII: \x61 -> {97} -> \x61 = OK
NOTICE: MULE_INTERNAL LC1, short: \x81 -> {} -> \x = truncated
NOTICE: MULE_INTERNAL LC1: \x8182 -> {8454274} -> \x8182 = OK
NOTICE: MULE_INTERNAL LC2, short: \x90 -> {} -> \x = truncated
NOTICE: MULE_INTERNAL LC2, short: \x9082 -> {} -> \x = truncated
NOTICE: MULE_INTERNAL LC2: \x908283 -> {9470595} -> \x908283 = OK
?column?
----------
t
(1 row)
DROP TABLE encoding_tests;
DROP FUNCTION test_encoding;
DROP FUNCTION test_text_to_wchars;
DROP FUNCTION test_mblen_func;
DROP FUNCTION test_bytea_to_text;
DROP FUNCTION test_text_to_bytea;
-- substring slow path: multi-byte escape char vs. multi-byte pattern char.
SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7');
substring
-----------
(1 row)
-- Levenshtein distance metric: exercise character length cache.
SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name);
ERROR: column "real§_name" does not exist
LINE 1: SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name);
^
HINT: Perhaps you meant to reference the column "x.real_name".
-- JSON errcontext: truncate long data.
SELECT repeat(U&'\00A7', 30)::json;
ERROR: invalid input syntax for type json
DETAIL: Token "§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§" is invalid.
CONTEXT: JSON data, line 1: ...§§§§§§§§§§§§§§§§§§§§§§§§

View file

@ -0,0 +1,4 @@
/* skip test if not UTF8 server encoding */
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
\if :skip_test
\quit

View file

@ -0,0 +1,16 @@
-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent
-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all
-- of EUC_KR, also run the test in UTF8.
SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset
\if :skip_test
\quit
\endif
-- Exercise is_multibyte_char_in_char (non-UTF8) slow path.
SELECT POSITION(
convert_from('\xbcf6c7d0', 'EUC_KR') IN
convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR'));
position
----------
5
(1 row)

View file

@ -0,0 +1,6 @@
-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent
-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all
-- of EUC_KR, also run the test in UTF8.
SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset
\if :skip_test
\quit

View file

@ -28,7 +28,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t
# geometry depends on point, lseg, line, box, path, polygon, circle
# horology depends on date, time, timetz, timestamp, timestamptz, interval
# ----------
test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database
test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database encoding euc_kr
# ----------
# Load huge amounts of data

View file

@ -1287,6 +1287,145 @@ test_enc_conversion(PG_FUNCTION_ARGS)
PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
}
/* Convert bytea to text without validation for corruption tests from SQL. */
PG_FUNCTION_INFO_V1(test_bytea_to_text);
Datum
test_bytea_to_text(PG_FUNCTION_ARGS)
{
PG_RETURN_TEXT_P(PG_GETARG_BYTEA_PP(0));
}
/* And the reverse. */
PG_FUNCTION_INFO_V1(test_text_to_bytea);
Datum
test_text_to_bytea(PG_FUNCTION_ARGS)
{
PG_RETURN_BYTEA_P(PG_GETARG_TEXT_PP(0));
}
/* Corruption tests in C. */
PG_FUNCTION_INFO_V1(test_mblen_func);
Datum
test_mblen_func(PG_FUNCTION_ARGS)
{
const char *func = text_to_cstring(PG_GETARG_BYTEA_PP(0));
const char *encoding = text_to_cstring(PG_GETARG_BYTEA_PP(1));
text *string = PG_GETARG_BYTEA_PP(2);
int offset = PG_GETARG_INT32(3);
const char *data = VARDATA_ANY(string);
size_t size = VARSIZE_ANY_EXHDR(string);
int result = 0;
if (strcmp(func, "pg_mblen_unbounded") == 0)
result = pg_mblen_unbounded(data + offset);
else if (strcmp(func, "pg_mblen_cstr") == 0)
result = pg_mblen_cstr(data + offset);
else if (strcmp(func, "pg_mblen_with_len") == 0)
result = pg_mblen_with_len(data + offset, size - offset);
else if (strcmp(func, "pg_mblen_range") == 0)
result = pg_mblen_range(data + offset, data + size);
else if (strcmp(func, "pg_encoding_mblen") == 0)
result = pg_encoding_mblen(pg_char_to_encoding(encoding), data + offset);
else
elog(ERROR, "unknown function");
PG_RETURN_INT32(result);
}
PG_FUNCTION_INFO_V1(test_text_to_wchars);
Datum
test_text_to_wchars(PG_FUNCTION_ARGS)
{
const char *encoding_name = text_to_cstring(PG_GETARG_BYTEA_PP(0));
text *string = PG_GETARG_TEXT_PP(1);
const char *data = VARDATA_ANY(string);
size_t size = VARSIZE_ANY_EXHDR(string);
pg_wchar *wchars = palloc(sizeof(pg_wchar) * (size + 1));
Datum *datums;
int wlen;
int encoding;
encoding = pg_char_to_encoding(encoding_name);
if (encoding < 0)
elog(ERROR, "unknown encoding name: %s", encoding_name);
if (size > 0)
{
datums = palloc(sizeof(Datum) * size);
wlen = pg_encoding_mb2wchar_with_len(encoding,
data,
wchars,
size);
Assert(wlen >= 0);
Assert(wlen <= size);
Assert(wchars[wlen] == 0);
for (int i = 0; i < wlen; ++i)
datums[i] = UInt32GetDatum(wchars[i]);
}
else
{
datums = NULL;
wlen = 0;
}
PG_RETURN_ARRAYTYPE_P(construct_array_builtin(datums, wlen, INT4OID));
}
PG_FUNCTION_INFO_V1(test_wchars_to_text);
Datum
test_wchars_to_text(PG_FUNCTION_ARGS)
{
const char *encoding_name = text_to_cstring(PG_GETARG_BYTEA_PP(0));
ArrayType *array = PG_GETARG_ARRAYTYPE_P(1);
Datum *datums;
bool *nulls;
char *mb;
text *result;
int wlen;
int bytes;
int encoding;
encoding = pg_char_to_encoding(encoding_name);
if (encoding < 0)
elog(ERROR, "unknown encoding name: %s", encoding_name);
deconstruct_array_builtin(array, INT4OID, &datums, &nulls, &wlen);
if (wlen > 0)
{
pg_wchar *wchars = palloc(sizeof(pg_wchar) * wlen);
for (int i = 0; i < wlen; ++i)
{
if (nulls[i])
elog(ERROR, "unexpected NULL in array");
wchars[i] = DatumGetInt32(datums[i]);
}
mb = palloc(pg_encoding_max_length(encoding) * wlen + 1);
bytes = pg_encoding_wchar2mb_with_len(encoding, wchars, mb, wlen);
}
else
{
mb = "";
bytes = 0;
}
result = palloc(bytes + VARHDRSZ);
SET_VARSIZE(result, bytes + VARHDRSZ);
memcpy(VARDATA(result), mb, bytes);
PG_RETURN_TEXT_P(result);
}
/*
 * test_valid_server_encoding(name text) returns boolean
 *
 * Report whether "name" is accepted by pg_valid_server_encoding().
 */
PG_FUNCTION_INFO_V1(test_valid_server_encoding);
Datum
test_valid_server_encoding(PG_FUNCTION_ARGS)
{
	const char *name = text_to_cstring(PG_GETARG_TEXT_PP(0));

	/*
	 * pg_valid_server_encoding() yields an encoding ID on success and a
	 * negative value on failure; it is not a boolean.  Returning it directly
	 * as the Datum would make a rejected name (nonzero) read as true and an
	 * accepted encoding whose ID is 0 read as false, so convert explicitly.
	 */
	PG_RETURN_BOOL(pg_valid_server_encoding(name) >= 0);
}
/* Provide SQL access to IsBinaryCoercible() */
PG_FUNCTION_INFO_V1(binary_coercible);
Datum

View file

@ -0,0 +1,228 @@
/* skip test if not UTF8 server encoding */
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
\if :skip_test
\quit
\endif
\getenv libdir PG_LIBDIR
\getenv dlsuffix PG_DLSUFFIX
\set regresslib :libdir '/regress' :dlsuffix
CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text
AS :'regresslib' LANGUAGE C STRICT;
CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea
AS :'regresslib' LANGUAGE C STRICT;
CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int
AS :'regresslib' LANGUAGE C STRICT;
CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[]
AS :'regresslib' LANGUAGE C STRICT;
CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text
AS :'regresslib' LANGUAGE C STRICT;
CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean
AS :'regresslib' LANGUAGE C STRICT;
CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text);
INSERT INTO regress_encoding
VALUES ('café',
'caf' || test_bytea_to_text('\xc3'),
'café' || test_bytea_to_text('\x00') || 'dcba',
'caf' || test_bytea_to_text('\xc300') || 'dcba');
SELECT good, truncated, with_nul FROM regress_encoding;
SELECT length(good) FROM regress_encoding;
SELECT substring(good, 3, 1) FROM regress_encoding;
SELECT substring(good, 4, 1) FROM regress_encoding;
SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding;
SELECT reverse(good) FROM regress_encoding;
-- invalid short mb character = error
SELECT length(truncated) FROM regress_encoding;
SELECT substring(truncated, 1, 1) FROM regress_encoding;
SELECT reverse(truncated) FROM regress_encoding;
-- invalid short mb character = silently dropped
SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding;
-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string
-- contains NUL at a character boundary position, some functions treat it as a
-- character while others treat it as a terminator, as implementation details.
-- NUL = terminator
SELECT length(with_nul) FROM regress_encoding;
SELECT substring(with_nul, 3, 1) FROM regress_encoding;
SELECT substring(with_nul, 4, 1) FROM regress_encoding;
SELECT substring(with_nul, 5, 1) FROM regress_encoding;
SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding;
SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding;
-- NUL = character
SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding;
-- If a corrupted string contains NUL in the tail bytes of a multibyte
-- character (invalid in all encodings), it is considered part of the
-- character for length purposes. An error will only be raised in code paths
-- that convert or verify encodings.
SELECT length(truncated_with_nul) FROM regress_encoding;
SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding;
SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding;
SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding;
SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding;
SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding;
SELECT reverse(truncated_with_nul) FROM regress_encoding;
-- unbounded: sequence would overrun the string!
SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3)
FROM regress_encoding;
-- condition detected when using the length/range variants
SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3)
FROM regress_encoding;
SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3)
FROM regress_encoding;
-- unbounded: sequence would overrun the string, if the terminator were really
-- the end of it
SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3)
FROM regress_encoding;
SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3)
FROM regress_encoding;
-- condition detected when using the cstr variants
SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3)
FROM regress_encoding;
DROP TABLE regress_encoding;
-- mb<->wchar conversions
CREATE FUNCTION test_encoding(encoding text, description text, input bytea)
RETURNS VOID LANGUAGE plpgsql AS
$$
DECLARE
prefix text;
len int;
wchars int[];
round_trip bytea;
result text;
BEGIN
prefix := rpad(encoding || ' ' || description || ':', 28);
-- XXX could also test validation, length functions and include client
-- only encodings with these test cases
IF test_valid_server_encoding(encoding) THEN
wchars := test_text_to_wchars(encoding, test_bytea_to_text(input));
round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars));
if input = round_trip then
result := 'OK';
elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then
result := 'truncated';
else
result := 'failed';
end if;
RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result;
END IF;
END;
$$;
-- No validation is done on the encoding itself, just the length to avoid
-- overruns, so some of the byte sequences below are bogus. They cover
-- all code branches, server encodings only for now.
CREATE TABLE encoding_tests (encoding text, description text, input bytea);
INSERT INTO encoding_tests VALUES
-- LATIN1, other single-byte encodings
('LATIN1', 'ASCII', 'a'),
('LATIN1', 'extended', '\xe9'),
-- EUC_JP, EUC_JIS_2004, EUR_KR (for the purposes of wchar conversion):
-- 2 8e (CS2, not used by EUR_KR but arbitrarily considered to have EUC_JP length)
-- 3 8f (CS3, not used by EUR_KR but arbitrarily considered to have EUC_JP length)
-- 2 80..ff (CS1)
('EUC_JP', 'ASCII', 'a'),
('EUC_JP', 'CS1, short', '\x80'),
('EUC_JP', 'CS1', '\x8002'),
('EUC_JP', 'CS2, short', '\x8e'),
('EUC_JP', 'CS2', '\x8e02'),
('EUC_JP', 'CS3, short', '\x8f'),
('EUC_JP', 'CS3, short', '\x8f02'),
('EUC_JP', 'CS3', '\x8f0203'),
-- EUC_CN
-- 3 8e (CS2, not used but arbitrarily considered to have length 3)
-- 3 8f (CS3, not used but arbitrarily considered to have length 3)
-- 2 80..ff (CS1)
('EUC_CN', 'ASCII', 'a'),
('EUC_CN', 'CS1, short', '\x80'),
('EUC_CN', 'CS1', '\x8002'),
('EUC_CN', 'CS2, short', '\x8e'),
('EUC_CN', 'CS2, short', '\x8e02'),
('EUC_CN', 'CS2', '\x8e0203'),
('EUC_CN', 'CS3, short', '\x8f'),
('EUC_CN', 'CS3, short', '\x8f02'),
('EUC_CN', 'CS3', '\x8f0203'),
-- EUC_TW:
-- 4 8e (CS2)
-- 3 8f (CS3, not used but arbitrarily considered to have length 3)
-- 2 80..ff (CS1)
('EUC_TW', 'ASCII', 'a'),
('EUC_TW', 'CS1, short', '\x80'),
('EUC_TW', 'CS1', '\x8002'),
('EUC_TW', 'CS2, short', '\x8e'),
('EUC_TW', 'CS2, short', '\x8e02'),
('EUC_TW', 'CS2, short', '\x8e0203'),
('EUC_TW', 'CS2', '\x8e020304'),
('EUC_TW', 'CS3, short', '\x8f'),
('EUC_TW', 'CS3, short', '\x8f02'),
('EUC_TW', 'CS3', '\x8f0203'),
-- UTF8
-- 2 c0..df
-- 3 e0..ef
-- 4 f0..f7 (but maximum real codepoint U+10ffff has f4)
-- 5 f8..fb (not supported)
-- 6 fc..fd (not supported)
('UTF8', 'ASCII', 'a'),
('UTF8', '2 byte, short', '\xdf'),
('UTF8', '2 byte', '\xdf82'),
('UTF8', '3 byte, short', '\xef'),
('UTF8', '3 byte, short', '\xef82'),
('UTF8', '3 byte', '\xef8283'),
('UTF8', '4 byte, short', '\xf7'),
('UTF8', '4 byte, short', '\xf782'),
('UTF8', '4 byte, short', '\xf78283'),
('UTF8', '4 byte', '\xf7828384'),
('UTF8', '5 byte, unsupported', '\xfb'),
('UTF8', '5 byte, unsupported', '\xfb82'),
('UTF8', '5 byte, unsupported', '\xfb8283'),
('UTF8', '5 byte, unsupported', '\xfb828384'),
('UTF8', '5 byte, unsupported', '\xfb82838485'),
('UTF8', '6 byte, unsupported', '\xfd'),
('UTF8', '6 byte, unsupported', '\xfd82'),
('UTF8', '6 byte, unsupported', '\xfd8283'),
('UTF8', '6 byte, unsupported', '\xfd828384'),
('UTF8', '6 byte, unsupported', '\xfd82838485'),
('UTF8', '6 byte, unsupported', '\xfd8283848586'),
-- MULE_INTERNAL
-- 2 81..8d LC1
-- 3 90..99 LC2
('MULE_INTERNAL', 'ASCII', 'a'),
('MULE_INTERNAL', 'LC1, short', '\x81'),
('MULE_INTERNAL', 'LC1', '\x8182'),
('MULE_INTERNAL', 'LC2, short', '\x90'),
('MULE_INTERNAL', 'LC2, short', '\x9082'),
('MULE_INTERNAL', 'LC2', '\x908283');
SELECT COUNT(test_encoding(encoding, description, input)) > 0
FROM encoding_tests;
DROP TABLE encoding_tests;
DROP FUNCTION test_encoding;
DROP FUNCTION test_text_to_wchars;
DROP FUNCTION test_mblen_func;
DROP FUNCTION test_bytea_to_text;
DROP FUNCTION test_text_to_bytea;
-- substring slow path: multi-byte escape char vs. multi-byte pattern char.
SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7');
-- Levenshtein distance metric: exercise character length cache.
SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name);
-- JSON errcontext: truncate long data.
SELECT repeat(U&'\00A7', 30)::json;

View file

@ -0,0 +1,12 @@
-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent
-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all
-- of EUC_KR, also run the test in UTF8.
SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset
\if :skip_test
\quit
\endif
-- Exercise is_multibyte_char_in_char (non-UTF8) slow path.
SELECT POSITION(
convert_from('\xbcf6c7d0', 'EUC_KR') IN
convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR'));