diff --git a/contrib/pg_trgm/Makefile b/contrib/pg_trgm/Makefile index 1fbdc9ec1ef..c1756993ec7 100644 --- a/contrib/pg_trgm/Makefile +++ b/contrib/pg_trgm/Makefile @@ -14,7 +14,7 @@ DATA = pg_trgm--1.5--1.6.sql pg_trgm--1.4--1.5.sql pg_trgm--1.3--1.4.sql \ pg_trgm--1.0--1.1.sql PGFILEDESC = "pg_trgm - trigram matching" -REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm +REGRESS = pg_trgm pg_utf8_trgm pg_word_trgm pg_strict_word_trgm ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/pg_trgm/data/trgm_utf8.data b/contrib/pg_trgm/data/trgm_utf8.data new file mode 100644 index 00000000000..713856e76a6 --- /dev/null +++ b/contrib/pg_trgm/data/trgm_utf8.data @@ -0,0 +1,50 @@ +Mathematics +数学 +गणित +Matemáticas +رياضيات +Mathématiques +গণিত +Matemática +Математика +ریاضی +Matematika +Mathematik +数学 +Mathematics +गणित +గణితం +Matematik +கணிதம் +數學 +Toán học +Matematika +数学 +수학 +ریاضی +Lissafi +Hisabati +Matematika +Matematica +ریاضی +ಗಣಿತ +ગણિત +คณิตศาสตร์ +ሂሳብ +गणित +ਗਣਿਤ +數學 +数学 +Iṣiro +數學 +သင်္ချာ +Herrega +رياضي +गणित +Математика +Matematyka +ഗണിതം +Matematika +رياضي +Matematika +Matematică diff --git a/contrib/pg_trgm/expected/pg_utf8_trgm.out b/contrib/pg_trgm/expected/pg_utf8_trgm.out new file mode 100644 index 00000000000..0768e7d6a83 --- /dev/null +++ b/contrib/pg_trgm/expected/pg_utf8_trgm.out @@ -0,0 +1,8 @@ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +-- Index 50 translations of the word "Mathematics" +CREATE TEMP TABLE mb (s text); +\copy mb from 'data/trgm_utf8.data' +CREATE INDEX ON mb USING gist(s gist_trgm_ops); diff --git a/contrib/pg_trgm/expected/pg_utf8_trgm_1.out b/contrib/pg_trgm/expected/pg_utf8_trgm_1.out new file mode 100644 index 00000000000..8505c4fa552 --- /dev/null +++ b/contrib/pg_trgm/expected/pg_utf8_trgm_1.out @@ -0,0 +1,3 @@ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/contrib/pg_trgm/meson.build b/contrib/pg_trgm/meson.build index 093ac18400c..bd3a34f2557 100644 --- a/contrib/pg_trgm/meson.build +++ b/contrib/pg_trgm/meson.build @@ -39,6 +39,7 @@ tests += { 'regress': { 'sql': [ 'pg_trgm', + 'pg_utf8_trgm', 'pg_word_trgm', 'pg_strict_word_trgm', ], diff --git a/contrib/pg_trgm/sql/pg_utf8_trgm.sql b/contrib/pg_trgm/sql/pg_utf8_trgm.sql new file mode 100644 index 00000000000..0dd962ced83 --- /dev/null +++ b/contrib/pg_trgm/sql/pg_utf8_trgm.sql @@ -0,0 +1,9 @@ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +-- Index 50 translations of the word "Mathematics" +CREATE TEMP TABLE mb (s text); +\copy mb from 'data/trgm_utf8.data' +CREATE INDEX ON mb USING gist(s gist_trgm_ops); diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c index 2d03f967354..db1a3ef80ae 100644 --- a/src/backend/utils/adt/arrayfuncs.c +++ b/src/backend/utils/adt/arrayfuncs.c @@ -3752,6 +3752,12 @@ deconstruct_array_builtin(ArrayType *array, elmalign = TYPALIGN_SHORT; break; + case INT4OID: + elmlen = sizeof(int32); + elmbyval = true; + elmalign = TYPALIGN_INT; + break; + case OIDOID: elmlen = sizeof(Oid); elmbyval = true; diff --git a/src/test/regress/expected/encoding.out b/src/test/regress/expected/encoding.out new file mode 100644 index 00000000000..ea1f38cff41 --- /dev/null +++ b/src/test/regress/expected/encoding.out @@ -0,0 +1,401 @@ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +\getenv libdir PG_LIBDIR +\getenv dlsuffix PG_DLSUFFIX +\set regresslib :libdir '/regress' :dlsuffix +CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[] + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean + AS :'regresslib' LANGUAGE C STRICT; +CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text); +INSERT INTO regress_encoding +VALUES ('café', + 'caf' || test_bytea_to_text('\xc3'), + 'café' || test_bytea_to_text('\x00') || 'dcba', + 'caf' || test_bytea_to_text('\xc300') || 'dcba'); +SELECT good, truncated, with_nul FROM regress_encoding; + good | truncated | with_nul +------+-----------+---------- + café | caf | café +(1 row) + +SELECT length(good) FROM regress_encoding; + length +-------- + 4 +(1 row) + +SELECT substring(good, 3, 1) FROM regress_encoding; + substring +----------- + f +(1 row) + +SELECT substring(good, 4, 1) FROM regress_encoding; + substring +----------- + é +(1 row) + +SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding; + regexp_replace +---------------- + é +(1 row) + +SELECT reverse(good) FROM regress_encoding; + reverse +--------- + éfac +(1 row) + +-- invalid short mb character = error +SELECT length(truncated) FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +SELECT substring(truncated, 1, 1) FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +SELECT reverse(truncated) FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +-- invalid short mb character = silently dropped +SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding; + regexp_replace +---------------- + caf +(1 row) + +-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string +-- contains NUL at a character boundary position, some functions treat it as a +-- character while others treat it as a terminator, as implementation details. +-- NUL = terminator +SELECT length(with_nul) FROM regress_encoding; + length +-------- + 4 +(1 row) + +SELECT substring(with_nul, 3, 1) FROM regress_encoding; + substring +----------- + f +(1 row) + +SELECT substring(with_nul, 4, 1) FROM regress_encoding; + substring +----------- + é +(1 row) + +SELECT substring(with_nul, 5, 1) FROM regress_encoding; + substring +----------- + +(1 row) + +SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding; + convert_to +------------ + \x +(1 row) + +SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding; + regexp_replace +---------------- + é +(1 row) + +-- NUL = character +SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding; + with_nul | reverse | reverse +----------+---------+--------- + café | abcd | café +(1 row) + +-- If a corrupted string contains NUL in the tail bytes of a multibyte +-- character (invalid in all encodings), it is considered part of the +-- character for length purposes. An error will only be raised in code paths +-- that convert or verify encodings. +SELECT length(truncated_with_nul) FROM regress_encoding; + length +-------- + 8 +(1 row) + +SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding; + substring +----------- + f +(1 row) + +SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding; + substring +----------- + +(1 row) + +SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 0x00 +SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding; + substring +----------- + d +(1 row) + +SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding; + ?column? +---------- + t +(1 row) + +SELECT reverse(truncated_with_nul) FROM regress_encoding; + reverse +--------- + abcd +(1 row) + +-- unbounded: sequence would overrun the string! +SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3) +FROM regress_encoding; + test_mblen_func +----------------- + 2 +(1 row) + +-- condition detected when using the length/range variants +SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3) +FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3) +FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +-- unbounded: sequence would overrun the string, if the terminator were really +-- the end of it +SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3) +FROM regress_encoding; + test_mblen_func +----------------- + 2 +(1 row) + +SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3) +FROM regress_encoding; + test_mblen_func +----------------- + 2 +(1 row) + +-- condition detected when using the cstr variants +SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3) +FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +DROP TABLE regress_encoding; +-- mb<->wchar conversions +CREATE FUNCTION test_encoding(encoding text, description text, input bytea) +RETURNS VOID LANGUAGE plpgsql AS +$$ +DECLARE + prefix text; + len int; + wchars int[]; + round_trip bytea; + result text; +BEGIN + prefix := rpad(encoding || ' ' || description || ':', 28); + + -- XXX could also test validation, length functions and include client + -- only encodings with these test cases + + IF test_valid_server_encoding(encoding) THEN + wchars := test_text_to_wchars(encoding, test_bytea_to_text(input)); + round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars)); + if input = round_trip then + result := 'OK'; + elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then + result := 'truncated'; + else + result := 'failed'; + end if; + RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result; + END IF; +END; +$$; +-- No validation is done on the encoding itself, just the length to avoid +-- overruns, so some of the byte sequences below are bogus. They cover +-- all code branches, server encodings only for now. +CREATE TABLE encoding_tests (encoding text, description text, input bytea); +INSERT INTO encoding_tests VALUES + -- LATIN1, other single-byte encodings + ('LATIN1', 'ASCII', 'a'), + ('LATIN1', 'extended', '\xe9'), + -- EUC_JP, EUC_JIS_2004, EUR_KR (for the purposes of wchar conversion): + -- 2 8e (CS2, not used by EUR_KR but arbitrarily considered to have EUC_JP length) + -- 3 8f (CS3, not used by EUR_KR but arbitrarily considered to have EUC_JP length) + -- 2 80..ff (CS1) + ('EUC_JP', 'ASCII', 'a'), + ('EUC_JP', 'CS1, short', '\x80'), + ('EUC_JP', 'CS1', '\x8002'), + ('EUC_JP', 'CS2, short', '\x8e'), + ('EUC_JP', 'CS2', '\x8e02'), + ('EUC_JP', 'CS3, short', '\x8f'), + ('EUC_JP', 'CS3, short', '\x8f02'), + ('EUC_JP', 'CS3', '\x8f0203'), + -- EUC_CN + -- 3 8e (CS2, not used but arbitrarily considered to have length 3) + -- 3 8f (CS3, not used but arbitrarily considered to have length 3) + -- 2 80..ff (CS1) + ('EUC_CN', 'ASCII', 'a'), + ('EUC_CN', 'CS1, short', '\x80'), + ('EUC_CN', 'CS1', '\x8002'), + ('EUC_CN', 'CS2, short', '\x8e'), + ('EUC_CN', 'CS2, short', '\x8e02'), + ('EUC_CN', 'CS2', '\x8e0203'), + ('EUC_CN', 'CS3, short', '\x8f'), + ('EUC_CN', 'CS3, short', '\x8f02'), + ('EUC_CN', 'CS3', '\x8f0203'), + -- EUC_TW: + -- 4 8e (CS2) + -- 3 8f (CS3, not used but arbitrarily considered to have length 3) + -- 2 80..ff (CS1) + ('EUC_TW', 'ASCII', 'a'), + ('EUC_TW', 'CS1, short', '\x80'), + ('EUC_TW', 'CS1', '\x8002'), + ('EUC_TW', 'CS2, short', '\x8e'), + ('EUC_TW', 'CS2, short', '\x8e02'), + ('EUC_TW', 'CS2, short', '\x8e0203'), + ('EUC_TW', 'CS2', '\x8e020304'), + ('EUC_TW', 'CS3, short', '\x8f'), + ('EUC_TW', 'CS3, short', '\x8f02'), + ('EUC_TW', 'CS3', '\x8f0203'), + -- UTF8 + -- 2 c0..df + -- 3 e0..ef + -- 4 f0..f7 (but maximum real codepoint U+10ffff has f4) + -- 5 f8..fb (not supported) + -- 6 fc..fd (not supported) + ('UTF8', 'ASCII', 'a'), + ('UTF8', '2 byte, short', '\xdf'), + ('UTF8', '2 byte', '\xdf82'), + ('UTF8', '3 byte, short', '\xef'), + ('UTF8', '3 byte, short', '\xef82'), + ('UTF8', '3 byte', '\xef8283'), + ('UTF8', '4 byte, short', '\xf7'), + ('UTF8', '4 byte, short', '\xf782'), + ('UTF8', '4 byte, short', '\xf78283'), + ('UTF8', '4 byte', '\xf7828384'), + ('UTF8', '5 byte, unsupported', '\xfb'), + ('UTF8', '5 byte, unsupported', '\xfb82'), + ('UTF8', '5 byte, unsupported', '\xfb8283'), + ('UTF8', '5 byte, unsupported', '\xfb828384'), + ('UTF8', '5 byte, unsupported', '\xfb82838485'), + ('UTF8', '6 byte, unsupported', '\xfd'), + ('UTF8', '6 byte, unsupported', '\xfd82'), + ('UTF8', '6 byte, unsupported', '\xfd8283'), + ('UTF8', '6 byte, unsupported', '\xfd828384'), + ('UTF8', '6 byte, unsupported', '\xfd82838485'), + ('UTF8', '6 byte, unsupported', '\xfd8283848586'), + -- MULE_INTERNAL + -- 2 81..8d LC1 + -- 3 90..99 LC2 + ('MULE_INTERNAL', 'ASCII', 'a'), + ('MULE_INTERNAL', 'LC1, short', '\x81'), + ('MULE_INTERNAL', 'LC1', '\x8182'), + ('MULE_INTERNAL', 'LC2, short', '\x90'), + ('MULE_INTERNAL', 'LC2, short', '\x9082'), + ('MULE_INTERNAL', 'LC2', '\x908283'); +SELECT COUNT(test_encoding(encoding, description, input)) > 0 +FROM encoding_tests; +NOTICE: LATIN1 ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: LATIN1 extended: \xe9 -> {233} -> \xe9 = OK +NOTICE: EUC_JP ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: EUC_JP CS1, short: \x80 -> {} -> \x = truncated +NOTICE: EUC_JP CS1: \x8002 -> {32770} -> \x8002 = OK +NOTICE: EUC_JP CS2, short: \x8e -> {} -> \x = truncated +NOTICE: EUC_JP CS2: \x8e02 -> {36354} -> \x8e02 = OK +NOTICE: EUC_JP CS3, short: \x8f -> {} -> \x = truncated +NOTICE: EUC_JP CS3, short: \x8f02 -> {} -> \x = truncated +NOTICE: EUC_JP CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK +NOTICE: EUC_CN ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: EUC_CN CS1, short: \x80 -> {} -> \x = truncated +NOTICE: EUC_CN CS1: \x8002 -> {32770} -> \x8002 = OK +NOTICE: EUC_CN CS2, short: \x8e -> {} -> \x = truncated +NOTICE: EUC_CN CS2, short: \x8e02 -> {} -> \x = truncated +NOTICE: EUC_CN CS2: \x8e0203 -> {9306627} -> \x8e0203 = OK +NOTICE: EUC_CN CS3, short: \x8f -> {} -> \x = truncated +NOTICE: EUC_CN CS3, short: \x8f02 -> {} -> \x = truncated +NOTICE: EUC_CN CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK +NOTICE: EUC_TW ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: EUC_TW CS1, short: \x80 -> {} -> \x = truncated +NOTICE: EUC_TW CS1: \x8002 -> {32770} -> \x8002 = OK +NOTICE: EUC_TW CS2, short: \x8e -> {} -> \x = truncated +NOTICE: EUC_TW CS2, short: \x8e02 -> {} -> \x = truncated +NOTICE: EUC_TW CS2, short: \x8e0203 -> {} -> \x = truncated +NOTICE: EUC_TW CS2: \x8e020304 -> {-1912470780} -> \x8e020304 = OK +NOTICE: EUC_TW CS3, short: \x8f -> {} -> \x = truncated +NOTICE: EUC_TW CS3, short: \x8f02 -> {} -> \x = truncated +NOTICE: EUC_TW CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK +NOTICE: UTF8 ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: UTF8 2 byte, short: \xdf -> {} -> \x = truncated +NOTICE: UTF8 2 byte: \xdf82 -> {1986} -> \xdf82 = OK +NOTICE: UTF8 3 byte, short: \xef -> {} -> \x = truncated +NOTICE: UTF8 3 byte, short: \xef82 -> {} -> \x = truncated +NOTICE: UTF8 3 byte: \xef8283 -> {61571} -> \xef8283 = OK +NOTICE: UTF8 4 byte, short: \xf7 -> {} -> \x = truncated +NOTICE: UTF8 4 byte, short: \xf782 -> {} -> \x = truncated +NOTICE: UTF8 4 byte, short: \xf78283 -> {} -> \x = truncated +NOTICE: UTF8 4 byte: \xf7828384 -> {1843396} -> \xf7828384 = OK +NOTICE: UTF8 5 byte, unsupported: \xfb -> {251} -> \xc3bb = failed +NOTICE: UTF8 5 byte, unsupported: \xfb82 -> {251,130} -> \xc3bbc282 = failed +NOTICE: UTF8 5 byte, unsupported: \xfb8283 -> {251,130,131} -> \xc3bbc282c283 = failed +NOTICE: UTF8 5 byte, unsupported: \xfb828384 -> {251,130,131,132} -> \xc3bbc282c283c284 = failed +NOTICE: UTF8 5 byte, unsupported: \xfb82838485 -> {251,130,131,132,133} -> \xc3bbc282c283c284c285 = failed +NOTICE: UTF8 6 byte, unsupported: \xfd -> {253} -> \xc3bd = failed +NOTICE: UTF8 6 byte, unsupported: \xfd82 -> {253,130} -> \xc3bdc282 = failed +NOTICE: UTF8 6 byte, unsupported: \xfd8283 -> {253,130,131} -> \xc3bdc282c283 = failed +NOTICE: UTF8 6 byte, unsupported: \xfd828384 -> {253,130,131,132} -> \xc3bdc282c283c284 = failed +NOTICE: UTF8 6 byte, unsupported: \xfd82838485 -> {253,130,131,132,133} -> \xc3bdc282c283c284c285 = failed +NOTICE: UTF8 6 byte, unsupported: \xfd8283848586 -> {253,130,131,132,133,134} -> \xc3bdc282c283c284c285c286 = failed +NOTICE: MULE_INTERNAL ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: MULE_INTERNAL LC1, short: \x81 -> {} -> \x = truncated +NOTICE: MULE_INTERNAL LC1: \x8182 -> {8454274} -> \x8182 = OK +NOTICE: MULE_INTERNAL LC2, short: \x90 -> {} -> \x = truncated +NOTICE: MULE_INTERNAL LC2, short: \x9082 -> {} -> \x = truncated +NOTICE: MULE_INTERNAL LC2: \x908283 -> {9470595} -> \x908283 = OK + ?column? +---------- + t +(1 row) + +DROP TABLE encoding_tests; +DROP FUNCTION test_encoding; +DROP FUNCTION test_text_to_wchars; +DROP FUNCTION test_mblen_func; +DROP FUNCTION test_bytea_to_text; +DROP FUNCTION test_text_to_bytea; +-- substring slow path: multi-byte escape char vs. multi-byte pattern char. +SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7'); + substring +----------- + +(1 row) + +-- Levenshtein distance metric: exercise character length cache. +SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name); +ERROR: column "real§_name" does not exist +LINE 1: SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name); + ^ +HINT: Perhaps you meant to reference the column "x.real_name". +-- JSON errcontext: truncate long data. +SELECT repeat(U&'\00A7', 30)::json; +ERROR: invalid input syntax for type json +DETAIL: Token "§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§" is invalid. +CONTEXT: JSON data, line 1: ...§§§§§§§§§§§§§§§§§§§§§§§§ diff --git a/src/test/regress/expected/encoding_1.out b/src/test/regress/expected/encoding_1.out new file mode 100644 index 00000000000..a5b02090901 --- /dev/null +++ b/src/test/regress/expected/encoding_1.out @@ -0,0 +1,4 @@ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/expected/euc_kr.out b/src/test/regress/expected/euc_kr.out new file mode 100644 index 00000000000..7a61c89a43a --- /dev/null +++ b/src/test/regress/expected/euc_kr.out @@ -0,0 +1,16 @@ +-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent +-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all +-- of EUC_KR, also run the test in UTF8. +SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset +\if :skip_test +\quit +\endif +-- Exercise is_multibyte_char_in_char (non-UTF8) slow path. +SELECT POSITION( + convert_from('\xbcf6c7d0', 'EUC_KR') IN + convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR')); + position +---------- + 5 +(1 row) + diff --git a/src/test/regress/expected/euc_kr_1.out b/src/test/regress/expected/euc_kr_1.out new file mode 100644 index 00000000000..faaac5d6355 --- /dev/null +++ b/src/test/regress/expected/euc_kr_1.out @@ -0,0 +1,6 @@ +-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent +-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all +-- of EUC_KR, also run the test in UTF8. +SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 87daf361618..989134997e1 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -28,7 +28,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t # geometry depends on point, lseg, line, box, path, polygon, circle # horology depends on date, time, timetz, timestamp, timestamptz, interval # ---------- -test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database +test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database encoding euc_kr # ---------- # Load huge amounts of data diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index c168081eb4a..b1dc26ebff5 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -1287,6 +1287,145 @@ test_enc_conversion(PG_FUNCTION_ARGS) PG_RETURN_DATUM(HeapTupleGetDatum(tuple)); } +/* Convert bytea to text without validation for corruption tests from SQL. */ +PG_FUNCTION_INFO_V1(test_bytea_to_text); +Datum +test_bytea_to_text(PG_FUNCTION_ARGS) +{ + PG_RETURN_TEXT_P(PG_GETARG_BYTEA_PP(0)); +} + +/* And the reverse. */ +PG_FUNCTION_INFO_V1(test_text_to_bytea); +Datum +test_text_to_bytea(PG_FUNCTION_ARGS) +{ + PG_RETURN_BYTEA_P(PG_GETARG_TEXT_PP(0)); +} + +/* Corruption tests in C. */ +PG_FUNCTION_INFO_V1(test_mblen_func); +Datum +test_mblen_func(PG_FUNCTION_ARGS) +{ + const char *func = text_to_cstring(PG_GETARG_BYTEA_PP(0)); + const char *encoding = text_to_cstring(PG_GETARG_BYTEA_PP(1)); + text *string = PG_GETARG_BYTEA_PP(2); + int offset = PG_GETARG_INT32(3); + const char *data = VARDATA_ANY(string); + size_t size = VARSIZE_ANY_EXHDR(string); + int result = 0; + + if (strcmp(func, "pg_mblen_unbounded") == 0) + result = pg_mblen_unbounded(data + offset); + else if (strcmp(func, "pg_mblen_cstr") == 0) + result = pg_mblen_cstr(data + offset); + else if (strcmp(func, "pg_mblen_with_len") == 0) + result = pg_mblen_with_len(data + offset, size - offset); + else if (strcmp(func, "pg_mblen_range") == 0) + result = pg_mblen_range(data + offset, data + size); + else if (strcmp(func, "pg_encoding_mblen") == 0) + result = pg_encoding_mblen(pg_char_to_encoding(encoding), data + offset); + else + elog(ERROR, "unknown function"); + + PG_RETURN_INT32(result); +} + +PG_FUNCTION_INFO_V1(test_text_to_wchars); +Datum +test_text_to_wchars(PG_FUNCTION_ARGS) +{ + const char *encoding_name = text_to_cstring(PG_GETARG_BYTEA_PP(0)); + text *string = PG_GETARG_TEXT_PP(1); + const char *data = VARDATA_ANY(string); + size_t size = VARSIZE_ANY_EXHDR(string); + pg_wchar *wchars = palloc(sizeof(pg_wchar) * (size + 1)); + Datum *datums; + int wlen; + int encoding; + + encoding = pg_char_to_encoding(encoding_name); + if (encoding < 0) + elog(ERROR, "unknown encoding name: %s", encoding_name); + + if (size > 0) + { + datums = palloc(sizeof(Datum) * size); + wlen = pg_encoding_mb2wchar_with_len(encoding, + data, + wchars, + size); + Assert(wlen >= 0); + Assert(wlen <= size); + Assert(wchars[wlen] == 0); + + for (int i = 0; i < wlen; ++i) + datums[i] = UInt32GetDatum(wchars[i]); + } + else + { + datums = NULL; + wlen = 0; + } + + PG_RETURN_ARRAYTYPE_P(construct_array_builtin(datums, wlen, INT4OID)); +} + +PG_FUNCTION_INFO_V1(test_wchars_to_text); +Datum +test_wchars_to_text(PG_FUNCTION_ARGS) +{ + const char *encoding_name = text_to_cstring(PG_GETARG_BYTEA_PP(0)); + ArrayType *array = PG_GETARG_ARRAYTYPE_P(1); + Datum *datums; + bool *nulls; + char *mb; + text *result; + int wlen; + int bytes; + int encoding; + + encoding = pg_char_to_encoding(encoding_name); + if (encoding < 0) + elog(ERROR, "unknown encoding name: %s", encoding_name); + + deconstruct_array_builtin(array, INT4OID, &datums, &nulls, &wlen); + + if (wlen > 0) + { + pg_wchar *wchars = palloc(sizeof(pg_wchar) * wlen); + + for (int i = 0; i < wlen; ++i) + { + if (nulls[i]) + elog(ERROR, "unexpected NULL in array"); + wchars[i] = DatumGetInt32(datums[i]); + } + + mb = palloc(pg_encoding_max_length(encoding) * wlen + 1); + bytes = pg_encoding_wchar2mb_with_len(encoding, wchars, mb, wlen); + } + else + { + mb = ""; + bytes = 0; + } + + result = palloc(bytes + VARHDRSZ); + SET_VARSIZE(result, bytes + VARHDRSZ); + memcpy(VARDATA(result), mb, bytes); + + PG_RETURN_TEXT_P(result); +} + +PG_FUNCTION_INFO_V1(test_valid_server_encoding); +Datum +test_valid_server_encoding(PG_FUNCTION_ARGS) +{ + return pg_valid_server_encoding(text_to_cstring(PG_GETARG_TEXT_PP(0))); +} + /* Provide SQL access to IsBinaryCoercible() */ PG_FUNCTION_INFO_V1(binary_coercible); Datum diff --git a/src/test/regress/sql/encoding.sql b/src/test/regress/sql/encoding.sql new file mode 100644 index 00000000000..b9543c0cb32 --- /dev/null +++ b/src/test/regress/sql/encoding.sql @@ -0,0 +1,228 @@ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +\getenv libdir PG_LIBDIR +\getenv dlsuffix PG_DLSUFFIX + +\set regresslib :libdir '/regress' :dlsuffix + +CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[] + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean + AS :'regresslib' LANGUAGE C STRICT; + + +CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text); +INSERT INTO regress_encoding +VALUES ('café', + 'caf' || test_bytea_to_text('\xc3'), + 'café' || test_bytea_to_text('\x00') || 'dcba', + 'caf' || test_bytea_to_text('\xc300') || 'dcba'); + +SELECT good, truncated, with_nul FROM regress_encoding; + +SELECT length(good) FROM regress_encoding; +SELECT substring(good, 3, 1) FROM regress_encoding; +SELECT substring(good, 4, 1) FROM regress_encoding; +SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding; +SELECT reverse(good) FROM regress_encoding; + +-- invalid short mb character = error +SELECT length(truncated) FROM regress_encoding; +SELECT substring(truncated, 1, 1) FROM regress_encoding; +SELECT reverse(truncated) FROM regress_encoding; +-- invalid short mb character = silently dropped +SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding; + +-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string +-- contains NUL at a character boundary position, some functions treat it as a +-- character while others treat it as a terminator, as implementation details. + +-- NUL = terminator +SELECT length(with_nul) FROM regress_encoding; +SELECT substring(with_nul, 3, 1) FROM regress_encoding; +SELECT substring(with_nul, 4, 1) FROM regress_encoding; +SELECT substring(with_nul, 5, 1) FROM regress_encoding; +SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding; +SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding; +-- NUL = character +SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding; + +-- If a corrupted string contains NUL in the tail bytes of a multibyte +-- character (invalid in all encodings), it is considered part of the +-- character for length purposes. An error will only be raised in code paths +-- that convert or verify encodings. + +SELECT length(truncated_with_nul) FROM regress_encoding; +SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding; +SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding; +SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding; +SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding; +SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding; +SELECT reverse(truncated_with_nul) FROM regress_encoding; + +-- unbounded: sequence would overrun the string! +SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3) +FROM regress_encoding; + +-- condition detected when using the length/range variants +SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3) +FROM regress_encoding; +SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3) +FROM regress_encoding; + +-- unbounded: sequence would overrun the string, if the terminator were really +-- the end of it +SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3) +FROM regress_encoding; +SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3) +FROM regress_encoding; + +-- condition detected when using the cstr variants +SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3) +FROM regress_encoding; + +DROP TABLE regress_encoding; + +-- mb<->wchar conversions +CREATE FUNCTION test_encoding(encoding text, description text, input bytea) +RETURNS VOID LANGUAGE plpgsql AS +$$ +DECLARE + prefix text; + len int; + wchars int[]; + round_trip bytea; + result text; +BEGIN + prefix := rpad(encoding || ' ' || description || ':', 28); + + -- XXX could also test validation, length functions and include client + -- only encodings with these test cases + + IF test_valid_server_encoding(encoding) THEN + wchars := test_text_to_wchars(encoding, test_bytea_to_text(input)); + round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars)); + if input = round_trip then + result := 'OK'; + elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then + result := 'truncated'; + else + result := 'failed'; + end if; + RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result; + END IF; +END; +$$; +-- No validation is done on the encoding itself, just the length to avoid +-- overruns, so some of the byte sequences below are bogus. They cover +-- all code branches, server encodings only for now. +CREATE TABLE encoding_tests (encoding text, description text, input bytea); +INSERT INTO encoding_tests VALUES + -- LATIN1, other single-byte encodings + ('LATIN1', 'ASCII', 'a'), + ('LATIN1', 'extended', '\xe9'), + -- EUC_JP, EUC_JIS_2004, EUR_KR (for the purposes of wchar conversion): + -- 2 8e (CS2, not used by EUR_KR but arbitrarily considered to have EUC_JP length) + -- 3 8f (CS3, not used by EUR_KR but arbitrarily considered to have EUC_JP length) + -- 2 80..ff (CS1) + ('EUC_JP', 'ASCII', 'a'), + ('EUC_JP', 'CS1, short', '\x80'), + ('EUC_JP', 'CS1', '\x8002'), + ('EUC_JP', 'CS2, short', '\x8e'), + ('EUC_JP', 'CS2', '\x8e02'), + ('EUC_JP', 'CS3, short', '\x8f'), + ('EUC_JP', 'CS3, short', '\x8f02'), + ('EUC_JP', 'CS3', '\x8f0203'), + -- EUC_CN + -- 3 8e (CS2, not used but arbitrarily considered to have length 3) + -- 3 8f (CS3, not used but arbitrarily considered to have length 3) + -- 2 80..ff (CS1) + ('EUC_CN', 'ASCII', 'a'), + ('EUC_CN', 'CS1, short', '\x80'), + ('EUC_CN', 'CS1', '\x8002'), + ('EUC_CN', 'CS2, short', '\x8e'), + ('EUC_CN', 'CS2, short', '\x8e02'), + ('EUC_CN', 'CS2', '\x8e0203'), + ('EUC_CN', 'CS3, short', '\x8f'), + ('EUC_CN', 'CS3, short', '\x8f02'), + ('EUC_CN', 'CS3', '\x8f0203'), + -- EUC_TW: + -- 4 8e (CS2) + -- 3 8f (CS3, not used but arbitrarily considered to have length 3) + -- 2 80..ff (CS1) + ('EUC_TW', 'ASCII', 'a'), + ('EUC_TW', 'CS1, short', '\x80'), + ('EUC_TW', 'CS1', '\x8002'), + ('EUC_TW', 'CS2, short', '\x8e'), + ('EUC_TW', 'CS2, short', '\x8e02'), + ('EUC_TW', 'CS2, short', '\x8e0203'), + ('EUC_TW', 'CS2', '\x8e020304'), + ('EUC_TW', 'CS3, short', '\x8f'), + ('EUC_TW', 'CS3, short', '\x8f02'), + ('EUC_TW', 'CS3', '\x8f0203'), + -- UTF8 + -- 2 c0..df + -- 3 e0..ef + -- 4 f0..f7 (but maximum real codepoint U+10ffff has f4) + -- 5 f8..fb (not supported) + -- 6 fc..fd (not supported) + ('UTF8', 'ASCII', 'a'), + ('UTF8', '2 byte, short', '\xdf'), + ('UTF8', '2 byte', '\xdf82'), + ('UTF8', '3 byte, short', '\xef'), + ('UTF8', '3 byte, short', '\xef82'), + ('UTF8', '3 byte', '\xef8283'), + ('UTF8', '4 byte, short', '\xf7'), + ('UTF8', '4 byte, short', '\xf782'), + ('UTF8', '4 byte, short', '\xf78283'), + ('UTF8', '4 byte', '\xf7828384'), + ('UTF8', '5 byte, unsupported', '\xfb'), + ('UTF8', '5 byte, unsupported', '\xfb82'), + ('UTF8', '5 byte, unsupported', '\xfb8283'), + ('UTF8', '5 byte, unsupported', '\xfb828384'), + ('UTF8', '5 byte, unsupported', '\xfb82838485'), + ('UTF8', '6 byte, unsupported', '\xfd'), + ('UTF8', '6 byte, unsupported', '\xfd82'), + ('UTF8', '6 byte, unsupported', '\xfd8283'), + ('UTF8', '6 byte, unsupported', '\xfd828384'), + ('UTF8', '6 byte, unsupported', '\xfd82838485'), + ('UTF8', '6 byte, unsupported', '\xfd8283848586'), + -- MULE_INTERNAL + -- 2 81..8d LC1 + -- 3 90..99 LC2 + ('MULE_INTERNAL', 'ASCII', 'a'), + ('MULE_INTERNAL', 'LC1, short', '\x81'), + ('MULE_INTERNAL', 'LC1', '\x8182'), + ('MULE_INTERNAL', 'LC2, short', '\x90'), + ('MULE_INTERNAL', 'LC2, short', '\x9082'), + ('MULE_INTERNAL', 'LC2', '\x908283'); + +SELECT COUNT(test_encoding(encoding, description, input)) > 0 +FROM encoding_tests; + +DROP TABLE encoding_tests; +DROP FUNCTION test_encoding; +DROP FUNCTION test_text_to_wchars; +DROP FUNCTION test_mblen_func; +DROP FUNCTION test_bytea_to_text; +DROP FUNCTION test_text_to_bytea; + + +-- substring slow path: multi-byte escape char vs. multi-byte pattern char. +SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7'); +-- Levenshtein distance metric: exercise character length cache. +SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name); +-- JSON errcontext: truncate long data. +SELECT repeat(U&'\00A7', 30)::json; diff --git a/src/test/regress/sql/euc_kr.sql b/src/test/regress/sql/euc_kr.sql new file mode 100644 index 00000000000..1851b2a8c14 --- /dev/null +++ b/src/test/regress/sql/euc_kr.sql @@ -0,0 +1,12 @@ +-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent +-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all +-- of EUC_KR, also run the test in UTF8. +SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset +\if :skip_test +\quit +\endif + +-- Exercise is_multibyte_char_in_char (non-UTF8) slow path. +SELECT POSITION( + convert_from('\xbcf6c7d0', 'EUC_KR') IN + convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR'));