mirror of
https://github.com/postgres/postgres.git
synced 2026-02-11 23:03:25 -05:00
Code coverage for most pg_mblen* calls.
A security patch changed them today, so close the coverage gap now. Test that buffer overrun is avoided when pg_mblen*() requires more than the number of bytes remaining. This does not cover the calls in dict_thesaurus.c or in dict_synonym.c. That code is straightforward. To change that code's input, one must have access to modify installed OS files, so low-privilege users are not a threat. Testing this would likewise require changing installed share/postgresql/tsearch_data, which was enough of an obstacle to not bother. Security: CVE-2026-2006 Backpatch-through: 14 Co-authored-by: Thomas Munro <thomas.munro@gmail.com> Co-authored-by: Noah Misch <noah@leadboat.com> Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
This commit is contained in:
parent
d837fb0292
commit
4c08960d97
15 changed files with 885 additions and 2 deletions
|
|
@ -14,7 +14,7 @@ DATA = pg_trgm--1.5--1.6.sql pg_trgm--1.4--1.5.sql pg_trgm--1.3--1.4.sql \
|
|||
pg_trgm--1.0--1.1.sql
|
||||
PGFILEDESC = "pg_trgm - trigram matching"
|
||||
|
||||
REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm
|
||||
REGRESS = pg_trgm pg_utf8_trgm pg_word_trgm pg_strict_word_trgm
|
||||
|
||||
ifdef USE_PGXS
|
||||
PG_CONFIG = pg_config
|
||||
|
|
|
|||
50
contrib/pg_trgm/data/trgm_utf8.data
Normal file
50
contrib/pg_trgm/data/trgm_utf8.data
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
Mathematics
|
||||
数学
|
||||
गणित
|
||||
Matemáticas
|
||||
رياضيات
|
||||
Mathématiques
|
||||
গণিত
|
||||
Matemática
|
||||
Математика
|
||||
ریاضی
|
||||
Matematika
|
||||
Mathematik
|
||||
数学
|
||||
Mathematics
|
||||
गणित
|
||||
గణితం
|
||||
Matematik
|
||||
கணிதம்
|
||||
數學
|
||||
Toán học
|
||||
Matematika
|
||||
数学
|
||||
수학
|
||||
ریاضی
|
||||
Lissafi
|
||||
Hisabati
|
||||
Matematika
|
||||
Matematica
|
||||
ریاضی
|
||||
ಗಣಿತ
|
||||
ગણિત
|
||||
คณิตศาสตร์
|
||||
ሂሳብ
|
||||
गणित
|
||||
ਗਣਿਤ
|
||||
數學
|
||||
数学
|
||||
Iṣiro
|
||||
數學
|
||||
သင်္ချာ
|
||||
Herrega
|
||||
رياضي
|
||||
गणित
|
||||
Математика
|
||||
Matematyka
|
||||
ഗണിതം
|
||||
Matematika
|
||||
رياضي
|
||||
Matematika
|
||||
Matematică
|
||||
8
contrib/pg_trgm/expected/pg_utf8_trgm.out
Normal file
8
contrib/pg_trgm/expected/pg_utf8_trgm.out
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
|
||||
\if :skip_test
|
||||
\quit
|
||||
\endif
|
||||
-- Index 50 translations of the word "Mathematics"
|
||||
CREATE TEMP TABLE mb (s text);
|
||||
\copy mb from 'data/trgm_utf8.data'
|
||||
CREATE INDEX ON mb USING gist(s gist_trgm_ops);
|
||||
3
contrib/pg_trgm/expected/pg_utf8_trgm_1.out
Normal file
3
contrib/pg_trgm/expected/pg_utf8_trgm_1.out
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
|
||||
\if :skip_test
|
||||
\quit
|
||||
|
|
@ -39,6 +39,7 @@ tests += {
|
|||
'regress': {
|
||||
'sql': [
|
||||
'pg_trgm',
|
||||
'pg_utf8_trgm',
|
||||
'pg_word_trgm',
|
||||
'pg_strict_word_trgm',
|
||||
],
|
||||
|
|
|
|||
9
contrib/pg_trgm/sql/pg_utf8_trgm.sql
Normal file
9
contrib/pg_trgm/sql/pg_utf8_trgm.sql
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
|
||||
\if :skip_test
|
||||
\quit
|
||||
\endif
|
||||
|
||||
-- Index 50 translations of the word "Mathematics"
|
||||
CREATE TEMP TABLE mb (s text);
|
||||
\copy mb from 'data/trgm_utf8.data'
|
||||
CREATE INDEX ON mb USING gist(s gist_trgm_ops);
|
||||
|
|
@ -3752,6 +3752,12 @@ deconstruct_array_builtin(ArrayType *array,
|
|||
elmalign = TYPALIGN_SHORT;
|
||||
break;
|
||||
|
||||
case INT4OID:
|
||||
elmlen = sizeof(int32);
|
||||
elmbyval = true;
|
||||
elmalign = TYPALIGN_INT;
|
||||
break;
|
||||
|
||||
case OIDOID:
|
||||
elmlen = sizeof(Oid);
|
||||
elmbyval = true;
|
||||
|
|
|
|||
401
src/test/regress/expected/encoding.out
Normal file
401
src/test/regress/expected/encoding.out
Normal file
|
|
@ -0,0 +1,401 @@
|
|||
/* skip test if not UTF8 server encoding */
|
||||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
|
||||
\if :skip_test
|
||||
\quit
|
||||
\endif
|
||||
\getenv libdir PG_LIBDIR
|
||||
\getenv dlsuffix PG_DLSUFFIX
|
||||
\set regresslib :libdir '/regress' :dlsuffix
|
||||
CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text
|
||||
AS :'regresslib' LANGUAGE C STRICT;
|
||||
CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea
|
||||
AS :'regresslib' LANGUAGE C STRICT;
|
||||
CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int
|
||||
AS :'regresslib' LANGUAGE C STRICT;
|
||||
CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[]
|
||||
AS :'regresslib' LANGUAGE C STRICT;
|
||||
CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text
|
||||
AS :'regresslib' LANGUAGE C STRICT;
|
||||
CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean
|
||||
AS :'regresslib' LANGUAGE C STRICT;
|
||||
CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text);
|
||||
INSERT INTO regress_encoding
|
||||
VALUES ('café',
|
||||
'caf' || test_bytea_to_text('\xc3'),
|
||||
'café' || test_bytea_to_text('\x00') || 'dcba',
|
||||
'caf' || test_bytea_to_text('\xc300') || 'dcba');
|
||||
SELECT good, truncated, with_nul FROM regress_encoding;
|
||||
good | truncated | with_nul
|
||||
------+-----------+----------
|
||||
café | caf | café
|
||||
(1 row)
|
||||
|
||||
SELECT length(good) FROM regress_encoding;
|
||||
length
|
||||
--------
|
||||
4
|
||||
(1 row)
|
||||
|
||||
SELECT substring(good, 3, 1) FROM regress_encoding;
|
||||
substring
|
||||
-----------
|
||||
f
|
||||
(1 row)
|
||||
|
||||
SELECT substring(good, 4, 1) FROM regress_encoding;
|
||||
substring
|
||||
-----------
|
||||
é
|
||||
(1 row)
|
||||
|
||||
SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding;
|
||||
regexp_replace
|
||||
----------------
|
||||
é
|
||||
(1 row)
|
||||
|
||||
SELECT reverse(good) FROM regress_encoding;
|
||||
reverse
|
||||
---------
|
||||
éfac
|
||||
(1 row)
|
||||
|
||||
-- invalid short mb character = error
|
||||
SELECT length(truncated) FROM regress_encoding;
|
||||
ERROR: invalid byte sequence for encoding "UTF8": 0xc3
|
||||
SELECT substring(truncated, 1, 1) FROM regress_encoding;
|
||||
ERROR: invalid byte sequence for encoding "UTF8": 0xc3
|
||||
SELECT reverse(truncated) FROM regress_encoding;
|
||||
ERROR: invalid byte sequence for encoding "UTF8": 0xc3
|
||||
-- invalid short mb character = silently dropped
|
||||
SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding;
|
||||
regexp_replace
|
||||
----------------
|
||||
caf
|
||||
(1 row)
|
||||
|
||||
-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string
|
||||
-- contains NUL at a character boundary position, some functions treat it as a
|
||||
-- character while others treat it as a terminator, as implementation details.
|
||||
-- NUL = terminator
|
||||
SELECT length(with_nul) FROM regress_encoding;
|
||||
length
|
||||
--------
|
||||
4
|
||||
(1 row)
|
||||
|
||||
SELECT substring(with_nul, 3, 1) FROM regress_encoding;
|
||||
substring
|
||||
-----------
|
||||
f
|
||||
(1 row)
|
||||
|
||||
SELECT substring(with_nul, 4, 1) FROM regress_encoding;
|
||||
substring
|
||||
-----------
|
||||
é
|
||||
(1 row)
|
||||
|
||||
SELECT substring(with_nul, 5, 1) FROM regress_encoding;
|
||||
substring
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding;
|
||||
convert_to
|
||||
------------
|
||||
\x
|
||||
(1 row)
|
||||
|
||||
SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding;
|
||||
regexp_replace
|
||||
----------------
|
||||
é
|
||||
(1 row)
|
||||
|
||||
-- NUL = character
|
||||
SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding;
|
||||
with_nul | reverse | reverse
|
||||
----------+---------+---------
|
||||
café | abcd | café
|
||||
(1 row)
|
||||
|
||||
-- If a corrupted string contains NUL in the tail bytes of a multibyte
|
||||
-- character (invalid in all encodings), it is considered part of the
|
||||
-- character for length purposes. An error will only be raised in code paths
|
||||
-- that convert or verify encodings.
|
||||
SELECT length(truncated_with_nul) FROM regress_encoding;
|
||||
length
|
||||
--------
|
||||
8
|
||||
(1 row)
|
||||
|
||||
SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding;
|
||||
substring
|
||||
-----------
|
||||
f
|
||||
(1 row)
|
||||
|
||||
SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding;
|
||||
substring
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding;
|
||||
ERROR: invalid byte sequence for encoding "UTF8": 0xc3 0x00
|
||||
SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding;
|
||||
substring
|
||||
-----------
|
||||
d
|
||||
(1 row)
|
||||
|
||||
SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding;
|
||||
?column?
|
||||
----------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT reverse(truncated_with_nul) FROM regress_encoding;
|
||||
reverse
|
||||
---------
|
||||
abcd
|
||||
(1 row)
|
||||
|
||||
-- unbounded: sequence would overrun the string!
|
||||
SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3)
|
||||
FROM regress_encoding;
|
||||
test_mblen_func
|
||||
-----------------
|
||||
2
|
||||
(1 row)
|
||||
|
||||
-- condition detected when using the length/range variants
|
||||
SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3)
|
||||
FROM regress_encoding;
|
||||
ERROR: invalid byte sequence for encoding "UTF8": 0xc3
|
||||
SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3)
|
||||
FROM regress_encoding;
|
||||
ERROR: invalid byte sequence for encoding "UTF8": 0xc3
|
||||
-- unbounded: sequence would overrun the string, if the terminator were really
|
||||
-- the end of it
|
||||
SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3)
|
||||
FROM regress_encoding;
|
||||
test_mblen_func
|
||||
-----------------
|
||||
2
|
||||
(1 row)
|
||||
|
||||
SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3)
|
||||
FROM regress_encoding;
|
||||
test_mblen_func
|
||||
-----------------
|
||||
2
|
||||
(1 row)
|
||||
|
||||
-- condition detected when using the cstr variants
|
||||
SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3)
|
||||
FROM regress_encoding;
|
||||
ERROR: invalid byte sequence for encoding "UTF8": 0xc3
|
||||
DROP TABLE regress_encoding;
|
||||
-- mb<->wchar conversions
|
||||
CREATE FUNCTION test_encoding(encoding text, description text, input bytea)
|
||||
RETURNS VOID LANGUAGE plpgsql AS
|
||||
$$
|
||||
DECLARE
|
||||
prefix text;
|
||||
len int;
|
||||
wchars int[];
|
||||
round_trip bytea;
|
||||
result text;
|
||||
BEGIN
|
||||
prefix := rpad(encoding || ' ' || description || ':', 28);
|
||||
|
||||
-- XXX could also test validation, length functions and include client
|
||||
-- only encodings with these test cases
|
||||
|
||||
IF test_valid_server_encoding(encoding) THEN
|
||||
wchars := test_text_to_wchars(encoding, test_bytea_to_text(input));
|
||||
round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars));
|
||||
if input = round_trip then
|
||||
result := 'OK';
|
||||
elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then
|
||||
result := 'truncated';
|
||||
else
|
||||
result := 'failed';
|
||||
end if;
|
||||
RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result;
|
||||
END IF;
|
||||
END;
|
||||
$$;
|
||||
-- No validation is done on the encoding itself, just the length to avoid
|
||||
-- overruns, so some of the byte sequences below are bogus. They cover
|
||||
-- all code branches, server encodings only for now.
|
||||
CREATE TABLE encoding_tests (encoding text, description text, input bytea);
|
||||
INSERT INTO encoding_tests VALUES
|
||||
-- LATIN1, other single-byte encodings
|
||||
('LATIN1', 'ASCII', 'a'),
|
||||
('LATIN1', 'extended', '\xe9'),
|
||||
-- EUC_JP, EUC_JIS_2004, EUR_KR (for the purposes of wchar conversion):
|
||||
-- 2 8e (CS2, not used by EUR_KR but arbitrarily considered to have EUC_JP length)
|
||||
-- 3 8f (CS3, not used by EUR_KR but arbitrarily considered to have EUC_JP length)
|
||||
-- 2 80..ff (CS1)
|
||||
('EUC_JP', 'ASCII', 'a'),
|
||||
('EUC_JP', 'CS1, short', '\x80'),
|
||||
('EUC_JP', 'CS1', '\x8002'),
|
||||
('EUC_JP', 'CS2, short', '\x8e'),
|
||||
('EUC_JP', 'CS2', '\x8e02'),
|
||||
('EUC_JP', 'CS3, short', '\x8f'),
|
||||
('EUC_JP', 'CS3, short', '\x8f02'),
|
||||
('EUC_JP', 'CS3', '\x8f0203'),
|
||||
-- EUC_CN
|
||||
-- 3 8e (CS2, not used but arbitrarily considered to have length 3)
|
||||
-- 3 8f (CS3, not used but arbitrarily considered to have length 3)
|
||||
-- 2 80..ff (CS1)
|
||||
('EUC_CN', 'ASCII', 'a'),
|
||||
('EUC_CN', 'CS1, short', '\x80'),
|
||||
('EUC_CN', 'CS1', '\x8002'),
|
||||
('EUC_CN', 'CS2, short', '\x8e'),
|
||||
('EUC_CN', 'CS2, short', '\x8e02'),
|
||||
('EUC_CN', 'CS2', '\x8e0203'),
|
||||
('EUC_CN', 'CS3, short', '\x8f'),
|
||||
('EUC_CN', 'CS3, short', '\x8f02'),
|
||||
('EUC_CN', 'CS3', '\x8f0203'),
|
||||
-- EUC_TW:
|
||||
-- 4 8e (CS2)
|
||||
-- 3 8f (CS3, not used but arbitrarily considered to have length 3)
|
||||
-- 2 80..ff (CS1)
|
||||
('EUC_TW', 'ASCII', 'a'),
|
||||
('EUC_TW', 'CS1, short', '\x80'),
|
||||
('EUC_TW', 'CS1', '\x8002'),
|
||||
('EUC_TW', 'CS2, short', '\x8e'),
|
||||
('EUC_TW', 'CS2, short', '\x8e02'),
|
||||
('EUC_TW', 'CS2, short', '\x8e0203'),
|
||||
('EUC_TW', 'CS2', '\x8e020304'),
|
||||
('EUC_TW', 'CS3, short', '\x8f'),
|
||||
('EUC_TW', 'CS3, short', '\x8f02'),
|
||||
('EUC_TW', 'CS3', '\x8f0203'),
|
||||
-- UTF8
|
||||
-- 2 c0..df
|
||||
-- 3 e0..ef
|
||||
-- 4 f0..f7 (but maximum real codepoint U+10ffff has f4)
|
||||
-- 5 f8..fb (not supported)
|
||||
-- 6 fc..fd (not supported)
|
||||
('UTF8', 'ASCII', 'a'),
|
||||
('UTF8', '2 byte, short', '\xdf'),
|
||||
('UTF8', '2 byte', '\xdf82'),
|
||||
('UTF8', '3 byte, short', '\xef'),
|
||||
('UTF8', '3 byte, short', '\xef82'),
|
||||
('UTF8', '3 byte', '\xef8283'),
|
||||
('UTF8', '4 byte, short', '\xf7'),
|
||||
('UTF8', '4 byte, short', '\xf782'),
|
||||
('UTF8', '4 byte, short', '\xf78283'),
|
||||
('UTF8', '4 byte', '\xf7828384'),
|
||||
('UTF8', '5 byte, unsupported', '\xfb'),
|
||||
('UTF8', '5 byte, unsupported', '\xfb82'),
|
||||
('UTF8', '5 byte, unsupported', '\xfb8283'),
|
||||
('UTF8', '5 byte, unsupported', '\xfb828384'),
|
||||
('UTF8', '5 byte, unsupported', '\xfb82838485'),
|
||||
('UTF8', '6 byte, unsupported', '\xfd'),
|
||||
('UTF8', '6 byte, unsupported', '\xfd82'),
|
||||
('UTF8', '6 byte, unsupported', '\xfd8283'),
|
||||
('UTF8', '6 byte, unsupported', '\xfd828384'),
|
||||
('UTF8', '6 byte, unsupported', '\xfd82838485'),
|
||||
('UTF8', '6 byte, unsupported', '\xfd8283848586'),
|
||||
-- MULE_INTERNAL
|
||||
-- 2 81..8d LC1
|
||||
-- 3 90..99 LC2
|
||||
('MULE_INTERNAL', 'ASCII', 'a'),
|
||||
('MULE_INTERNAL', 'LC1, short', '\x81'),
|
||||
('MULE_INTERNAL', 'LC1', '\x8182'),
|
||||
('MULE_INTERNAL', 'LC2, short', '\x90'),
|
||||
('MULE_INTERNAL', 'LC2, short', '\x9082'),
|
||||
('MULE_INTERNAL', 'LC2', '\x908283');
|
||||
SELECT COUNT(test_encoding(encoding, description, input)) > 0
|
||||
FROM encoding_tests;
|
||||
NOTICE: LATIN1 ASCII: \x61 -> {97} -> \x61 = OK
|
||||
NOTICE: LATIN1 extended: \xe9 -> {233} -> \xe9 = OK
|
||||
NOTICE: EUC_JP ASCII: \x61 -> {97} -> \x61 = OK
|
||||
NOTICE: EUC_JP CS1, short: \x80 -> {} -> \x = truncated
|
||||
NOTICE: EUC_JP CS1: \x8002 -> {32770} -> \x8002 = OK
|
||||
NOTICE: EUC_JP CS2, short: \x8e -> {} -> \x = truncated
|
||||
NOTICE: EUC_JP CS2: \x8e02 -> {36354} -> \x8e02 = OK
|
||||
NOTICE: EUC_JP CS3, short: \x8f -> {} -> \x = truncated
|
||||
NOTICE: EUC_JP CS3, short: \x8f02 -> {} -> \x = truncated
|
||||
NOTICE: EUC_JP CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK
|
||||
NOTICE: EUC_CN ASCII: \x61 -> {97} -> \x61 = OK
|
||||
NOTICE: EUC_CN CS1, short: \x80 -> {} -> \x = truncated
|
||||
NOTICE: EUC_CN CS1: \x8002 -> {32770} -> \x8002 = OK
|
||||
NOTICE: EUC_CN CS2, short: \x8e -> {} -> \x = truncated
|
||||
NOTICE: EUC_CN CS2, short: \x8e02 -> {} -> \x = truncated
|
||||
NOTICE: EUC_CN CS2: \x8e0203 -> {9306627} -> \x8e0203 = OK
|
||||
NOTICE: EUC_CN CS3, short: \x8f -> {} -> \x = truncated
|
||||
NOTICE: EUC_CN CS3, short: \x8f02 -> {} -> \x = truncated
|
||||
NOTICE: EUC_CN CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK
|
||||
NOTICE: EUC_TW ASCII: \x61 -> {97} -> \x61 = OK
|
||||
NOTICE: EUC_TW CS1, short: \x80 -> {} -> \x = truncated
|
||||
NOTICE: EUC_TW CS1: \x8002 -> {32770} -> \x8002 = OK
|
||||
NOTICE: EUC_TW CS2, short: \x8e -> {} -> \x = truncated
|
||||
NOTICE: EUC_TW CS2, short: \x8e02 -> {} -> \x = truncated
|
||||
NOTICE: EUC_TW CS2, short: \x8e0203 -> {} -> \x = truncated
|
||||
NOTICE: EUC_TW CS2: \x8e020304 -> {-1912470780} -> \x8e020304 = OK
|
||||
NOTICE: EUC_TW CS3, short: \x8f -> {} -> \x = truncated
|
||||
NOTICE: EUC_TW CS3, short: \x8f02 -> {} -> \x = truncated
|
||||
NOTICE: EUC_TW CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK
|
||||
NOTICE: UTF8 ASCII: \x61 -> {97} -> \x61 = OK
|
||||
NOTICE: UTF8 2 byte, short: \xdf -> {} -> \x = truncated
|
||||
NOTICE: UTF8 2 byte: \xdf82 -> {1986} -> \xdf82 = OK
|
||||
NOTICE: UTF8 3 byte, short: \xef -> {} -> \x = truncated
|
||||
NOTICE: UTF8 3 byte, short: \xef82 -> {} -> \x = truncated
|
||||
NOTICE: UTF8 3 byte: \xef8283 -> {61571} -> \xef8283 = OK
|
||||
NOTICE: UTF8 4 byte, short: \xf7 -> {} -> \x = truncated
|
||||
NOTICE: UTF8 4 byte, short: \xf782 -> {} -> \x = truncated
|
||||
NOTICE: UTF8 4 byte, short: \xf78283 -> {} -> \x = truncated
|
||||
NOTICE: UTF8 4 byte: \xf7828384 -> {1843396} -> \xf7828384 = OK
|
||||
NOTICE: UTF8 5 byte, unsupported: \xfb -> {251} -> \xc3bb = failed
|
||||
NOTICE: UTF8 5 byte, unsupported: \xfb82 -> {251,130} -> \xc3bbc282 = failed
|
||||
NOTICE: UTF8 5 byte, unsupported: \xfb8283 -> {251,130,131} -> \xc3bbc282c283 = failed
|
||||
NOTICE: UTF8 5 byte, unsupported: \xfb828384 -> {251,130,131,132} -> \xc3bbc282c283c284 = failed
|
||||
NOTICE: UTF8 5 byte, unsupported: \xfb82838485 -> {251,130,131,132,133} -> \xc3bbc282c283c284c285 = failed
|
||||
NOTICE: UTF8 6 byte, unsupported: \xfd -> {253} -> \xc3bd = failed
|
||||
NOTICE: UTF8 6 byte, unsupported: \xfd82 -> {253,130} -> \xc3bdc282 = failed
|
||||
NOTICE: UTF8 6 byte, unsupported: \xfd8283 -> {253,130,131} -> \xc3bdc282c283 = failed
|
||||
NOTICE: UTF8 6 byte, unsupported: \xfd828384 -> {253,130,131,132} -> \xc3bdc282c283c284 = failed
|
||||
NOTICE: UTF8 6 byte, unsupported: \xfd82838485 -> {253,130,131,132,133} -> \xc3bdc282c283c284c285 = failed
|
||||
NOTICE: UTF8 6 byte, unsupported: \xfd8283848586 -> {253,130,131,132,133,134} -> \xc3bdc282c283c284c285c286 = failed
|
||||
NOTICE: MULE_INTERNAL ASCII: \x61 -> {97} -> \x61 = OK
|
||||
NOTICE: MULE_INTERNAL LC1, short: \x81 -> {} -> \x = truncated
|
||||
NOTICE: MULE_INTERNAL LC1: \x8182 -> {8454274} -> \x8182 = OK
|
||||
NOTICE: MULE_INTERNAL LC2, short: \x90 -> {} -> \x = truncated
|
||||
NOTICE: MULE_INTERNAL LC2, short: \x9082 -> {} -> \x = truncated
|
||||
NOTICE: MULE_INTERNAL LC2: \x908283 -> {9470595} -> \x908283 = OK
|
||||
?column?
|
||||
----------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
DROP TABLE encoding_tests;
|
||||
DROP FUNCTION test_encoding;
|
||||
DROP FUNCTION test_text_to_wchars;
|
||||
DROP FUNCTION test_mblen_func;
|
||||
DROP FUNCTION test_bytea_to_text;
|
||||
DROP FUNCTION test_text_to_bytea;
|
||||
-- substring slow path: multi-byte escape char vs. multi-byte pattern char.
|
||||
SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7');
|
||||
substring
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
-- Levenshtein distance metric: exercise character length cache.
|
||||
SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name);
|
||||
ERROR: column "real§_name" does not exist
|
||||
LINE 1: SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name);
|
||||
^
|
||||
HINT: Perhaps you meant to reference the column "x.real_name".
|
||||
-- JSON errcontext: truncate long data.
|
||||
SELECT repeat(U&'\00A7', 30)::json;
|
||||
ERROR: invalid input syntax for type json
|
||||
DETAIL: Token "§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§" is invalid.
|
||||
CONTEXT: JSON data, line 1: ...§§§§§§§§§§§§§§§§§§§§§§§§
|
||||
4
src/test/regress/expected/encoding_1.out
Normal file
4
src/test/regress/expected/encoding_1.out
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
/* skip test if not UTF8 server encoding */
|
||||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
|
||||
\if :skip_test
|
||||
\quit
|
||||
16
src/test/regress/expected/euc_kr.out
Normal file
16
src/test/regress/expected/euc_kr.out
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent
|
||||
-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all
|
||||
-- of EUC_KR, also run the test in UTF8.
|
||||
SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset
|
||||
\if :skip_test
|
||||
\quit
|
||||
\endif
|
||||
-- Exercise is_multibyte_char_in_char (non-UTF8) slow path.
|
||||
SELECT POSITION(
|
||||
convert_from('\xbcf6c7d0', 'EUC_KR') IN
|
||||
convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR'));
|
||||
position
|
||||
----------
|
||||
5
|
||||
(1 row)
|
||||
|
||||
6
src/test/regress/expected/euc_kr_1.out
Normal file
6
src/test/regress/expected/euc_kr_1.out
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent
|
||||
-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all
|
||||
-- of EUC_KR, also run the test in UTF8.
|
||||
SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset
|
||||
\if :skip_test
|
||||
\quit
|
||||
|
|
@ -28,7 +28,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t
|
|||
# geometry depends on point, lseg, line, box, path, polygon, circle
|
||||
# horology depends on date, time, timetz, timestamp, timestamptz, interval
|
||||
# ----------
|
||||
test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database
|
||||
test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database encoding euc_kr
|
||||
|
||||
# ----------
|
||||
# Load huge amounts of data
|
||||
|
|
|
|||
|
|
@ -1287,6 +1287,145 @@ test_enc_conversion(PG_FUNCTION_ARGS)
|
|||
PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
|
||||
}
|
||||
|
||||
/* Convert bytea to text without validation for corruption tests from SQL. */
|
||||
PG_FUNCTION_INFO_V1(test_bytea_to_text);
|
||||
Datum
|
||||
test_bytea_to_text(PG_FUNCTION_ARGS)
|
||||
{
|
||||
PG_RETURN_TEXT_P(PG_GETARG_BYTEA_PP(0));
|
||||
}
|
||||
|
||||
/* And the reverse. */
|
||||
PG_FUNCTION_INFO_V1(test_text_to_bytea);
|
||||
Datum
|
||||
test_text_to_bytea(PG_FUNCTION_ARGS)
|
||||
{
|
||||
PG_RETURN_BYTEA_P(PG_GETARG_TEXT_PP(0));
|
||||
}
|
||||
|
||||
/* Corruption tests in C. */
|
||||
PG_FUNCTION_INFO_V1(test_mblen_func);
|
||||
Datum
|
||||
test_mblen_func(PG_FUNCTION_ARGS)
|
||||
{
|
||||
const char *func = text_to_cstring(PG_GETARG_BYTEA_PP(0));
|
||||
const char *encoding = text_to_cstring(PG_GETARG_BYTEA_PP(1));
|
||||
text *string = PG_GETARG_BYTEA_PP(2);
|
||||
int offset = PG_GETARG_INT32(3);
|
||||
const char *data = VARDATA_ANY(string);
|
||||
size_t size = VARSIZE_ANY_EXHDR(string);
|
||||
int result = 0;
|
||||
|
||||
if (strcmp(func, "pg_mblen_unbounded") == 0)
|
||||
result = pg_mblen_unbounded(data + offset);
|
||||
else if (strcmp(func, "pg_mblen_cstr") == 0)
|
||||
result = pg_mblen_cstr(data + offset);
|
||||
else if (strcmp(func, "pg_mblen_with_len") == 0)
|
||||
result = pg_mblen_with_len(data + offset, size - offset);
|
||||
else if (strcmp(func, "pg_mblen_range") == 0)
|
||||
result = pg_mblen_range(data + offset, data + size);
|
||||
else if (strcmp(func, "pg_encoding_mblen") == 0)
|
||||
result = pg_encoding_mblen(pg_char_to_encoding(encoding), data + offset);
|
||||
else
|
||||
elog(ERROR, "unknown function");
|
||||
|
||||
PG_RETURN_INT32(result);
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(test_text_to_wchars);
|
||||
Datum
|
||||
test_text_to_wchars(PG_FUNCTION_ARGS)
|
||||
{
|
||||
const char *encoding_name = text_to_cstring(PG_GETARG_BYTEA_PP(0));
|
||||
text *string = PG_GETARG_TEXT_PP(1);
|
||||
const char *data = VARDATA_ANY(string);
|
||||
size_t size = VARSIZE_ANY_EXHDR(string);
|
||||
pg_wchar *wchars = palloc(sizeof(pg_wchar) * (size + 1));
|
||||
Datum *datums;
|
||||
int wlen;
|
||||
int encoding;
|
||||
|
||||
encoding = pg_char_to_encoding(encoding_name);
|
||||
if (encoding < 0)
|
||||
elog(ERROR, "unknown encoding name: %s", encoding_name);
|
||||
|
||||
if (size > 0)
|
||||
{
|
||||
datums = palloc(sizeof(Datum) * size);
|
||||
wlen = pg_encoding_mb2wchar_with_len(encoding,
|
||||
data,
|
||||
wchars,
|
||||
size);
|
||||
Assert(wlen >= 0);
|
||||
Assert(wlen <= size);
|
||||
Assert(wchars[wlen] == 0);
|
||||
|
||||
for (int i = 0; i < wlen; ++i)
|
||||
datums[i] = UInt32GetDatum(wchars[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
datums = NULL;
|
||||
wlen = 0;
|
||||
}
|
||||
|
||||
PG_RETURN_ARRAYTYPE_P(construct_array_builtin(datums, wlen, INT4OID));
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(test_wchars_to_text);
|
||||
Datum
|
||||
test_wchars_to_text(PG_FUNCTION_ARGS)
|
||||
{
|
||||
const char *encoding_name = text_to_cstring(PG_GETARG_BYTEA_PP(0));
|
||||
ArrayType *array = PG_GETARG_ARRAYTYPE_P(1);
|
||||
Datum *datums;
|
||||
bool *nulls;
|
||||
char *mb;
|
||||
text *result;
|
||||
int wlen;
|
||||
int bytes;
|
||||
int encoding;
|
||||
|
||||
encoding = pg_char_to_encoding(encoding_name);
|
||||
if (encoding < 0)
|
||||
elog(ERROR, "unknown encoding name: %s", encoding_name);
|
||||
|
||||
deconstruct_array_builtin(array, INT4OID, &datums, &nulls, &wlen);
|
||||
|
||||
if (wlen > 0)
|
||||
{
|
||||
pg_wchar *wchars = palloc(sizeof(pg_wchar) * wlen);
|
||||
|
||||
for (int i = 0; i < wlen; ++i)
|
||||
{
|
||||
if (nulls[i])
|
||||
elog(ERROR, "unexpected NULL in array");
|
||||
wchars[i] = DatumGetInt32(datums[i]);
|
||||
}
|
||||
|
||||
mb = palloc(pg_encoding_max_length(encoding) * wlen + 1);
|
||||
bytes = pg_encoding_wchar2mb_with_len(encoding, wchars, mb, wlen);
|
||||
}
|
||||
else
|
||||
{
|
||||
mb = "";
|
||||
bytes = 0;
|
||||
}
|
||||
|
||||
result = palloc(bytes + VARHDRSZ);
|
||||
SET_VARSIZE(result, bytes + VARHDRSZ);
|
||||
memcpy(VARDATA(result), mb, bytes);
|
||||
|
||||
PG_RETURN_TEXT_P(result);
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(test_valid_server_encoding);
|
||||
Datum
|
||||
test_valid_server_encoding(PG_FUNCTION_ARGS)
|
||||
{
|
||||
return pg_valid_server_encoding(text_to_cstring(PG_GETARG_TEXT_PP(0)));
|
||||
}
|
||||
|
||||
/* Provide SQL access to IsBinaryCoercible() */
|
||||
PG_FUNCTION_INFO_V1(binary_coercible);
|
||||
Datum
|
||||
|
|
|
|||
228
src/test/regress/sql/encoding.sql
Normal file
228
src/test/regress/sql/encoding.sql
Normal file
|
|
@ -0,0 +1,228 @@
|
|||
/* skip test if not UTF8 server encoding */
|
||||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
|
||||
\if :skip_test
|
||||
\quit
|
||||
\endif
|
||||
|
||||
\getenv libdir PG_LIBDIR
|
||||
\getenv dlsuffix PG_DLSUFFIX
|
||||
|
||||
\set regresslib :libdir '/regress' :dlsuffix
|
||||
|
||||
CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text
|
||||
AS :'regresslib' LANGUAGE C STRICT;
|
||||
CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea
|
||||
AS :'regresslib' LANGUAGE C STRICT;
|
||||
CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int
|
||||
AS :'regresslib' LANGUAGE C STRICT;
|
||||
CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[]
|
||||
AS :'regresslib' LANGUAGE C STRICT;
|
||||
CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text
|
||||
AS :'regresslib' LANGUAGE C STRICT;
|
||||
CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean
|
||||
AS :'regresslib' LANGUAGE C STRICT;
|
||||
|
||||
|
||||
CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text);
|
||||
INSERT INTO regress_encoding
|
||||
VALUES ('café',
|
||||
'caf' || test_bytea_to_text('\xc3'),
|
||||
'café' || test_bytea_to_text('\x00') || 'dcba',
|
||||
'caf' || test_bytea_to_text('\xc300') || 'dcba');
|
||||
|
||||
SELECT good, truncated, with_nul FROM regress_encoding;
|
||||
|
||||
SELECT length(good) FROM regress_encoding;
|
||||
SELECT substring(good, 3, 1) FROM regress_encoding;
|
||||
SELECT substring(good, 4, 1) FROM regress_encoding;
|
||||
SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding;
|
||||
SELECT reverse(good) FROM regress_encoding;
|
||||
|
||||
-- invalid short mb character = error
|
||||
SELECT length(truncated) FROM regress_encoding;
|
||||
SELECT substring(truncated, 1, 1) FROM regress_encoding;
|
||||
SELECT reverse(truncated) FROM regress_encoding;
|
||||
-- invalid short mb character = silently dropped
|
||||
SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding;
|
||||
|
||||
-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string
|
||||
-- contains NUL at a character boundary position, some functions treat it as a
|
||||
-- character while others treat it as a terminator, as implementation details.
|
||||
|
||||
-- NUL = terminator
|
||||
SELECT length(with_nul) FROM regress_encoding;
|
||||
SELECT substring(with_nul, 3, 1) FROM regress_encoding;
|
||||
SELECT substring(with_nul, 4, 1) FROM regress_encoding;
|
||||
SELECT substring(with_nul, 5, 1) FROM regress_encoding;
|
||||
SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding;
|
||||
SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding;
|
||||
-- NUL = character
|
||||
SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding;
|
||||
|
||||
-- If a corrupted string contains NUL in the tail bytes of a multibyte
|
||||
-- character (invalid in all encodings), it is considered part of the
|
||||
-- character for length purposes. An error will only be raised in code paths
|
||||
-- that convert or verify encodings.
|
||||
|
||||
SELECT length(truncated_with_nul) FROM regress_encoding;
|
||||
SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding;
|
||||
SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding;
|
||||
SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding;
|
||||
SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding;
|
||||
SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding;
|
||||
SELECT reverse(truncated_with_nul) FROM regress_encoding;
|
||||
|
||||
-- unbounded: sequence would overrun the string!
|
||||
SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3)
|
||||
FROM regress_encoding;
|
||||
|
||||
-- condition detected when using the length/range variants
|
||||
SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3)
|
||||
FROM regress_encoding;
|
||||
SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3)
|
||||
FROM regress_encoding;
|
||||
|
||||
-- unbounded: sequence would overrun the string, if the terminator were really
|
||||
-- the end of it
|
||||
SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3)
|
||||
FROM regress_encoding;
|
||||
SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3)
|
||||
FROM regress_encoding;
|
||||
|
||||
-- condition detected when using the cstr variants
|
||||
SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3)
|
||||
FROM regress_encoding;
|
||||
|
||||
DROP TABLE regress_encoding;
|
||||
|
||||
-- mb<->wchar conversions
|
||||
-- Round-trip one byte string through the mb -> wchar -> mb conversions of the
-- given encoding and report the outcome with a NOTICE.  Encodings that are
-- not valid server encodings are silently skipped.
--
-- The reported outcome is:
--   OK         the round trip reproduced the input exactly
--   truncated  the round trip produced a strict prefix of the input
--   failed     anything else
CREATE FUNCTION test_encoding(encoding text, description text, input bytea)
RETURNS VOID LANGUAGE plpgsql AS
$$
DECLARE
	line_prefix text;
	len int;				-- currently unused
	code_points int[];
	echoed_bytes bytea;
	verdict text;
BEGIN
	-- Fixed-width label so that the NOTICE lines align.
	line_prefix := rpad(encoding || ' ' || description || ':', 28);

	-- XXX could also test validation and length functions, and include
	-- client-only encodings with these test cases

	-- Guard clause: nothing to do unless this is a valid server encoding.
	IF test_valid_server_encoding(encoding) IS NOT TRUE THEN
		RETURN;
	END IF;

	code_points := test_text_to_wchars(encoding, test_bytea_to_text(input));
	echoed_bytes := test_text_to_bytea(test_wchars_to_text(encoding, code_points));

	verdict := CASE
		WHEN input = echoed_bytes THEN
			'OK'
		WHEN length(input) > length(echoed_bytes)
			 AND echoed_bytes = substr(input, 1, length(echoed_bytes)) THEN
			'truncated'
		ELSE
			'failed'
	END;

	RAISE NOTICE '% % -> % -> % = %', line_prefix, input, code_points, echoed_bytes, verdict;
END;
$$;
|
||||
-- No validation is done on the encoding itself, just the length to avoid
|
||||
-- overruns, so some of the byte sequences below are bogus. They cover
|
||||
-- all code branches, server encodings only for now.
|
||||
CREATE TABLE encoding_tests (encoding text, description text, input bytea);
|
||||
INSERT INTO encoding_tests VALUES
|
||||
-- LATIN1, other single-byte encodings
|
||||
('LATIN1', 'ASCII', 'a'),
|
||||
('LATIN1', 'extended', '\xe9'),
|
||||
-- EUC_JP, EUC_JIS_2004, EUC_KR (for the purposes of wchar conversion):
|
||||
-- 2 8e (CS2, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
|
||||
-- 3 8f (CS3, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
|
||||
-- 2 80..ff (CS1)
|
||||
('EUC_JP', 'ASCII', 'a'),
|
||||
('EUC_JP', 'CS1, short', '\x80'),
|
||||
('EUC_JP', 'CS1', '\x8002'),
|
||||
('EUC_JP', 'CS2, short', '\x8e'),
|
||||
('EUC_JP', 'CS2', '\x8e02'),
|
||||
('EUC_JP', 'CS3, short', '\x8f'),
|
||||
('EUC_JP', 'CS3, short', '\x8f02'),
|
||||
('EUC_JP', 'CS3', '\x8f0203'),
|
||||
-- EUC_CN
|
||||
-- 3 8e (CS2, not used but arbitrarily considered to have length 3)
|
||||
-- 3 8f (CS3, not used but arbitrarily considered to have length 3)
|
||||
-- 2 80..ff (CS1)
|
||||
('EUC_CN', 'ASCII', 'a'),
|
||||
('EUC_CN', 'CS1, short', '\x80'),
|
||||
('EUC_CN', 'CS1', '\x8002'),
|
||||
('EUC_CN', 'CS2, short', '\x8e'),
|
||||
('EUC_CN', 'CS2, short', '\x8e02'),
|
||||
('EUC_CN', 'CS2', '\x8e0203'),
|
||||
('EUC_CN', 'CS3, short', '\x8f'),
|
||||
('EUC_CN', 'CS3, short', '\x8f02'),
|
||||
('EUC_CN', 'CS3', '\x8f0203'),
|
||||
-- EUC_TW:
|
||||
-- 4 8e (CS2)
|
||||
-- 3 8f (CS3, not used but arbitrarily considered to have length 3)
|
||||
-- 2 80..ff (CS1)
|
||||
('EUC_TW', 'ASCII', 'a'),
|
||||
('EUC_TW', 'CS1, short', '\x80'),
|
||||
('EUC_TW', 'CS1', '\x8002'),
|
||||
('EUC_TW', 'CS2, short', '\x8e'),
|
||||
('EUC_TW', 'CS2, short', '\x8e02'),
|
||||
('EUC_TW', 'CS2, short', '\x8e0203'),
|
||||
('EUC_TW', 'CS2', '\x8e020304'),
|
||||
('EUC_TW', 'CS3, short', '\x8f'),
|
||||
('EUC_TW', 'CS3, short', '\x8f02'),
|
||||
('EUC_TW', 'CS3', '\x8f0203'),
|
||||
-- UTF8
|
||||
-- 2 c0..df
|
||||
-- 3 e0..ef
|
||||
-- 4 f0..f7 (but maximum real codepoint U+10ffff has f4)
|
||||
-- 5 f8..fb (not supported)
|
||||
-- 6 fc..fd (not supported)
|
||||
('UTF8', 'ASCII', 'a'),
|
||||
('UTF8', '2 byte, short', '\xdf'),
|
||||
('UTF8', '2 byte', '\xdf82'),
|
||||
('UTF8', '3 byte, short', '\xef'),
|
||||
('UTF8', '3 byte, short', '\xef82'),
|
||||
('UTF8', '3 byte', '\xef8283'),
|
||||
('UTF8', '4 byte, short', '\xf7'),
|
||||
('UTF8', '4 byte, short', '\xf782'),
|
||||
('UTF8', '4 byte, short', '\xf78283'),
|
||||
('UTF8', '4 byte', '\xf7828384'),
|
||||
('UTF8', '5 byte, unsupported', '\xfb'),
|
||||
('UTF8', '5 byte, unsupported', '\xfb82'),
|
||||
('UTF8', '5 byte, unsupported', '\xfb8283'),
|
||||
('UTF8', '5 byte, unsupported', '\xfb828384'),
|
||||
('UTF8', '5 byte, unsupported', '\xfb82838485'),
|
||||
('UTF8', '6 byte, unsupported', '\xfd'),
|
||||
('UTF8', '6 byte, unsupported', '\xfd82'),
|
||||
('UTF8', '6 byte, unsupported', '\xfd8283'),
|
||||
('UTF8', '6 byte, unsupported', '\xfd828384'),
|
||||
('UTF8', '6 byte, unsupported', '\xfd82838485'),
|
||||
('UTF8', '6 byte, unsupported', '\xfd8283848586'),
|
||||
-- MULE_INTERNAL
|
||||
-- 2 81..8d LC1
|
||||
-- 3 90..99 LC2
|
||||
('MULE_INTERNAL', 'ASCII', 'a'),
|
||||
('MULE_INTERNAL', 'LC1, short', '\x81'),
|
||||
('MULE_INTERNAL', 'LC1', '\x8182'),
|
||||
('MULE_INTERNAL', 'LC2, short', '\x90'),
|
||||
('MULE_INTERNAL', 'LC2, short', '\x9082'),
|
||||
('MULE_INTERNAL', 'LC2', '\x908283');
|
||||
|
||||
SELECT COUNT(test_encoding(encoding, description, input)) > 0
|
||||
FROM encoding_tests;
|
||||
|
||||
-- Clean up: drop the test table and every helper function created above, so
-- that nothing from this script is left behind in the regression database.
DROP TABLE encoding_tests;
DROP FUNCTION test_encoding;
DROP FUNCTION test_wchars_to_text;
DROP FUNCTION test_text_to_wchars;
DROP FUNCTION test_valid_server_encoding;
DROP FUNCTION test_mblen_func;
DROP FUNCTION test_bytea_to_text;
DROP FUNCTION test_text_to_bytea;
|
||||
|
||||
|
||||
-- substring slow path: multi-byte escape char vs. multi-byte pattern char.
|
||||
SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7');
|
||||
-- Levenshtein distance metric: exercise character length cache.
|
||||
SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name);
|
||||
-- JSON errcontext: truncate long data.
|
||||
SELECT repeat(U&'\00A7', 30)::json;
|
||||
12
src/test/regress/sql/euc_kr.sql
Normal file
12
src/test/regress/sql/euc_kr.sql
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent
|
||||
-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all
|
||||
-- of EUC_KR, also run the test in UTF8.
|
||||
SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset
|
||||
\if :skip_test
|
||||
\quit
|
||||
\endif
|
||||
|
||||
-- Exercise is_multibyte_char_in_char (non-UTF8) slow path.
|
||||
SELECT POSITION(
|
||||
convert_from('\xbcf6c7d0', 'EUC_KR') IN
|
||||
convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR'));
|
||||
Loading…
Reference in a new issue