From cecedb912aeea391332bcb307d8e089266dbb2be Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Wed, 7 Jan 2026 22:14:31 +1300 Subject: [PATCH] Replace pg_mblen() with bounds-checked versions. A corrupted string could cause code that iterates with pg_mblen() to overrun its buffer. Fix, by converting all callers to one of the following: 1. Callers with a null-terminated string now use pg_mblen_cstr(), which raises an "illegal byte sequence" error if it finds a terminator in the middle of the sequence. 2. Callers with a length or end pointer now use either pg_mblen_with_len() or pg_mblen_range(), for the same effect, depending on which of the two seems more convenient at each site. 3. A small number of cases pre-validate a string, and can use pg_mblen_unbounded(). The traditional pg_mblen() function and COPYCHAR macro still exist for backward compatibility, but are no longer used by core code and are hereby deprecated. The same applies to the t_isXXX() functions. Security: CVE-2026-2006 Backpatch-through: 14 Co-authored-by: Thomas Munro Co-authored-by: Noah Misch Reviewed-by: Heikki Linnakangas Reported-by: Paul Gerste (as part of zeroday.cloud) Reported-by: Moritz Sanft (as part of zeroday.cloud) --- contrib/btree_gist/btree_utils_var.c | 21 +++- contrib/dict_xsyn/dict_xsyn.c | 8 +- contrib/hstore/hstore_io.c | 8 +- contrib/ltree/lquery_op.c | 4 +- contrib/ltree/ltree.h | 3 +- contrib/ltree/ltree_io.c | 16 +-- contrib/ltree/ltxtquery_io.c | 4 +- contrib/pageinspect/heapfuncs.c | 2 +- contrib/pg_trgm/trgm.h | 4 +- contrib/pg_trgm/trgm_op.c | 48 +++++--- contrib/pg_trgm/trgm_regexp.c | 21 ++-- contrib/unaccent/unaccent.c | 7 +- src/backend/catalog/pg_proc.c | 2 +- src/backend/tsearch/dict_synonym.c | 8 +- src/backend/tsearch/dict_thesaurus.c | 18 +-- src/backend/tsearch/regis.c | 37 +++--- src/backend/tsearch/spell.c | 123 +++++++++---------- src/backend/tsearch/ts_locale.c | 97 ++++++--------- src/backend/tsearch/ts_utils.c | 4 +- src/backend/tsearch/wparser_def.c | 3 +- src/backend/utils/adt/encode.c | 10 +- src/backend/utils/adt/formatting.c | 22 ++-- src/backend/utils/adt/jsonfuncs.c | 2 +- src/backend/utils/adt/jsonpath_gram.y | 2 +- src/backend/utils/adt/levenshtein.c | 14 ++- src/backend/utils/adt/like.c | 18 +-- src/backend/utils/adt/like_match.c | 3 +- src/backend/utils/adt/oracle_compat.c | 33 +++-- src/backend/utils/adt/regexp.c | 6 +- src/backend/utils/adt/tsquery.c | 25 ++-- src/backend/utils/adt/tsvector.c | 11 +- src/backend/utils/adt/tsvector_op.c | 10 +- src/backend/utils/adt/tsvector_parser.c | 29 ++--- src/backend/utils/adt/varbit.c | 8 +- src/backend/utils/adt/varlena.c | 41 ++++--- src/backend/utils/adt/xml.c | 11 +- src/backend/utils/mb/mbutils.c | 150 +++++++++++++++++++++-- src/include/mb/pg_wchar.h | 7 ++ src/include/tsearch/ts_locale.h | 34 ++++- src/include/tsearch/ts_utils.h | 14 +-- src/test/modules/test_regex/test_regex.c | 3 +- 41 files changed, 537 insertions(+), 354 deletions(-) diff --git a/contrib/btree_gist/btree_utils_var.c b/contrib/btree_gist/btree_utils_var.c index 2886c08b85e..9d93b3c775e 100644 --- a/contrib/btree_gist/btree_utils_var.c +++ b/contrib/btree_gist/btree_utils_var.c @@ -116,36 +116,47 @@ gbt_var_leaf2node(GBT_VARKEY *leaf, const gbtree_vinfo *tinfo, FmgrInfo *flinfo) /* * returns the common prefix length of a node key + * + * If the underlying type is character data, the prefix length may point in + * the middle of a multibyte character. */ static int32 gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo) { GBT_VARKEY_R r = gbt_var_key_readable(node); int32 i = 0; - int32 l = 0; + int32 l_left_to_match = 0; + int32 l_total = 0; int32 t1len = VARSIZE(r.lower) - VARHDRSZ; int32 t2len = VARSIZE(r.upper) - VARHDRSZ; int32 ml = Min(t1len, t2len); char *p1 = VARDATA(r.lower); char *p2 = VARDATA(r.upper); + const char *end1 = p1 + t1len; + const char *end2 = p2 + t2len; if (ml == 0) return 0; while (i < ml) { - if (tinfo->eml > 1 && l == 0) + if (tinfo->eml > 1 && l_left_to_match == 0) { - if ((l = pg_mblen(p1)) != pg_mblen(p2)) + l_total = pg_mblen_range(p1, end1); + if (l_total != pg_mblen_range(p2, end2)) { return i; } + l_left_to_match = l_total; } if (*p1 != *p2) { if (tinfo->eml > 1) { - return (i - l + 1); + int32 l_matched_subset = l_total - l_left_to_match; + + /* end common prefix at final byte of last matching char */ + return i - l_matched_subset; } else { @@ -155,7 +166,7 @@ gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo) p1++; p2++; - l--; + l_left_to_match--; i++; } return ml; /* lower == upper */ diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c index 79c4f18f409..5c5e4e7a3c9 100644 --- a/contrib/dict_xsyn/dict_xsyn.c +++ b/contrib/dict_xsyn/dict_xsyn.c @@ -48,15 +48,15 @@ find_word(char *in, char **end) char *start; *end = NULL; - while (*in && t_isspace(in)) - in += pg_mblen(in); + while (*in && t_isspace_cstr(in)) + in += pg_mblen_cstr(in); if (!*in || *in == '#') return NULL; start = in; - while (*in && !t_isspace(in)) - in += pg_mblen(in); + while (*in && !t_isspace_cstr(in)) + in += pg_mblen_cstr(in); *end = in; diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c index 03057f085d1..0b1e0581e84 100644 --- a/contrib/hstore/hstore_io.c +++ b/contrib/hstore/hstore_io.c @@ -82,7 +82,7 @@ get_val(HSParser *state, bool ignoreeq, bool *escaped) else if (*(state->ptr) == '=' && !ignoreeq) { elog(ERROR, "Syntax error near \"%.*s\" at position %d", - pg_mblen(state->ptr), state->ptr, + pg_mblen_cstr(state->ptr), state->ptr, (int32) (state->ptr - state->begin)); } else if (*(state->ptr) == '\\') @@ -223,7 +223,7 @@ parse_hstore(HSParser *state) else if (!scanner_isspace((unsigned char) *(state->ptr))) { elog(ERROR, "Syntax error near \"%.*s\" at position %d", - pg_mblen(state->ptr), state->ptr, + pg_mblen_cstr(state->ptr), state->ptr, (int32) (state->ptr - state->begin)); } } @@ -240,7 +240,7 @@ parse_hstore(HSParser *state) else { elog(ERROR, "Syntax error near \"%.*s\" at position %d", - pg_mblen(state->ptr), state->ptr, + pg_mblen_cstr(state->ptr), state->ptr, (int32) (state->ptr - state->begin)); } } @@ -275,7 +275,7 @@ parse_hstore(HSParser *state) else if (!scanner_isspace((unsigned char) *(state->ptr))) { elog(ERROR, "Syntax error near \"%.*s\" at position %d", - pg_mblen(state->ptr), state->ptr, + pg_mblen_cstr(state->ptr), state->ptr, (int32) (state->ptr - state->begin)); } } diff --git a/contrib/ltree/lquery_op.c b/contrib/ltree/lquery_op.c index d89af20f6cf..46019a0e83a 100644 --- a/contrib/ltree/lquery_op.c +++ b/contrib/ltree/lquery_op.c @@ -26,14 +26,14 @@ getlexeme(char *start, char *end, int *len) char *ptr; int charlen; - while (start < end && (charlen = pg_mblen(start)) == 1 && t_iseq(start, '_')) + while (start < end && (charlen = pg_mblen_range(start, end)) == 1 && t_iseq(start, '_')) start += charlen; ptr = start; if (ptr >= end) return NULL; - while (ptr < end && !((charlen = pg_mblen(ptr)) == 1 && t_iseq(ptr, '_'))) + while (ptr < end && !((charlen = pg_mblen_range(ptr, end)) == 1 && t_iseq(ptr, '_'))) ptr += charlen; *len = ptr - start; diff --git a/contrib/ltree/ltree.h b/contrib/ltree/ltree.h index 4e18caeb2fc..c7f00a99b06 100644 --- a/contrib/ltree/ltree.h +++ b/contrib/ltree/ltree.h @@ -113,7 +113,8 @@ typedef struct #define LQUERY_HASNOT 0x01 -#define ISALNUM(x) ( t_isalpha(x) || t_isdigit(x) || ( pg_mblen(x) == 1 && t_iseq((x), '_') ) ) +/* Caller has already called mblen, so we can use _unbounded variants safely. */ +#define ISALNUM(x) ( t_isalpha_unbounded(x) || t_isdigit_unbounded(x) || ( pg_mblen_unbounded(x) == 1 && t_iseq((x), '_') ) ) /* full text query */ diff --git a/contrib/ltree/ltree_io.c b/contrib/ltree/ltree_io.c index 15115cb29f3..0a44a8c4691 100644 --- a/contrib/ltree/ltree_io.c +++ b/contrib/ltree/ltree_io.c @@ -54,7 +54,7 @@ parse_ltree(const char *buf) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); if (t_iseq(ptr, '.')) num++; ptr += charlen; @@ -69,7 +69,7 @@ parse_ltree(const char *buf) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); switch (state) { @@ -285,7 +285,7 @@ parse_lquery(const char *buf) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); if (t_iseq(ptr, '.')) num++; @@ -305,7 +305,7 @@ parse_lquery(const char *buf) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); switch (state) { @@ -402,7 +402,7 @@ parse_lquery(const char *buf) case LQPRS_WAITFNUM: if (t_iseq(ptr, ',')) state = LQPRS_WAITSNUM; - else if (t_isdigit(ptr)) + else if (t_isdigit_cstr(ptr)) { int low = atoi(ptr); @@ -420,7 +420,7 @@ parse_lquery(const char *buf) UNCHAR; break; case LQPRS_WAITSNUM: - if (t_isdigit(ptr)) + if (t_isdigit_cstr(ptr)) { int high = atoi(ptr); @@ -451,7 +451,7 @@ parse_lquery(const char *buf) case LQPRS_WAITCLOSE: if (t_iseq(ptr, '}')) state = LQPRS_WAITEND; - else if (!t_isdigit(ptr)) + else if (!t_isdigit_cstr(ptr)) UNCHAR; break; case LQPRS_WAITND: @@ -462,7 +462,7 @@ parse_lquery(const char *buf) } else if (t_iseq(ptr, ',')) state = LQPRS_WAITSNUM; - else if (!t_isdigit(ptr)) + else if (!t_isdigit_cstr(ptr)) UNCHAR; break; case LQPRS_WAITEND: diff --git a/contrib/ltree/ltxtquery_io.c b/contrib/ltree/ltxtquery_io.c index d967f92110f..7f98bdedecb 100644 --- a/contrib/ltree/ltxtquery_io.c +++ b/contrib/ltree/ltxtquery_io.c @@ -59,7 +59,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint for (;;) { - charlen = pg_mblen(state->buf); + charlen = pg_mblen_cstr(state->buf); switch (state->state) { @@ -83,7 +83,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint *lenval = charlen; *flag = 0; } - else if (!t_isspace(state->buf)) + else if (!t_isspace_unbounded(state->buf)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("operand syntax error"))); diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c index b067dd3660e..ebf030d484c 100644 --- a/contrib/pageinspect/heapfuncs.c +++ b/contrib/pageinspect/heapfuncs.c @@ -101,7 +101,7 @@ text_to_bits(char *str, int len) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid character \"%.*s\" in t_bits string", - pg_mblen(str + off), str + off))); + pg_mblen_cstr(str + off), str + off))); if (off % 8 == 7) bits[off / 8] = byte; diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h index 405a1d95528..06d3994e692 100644 --- a/contrib/pg_trgm/trgm.h +++ b/contrib/pg_trgm/trgm.h @@ -52,10 +52,10 @@ typedef char trgm[3]; } while(0) #ifdef KEEPONLYALNUM -#define ISWORDCHR(c) (t_isalpha(c) || t_isdigit(c)) +#define ISWORDCHR(c, len) (t_isalpha_with_len(c, len) || t_isdigit_with_len(c, len)) #define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') ) #else -#define ISWORDCHR(c) (!t_isspace(c)) +#define ISWORDCHR(c, len) (!t_isspace_with_len(c, len)) #define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) ) #endif #define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) ) diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index fb38135f7a3..63895c3017d 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -171,18 +171,29 @@ static char * find_word(char *str, int lenstr, char **endword, int *charlen) { char *beginword = str; + const char *endstr = str + lenstr; - while (beginword - str < lenstr && !ISWORDCHR(beginword)) - beginword += pg_mblen(beginword); + while (beginword < endstr) + { + int clen = pg_mblen_range(beginword, endstr); - if (beginword - str >= lenstr) + if (ISWORDCHR(beginword, clen)) + break; + beginword += clen; + } + + if (beginword >= endstr) return NULL; *endword = beginword; *charlen = 0; - while (*endword - str < lenstr && ISWORDCHR(*endword)) + while (*endword < endstr) { - *endword += pg_mblen(*endword); + int clen = pg_mblen_range(*endword, endstr); + + if (!ISWORDCHR(*endword, clen)) + break; + *endword += clen; (*charlen)++; } @@ -230,9 +241,9 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen) if (bytelen > charlen) { /* Find multibyte character boundaries and apply compact_trigram */ - int lenfirst = pg_mblen(str), - lenmiddle = pg_mblen(str + lenfirst), - lenlast = pg_mblen(str + lenfirst + lenmiddle); + int lenfirst = pg_mblen_unbounded(str), + lenmiddle = pg_mblen_unbounded(str + lenfirst), + lenlast = pg_mblen_unbounded(str + lenfirst + lenmiddle); while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen) { @@ -243,7 +254,7 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen) lenfirst = lenmiddle; lenmiddle = lenlast; - lenlast = pg_mblen(ptr + lenfirst + lenmiddle); + lenlast = pg_mblen_unbounded(ptr + lenfirst + lenmiddle); } } else @@ -723,6 +734,7 @@ get_wildcard_part(const char *str, int lenstr, { const char *beginword = str; const char *endword; + const char *endstr = str + lenstr; char *s = buf; bool in_leading_wildcard_meta = false; bool in_trailing_wildcard_meta = false; @@ -735,11 +747,13 @@ get_wildcard_part(const char *str, int lenstr, * from this loop to the next one, since we may exit at a word character * that is in_escape. */ - while (beginword - str < lenstr) + while (beginword < endstr) { + clen = pg_mblen_range(beginword, endstr); + if (in_escape) { - if (ISWORDCHR(beginword)) + if (ISWORDCHR(beginword, clen)) break; in_escape = false; in_leading_wildcard_meta = false; @@ -750,12 +764,12 @@ get_wildcard_part(const char *str, int lenstr, in_escape = true; else if (ISWILDCARDCHAR(beginword)) in_leading_wildcard_meta = true; - else if (ISWORDCHR(beginword)) + else if (ISWORDCHR(beginword, clen)) break; else in_leading_wildcard_meta = false; } - beginword += pg_mblen(beginword); + beginword += clen; } /* @@ -788,12 +802,12 @@ get_wildcard_part(const char *str, int lenstr, * string boundary. Strip escapes during copy. */ endword = beginword; - while (endword - str < lenstr) + while (endword < endstr) { - clen = pg_mblen(endword); + clen = pg_mblen_range(endword, endstr); if (in_escape) { - if (ISWORDCHR(endword)) + if (ISWORDCHR(endword, clen)) { memcpy(s, endword, clen); (*charlen)++; @@ -821,7 +835,7 @@ get_wildcard_part(const char *str, int lenstr, in_trailing_wildcard_meta = true; break; } - else if (ISWORDCHR(endword)) + else if (ISWORDCHR(endword, clen)) { memcpy(s, endword, clen); (*charlen)++; diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c index 3485a725cde..2b5cfe87299 100644 --- a/contrib/pg_trgm/trgm_regexp.c +++ b/contrib/pg_trgm/trgm_regexp.c @@ -480,7 +480,7 @@ static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, static void RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation); static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA); -static bool convertPgWchar(pg_wchar c, trgm_mb_char *result); +static int convertPgWchar(pg_wchar c, trgm_mb_char *result); static void transformGraph(TrgmNFA *trgmNFA); static void processState(TrgmNFA *trgmNFA, TrgmState *state); static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key); @@ -816,10 +816,11 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA) for (j = 0; j < charsCount; j++) { trgm_mb_char c; + int clen = convertPgWchar(chars[j], &c); - if (!convertPgWchar(chars[j], &c)) + if (!clen) continue; /* ok to ignore it altogether */ - if (ISWORDCHR(c.bytes)) + if (ISWORDCHR(c.bytes, clen)) colorInfo->wordChars[colorInfo->wordCharsCount++] = c; else colorInfo->containsNonWord = true; @@ -831,13 +832,15 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA) /* * Convert pg_wchar to multibyte format. - * Returns false if the character should be ignored completely. + * Returns 0 if the character should be ignored completely, else returns its + * byte length. */ -static bool +static int convertPgWchar(pg_wchar c, trgm_mb_char *result) { /* "s" has enough space for a multibyte character and a trailing NUL */ char s[MAX_MULTIBYTE_CHAR_LEN + 1]; + int clen; /* * We can ignore the NUL character, since it can never appear in a PG text @@ -845,11 +848,11 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) * reconstructing trigrams. */ if (c == 0) - return false; + return 0; /* Do the conversion, making sure the result is NUL-terminated */ memset(s, 0, sizeof(s)); - pg_wchar2mb_with_len(&c, s, 1); + clen = pg_wchar2mb_with_len(&c, s, 1); /* * In IGNORECASE mode, we can ignore uppercase characters. We assume that @@ -871,7 +874,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) if (strcmp(lowerCased, s) != 0) { pfree(lowerCased); - return false; + return 0; } pfree(lowerCased); } @@ -879,7 +882,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */ memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN); - return true; + return clen; } diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c index 2b3819fb2e8..b3cf893417d 100644 --- a/contrib/unaccent/unaccent.c +++ b/contrib/unaccent/unaccent.c @@ -149,9 +149,9 @@ initTrie(const char *filename) state = 0; for (ptr = line; *ptr; ptr += ptrlen) { - ptrlen = pg_mblen(ptr); + ptrlen = pg_mblen_cstr(ptr); /* ignore whitespace, but end src or trg */ - if (t_isspace(ptr)) + if (t_isspace_cstr(ptr)) { if (state == 1) state = 2; @@ -315,6 +315,7 @@ unaccent_lexize(PG_FUNCTION_ARGS) char *srcchar = (char *) PG_GETARG_POINTER(1); int32 len = PG_GETARG_INT32(2); char *srcstart = srcchar; + const char *srcend = srcstart + len; TSLexeme *res; StringInfoData buf; @@ -342,7 +343,7 @@ unaccent_lexize(PG_FUNCTION_ARGS) } else { - matchlen = pg_mblen(srcchar); + matchlen = pg_mblen_range(srcchar, srcend); if (buf.data != NULL) appendBinaryStringInfo(&buf, srcchar, matchlen); } diff --git a/src/backend/catalog/pg_proc.c b/src/backend/catalog/pg_proc.c index 821c2bef444..d043886d04e 100644 --- a/src/backend/catalog/pg_proc.c +++ b/src/backend/catalog/pg_proc.c @@ -1170,7 +1170,7 @@ match_prosrc_to_literal(const char *prosrc, const char *literal, if (cursorpos > 0) newcp++; } - chlen = pg_mblen(prosrc); + chlen = pg_mblen_cstr(prosrc); if (strncmp(prosrc, literal, chlen) != 0) goto fail; prosrc += chlen; diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c index ed885ca5551..8c99ecaa0a0 100644 --- a/src/backend/tsearch/dict_synonym.c +++ b/src/backend/tsearch/dict_synonym.c @@ -47,8 +47,8 @@ findwrd(char *in, char **end, uint16 *flags) char *lastchar; /* Skip leading spaces */ - while (*in && t_isspace(in)) - in += pg_mblen(in); + while (*in && t_isspace_cstr(in)) + in += pg_mblen_cstr(in); /* Return NULL on empty lines */ if (*in == '\0') @@ -60,10 +60,10 @@ findwrd(char *in, char **end, uint16 *flags) lastchar = start = in; /* Find end of word */ - while (*in && !t_isspace(in)) + while (*in && !t_isspace_cstr(in)) { lastchar = in; - in += pg_mblen(in); + in += pg_mblen_cstr(in); } if (in - lastchar == 1 && t_iseq(lastchar, '*') && flags) diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c index a95ed0891dd..45686654c93 100644 --- a/src/backend/tsearch/dict_thesaurus.c +++ b/src/backend/tsearch/dict_thesaurus.c @@ -190,8 +190,8 @@ thesaurusRead(const char *filename, DictThesaurus *d) ptr = line; /* is it a comment? */ - while (*ptr && t_isspace(ptr)) - ptr += pg_mblen(ptr); + while (*ptr && t_isspace_cstr(ptr)) + ptr += pg_mblen_cstr(ptr); if (t_iseq(ptr, '#') || *ptr == '\0' || t_iseq(ptr, '\n') || t_iseq(ptr, '\r')) @@ -212,7 +212,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) errmsg("unexpected delimiter"))); state = TR_WAITSUBS; } - else if (!t_isspace(ptr)) + else if (!t_isspace_cstr(ptr)) { beginwrd = ptr; state = TR_INLEX; @@ -225,7 +225,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) newLexeme(d, beginwrd, ptr, idsubst, posinsubst++); state = TR_WAITSUBS; } - else if (t_isspace(ptr)) + else if (t_isspace_cstr(ptr)) { newLexeme(d, beginwrd, ptr, idsubst, posinsubst++); state = TR_WAITLEX; @@ -237,15 +237,15 @@ thesaurusRead(const char *filename, DictThesaurus *d) { useasis = true; state = TR_INSUBS; - beginwrd = ptr + pg_mblen(ptr); + beginwrd = ptr + pg_mblen_cstr(ptr); } else if (t_iseq(ptr, '\\')) { useasis = false; state = TR_INSUBS; - beginwrd = ptr + pg_mblen(ptr); + beginwrd = ptr + pg_mblen_cstr(ptr); } - else if (!t_isspace(ptr)) + else if (!t_isspace_cstr(ptr)) { useasis = false; beginwrd = ptr; @@ -254,7 +254,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) } else if (state == TR_INSUBS) { - if (t_isspace(ptr)) + if (t_isspace_cstr(ptr)) { if (ptr == beginwrd) ereport(ERROR, @@ -267,7 +267,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) else elog(ERROR, "unrecognized thesaurus state: %d", state); - ptr += pg_mblen(ptr); + ptr += pg_mblen_cstr(ptr); } if (state == TR_INSUBS) diff --git a/src/backend/tsearch/regis.c b/src/backend/tsearch/regis.c index 80017177222..d890b65b063 100644 --- a/src/backend/tsearch/regis.c +++ b/src/backend/tsearch/regis.c @@ -37,7 +37,7 @@ RS_isRegis(const char *str) { if (state == RS_IN_WAIT) { - if (t_isalpha(c)) + if (t_isalpha_cstr(c)) /* okay */ ; else if (t_iseq(c, '[')) state = RS_IN_ONEOF; @@ -48,14 +48,14 @@ RS_isRegis(const char *str) { if (t_iseq(c, '^')) state = RS_IN_NONEOF; - else if (t_isalpha(c)) + else if (t_isalpha_cstr(c)) state = RS_IN_ONEOF_IN; else return false; } else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) { - if (t_isalpha(c)) + if (t_isalpha_cstr(c)) /* okay */ ; else if (t_iseq(c, ']')) state = RS_IN_WAIT; @@ -64,7 +64,7 @@ RS_isRegis(const char *str) } else elog(ERROR, "internal error in RS_isRegis: state %d", state); - c += pg_mblen(c); + c += pg_mblen_cstr(c); } return (state == RS_IN_WAIT); @@ -96,15 +96,14 @@ RS_compile(Regis *r, bool issuffix, const char *str) { if (state == RS_IN_WAIT) { - if (t_isalpha(c)) + if (t_isalpha_cstr(c)) { if (ptr) ptr = newRegisNode(ptr, len); else ptr = r->node = newRegisNode(NULL, len); - COPYCHAR(ptr->data, c); ptr->type = RSF_ONEOF; - ptr->len = pg_mblen(c); + ptr->len = ts_copychar_cstr(ptr->data, c); } else if (t_iseq(c, '[')) { @@ -125,10 +124,9 @@ RS_compile(Regis *r, bool issuffix, const char *str) ptr->type = RSF_NONEOF; state = RS_IN_NONEOF; } - else if (t_isalpha(c)) + else if (t_isalpha_cstr(c)) { - COPYCHAR(ptr->data, c); - ptr->len = pg_mblen(c); + ptr->len = ts_copychar_cstr(ptr->data, c); state = RS_IN_ONEOF_IN; } else /* shouldn't get here */ @@ -136,11 +134,8 @@ RS_compile(Regis *r, bool issuffix, const char *str) } else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) { - if (t_isalpha(c)) - { - COPYCHAR(ptr->data + ptr->len, c); - ptr->len += pg_mblen(c); - } + if (t_isalpha_cstr(c)) + ptr->len += ts_copychar_cstr(ptr->data + ptr->len, c); else if (t_iseq(c, ']')) state = RS_IN_WAIT; else /* shouldn't get here */ @@ -148,7 +143,7 @@ RS_compile(Regis *r, bool issuffix, const char *str) } else elog(ERROR, "internal error in RS_compile: state %d", state); - c += pg_mblen(c); + c += pg_mblen_cstr(c); } if (state != RS_IN_WAIT) /* shouldn't get here */ @@ -187,10 +182,10 @@ mb_strchr(char *str, char *c) char *ptr = str; bool res = false; - clen = pg_mblen(c); + clen = pg_mblen_cstr(c); while (*ptr && !res) { - plen = pg_mblen(ptr); + plen = pg_mblen_cstr(ptr); if (plen == clen) { i = plen; @@ -219,7 +214,7 @@ RS_execute(Regis *r, char *str) while (*c) { len++; - c += pg_mblen(c); + c += pg_mblen_cstr(c); } if (len < r->nchar) @@ -230,7 +225,7 @@ RS_execute(Regis *r, char *str) { len -= r->nchar; while (len-- > 0) - c += pg_mblen(c); + c += pg_mblen_cstr(c); } @@ -250,7 +245,7 @@ RS_execute(Regis *r, char *str) elog(ERROR, "unrecognized regis node type: %d", ptr->type); } ptr = ptr->next; - c += pg_mblen(c); + c += pg_mblen_cstr(c); } return true; diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c index 2c555ebdcce..f65c083c06f 100644 --- a/src/backend/tsearch/spell.c +++ b/src/backend/tsearch/spell.c @@ -232,7 +232,7 @@ findchar(char *str, int c) { if (t_iseq(str, c)) return str; - str += pg_mblen(str); + str += pg_mblen_cstr(str); } return NULL; @@ -245,7 +245,7 @@ findchar2(char *str, int c1, int c2) { if (t_iseq(str, c1) || t_iseq(str, c2)) return str; - str += pg_mblen(str); + str += pg_mblen_cstr(str); } return NULL; @@ -352,6 +352,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) char *next, *sbuf = *sflagset; int maxstep; + int clen; bool stop = false; bool met_comma = false; @@ -363,11 +364,11 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) { case FM_LONG: case FM_CHAR: - COPYCHAR(sflag, *sflagset); - sflag += pg_mblen(*sflagset); + clen = ts_copychar_cstr(sflag, *sflagset); + sflag += clen; /* Go to start of the next flag */ - *sflagset += pg_mblen(*sflagset); + *sflagset += clen; /* Check if we get all characters of flag */ maxstep--; @@ -391,7 +392,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) *sflagset = next; while (**sflagset) { - if (t_isdigit(*sflagset)) + if (t_isdigit_cstr(*sflagset)) { if (!met_comma) ereport(ERROR, @@ -409,7 +410,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) *sflagset))); met_comma = true; } - else if (!t_isspace(*sflagset)) + else if (!t_isspace_cstr(*sflagset)) { ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), @@ -417,7 +418,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) *sflagset))); } - *sflagset += pg_mblen(*sflagset); + *sflagset += pg_mblen_cstr(*sflagset); } stop = true; break; @@ -543,7 +544,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename) while (*s) { /* we allow only single encoded flags for faster works */ - if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s)) + if (pg_mblen_cstr(s) == 1 && t_isprint_unbounded(s) && !t_isspace_unbounded(s)) s++; else { @@ -559,12 +560,12 @@ NIImportDictionary(IspellDict *Conf, const char *filename) s = line; while (*s) { - if (t_isspace(s)) + if (t_isspace_cstr(s)) { *s = '\0'; break; } - s += pg_mblen(s); + s += pg_mblen_cstr(s); } pstr = lowerstr_ctx(Conf, line); @@ -816,17 +817,17 @@ get_nextfield(char **str, char *next) while (**str) { + int clen = pg_mblen_cstr(*str); + if (state == PAE_WAIT_MASK) { if (t_iseq(*str, '#')) return false; - else if (!t_isspace(*str)) + else if (!t_isspace_cstr(*str)) { - int clen = pg_mblen(*str); - if (clen < avail) { - COPYCHAR(next, *str); + ts_copychar_with_len(next, *str, clen); next += clen; avail -= clen; } @@ -835,24 +836,22 @@ get_nextfield(char **str, char *next) } else /* state == PAE_INMASK */ { - if (t_isspace(*str)) + if (t_isspace_cstr(*str)) { *next = '\0'; return true; } else { - int clen = pg_mblen(*str); - if (clen < avail) { - COPYCHAR(next, *str); + ts_copychar_with_len(next, *str, clen); next += clen; avail -= clen; } } } - *str += pg_mblen(*str); + *str += clen; } *next = '\0'; @@ -942,14 +941,15 @@ parse_affentry(char *str, char *mask, char *find, char *repl) while (*str) { + int clen = pg_mblen_cstr(str); + if (state == PAE_WAIT_MASK) { if (t_iseq(str, '#')) return false; - else if (!t_isspace(str)) + else if (!t_isspace_cstr(str)) { - COPYCHAR(pmask, str); - pmask += pg_mblen(str); + pmask += ts_copychar_with_len(pmask, str, clen); state = PAE_INMASK; } } @@ -960,10 +960,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl) *pmask = '\0'; state = PAE_WAIT_FIND; } - else if (!t_isspace(str)) + else if (!t_isspace_cstr(str)) { - COPYCHAR(pmask, str); - pmask += pg_mblen(str); + pmask += ts_copychar_with_len(pmask, str, clen); } } else if (state == PAE_WAIT_FIND) @@ -972,13 +971,12 @@ parse_affentry(char *str, char *mask, char *find, char *repl) { state = PAE_INFIND; } - else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ ) + else if (t_isalpha_cstr(str) || t_iseq(str, '\'') /* english 's */ ) { - COPYCHAR(prepl, str); - prepl += pg_mblen(str); + prepl += ts_copychar_with_len(prepl, str, clen); state = PAE_INREPL; } - else if (!t_isspace(str)) + else if (!t_isspace_cstr(str)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error"))); @@ -990,12 +988,11 @@ parse_affentry(char *str, char *mask, char *find, char *repl) *pfind = '\0'; state = PAE_WAIT_REPL; } - else if (t_isalpha(str)) + else if (t_isalpha_cstr(str)) { - COPYCHAR(pfind, str); - pfind += pg_mblen(str); + pfind += ts_copychar_with_len(pfind, str, clen); } - else if (!t_isspace(str)) + else if (!t_isspace_cstr(str)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error"))); @@ -1006,13 +1003,12 @@ parse_affentry(char *str, char *mask, char *find, char *repl) { break; /* void repl */ } - else if (t_isalpha(str)) + else if (t_isalpha_cstr(str)) { - COPYCHAR(prepl, str); - prepl += pg_mblen(str); + prepl += ts_copychar_with_len(prepl, str, clen); state = PAE_INREPL; } - else if (!t_isspace(str)) + else if (!t_isspace_cstr(str)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error"))); @@ -1024,12 +1020,11 @@ parse_affentry(char *str, char *mask, char *find, char *repl) *prepl = '\0'; break; } - else if (t_isalpha(str)) + else if (t_isalpha_cstr(str)) { - COPYCHAR(prepl, str); - prepl += pg_mblen(str); + prepl += ts_copychar_with_len(prepl, str, clen); } - else if (!t_isspace(str)) + else if (!t_isspace_cstr(str)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error"))); @@ -1037,7 +1032,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl) else elog(ERROR, "unrecognized state in parse_affentry: %d", state); - str += pg_mblen(str); + str += clen; } *pmask = *pfind = *prepl = '\0'; @@ -1090,10 +1085,9 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val) CompoundAffixFlag *newValue; char sbuf[BUFSIZ]; char *sflag; - int clen; - while (*s && t_isspace(s)) - s += pg_mblen(s); + while (*s && t_isspace_cstr(s)) + s += pg_mblen_cstr(s); if (!*s) ereport(ERROR, @@ -1102,10 +1096,10 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val) /* Get flag without \n */ sflag = sbuf; - while (*s && !t_isspace(s) && *s != '\n') + while (*s && !t_isspace_cstr(s) && *s != '\n') { - clen = pg_mblen(s); - COPYCHAR(sflag, s); + int clen = ts_copychar_cstr(sflag, s); + sflag += clen; s += clen; } @@ -1248,7 +1242,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) while ((recoded = tsearch_readline(&trst)) != NULL) { - if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) + if (*recoded == '\0' || t_isspace_cstr(recoded) || t_iseq(recoded, '#')) { pfree(recoded); continue; @@ -1285,8 +1279,8 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) { char *s = recoded + strlen("FLAG"); - while (*s && t_isspace(s)) - s += pg_mblen(s); + while (*s && t_isspace_cstr(s)) + s += pg_mblen_cstr(s); if (*s) { @@ -1321,7 +1315,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) { int fields_read; - if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) + if (*recoded == '\0' || t_isspace_cstr(recoded) || t_iseq(recoded, '#')) goto nextline; fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask); @@ -1484,12 +1478,12 @@ NIImportAffixes(IspellDict *Conf, const char *filename) s = findchar2(recoded, 'l', 'L'); if (s) { - while (*s && !t_isspace(s)) - s += pg_mblen(s); - while (*s && t_isspace(s)) - s += pg_mblen(s); + while (*s && !t_isspace_cstr(s)) + s += pg_mblen_cstr(s); + while (*s && t_isspace_cstr(s)) + s += pg_mblen_cstr(s); - if (*s && pg_mblen(s) == 1) + if (*s && pg_mblen_cstr(s) == 1) { addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG); Conf->usecompound = true; @@ -1517,8 +1511,8 @@ NIImportAffixes(IspellDict *Conf, const char *filename) s = recoded + 4; /* we need non-lowercased string */ flagflags = 0; - while (*s && t_isspace(s)) - s += pg_mblen(s); + while (*s && t_isspace_cstr(s)) + s += pg_mblen_cstr(s); if (*s == '*') { @@ -1539,14 +1533,13 @@ NIImportAffixes(IspellDict *Conf, const char *filename) * be followed by EOL, whitespace, or ':'. Otherwise this is a * new-format flag command. */ - if (*s && pg_mblen(s) == 1) + if (*s && pg_mblen_cstr(s) == 1) { - COPYCHAR(flag, s); + flag[0] = *s++; flag[1] = '\0'; - s++; if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' || - t_isspace(s)) + t_isspace_cstr(s)) { oldformat = true; goto nextline; @@ -1769,7 +1762,7 @@ NISortDictionary(IspellDict *Conf) (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("invalid affix alias \"%s\"", Conf->Spell[i]->p.flag))); - if (*end != '\0' && !t_isdigit(end) && !t_isspace(end)) + if (*end != '\0' && !t_isdigit_cstr(end) && !t_isspace_cstr(end)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("invalid affix alias \"%s\"", diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c index f918cc8908b..cc9e71ec708 100644 --- a/src/backend/tsearch/ts_locale.c +++ b/src/backend/tsearch/ts_locale.c @@ -33,70 +33,43 @@ static void tsearch_readline_callback(void *arg); */ #define WC_BUF_LEN 3 -int -t_isdigit(const char *ptr) -{ - int clen = pg_mblen(ptr); - wchar_t character[WC_BUF_LEN]; - Oid collation = DEFAULT_COLLATION_OID; /* TODO */ - pg_locale_t mylocale = 0; /* TODO */ - - if (clen == 1 || lc_ctype_is_c(collation)) - return isdigit(TOUCHAR(ptr)); - - char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); - - return iswdigit((wint_t) character[0]); -} - -int -t_isspace(const char *ptr) -{ - int clen = pg_mblen(ptr); - wchar_t character[WC_BUF_LEN]; - Oid collation = DEFAULT_COLLATION_OID; /* TODO */ - pg_locale_t mylocale = 0; /* TODO */ - - if (clen == 1 || lc_ctype_is_c(collation)) - return isspace(TOUCHAR(ptr)); - - char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); - - return iswspace((wint_t) character[0]); -} - -int -t_isalpha(const char *ptr) -{ - int clen = pg_mblen(ptr); - wchar_t character[WC_BUF_LEN]; - Oid collation = DEFAULT_COLLATION_OID; /* TODO */ - pg_locale_t mylocale = 0; /* TODO */ - - if (clen == 1 || lc_ctype_is_c(collation)) - return isalpha(TOUCHAR(ptr)); - - char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); - - return iswalpha((wint_t) character[0]); -} - -int -t_isprint(const char *ptr) -{ - int clen = pg_mblen(ptr); - wchar_t character[WC_BUF_LEN]; - Oid collation = DEFAULT_COLLATION_OID; /* TODO */ - pg_locale_t mylocale = 0; /* TODO */ - - if (clen == 1 || lc_ctype_is_c(collation)) - return isprint(TOUCHAR(ptr)); - - char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); - - return iswprint((wint_t) character[0]); +#define GENERATE_T_ISCLASS_DEF(character_class) \ +/* mblen shall be that of the first character */ \ +int \ +t_is##character_class##_with_len(const char *ptr, int mblen) \ +{ \ + int clen = pg_mblen_with_len(ptr, mblen); \ + wchar_t character[WC_BUF_LEN]; \ + pg_locale_t mylocale = 0; /* TODO */ \ + if (clen == 1 || lc_ctype_is_c(DEFAULT_COLLATION_OID)) \ + return is##character_class(TOUCHAR(ptr)); \ + char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); \ + return isw##character_class((wint_t) character[0]); \ +} \ +\ +/* ptr shall point to a NUL-terminated string */ \ +int \ +t_is##character_class##_cstr(const char *ptr) \ +{ \ + return t_is##character_class##_with_len(ptr, pg_mblen_cstr(ptr)); \ +} \ +/* ptr shall point to a string with pre-validated encoding */ \ +int \ +t_is##character_class##_unbounded(const char *ptr) \ +{ \ + return t_is##character_class##_with_len(ptr, pg_mblen_unbounded(ptr)); \ +} \ +/* historical name for _unbounded */ \ +int \ +t_is##character_class(const char *ptr) \ +{ \ + return t_is##character_class##_unbounded(ptr); \ } +GENERATE_T_ISCLASS_DEF(alpha) +GENERATE_T_ISCLASS_DEF(digit) +GENERATE_T_ISCLASS_DEF(print) +GENERATE_T_ISCLASS_DEF(space) /* * Set up to read a file using tsearch_readline(). This facility is diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c index ed16a2e25a2..51902ab1e14 100644 --- a/src/backend/tsearch/ts_utils.c +++ b/src/backend/tsearch/ts_utils.c @@ -88,8 +88,8 @@ readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *)) char *pbuf = line; /* Trim trailing space */ - while (*pbuf && !t_isspace(pbuf)) - pbuf += pg_mblen(pbuf); + while (*pbuf && !t_isspace_cstr(pbuf)) + pbuf += pg_mblen_cstr(pbuf); *pbuf = '\0'; /* Skip empty lines */ diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 290c2223205..48f9a87523c 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -1728,7 +1728,8 @@ TParserGet(TParser *prs) prs->state->charlen = 0; else prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen : - pg_mblen(prs->str + prs->state->posbyte); + pg_mblen_range(prs->str + prs->state->posbyte, + prs->str + prs->lenstr); Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr); Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null); diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c index 6dd93f9a322..06ea275c59c 100644 --- a/src/backend/utils/adt/encode.c +++ b/src/backend/utils/adt/encode.c @@ -172,7 +172,7 @@ hex_encode(const char *src, size_t len, char *dst) } static inline char -get_hex(const char *cp) +get_hex(const char *cp, const char *end) { unsigned char c = (unsigned char) *cp; int res = -1; @@ -184,7 +184,7 @@ get_hex(const char *cp) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid hexadecimal digit: \"%.*s\"", - pg_mblen(cp), cp))); + pg_mblen_range(cp, end), cp))); return (char) res; } @@ -208,14 +208,14 @@ hex_decode(const char *src, size_t len, char *dst) s++; continue; } - v1 = get_hex(s) << 4; + v1 = get_hex(s, srcend) << 4; s++; if (s >= srcend) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid hexadecimal data: odd number of digits"))); - v2 = get_hex(s); + v2 = get_hex(s, srcend); s++; *p++ = v1 | v2; } @@ -344,7 +344,7 @@ pg_base64_decode(const char *src, size_t len, char *dst) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid symbol \"%.*s\" found while decoding base64 sequence", - pg_mblen(s - 1), s - 1))); + pg_mblen_range(s - 1, srcend), s - 1))); } /* add it to buffer */ buf = (buf << 6) + b; diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 9169539b76d..5b617ff4619 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1392,7 +1392,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, ereport(ERROR, (errcode(ERRCODE_INVALID_DATETIME_FORMAT), errmsg("invalid datetime format separator: \"%s\"", - pnstrdup(str, pg_mblen(str))))); + pnstrdup(str, pg_mblen_cstr(str))))); if (*str == ' ') n->type = NODE_TYPE_SPACE; @@ -1422,7 +1422,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, /* backslash quotes the next character, if any */ if (*str == '\\' && *(str + 1)) str++; - chlen = pg_mblen(str); + chlen = pg_mblen_cstr(str); n->type = NODE_TYPE_CHAR; memcpy(n->character, str, chlen); n->character[chlen] = '\0'; @@ -1440,7 +1440,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, */ if (*str == '\\' && *(str + 1) == '"') str++; - chlen = pg_mblen(str); + chlen = pg_mblen_cstr(str); if ((flags & DCH_FLAG) && is_separator_char(str)) n->type = NODE_TYPE_SEPARATOR; @@ -2151,8 +2151,8 @@ asc_toupper_z(const char *buff) do { \ if (S_THth(_suf)) \ { \ - if (*(ptr)) (ptr) += pg_mblen(ptr); \ - if (*(ptr)) (ptr) += pg_mblen(ptr); \ + if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \ + if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \ } \ } while (0) @@ -3365,7 +3365,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, * insist that the consumed character match the format's * character. */ - s += pg_mblen(s); + s += pg_mblen_cstr(s); } continue; } @@ -3387,11 +3387,11 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, if (extra_skip > 0) extra_skip--; else - s += pg_mblen(s); + s += pg_mblen_cstr(s); } else { - int chlen = pg_mblen(s); + int chlen = pg_mblen_cstr(s); /* * Standard mode requires strict match of format characters. @@ -5563,13 +5563,15 @@ NUM_numpart_to_char(NUMProc *Np, int id) static void NUM_eat_non_data_chars(NUMProc *Np, int n, int input_len) { + const char *end = Np->inout + input_len; + while (n-- > 0) { if (OVERLOAD_TEST) break; /* end of input */ if (strchr("0123456789.,+-", *Np->inout_p) != NULL) break; /* it's a data character */ - Np->inout_p += pg_mblen(Np->inout_p); + Np->inout_p += pg_mblen_range(Np->inout_p, end); } } @@ -6026,7 +6028,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout, } else { - Np->inout_p += pg_mblen(Np->inout_p); + Np->inout_p += pg_mblen_range(Np->inout_p, Np->inout + input_len); } continue; } diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c index f6a074aa7d0..583c1712ad1 100644 --- a/src/backend/utils/adt/jsonfuncs.c +++ b/src/backend/utils/adt/jsonfuncs.c @@ -663,7 +663,7 @@ report_json_context(JsonLexContext *lex) { /* Advance to next multibyte character */ if (IS_HIGHBIT_SET(*context_start)) - context_start += pg_mblen(context_start); + context_start += pg_mblen_range(context_start, context_end); else context_start++; } diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y index 3377cc81cbf..30183f66b15 100644 --- a/src/backend/utils/adt/jsonpath_gram.y +++ b/src/backend/utils/adt/jsonpath_gram.y @@ -527,7 +527,7 @@ makeItemLikeRegex(JsonPathParseItem *expr, JsonPathString *pattern, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("invalid input syntax for type %s", "jsonpath"), errdetail("unrecognized flag character \"%.*s\" in LIKE_REGEX predicate", - pg_mblen(flags->val + i), flags->val + i))); + pg_mblen_range(flags->val + i, flags->val + flags->len), flags->val + i))); break; } } diff --git a/src/backend/utils/adt/levenshtein.c b/src/backend/utils/adt/levenshtein.c index f8979776d0d..38d4f580d01 100644 --- a/src/backend/utils/adt/levenshtein.c +++ b/src/backend/utils/adt/levenshtein.c @@ -84,6 +84,8 @@ varstr_levenshtein(const char *source, int slen, int i, j; const char *y; + const char *send = source + slen; + const char *tend = target + tlen; /* * For varstr_levenshtein_less_equal, we have real variables called @@ -184,10 +186,10 @@ varstr_levenshtein(const char *source, int slen, #endif /* - * In order to avoid calling pg_mblen() repeatedly on each character in s, - * we cache all the lengths before starting the main loop -- but if all - * the characters in both strings are single byte, then we skip this and - * use a fast-path in the main loop. If only one string contains + * In order to avoid calling pg_mblen_range() repeatedly on each character + * in s, we cache all the lengths before starting the main loop -- but if + * all the characters in both strings are single byte, then we skip this + * and use a fast-path in the main loop. If only one string contains * multi-byte characters, we still build the array, so that the fast-path * needn't deal with the case where the array hasn't been initialized. */ @@ -199,7 +201,7 @@ varstr_levenshtein(const char *source, int slen, s_char_len = (int *) palloc((m + 1) * sizeof(int)); for (i = 0; i < m; ++i) { - s_char_len[i] = pg_mblen(cp); + s_char_len[i] = pg_mblen_range(cp, send); cp += s_char_len[i]; } s_char_len[i] = 0; @@ -225,7 +227,7 @@ varstr_levenshtein(const char *source, int slen, { int *temp; const char *x = source; - int y_char_len = n != tlen + 1 ? pg_mblen(y) : 1; + int y_char_len = n != tlen + 1 ? pg_mblen_range(y, tend) : 1; #ifdef LEVENSHTEIN_LESS_EQUAL diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index eed183cd0dc..080bb6af840 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -54,20 +54,20 @@ static int Generic_Text_IC_like(text *str, text *pat, Oid collation); *-------------------- */ static inline int -wchareq(const char *p1, const char *p2) +wchareq(const char *p1, int p1len, const char *p2, int p2len) { - int p1_len; + int p1clen; /* Optimization: quickly compare the first byte. */ if (*p1 != *p2) return 0; - p1_len = pg_mblen(p1); - if (pg_mblen(p2) != p1_len) + p1clen = pg_mblen_with_len(p1, p1len); + if (pg_mblen_with_len(p2, p2len) != p1clen) return 0; /* They are the same length */ - while (p1_len--) + while (p1clen--) { if (*p1++ != *p2++) return 0; @@ -106,11 +106,11 @@ SB_lower_char(unsigned char c, pg_locale_t locale, bool locale_is_c) #define NextByte(p, plen) ((p)++, (plen)--) /* Set up to compile like_match.c for multibyte characters */ -#define CHAREQ(p1, p2) wchareq((p1), (p2)) +#define CHAREQ(p1, p1len, p2, p2len) wchareq((p1), (p1len), (p2), (p2len)) #define NextChar(p, plen) \ - do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0) + do { int __l = pg_mblen_with_len((p), (plen)); (p) +=__l; (plen) -=__l; } while (0) #define CopyAdvChar(dst, src, srclen) \ - do { int __l = pg_mblen(src); \ + do { int __l = pg_mblen_with_len((src), (srclen)); \ (srclen) -= __l; \ while (__l-- > 0) \ *(dst)++ = *(src)++; \ @@ -122,7 +122,7 @@ SB_lower_char(unsigned char c, pg_locale_t locale, bool locale_is_c) #include "like_match.c" /* Set up to compile like_match.c for single-byte characters */ -#define CHAREQ(p1, p2) (*(p1) == *(p2)) +#define CHAREQ(p1, p1len, p2, p2len) (*(p1) == *(p2)) #define NextChar(p, plen) NextByte((p), (plen)) #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--) diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c index 2f32cdaf020..9df572e92bd 100644 --- a/src/backend/utils/adt/like_match.c +++ b/src/backend/utils/adt/like_match.c @@ -294,6 +294,7 @@ do_like_escape(text *pat, text *esc) errhint("Escape string must be empty or one character."))); e = VARDATA_ANY(esc); + elen = VARSIZE_ANY_EXHDR(esc); /* * If specified escape is '\', just copy the pattern as-is. @@ -312,7 +313,7 @@ do_like_escape(text *pat, text *esc) afterescape = false; while (plen > 0) { - if (CHAREQ(p, e) && !afterescape) + if (CHAREQ(p, plen, e, elen) && !afterescape) { *r++ = '\\'; NextChar(p, plen); diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c index bd9e5f9e243..471c6967be0 100644 --- a/src/backend/utils/adt/oracle_compat.c +++ b/src/backend/utils/adt/oracle_compat.c @@ -150,8 +150,8 @@ lpad(PG_FUNCTION_ARGS) char *ptr1, *ptr2, *ptr2start, - *ptr2end, *ptr_ret; + const char *ptr2end; int m, s1len, s2len; @@ -196,7 +196,7 @@ lpad(PG_FUNCTION_ARGS) while (m--) { - int mlen = pg_mblen(ptr2); + int mlen = pg_mblen_range(ptr2, ptr2end); memcpy(ptr_ret, ptr2, mlen); ptr_ret += mlen; @@ -209,7 +209,7 @@ lpad(PG_FUNCTION_ARGS) while (s1len--) { - int mlen = pg_mblen(ptr1); + int mlen = pg_mblen_unbounded(ptr1); memcpy(ptr_ret, ptr1, mlen); ptr_ret += mlen; @@ -248,8 +248,8 @@ rpad(PG_FUNCTION_ARGS) char *ptr1, *ptr2, *ptr2start, - *ptr2end, *ptr_ret; + const char *ptr2end; int m, s1len, s2len; @@ -288,11 +288,12 @@ rpad(PG_FUNCTION_ARGS) m = len - s1len; ptr1 = VARDATA_ANY(string1); + ptr_ret = VARDATA(ret); while (s1len--) { - int mlen = pg_mblen(ptr1); + int mlen = pg_mblen_unbounded(ptr1); memcpy(ptr_ret, ptr1, mlen); ptr_ret += mlen; @@ -304,7 +305,7 @@ rpad(PG_FUNCTION_ARGS) while (m--) { - int mlen = pg_mblen(ptr2); + int mlen = pg_mblen_range(ptr2, ptr2end); memcpy(ptr_ret, ptr2, mlen); ptr_ret += mlen; @@ -389,6 +390,7 @@ dotrim(const char *string, int stringlen, */ const char **stringchars; const char **setchars; + const char *setend; int *stringmblen; int *setmblen; int stringnchars; @@ -396,6 +398,7 @@ dotrim(const char *string, int stringlen, int resultndx; int resultnchars; const char *p; + const char *pend; int len; int mblen; const char *str_pos; @@ -406,10 +409,11 @@ dotrim(const char *string, int stringlen, stringnchars = 0; p = string; len = stringlen; + pend = p + len; while (len > 0) { stringchars[stringnchars] = p; - stringmblen[stringnchars] = mblen = pg_mblen(p); + stringmblen[stringnchars] = mblen = pg_mblen_range(p, pend); stringnchars++; p += mblen; len -= mblen; @@ -420,10 +424,11 @@ dotrim(const char *string, int stringlen, setnchars = 0; p = set; len = setlen; + setend = set + setlen; while (len > 0) { setchars[setnchars] = p; - setmblen[setnchars] = mblen = pg_mblen(p); + setmblen[setnchars] = mblen = pg_mblen_range(p, setend); setnchars++; p += mblen; len -= mblen; @@ -801,6 +806,8 @@ translate(PG_FUNCTION_ARGS) *to_end; char *source, *target; + const char *source_end; + const char *from_end; int m, fromlen, tolen, @@ -815,9 +822,11 @@ translate(PG_FUNCTION_ARGS) if (m <= 0) PG_RETURN_TEXT_P(string); source = VARDATA_ANY(string); + source_end = source + m; fromlen = VARSIZE_ANY_EXHDR(from); from_ptr = VARDATA_ANY(from); + from_end = from_ptr + fromlen; tolen = VARSIZE_ANY_EXHDR(to); to_ptr = VARDATA_ANY(to); to_end = to_ptr + tolen; @@ -840,12 +849,12 @@ translate(PG_FUNCTION_ARGS) while (m > 0) { - source_len = pg_mblen(source); + source_len = pg_mblen_range(source, source_end); from_index = 0; for (i = 0; i < fromlen; i += len) { - len = pg_mblen(&from_ptr[i]); + len = pg_mblen_range(&from_ptr[i], from_end); if (len == source_len && memcmp(source, &from_ptr[i], len) == 0) break; @@ -861,11 +870,11 @@ translate(PG_FUNCTION_ARGS) { if (p >= to_end) break; - p += pg_mblen(p); + p += pg_mblen_range(p, to_end); } if (p < to_end) { - len = pg_mblen(p); + len = pg_mblen_range(p, to_end); memcpy(target, p, len); target += len; retlen += len; diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 4d2ea4848fb..0eaa528dc5c 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -424,7 +424,7 @@ parse_re_flags(pg_re_flags *flags, text *opts) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid regular expression option: \"%.*s\"", - pg_mblen(opt_p + i), opt_p + i))); + pg_mblen_range(opt_p + i, opt_p + opt_len), opt_p + i))); break; } } @@ -672,6 +672,7 @@ similar_escape_internal(text *pat_text, text *esc_text) *r; int plen, elen; + const char *pend; bool afterescape = false; int nquotes = 0; int bracket_depth = 0; /* square bracket nesting level */ @@ -679,6 +680,7 @@ similar_escape_internal(text *pat_text, text *esc_text) p = VARDATA_ANY(pat_text); plen = VARSIZE_ANY_EXHDR(pat_text); + pend = p + plen; if (esc_text == NULL) { /* No ESCAPE clause provided; default to backslash as escape */ @@ -778,7 +780,7 @@ similar_escape_internal(text *pat_text, text *esc_text) if (elen > 1) { - int mblen = pg_mblen(p); + int mblen = pg_mblen_range(p, pend); if (mblen > 1) { diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index dde3a90cc3b..370f57ede58 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -109,7 +109,7 @@ get_modifiers(char *buf, int16 *weight, bool *prefix) return buf; buf++; - while (*buf && pg_mblen(buf) == 1) + while (*buf && pg_mblen_cstr(buf) == 1) { switch (*buf) { @@ -186,7 +186,7 @@ parse_phrase_operator(TSQueryParserState pstate, int16 *distance) continue; } - if (!t_isdigit(ptr)) + if (!t_isdigit_cstr(ptr)) return false; errno = 0; @@ -248,12 +248,12 @@ parse_or_operator(TSQueryParserState pstate) return false; /* it shouldn't be a part of any word */ - if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha(ptr) || t_isdigit(ptr)) + if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha_cstr(ptr) || t_isdigit_cstr(ptr)) return false; for (;;) { - ptr += pg_mblen(ptr); + ptr += pg_mblen_cstr(ptr); if (*ptr == '\0') /* got end of string without operand */ return false; @@ -263,7 +263,7 @@ parse_or_operator(TSQueryParserState pstate) * So we still treat OR literal as operation with possibly incorrect * operand and will not search it as lexeme */ - if (!t_isspace(ptr)) + if (!t_isspace_cstr(ptr)) break; } @@ -306,7 +306,7 @@ gettoken_query_standard(TSQueryParserState state, int8 *operator, errmsg("syntax error in tsquery: \"%s\"", state->buffer))); } - else if (!t_isspace(state->buf)) + else if (!t_isspace_cstr(state->buf)) { /* * We rely on the tsvector parser to parse the value for @@ -364,14 +364,14 @@ gettoken_query_standard(TSQueryParserState state, int8 *operator, { return (state->count) ? PT_ERR : PT_END; } - else if (!t_isspace(state->buf)) + else if (!t_isspace_cstr(state->buf)) { return PT_ERR; } break; } - state->buf += pg_mblen(state->buf); + state->buf += pg_mblen_cstr(state->buf); } } @@ -425,7 +425,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator, state->state = WAITOPERAND; continue; } - else if (!t_isspace(state->buf)) + else if (!t_isspace_cstr(state->buf)) { /* * We rely on the tsvector parser to parse the value for @@ -468,7 +468,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator, state->buf++; continue; } - else if (!t_isspace(state->buf)) + else if (!t_isspace_cstr(state->buf)) { /* insert implicit AND between operands */ state->state = WAITOPERAND; @@ -478,7 +478,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator, break; } - state->buf += pg_mblen(state->buf); + state->buf += pg_mblen_cstr(state->buf); } } @@ -961,9 +961,8 @@ infix(INFIX *in, int parentPriority, bool rightPhraseOp) *(in->cur) = '\\'; in->cur++; } - COPYCHAR(in->cur, op); - clen = pg_mblen(op); + clen = ts_copychar_cstr(in->cur, op); op += clen; in->cur += clen; } diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index 3831dd3c806..bc600915f74 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -313,9 +313,9 @@ tsvectorout(PG_FUNCTION_ARGS) lenbuf = 0, pp; WordEntry *ptr = ARRPTR(out); - char *curbegin, - *curin, + char *curin, *curout; + const char *curend; lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; for (i = 0; i < out->size; i++) @@ -328,13 +328,14 @@ tsvectorout(PG_FUNCTION_ARGS) curout = outbuf = (char *) palloc(lenbuf); for (i = 0; i < out->size; i++) { - curbegin = curin = STRPTR(out) + ptr->pos; + curin = STRPTR(out) + ptr->pos; + curend = curin + ptr->len; if (i != 0) *curout++ = ' '; *curout++ = '\''; - while (curin - curbegin < ptr->len) + while (curin < curend) { - int len = pg_mblen(curin); + int len = pg_mblen_range(curin, curend); if (t_iseq(curin, '\'')) *curout++ = '\''; diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index 4237806d6d8..d9562e209da 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -2434,11 +2434,15 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws) if (ws) { char *buf; + const char *end; buf = VARDATA_ANY(ws); - while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws)) + end = buf + VARSIZE_ANY_EXHDR(ws); + while (buf < end) { - if (pg_mblen(buf) == 1) + int len = pg_mblen_range(buf, end); + + if (len == 1) { switch (*buf) { @@ -2462,7 +2466,7 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws) stat->weight |= 0; } } - buf += pg_mblen(buf); + buf += len; } } diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c index c2df4093e6b..1187f0af523 100644 --- a/src/backend/utils/adt/tsvector_parser.c +++ b/src/backend/utils/adt/tsvector_parser.c @@ -185,10 +185,9 @@ gettoken_tsvector(TSVectorParseState state, else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) || (state->is_web && t_iseq(state->prsbuf, '"'))) PRSSYNTAXERROR; - else if (!t_isspace(state->prsbuf)) + else if (!t_isspace_cstr(state->prsbuf)) { - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); statecode = WAITENDWORD; } } @@ -202,8 +201,7 @@ gettoken_tsvector(TSVectorParseState state, else { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); Assert(oldstate != 0); statecode = oldstate; } @@ -215,7 +213,7 @@ gettoken_tsvector(TSVectorParseState state, statecode = WAITNEXTCHAR; oldstate = WAITENDWORD; } - else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' || + else if (t_isspace_cstr(state->prsbuf) || *(state->prsbuf) == '\0' || (state->oprisdelim && ISOPERATOR(state->prsbuf)) || (state->is_web && t_iseq(state->prsbuf, '"'))) { @@ -238,8 +236,7 @@ gettoken_tsvector(TSVectorParseState state, else { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); } } else if (statecode == WAITENDCMPLX) @@ -258,8 +255,7 @@ gettoken_tsvector(TSVectorParseState state, else { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); } } else if (statecode == WAITCHARCMPLX) @@ -267,8 +263,7 @@ gettoken_tsvector(TSVectorParseState state, if (!state->is_web && t_iseq(state->prsbuf, '\'')) { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); statecode = WAITENDCMPLX; } else @@ -279,7 +274,7 @@ gettoken_tsvector(TSVectorParseState state, PRSSYNTAXERROR; if (state->oprisdelim) { - /* state->prsbuf+=pg_mblen(state->prsbuf); */ + /* state->prsbuf+=pg_mblen_cstr(state->prsbuf); */ RETURN_TOKEN; } else @@ -296,7 +291,7 @@ gettoken_tsvector(TSVectorParseState state, } else if (statecode == INPOSINFO) { - if (t_isdigit(state->prsbuf)) + if (t_isdigit_cstr(state->prsbuf)) { if (posalen == 0) { @@ -351,10 +346,10 @@ gettoken_tsvector(TSVectorParseState state, PRSSYNTAXERROR; WEP_SETWEIGHT(pos[npos - 1], 0); } - else if (t_isspace(state->prsbuf) || + else if (t_isspace_cstr(state->prsbuf) || *(state->prsbuf) == '\0') RETURN_TOKEN; - else if (!t_isdigit(state->prsbuf)) + else if (!t_isdigit_cstr(state->prsbuf)) PRSSYNTAXERROR; } else /* internal error */ @@ -362,6 +357,6 @@ gettoken_tsvector(TSVectorParseState state, statecode); /* get next char */ - state->prsbuf += pg_mblen(state->prsbuf); + state->prsbuf += pg_mblen_cstr(state->prsbuf); } } diff --git a/src/backend/utils/adt/varbit.c b/src/backend/utils/adt/varbit.c index 0d0c0fd9f3c..b008959ab54 100644 --- a/src/backend/utils/adt/varbit.c +++ b/src/backend/utils/adt/varbit.c @@ -232,7 +232,7 @@ bit_in(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid binary digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); x >>= 1; if (x == 0) @@ -257,7 +257,7 @@ bit_in(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid hexadecimal digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); if (bc) { @@ -533,7 +533,7 @@ varbit_in(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid binary digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); x >>= 1; if (x == 0) @@ -558,7 +558,7 @@ varbit_in(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid hexadecimal digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); if (bc) { diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index d62cbdce325..b79ae35a0d2 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -779,8 +779,11 @@ text_catenate(text *t1, text *t2) * charlen_to_bytelen() * Compute the number of bytes occupied by n characters starting at *p * - * It is caller's responsibility that there actually are n characters; - * the string need not be null-terminated. + * The caller shall ensure there are n complete characters. Callers achieve + * this by deriving "n" from regmatch_t findings from searching a wchar array. + * pg_mb2wchar_with_len() skips any trailing incomplete character, so regex + * matches will end no later than the last complete character. (The string + * need not be null-terminated.) */ static int charlen_to_bytelen(const char *p, int n) @@ -795,7 +798,7 @@ charlen_to_bytelen(const char *p, int n) const char *s; for (s = p; n > 0; n--) - s += pg_mblen(s); + s += pg_mblen_unbounded(s); /* caller verified encoding */ return s - p; } @@ -928,6 +931,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) int32 slice_start; int32 slice_size; int32 slice_strlen; + int32 slice_len; text *slice; int32 E1; int32 i; @@ -997,7 +1001,8 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) slice = (text *) DatumGetPointer(str); /* see if we got back an empty string */ - if (VARSIZE_ANY_EXHDR(slice) == 0) + slice_len = VARSIZE_ANY_EXHDR(slice); + if (slice_len == 0) { if (slice != (text *) DatumGetPointer(str)) pfree(slice); @@ -1006,7 +1011,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) /* Now we can get the actual length of the slice in MB characters */ slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice), - VARSIZE_ANY_EXHDR(slice)); + slice_len); /* * Check that the start position wasn't > slice_strlen. If so, SQL99 @@ -1033,7 +1038,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) */ p = VARDATA_ANY(slice); for (i = 0; i < S1 - 1; i++) - p += pg_mblen(p); + p += pg_mblen_unbounded(p); /* hang onto a pointer to our start position */ s = p; @@ -1043,7 +1048,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) * length. */ for (i = S1; i < E1; i++) - p += pg_mblen(p); + p += pg_mblen_unbounded(p); ret = (text *) palloc(VARHDRSZ + (p - s)); SET_VARSIZE(ret, VARHDRSZ + (p - s)); @@ -1350,6 +1355,8 @@ retry: */ if (state->is_multibyte_char_in_char) { + const char *haystack_end = state->str1 + state->len1; + /* Walk one character at a time, until we reach the match. */ /* the search should never move backwards. */ @@ -1358,7 +1365,7 @@ retry: while (state->refpoint < matchptr) { /* step to next character. */ - state->refpoint += pg_mblen(state->refpoint); + state->refpoint += pg_mblen_range(state->refpoint, haystack_end); state->refpos++; /* @@ -1473,7 +1480,8 @@ text_position_get_match_pos(TextPositionState *state) /* Convert the byte position to char position. */ while (state->refpoint < state->last_match) { - state->refpoint += pg_mblen(state->refpoint); + state->refpoint += pg_mblen_range(state->refpoint, + state->last_match); state->refpos++; } Assert(state->refpoint == state->last_match); @@ -4368,7 +4376,7 @@ check_replace_text_has_escape_char(const text *replace_text) } else { - for (; p < p_end; p += pg_mblen(p)) + for (; p < p_end; p += pg_mblen_range(p, p_end)) { if (*p == '\\') return true; @@ -4408,7 +4416,7 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text, } else { - for (; p < p_end && *p != '\\'; p += pg_mblen(p)) + for (; p < p_end && *p != '\\'; p += pg_mblen_range(p, p_end)) /* nothing */ ; } @@ -4966,6 +4974,8 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate) } else { + const char *end_ptr; + /* * When fldsep is NULL, each character in the input string becomes a * separate element in the result set. The separator is effectively @@ -4974,10 +4984,11 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate) inputstring_len = VARSIZE_ANY_EXHDR(inputstring); start_ptr = VARDATA_ANY(inputstring); + end_ptr = start_ptr + inputstring_len; while (inputstring_len > 0) { - int chunk_len = pg_mblen(start_ptr); + int chunk_len = pg_mblen_range(start_ptr, end_ptr); CHECK_FOR_INTERRUPTS(); @@ -5656,7 +5667,7 @@ text_reverse(PG_FUNCTION_ARGS) { int sz; - sz = pg_mblen(p); + sz = pg_mblen_range(p, endp); dst -= sz; memcpy(dst, p, sz); p += sz; @@ -5817,7 +5828,7 @@ text_format(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized format() type specifier \"%.*s\"", - pg_mblen(cp), cp), + pg_mblen_range(cp, end_ptr), cp), errhint("For a single \"%%\" use \"%%%%\"."))); /* If indirect width was specified, get its value */ @@ -5938,7 +5949,7 @@ text_format(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized format() type specifier \"%.*s\"", - pg_mblen(cp), cp), + pg_mblen_range(cp, end_ptr), cp), errhint("For a single \"%%\" use \"%%%%\"."))); break; } diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index 0137ff90f0a..98dcc04122b 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -2037,8 +2037,7 @@ sqlchar_to_unicode(const char *s) char *utf8string; pg_wchar ret[2]; /* need space for trailing zero */ - /* note we're not assuming s is null-terminated */ - utf8string = pg_server_to_any(s, pg_mblen(s), PG_UTF8); + utf8string = pg_server_to_any(s, pg_mblen_cstr(s), PG_UTF8); pg_encoding_mb2wchar_with_len(PG_UTF8, utf8string, ret, pg_encoding_mblen(PG_UTF8, utf8string)); @@ -2091,7 +2090,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped, initStringInfo(&buf); - for (p = ident; *p; p += pg_mblen(p)) + for (p = ident; *p; p += pg_mblen_cstr(p)) { if (*p == ':' && (p == ident || fully_escaped)) appendStringInfoString(&buf, "_x003A_"); @@ -2116,7 +2115,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped, : !is_valid_xml_namechar(u)) appendStringInfo(&buf, "_x%04X_", (unsigned int) u); else - appendBinaryStringInfo(&buf, p, pg_mblen(p)); + appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p)); } } @@ -2139,7 +2138,7 @@ map_xml_name_to_sql_identifier(const char *name) initStringInfo(&buf); - for (p = name; *p; p += pg_mblen(p)) + for (p = name; *p; p += pg_mblen_cstr(p)) { if (*p == '_' && *(p + 1) == 'x' && isxdigit((unsigned char) *(p + 2)) @@ -2157,7 +2156,7 @@ map_xml_name_to_sql_identifier(const char *name) p += 6; } else - appendBinaryStringInfo(&buf, p, pg_mblen(p)); + appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p)); } return buf.data; diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index b308ff826fe..c7b9a403a7f 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -38,6 +38,7 @@ #include "catalog/namespace.h" #include "mb/pg_wchar.h" #include "utils/builtins.h" +#include "utils/memdebug.h" #include "utils/memutils.h" #include "utils/relcache.h" #include "utils/syscache.h" @@ -97,6 +98,13 @@ static char *perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server); static int cliplen(const char *str, int len, int limit); +pg_attribute_noreturn() +static void report_invalid_encoding_int(int encoding, const char *mbstr, + int mblen, int len); + +pg_attribute_noreturn() +static void report_invalid_encoding_db(const char *mbstr, int mblen, int len); + /* * Prepare for a future call to SetClientEncoding. Success should mean @@ -962,11 +970,126 @@ pg_encoding_wchar2mb_with_len(int encoding, return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len); } -/* returns the byte length of a multibyte character */ +/* + * Returns the byte length of a multibyte character sequence in a + * null-terminated string. Raises an illegal byte sequence error if the + * sequence would hit a null terminator. + * + * The caller is expected to have checked for a terminator at *mbstr == 0 + * before calling, but some callers want 1 in that case, so this function + * continues that tradition. + * + * This must only be used for strings that have a null-terminator to enable + * bounds detection. + */ +int +pg_mblen_cstr(const char *mbstr) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + /* + * The .mblen functions return 1 when given a pointer to a terminator. + * Some callers depend on that, so we tolerate it for now. Well-behaved + * callers check the leading byte for a terminator *before* calling. + */ + for (int i = 1; i < length; ++i) + if (unlikely(mbstr[i] == 0)) + report_invalid_encoding_db(mbstr, length, i); + + /* + * String should be NUL-terminated, but checking that would make typical + * callers O(N^2), tripling Valgrind check-world time. Unless + * VALGRIND_EXPENSIVE, check 1 byte after each actual character. (If we + * found a character, not a terminator, the next byte must be a terminator + * or the start of the next character.) If the caller iterates the whole + * string, the last call will diagnose a missing terminator. + */ + if (mbstr[0] != '\0') + { +#ifdef VALGRIND_EXPENSIVE + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, strlen(mbstr)); +#else + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr + length, 1); +#endif + } + + return length; +} + +/* + * Returns the byte length of a multibyte character sequence bounded by a range + * [mbstr, end) of at least one byte in size. Raises an illegal byte sequence + * error if the sequence would exceed the range. + */ +int +pg_mblen_range(const char *mbstr, const char *end) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + Assert(end > mbstr); +#ifdef VALGRIND_EXPENSIVE + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, end - mbstr); +#else + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); +#endif + + if (unlikely(mbstr + length > end)) + report_invalid_encoding_db(mbstr, length, end - mbstr); + + return length; +} + +/* + * Returns the byte length of a multibyte character sequence bounded by a range + * extending for 'limit' bytes, which must be at least one. Raises an illegal + * byte sequence error if the sequence would exceed the range. + */ +int +pg_mblen_with_len(const char *mbstr, int limit) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + Assert(limit >= 1); +#ifdef VALGRIND_EXPENSIVE + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, limit); +#else + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); +#endif + + if (unlikely(length > limit)) + report_invalid_encoding_db(mbstr, length, limit); + + return length; +} + + +/* + * Returns the length of a multibyte character sequence, without any + * validation of bounds. + * + * PLEASE NOTE: This function can only be used safely if the caller has + * already verified the input string, since otherwise there is a risk of + * overrunning the buffer if the string is invalid. A prior call to a + * pg_mbstrlen* function suffices. + */ +int +pg_mblen_unbounded(const char *mbstr) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); + + return length; +} + +/* + * Historical name for pg_mblen_unbounded(). Should not be used and will be + * removed in a later version. + */ int pg_mblen(const char *mbstr) { - return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + return pg_mblen_unbounded(mbstr); } /* returns the display length of a multibyte character */ @@ -988,14 +1111,14 @@ pg_mbstrlen(const char *mbstr) while (*mbstr) { - mbstr += pg_mblen(mbstr); + mbstr += pg_mblen_cstr(mbstr); len++; } return len; } /* returns the length (counted in wchars) of a multibyte string - * (not necessarily NULL terminated) + * (stops at the first of "limit" or a NUL) */ int pg_mbstrlen_with_len(const char *mbstr, int limit) @@ -1008,7 +1131,7 @@ pg_mbstrlen_with_len(const char *mbstr, int limit) while (limit > 0 && *mbstr) { - int l = pg_mblen(mbstr); + int l = pg_mblen_with_len(mbstr, limit); limit -= l; mbstr += l; @@ -1078,7 +1201,7 @@ pg_mbcharcliplen(const char *mbstr, int len, int limit) while (len > 0 && *mbstr) { - l = pg_mblen(mbstr); + l = pg_mblen_with_len(mbstr, len); nch++; if (nch > limit) break; @@ -1648,12 +1771,19 @@ void report_invalid_encoding(int encoding, const char *mbstr, int len) { int l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len); + + report_invalid_encoding_int(encoding, mbstr, l, len); +} + +static void +report_invalid_encoding_int(int encoding, const char *mbstr, int mblen, int len) +{ char buf[8 * 5 + 1]; char *p = buf; int j, jlimit; - jlimit = Min(l, len); + jlimit = Min(mblen, len); jlimit = Min(jlimit, 8); /* prevent buffer overrun */ for (j = 0; j < jlimit; j++) @@ -1670,6 +1800,12 @@ report_invalid_encoding(int encoding, const char *mbstr, int len) buf))); } +static void +report_invalid_encoding_db(const char *mbstr, int mblen, int len) +{ + report_invalid_encoding_int(GetDatabaseEncoding(), mbstr, mblen, len); +} + /* * report_untranslatable_char: complain about untranslatable character * diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index a44ea1acbc0..ff832f6d50b 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -608,7 +608,14 @@ extern int pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2); extern int pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n); extern int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n); extern size_t pg_wchar_strlen(const pg_wchar *wstr); +extern int pg_mblen_cstr(const char *mbstr); +extern int pg_mblen_range(const char *mbstr, const char *end); +extern int pg_mblen_with_len(const char *mbstr, int limit); +extern int pg_mblen_unbounded(const char *mbstr); + +/* deprecated */ extern int pg_mblen(const char *mbstr); + extern int pg_dsplen(const char *mbstr); extern int pg_mbstrlen(const char *mbstr); extern int pg_mbstrlen_with_len(const char *mbstr, int len); diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h index cfd34b32443..5d5c240b492 100644 --- a/src/include/tsearch/ts_locale.h +++ b/src/include/tsearch/ts_locale.h @@ -45,12 +45,36 @@ typedef struct /* The second argument of t_iseq() must be a plain ASCII character */ #define t_iseq(x,c) (TOUCHAR(x) == (unsigned char) (c)) -#define COPYCHAR(d,s) memcpy(d, s, pg_mblen(s)) +/* Copy multibyte character of known byte length, return byte length. */ +static inline int +ts_copychar_with_len(void *dest, const void *src, int length) +{ + memcpy(dest, src, length); + return length; +} -extern int t_isdigit(const char *ptr); -extern int t_isspace(const char *ptr); -extern int t_isalpha(const char *ptr); -extern int t_isprint(const char *ptr); +/* Copy multibyte character from null-terminated string, return byte length. */ +static inline int +ts_copychar_cstr(void *dest, const void *src) +{ + return ts_copychar_with_len(dest, src, pg_mblen_cstr((const char *) src)); +} + +/* Historical macro for the above. */ +#define COPYCHAR ts_copychar_cstr + +#define GENERATE_T_ISCLASS_DECL(character_class) \ +extern int t_is##character_class##_with_len(const char *ptr, int len); \ +extern int t_is##character_class##_cstr(const char *ptr); \ +extern int t_is##character_class##_unbounded(const char *ptr); \ +\ +/* deprecated */ \ +extern int t_is##character_class(const char *ptr); + +GENERATE_T_ISCLASS_DECL(alpha); +GENERATE_T_ISCLASS_DECL(digit); +GENERATE_T_ISCLASS_DECL(print); +GENERATE_T_ISCLASS_DECL(space); extern char *lowerstr(const char *str); extern char *lowerstr_with_len(const char *str, int len); diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h index 4266560151f..ef3d67aeb1a 100644 --- a/src/include/tsearch/ts_utils.h +++ b/src/include/tsearch/ts_utils.h @@ -38,14 +38,12 @@ extern bool gettoken_tsvector(TSVectorParseState state, extern void close_tsvector_parser(TSVectorParseState state); /* phrase operator begins with '<' */ -#define ISOPERATOR(x) \ - ( pg_mblen(x) == 1 && ( *(x) == '!' || \ - *(x) == '&' || \ - *(x) == '|' || \ - *(x) == '(' || \ - *(x) == ')' || \ - *(x) == '<' \ - ) ) +#define ISOPERATOR(x) (*(x) == '!' || \ + *(x) == '&' || \ + *(x) == '|' || \ + *(x) == '(' || \ + *(x) == ')' || \ + *(x) == '<') /* parse_tsquery */ diff --git a/src/test/modules/test_regex/test_regex.c b/src/test/modules/test_regex/test_regex.c index 095751cf04e..b8ff535c8f3 100644 --- a/src/test/modules/test_regex/test_regex.c +++ b/src/test/modules/test_regex/test_regex.c @@ -424,7 +424,8 @@ parse_test_flags(test_re_flags *flags, text *opts) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid regular expression test option: \"%.*s\"", - pg_mblen(opt_p + i), opt_p + i))); + pg_mblen_range(opt_p + i, opt_p + opt_len), + opt_p + i))); break; } }