From cecedb912aeea391332bcb307d8e089266dbb2be Mon Sep 17 00:00:00 2001
From: Thomas Munro <tmunro@postgresql.org>
Date: Wed, 7 Jan 2026 22:14:31 +1300
Subject: [PATCH] Replace pg_mblen() with bounds-checked versions.

A corrupted string could cause code that iterates with pg_mblen() to
overrun its buffer.  Fix this by converting all callers to one of the
following:

1. Callers with a null-terminated string now use pg_mblen_cstr(), which
raises an "illegal byte sequence" error if it finds the terminator in
the middle of a multibyte sequence.

2. Callers with a length or end pointer now use either
pg_mblen_with_len() or pg_mblen_range(), which raise the same error if
a multibyte sequence would run past the given bound; the choice depends
on which of the two seems more convenient at each site.

3. A small number of cases pre-validate a string, and can use
pg_mblen_unbounded().

The traditional pg_mblen() function and COPYCHAR macro still exist for
backward compatibility, but are no longer used by core code and are
hereby deprecated.  The same applies to the t_isXXX() functions.

Security: CVE-2026-2006
Backpatch-through: 14
Co-authored-by: Thomas Munro <thomas.munro@gmail.com>
Co-authored-by: Noah Misch <noah@leadboat.com>
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reported-by: Paul Gerste (as part of zeroday.cloud)
Reported-by: Moritz Sanft (as part of zeroday.cloud)
---
 contrib/btree_gist/btree_utils_var.c     |  21 +++-
 contrib/dict_xsyn/dict_xsyn.c            |   8 +-
 contrib/hstore/hstore_io.c               |   8 +-
 contrib/ltree/lquery_op.c                |   4 +-
 contrib/ltree/ltree.h                    |   3 +-
 contrib/ltree/ltree_io.c                 |  16 +--
 contrib/ltree/ltxtquery_io.c             |   4 +-
 contrib/pageinspect/heapfuncs.c          |   2 +-
 contrib/pg_trgm/trgm.h                   |   4 +-
 contrib/pg_trgm/trgm_op.c                |  48 +++++---
 contrib/pg_trgm/trgm_regexp.c            |  21 ++--
 contrib/unaccent/unaccent.c              |   7 +-
 src/backend/catalog/pg_proc.c            |   2 +-
 src/backend/tsearch/dict_synonym.c       |   8 +-
 src/backend/tsearch/dict_thesaurus.c     |  18 +--
 src/backend/tsearch/regis.c              |  37 +++---
 src/backend/tsearch/spell.c              | 123 +++++++++----------
 src/backend/tsearch/ts_locale.c          |  97 ++++++---------
 src/backend/tsearch/ts_utils.c           |   4 +-
 src/backend/tsearch/wparser_def.c        |   3 +-
 src/backend/utils/adt/encode.c           |  10 +-
 src/backend/utils/adt/formatting.c       |  22 ++--
 src/backend/utils/adt/jsonfuncs.c        |   2 +-
 src/backend/utils/adt/jsonpath_gram.y    |   2 +-
 src/backend/utils/adt/levenshtein.c      |  14 ++-
 src/backend/utils/adt/like.c             |  18 +--
 src/backend/utils/adt/like_match.c       |   3 +-
 src/backend/utils/adt/oracle_compat.c    |  33 +++--
 src/backend/utils/adt/regexp.c           |   6 +-
 src/backend/utils/adt/tsquery.c          |  25 ++--
 src/backend/utils/adt/tsvector.c         |  11 +-
 src/backend/utils/adt/tsvector_op.c      |  10 +-
 src/backend/utils/adt/tsvector_parser.c  |  29 ++---
 src/backend/utils/adt/varbit.c           |   8 +-
 src/backend/utils/adt/varlena.c          |  41 ++++---
 src/backend/utils/adt/xml.c              |  11 +-
 src/backend/utils/mb/mbutils.c           | 150 +++++++++++++++++++++--
 src/include/mb/pg_wchar.h                |   7 ++
 src/include/tsearch/ts_locale.h          |  34 ++++-
 src/include/tsearch/ts_utils.h           |  14 +--
 src/test/modules/test_regex/test_regex.c |   3 +-
 41 files changed, 537 insertions(+), 354 deletions(-)

diff --git a/contrib/btree_gist/btree_utils_var.c b/contrib/btree_gist/btree_utils_var.c
index 2886c08b85e..9d93b3c775e 100644
--- a/contrib/btree_gist/btree_utils_var.c
+++ b/contrib/btree_gist/btree_utils_var.c
@@ -116,36 +116,47 @@ gbt_var_leaf2node(GBT_VARKEY *leaf, const gbtree_vinfo *tinfo, FmgrInfo *flinfo)
 
 /*
  * returns the common prefix length of a node key
+ *
+ * If the underlying type is character data, the prefix length may point in
+ * the middle of a multibyte character.
 */
 static int32
 gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo)
 {
 	GBT_VARKEY_R r = gbt_var_key_readable(node);
 	int32		i = 0;
-	int32		l = 0;
+	int32		l_left_to_match = 0;
+	int32		l_total = 0;
 	int32		t1len = VARSIZE(r.lower) - VARHDRSZ;
 	int32		t2len = VARSIZE(r.upper) - VARHDRSZ;
 	int32		ml = Min(t1len, t2len);
 	char	   *p1 = VARDATA(r.lower);
 	char	   *p2 = VARDATA(r.upper);
+	const char *end1 = p1 + t1len;
+	const char *end2 = p2 + t2len;
 
 	if (ml == 0)
 		return 0;
 
 	while (i < ml)
 	{
-		if (tinfo->eml > 1 && l == 0)
+		if (tinfo->eml > 1 && l_left_to_match == 0)
 		{
-			if ((l = pg_mblen(p1)) != pg_mblen(p2))
+			l_total = pg_mblen_range(p1, end1);
+			if (l_total != pg_mblen_range(p2, end2))
 			{
 				return i;
 			}
+			l_left_to_match = l_total;
 		}
 		if (*p1 != *p2)
 		{
 			if (tinfo->eml > 1)
 			{
-				return (i - l + 1);
+				int32		l_matched_subset = l_total - l_left_to_match;
+
+				/* end common prefix at final byte of last matching char */
+				return i - l_matched_subset;
 			}
 			else
 			{
@@ -155,7 +166,7 @@ gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo)
 
 		p1++;
 		p2++;
-		l--;
+		l_left_to_match--;
 		i++;
 	}
 	return ml;					/* lower == upper */
diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c
index 79c4f18f409..5c5e4e7a3c9 100644
--- a/contrib/dict_xsyn/dict_xsyn.c
+++ b/contrib/dict_xsyn/dict_xsyn.c
@@ -48,15 +48,15 @@ find_word(char *in, char **end)
 	char	   *start;
 
 	*end = NULL;
-	while (*in && t_isspace(in))
-		in += pg_mblen(in);
+	while (*in && t_isspace_cstr(in))
+		in += pg_mblen_cstr(in);
 
 	if (!*in || *in == '#')
 		return NULL;
 	start = in;
 
-	while (*in && !t_isspace(in))
-		in += pg_mblen(in);
+	while (*in && !t_isspace_cstr(in))
+		in += pg_mblen_cstr(in);
 
 	*end = in;
 
diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c
index 03057f085d1..0b1e0581e84 100644
--- a/contrib/hstore/hstore_io.c
+++ b/contrib/hstore/hstore_io.c
@@ -82,7 +82,7 @@ get_val(HSParser *state, bool ignoreeq, bool *escaped)
 			else if (*(state->ptr) == '=' && !ignoreeq)
 			{
 				elog(ERROR, "Syntax error near \"%.*s\" at position %d",
-					 pg_mblen(state->ptr), state->ptr,
+					 pg_mblen_cstr(state->ptr), state->ptr,
 					 (int32) (state->ptr - state->begin));
 			}
 			else if (*(state->ptr) == '\\')
@@ -223,7 +223,7 @@ parse_hstore(HSParser *state)
 			else if (!scanner_isspace((unsigned char) *(state->ptr)))
 			{
 				elog(ERROR, "Syntax error near \"%.*s\" at position %d",
-					 pg_mblen(state->ptr), state->ptr,
+					 pg_mblen_cstr(state->ptr), state->ptr,
 					 (int32) (state->ptr - state->begin));
 			}
 		}
@@ -240,7 +240,7 @@ parse_hstore(HSParser *state)
 			else
 			{
 				elog(ERROR, "Syntax error near \"%.*s\" at position %d",
-					 pg_mblen(state->ptr), state->ptr,
+					 pg_mblen_cstr(state->ptr), state->ptr,
 					 (int32) (state->ptr - state->begin));
 			}
 		}
@@ -275,7 +275,7 @@ parse_hstore(HSParser *state)
 			else if (!scanner_isspace((unsigned char) *(state->ptr)))
 			{
 				elog(ERROR, "Syntax error near \"%.*s\" at position %d",
-					 pg_mblen(state->ptr), state->ptr,
+					 pg_mblen_cstr(state->ptr), state->ptr,
 					 (int32) (state->ptr - state->begin));
 			}
 		}
diff --git a/contrib/ltree/lquery_op.c b/contrib/ltree/lquery_op.c
index d89af20f6cf..46019a0e83a 100644
--- a/contrib/ltree/lquery_op.c
+++ b/contrib/ltree/lquery_op.c
@@ -26,14 +26,14 @@ getlexeme(char *start, char *end, int *len)
 	char	   *ptr;
 	int			charlen;
 
-	while (start < end && (charlen = pg_mblen(start)) == 1 && t_iseq(start, '_'))
+	while (start < end && (charlen = pg_mblen_range(start, end)) == 1 && t_iseq(start, '_'))
 		start += charlen;
 
 	ptr = start;
 	if (ptr >= end)
 		return NULL;
 
-	while (ptr < end && !((charlen = pg_mblen(ptr)) == 1 && t_iseq(ptr, '_')))
+	while (ptr < end && !((charlen = pg_mblen_range(ptr, end)) == 1 && t_iseq(ptr, '_')))
 		ptr += charlen;
 
 	*len = ptr - start;
diff --git a/contrib/ltree/ltree.h b/contrib/ltree/ltree.h
index 4e18caeb2fc..c7f00a99b06 100644
--- a/contrib/ltree/ltree.h
+++ b/contrib/ltree/ltree.h
@@ -113,7 +113,8 @@ typedef struct
 
 #define LQUERY_HASNOT		0x01
 
-#define ISALNUM(x)	( t_isalpha(x) || t_isdigit(x)	|| ( pg_mblen(x) == 1 && t_iseq((x), '_') ) )
+/* Caller has already called mblen, so we can use _unbounded variants safely. */
+#define ISALNUM(x)	( t_isalpha_unbounded(x) || t_isdigit_unbounded(x) || ( pg_mblen_unbounded(x) == 1 && t_iseq((x), '_') ) )
 
 /* full text query */
 
diff --git a/contrib/ltree/ltree_io.c b/contrib/ltree/ltree_io.c
index 15115cb29f3..0a44a8c4691 100644
--- a/contrib/ltree/ltree_io.c
+++ b/contrib/ltree/ltree_io.c
@@ -54,7 +54,7 @@ parse_ltree(const char *buf)
 	ptr = buf;
 	while (*ptr)
 	{
-		charlen = pg_mblen(ptr);
+		charlen = pg_mblen_cstr(ptr);
 		if (t_iseq(ptr, '.'))
 			num++;
 		ptr += charlen;
@@ -69,7 +69,7 @@ parse_ltree(const char *buf)
 	ptr = buf;
 	while (*ptr)
 	{
-		charlen = pg_mblen(ptr);
+		charlen = pg_mblen_cstr(ptr);
 
 		switch (state)
 		{
@@ -285,7 +285,7 @@ parse_lquery(const char *buf)
 	ptr = buf;
 	while (*ptr)
 	{
-		charlen = pg_mblen(ptr);
+		charlen = pg_mblen_cstr(ptr);
 
 		if (t_iseq(ptr, '.'))
 			num++;
@@ -305,7 +305,7 @@ parse_lquery(const char *buf)
 	ptr = buf;
 	while (*ptr)
 	{
-		charlen = pg_mblen(ptr);
+		charlen = pg_mblen_cstr(ptr);
 
 		switch (state)
 		{
@@ -402,7 +402,7 @@ parse_lquery(const char *buf)
 			case LQPRS_WAITFNUM:
 				if (t_iseq(ptr, ','))
 					state = LQPRS_WAITSNUM;
-				else if (t_isdigit(ptr))
+				else if (t_isdigit_cstr(ptr))
 				{
 					int			low = atoi(ptr);
 
@@ -420,7 +420,7 @@ parse_lquery(const char *buf)
 					UNCHAR;
 				break;
 			case LQPRS_WAITSNUM:
-				if (t_isdigit(ptr))
+				if (t_isdigit_cstr(ptr))
 				{
 					int			high = atoi(ptr);
 
@@ -451,7 +451,7 @@ parse_lquery(const char *buf)
 			case LQPRS_WAITCLOSE:
 				if (t_iseq(ptr, '}'))
 					state = LQPRS_WAITEND;
-				else if (!t_isdigit(ptr))
+				else if (!t_isdigit_cstr(ptr))
 					UNCHAR;
 				break;
 			case LQPRS_WAITND:
@@ -462,7 +462,7 @@ parse_lquery(const char *buf)
 				}
 				else if (t_iseq(ptr, ','))
 					state = LQPRS_WAITSNUM;
-				else if (!t_isdigit(ptr))
+				else if (!t_isdigit_cstr(ptr))
 					UNCHAR;
 				break;
 			case LQPRS_WAITEND:
diff --git a/contrib/ltree/ltxtquery_io.c b/contrib/ltree/ltxtquery_io.c
index d967f92110f..7f98bdedecb 100644
--- a/contrib/ltree/ltxtquery_io.c
+++ b/contrib/ltree/ltxtquery_io.c
@@ -59,7 +59,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint
 
 	for (;;)
 	{
-		charlen = pg_mblen(state->buf);
+		charlen = pg_mblen_cstr(state->buf);
 
 		switch (state->state)
 		{
@@ -83,7 +83,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint
 					*lenval = charlen;
 					*flag = 0;
 				}
-				else if (!t_isspace(state->buf))
+				else if (!t_isspace_unbounded(state->buf))
 					ereport(ERROR,
 							(errcode(ERRCODE_SYNTAX_ERROR),
 							 errmsg("operand syntax error")));
diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c
index b067dd3660e..ebf030d484c 100644
--- a/contrib/pageinspect/heapfuncs.c
+++ b/contrib/pageinspect/heapfuncs.c
@@ -101,7 +101,7 @@ text_to_bits(char *str, int len)
 			ereport(ERROR,
 					(errcode(ERRCODE_DATA_CORRUPTED),
 					 errmsg("invalid character \"%.*s\" in t_bits string",
-							pg_mblen(str + off), str + off)));
+							pg_mblen_cstr(str + off), str + off)));
 
 		if (off % 8 == 7)
 			bits[off / 8] = byte;
diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h
index 405a1d95528..06d3994e692 100644
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -52,10 +52,10 @@ typedef char trgm[3];
 } while(0)
 
 #ifdef KEEPONLYALNUM
-#define ISWORDCHR(c)	(t_isalpha(c) || t_isdigit(c))
+#define ISWORDCHR(c, len)	(t_isalpha_with_len(c, len) || t_isdigit_with_len(c, len))
 #define ISPRINTABLECHAR(a)	( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
 #else
-#define ISWORDCHR(c)	(!t_isspace(c))
+#define ISWORDCHR(c, len)	(!t_isspace_with_len(c, len))
 #define ISPRINTABLECHAR(a)	( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) )
 #endif
 #define ISPRINTABLETRGM(t)	( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index fb38135f7a3..63895c3017d 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -171,18 +171,29 @@ static char *
 find_word(char *str, int lenstr, char **endword, int *charlen)
 {
 	char	   *beginword = str;
+	const char *endstr = str + lenstr;
 
-	while (beginword - str < lenstr && !ISWORDCHR(beginword))
-		beginword += pg_mblen(beginword);
+	while (beginword < endstr)
+	{
+		int			clen = pg_mblen_range(beginword, endstr);
 
-	if (beginword - str >= lenstr)
+		if (ISWORDCHR(beginword, clen))
+			break;
+		beginword += clen;
+	}
+
+	if (beginword >= endstr)
 		return NULL;
 
 	*endword = beginword;
 	*charlen = 0;
-	while (*endword - str < lenstr && ISWORDCHR(*endword))
+	while (*endword < endstr)
 	{
-		*endword += pg_mblen(*endword);
+		int			clen = pg_mblen_range(*endword, endstr);
+
+		if (!ISWORDCHR(*endword, clen))
+			break;
+		*endword += clen;
 		(*charlen)++;
 	}
 
@@ -230,9 +241,9 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
 	if (bytelen > charlen)
 	{
 		/* Find multibyte character boundaries and apply compact_trigram */
-		int			lenfirst = pg_mblen(str),
-					lenmiddle = pg_mblen(str + lenfirst),
-					lenlast = pg_mblen(str + lenfirst + lenmiddle);
+		int			lenfirst = pg_mblen_unbounded(str),
+					lenmiddle = pg_mblen_unbounded(str + lenfirst),
+					lenlast = pg_mblen_unbounded(str + lenfirst + lenmiddle);
 
 		while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
 		{
@@ -243,7 +254,7 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
 
 			lenfirst = lenmiddle;
 			lenmiddle = lenlast;
-			lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
+			lenlast = pg_mblen_unbounded(ptr + lenfirst + lenmiddle);
 		}
 	}
 	else
@@ -723,6 +734,7 @@ get_wildcard_part(const char *str, int lenstr,
 {
 	const char *beginword = str;
 	const char *endword;
+	const char *endstr = str + lenstr;
 	char	   *s = buf;
 	bool		in_leading_wildcard_meta = false;
 	bool		in_trailing_wildcard_meta = false;
@@ -735,11 +747,13 @@ get_wildcard_part(const char *str, int lenstr,
 	 * from this loop to the next one, since we may exit at a word character
 	 * that is in_escape.
 	 */
-	while (beginword - str < lenstr)
+	while (beginword < endstr)
 	{
+		clen = pg_mblen_range(beginword, endstr);
+
 		if (in_escape)
 		{
-			if (ISWORDCHR(beginword))
+			if (ISWORDCHR(beginword, clen))
 				break;
 			in_escape = false;
 			in_leading_wildcard_meta = false;
@@ -750,12 +764,12 @@ get_wildcard_part(const char *str, int lenstr,
 				in_escape = true;
 			else if (ISWILDCARDCHAR(beginword))
 				in_leading_wildcard_meta = true;
-			else if (ISWORDCHR(beginword))
+			else if (ISWORDCHR(beginword, clen))
 				break;
 			else
 				in_leading_wildcard_meta = false;
 		}
-		beginword += pg_mblen(beginword);
+		beginword += clen;
 	}
 
 	/*
@@ -788,12 +802,12 @@ get_wildcard_part(const char *str, int lenstr,
 	 * string boundary.  Strip escapes during copy.
 	 */
 	endword = beginword;
-	while (endword - str < lenstr)
+	while (endword < endstr)
 	{
-		clen = pg_mblen(endword);
+		clen = pg_mblen_range(endword, endstr);
 		if (in_escape)
 		{
-			if (ISWORDCHR(endword))
+			if (ISWORDCHR(endword, clen))
 			{
 				memcpy(s, endword, clen);
 				(*charlen)++;
@@ -821,7 +835,7 @@ get_wildcard_part(const char *str, int lenstr,
 				in_trailing_wildcard_meta = true;
 				break;
 			}
-			else if (ISWORDCHR(endword))
+			else if (ISWORDCHR(endword, clen))
 			{
 				memcpy(s, endword, clen);
 				(*charlen)++;
diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c
index 3485a725cde..2b5cfe87299 100644
--- a/contrib/pg_trgm/trgm_regexp.c
+++ b/contrib/pg_trgm/trgm_regexp.c
@@ -480,7 +480,7 @@ static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
 static void RE_compile(regex_t *regex, text *text_re,
 					   int cflags, Oid collation);
 static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
-static bool convertPgWchar(pg_wchar c, trgm_mb_char *result);
+static int	convertPgWchar(pg_wchar c, trgm_mb_char *result);
 static void transformGraph(TrgmNFA *trgmNFA);
 static void processState(TrgmNFA *trgmNFA, TrgmState *state);
 static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
@@ -816,10 +816,11 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
 		for (j = 0; j < charsCount; j++)
 		{
 			trgm_mb_char c;
+			int			clen = convertPgWchar(chars[j], &c);
 
-			if (!convertPgWchar(chars[j], &c))
+			if (!clen)
 				continue;		/* ok to ignore it altogether */
-			if (ISWORDCHR(c.bytes))
+			if (ISWORDCHR(c.bytes, clen))
 				colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
 			else
 				colorInfo->containsNonWord = true;
@@ -831,13 +832,15 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
 
 /*
  * Convert pg_wchar to multibyte format.
- * Returns false if the character should be ignored completely.
+ * Returns 0 if the character should be ignored completely, else returns its
+ * byte length.
  */
-static bool
+static int
 convertPgWchar(pg_wchar c, trgm_mb_char *result)
 {
 	/* "s" has enough space for a multibyte character and a trailing NUL */
 	char		s[MAX_MULTIBYTE_CHAR_LEN + 1];
+	int			clen;
 
 	/*
 	 * We can ignore the NUL character, since it can never appear in a PG text
@@ -845,11 +848,11 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result)
 	 * reconstructing trigrams.
 	 */
 	if (c == 0)
-		return false;
+		return 0;
 
 	/* Do the conversion, making sure the result is NUL-terminated */
 	memset(s, 0, sizeof(s));
-	pg_wchar2mb_with_len(&c, s, 1);
+	clen = pg_wchar2mb_with_len(&c, s, 1);
 
 	/*
 	 * In IGNORECASE mode, we can ignore uppercase characters.  We assume that
@@ -871,7 +874,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result)
 		if (strcmp(lowerCased, s) != 0)
 		{
 			pfree(lowerCased);
-			return false;
+			return 0;
 		}
 		pfree(lowerCased);
 	}
@@ -879,7 +882,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result)
 
 	/* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
 	memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN);
-	return true;
+	return clen;
 }
 
 
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c
index 2b3819fb2e8..b3cf893417d 100644
--- a/contrib/unaccent/unaccent.c
+++ b/contrib/unaccent/unaccent.c
@@ -149,9 +149,9 @@ initTrie(const char *filename)
 				state = 0;
 				for (ptr = line; *ptr; ptr += ptrlen)
 				{
-					ptrlen = pg_mblen(ptr);
+					ptrlen = pg_mblen_cstr(ptr);
 					/* ignore whitespace, but end src or trg */
-					if (t_isspace(ptr))
+					if (t_isspace_cstr(ptr))
 					{
 						if (state == 1)
 							state = 2;
@@ -315,6 +315,7 @@ unaccent_lexize(PG_FUNCTION_ARGS)
 	char	   *srcchar = (char *) PG_GETARG_POINTER(1);
 	int32		len = PG_GETARG_INT32(2);
 	char	   *srcstart = srcchar;
+	const char *srcend = srcstart + len;
 	TSLexeme   *res;
 	StringInfoData buf;
 
@@ -342,7 +343,7 @@ unaccent_lexize(PG_FUNCTION_ARGS)
 		}
 		else
 		{
-			matchlen = pg_mblen(srcchar);
+			matchlen = pg_mblen_range(srcchar, srcend);
 			if (buf.data != NULL)
 				appendBinaryStringInfo(&buf, srcchar, matchlen);
 		}
diff --git a/src/backend/catalog/pg_proc.c b/src/backend/catalog/pg_proc.c
index 821c2bef444..d043886d04e 100644
--- a/src/backend/catalog/pg_proc.c
+++ b/src/backend/catalog/pg_proc.c
@@ -1170,7 +1170,7 @@ match_prosrc_to_literal(const char *prosrc, const char *literal,
 			if (cursorpos > 0)
 				newcp++;
 		}
-		chlen = pg_mblen(prosrc);
+		chlen = pg_mblen_cstr(prosrc);
 		if (strncmp(prosrc, literal, chlen) != 0)
 			goto fail;
 		prosrc += chlen;
diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c
index ed885ca5551..8c99ecaa0a0 100644
--- a/src/backend/tsearch/dict_synonym.c
+++ b/src/backend/tsearch/dict_synonym.c
@@ -47,8 +47,8 @@ findwrd(char *in, char **end, uint16 *flags)
 	char	   *lastchar;
 
 	/* Skip leading spaces */
-	while (*in && t_isspace(in))
-		in += pg_mblen(in);
+	while (*in && t_isspace_cstr(in))
+		in += pg_mblen_cstr(in);
 
 	/* Return NULL on empty lines */
 	if (*in == '\0')
@@ -60,10 +60,10 @@ findwrd(char *in, char **end, uint16 *flags)
 	lastchar = start = in;
 
 	/* Find end of word */
-	while (*in && !t_isspace(in))
+	while (*in && !t_isspace_cstr(in))
 	{
 		lastchar = in;
-		in += pg_mblen(in);
+		in += pg_mblen_cstr(in);
 	}
 
 	if (in - lastchar == 1 && t_iseq(lastchar, '*') && flags)
diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c
index a95ed0891dd..45686654c93 100644
--- a/src/backend/tsearch/dict_thesaurus.c
+++ b/src/backend/tsearch/dict_thesaurus.c
@@ -190,8 +190,8 @@ thesaurusRead(const char *filename, DictThesaurus *d)
 		ptr = line;
 
 		/* is it a comment? */
-		while (*ptr && t_isspace(ptr))
-			ptr += pg_mblen(ptr);
+		while (*ptr && t_isspace_cstr(ptr))
+			ptr += pg_mblen_cstr(ptr);
 
 		if (t_iseq(ptr, '#') || *ptr == '\0' ||
 			t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
@@ -212,7 +212,7 @@ thesaurusRead(const char *filename, DictThesaurus *d)
 								 errmsg("unexpected delimiter")));
 					state = TR_WAITSUBS;
 				}
-				else if (!t_isspace(ptr))
+				else if (!t_isspace_cstr(ptr))
 				{
 					beginwrd = ptr;
 					state = TR_INLEX;
@@ -225,7 +225,7 @@ thesaurusRead(const char *filename, DictThesaurus *d)
 					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
 					state = TR_WAITSUBS;
 				}
-				else if (t_isspace(ptr))
+				else if (t_isspace_cstr(ptr))
 				{
 					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
 					state = TR_WAITLEX;
@@ -237,15 +237,15 @@ thesaurusRead(const char *filename, DictThesaurus *d)
 				{
 					useasis = true;
 					state = TR_INSUBS;
-					beginwrd = ptr + pg_mblen(ptr);
+					beginwrd = ptr + pg_mblen_cstr(ptr);
 				}
 				else if (t_iseq(ptr, '\\'))
 				{
 					useasis = false;
 					state = TR_INSUBS;
-					beginwrd = ptr + pg_mblen(ptr);
+					beginwrd = ptr + pg_mblen_cstr(ptr);
 				}
-				else if (!t_isspace(ptr))
+				else if (!t_isspace_cstr(ptr))
 				{
 					useasis = false;
 					beginwrd = ptr;
@@ -254,7 +254,7 @@ thesaurusRead(const char *filename, DictThesaurus *d)
 			}
 			else if (state == TR_INSUBS)
 			{
-				if (t_isspace(ptr))
+				if (t_isspace_cstr(ptr))
 				{
 					if (ptr == beginwrd)
 						ereport(ERROR,
@@ -267,7 +267,7 @@ thesaurusRead(const char *filename, DictThesaurus *d)
 			else
 				elog(ERROR, "unrecognized thesaurus state: %d", state);
 
-			ptr += pg_mblen(ptr);
+			ptr += pg_mblen_cstr(ptr);
 		}
 
 		if (state == TR_INSUBS)
diff --git a/src/backend/tsearch/regis.c b/src/backend/tsearch/regis.c
index 80017177222..d890b65b063 100644
--- a/src/backend/tsearch/regis.c
+++ b/src/backend/tsearch/regis.c
@@ -37,7 +37,7 @@ RS_isRegis(const char *str)
 	{
 		if (state == RS_IN_WAIT)
 		{
-			if (t_isalpha(c))
+			if (t_isalpha_cstr(c))
 				 /* okay */ ;
 			else if (t_iseq(c, '['))
 				state = RS_IN_ONEOF;
@@ -48,14 +48,14 @@ RS_isRegis(const char *str)
 		{
 			if (t_iseq(c, '^'))
 				state = RS_IN_NONEOF;
-			else if (t_isalpha(c))
+			else if (t_isalpha_cstr(c))
 				state = RS_IN_ONEOF_IN;
 			else
 				return false;
 		}
 		else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
 		{
-			if (t_isalpha(c))
+			if (t_isalpha_cstr(c))
 				 /* okay */ ;
 			else if (t_iseq(c, ']'))
 				state = RS_IN_WAIT;
@@ -64,7 +64,7 @@ RS_isRegis(const char *str)
 		}
 		else
 			elog(ERROR, "internal error in RS_isRegis: state %d", state);
-		c += pg_mblen(c);
+		c += pg_mblen_cstr(c);
 	}
 
 	return (state == RS_IN_WAIT);
@@ -96,15 +96,14 @@ RS_compile(Regis *r, bool issuffix, const char *str)
 	{
 		if (state == RS_IN_WAIT)
 		{
-			if (t_isalpha(c))
+			if (t_isalpha_cstr(c))
 			{
 				if (ptr)
 					ptr = newRegisNode(ptr, len);
 				else
 					ptr = r->node = newRegisNode(NULL, len);
-				COPYCHAR(ptr->data, c);
 				ptr->type = RSF_ONEOF;
-				ptr->len = pg_mblen(c);
+				ptr->len = ts_copychar_cstr(ptr->data, c);
 			}
 			else if (t_iseq(c, '['))
 			{
@@ -125,10 +124,9 @@ RS_compile(Regis *r, bool issuffix, const char *str)
 				ptr->type = RSF_NONEOF;
 				state = RS_IN_NONEOF;
 			}
-			else if (t_isalpha(c))
+			else if (t_isalpha_cstr(c))
 			{
-				COPYCHAR(ptr->data, c);
-				ptr->len = pg_mblen(c);
+				ptr->len = ts_copychar_cstr(ptr->data, c);
 				state = RS_IN_ONEOF_IN;
 			}
 			else				/* shouldn't get here */
@@ -136,11 +134,8 @@ RS_compile(Regis *r, bool issuffix, const char *str)
 		}
 		else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
 		{
-			if (t_isalpha(c))
-			{
-				COPYCHAR(ptr->data + ptr->len, c);
-				ptr->len += pg_mblen(c);
-			}
+			if (t_isalpha_cstr(c))
+				ptr->len += ts_copychar_cstr(ptr->data + ptr->len, c);
 			else if (t_iseq(c, ']'))
 				state = RS_IN_WAIT;
 			else				/* shouldn't get here */
@@ -148,7 +143,7 @@ RS_compile(Regis *r, bool issuffix, const char *str)
 		}
 		else
 			elog(ERROR, "internal error in RS_compile: state %d", state);
-		c += pg_mblen(c);
+		c += pg_mblen_cstr(c);
 	}
 
 	if (state != RS_IN_WAIT)	/* shouldn't get here */
@@ -187,10 +182,10 @@ mb_strchr(char *str, char *c)
 	char	   *ptr = str;
 	bool		res = false;
 
-	clen = pg_mblen(c);
+	clen = pg_mblen_cstr(c);
 	while (*ptr && !res)
 	{
-		plen = pg_mblen(ptr);
+		plen = pg_mblen_cstr(ptr);
 		if (plen == clen)
 		{
 			i = plen;
@@ -219,7 +214,7 @@ RS_execute(Regis *r, char *str)
 	while (*c)
 	{
 		len++;
-		c += pg_mblen(c);
+		c += pg_mblen_cstr(c);
 	}
 
 	if (len < r->nchar)
@@ -230,7 +225,7 @@ RS_execute(Regis *r, char *str)
 	{
 		len -= r->nchar;
 		while (len-- > 0)
-			c += pg_mblen(c);
+			c += pg_mblen_cstr(c);
 	}
 
 
@@ -250,7 +245,7 @@ RS_execute(Regis *r, char *str)
 				elog(ERROR, "unrecognized regis node type: %d", ptr->type);
 		}
 		ptr = ptr->next;
-		c += pg_mblen(c);
+		c += pg_mblen_cstr(c);
 	}
 
 	return true;
diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c
index 2c555ebdcce..f65c083c06f 100644
--- a/src/backend/tsearch/spell.c
+++ b/src/backend/tsearch/spell.c
@@ -232,7 +232,7 @@ findchar(char *str, int c)
 	{
 		if (t_iseq(str, c))
 			return str;
-		str += pg_mblen(str);
+		str += pg_mblen_cstr(str);
 	}
 
 	return NULL;
@@ -245,7 +245,7 @@ findchar2(char *str, int c1, int c2)
 	{
 		if (t_iseq(str, c1) || t_iseq(str, c2))
 			return str;
-		str += pg_mblen(str);
+		str += pg_mblen_cstr(str);
 	}
 
 	return NULL;
@@ -352,6 +352,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
 	char	   *next,
 			   *sbuf = *sflagset;
 	int			maxstep;
+	int			clen;
 	bool		stop = false;
 	bool		met_comma = false;
 
@@ -363,11 +364,11 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
 		{
 			case FM_LONG:
 			case FM_CHAR:
-				COPYCHAR(sflag, *sflagset);
-				sflag += pg_mblen(*sflagset);
+				clen = ts_copychar_cstr(sflag, *sflagset);
+				sflag += clen;
 
 				/* Go to start of the next flag */
-				*sflagset += pg_mblen(*sflagset);
+				*sflagset += clen;
 
 				/* Check if we get all characters of flag */
 				maxstep--;
@@ -391,7 +392,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
 				*sflagset = next;
 				while (**sflagset)
 				{
-					if (t_isdigit(*sflagset))
+					if (t_isdigit_cstr(*sflagset))
 					{
 						if (!met_comma)
 							ereport(ERROR,
@@ -409,7 +410,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
 											*sflagset)));
 						met_comma = true;
 					}
-					else if (!t_isspace(*sflagset))
+					else if (!t_isspace_cstr(*sflagset))
 					{
 						ereport(ERROR,
 								(errcode(ERRCODE_CONFIG_FILE_ERROR),
@@ -417,7 +418,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
 										*sflagset)));
 					}
 
-					*sflagset += pg_mblen(*sflagset);
+					*sflagset += pg_mblen_cstr(*sflagset);
 				}
 				stop = true;
 				break;
@@ -543,7 +544,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename)
 			while (*s)
 			{
 				/* we allow only single encoded flags for faster works */
-				if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
+				if (pg_mblen_cstr(s) == 1 && t_isprint_unbounded(s) && !t_isspace_unbounded(s))
 					s++;
 				else
 				{
@@ -559,12 +560,12 @@ NIImportDictionary(IspellDict *Conf, const char *filename)
 		s = line;
 		while (*s)
 		{
-			if (t_isspace(s))
+			if (t_isspace_cstr(s))
 			{
 				*s = '\0';
 				break;
 			}
-			s += pg_mblen(s);
+			s += pg_mblen_cstr(s);
 		}
 		pstr = lowerstr_ctx(Conf, line);
 
@@ -816,17 +817,17 @@ get_nextfield(char **str, char *next)
 
 	while (**str)
 	{
+		int			clen = pg_mblen_cstr(*str);
+
 		if (state == PAE_WAIT_MASK)
 		{
 			if (t_iseq(*str, '#'))
 				return false;
-			else if (!t_isspace(*str))
+			else if (!t_isspace_cstr(*str))
 			{
-				int			clen = pg_mblen(*str);
-
 				if (clen < avail)
 				{
-					COPYCHAR(next, *str);
+					ts_copychar_with_len(next, *str, clen);
 					next += clen;
 					avail -= clen;
 				}
@@ -835,24 +836,22 @@ get_nextfield(char **str, char *next)
 		}
 		else					/* state == PAE_INMASK */
 		{
-			if (t_isspace(*str))
+			if (t_isspace_cstr(*str))
 			{
 				*next = '\0';
 				return true;
 			}
 			else
 			{
-				int			clen = pg_mblen(*str);
-
 				if (clen < avail)
 				{
-					COPYCHAR(next, *str);
+					ts_copychar_with_len(next, *str, clen);
 					next += clen;
 					avail -= clen;
 				}
 			}
 		}
-		*str += pg_mblen(*str);
+		*str += clen;
 	}
 
 	*next = '\0';
@@ -942,14 +941,15 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
 
 	while (*str)
 	{
+		int			clen = pg_mblen_cstr(str);
+
 		if (state == PAE_WAIT_MASK)
 		{
 			if (t_iseq(str, '#'))
 				return false;
-			else if (!t_isspace(str))
+			else if (!t_isspace_cstr(str))
 			{
-				COPYCHAR(pmask, str);
-				pmask += pg_mblen(str);
+				pmask += ts_copychar_with_len(pmask, str, clen);
 				state = PAE_INMASK;
 			}
 		}
@@ -960,10 +960,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
 				*pmask = '\0';
 				state = PAE_WAIT_FIND;
 			}
-			else if (!t_isspace(str))
+			else if (!t_isspace_cstr(str))
 			{
-				COPYCHAR(pmask, str);
-				pmask += pg_mblen(str);
+				pmask += ts_copychar_with_len(pmask, str, clen);
 			}
 		}
 		else if (state == PAE_WAIT_FIND)
@@ -972,13 +971,12 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
 			{
 				state = PAE_INFIND;
 			}
-			else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
+			else if (t_isalpha_cstr(str) || t_iseq(str, '\'') /* english 's */ )
 			{
-				COPYCHAR(prepl, str);
-				prepl += pg_mblen(str);
+				prepl += ts_copychar_with_len(prepl, str, clen);
 				state = PAE_INREPL;
 			}
-			else if (!t_isspace(str))
+			else if (!t_isspace_cstr(str))
 				ereport(ERROR,
 						(errcode(ERRCODE_CONFIG_FILE_ERROR),
 						 errmsg("syntax error")));
@@ -990,12 +988,11 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
 				*pfind = '\0';
 				state = PAE_WAIT_REPL;
 			}
-			else if (t_isalpha(str))
+			else if (t_isalpha_cstr(str))
 			{
-				COPYCHAR(pfind, str);
-				pfind += pg_mblen(str);
+				pfind += ts_copychar_with_len(pfind, str, clen);
 			}
-			else if (!t_isspace(str))
+			else if (!t_isspace_cstr(str))
 				ereport(ERROR,
 						(errcode(ERRCODE_CONFIG_FILE_ERROR),
 						 errmsg("syntax error")));
@@ -1006,13 +1003,12 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
 			{
 				break;			/* void repl */
 			}
-			else if (t_isalpha(str))
+			else if (t_isalpha_cstr(str))
 			{
-				COPYCHAR(prepl, str);
-				prepl += pg_mblen(str);
+				prepl += ts_copychar_with_len(prepl, str, clen);
 				state = PAE_INREPL;
 			}
-			else if (!t_isspace(str))
+			else if (!t_isspace_cstr(str))
 				ereport(ERROR,
 						(errcode(ERRCODE_CONFIG_FILE_ERROR),
 						 errmsg("syntax error")));
@@ -1024,12 +1020,11 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
 				*prepl = '\0';
 				break;
 			}
-			else if (t_isalpha(str))
+			else if (t_isalpha_cstr(str))
 			{
-				COPYCHAR(prepl, str);
-				prepl += pg_mblen(str);
+				prepl += ts_copychar_with_len(prepl, str, clen);
 			}
-			else if (!t_isspace(str))
+			else if (!t_isspace_cstr(str))
 				ereport(ERROR,
 						(errcode(ERRCODE_CONFIG_FILE_ERROR),
 						 errmsg("syntax error")));
@@ -1037,7 +1032,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
 		else
 			elog(ERROR, "unrecognized state in parse_affentry: %d", state);
 
-		str += pg_mblen(str);
+		str += clen;
 	}
 
 	*pmask = *pfind = *prepl = '\0';
@@ -1090,10 +1085,9 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
 	CompoundAffixFlag *newValue;
 	char		sbuf[BUFSIZ];
 	char	   *sflag;
-	int			clen;
 
-	while (*s && t_isspace(s))
-		s += pg_mblen(s);
+	while (*s && t_isspace_cstr(s))
+		s += pg_mblen_cstr(s);
 
 	if (!*s)
 		ereport(ERROR,
@@ -1102,10 +1096,10 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
 
 	/* Get flag without \n */
 	sflag = sbuf;
-	while (*s && !t_isspace(s) && *s != '\n')
+	while (*s && !t_isspace_cstr(s) && *s != '\n')
 	{
-		clen = pg_mblen(s);
-		COPYCHAR(sflag, s);
+		int			clen = ts_copychar_cstr(sflag, s);
+
 		sflag += clen;
 		s += clen;
 	}
@@ -1248,7 +1242,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
 
 	while ((recoded = tsearch_readline(&trst)) != NULL)
 	{
-		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
+		if (*recoded == '\0' || t_isspace_cstr(recoded) || t_iseq(recoded, '#'))
 		{
 			pfree(recoded);
 			continue;
@@ -1285,8 +1279,8 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
 		{
 			char	   *s = recoded + strlen("FLAG");
 
-			while (*s && t_isspace(s))
-				s += pg_mblen(s);
+			while (*s && t_isspace_cstr(s))
+				s += pg_mblen_cstr(s);
 
 			if (*s)
 			{
@@ -1321,7 +1315,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
 	{
 		int			fields_read;
 
-		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
+		if (*recoded == '\0' || t_isspace_cstr(recoded) || t_iseq(recoded, '#'))
 			goto nextline;
 
 		fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
@@ -1484,12 +1478,12 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
 			s = findchar2(recoded, 'l', 'L');
 			if (s)
 			{
-				while (*s && !t_isspace(s))
-					s += pg_mblen(s);
-				while (*s && t_isspace(s))
-					s += pg_mblen(s);
+				while (*s && !t_isspace_cstr(s))
+					s += pg_mblen_cstr(s);
+				while (*s && t_isspace_cstr(s))
+					s += pg_mblen_cstr(s);
 
-				if (*s && pg_mblen(s) == 1)
+				if (*s && pg_mblen_cstr(s) == 1)
 				{
 					addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG);
 					Conf->usecompound = true;
@@ -1517,8 +1511,8 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
 			s = recoded + 4;	/* we need non-lowercased string */
 			flagflags = 0;
 
-			while (*s && t_isspace(s))
-				s += pg_mblen(s);
+			while (*s && t_isspace_cstr(s))
+				s += pg_mblen_cstr(s);
 
 			if (*s == '*')
 			{
@@ -1539,14 +1533,13 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
 			 * be followed by EOL, whitespace, or ':'.  Otherwise this is a
 			 * new-format flag command.
 			 */
-			if (*s && pg_mblen(s) == 1)
+			if (*s && pg_mblen_cstr(s) == 1)
 			{
-				COPYCHAR(flag, s);
+				flag[0] = *s++;
 				flag[1] = '\0';
 
-				s++;
 				if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' ||
-					t_isspace(s))
+					t_isspace_cstr(s))
 				{
 					oldformat = true;
 					goto nextline;
@@ -1769,7 +1762,7 @@ NISortDictionary(IspellDict *Conf)
 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
 							 errmsg("invalid affix alias \"%s\"",
 									Conf->Spell[i]->p.flag)));
-				if (*end != '\0' && !t_isdigit(end) && !t_isspace(end))
+				if (*end != '\0' && !t_isdigit_cstr(end) && !t_isspace_cstr(end))
 					ereport(ERROR,
 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
 							 errmsg("invalid affix alias \"%s\"",
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
index f918cc8908b..cc9e71ec708 100644
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -33,70 +33,43 @@ static void tsearch_readline_callback(void *arg);
  */
 #define WC_BUF_LEN  3
 
-int
-t_isdigit(const char *ptr)
-{
-	int			clen = pg_mblen(ptr);
-	wchar_t		character[WC_BUF_LEN];
-	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
-	pg_locale_t mylocale = 0;	/* TODO */
-
-	if (clen == 1 || lc_ctype_is_c(collation))
-		return isdigit(TOUCHAR(ptr));
-
-	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
-
-	return iswdigit((wint_t) character[0]);
-}
-
-int
-t_isspace(const char *ptr)
-{
-	int			clen = pg_mblen(ptr);
-	wchar_t		character[WC_BUF_LEN];
-	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
-	pg_locale_t mylocale = 0;	/* TODO */
-
-	if (clen == 1 || lc_ctype_is_c(collation))
-		return isspace(TOUCHAR(ptr));
-
-	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
-
-	return iswspace((wint_t) character[0]);
-}
-
-int
-t_isalpha(const char *ptr)
-{
-	int			clen = pg_mblen(ptr);
-	wchar_t		character[WC_BUF_LEN];
-	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
-	pg_locale_t mylocale = 0;	/* TODO */
-
-	if (clen == 1 || lc_ctype_is_c(collation))
-		return isalpha(TOUCHAR(ptr));
-
-	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
-
-	return iswalpha((wint_t) character[0]);
-}
-
-int
-t_isprint(const char *ptr)
-{
-	int			clen = pg_mblen(ptr);
-	wchar_t		character[WC_BUF_LEN];
-	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
-	pg_locale_t mylocale = 0;	/* TODO */
-
-	if (clen == 1 || lc_ctype_is_c(collation))
-		return isprint(TOUCHAR(ptr));
-
-	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
-
-	return iswprint((wint_t) character[0]);
+#define GENERATE_T_ISCLASS_DEF(character_class) \
+/* mblen shall be that of the first character */ \
+int \
+t_is##character_class##_with_len(const char *ptr, int mblen) \
+{ \
+	int			clen = pg_mblen_with_len(ptr, mblen); \
+	wchar_t		character[WC_BUF_LEN]; \
+	pg_locale_t mylocale = 0;	/* TODO */ \
+	if (clen == 1 || lc_ctype_is_c(DEFAULT_COLLATION_OID)) \
+		return is##character_class(TOUCHAR(ptr)); \
+	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); \
+	return isw##character_class((wint_t) character[0]); \
+} \
+\
+/* ptr shall point to a NUL-terminated string */ \
+int \
+t_is##character_class##_cstr(const char *ptr) \
+{ \
+	return t_is##character_class##_with_len(ptr, pg_mblen_cstr(ptr)); \
+} \
+/* ptr shall point to a string with pre-validated encoding */ \
+int \
+t_is##character_class##_unbounded(const char *ptr) \
+{ \
+	return t_is##character_class##_with_len(ptr, pg_mblen_unbounded(ptr)); \
+} \
+/* historical name for _unbounded */ \
+int \
+t_is##character_class(const char *ptr) \
+{ \
+	return t_is##character_class##_unbounded(ptr); \
 }
 
+GENERATE_T_ISCLASS_DEF(alpha)
+GENERATE_T_ISCLASS_DEF(digit)
+GENERATE_T_ISCLASS_DEF(print)
+GENERATE_T_ISCLASS_DEF(space)
 
 /*
  * Set up to read a file using tsearch_readline().  This facility is
diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c
index ed16a2e25a2..51902ab1e14 100644
--- a/src/backend/tsearch/ts_utils.c
+++ b/src/backend/tsearch/ts_utils.c
@@ -88,8 +88,8 @@ readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *))
 			char	   *pbuf = line;
 
 			/* Trim trailing space */
-			while (*pbuf && !t_isspace(pbuf))
-				pbuf += pg_mblen(pbuf);
+			while (*pbuf && !t_isspace_cstr(pbuf))
+				pbuf += pg_mblen_cstr(pbuf);
 			*pbuf = '\0';
 
 			/* Skip empty lines */
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index 290c2223205..48f9a87523c 100644
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -1728,7 +1728,8 @@ TParserGet(TParser *prs)
 			prs->state->charlen = 0;
 		else
 			prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
-				pg_mblen(prs->str + prs->state->posbyte);
+				pg_mblen_range(prs->str + prs->state->posbyte,
+							   prs->str + prs->lenstr);
 
 		Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
 		Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index 6dd93f9a322..06ea275c59c 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -172,7 +172,7 @@ hex_encode(const char *src, size_t len, char *dst)
 }
 
 static inline char
-get_hex(const char *cp)
+get_hex(const char *cp, const char *end)
 {
 	unsigned char c = (unsigned char) *cp;
 	int			res = -1;
@@ -184,7 +184,7 @@ get_hex(const char *cp)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("invalid hexadecimal digit: \"%.*s\"",
-						pg_mblen(cp), cp)));
+						pg_mblen_range(cp, end), cp)));
 
 	return (char) res;
 }
@@ -208,14 +208,14 @@ hex_decode(const char *src, size_t len, char *dst)
 			s++;
 			continue;
 		}
-		v1 = get_hex(s) << 4;
+		v1 = get_hex(s, srcend) << 4;
 		s++;
 		if (s >= srcend)
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("invalid hexadecimal data: odd number of digits")));
 
-		v2 = get_hex(s);
+		v2 = get_hex(s, srcend);
 		s++;
 		*p++ = v1 | v2;
 	}
@@ -344,7 +344,7 @@ pg_base64_decode(const char *src, size_t len, char *dst)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("invalid symbol \"%.*s\" found while decoding base64 sequence",
-								pg_mblen(s - 1), s - 1)));
+								pg_mblen_range(s - 1, srcend), s - 1)));
 		}
 		/* add it to buffer */
 		buf = (buf << 6) + b;
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 9169539b76d..5b617ff4619 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -1392,7 +1392,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
 					ereport(ERROR,
 							(errcode(ERRCODE_INVALID_DATETIME_FORMAT),
 							 errmsg("invalid datetime format separator: \"%s\"",
-									pnstrdup(str, pg_mblen(str)))));
+									pnstrdup(str, pg_mblen_cstr(str)))));
 
 				if (*str == ' ')
 					n->type = NODE_TYPE_SPACE;
@@ -1422,7 +1422,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
 					/* backslash quotes the next character, if any */
 					if (*str == '\\' && *(str + 1))
 						str++;
-					chlen = pg_mblen(str);
+					chlen = pg_mblen_cstr(str);
 					n->type = NODE_TYPE_CHAR;
 					memcpy(n->character, str, chlen);
 					n->character[chlen] = '\0';
@@ -1440,7 +1440,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
 				 */
 				if (*str == '\\' && *(str + 1) == '"')
 					str++;
-				chlen = pg_mblen(str);
+				chlen = pg_mblen_cstr(str);
 
 				if ((flags & DCH_FLAG) && is_separator_char(str))
 					n->type = NODE_TYPE_SEPARATOR;
@@ -2151,8 +2151,8 @@ asc_toupper_z(const char *buff)
 	do { \
 		if (S_THth(_suf)) \
 		{ \
-			if (*(ptr)) (ptr) += pg_mblen(ptr); \
-			if (*(ptr)) (ptr) += pg_mblen(ptr); \
+			if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \
+			if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \
 		} \
 	} while (0)
 
@@ -3365,7 +3365,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
 				 * insist that the consumed character match the format's
 				 * character.
 				 */
-				s += pg_mblen(s);
+				s += pg_mblen_cstr(s);
 			}
 			continue;
 		}
@@ -3387,11 +3387,11 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
 				if (extra_skip > 0)
 					extra_skip--;
 				else
-					s += pg_mblen(s);
+					s += pg_mblen_cstr(s);
 			}
 			else
 			{
-				int			chlen = pg_mblen(s);
+				int			chlen = pg_mblen_cstr(s);
 
 				/*
 				 * Standard mode requires strict match of format characters.
@@ -5563,13 +5563,15 @@ NUM_numpart_to_char(NUMProc *Np, int id)
 static void
 NUM_eat_non_data_chars(NUMProc *Np, int n, int input_len)
 {
+	const char *end = Np->inout + input_len;
+
 	while (n-- > 0)
 	{
 		if (OVERLOAD_TEST)
 			break;				/* end of input */
 		if (strchr("0123456789.,+-", *Np->inout_p) != NULL)
 			break;				/* it's a data character */
-		Np->inout_p += pg_mblen(Np->inout_p);
+		Np->inout_p += pg_mblen_range(Np->inout_p, end);
 	}
 }
 
@@ -6026,7 +6028,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 			}
 			else
 			{
-				Np->inout_p += pg_mblen(Np->inout_p);
+				Np->inout_p += pg_mblen_range(Np->inout_p, Np->inout + input_len);
 			}
 			continue;
 		}
diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c
index f6a074aa7d0..583c1712ad1 100644
--- a/src/backend/utils/adt/jsonfuncs.c
+++ b/src/backend/utils/adt/jsonfuncs.c
@@ -663,7 +663,7 @@ report_json_context(JsonLexContext *lex)
 	{
 		/* Advance to next multibyte character */
 		if (IS_HIGHBIT_SET(*context_start))
-			context_start += pg_mblen(context_start);
+			context_start += pg_mblen_range(context_start, context_end);
 		else
 			context_start++;
 	}
diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y
index 3377cc81cbf..30183f66b15 100644
--- a/src/backend/utils/adt/jsonpath_gram.y
+++ b/src/backend/utils/adt/jsonpath_gram.y
@@ -527,7 +527,7 @@ makeItemLikeRegex(JsonPathParseItem *expr, JsonPathString *pattern,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("invalid input syntax for type %s", "jsonpath"),
 						 errdetail("unrecognized flag character \"%.*s\" in LIKE_REGEX predicate",
-								   pg_mblen(flags->val + i), flags->val + i)));
+								   pg_mblen_range(flags->val + i, flags->val + flags->len), flags->val + i)));
 				break;
 		}
 	}
diff --git a/src/backend/utils/adt/levenshtein.c b/src/backend/utils/adt/levenshtein.c
index f8979776d0d..38d4f580d01 100644
--- a/src/backend/utils/adt/levenshtein.c
+++ b/src/backend/utils/adt/levenshtein.c
@@ -84,6 +84,8 @@ varstr_levenshtein(const char *source, int slen,
 	int			i,
 				j;
 	const char *y;
+	const char *send = source + slen;
+	const char *tend = target + tlen;
 
 	/*
 	 * For varstr_levenshtein_less_equal, we have real variables called
@@ -184,10 +186,10 @@ varstr_levenshtein(const char *source, int slen,
 #endif
 
 	/*
-	 * In order to avoid calling pg_mblen() repeatedly on each character in s,
-	 * we cache all the lengths before starting the main loop -- but if all
-	 * the characters in both strings are single byte, then we skip this and
-	 * use a fast-path in the main loop.  If only one string contains
+	 * In order to avoid calling pg_mblen_range() repeatedly on each character
+	 * in s, we cache all the lengths before starting the main loop -- but if
+	 * all the characters in both strings are single byte, then we skip this
+	 * and use a fast-path in the main loop.  If only one string contains
 	 * multi-byte characters, we still build the array, so that the fast-path
 	 * needn't deal with the case where the array hasn't been initialized.
 	 */
@@ -199,7 +201,7 @@ varstr_levenshtein(const char *source, int slen,
 		s_char_len = (int *) palloc((m + 1) * sizeof(int));
 		for (i = 0; i < m; ++i)
 		{
-			s_char_len[i] = pg_mblen(cp);
+			s_char_len[i] = pg_mblen_range(cp, send);
 			cp += s_char_len[i];
 		}
 		s_char_len[i] = 0;
@@ -225,7 +227,7 @@ varstr_levenshtein(const char *source, int slen,
 	{
 		int		   *temp;
 		const char *x = source;
-		int			y_char_len = n != tlen + 1 ? pg_mblen(y) : 1;
+		int			y_char_len = n != tlen + 1 ? pg_mblen_range(y, tend) : 1;
 
 #ifdef LEVENSHTEIN_LESS_EQUAL
 
diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c
index eed183cd0dc..080bb6af840 100644
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -54,20 +54,20 @@ static int	Generic_Text_IC_like(text *str, text *pat, Oid collation);
  *--------------------
  */
 static inline int
-wchareq(const char *p1, const char *p2)
+wchareq(const char *p1, int p1len, const char *p2, int p2len)
 {
-	int			p1_len;
+	int			p1clen;
 
 	/* Optimization:  quickly compare the first byte. */
 	if (*p1 != *p2)
 		return 0;
 
-	p1_len = pg_mblen(p1);
-	if (pg_mblen(p2) != p1_len)
+	p1clen = pg_mblen_with_len(p1, p1len);
+	if (pg_mblen_with_len(p2, p2len) != p1clen)
 		return 0;
 
 	/* They are the same length */
-	while (p1_len--)
+	while (p1clen--)
 	{
 		if (*p1++ != *p2++)
 			return 0;
@@ -106,11 +106,11 @@ SB_lower_char(unsigned char c, pg_locale_t locale, bool locale_is_c)
 #define NextByte(p, plen)	((p)++, (plen)--)
 
 /* Set up to compile like_match.c for multibyte characters */
-#define CHAREQ(p1, p2) wchareq((p1), (p2))
+#define CHAREQ(p1, p1len, p2, p2len) wchareq((p1), (p1len), (p2), (p2len))
 #define NextChar(p, plen) \
-	do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0)
+	do { int __l = pg_mblen_with_len((p), (plen)); (p) +=__l; (plen) -=__l; } while (0)
 #define CopyAdvChar(dst, src, srclen) \
-	do { int __l = pg_mblen(src); \
+	do { int __l = pg_mblen_with_len((src), (srclen)); \
 		 (srclen) -= __l; \
 		 while (__l-- > 0) \
 			 *(dst)++ = *(src)++; \
@@ -122,7 +122,7 @@ SB_lower_char(unsigned char c, pg_locale_t locale, bool locale_is_c)
 #include "like_match.c"
 
 /* Set up to compile like_match.c for single-byte characters */
-#define CHAREQ(p1, p2) (*(p1) == *(p2))
+#define CHAREQ(p1, p1len, p2, p2len) (*(p1) == *(p2))
 #define NextChar(p, plen) NextByte((p), (plen))
 #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)
 
diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c
index 2f32cdaf020..9df572e92bd 100644
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -294,6 +294,7 @@ do_like_escape(text *pat, text *esc)
 					 errhint("Escape string must be empty or one character.")));
 
 		e = VARDATA_ANY(esc);
+		elen = VARSIZE_ANY_EXHDR(esc);
 
 		/*
 		 * If specified escape is '\', just copy the pattern as-is.
@@ -312,7 +313,7 @@ do_like_escape(text *pat, text *esc)
 		afterescape = false;
 		while (plen > 0)
 		{
-			if (CHAREQ(p, e) && !afterescape)
+			if (CHAREQ(p, plen, e, elen) && !afterescape)
 			{
 				*r++ = '\\';
 				NextChar(p, plen);
diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c
index bd9e5f9e243..471c6967be0 100644
--- a/src/backend/utils/adt/oracle_compat.c
+++ b/src/backend/utils/adt/oracle_compat.c
@@ -150,8 +150,8 @@ lpad(PG_FUNCTION_ARGS)
 	char	   *ptr1,
 			   *ptr2,
 			   *ptr2start,
-			   *ptr2end,
 			   *ptr_ret;
+	const char *ptr2end;
 	int			m,
 				s1len,
 				s2len;
@@ -196,7 +196,7 @@ lpad(PG_FUNCTION_ARGS)
 
 	while (m--)
 	{
-		int			mlen = pg_mblen(ptr2);
+		int			mlen = pg_mblen_range(ptr2, ptr2end);
 
 		memcpy(ptr_ret, ptr2, mlen);
 		ptr_ret += mlen;
@@ -209,7 +209,7 @@ lpad(PG_FUNCTION_ARGS)
 
 	while (s1len--)
 	{
-		int			mlen = pg_mblen(ptr1);
+		int			mlen = pg_mblen_unbounded(ptr1);
 
 		memcpy(ptr_ret, ptr1, mlen);
 		ptr_ret += mlen;
@@ -248,8 +248,8 @@ rpad(PG_FUNCTION_ARGS)
 	char	   *ptr1,
 			   *ptr2,
 			   *ptr2start,
-			   *ptr2end,
 			   *ptr_ret;
+	const char *ptr2end;
 	int			m,
 				s1len,
 				s2len;
@@ -288,11 +288,12 @@ rpad(PG_FUNCTION_ARGS)
 	m = len - s1len;
 
 	ptr1 = VARDATA_ANY(string1);
+
 	ptr_ret = VARDATA(ret);
 
 	while (s1len--)
 	{
-		int			mlen = pg_mblen(ptr1);
+		int			mlen = pg_mblen_unbounded(ptr1);
 
 		memcpy(ptr_ret, ptr1, mlen);
 		ptr_ret += mlen;
@@ -304,7 +305,7 @@ rpad(PG_FUNCTION_ARGS)
 
 	while (m--)
 	{
-		int			mlen = pg_mblen(ptr2);
+		int			mlen = pg_mblen_range(ptr2, ptr2end);
 
 		memcpy(ptr_ret, ptr2, mlen);
 		ptr_ret += mlen;
@@ -389,6 +390,7 @@ dotrim(const char *string, int stringlen,
 			 */
 			const char **stringchars;
 			const char **setchars;
+			const char *setend;
 			int		   *stringmblen;
 			int		   *setmblen;
 			int			stringnchars;
@@ -396,6 +398,7 @@ dotrim(const char *string, int stringlen,
 			int			resultndx;
 			int			resultnchars;
 			const char *p;
+			const char *pend;
 			int			len;
 			int			mblen;
 			const char *str_pos;
@@ -406,10 +409,11 @@ dotrim(const char *string, int stringlen,
 			stringnchars = 0;
 			p = string;
 			len = stringlen;
+			pend = p + len;
 			while (len > 0)
 			{
 				stringchars[stringnchars] = p;
-				stringmblen[stringnchars] = mblen = pg_mblen(p);
+				stringmblen[stringnchars] = mblen = pg_mblen_range(p, pend);
 				stringnchars++;
 				p += mblen;
 				len -= mblen;
@@ -420,10 +424,11 @@ dotrim(const char *string, int stringlen,
 			setnchars = 0;
 			p = set;
 			len = setlen;
+			setend = set + setlen;
 			while (len > 0)
 			{
 				setchars[setnchars] = p;
-				setmblen[setnchars] = mblen = pg_mblen(p);
+				setmblen[setnchars] = mblen = pg_mblen_range(p, setend);
 				setnchars++;
 				p += mblen;
 				len -= mblen;
@@ -801,6 +806,8 @@ translate(PG_FUNCTION_ARGS)
 			   *to_end;
 	char	   *source,
 			   *target;
+	const char *source_end;
+	const char *from_end;
 	int			m,
 				fromlen,
 				tolen,
@@ -815,9 +822,11 @@ translate(PG_FUNCTION_ARGS)
 	if (m <= 0)
 		PG_RETURN_TEXT_P(string);
 	source = VARDATA_ANY(string);
+	source_end = source + m;
 
 	fromlen = VARSIZE_ANY_EXHDR(from);
 	from_ptr = VARDATA_ANY(from);
+	from_end = from_ptr + fromlen;
 	tolen = VARSIZE_ANY_EXHDR(to);
 	to_ptr = VARDATA_ANY(to);
 	to_end = to_ptr + tolen;
@@ -840,12 +849,12 @@ translate(PG_FUNCTION_ARGS)
 
 	while (m > 0)
 	{
-		source_len = pg_mblen(source);
+		source_len = pg_mblen_range(source, source_end);
 		from_index = 0;
 
 		for (i = 0; i < fromlen; i += len)
 		{
-			len = pg_mblen(&from_ptr[i]);
+			len = pg_mblen_range(&from_ptr[i], from_end);
 			if (len == source_len &&
 				memcmp(source, &from_ptr[i], len) == 0)
 				break;
@@ -861,11 +870,11 @@ translate(PG_FUNCTION_ARGS)
 			{
 				if (p >= to_end)
 					break;
-				p += pg_mblen(p);
+				p += pg_mblen_range(p, to_end);
 			}
 			if (p < to_end)
 			{
-				len = pg_mblen(p);
+				len = pg_mblen_range(p, to_end);
 				memcpy(target, p, len);
 				target += len;
 				retlen += len;
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 4d2ea4848fb..0eaa528dc5c 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -424,7 +424,7 @@ parse_re_flags(pg_re_flags *flags, text *opts)
 					ereport(ERROR,
 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 							 errmsg("invalid regular expression option: \"%.*s\"",
-									pg_mblen(opt_p + i), opt_p + i)));
+									pg_mblen_range(opt_p + i, opt_p + opt_len), opt_p + i)));
 					break;
 			}
 		}
@@ -672,6 +672,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
 			   *r;
 	int			plen,
 				elen;
+	const char *pend;
 	bool		afterescape = false;
 	int			nquotes = 0;
 	int			bracket_depth = 0;	/* square bracket nesting level */
@@ -679,6 +680,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
 
 	p = VARDATA_ANY(pat_text);
 	plen = VARSIZE_ANY_EXHDR(pat_text);
+	pend = p + plen;
 	if (esc_text == NULL)
 	{
 		/* No ESCAPE clause provided; default to backslash as escape */
@@ -778,7 +780,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
 
 		if (elen > 1)
 		{
-			int			mblen = pg_mblen(p);
+			int			mblen = pg_mblen_range(p, pend);
 
 			if (mblen > 1)
 			{
diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c
index dde3a90cc3b..370f57ede58 100644
--- a/src/backend/utils/adt/tsquery.c
+++ b/src/backend/utils/adt/tsquery.c
@@ -109,7 +109,7 @@ get_modifiers(char *buf, int16 *weight, bool *prefix)
 		return buf;
 
 	buf++;
-	while (*buf && pg_mblen(buf) == 1)
+	while (*buf && pg_mblen_cstr(buf) == 1)
 	{
 		switch (*buf)
 		{
@@ -186,7 +186,7 @@ parse_phrase_operator(TSQueryParserState pstate, int16 *distance)
 					continue;
 				}
 
-				if (!t_isdigit(ptr))
+				if (!t_isdigit_cstr(ptr))
 					return false;
 
 				errno = 0;
@@ -248,12 +248,12 @@ parse_or_operator(TSQueryParserState pstate)
 		return false;
 
 	/* it shouldn't be a part of any word */
-	if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha(ptr) || t_isdigit(ptr))
+	if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha_cstr(ptr) || t_isdigit_cstr(ptr))
 		return false;
 
 	for (;;)
 	{
-		ptr += pg_mblen(ptr);
+		ptr += pg_mblen_cstr(ptr);
 
 		if (*ptr == '\0')		/* got end of string without operand */
 			return false;
@@ -263,7 +263,7 @@ parse_or_operator(TSQueryParserState pstate)
 		 * So we still treat OR literal as operation with possibly incorrect
 		 * operand and  will not search it as lexeme
 		 */
-		if (!t_isspace(ptr))
+		if (!t_isspace_cstr(ptr))
 			break;
 	}
 
@@ -306,7 +306,7 @@ gettoken_query_standard(TSQueryParserState state, int8 *operator,
 							 errmsg("syntax error in tsquery: \"%s\"",
 									state->buffer)));
 				}
-				else if (!t_isspace(state->buf))
+				else if (!t_isspace_cstr(state->buf))
 				{
 					/*
 					 * We rely on the tsvector parser to parse the value for
@@ -364,14 +364,14 @@ gettoken_query_standard(TSQueryParserState state, int8 *operator,
 				{
 					return (state->count) ? PT_ERR : PT_END;
 				}
-				else if (!t_isspace(state->buf))
+				else if (!t_isspace_cstr(state->buf))
 				{
 					return PT_ERR;
 				}
 				break;
 		}
 
-		state->buf += pg_mblen(state->buf);
+		state->buf += pg_mblen_cstr(state->buf);
 	}
 }
 
@@ -425,7 +425,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
 					state->state = WAITOPERAND;
 					continue;
 				}
-				else if (!t_isspace(state->buf))
+				else if (!t_isspace_cstr(state->buf))
 				{
 					/*
 					 * We rely on the tsvector parser to parse the value for
@@ -468,7 +468,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
 					state->buf++;
 					continue;
 				}
-				else if (!t_isspace(state->buf))
+				else if (!t_isspace_cstr(state->buf))
 				{
 					/* insert implicit AND between operands */
 					state->state = WAITOPERAND;
@@ -478,7 +478,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
 				break;
 		}
 
-		state->buf += pg_mblen(state->buf);
+		state->buf += pg_mblen_cstr(state->buf);
 	}
 }
 
@@ -961,9 +961,8 @@ infix(INFIX *in, int parentPriority, bool rightPhraseOp)
 				*(in->cur) = '\\';
 				in->cur++;
 			}
-			COPYCHAR(in->cur, op);
 
-			clen = pg_mblen(op);
+			clen = ts_copychar_cstr(in->cur, op);
 			op += clen;
 			in->cur += clen;
 		}
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c
index 3831dd3c806..bc600915f74 100644
--- a/src/backend/utils/adt/tsvector.c
+++ b/src/backend/utils/adt/tsvector.c
@@ -313,9 +313,9 @@ tsvectorout(PG_FUNCTION_ARGS)
 				lenbuf = 0,
 				pp;
 	WordEntry  *ptr = ARRPTR(out);
-	char	   *curbegin,
-			   *curin,
+	char	   *curin,
 			   *curout;
+	const char *curend;
 
 	lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
 	for (i = 0; i < out->size; i++)
@@ -328,13 +328,14 @@ tsvectorout(PG_FUNCTION_ARGS)
 	curout = outbuf = (char *) palloc(lenbuf);
 	for (i = 0; i < out->size; i++)
 	{
-		curbegin = curin = STRPTR(out) + ptr->pos;
+		curin = STRPTR(out) + ptr->pos;
+		curend = curin + ptr->len;
 		if (i != 0)
 			*curout++ = ' ';
 		*curout++ = '\'';
-		while (curin - curbegin < ptr->len)
+		while (curin < curend)
 		{
-			int			len = pg_mblen(curin);
+			int			len = pg_mblen_range(curin, curend);
 
 			if (t_iseq(curin, '\''))
 				*curout++ = '\'';
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 4237806d6d8..d9562e209da 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -2434,11 +2434,15 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws)
 	if (ws)
 	{
 		char	   *buf;
+		const char *end;
 
 		buf = VARDATA_ANY(ws);
-		while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws))
+		end = buf + VARSIZE_ANY_EXHDR(ws);
+		while (buf < end)
 		{
-			if (pg_mblen(buf) == 1)
+			int			len = pg_mblen_range(buf, end);
+
+			if (len == 1)
 			{
 				switch (*buf)
 				{
@@ -2462,7 +2466,7 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws)
 						stat->weight |= 0;
 				}
 			}
-			buf += pg_mblen(buf);
+			buf += len;
 		}
 	}
 
diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c
index c2df4093e6b..1187f0af523 100644
--- a/src/backend/utils/adt/tsvector_parser.c
+++ b/src/backend/utils/adt/tsvector_parser.c
@@ -185,10 +185,9 @@ gettoken_tsvector(TSVectorParseState state,
 			else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
 					 (state->is_web && t_iseq(state->prsbuf, '"')))
 				PRSSYNTAXERROR;
-			else if (!t_isspace(state->prsbuf))
+			else if (!t_isspace_cstr(state->prsbuf))
 			{
-				COPYCHAR(curpos, state->prsbuf);
-				curpos += pg_mblen(state->prsbuf);
+				curpos += ts_copychar_cstr(curpos, state->prsbuf);
 				statecode = WAITENDWORD;
 			}
 		}
@@ -202,8 +201,7 @@ gettoken_tsvector(TSVectorParseState state,
 			else
 			{
 				RESIZEPRSBUF;
-				COPYCHAR(curpos, state->prsbuf);
-				curpos += pg_mblen(state->prsbuf);
+				curpos += ts_copychar_cstr(curpos, state->prsbuf);
 				Assert(oldstate != 0);
 				statecode = oldstate;
 			}
@@ -215,7 +213,7 @@ gettoken_tsvector(TSVectorParseState state,
 				statecode = WAITNEXTCHAR;
 				oldstate = WAITENDWORD;
 			}
-			else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
+			else if (t_isspace_cstr(state->prsbuf) || *(state->prsbuf) == '\0' ||
 					 (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
 					 (state->is_web && t_iseq(state->prsbuf, '"')))
 			{
@@ -238,8 +236,7 @@ gettoken_tsvector(TSVectorParseState state,
 			else
 			{
 				RESIZEPRSBUF;
-				COPYCHAR(curpos, state->prsbuf);
-				curpos += pg_mblen(state->prsbuf);
+				curpos += ts_copychar_cstr(curpos, state->prsbuf);
 			}
 		}
 		else if (statecode == WAITENDCMPLX)
@@ -258,8 +255,7 @@ gettoken_tsvector(TSVectorParseState state,
 			else
 			{
 				RESIZEPRSBUF;
-				COPYCHAR(curpos, state->prsbuf);
-				curpos += pg_mblen(state->prsbuf);
+				curpos += ts_copychar_cstr(curpos, state->prsbuf);
 			}
 		}
 		else if (statecode == WAITCHARCMPLX)
@@ -267,8 +263,7 @@ gettoken_tsvector(TSVectorParseState state,
 			if (!state->is_web && t_iseq(state->prsbuf, '\''))
 			{
 				RESIZEPRSBUF;
-				COPYCHAR(curpos, state->prsbuf);
-				curpos += pg_mblen(state->prsbuf);
+				curpos += ts_copychar_cstr(curpos, state->prsbuf);
 				statecode = WAITENDCMPLX;
 			}
 			else
@@ -279,7 +274,7 @@ gettoken_tsvector(TSVectorParseState state,
 					PRSSYNTAXERROR;
 				if (state->oprisdelim)
 				{
-					/* state->prsbuf+=pg_mblen(state->prsbuf); */
+					/* state->prsbuf+=pg_mblen_cstr(state->prsbuf); */
 					RETURN_TOKEN;
 				}
 				else
@@ -296,7 +291,7 @@ gettoken_tsvector(TSVectorParseState state,
 		}
 		else if (statecode == INPOSINFO)
 		{
-			if (t_isdigit(state->prsbuf))
+			if (t_isdigit_cstr(state->prsbuf))
 			{
 				if (posalen == 0)
 				{
@@ -351,10 +346,10 @@ gettoken_tsvector(TSVectorParseState state,
 					PRSSYNTAXERROR;
 				WEP_SETWEIGHT(pos[npos - 1], 0);
 			}
-			else if (t_isspace(state->prsbuf) ||
+			else if (t_isspace_cstr(state->prsbuf) ||
 					 *(state->prsbuf) == '\0')
 				RETURN_TOKEN;
-			else if (!t_isdigit(state->prsbuf))
+			else if (!t_isdigit_cstr(state->prsbuf))
 				PRSSYNTAXERROR;
 		}
 		else					/* internal error */
@@ -362,6 +357,6 @@ gettoken_tsvector(TSVectorParseState state,
 				 statecode);
 
 		/* get next char */
-		state->prsbuf += pg_mblen(state->prsbuf);
+		state->prsbuf += pg_mblen_cstr(state->prsbuf);
 	}
 }
diff --git a/src/backend/utils/adt/varbit.c b/src/backend/utils/adt/varbit.c
index 0d0c0fd9f3c..b008959ab54 100644
--- a/src/backend/utils/adt/varbit.c
+++ b/src/backend/utils/adt/varbit.c
@@ -232,7 +232,7 @@ bit_in(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 						 errmsg("\"%.*s\" is not a valid binary digit",
-								pg_mblen(sp), sp)));
+								pg_mblen_cstr(sp), sp)));
 
 			x >>= 1;
 			if (x == 0)
@@ -257,7 +257,7 @@ bit_in(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 						 errmsg("\"%.*s\" is not a valid hexadecimal digit",
-								pg_mblen(sp), sp)));
+								pg_mblen_cstr(sp), sp)));
 
 			if (bc)
 			{
@@ -533,7 +533,7 @@ varbit_in(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 						 errmsg("\"%.*s\" is not a valid binary digit",
-								pg_mblen(sp), sp)));
+								pg_mblen_cstr(sp), sp)));
 
 			x >>= 1;
 			if (x == 0)
@@ -558,7 +558,7 @@ varbit_in(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 						 errmsg("\"%.*s\" is not a valid hexadecimal digit",
-								pg_mblen(sp), sp)));
+								pg_mblen_cstr(sp), sp)));
 
 			if (bc)
 			{
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index d62cbdce325..b79ae35a0d2 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -779,8 +779,11 @@ text_catenate(text *t1, text *t2)
  * charlen_to_bytelen()
  *	Compute the number of bytes occupied by n characters starting at *p
  *
- * It is caller's responsibility that there actually are n characters;
- * the string need not be null-terminated.
+ * The caller shall ensure there are n complete characters.  Callers achieve
+ * this by deriving "n" from regmatch_t findings from searching a wchar array.
+ * pg_mb2wchar_with_len() skips any trailing incomplete character, so regex
+ * matches will end no later than the last complete character.  (The string
+ * need not be null-terminated.)
  */
 static int
 charlen_to_bytelen(const char *p, int n)
@@ -795,7 +798,7 @@ charlen_to_bytelen(const char *p, int n)
 		const char *s;
 
 		for (s = p; n > 0; n--)
-			s += pg_mblen(s);
+			s += pg_mblen_unbounded(s); /* caller verified encoding */
 
 		return s - p;
 	}
@@ -928,6 +931,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 		int32		slice_start;
 		int32		slice_size;
 		int32		slice_strlen;
+		int32		slice_len;
 		text	   *slice;
 		int32		E1;
 		int32		i;
@@ -997,7 +1001,8 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 			slice = (text *) DatumGetPointer(str);
 
 		/* see if we got back an empty string */
-		if (VARSIZE_ANY_EXHDR(slice) == 0)
+		slice_len = VARSIZE_ANY_EXHDR(slice);
+		if (slice_len == 0)
 		{
 			if (slice != (text *) DatumGetPointer(str))
 				pfree(slice);
@@ -1006,7 +1011,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 
 		/* Now we can get the actual length of the slice in MB characters */
 		slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
-											VARSIZE_ANY_EXHDR(slice));
+											slice_len);
 
 		/*
 		 * Check that the start position wasn't > slice_strlen. If so, SQL99
@@ -1033,7 +1038,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 		 */
 		p = VARDATA_ANY(slice);
 		for (i = 0; i < S1 - 1; i++)
-			p += pg_mblen(p);
+			p += pg_mblen_unbounded(p);
 
 		/* hang onto a pointer to our start position */
 		s = p;
@@ -1043,7 +1048,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 		 * length.
 		 */
 		for (i = S1; i < E1; i++)
-			p += pg_mblen(p);
+			p += pg_mblen_unbounded(p);
 
 		ret = (text *) palloc(VARHDRSZ + (p - s));
 		SET_VARSIZE(ret, VARHDRSZ + (p - s));
@@ -1350,6 +1355,8 @@ retry:
 	 */
 	if (state->is_multibyte_char_in_char)
 	{
+		const char *haystack_end = state->str1 + state->len1;
+
 		/* Walk one character at a time, until we reach the match. */
 
 		/* the search should never move backwards. */
@@ -1358,7 +1365,7 @@ retry:
 		while (state->refpoint < matchptr)
 		{
 			/* step to next character. */
-			state->refpoint += pg_mblen(state->refpoint);
+			state->refpoint += pg_mblen_range(state->refpoint, haystack_end);
 			state->refpos++;
 
 			/*
@@ -1473,7 +1480,8 @@ text_position_get_match_pos(TextPositionState *state)
 		/* Convert the byte position to char position. */
 		while (state->refpoint < state->last_match)
 		{
-			state->refpoint += pg_mblen(state->refpoint);
+			state->refpoint += pg_mblen_range(state->refpoint,
+											  state->last_match);
 			state->refpos++;
 		}
 		Assert(state->refpoint == state->last_match);
@@ -4368,7 +4376,7 @@ check_replace_text_has_escape_char(const text *replace_text)
 	}
 	else
 	{
-		for (; p < p_end; p += pg_mblen(p))
+		for (; p < p_end; p += pg_mblen_range(p, p_end))
 		{
 			if (*p == '\\')
 				return true;
@@ -4408,7 +4416,7 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
 		}
 		else
 		{
-			for (; p < p_end && *p != '\\'; p += pg_mblen(p))
+			for (; p < p_end && *p != '\\'; p += pg_mblen_range(p, p_end))
 				 /* nothing */ ;
 		}
 
@@ -4966,6 +4974,8 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
 	}
 	else
 	{
+		const char *end_ptr;
+
 		/*
 		 * When fldsep is NULL, each character in the input string becomes a
 		 * separate element in the result set.  The separator is effectively
@@ -4974,10 +4984,11 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
 		inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
 
 		start_ptr = VARDATA_ANY(inputstring);
+		end_ptr = start_ptr + inputstring_len;
 
 		while (inputstring_len > 0)
 		{
-			int			chunk_len = pg_mblen(start_ptr);
+			int			chunk_len = pg_mblen_range(start_ptr, end_ptr);
 
 			CHECK_FOR_INTERRUPTS();
 
@@ -5656,7 +5667,7 @@ text_reverse(PG_FUNCTION_ARGS)
 		{
 			int			sz;
 
-			sz = pg_mblen(p);
+			sz = pg_mblen_range(p, endp);
 			dst -= sz;
 			memcpy(dst, p, sz);
 			p += sz;
@@ -5817,7 +5828,7 @@ text_format(PG_FUNCTION_ARGS)
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("unrecognized format() type specifier \"%.*s\"",
-							pg_mblen(cp), cp),
+							pg_mblen_range(cp, end_ptr), cp),
 					 errhint("For a single \"%%\" use \"%%%%\".")));
 
 		/* If indirect width was specified, get its value */
@@ -5938,7 +5949,7 @@ text_format(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("unrecognized format() type specifier \"%.*s\"",
-								pg_mblen(cp), cp),
+								pg_mblen_range(cp, end_ptr), cp),
 						 errhint("For a single \"%%\" use \"%%%%\".")));
 				break;
 		}
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index 0137ff90f0a..98dcc04122b 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -2037,8 +2037,7 @@ sqlchar_to_unicode(const char *s)
 	char	   *utf8string;
 	pg_wchar	ret[2];			/* need space for trailing zero */
 
-	/* note we're not assuming s is null-terminated */
-	utf8string = pg_server_to_any(s, pg_mblen(s), PG_UTF8);
+	utf8string = pg_server_to_any(s, pg_mblen_cstr(s), PG_UTF8);
 
 	pg_encoding_mb2wchar_with_len(PG_UTF8, utf8string, ret,
 								  pg_encoding_mblen(PG_UTF8, utf8string));
@@ -2091,7 +2090,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped,
 
 	initStringInfo(&buf);
 
-	for (p = ident; *p; p += pg_mblen(p))
+	for (p = ident; *p; p += pg_mblen_cstr(p))
 	{
 		if (*p == ':' && (p == ident || fully_escaped))
 			appendStringInfoString(&buf, "_x003A_");
@@ -2116,7 +2115,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped,
 				: !is_valid_xml_namechar(u))
 				appendStringInfo(&buf, "_x%04X_", (unsigned int) u);
 			else
-				appendBinaryStringInfo(&buf, p, pg_mblen(p));
+				appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p));
 		}
 	}
 
@@ -2139,7 +2138,7 @@ map_xml_name_to_sql_identifier(const char *name)
 
 	initStringInfo(&buf);
 
-	for (p = name; *p; p += pg_mblen(p))
+	for (p = name; *p; p += pg_mblen_cstr(p))
 	{
 		if (*p == '_' && *(p + 1) == 'x'
 			&& isxdigit((unsigned char) *(p + 2))
@@ -2157,7 +2156,7 @@ map_xml_name_to_sql_identifier(const char *name)
 			p += 6;
 		}
 		else
-			appendBinaryStringInfo(&buf, p, pg_mblen(p));
+			appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p));
 	}
 
 	return buf.data;
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index b308ff826fe..c7b9a403a7f 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -38,6 +38,7 @@
 #include "catalog/namespace.h"
 #include "mb/pg_wchar.h"
 #include "utils/builtins.h"
+#include "utils/memdebug.h"
 #include "utils/memutils.h"
 #include "utils/relcache.h"
 #include "utils/syscache.h"
@@ -97,6 +98,13 @@ static char *perform_default_encoding_conversion(const char *src,
 												 int len, bool is_client_to_server);
 static int	cliplen(const char *str, int len, int limit);
 
+pg_attribute_noreturn()
+static void report_invalid_encoding_int(int encoding, const char *mbstr,
+										int mblen, int len);
+
+pg_attribute_noreturn()
+static void report_invalid_encoding_db(const char *mbstr, int mblen, int len);
+
 
 /*
  * Prepare for a future call to SetClientEncoding.  Success should mean
@@ -962,11 +970,126 @@ pg_encoding_wchar2mb_with_len(int encoding,
 	return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
 }
 
-/* returns the byte length of a multibyte character */
+/*
+ * Returns the byte length of a multibyte character sequence in a
+ * null-terminated string.  Raises an illegal byte sequence error if the
+ * sequence would hit a null terminator.
+ *
+ * The caller is expected to have checked for a terminator at *mbstr == 0
+ * before calling, but some callers want 1 in that case, so this function
+ * continues that tradition.
+ *
+ * This must only be used for strings that have a null-terminator to enable
+ * bounds detection.
+ */
+int
+pg_mblen_cstr(const char *mbstr)
+{
+	int			length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+	/*
+	 * Reject a terminator inside the sequence.  Byte 0 is deliberately not
+	 * checked: the .mblen functions return 1 when given a pointer to a
+	 * terminator, and some callers depend on that, so we tolerate it for now.
+	 */
+	for (int i = 1; i < length; ++i)
+		if (unlikely(mbstr[i] == 0))
+			report_invalid_encoding_db(mbstr, length, i);
+
+	/*
+	 * String should be NUL-terminated, but checking that would make typical
+	 * callers O(N^2), tripling Valgrind check-world time.  Unless
+	 * VALGRIND_EXPENSIVE, check 1 byte after each actual character.  (If we
+	 * found a character, not a terminator, the next byte must be a terminator
+	 * or the start of the next character.)  If the caller iterates the whole
+	 * string, the last call will diagnose a missing terminator.
+	 */
+	if (mbstr[0] != '\0')
+	{
+#ifdef VALGRIND_EXPENSIVE
+		VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, strlen(mbstr));
+#else
+		VALGRIND_CHECK_MEM_IS_DEFINED(mbstr + length, 1);
+#endif
+	}
+
+	return length;
+}
+
+/*
+ * Returns the byte length of the multibyte character at mbstr, bounded by a
+ * non-empty range [mbstr, end).  Raises an illegal byte sequence error if it
+ * would exceed the range, and only otherwise has Valgrind check its bytes.
+ */
+int
+pg_mblen_range(const char *mbstr, const char *end)
+{
+	int			length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+	Assert(end > mbstr);
+	if (unlikely(length > end - mbstr))
+		report_invalid_encoding_db(mbstr, length, end - mbstr);
+
+#ifdef VALGRIND_EXPENSIVE
+	VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, end - mbstr);
+#else
+	VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
+#endif
+
+	return length;
+}
+
+/*
+ * Returns the byte length of the multibyte character at mbstr, bounded by
+ * 'limit' bytes (at least one).  Raises an illegal byte sequence error if it
+ * would exceed that bound, and only otherwise has Valgrind check its bytes.
+ */
+int
+pg_mblen_with_len(const char *mbstr, int limit)
+{
+	int			length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+	Assert(limit >= 1);
+	if (unlikely(length > limit))
+		report_invalid_encoding_db(mbstr, length, limit);
+
+#ifdef VALGRIND_EXPENSIVE
+	VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, limit);
+#else
+	VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
+#endif
+
+	return length;
+}
+
+
+/*
+ * Returns the byte length of a multibyte character sequence, without any
+ * validation of bounds.
+ *
+ * PLEASE NOTE:  This function can only be used safely if the caller has
+ * already verified the input string, since otherwise there is a risk of
+ * overrunning the buffer if the string is invalid.  A prior call to a
+ * pg_mbstrlen* function suffices.
+ */
+int
+pg_mblen_unbounded(const char *mbstr)
+{
+	int			length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+	VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
+
+	return length;
+}
+
+/*
+ * Historical name for pg_mblen_unbounded().  Should not be used and will be
+ * removed in a later version.
+ */
 int
 pg_mblen(const char *mbstr)
 {
-	return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+	return pg_mblen_unbounded(mbstr);
 }
 
 /* returns the display length of a multibyte character */
@@ -988,14 +1111,14 @@ pg_mbstrlen(const char *mbstr)
 
 	while (*mbstr)
 	{
-		mbstr += pg_mblen(mbstr);
+		mbstr += pg_mblen_cstr(mbstr);
 		len++;
 	}
 	return len;
 }
 
 /* returns the length (counted in wchars) of a multibyte string
- * (not necessarily NULL terminated)
+ * (stops at the first of "limit" or a NUL)
  */
 int
 pg_mbstrlen_with_len(const char *mbstr, int limit)
@@ -1008,7 +1131,7 @@ pg_mbstrlen_with_len(const char *mbstr, int limit)
 
 	while (limit > 0 && *mbstr)
 	{
-		int			l = pg_mblen(mbstr);
+		int			l = pg_mblen_with_len(mbstr, limit);
 
 		limit -= l;
 		mbstr += l;
@@ -1078,7 +1201,7 @@ pg_mbcharcliplen(const char *mbstr, int len, int limit)
 
 	while (len > 0 && *mbstr)
 	{
-		l = pg_mblen(mbstr);
+		l = pg_mblen_with_len(mbstr, len);
 		nch++;
 		if (nch > limit)
 			break;
@@ -1648,12 +1771,19 @@ void
 report_invalid_encoding(int encoding, const char *mbstr, int len)
 {
 	int			l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len);
+
+	report_invalid_encoding_int(encoding, mbstr, l, len);
+}
+
+static void
+report_invalid_encoding_int(int encoding, const char *mbstr, int mblen, int len)
+{
 	char		buf[8 * 5 + 1];
 	char	   *p = buf;
 	int			j,
 				jlimit;
 
-	jlimit = Min(l, len);
+	jlimit = Min(mblen, len);
 	jlimit = Min(jlimit, 8);	/* prevent buffer overrun */
 
 	for (j = 0; j < jlimit; j++)
@@ -1670,6 +1800,12 @@ report_invalid_encoding(int encoding, const char *mbstr, int len)
 					buf)));
 }
 
+static void
+report_invalid_encoding_db(const char *mbstr, int mblen, int len)
+{
+	report_invalid_encoding_int(GetDatabaseEncoding(), mbstr, mblen, len);
+}
+
 /*
  * report_untranslatable_char: complain about untranslatable character
  *
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index a44ea1acbc0..ff832f6d50b 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -608,7 +608,14 @@ extern int	pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2);
 extern int	pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n);
 extern int	pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n);
 extern size_t pg_wchar_strlen(const pg_wchar *wstr);
+extern int	pg_mblen_cstr(const char *mbstr);
+extern int	pg_mblen_range(const char *mbstr, const char *end);
+extern int	pg_mblen_with_len(const char *mbstr, int limit);
+extern int	pg_mblen_unbounded(const char *mbstr);
+
+/* deprecated */
 extern int	pg_mblen(const char *mbstr);
+
 extern int	pg_dsplen(const char *mbstr);
 extern int	pg_mbstrlen(const char *mbstr);
 extern int	pg_mbstrlen_with_len(const char *mbstr, int len);
diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h
index cfd34b32443..5d5c240b492 100644
--- a/src/include/tsearch/ts_locale.h
+++ b/src/include/tsearch/ts_locale.h
@@ -45,12 +45,36 @@ typedef struct
 /* The second argument of t_iseq() must be a plain ASCII character */
 #define t_iseq(x,c)		(TOUCHAR(x) == (unsigned char) (c))
 
-#define COPYCHAR(d,s)	memcpy(d, s, pg_mblen(s))
+/* Copy multibyte character of known byte length, return byte length. */
+static inline int
+ts_copychar_with_len(void *dest, const void *src, int length)
+{
+	memcpy(dest, src, length);
+	return length;
+}
 
-extern int	t_isdigit(const char *ptr);
-extern int	t_isspace(const char *ptr);
-extern int	t_isalpha(const char *ptr);
-extern int	t_isprint(const char *ptr);
+/* Copy multibyte character from null-terminated string, return byte length. */
+static inline int
+ts_copychar_cstr(void *dest, const void *src)
+{
+	return ts_copychar_with_len(dest, src, pg_mblen_cstr((const char *) src));
+}
+
+/* Historical macro for the above. */
+#define COPYCHAR ts_copychar_cstr
+
+#define GENERATE_T_ISCLASS_DECL(character_class) \
+extern int	t_is##character_class##_with_len(const char *ptr, int len); \
+extern int	t_is##character_class##_cstr(const char *ptr); \
+extern int	t_is##character_class##_unbounded(const char *ptr); \
+\
+/* deprecated */ \
+extern int	t_is##character_class(const char *ptr)
+/* No ';' ends the macro body: each invocation below supplies it. */
+GENERATE_T_ISCLASS_DECL(alpha);
+GENERATE_T_ISCLASS_DECL(digit);
+GENERATE_T_ISCLASS_DECL(print);
+GENERATE_T_ISCLASS_DECL(space);
 
 extern char *lowerstr(const char *str);
 extern char *lowerstr_with_len(const char *str, int len);
diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h
index 4266560151f..ef3d67aeb1a 100644
--- a/src/include/tsearch/ts_utils.h
+++ b/src/include/tsearch/ts_utils.h
@@ -38,14 +38,12 @@ extern bool gettoken_tsvector(TSVectorParseState state,
 extern void close_tsvector_parser(TSVectorParseState state);
 
 /* phrase operator begins with '<' */
-#define ISOPERATOR(x) \
-	( pg_mblen(x) == 1 && ( *(x) == '!' ||	\
-							*(x) == '&' ||	\
-							*(x) == '|' ||	\
-							*(x) == '(' ||	\
-							*(x) == ')' ||	\
-							*(x) == '<'		\
-						  ) )
+#define ISOPERATOR(x)		(*(x) == '!' ||	\
+							 *(x) == '&' ||	\
+							 *(x) == '|' ||	\
+							 *(x) == '(' ||	\
+							 *(x) == ')' ||	\
+							 *(x) == '<')
 
 /* parse_tsquery */
 
diff --git a/src/test/modules/test_regex/test_regex.c b/src/test/modules/test_regex/test_regex.c
index 095751cf04e..b8ff535c8f3 100644
--- a/src/test/modules/test_regex/test_regex.c
+++ b/src/test/modules/test_regex/test_regex.c
@@ -424,7 +424,8 @@ parse_test_flags(test_re_flags *flags, text *opts)
 					ereport(ERROR,
 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 							 errmsg("invalid regular expression test option: \"%.*s\"",
-									pg_mblen(opt_p + i), opt_p + i)));
+									pg_mblen_range(opt_p + i, opt_p + opt_len),
+									opt_p + i)));
 					break;
 			}
 		}