Truncate text by byte
From PostgreSQL wiki
Jump to navigationJump to searchTruncate UTF-8 Text by byte width
Works with PostgreSQL
Any version
Written in
PL/pgSQLl
Depends on
Nothing
Several different ways to truncate a String/Text that is encoded in UTF-8 or other variable encoding method to specified byte width:
Method 1: author Laruenz Albe is an SQL implementation generating a record set containing all possible byte lengths to character length in a table then filters the result down
CREATE OR REPLACE FUNCTION get_prefix (
string text,
max_bytes bigint
) RETURNS text
LANGUAGE sql STRICT AS
$$SELECT p
FROM (SELECT p.p, octet_length(p.p) AS len
FROM generate_series(0, length($1)) AS len
CROSS JOIN LATERAL substr($1, 1, len.len) AS p) AS q
WHERE len <= $2
ORDER BY len DESC
LIMIT 1$$;
Method 2: Author Zsheep is an PLpg/SQL implementation, iterating over the string 1 character at a time moving left to right until the string byte width is approximately the input width
Create or Replace function max_bytea_length(pchars text, bytea_length int)
RETURNS bytea
LANGUAGE plpgsql
COST 100
VOLATILE
AS $BODY$
declare
_i int;
_length_of_chars int;
_newchars text;
_testchars text;
begin
if octet_length(pchars::bytea) <= bytea_length then
return pchars::bytea;
end if;
_i = least( octet_length(pchars)-4, bytea_length-4);
_length_of_chars = char_length(pchars);
loop
_newchars= left(pchars, _i);
_testchars = left(pchars, _i+1);
if octet_length(_testchars::bytea) > bytea_length or _i = _length_of_chars then
return _newchars::bytea;
end if ;
_i = _i+1;
end loop ;
end;
$BODY$
Method 3: Author Daniel Veritte is an SQL function that inspects the UTF-8 byte sequence looking for the high order bytea that is <= max byte width.
create function utf8_truncate(str text, len int) returns text
as $$
select case when octet_length(str) <= len then str
else (
with bstr(s) as (select convert_to(str, 'UTF-8'))
select
case
when len>=1 and (get_byte(s, len) & 192) <> 128
then convert_from(substring(s, 1, len), 'UTF-8')
else
case
when len>=2 and (get_byte(s, len-1) & 192) <> 128
then convert_from(substring(s, 1, len-1), 'UTF-8')
else
case
when len>=3 and (get_byte(s, len-2) & 192) <> 128
then convert_from(substring(s, 1, len-2), 'UTF-8')
else
case
when len>=4 and (get_byte(s, len-3) & 192) <> 128
then convert_from(substring(s, 1, len-3), 'UTF-8')
else ''
end
end
end
end
from bstr)
end;
$$ language sql strict immutable parallel safe;
Examples of usage
SELECT get_prefix('abc€', 4);
select max_bytea_length('i ♥ u function changed it to be faster', 56);
select utf8_truncate('method 3 €', 9);