summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNRK <nrk@disroot.org>2024-07-04 21:25:37 +0000
committerRafael Marçalo <raroma09@gmail.com>2024-07-17 01:32:03 +0100
commitaab11c56951008296a1c2aec9e980d35a2a2d495 (patch)
tree0a06268e3336938f067704f70a3dc028506a9e33
parent23ca2de587c521e2cf3199718ccc0394af8ddef0 (diff)
overhaul utf8decode()
this changes the utf8decode function to: * report when an error occurs * report how many bytes to advance on error these will be useful in the next commit to render invalid utf8 sequences. the new implementation is also shorter and more direct.
-rw-r--r--drw.c76
1 files changed, 31 insertions, 45 deletions
diff --git a/drw.c b/drw.c
index 78a2b27..eb71da7 100644
--- a/drw.c
+++ b/drw.c
@@ -9,54 +9,40 @@
#include "util.h"
#define UTF_INVALID 0xFFFD
-#define UTF_SIZ 4
-static const unsigned char utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0};
-static const unsigned char utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
-static const long utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x10000};
-static const long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
-
-static long
-utf8decodebyte(const char c, size_t *i)
-{
- for (*i = 0; *i < (UTF_SIZ + 1); ++(*i))
- if (((unsigned char)c & utfmask[*i]) == utfbyte[*i])
- return (unsigned char)c & ~utfmask[*i];
- return 0;
-}
-
-static size_t
-utf8validate(long *u, size_t i)
+static int
+utf8decode(const char *s_in, long *u, int *err)
{
- if (!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF))
- *u = UTF_INVALID;
- for (i = 1; *u > utfmax[i]; ++i)
- ;
- return i;
-}
-
-static size_t
-utf8decode(const char *c, long *u, size_t clen)
-{
- size_t i, j, len, type;
- long udecoded;
-
+ static const unsigned char lens[] = {
+ /* 0XXXX */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* 10XXX */ 0, 0, 0, 0, 0, 0, 0, 0, /* invalid */
+ /* 110XX */ 2, 2, 2, 2,
+ /* 1110X */ 3, 3,
+ /* 11110 */ 4,
+ /* 11111 */ 0, /* invalid */
+ };
+ static const unsigned char leading_mask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
+ static const unsigned int overlong[] = { 0x0, 0x80, 0x0800, 0x10000 };
+
+ const unsigned char *s = (const unsigned char *)s_in;
+ int len = lens[*s >> 3];
*u = UTF_INVALID;
- if (!clen)
- return 0;
- udecoded = utf8decodebyte(c[0], &len);
- if (!BETWEEN(len, 1, UTF_SIZ))
+ *err = 1;
+ if (len == 0)
return 1;
- for (i = 1, j = 1; i < clen && j < len; ++i, ++j) {
- udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type);
- if (type)
- return j;
+
+ long cp = s[0] & leading_mask[len - 1];
+ for (int i = 1; i < len; ++i) {
+ if (s[i] == '\0' || (s[i] & 0xC0) != 0x80)
+ return i;
+ cp = (cp << 6) | (s[i] & 0x3F);
}
- if (j < len)
- return 0;
- *u = udecoded;
- utf8validate(u, len);
+ /* out of range, surrogate, overlong encoding */
+ if (cp > 0x10FFFF || (cp >> 11) == 0x1B || cp < overlong[len - 1])
+ return len;
+ *err = 0;
+ *u = cp;
return len;
}
@@ -242,7 +228,7 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp
unsigned int tmpw, ew, ellipsis_w = 0, ellipsis_len, hash, h0, h1;
XftDraw *d = NULL;
Fnt *usedfont, *curfont, *nextfont;
- int utf8strlen, utf8charlen, render = x || y || w || h;
+ int utf8strlen, utf8charlen, utf8err, render = x || y || w || h;
long utf8codepoint = 0;
const char *utf8str;
FcCharSet *fccharset;
@@ -272,11 +258,11 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp
if (!ellipsis_width && render)
ellipsis_width = drw_fontset_getwidth(drw, "...");
while (1) {
- ew = ellipsis_len = utf8strlen = 0;
+ ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0;
utf8str = text;
nextfont = NULL;
while (*text) {
- utf8charlen = utf8decode(text, &utf8codepoint, UTF_SIZ);
+ utf8charlen = utf8decode(text, &utf8codepoint, &utf8err);
for (curfont = drw->fonts; curfont; curfont = curfont->next) {
charexists = charexists || XftCharExists(drw->dpy, curfont->xfont, utf8codepoint);
if (charexists) {