Fix loop when wrapping line with ISO-8859-1 character

Change the implementations of utf8_char_length(), utf8_to_unicode() and utf8_length() to rely on utf8proc. Fixes jonas#1087
koutcher · Feb 27, 2021 · 7a4cb37 · 7a4cb37
1 parent 9fb0a2f
commit 7a4cb37
Showing 1 changed file with 17 additions and 72 deletions.
diff --git a/src/string.c b/src/string.c
@@ -214,72 +214,24 @@ unicode_width(unsigned long c, int tab_size)
 
 /* Number of bytes used for encoding a UTF-8 character indexed by first byte.
  * Illegal bytes are set to one. */
-static const unsigned char utf8_bytes[256] = {
-	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
-	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
-	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
-	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
-	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
-	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
-	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
-	3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
-};
-
 unsigned char
 utf8_char_length(const char *string)
 {
-	int c = *(unsigned char *) string;
+	size_t c = *(unsigned char *) string;
 
-	return utf8_bytes[c];
+	return utf8proc_utf8class[c] ? utf8proc_utf8class[c] : 1;
 }
 
 /* Decode UTF-8 multi-byte representation into a Unicode character. */
 unsigned long
 utf8_to_unicode(const char *string, size_t length)
 {
-	unsigned long unicode;
-
-	switch (length) {
-	case 1:
-		unicode  =   string[0];
-		break;
-	case 2:
-		unicode  =  (string[0] & 0x1f) << 6;
-		unicode +=  (string[1] & 0x3f);
-		break;
-	case 3:
-		unicode  =  (string[0] & 0x0f) << 12;
-		unicode += ((string[1] & 0x3f) << 6);
-		unicode +=  (string[2] & 0x3f);
-		break;
-	case 4:
-		unicode  =  (string[0] & 0x0f) << 18;
-		unicode += ((string[1] & 0x3f) << 12);
-		unicode += ((string[2] & 0x3f) << 6);
-		unicode +=  (string[3] & 0x3f);
-		break;
-	case 5:
-		unicode  =  (string[0] & 0x0f) << 24;
-		unicode += ((string[1] & 0x3f) << 18);
-		unicode += ((string[2] & 0x3f) << 12);
-		unicode += ((string[3] & 0x3f) << 6);
-		unicode +=  (string[4] & 0x3f);
-		break;
-	case 6:
-		unicode  =  (string[0] & 0x01) << 30;
-		unicode += ((string[1] & 0x3f) << 24);
-		unicode += ((string[2] & 0x3f) << 18);
-		unicode += ((string[3] & 0x3f) << 12);
-		unicode += ((string[4] & 0x3f) << 6);
-		unicode +=  (string[5] & 0x3f);
-		break;
-	default:
-		return 0;
-	}
+	utf8proc_int32_t unicode;
+	utf8proc_ssize_t slen = utf8proc_iterate((const utf8proc_uint8_t *) string, length, &unicode);
 
 	/* Invalid characters could return the special 0xfffd value but NUL
 	 * should be just as good. */
-	return unicode > 0x10FFFF ? 0 : unicode;
+	return slen <= 0 || !utf8proc_codepoint_valid(unicode) ? 0 : unicode;
 }
 
 /* Calculates how much of string can be shown within the given maximum width
@@ -293,30 +245,23 @@ utf8_length(const char **start, int max_chars, size_t skip, int *width, size_t m
 {
 	const char *string = *start;
 	const char *end = max_chars < 0 ? strchr(string, '\0') : string + max_chars;
-	unsigned char last_bytes = 0;
-	size_t last_ucwidth = 0;
+	utf8proc_ssize_t last_bytes = 0;
+	int last_ucwidth = 0;
 
 	*width = 0;
 	*trimmed = 0;
 
 	while (string < end) {
-		unsigned char bytes = utf8_char_length(string);
-		size_t ucwidth;
-		unsigned long unicode;
-
-		if (string + bytes > end)
-			break;
-
-		/* Change representation to figure out whether
-		 * it is a single- or double-width character. */
-
-		unicode = utf8_to_unicode(string, bytes);
-		/* FIXME: Graceful handling of invalid Unicode character. */
-		if (!unicode)
-			break;
-
-		ucwidth = unicode == '\t' ? tab_size - (*width % tab_size) :
-					    utf8proc_charwidth((utf8proc_int32_t) unicode);
+		utf8proc_int32_t unicode;
+		utf8proc_ssize_t bytes = utf8proc_iterate((const utf8proc_uint8_t *) string, end - string, &unicode);
+		int ucwidth;
+
+		/* Assume a width of 1 for invalid UTF-8 encoding (could be ISO-8859-1). */
+		if (bytes <= 0 || !utf8proc_codepoint_valid(unicode))
+			ucwidth = bytes = 1;
+		else
+			ucwidth = unicode == '\t' ? tab_size - (*width % tab_size) :
+						    utf8proc_charwidth(unicode);
 		if (skip > 0) {
 			skip -= ucwidth <= skip ? ucwidth : skip;
 			*start += bytes;