From 282b50043f91273f9f62a69e39ca0795b08ce143 Mon Sep 17 00:00:00 2001
From: Pepper Gray <111446242+peppergrayxyz@users.noreply.github.com>
Date: Thu, 17 Oct 2024 23:50:13 +0200
Subject: [PATCH] replace uchar with rune

---
 .../utf8/east_asian/east_asian_width.v        |  2 +-
 vlib/encoding/utf8/utf8_util.v                | 42 +++++++++++++------
 vlib/encoding/utf8/utf8_util_test.v           | 17 ++++++--
 vlib/x/ttf/render_bmp.v                       |  6 +--
 4 files changed, 48 insertions(+), 19 deletions(-)

diff --git a/vlib/encoding/utf8/east_asian/east_asian_width.v b/vlib/encoding/utf8/east_asian/east_asian_width.v
index 21c6ba121a7185..57dda23c4440a0 100644
--- a/vlib/encoding/utf8/east_asian/east_asian_width.v
+++ b/vlib/encoding/utf8/east_asian/east_asian_width.v
@@ -33,7 +33,7 @@ pub fn display_width(s string, ambiguous_width int) int {
 
 // width_property_at returns the East Asian Width properties at string[index]
 pub fn east_asian_width_property_at(s string, index int) EastAsianWidthProperty {
-	codepoint := utf8.get_uchar(s, index)
+	codepoint := utf8.get_rune(s, index)
 	mut left, mut right := 0, east_asian_width_data.len - 1
 	for left <= right {
 		middle := left + ((right - left) / 2)
diff --git a/vlib/encoding/utf8/utf8_util.v b/vlib/encoding/utf8/utf8_util.v
index d39e4b51c69ae1..032bfcc66e84be 100644
--- a/vlib/encoding/utf8/utf8_util.v
+++ b/vlib/encoding/utf8/utf8_util.v
@@ -27,8 +27,14 @@ pub fn len(s string) int {
 	return count
 }
 
-// get_uchar convert a unicode glyph in string[index] into a int unicode char
+@[deprecated: 'use `.get_rune(s string, index int)` instead']
+@[deprecated_after: '2024-10-17']
 pub fn get_uchar(s string, index int) int {
+	return int(get_rune(s, index))
+}
+
+// get_rune converts the UTF-8 encoded unicode codepoint starting at string[index] into a UTF-32 encoded rune
+pub fn get_rune(s string, index int) rune {
 	mut res := 0
 	mut ch_len := 0
 	if s.len > 0 {
@@ -81,7 +87,7 @@ pub fn raw_index(s string, index int) string {
 
 		r << if ch_len > 0 {
 			i += ch_len
-			rune(get_uchar(s, i - ch_len))
+			rune(get_rune(s, i - ch_len))
 		} else {
 			rune(b)
 		}
@@ -126,7 +132,7 @@ pub fn to_lower(s string) string {
 
 // is_punct return true if the string[index] byte is the start of a unicode western punctuation
 pub fn is_punct(s string, index int) bool {
-	return is_uchar_punct(get_uchar(s, index))
+	return is_rune_punct(get_rune(s, index))
 }
 
 // is_control return true if the rune is control code
@@ -174,21 +180,33 @@ pub fn is_number(r rune) bool {
 	return is_excluding_latin(number_table, r)
 }
 
-// is_uchar_punct return true if the input unicode is a western unicode punctuation
+@[deprecated: 'use `.is_rune_punct(r rune)` instead']
+@[deprecated_after: '2024-10-17']
 pub fn is_uchar_punct(uchar int) bool {
-	return find_punct_in_table(uchar, unicode_punct_western) != 0
+	return is_rune_punct(rune(uchar))
+}
+
+// is_rune_punct returns true if the input rune is a western unicode punctuation mark
+pub fn is_rune_punct(r rune) bool {
+	return find_punct_in_table(r, unicode_punct_western) != 0
 }
 
 // Global
 
 // is_global_punct return true if the string[index] byte of is the start of a global unicode punctuation
 pub fn is_global_punct(s string, index int) bool {
-	return is_uchar_global_punct(get_uchar(s, index))
+	return is_rune_global_punct(get_rune(s, index))
 }
 
-// is_uchar_global_punct return true if the input unicode is a global unicode punctuation
+@[deprecated: 'use `.is_rune_global_punct(r rune)` instead']
+@[deprecated_after: '2024-10-17']
 pub fn is_uchar_global_punct(uchar int) bool {
-	return find_punct_in_table(uchar, unicode_punct) != 0
+	return is_rune_global_punct(rune(uchar))
+}
+
+// is_rune_global_punct returns true if the input rune is a global unicode punctuation mark
+pub fn is_rune_global_punct(r rune) bool {
+	return find_punct_in_table(r, unicode_punct) != 0
 }
 
 // Private functions
@@ -523,13 +541,13 @@ fn convert_case(s string, upper_flag bool) string {
 
 // find_punct_in_table looks for valid punctuation in table
 @[direct_array_access]
-fn find_punct_in_table(in_code int, in_table []int) int {
+fn find_punct_in_table(in_code rune, in_table []rune) rune {
 	// uses simple binary search
 
 	mut first_index := 0
 	mut last_index := (in_table.len)
 	mut index := 0
-	mut x := 0
+	mut x := rune(0)
 
 	for {
 		index = (first_index + last_index) >> 1
@@ -559,7 +577,7 @@ fn find_punct_in_table(in_code int, in_table []int) int {
 // Western punctuation mark
 // Character	Name	Browser	Image
 const unicode_punct_western = [
-	0x0021, // EXCLAMATION MARK !
+	rune(0x0021), // EXCLAMATION MARK !
 	0x0022, // QUOTATION MARK "
 	0x0027, // APOSTROPHE '
 	0x002A, // ASTERISK *
@@ -593,7 +611,7 @@ const unicode_punct_western = [
 // Unicode Characters in the 'Punctuation, Other' Category
 // Character	Name	Browser	Image
 const unicode_punct = [
-	0x0021, // EXCLAMATION MARK	!
+	rune(0x0021), // EXCLAMATION MARK	!
 	0x0022, // QUOTATION MARK	"
 	0x0023, // NUMBER SIGN	#
 	0x0025, // PERCENT SIGN	%
diff --git a/vlib/encoding/utf8/utf8_util_test.v b/vlib/encoding/utf8/utf8_util_test.v
index 3b2e503dbaf147..22cda41c45c9a5 100644
--- a/vlib/encoding/utf8/utf8_util_test.v
+++ b/vlib/encoding/utf8/utf8_util_test.v
@@ -23,7 +23,7 @@ fn test_utf8_util() {
 	a := '.abc?abcòàè.'
 	assert utf8.is_punct(a, 0) == true
 	assert utf8.is_punct('b', 0) == false
-	assert utf8.is_uchar_punct(0x002E) == true
+	assert utf8.is_rune_punct(0x002E) == true
 	assert utf8.is_punct(a, 4) == true // ?
 	assert utf8.is_punct(a, 14) == true // last .
 	assert utf8.is_punct(a, 12) == false // è
@@ -33,12 +33,16 @@ fn test_utf8_util() {
 	b := '.ĂĂa. ÔÔ TESTO Æ€'
 	assert utf8.is_global_punct(b, 0) == true
 	assert utf8.is_global_punct('.', 0) == true
-	assert utf8.is_uchar_punct(0x002E) == true
+	assert utf8.is_rune_punct(0x002E) == true
 	assert utf8.is_global_punct(b, 6) == true // .
 	assert utf8.is_global_punct(b, 1) == false // a
 
 	// test utility functions
-	assert utf8.get_uchar(b, 0) == 0x002E
+	c := 'a©★🚀'
+	assert utf8.get_rune(c, 0) == `a` // 1 byte
+	assert utf8.get_rune(c, 1) == `©` // 2 bytes
+	assert utf8.get_rune(c, 3) == `★` // 3 bytes
+	assert utf8.get_rune(c, 6) == `🚀` // 4 bytes
 }
 
 fn test_raw_indexing() {
@@ -56,6 +60,13 @@ fn test_raw_indexing() {
 	assert utf8.raw_index(a, 6) == 'n'
 	assert utf8.raw_index(a, 7) == 'g'
 	assert utf8.raw_index(a, 8) == '!'
+
+	// test different utf8 byte lengths
+	c := 'a©★🚀'
+	assert utf8.raw_index(c, 0) == 'a' // 1 byte
+	assert utf8.raw_index(c, 1) == '©' // 2 bytes
+	assert utf8.raw_index(c, 2) == '★' // 3 bytes
+	assert utf8.raw_index(c, 3) == '🚀' // 4 bytes
 }
 
 fn test_reversed() {
diff --git a/vlib/x/ttf/render_bmp.v b/vlib/x/ttf/render_bmp.v
index b4ac048fd101ca..465fd47a306929 100644
--- a/vlib/x/ttf/render_bmp.v
+++ b/vlib/x/ttf/render_bmp.v
@@ -483,7 +483,7 @@ pub fn (mut bmp BitMap) get_chars_bbox(in_string string) []int {
 		// manage unicode chars like latin greek etc
 		c_len := ((0xe5000000 >> ((chr >> 3) & 0x1e)) & 3) + 1
 		if c_len > 1 {
-			tmp_char := utf8.get_uchar(in_string, i)
+			tmp_char := utf8.get_rune(in_string, i)
 			// dprintln("tmp_char: ${tmp_char.hex()}")
 			chr = u16(tmp_char)
 		}
@@ -554,7 +554,7 @@ pub fn (mut bmp BitMap) get_bbox(in_string string) (int, int) {
 		// manage unicode chars like latin greek etc
 		c_len := ((0xe5000000 >> ((chr >> 3) & 0x1e)) & 3) + 1
 		if c_len > 1 {
-			tmp_char := utf8.get_uchar(in_string, i)
+			tmp_char := utf8.get_rune(in_string, i)
 			// dprintln("tmp_char: ${tmp_char.hex()}")
 			chr = u16(tmp_char)
 		}
@@ -649,7 +649,7 @@ pub fn (mut bmp BitMap) draw_text(in_string string) (int, int) {
 		// manage unicode chars like latin greek etc
 		c_len := ((0xe5000000 >> ((chr >> 3) & 0x1e)) & 3) + 1
 		if c_len > 1 {
-			tmp_char := utf8.get_uchar(in_string, i)
+			tmp_char := utf8.get_rune(in_string, i)
 			// dprintln("tmp_char: ${tmp_char.hex()}")
 			chr = u16(tmp_char)
 		}