From 282b50043f91273f9f62a69e39ca0795b08ce143 Mon Sep 17 00:00:00 2001 From: Pepper Gray <111446242+peppergrayxyz@users.noreply.github.com> Date: Thu, 17 Oct 2024 23:50:13 +0200 Subject: [PATCH] replace uchar with rune --- .../utf8/east_asian/east_asian_width.v | 2 +- vlib/encoding/utf8/utf8_util.v | 42 +++++++++++++------ vlib/encoding/utf8/utf8_util_test.v | 17 ++++++-- vlib/x/ttf/render_bmp.v | 6 +-- 4 files changed, 48 insertions(+), 19 deletions(-) diff --git a/vlib/encoding/utf8/east_asian/east_asian_width.v b/vlib/encoding/utf8/east_asian/east_asian_width.v index 21c6ba121a7185..57dda23c4440a0 100644 --- a/vlib/encoding/utf8/east_asian/east_asian_width.v +++ b/vlib/encoding/utf8/east_asian/east_asian_width.v @@ -33,7 +33,7 @@ pub fn display_width(s string, ambiguous_width int) int { // width_property_at returns the East Asian Width properties at string[index] pub fn east_asian_width_property_at(s string, index int) EastAsianWidthProperty { - codepoint := utf8.get_uchar(s, index) + codepoint := utf8.get_rune(s, index) mut left, mut right := 0, east_asian_width_data.len - 1 for left <= right { middle := left + ((right - left) / 2) diff --git a/vlib/encoding/utf8/utf8_util.v b/vlib/encoding/utf8/utf8_util.v index d39e4b51c69ae1..032bfcc66e84be 100644 --- a/vlib/encoding/utf8/utf8_util.v +++ b/vlib/encoding/utf8/utf8_util.v @@ -27,8 +27,14 @@ pub fn len(s string) int { return count } -// get_uchar convert a unicode glyph in string[index] into a int unicode char +@[deprecated: 'use `.get_rune(s string, index int)` instead'] +@[deprecated_after: '2024-10-17'] pub fn get_uchar(s string, index int) int { + return int(get_rune(s, index)) +} + +// get_rune convert a UTF-8 unicode codepoint in string[index] into a UTF-32 encoded rune +pub fn get_rune(s string, index int) rune { mut res := 0 mut ch_len := 0 if s.len > 0 { @@ -81,7 +87,7 @@ pub fn raw_index(s string, index int) string { r << if ch_len > 0 { i += ch_len - rune(get_uchar(s, i - ch_len)) + rune(get_rune(s, 
i - ch_len)) } else { rune(b) } @@ -126,7 +132,7 @@ pub fn to_lower(s string) string { // is_punct return true if the string[index] byte is the start of a unicode western punctuation pub fn is_punct(s string, index int) bool { - return is_uchar_punct(get_uchar(s, index)) + return is_rune_punct(get_rune(s, index)) } // is_control return true if the rune is control code @@ -174,21 +180,33 @@ pub fn is_number(r rune) bool { return is_excluding_latin(number_table, r) } -// is_uchar_punct return true if the input unicode is a western unicode punctuation +@[deprecated: 'use `.is_rune_punct(r rune)` instead'] +@[deprecated_after: '2024-10-17'] pub fn is_uchar_punct(uchar int) bool { - return find_punct_in_table(uchar, unicode_punct_western) != 0 + return is_rune_punct(rune(uchar)) +} + +// is_rune_punct return true if the input unicode is a western unicode punctuation +pub fn is_rune_punct(r rune) bool { + return find_punct_in_table(r, unicode_punct_western) != 0 } // Global // is_global_punct return true if the string[index] byte of is the start of a global unicode punctuation pub fn is_global_punct(s string, index int) bool { - return is_uchar_global_punct(get_uchar(s, index)) + return is_rune_global_punct(get_rune(s, index)) } -// is_uchar_global_punct return true if the input unicode is a global unicode punctuation +@[deprecated: 'use `.is_rune_global_punct(r rune)` instead'] +@[deprecated_after: '2024-10-17'] pub fn is_uchar_global_punct(uchar int) bool { - return find_punct_in_table(uchar, unicode_punct) != 0 + return is_rune_global_punct(rune(uchar)) +} + +// is_rune_global_punct return true if the input unicode is a global unicode punctuation +pub fn is_rune_global_punct(r rune) bool { + return find_punct_in_table(r, unicode_punct) != 0 } // Private functions @@ -523,13 +541,13 @@ fn convert_case(s string, upper_flag bool) string { // find_punct_in_table looks for valid punctuation in table @[direct_array_access] -fn find_punct_in_table(in_code int, in_table 
[]int) int { +fn find_punct_in_table(in_code rune, in_table []rune) rune { // uses simple binary search mut first_index := 0 mut last_index := (in_table.len) mut index := 0 - mut x := 0 + mut x := rune(0) for { index = (first_index + last_index) >> 1 @@ -559,7 +577,7 @@ fn find_punct_in_table(in_code int, in_table []int) int { // Western punctuation mark // Character Name Browser Image const unicode_punct_western = [ - 0x0021, // EXCLAMATION MARK ! + rune(0x0021), // EXCLAMATION MARK ! 0x0022, // QUOTATION MARK " 0x0027, // APOSTROPHE ' 0x002A, // ASTERISK * @@ -593,7 +611,7 @@ const unicode_punct_western = [ // Unicode Characters in the 'Punctuation, Other' Category // Character Name Browser Image const unicode_punct = [ - 0x0021, // EXCLAMATION MARK ! + rune(0x0021), // EXCLAMATION MARK ! 0x0022, // QUOTATION MARK " 0x0023, // NUMBER SIGN # 0x0025, // PERCENT SIGN % diff --git a/vlib/encoding/utf8/utf8_util_test.v b/vlib/encoding/utf8/utf8_util_test.v index 3b2e503dbaf147..22cda41c45c9a5 100644 --- a/vlib/encoding/utf8/utf8_util_test.v +++ b/vlib/encoding/utf8/utf8_util_test.v @@ -23,7 +23,7 @@ fn test_utf8_util() { a := '.abc?abcòàè.' assert utf8.is_punct(a, 0) == true assert utf8.is_punct('b', 0) == false - assert utf8.is_uchar_punct(0x002E) == true + assert utf8.is_rune_punct(0x002E) == true assert utf8.is_punct(a, 4) == true // ? assert utf8.is_punct(a, 14) == true // last . assert utf8.is_punct(a, 12) == false // è @@ -33,12 +33,16 @@ fn test_utf8_util() { b := '.ĂĂa. ÔÔ TESTO Æ€' assert utf8.is_global_punct(b, 0) == true assert utf8.is_global_punct('.', 0) == true - assert utf8.is_uchar_punct(0x002E) == true + assert utf8.is_rune_punct(0x002E) == true assert utf8.is_global_punct(b, 6) == true // . 
assert utf8.is_global_punct(b, 1) == false // a // test utility functions - assert utf8.get_uchar(b, 0) == 0x002E + c := 'a©★🚀' + assert utf8.get_rune(c, 0) == `a` // 1 byte + assert utf8.get_rune(c, 1) == `©` // 2 bytes + assert utf8.get_rune(c, 3) == `★` // 3 bytes + assert utf8.get_rune(c, 6) == `🚀` // 4 bytes } fn test_raw_indexing() { @@ -56,6 +60,13 @@ fn test_raw_indexing() { assert utf8.raw_index(a, 6) == 'n' assert utf8.raw_index(a, 7) == 'g' assert utf8.raw_index(a, 8) == '!' + + // test different utf8 byte lengths + c := 'a©★🚀' + assert utf8.raw_index(c, 0) == 'a' // 1 byte + assert utf8.raw_index(c, 1) == '©' // 2 bytes + assert utf8.raw_index(c, 2) == '★' // 3 bytes + assert utf8.raw_index(c, 3) == '🚀' // 4 bytes } fn test_reversed() { diff --git a/vlib/x/ttf/render_bmp.v b/vlib/x/ttf/render_bmp.v index b4ac048fd101ca..465fd47a306929 100644 --- a/vlib/x/ttf/render_bmp.v +++ b/vlib/x/ttf/render_bmp.v @@ -483,7 +483,7 @@ pub fn (mut bmp BitMap) get_chars_bbox(in_string string) []int { // manage unicode chars like latin greek etc c_len := ((0xe5000000 >> ((chr >> 3) & 0x1e)) & 3) + 1 if c_len > 1 { - tmp_char := utf8.get_uchar(in_string, i) + tmp_char := utf8.get_rune(in_string, i) // dprintln("tmp_char: ${tmp_char.hex()}") chr = u16(tmp_char) } @@ -554,7 +554,7 @@ pub fn (mut bmp BitMap) get_bbox(in_string string) (int, int) { // manage unicode chars like latin greek etc c_len := ((0xe5000000 >> ((chr >> 3) & 0x1e)) & 3) + 1 if c_len > 1 { - tmp_char := utf8.get_uchar(in_string, i) + tmp_char := utf8.get_rune(in_string, i) // dprintln("tmp_char: ${tmp_char.hex()}") chr = u16(tmp_char) } @@ -649,7 +649,7 @@ pub fn (mut bmp BitMap) draw_text(in_string string) (int, int) { // manage unicode chars like latin greek etc c_len := ((0xe5000000 >> ((chr >> 3) & 0x1e)) & 3) + 1 if c_len > 1 { - tmp_char := utf8.get_uchar(in_string, i) + tmp_char := utf8.get_rune(in_string, i) // dprintln("tmp_char: ${tmp_char.hex()}") chr = u16(tmp_char) }