devela/text/char/namespace.rs
1// devela::text::char::namespace
2//
3//! Defines the [`Char`] namespace.
4//
5// MAYBE: Defines the [`Char`] and [`Utf8`] namespaces.
6
#[doc = crate::TAG_NAMESPACE!()]
/// Unicode scalars-related operations.
///
/// This is a unit struct used purely as a namespace: its operations are
/// exposed as associated functions (e.g. `Char::len_utf8`).
///
/// See also [`Str`][crate::Str] and [`ExtMem`][crate::ExtMem].
pub struct Char;
12
13impl Char {
/// Returns how many bytes are required to hold the raw value of the
/// unicode scalar `code`.
///
/// This is the width of the scalar value itself, not of its UTF-8 encoding:
/// - `0..=0xFF` → 1 byte
/// - `0x100..=0xFFFF` → 2 bytes
/// - any larger value → 3 bytes
#[must_use]
pub const fn byte_len(code: u32) -> usize {
    if code > 0xFFFF {
        3
    } else if code > 0xFF {
        2
    } else {
        1
    }
}
23
/// Returns `true` if the given unicode scalar `code` fits in 7 bits,
/// i.e. it is an ASCII code (`0..=0x7F`).
#[must_use]
pub const fn is_7bit(code: u32) -> bool {
    // No bit above the low seven may be set.
    (code & !0x7F) == 0
}
29
/// Returns `true` if the given unicode scalar `code` is a [noncharacter][0].
///
/// The following code points are reported:
/// - the contiguous sub-block `U+FDD0..=U+FDEF` (32 code points);
/// - the last two code points of every plane, `U+nFFFE` and `U+nFFFF`;
/// - the range `U+2FE0..=U+2FEF` (see the review note in the body).
///
/// No upper-bound check is performed on `code`: any value whose low 16 bits
/// are `0xFFFE` or `0xFFFF` is reported, even above `0x10FFFF`.
///
/// [0]: https://www.unicode.org/glossary/#noncharacter
#[must_use]
// FIXME: make a version that checks for surrogates
pub const fn is_noncharacter(code: u32) -> bool {
    // sub-block of 32 non-characters:
    matches!(code, 0xFDD0..=0xFDEF)
        // 2× non-characters at the end of each plane: the low 16 bits must be
        // 0xFFFE or 0xFFFF. Masking only the low 8 bits would wrongly match
        // ordinary code points such as U+100FE or U+1F6FF.
        || (code & 0xFFFE) == 0xFFFE
        // unallocated range (16 potential non-characters).
        // NOTE(review): these are unassigned code points, not part of the
        // noncharacter set described by the linked glossary entry; kept so
        // existing callers see no change here — confirm whether it is wanted.
        || matches!(code, 0x2FE0..=0x2FEF)
}
44
/// Returns the expected UTF-8 sequence length for the given leading byte.
///
/// Only the bit pattern of `first_byte` is inspected; the rest of the
/// sequence is **not** validated.
///
/// | leading byte pattern | result |
/// |----------------------|--------|
/// | `0xxxxxxx` (ASCII)   | 1      |
/// | `110xxxxx`           | 2      |
/// | `1110xxxx`           | 3      |
/// | `11110xxx`           | 4      |
/// | anything else        | 0      |
///
/// A result of `0` means the byte cannot start a sequence (a continuation
/// byte `10xxxxxx`, or `0xF8..=0xFF`).
///
/// ### Caveat
/// - On malformed UTF-8 the suggested length may exceed the actual valid sequence.
/// - Pair this with real UTF-8 validation when handling untrusted input.
///
/// For a stricter check, see [`utf8_len_checked`][Self::utf8_len_checked].
pub const fn utf8_len(first_byte: u8) -> u8 {
    if first_byte & 0b1000_0000 == 0 {
        1 // ASCII
    } else if first_byte & 0b1110_0000 == 0b1100_0000 {
        2
    } else if first_byte & 0b1111_0000 == 0b1110_0000 {
        3
    } else if first_byte & 0b1111_1000 == 0b1111_0000 {
        4
    } else {
        0 // not a valid leading byte
    }
}
69
/// Returns the UTF-8 sequence length announced by `first_byte`,
/// or `None` if that byte can never start a well-formed sequence.
///
/// - `Some(1)` for `0x00..=0x7F`
/// - `Some(2)` for `0xC2..=0xDF`
/// - `Some(3)` for `0xE0..=0xEF`
/// - `Some(4)` for `0xF0..=0xF4`
/// - `None` otherwise
///
/// ### Stricter Handling
/// - Continuation bytes (`0x80..=0xBF`) are rejected.
/// - The overlong leading bytes `C0` and `C1` are rejected.
/// - Leading bytes above `F4` are rejected.
///
/// For a simpler length-only function, see [`utf8_len`][Self::utf8_len].
#[must_use]
pub const fn utf8_len_checked(first_byte: u8) -> Option<u8> {
    if first_byte < 0x80 {
        Some(1)
    } else if first_byte < 0xC2 {
        // continuation bytes and the overlong leads C0, C1
        None
    } else if first_byte < 0xE0 {
        Some(2)
    } else if first_byte < 0xF0 {
        Some(3)
    } else if first_byte < 0xF5 {
        Some(4)
    } else {
        // F5..=FF
        None
    }
}
94
/// Returns the number of bytes needed to store the given unicode scalar `code`,
/// already UTF-8 encoded in 2 bytes.
///
/// The array is expected to hold the encoded sequence followed by zero
/// padding. The leading byte always counts as one; the second byte counts
/// only when it is a UTF-8 continuation byte (`10xxxxxx`).
///
/// The result is therefore `1` or `2`.
#[must_use]
#[deprecated(since = "0.23.0", note = "Use `utf8_len` instead")]
pub const fn utf8_2bytes_len(code: [u8; 2]) -> u8 {
    // A trailing byte belongs to the sequence iff its top two bits are `10`.
    // (A zero padding byte never matches, so no separate `> 0` test is needed.)
    1 + ((code[1] & 0b1100_0000) == 0b1000_0000) as u8
}
102
/// Returns the number of bytes needed to store the given unicode scalar `code`,
/// already UTF-8 encoded in 3 bytes.
///
/// The array is expected to hold the encoded sequence followed by zero
/// padding. The leading byte always counts as one; each of the remaining
/// bytes counts only when it is a UTF-8 continuation byte (`10xxxxxx`).
///
/// The result is therefore in `1..=3`.
#[must_use]
#[deprecated(since = "0.23.0", note = "Use `utf8_len` instead")]
pub const fn utf8_3bytes_len(code: [u8; 3]) -> u8 {
    /// Returns 1 if `byte` is a continuation byte (`10xxxxxx`), otherwise 0.
    /// A zero padding byte never matches.
    const fn continuation(byte: u8) -> u8 {
        ((byte & 0b1100_0000) == 0b1000_0000) as u8
    }
    1 + continuation(code[1]) + continuation(code[2])
}
111
/// Returns the number of bytes needed to store the given unicode scalar `code`,
/// already UTF-8 encoded in 4 bytes.
///
/// The array is expected to hold the encoded sequence followed by zero
/// padding. The leading byte always counts as one; each of the remaining
/// bytes counts only when it is a UTF-8 continuation byte (`10xxxxxx`).
///
/// The result is therefore in `1..=4`.
#[must_use]
#[deprecated(since = "0.23.0", note = "Use `utf8_len` instead")]
pub const fn utf8_4bytes_len(code: [u8; 4]) -> u8 {
    /// Returns 1 if `byte` is a continuation byte (`10xxxxxx`), otherwise 0.
    /// A zero padding byte never matches.
    const fn continuation(byte: u8) -> u8 {
        ((byte & 0b1100_0000) == 0b1000_0000) as u8
    }
    1 + continuation(code[1]) + continuation(code[2]) + continuation(code[3])
}
121
/// Returns how many bytes the UTF-8 encoding of the unicode scalar `code` occupies.
///
/// - below `0x80` → 1
/// - below `0x800` → 2
/// - below `0x1_0000` → 3
/// - otherwise → 4
#[must_use]
pub const fn len_utf8(code: char) -> usize {
    match code as u32 {
        0..=0x7F => 1,
        0x80..=0x7FF => 2,
        0x800..=0xFFFF => 3,
        _ => 4,
    }
}
/// Returns the number of bytes needed to encode the given unicode scalar `code` as UTF-8.
///
/// Deprecated alias: it simply forwards to [`len_utf8`][Self::len_utf8]
/// and returns its result unchanged.
#[must_use]
#[deprecated(since = "0.23.0", note = "Use `len_utf8` instead")]
pub const fn len_to_utf8(code: char) -> usize {
    Self::len_utf8(code)
}
134
/// Encodes the given `char` as UTF-8 into a fixed 4-byte array.
///
/// The encoded sequence occupies the first 1 to 4 bytes of the result;
/// any bytes past the end of the sequence are set to 0.
///
/// | scalar range          | layout                                |
/// |-----------------------|---------------------------------------|
/// | `0x0000..=0x007F`     | `0xxxxxxx`                            |
/// | `0x0080..=0x07FF`     | `110xxxxx 10xxxxxx`                   |
/// | `0x0800..=0xFFFF`     | `1110xxxx 10xxxxxx 10xxxxxx`          |
/// | `0x10000..=0x10FFFF`  | `11110xxx 10xxxxxx 10xxxxxx 10xxxxxx` |
///
/// See also [`char::encode_utf8`].
#[must_use]
pub const fn to_utf8_bytes(c: char) -> [u8; 4] {
    /// Builds a continuation byte `10xxxxxx` from the low 6 bits of `bits`.
    const fn trail(bits: u32) -> u8 {
        0x80 | (bits & 0x3F) as u8
    }

    let scalar = c as u32;
    if scalar < 0x80 {
        // The encoding is the scalar value itself.
        [scalar as u8, 0, 0, 0]
    } else if scalar < 0x800 {
        [0xC0 | (scalar >> 6) as u8, trail(scalar), 0, 0]
    } else if scalar < 0x1_0000 {
        [
            0xE0 | (scalar >> 12) as u8,
            trail(scalar >> 6),
            trail(scalar),
            0,
        ]
    } else {
        [
            0xF0 | (scalar >> 18) as u8,
            trail(scalar >> 12),
            trail(scalar >> 6),
            trail(scalar),
        ]
    }
}
179}