devela/text/char/
namespace.rs

1// devela::text::char::namespace
2//
3//! Defines the [`Char`] namespace.
4//
5// MAYBE: Defines the [`Char`] and [`Utf8`] namespaces.
6
#[doc = crate::TAG_NAMESPACE!()]
/// Unicode scalars-related operations.
///
/// This is a unit struct used purely as a namespace: it holds no data, and
/// its functionality is exposed through associated functions.
///
/// See also [`Str`][crate::Str] and [`ExtMem`][crate::ExtMem].
pub struct Char;
12
13impl Char {
14    /// Returns the number of bytes necessary to store the given unicode scalar `code`.
15    #[must_use]
16    pub const fn byte_len(code: u32) -> usize {
17        match code {
18            0..=0xFF => 1,
19            0x100..=0xFFFF => 2,
20            _ => 3,
21        }
22    }
23
24    /// Returns `true` if the given unicode scalar `code` is a 7bit ASCII code.
25    #[must_use]
26    pub const fn is_7bit(code: u32) -> bool {
27        code <= 0x7F
28    }
29
30    /// Returns `true` if the given unicode scalar `code` is a [noncharacter][0].
31    ///
32    /// [0]: https://www.unicode.org/glossary/#noncharacter
33    #[must_use]
34    // FIXME: make a version that checks for surrogates
35    pub const fn is_noncharacter(code: u32) -> bool {
36        // sub-block of 32 non-characters:
37        (code >= 0xFDD0 && code <= 0xFDEF)
38            // 2× non-characters at the end of each plane:
39            || (code >= 0xFFFE && (code & 0xFF) == 0xFE)
40            || (code >= 0xFFFE && (code & 0xFF) == 0xFF)
41            // unallocated range (16 potential non-characters):
42            || (code >= 0x2FE0 && code <= 0x2FEF)
43    }
44
45    /// Returns the expected UTF-8 byte length based on the first byte.
46    ///
47    /// This function does **not** validate UTF-8 but determines how many bytes
48    /// a valid sequence **should** occupy based on the leading byte.
49    ///
50    /// - ASCII (0xxxxxxx) → 1 byte
51    /// - 2-byte (110xxxxx) → 2 bytes
52    /// - 3-byte (1110xxxx) → 3 bytes
53    /// - 4-byte (11110xxx) → 4 bytes
54    ///
55    /// ### Caveat
56    /// - If used on malformed UTF-8, it may suggest a length longer than the actual valid sequence.
57    /// - Always use in conjunction with proper UTF-8 validation if handling untrusted input.
58    ///
59    /// For a stricter check, see [`utf8_len_checked`][Self::utf8_len_checked].
60    pub const fn utf8_len(first_byte: u8) -> u8 {
61        match first_byte {
62            0x00..=0x7F => 1, // ASCII (1 byte)
63            0xC0..=0xDF => 2, // 2-byte sequence
64            0xE0..=0xEF => 3, // 3-byte sequence
65            0xF0..=0xF7 => 4, // 4-byte sequence
66            _ => 0,           // Invalid leading byte
67        }
68    }
69
70    /// Returns the UTF-8 byte length or `None` if the first byte is invalid.
71    ///
72    /// This function detects invalid UTF-8 leading bytes and ensures
73    /// they fall within valid Unicode scalar boundaries.
74    ///
75    /// - Returns `Some(len)` for valid leading bytes.
76    /// - Returns `None` for invalid first bytes that cannot start a UTF-8 sequence.
77    ///
78    /// ### Stricter Handling
79    /// - Rejects overlong sequences (C0, C1).
80    /// - Enforces the valid UTF-8 upper bound (max `F4`).
81    /// - Safer for processing untrusted input where malformed UTF-8 must be detected.
82    ///
83    /// For a simpler length-only function, see [`utf8_len`][Self::utf8_len].
84    #[must_use]
85    pub const fn utf8_len_checked(first_byte: u8) -> Option<u8> {
86        match first_byte {
87            0x00..=0x7F => Some(1),
88            0xC2..=0xDF => Some(2),
89            0xE0..=0xEF => Some(3),
90            0xF0..=0xF4 => Some(4),
91            _ => None,
92        }
93    }
94
95    /// Returns the number of bytes needed to store the given unicode scalar `code`,
96    /// already UTF-8 encoded in 2 bytes.
97    #[must_use]
98    #[deprecated(since = "0.23.0", note = "Use `utf8_len` instead")]
99    pub const fn utf8_2bytes_len(code: [u8; 2]) -> u8 {
100        1 + ((code[1] > 0) & (code[1] & 0b1100_0000 != 0b1000_0000)) as u8
101    }
102
103    /// Returns the number of bytes needed to store the given unicode scalar `code`,
104    /// already UTF-8 encoded in 3 bytes.
105    #[must_use]
106    #[deprecated(since = "0.23.0", note = "Use `utf8_len` instead")]
107    pub const fn utf8_3bytes_len(code: [u8; 3]) -> u8 {
108        1 + ((code[1] > 0) & (code[1] & 0b1100_0000 != 0b1000_0000)) as u8
109            + ((code[2] > 0) & (code[2] & 0b1100_0000 != 0b1000_0000)) as u8
110    }
111
112    /// Returns the number of bytes needed to store the given unicode scalar `code`,
113    /// already UTF-8 encoded in 4 bytes.
114    #[must_use]
115    #[deprecated(since = "0.23.0", note = "Use `utf8_len` instead")]
116    pub const fn utf8_4bytes_len(code: [u8; 4]) -> u8 {
117        1 + ((code[1] > 0) & (code[1] & 0b1100_0000 != 0b1000_0000)) as u8
118            + ((code[2] > 0) & (code[2] & 0b1100_0000 != 0b1000_0000)) as u8
119            + ((code[3] > 0) & (code[3] & 0b1100_0000 != 0b1000_0000)) as u8
120    }
121
122    /// Returns the number of bytes needed to encode the given unicode scalar `code` as UTF-8.
123    #[must_use] #[rustfmt::skip]
124    pub const fn len_utf8(code: char) -> usize {
125        let code = code as u32;
126        if code < 0x80 { 1 } else if code < 0x800 { 2 } else if code < 0x10_000 { 3 } else { 4 }
127    }
128    /// Returns the number of bytes needed to encode the given unicode scalar `code` as UTF-8.
129    #[must_use]
130    #[deprecated(since = "0.23.0", note = "Use `len_utf8` instead")]
131    pub const fn len_to_utf8(code: char) -> usize {
132        Self::len_utf8(code)
133    }
134
135    /// Converts this `char` to an UTF-8 encoded sequence of bytes.
136    ///
137    /// Note that this function always returns a 4-byte array, but the actual
138    /// UTF-8 sequence may be shorter. The unused bytes are set to 0.
139    ///
140    /// See also [`char::encode_utf8`].
141    #[must_use]
142    #[allow(clippy::unusual_byte_groupings)]
143    pub const fn to_utf8_bytes(c: char) -> [u8; 4] {
144        let c = c as u32;
145        match c {
146            // From 0x0000 to 0x007F:
147            // the UTF-8 encoding is the same as the scalar value.
148            0x0000..=0x007F => [c as u8, 0, 0, 0],
149
150            // from 0x0080 to 0x07FF:
151            // the UTF-8 encoding is 110xxxxx 10xxxxxx,
152            // where xxxxx and xxxxxx are the bits of the scalar value.
153            0x0080..=0x07FF => {
154                let y = 0b10_000000 | (0b0011_1111 & (c as u8));
155                let x = 0b110_00000 | ((c >> 6) as u8);
156                [x, y, 0, 0]
157            }
158
159            // From from 0x0800 to 0xFFFF:
160            // the UTF-8 encoding is 1110xxxx 10xxxxxx 10xxxxxx.
161            0x0800..=0xFFFF => {
162                let z = 0b10_000000 | (0b0011_1111 & (c as u8));
163                let y = 0b10_000000 | ((c >> 6) & 0b0011_1111) as u8;
164                let x = 0b1110_0000 | ((c >> 12) as u8);
165                [x, y, z, 0]
166            }
167
168            // From 0x10000 to 0x10FFFF:
169            // the UTF-8 encoding is 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
170            _ => {
171                let w = 0b10_000000 | (0b0011_1111 & (c as u8));
172                let z = 0b10_000000 | ((c >> 6) & 0b0011_1111) as u8;
173                let y = 0b10_000000 | ((c >> 12) & 0b0011_1111) as u8;
174                let x = 0b11110_000 | ((c >> 18) as u8);
175                [x, y, z, w]
176            }
177        }
178    }
179}