Skip to main content

devela/text/unicode/scalar/namespace/
slice.rs

1// devela/src/text/unicode/scalar/namespace/slice.rs
2//
3// TOC
4// - methods over &[u8]
5// - methods over &[u8; N]
6
7use crate::{Char, is, unwrap};
8
9/// # Methods over `u8` slice.
10#[rustfmt::skip]
11impl Char<&[u8]> {
12    /// Decodes a UTF-8 scalar at `index`.
13    ///
14    /// Returns `Some((char, len))` if the input is a valid UTF-8 sequence
15    /// and the decoded value is a valid Unicode scalar.
16    ///
17    /// Returns `None` if:
18    /// - The index is out of bounds.
19    /// - The bytes do not form a valid UTF-8 sequence.
20    /// - The decoded value is not a valid Unicode scalar.
21    ///
22    /// This is implemented via `Char::`[`to_scalar()`][Self::to_scalar].
23    ///
24    /// # Examples
25    /// ```
26    /// # use devela::Char;
27    /// // Valid UTF-8 sequence
28    /// let result = Char(b"\xE2\x82\xAC").to_char(0); // €
29    /// assert_eq!(result, Some(('€', 3)));
30    ///
31    /// // Invalid continuation bytes
32    /// let invalid_continuation = Char(b"\xE2\x41\xAC").to_char(0);
33    /// assert_eq!(invalid_continuation, None);
34    ///
35    /// // Surrogate code point
36    /// let surrogate = Char(b"\xED\xA0\x80").to_char(0); // U+D800
37    /// assert_eq!(surrogate, None);
38    ///
39    /// // Out of bounds index
40    /// let out_of_bounds = Char(b"hello").to_char(10);
41    /// assert_eq!(out_of_bounds, None);
42    ///
43    /// // Incomplete sequence
44    /// let incomplete = Char(b"\xE2\x82").to_char(0); // Missing third byte
45    /// assert_eq!(incomplete, None);
46    /// ```
47    /// # Features
48    /// Uses the `unsafe_str` feature to skip duplicated validation checks.
49    #[must_use]
50    pub const fn to_char(self, index: usize) -> Option<(char, usize)> {
51        let (cp, len) = unwrap![some? self.to_scalar(index)]; // check cp is a valid scalar
52        cfg_select! { all(feature = "unsafe_str", not(feature = "safe_text")) => {
53            Some((unsafe { char::from_u32_unchecked(cp) }, len)) // SAFETY: we just checked
54        } _ => { Some((unwrap![some? char::from_u32(cp)], len)) }}
55    }
56
57    /// Decodes a UTF-8 scalar leniently at `index`, validating only the final Unicode scalar.
58    ///
59    /// This method is forgiving of UTF-8 encoding errors but ensures the result
60    /// is a valid Unicode scalar value.
61    ///
62    /// - Does not validate UTF-8 continuation bytes (may decode malformed sequences).
63    /// - If the leading byte is invalid it returns the replacement character (`�`).
64    ///
65    /// This is implemented via `Char::`[to_scalar_unchecked()][Self::to_scalar_unchecked].
66    ///
67    /// # Panics
68    /// Panics if the decoded value is not a valid Unicode scalar value,
69    /// or if the `index` is out of bounds.
70    ///
71    /// # Examples
72    /// ```
73    /// # use devela::Char;
74    /// // Valid UTF-8 sequence
75    /// let result = Char(b"\xE2\x82\xAC").to_char_lenient(0); // €
76    /// assert_eq!(result, ('€', 3));
77    ///
78    /// // Invalid UTF-8 but decodes to valid scalar - behavior depends on input
79    /// // This may return unexpected characters rather than panicking
80    /// let result = Char(b"\xE2\x41\xAC").to_char_lenient(0);
81    /// assert_eq!(result, ('\u{206c}', 3));
82    ///
83    /// // Surrogate code point - will panic
84    /// // let result = Char(b"\xED\xA0\x80").to_char_lenient(0); // PANIC: U+D800 is invalid
85    ///
86    /// // Out of bounds index - will panic
87    /// // let result = Char(b"hello").to_char_lenient(10); // PANIC: index out of bounds
88    /// ```
89    #[must_use]
90    pub const fn to_char_lenient(self, index: usize) -> (char, usize) {
91        let (cp, len) = self.to_scalar_unchecked(index);
92        (unwrap![some char::from_u32(cp)], len)
93    }
94
95    /// Decodes a UTF-8 scalar at `index` without any validation.
96    ///
97    /// If the leading byte is invalid it returns the replacement character (`�`).
98    ///
99    /// This is implemented via `Char::`[`to_scalar_unchecked`][Self::to_scalar_unchecked].
100    ///
101    /// # Safety
102    /// The caller must ensure that:
103    /// - `index` is within bounds of `bytes`
104    /// - `bytes[index..]` contains a valid UTF-8 sequence
105    /// - The decoded value is a valid Unicode scalar.
106    ///
107    /// Violating these conditions may lead to undefined behavior.
108    #[must_use]
109    #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
110    #[cfg_attr(nightly_doc, doc(cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))))]
111    pub const unsafe fn to_char_unchecked(self, index: usize) -> (char, usize) {
112        let (cp, len) = self.to_scalar_unchecked(index);
113        (unsafe { char::from_u32_unchecked(cp) }, len)
114    }
115
116    /// Decodes a UTF-8 scalar from the given byte slice, starting at `index`.
117    ///
118    /// Returns `(scalar, len)`, where `scalar` is the decoded Unicode scalar,
119    /// and `len` is the number of bytes consumed.
120    ///
121    /// Returns `None` if:
122    /// - The index is out of bounds.
123    /// - The bytes do not form a valid UTF-8 sequence.
124    /// - The decoded value is not a valid Unicode scalar.
125    ///
126    /// # Examples
127    /// ```
128    /// # use devela::Char;
129    /// assert_eq!(Char("Ħ".as_bytes()).to_scalar(0), Some((u32::from('Ħ'), 2)));
130    ///
131    /// let invalid = b"\x80"; // Invalid leading byte
132    /// assert_eq!(Char(invalid).to_scalar(0), None);
133    /// ```
134    #[must_use]
135    pub const fn to_scalar(self, index: usize) -> Option<(u32, usize)> {
136        if index >= self.0.len() { return None; } // out of bounds
137        let (bytes, first) = (self.0, self.0[index]);
138        if first < 0x80 { return Some((first as u32, 1)); } // ASCII fast path
139        let len = unwrap![some? Char(bytes[index]).len_utf8()]; // invalid leading byte?
140        if index + len > bytes.len() { return None; } // not enough bytes?
141        if !self.has_valid_continuation(index, len) { return None; } // malformed utf-8?
142        if self.has_overlong_encoding(index, len) { return None; } // overlong encoding?
143        let scalar = self.decode_scalar(index, len);
144        is![Char(scalar).is_valid_scalar(), Some((scalar, len)), None] // invalid scalar?
145    }
146
147    /// Decodes a UTF-8 scalar from the given byte slice, starting at `index`, without validation.
148    ///
149    /// Returns `(scalar, len)`, where `scalar` is the decoded Unicode scalar,
150    /// and `len` is the number of bytes consumed.
151    ///
152    /// It assumes `bytes[index..]` contains a valid UTF-8 sequence,
153    /// and it doesn't validate the resulting Unicode scalar.
154    ///
155    /// If the leading byte is invalid it returns the replacement character (`�`).
156    ///
157    /// # Panics
158    /// It will panic if the index is out of bounds.
159    #[must_use]
160    pub const fn to_scalar_unchecked(self, index: usize) -> (u32, usize) {
161        let first = self.0[index];
162        if first < 0x80 { return (first as u32, 1); } // ASCII fast path
163        let len = Char(first).len_utf8_unchecked();
164        if len == 0 { return (char::REPLACEMENT_CHARACTER as u32, 1); } // invalid leading byte?
165        (self.decode_scalar(index, len), len)
166    }
167
168    #[must_use]
169    #[inline(always)]
170    const fn decode_scalar(self, index: usize, len: usize) -> u32 {
171        let (bytes, first) = (self.0, self.0[index]);
172        match len {
173            1 => first as u32,
174            2 => ((first as u32 & 0x1F) << 6) | (bytes[index + 1] as u32 & Char::<u32>::CONT_MASK),
175            3 => ((first as u32 & 0x0F) << 12)
176                | ((bytes[index + 1] as u32 & Char::<u32>::CONT_MASK) << 6)
177                | (bytes[index + 2] as u32 & Char::<u32>::CONT_MASK),
178            4 => ((first as u32 & 0x07) << 18)
179                | ((bytes[index + 1] as u32 & Char::<u32>::CONT_MASK) << 12)
180                | ((bytes[index + 2] as u32 & Char::<u32>::CONT_MASK) << 6)
181                | (bytes[index + 3] as u32 & Char::<u32>::CONT_MASK),
182            _ => char::REPLACEMENT_CHARACTER as u32,
183        }
184    }
185
186    /// Returns `true` if the UTF-8 sequence starting at `index` is overlong encoded.
187    ///
188    /// This method only checks for overlong encodings, but not other UTF-8 validity rules.
189    /// It does not verify continuation byte patterns nor invalid scalar values.
190    ///
191    /// Overlong encodings use more bytes than necessary to represent a character,
192    /// which is invalid in well-formed UTF-8.
193    ///
194    /// # Examples
195    /// ```
196    /// # use devela::Char;
197    /// assert!(Char(b"\xE0\x80\x80").has_overlong_encoding(0, 3)); // overlong encoding
198    /// assert!(!Char(b"\xE0\xA0\x80").has_overlong_encoding(0, 3)); // valid 3-byte sequence
199    /// ```
200    #[must_use] #[rustfmt::skip]
201    pub const fn has_overlong_encoding(self, index: usize, len: usize) -> bool {
202        let bytes = self.0;
203        if index + len > bytes.len() { return false; }
204        let first = bytes[index];
205        match len {
206            // should've been 1: C0, C1 are always overlong
207            2 => { first == 0xC0 || first == 0xC1 }
208            // E0 80..9F are overlong (should be 1-2 bytes)
209            3 if first == 0xE0 => { let second = bytes[index + 1]; second < 0xA0 }
210            // F0 80..8F are overlong (should be 1-3 bytes)
211            4 if first == 0xF0 => { let second = bytes[index + 1]; second < 0x90 }
212            _ => false, // 1-byte sequences can't be overlong
213        }
214    }
215
216    /// Verifies that the continuation bytes following a UTF-8 leading byte are properly formatted.
217    ///
218    /// Each continuation byte must match the pattern `10xxxxxx` (i.e., have the high bits `0b10`).
219    /// This ensures the byte sequence follows proper UTF-8 encoding rules.
220    ///
221    /// This method only verifies correct syntax, but not correct semantics.
222    /// It does not check for overlong encodings nor invalid scalar values.
223    ///
224    /// # Examples
225    /// ```
226    /// # use devela::Char;
227    /// assert!(Char(b"\xE2\x82\xAC").has_valid_continuation(0, 3)); // euro sign €
228    /// assert!(!Char(b"\xE2\x41\xAC").has_valid_continuation(0, 3)); // second byte is ASCII 'A'
229    /// assert!(!Char(b"\xC2").has_valid_continuation(0, 2)); // incomplete sequence
230    /// ```
231    pub const fn has_valid_continuation(self, index: usize, len: usize) -> bool {
232        let bytes = self.0;
233        is![bytes.len() < index + len, return false]; // ensure sufficient len
234        match len {
235            1 => true, // no continuation bytes needed for ASCII
236            2 => bytes[index + 1] & 0xC0 == 0x80,
237            3 => bytes[index + 1] & 0xC0 == 0x80 && bytes[index + 2] & 0xC0 == 0x80,
238            4 => bytes[index + 1] & 0xC0 == 0x80 && bytes[index + 2] & 0xC0 == 0x80
239                && bytes[index + 3] & 0xC0 == 0x80,
240            _ => false, // invalid length
241        }
242    }
243
244    /// Returns `true` if the byte at `index` is a valid starting point for a UTF-8 sequence.
245    ///
246    /// This checks if the byte is not a UTF-8 continuation byte (i.e., it's either
247    /// an ASCII character or a valid leading byte of a multi-byte sequence).
248    ///
249    /// Useful for safely starting UTF-8 decoding from an arbitrary position in a byte slice.
250    #[must_use]
251    pub const fn is_utf8_boundary(self, index: usize) -> bool {
252        index == self.0.len()
253            || (index < self.0.len() && Char(self.0[index]).is_utf8_boundary())
254    }
255    /// Returns the smallest UTF-8 boundary `>= index`.
256    #[must_use]
257    pub const fn ceil_utf8_boundary(self, index: usize) -> usize {
258        let bytes = self.0;
259        let mut i = is![index < bytes.len(), index, bytes.len()];
260        while i < bytes.len() && !Char(bytes).is_utf8_boundary(i) { i += 1; }
261        i
262    }
263    /// Returns the greatest UTF-8 boundary `<= index`.
264    ///
265    /// If `index > self.len()`, starts from `self.len()`.
266    ///
267    /// This only checks boundary shape, not full UTF-8 validity.
268    /// It is intended for already-valid UTF-8 byte slices.
269    #[must_use]
270    pub const fn floor_utf8_boundary(self, index: usize) -> usize {
271        let bytes = self.0;
272        let mut i = is![index < bytes.len(), index, bytes.len()];
273        while i > 0 && !Char(bytes).is_utf8_boundary(i) { i -= 1; }
274        i
275    }
276}
277
278macro_rules! _impl_char_array_ref_wrappers {
279    () => {};
280    ($(#[$attr:meta])* unsafe fn $name:ident($($arg:ident: $arg_ty:ty),* $(,)?) -> $ret:ty;
281     $($rest:tt)* ) => {
282        #[doc = concat!( "A wrapper over [`", stringify!($name), "()`](#method.", stringify!($name),
283        ").", "\n\n# Safety\nSame requirements as the wrapped method.")]
284        #[must_use] #[inline(always)] $(#[$attr])*
285        pub const unsafe fn $name(self $(, $arg: $arg_ty)*) -> $ret {
286            let bytes: &[u8] = self.0;
287            unsafe { Char(bytes).$name($($arg),*) }
288        }
289        _impl_char_array_ref_wrappers!($($rest)*);
290    };
291    ($(#[$attr:meta])* fn $name:ident($($arg:ident: $arg_ty:ty),* $(,)?) -> $ret:ty;
292     $($rest:tt)* ) => {
293        #[doc = concat!("A wrapper over [`",
294            stringify!($name), "()`](#method.", stringify!($name), ").")]
295        #[must_use] #[inline(always)] $(#[$attr])*
296        pub const fn $name(self $(, $arg: $arg_ty)*) -> $ret {
297            let bytes: &[u8] = self.0; Char(bytes).$name($($arg),*)
298        }
299        _impl_char_array_ref_wrappers!($($rest)*);
300    };
301}
302/// Method wrappers over a byte array reference.
303impl<const N: usize> Char<&[u8; N]> {
304    _impl_char_array_ref_wrappers! {
305        fn to_char(index: usize) -> Option<(char, usize)>;
306        fn to_char_lenient(index: usize) -> (char, usize);
307        #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
308        #[cfg_attr(nightly_doc, doc(cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))))]
309        unsafe fn to_char_unchecked(index: usize) -> (char, usize);
310
311        fn to_scalar(index: usize) -> Option<(u32, usize)>;
312        fn to_scalar_unchecked(index: usize) -> (u32, usize);
313
314        fn has_overlong_encoding(index: usize, len: usize) -> bool;
315        fn has_valid_continuation(index: usize, len: usize) -> bool;
316
317        fn is_utf8_boundary(index: usize) -> bool;
318        fn ceil_utf8_boundary(index: usize) -> usize;
319        fn floor_utf8_boundary(index: usize) -> usize;
320    }
321}