devela/text/unicode/scalar/namespace/slice.rs
1// devela/src/text/unicode/scalar/namespace/slice.rs
2//
3// TOC
4// - methods over &[u8]
5// - methods over &[u8; N]
6
7use crate::{Char, is, unwrap};
8
9/// # Methods over `u8` slice.
10#[rustfmt::skip]
11impl Char<&[u8]> {
12 /// Decodes a UTF-8 scalar at `index`.
13 ///
14 /// Returns `Some((char, len))` if the input is a valid UTF-8 sequence
15 /// and the decoded value is a valid Unicode scalar.
16 ///
17 /// Returns `None` if:
18 /// - The index is out of bounds.
19 /// - The bytes do not form a valid UTF-8 sequence.
20 /// - The decoded value is not a valid Unicode scalar.
21 ///
22 /// This is implemented via `Char::`[`to_scalar()`][Self::to_scalar].
23 ///
24 /// # Examples
25 /// ```
26 /// # use devela::Char;
27 /// // Valid UTF-8 sequence
28 /// let result = Char(b"\xE2\x82\xAC").to_char(0); // €
29 /// assert_eq!(result, Some(('€', 3)));
30 ///
31 /// // Invalid continuation bytes
32 /// let invalid_continuation = Char(b"\xE2\x41\xAC").to_char(0);
33 /// assert_eq!(invalid_continuation, None);
34 ///
35 /// // Surrogate code point
36 /// let surrogate = Char(b"\xED\xA0\x80").to_char(0); // U+D800
37 /// assert_eq!(surrogate, None);
38 ///
39 /// // Out of bounds index
40 /// let out_of_bounds = Char(b"hello").to_char(10);
41 /// assert_eq!(out_of_bounds, None);
42 ///
43 /// // Incomplete sequence
44 /// let incomplete = Char(b"\xE2\x82").to_char(0); // Missing third byte
45 /// assert_eq!(incomplete, None);
46 /// ```
47 /// # Features
48 /// Uses the `unsafe_str` feature to skip duplicated validation checks.
49 #[must_use]
50 pub const fn to_char(self, index: usize) -> Option<(char, usize)> {
51 let (cp, len) = unwrap![some? self.to_scalar(index)]; // check cp is a valid scalar
52 cfg_select! { all(feature = "unsafe_str", not(feature = "safe_text")) => {
53 Some((unsafe { char::from_u32_unchecked(cp) }, len)) // SAFETY: we just checked
54 } _ => { Some((unwrap![some? char::from_u32(cp)], len)) }}
55 }
56
57 /// Decodes a UTF-8 scalar leniently at `index`, validating only the final Unicode scalar.
58 ///
59 /// This method is forgiving of UTF-8 encoding errors but ensures the result
60 /// is a valid Unicode scalar value.
61 ///
62 /// - Does not validate UTF-8 continuation bytes (may decode malformed sequences).
63 /// - If the leading byte is invalid it returns the replacement character (`�`).
64 ///
65 /// This is implemented via `Char::`[to_scalar_unchecked()][Self::to_scalar_unchecked].
66 ///
67 /// # Panics
68 /// Panics if the decoded value is not a valid Unicode scalar value,
69 /// or if the `index` is out of bounds.
70 ///
71 /// # Examples
72 /// ```
73 /// # use devela::Char;
74 /// // Valid UTF-8 sequence
75 /// let result = Char(b"\xE2\x82\xAC").to_char_lenient(0); // €
76 /// assert_eq!(result, ('€', 3));
77 ///
78 /// // Invalid UTF-8 but decodes to valid scalar - behavior depends on input
79 /// // This may return unexpected characters rather than panicking
80 /// let result = Char(b"\xE2\x41\xAC").to_char_lenient(0);
81 /// assert_eq!(result, ('\u{206c}', 3));
82 ///
83 /// // Surrogate code point - will panic
84 /// // let result = Char(b"\xED\xA0\x80").to_char_lenient(0); // PANIC: U+D800 is invalid
85 ///
86 /// // Out of bounds index - will panic
87 /// // let result = Char(b"hello").to_char_lenient(10); // PANIC: index out of bounds
88 /// ```
89 #[must_use]
90 pub const fn to_char_lenient(self, index: usize) -> (char, usize) {
91 let (cp, len) = self.to_scalar_unchecked(index);
92 (unwrap![some char::from_u32(cp)], len)
93 }
94
/// Decodes the UTF-8 scalar at `index` into a `char`, skipping every validation step.
///
/// If the leading byte is invalid it returns the replacement character (`�`).
///
/// This is implemented via `Char::`[`to_scalar_unchecked`][Self::to_scalar_unchecked].
///
/// # Safety
/// The caller must ensure that:
/// - `index` is within bounds of `bytes`
/// - `bytes[index..]` contains a valid UTF-8 sequence
/// - The decoded value is a valid Unicode scalar.
///
/// Violating these conditions may lead to undefined behavior.
#[must_use]
#[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
#[cfg_attr(nightly_doc, doc(cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))))]
pub const unsafe fn to_char_unchecked(self, index: usize) -> (char, usize) {
    let (scalar, consumed) = self.to_scalar_unchecked(index);
    // SAFETY: the caller guarantees the decoded value is a valid Unicode scalar.
    let ch = unsafe { char::from_u32_unchecked(scalar) };
    (ch, consumed)
}
115
116 /// Decodes a UTF-8 scalar from the given byte slice, starting at `index`.
117 ///
118 /// Returns `(scalar, len)`, where `scalar` is the decoded Unicode scalar,
119 /// and `len` is the number of bytes consumed.
120 ///
121 /// Returns `None` if:
122 /// - The index is out of bounds.
123 /// - The bytes do not form a valid UTF-8 sequence.
124 /// - The decoded value is not a valid Unicode scalar.
125 ///
126 /// # Examples
127 /// ```
128 /// # use devela::Char;
129 /// assert_eq!(Char("Ħ".as_bytes()).to_scalar(0), Some((u32::from('Ħ'), 2)));
130 ///
131 /// let invalid = b"\x80"; // Invalid leading byte
132 /// assert_eq!(Char(invalid).to_scalar(0), None);
133 /// ```
134 #[must_use]
135 pub const fn to_scalar(self, index: usize) -> Option<(u32, usize)> {
136 if index >= self.0.len() { return None; } // out of bounds
137 let (bytes, first) = (self.0, self.0[index]);
138 if first < 0x80 { return Some((first as u32, 1)); } // ASCII fast path
139 let len = unwrap![some? Char(bytes[index]).len_utf8()]; // invalid leading byte?
140 if index + len > bytes.len() { return None; } // not enough bytes?
141 if !self.has_valid_continuation(index, len) { return None; } // malformed utf-8?
142 if self.has_overlong_encoding(index, len) { return None; } // overlong encoding?
143 let scalar = self.decode_scalar(index, len);
144 is![Char(scalar).is_valid_scalar(), Some((scalar, len)), None] // invalid scalar?
145 }
146
147 /// Decodes a UTF-8 scalar from the given byte slice, starting at `index`, without validation.
148 ///
149 /// Returns `(scalar, len)`, where `scalar` is the decoded Unicode scalar,
150 /// and `len` is the number of bytes consumed.
151 ///
152 /// It assumes `bytes[index..]` contains a valid UTF-8 sequence,
153 /// and it doesn't validate the resulting Unicode scalar.
154 ///
155 /// If the leading byte is invalid it returns the replacement character (`�`).
156 ///
157 /// # Panics
158 /// It will panic if the index is out of bounds.
159 #[must_use]
160 pub const fn to_scalar_unchecked(self, index: usize) -> (u32, usize) {
161 let first = self.0[index];
162 if first < 0x80 { return (first as u32, 1); } // ASCII fast path
163 let len = Char(first).len_utf8_unchecked();
164 if len == 0 { return (char::REPLACEMENT_CHARACTER as u32, 1); } // invalid leading byte?
165 (self.decode_scalar(index, len), len)
166 }
167
168 #[must_use]
169 #[inline(always)]
170 const fn decode_scalar(self, index: usize, len: usize) -> u32 {
171 let (bytes, first) = (self.0, self.0[index]);
172 match len {
173 1 => first as u32,
174 2 => ((first as u32 & 0x1F) << 6) | (bytes[index + 1] as u32 & Char::<u32>::CONT_MASK),
175 3 => ((first as u32 & 0x0F) << 12)
176 | ((bytes[index + 1] as u32 & Char::<u32>::CONT_MASK) << 6)
177 | (bytes[index + 2] as u32 & Char::<u32>::CONT_MASK),
178 4 => ((first as u32 & 0x07) << 18)
179 | ((bytes[index + 1] as u32 & Char::<u32>::CONT_MASK) << 12)
180 | ((bytes[index + 2] as u32 & Char::<u32>::CONT_MASK) << 6)
181 | (bytes[index + 3] as u32 & Char::<u32>::CONT_MASK),
182 _ => char::REPLACEMENT_CHARACTER as u32,
183 }
184 }
185
186 /// Returns `true` if the UTF-8 sequence starting at `index` is overlong encoded.
187 ///
188 /// This method only checks for overlong encodings, but not other UTF-8 validity rules.
189 /// It does not verify continuation byte patterns nor invalid scalar values.
190 ///
191 /// Overlong encodings use more bytes than necessary to represent a character,
192 /// which is invalid in well-formed UTF-8.
193 ///
194 /// # Examples
195 /// ```
196 /// # use devela::Char;
197 /// assert!(Char(b"\xE0\x80\x80").has_overlong_encoding(0, 3)); // overlong encoding
198 /// assert!(!Char(b"\xE0\xA0\x80").has_overlong_encoding(0, 3)); // valid 3-byte sequence
199 /// ```
200 #[must_use] #[rustfmt::skip]
201 pub const fn has_overlong_encoding(self, index: usize, len: usize) -> bool {
202 let bytes = self.0;
203 if index + len > bytes.len() { return false; }
204 let first = bytes[index];
205 match len {
206 // should've been 1: C0, C1 are always overlong
207 2 => { first == 0xC0 || first == 0xC1 }
208 // E0 80..9F are overlong (should be 1-2 bytes)
209 3 if first == 0xE0 => { let second = bytes[index + 1]; second < 0xA0 }
210 // F0 80..8F are overlong (should be 1-3 bytes)
211 4 if first == 0xF0 => { let second = bytes[index + 1]; second < 0x90 }
212 _ => false, // 1-byte sequences can't be overlong
213 }
214 }
215
216 /// Verifies that the continuation bytes following a UTF-8 leading byte are properly formatted.
217 ///
218 /// Each continuation byte must match the pattern `10xxxxxx` (i.e., have the high bits `0b10`).
219 /// This ensures the byte sequence follows proper UTF-8 encoding rules.
220 ///
221 /// This method only verifies correct syntax, but not correct semantics.
222 /// It does not check for overlong encodings nor invalid scalar values.
223 ///
224 /// # Examples
225 /// ```
226 /// # use devela::Char;
227 /// assert!(Char(b"\xE2\x82\xAC").has_valid_continuation(0, 3)); // euro sign €
228 /// assert!(!Char(b"\xE2\x41\xAC").has_valid_continuation(0, 3)); // second byte is ASCII 'A'
229 /// assert!(!Char(b"\xC2").has_valid_continuation(0, 2)); // incomplete sequence
230 /// ```
231 pub const fn has_valid_continuation(self, index: usize, len: usize) -> bool {
232 let bytes = self.0;
233 is![bytes.len() < index + len, return false]; // ensure sufficient len
234 match len {
235 1 => true, // no continuation bytes needed for ASCII
236 2 => bytes[index + 1] & 0xC0 == 0x80,
237 3 => bytes[index + 1] & 0xC0 == 0x80 && bytes[index + 2] & 0xC0 == 0x80,
238 4 => bytes[index + 1] & 0xC0 == 0x80 && bytes[index + 2] & 0xC0 == 0x80
239 && bytes[index + 3] & 0xC0 == 0x80,
240 _ => false, // invalid length
241 }
242 }
243
244 /// Returns `true` if the byte at `index` is a valid starting point for a UTF-8 sequence.
245 ///
246 /// This checks if the byte is not a UTF-8 continuation byte (i.e., it's either
247 /// an ASCII character or a valid leading byte of a multi-byte sequence).
248 ///
249 /// Useful for safely starting UTF-8 decoding from an arbitrary position in a byte slice.
250 #[must_use]
251 pub const fn is_utf8_boundary(self, index: usize) -> bool {
252 index == self.0.len()
253 || (index < self.0.len() && Char(self.0[index]).is_utf8_boundary())
254 }
255 /// Returns the smallest UTF-8 boundary `>= index`.
256 #[must_use]
257 pub const fn ceil_utf8_boundary(self, index: usize) -> usize {
258 let bytes = self.0;
259 let mut i = is![index < bytes.len(), index, bytes.len()];
260 while i < bytes.len() && !Char(bytes).is_utf8_boundary(i) { i += 1; }
261 i
262 }
263 /// Returns the greatest UTF-8 boundary `<= index`.
264 ///
265 /// If `index > self.len()`, starts from `self.len()`.
266 ///
267 /// This only checks boundary shape, not full UTF-8 validity.
268 /// It is intended for already-valid UTF-8 byte slices.
269 #[must_use]
270 pub const fn floor_utf8_boundary(self, index: usize) -> usize {
271 let bytes = self.0;
272 let mut i = is![index < bytes.len(), index, bytes.len()];
273 while i > 0 && !Char(bytes).is_utf8_boundary(i) { i -= 1; }
274 i
275 }
276}
277
// Generates forwarding methods for `Char<&[u8; N]>`.
//
// Takes a list of `fn` / `unsafe fn` signatures (without `self`), each ending in `;`,
// and emits for each a `pub const` method that coerces the array reference
// to a slice and calls the method of the same name on `Char<&[u8]>`.
macro_rules! _impl_char_array_ref_wrappers {
    // Base case: nothing left to generate.
    () => {};
    // `unsafe fn` signature: the wrapper is unsafe as well.
    (
        $(#[$meta:meta])*
        unsafe fn $method:ident($($param:ident: $param_ty:ty),* $(,)?) -> $out:ty;
        $($tail:tt)*
    ) => {
        #[doc = concat!(
            "A wrapper over [`", stringify!($method), "()`](#method.", stringify!($method),
            ").\n\n# Safety\nSame requirements as the wrapped method."
        )]
        #[must_use] #[inline(always)] $(#[$meta])*
        pub const unsafe fn $method(self $(, $param: $param_ty)*) -> $out {
            let slice: &[u8] = self.0;
            // SAFETY: the caller upholds the requirements of the wrapped method.
            unsafe { Char(slice).$method($($param),*) }
        }
        _impl_char_array_ref_wrappers!($($tail)*);
    };
    // Safe `fn` signature.
    (
        $(#[$meta:meta])*
        fn $method:ident($($param:ident: $param_ty:ty),* $(,)?) -> $out:ty;
        $($tail:tt)*
    ) => {
        #[doc = concat!(
            "A wrapper over [`", stringify!($method), "()`](#method.", stringify!($method), ")."
        )]
        #[must_use] #[inline(always)] $(#[$meta])*
        pub const fn $method(self $(, $param: $param_ty)*) -> $out {
            let slice: &[u8] = self.0;
            Char(slice).$method($($param),*)
        }
        _impl_char_array_ref_wrappers!($($tail)*);
    };
}
302/// Method wrappers over a byte array reference.
303impl<const N: usize> Char<&[u8; N]> {
304 _impl_char_array_ref_wrappers! {
305 fn to_char(index: usize) -> Option<(char, usize)>;
306 fn to_char_lenient(index: usize) -> (char, usize);
307 #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
308 #[cfg_attr(nightly_doc, doc(cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))))]
309 unsafe fn to_char_unchecked(index: usize) -> (char, usize);
310
311 fn to_scalar(index: usize) -> Option<(u32, usize)>;
312 fn to_scalar_unchecked(index: usize) -> (u32, usize);
313
314 fn has_overlong_encoding(index: usize, len: usize) -> bool;
315 fn has_valid_continuation(index: usize, len: usize) -> bool;
316
317 fn is_utf8_boundary(index: usize) -> bool;
318 fn ceil_utf8_boundary(index: usize) -> usize;
319 fn floor_utf8_boundary(index: usize) -> usize;
320 }
321}