devela/text/char/
definitions.rs

1// devela::text::char::definitions
2//
3//! Define Char* structs and UnicodeScalar trait.
4//
5// TOC
6// - struct char7
7// - struct char8
8// - struct char16
9// - trait UnicodeScalar
10
11#![allow(non_camel_case_types)]
12
13// In sync with devela::num::niche::non_value
14#[cfg(feature = "_char7")]
15pub(super) use crate::NonExtremeU8;
16#[cfg(feature = "_char16")]
17pub(super) use crate::NonValueU16;
18
// `0xDFFF` is a UTF-16 surrogate code point, so it can never be a valid
// unicode scalar. It is the const parameter given to `NonValueU16` below.
#[cfg(feature = "_char16")]
pub(super) type NonSurrogateU16 = NonValueU16<0xDFFF>;
22
23/* public types */
24
/// A 7-bit [unicode scalar][scalar], restricted to the [basic latin][0w]
/// block (ASCII).
///
/// `Option<char7>` occupies 1 byte, the same size as `char7` or `char8`.
///
/// See also: [`char8`], [`char16`], [`char`][crate::char].
///
/// [scalar]: https://www.unicode.org/glossary/#unicode_scalar_value
/// [0w]: https://en.wikipedia.org/wiki/Basic_Latin_(Unicode_block)
#[cfg(feature = "_char7")]
#[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "_char7")))]
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
pub struct char7(pub(super) NonExtremeU8);
39
/// An 8-bit [unicode scalar][scalar], restricted to the [basic latin][0w]
/// and [latin-1][1w] blocks.
///
/// Every possible value is a valid unicode scalar, which makes this the only
/// scalar type without a memory layout optimization. As a consequence,
/// `Option<char8>` occupies 2 bytes, the same size as `char16` or
/// `Option<char16>`.
///
/// See also: [`char7`], [`char16`], [`char`][crate::char].
///
/// [scalar]: https://www.unicode.org/glossary/#unicode_scalar_value
/// [0w]: https://en.wikipedia.org/wiki/Basic_Latin_(Unicode_block)
/// [1w]: https://en.wikipedia.org/wiki/Latin-1_Supplement
#[cfg(feature = "_char8")]
#[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "_char8")))]
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
pub struct char8(pub(super) u8);
57
/// A 16-bit [unicode scalar][scalar], restricted to the
/// [Basic Multilingual Plane][0w] subset.
///
/// The [Basic Multilingual Plane][0w] (BMP), also known as plane 0, is the
/// first and most important plane in the Unicode standard, and it contains
/// nearly all commonly used writing systems and symbols. This type can
/// represent every scalar in it.
///
/// `Option<char16>` occupies 2 bytes, the same size as `char16`.
///
/// See also: [`char7`], [`char8`], [`char`][crate::char].
///
/// [scalar]: https://www.unicode.org/glossary/#unicode_scalar_value
/// [0w]: https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane
#[cfg(feature = "_char16")]
#[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "_char16")))]
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
pub struct char16(pub(super) NonSurrogateU16);
76
/// Common trait for unicode scalar types.
///
/// It's implemented for: [`char7`], [`char8`], [`char16`],
/// and [`char`][crate::char].
pub trait UnicodeScalar {
    /// The lowest unicode scalar that can be represented.
    const MIN: Self;
    /// The highest unicode scalar that can be represented.
    const MAX: Self;

    /* encode */

    /// Returns the number of bytes needed to represent the scalar value.
    #[must_use]
    fn byte_len(self) -> usize;

    /// Returns the number of bytes needed to encode in UTF-8.
    #[must_use]
    fn len_utf8(self) -> usize;

    /// Returns the number of bytes needed to encode in UTF-16.
    #[must_use]
    fn len_utf16(self) -> usize;

    /// Encodes this scalar as UTF-8 into the provided byte buffer,
    /// and then returns the subslice of the buffer that contains the encoded scalar.
    ///
    /// # Panics
    /// Panics if the buffer is not large enough.
    /// A buffer of length four is large enough to encode any char.
    #[must_use]
    fn encode_utf8(self, dst: &mut [u8]) -> &mut str;

    /// Converts this scalar to a UTF-8 encoded sequence of bytes.
    ///
    /// Note that this function always returns a 4-byte array, but the actual
    /// UTF-8 sequence may be shorter. The unused bytes are set to 0.
    #[must_use]
    fn to_utf8_bytes(self) -> [u8; 4];

    /// Encodes this scalar as UTF-16 into the provided `u16` buffer,
    /// and then returns the subslice of the buffer that contains the encoded scalar.
    ///
    /// # Panics
    /// Panics if the buffer is not large enough.
    /// A buffer of length 2 is large enough to encode any char.
    #[must_use]
    fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16];

    /// Converts the scalar to a digit in the given radix.
    ///
    /// ‘Digit’ is defined to be only the following characters:
    /// `0-9`, `a-z`, `A-Z`.
    ///
    /// # Errors
    /// Returns `None` if the scalar does not refer to a digit in the given radix.
    ///
    /// # Panics
    /// Panics if given a radix larger than 36.
    #[must_use]
    fn to_digit(self, radix: u32) -> Option<u32>;

    /// Makes a copy of the value in its ASCII upper case equivalent.
    ///
    /// ASCII letters ‘a’ to ‘z’ are mapped to ‘A’ to ‘Z’, but non-ASCII letters
    /// are unchanged.
    #[must_use]
    fn to_ascii_uppercase(self) -> Self
    where
        Self: Sized;

    /// Makes a copy of the value in its ASCII lower case equivalent.
    ///
    /// ASCII letters ‘A’ to ‘Z’ are mapped to ‘a’ to ‘z’, but non-ASCII letters
    /// are unchanged.
    #[must_use]
    fn to_ascii_lowercase(self) -> Self
    where
        Self: Sized;

    /* escape */

    /* queries */

    /// Returns `true` if this unicode scalar is a [noncharacter][0].
    ///
    /// [0]: https://www.unicode.org/glossary/#noncharacter
    #[must_use]
    fn is_noncharacter(self) -> bool;

    /// Returns `true` if this unicode scalar is an [abstract character][0].
    ///
    /// The provided implementation returns the negation of
    /// [`is_noncharacter`][Self#tymethod.is_noncharacter].
    ///
    /// [0]: https://www.unicode.org/glossary/#abstract_character
    #[must_use]
    fn is_character(self) -> bool
    where
        Self: Sized,
    {
        !self.is_noncharacter()
    }

    /// Checks if the unicode scalar is a digit in the given radix.
    ///
    /// See also [`to_digit`][Self#method.to_digit].
    #[must_use]
    fn is_digit(self, radix: u32) -> bool;

    /// Returns `true` if this unicode scalar has the general category for
    /// control codes.
    #[must_use]
    fn is_control(self) -> bool;

    /// Returns `true` if this unicode scalar is the nul character (`0x00`).
    #[must_use]
    fn is_nul(self) -> bool;

    /// Returns `true` if this unicode scalar has the `Alphabetic` property.
    #[must_use]
    fn is_alphabetic(self) -> bool;

    /// Returns `true` if this unicode scalar has one of the general categories
    /// for numbers.
    ///
    /// If you want to parse ASCII decimal digits (0-9) or ASCII base-N,
    /// use [`is_ascii_digit`][Self#method.is_ascii_digit] or
    /// [`is_digit`][Self#method.is_digit] instead.
    #[must_use]
    fn is_numeric(self) -> bool;

    /// Returns `true` if this unicode scalar satisfies either
    /// [`is_alphabetic()`][Self#method.is_alphabetic] or
    /// [`is_numeric()`][Self#method.is_numeric].
    #[must_use]
    fn is_alphanumeric(self) -> bool;

    /// Returns `true` if this unicode scalar has the `Lowercase` property.
    #[must_use]
    fn is_lowercase(self) -> bool;

    /// Returns `true` if this unicode scalar has the `Uppercase` property.
    #[must_use]
    fn is_uppercase(self) -> bool;

    /// Returns `true` if this unicode scalar has the `White_Space` property.
    #[must_use]
    fn is_whitespace(self) -> bool;

    /* ascii */

    /// Checks if the value is within the ASCII range.
    #[must_use]
    fn is_ascii(self) -> bool;
}