devela/text/char/definitions.rs
1// devela::text::char::definitions
2//
3//! Define Char* structs and UnicodeScalar trait.
4//
5// TOC
6// - struct char7
7// - struct char8
8// - struct char16
9// - trait UnicodeScalar
10
11#![allow(non_camel_case_types)]
12
13// In sync with devela::num::niche::non_value
14#[cfg(feature = "_char7")]
15pub(super) use crate::NonExtremeU8;
16#[cfg(feature = "_char16")]
17pub(super) use crate::NonValueU16;
18
19// `0xDFFF` is a surrogate UTF-16 code point, so it can never be a unicode scalar.
20#[cfg(feature = "_char16")]
21pub(super) type NonSurrogateU16 = NonValueU16<0xDFFF>;
22
23/* public types */
24
25/// A 7-bit [unicode scalar][scalar], limited to the [basic latin][0w] subset
26/// (ASCII).
27///
28/// `Option<char7>` is the same size as `char7` or `char8` (1 byte).
29///
30/// See also: [`char8`], [`char16`], [`char`][crate::char].
31///
32/// [scalar]: https://www.unicode.org/glossary/#unicode_scalar_value
33/// [0w]: https://en.wikipedia.org/wiki/Basic_Latin_(Unicode_block)
34#[repr(transparent)]
35#[cfg(feature = "_char7")]
36#[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "_char7")))]
37#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
38pub struct char7(pub(super) NonExtremeU8);
39
40/// An 8-bit [unicode scalar][scalar], limited to the [basic latin][0w]
41/// and [latin-1][1w] subsets.
42///
43/// This is the only scalar type without memory layout optimization
44/// because each possible value is a valid unicode scalar. Therefore,
45/// `Option<char8>` is the same size as `char16` or `Option<char16>` (2 bytes).
46///
47/// See also: [`char7`], [`char16`], [`char`][crate::char].
48///
49/// [scalar]: https://www.unicode.org/glossary/#unicode_scalar_value
50/// [0w]: https://en.wikipedia.org/wiki/Basic_Latin_(Unicode_block)
51/// [1w]: https://en.wikipedia.org/wiki/Latin-1_Supplement
52#[repr(transparent)]
53#[cfg(feature = "_char8")]
54#[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "_char8")))]
55#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
56pub struct char8(pub(super) u8);
57
58/// A 16-bit [unicode scalar][scalar], limited to the
59/// [Basic Multilingual Plane][0w] subset.
60///
61/// It can represent every scalar from the [Basic Multilingual Plane][0w] (BMP),
62/// the first and most important plane in the Unicode standard (also known as
63/// plane 0), containing nearly all commonly used writing systems and symbols.
64///
65/// `Option<char16>` is the same size as `char16` (2 bytes).
66///
67/// See also: [`char7`], [`char8`], [`char`][crate::char].
68///
69/// [scalar]: https://www.unicode.org/glossary/#unicode_scalar_value
70/// [0w]: https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane
71#[repr(transparent)]
72#[cfg(feature = "_char16")]
73#[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "_char16")))]
74#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
75pub struct char16(pub(super) NonSurrogateU16); // inner type is `NonValueU16<0xDFFF>`
76
77/// Common trait for unicode scalar types.
78///
79/// It's implemented for: [`char7`], [`char8`], [`char16`],
80/// and [`char`][crate::char].
81pub trait UnicodeScalar {
82 /// The lowest unicode scalar that can be represented.
83 const MIN: Self;
84 /// The highest unicode scalar that can be represented.
85 const MAX: Self;
86
87 /* encode */
88
89 /// Returns the number of bytes needed to represent the scalar value.
90 #[must_use]
91 fn byte_len(self) -> usize;
92
93 /// Returns the number of bytes needed to encode in UTF-8.
94 #[must_use]
95 fn len_utf8(self) -> usize;
96
97 /// Returns the number of bytes needed to encode in UTF-16.
98 #[must_use]
99 fn len_utf16(self) -> usize;
100
101 /// Encodes this scalar as UTF-8 into the provided byte buffer,
102 /// and then returns the subslice of the buffer that contains the encoded scalar.
103 ///
104 /// # Panics
105 /// Panics if the buffer is not large enough.
106 /// A buffer of length four is large enough to encode any char.
107 #[must_use]
108 fn encode_utf8(self, dst: &mut [u8]) -> &mut str;
109
110 /// Converts this `scalar` to a UTF-8 encoded sequence of bytes.
111 ///
112 /// Note that this function always returns a 4-byte array, but the actual
113 /// UTF-8 sequence may be shorter. The unused bytes are set to 0.
114 #[must_use]
115 fn to_utf8_bytes(self) -> [u8; 4];
116
117 /// Encodes this scalar as UTF-16 into the provided `u16` buffer,
118 /// and then returns the subslice of the buffer that contains the encoded scalar.
119 ///
120 /// # Panics
121 /// Panics if the buffer is not large enough.
122 /// A buffer of length 2 is large enough to encode any char.
123 #[must_use]
124 fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16];
125
126 /// Converts the scalar to a digit in the given radix.
127 ///
128 /// ‘Digit’ is defined to be only the following characters:
129 /// `0-9`, `a-z`, `A-Z`.
130 ///
131 /// # Errors
132 /// Returns `None` if the char does not refer to a digit in the given radix.
133 ///
134 /// # Panics
135 /// Panics if given a radix larger than 36.
136 #[must_use]
137 fn to_digit(self, radix: u32) -> Option<u32>;
138
139 /// Makes a copy of the value in its ASCII upper case equivalent.
140 ///
141 /// ASCII letters ‘a’ to ‘z’ are mapped to ‘A’ to ‘Z’, but non-ASCII letters
142 /// are unchanged.
143 #[must_use]
144 fn to_ascii_uppercase(self) -> Self
145 where
146 Self: Sized;
147
148 /// Makes a copy of the value in its ASCII lower case equivalent.
149 ///
150 /// ASCII letters ‘A’ to ‘Z’ are mapped to ‘a’ to ‘z’, but non-ASCII letters
151 /// are unchanged.
152 #[must_use]
153 fn to_ascii_lowercase(self) -> Self
154 where
155 Self: Sized;
156
157 /* escape */
158
159 /* queries */
160
161 /// Returns `true` if this unicode scalar is a [noncharacter][0].
162 ///
163 /// [0]: https://www.unicode.org/glossary/#noncharacter
164 #[must_use]
165 fn is_noncharacter(self) -> bool;
166
167 /// Returns `true` if this unicode scalar is an [abstract character][0].
168 ///
169 /// [0]: https://www.unicode.org/glossary/#abstract_character
170 #[must_use]
171 fn is_character(self) -> bool
172 where
173 Self: Sized,
174 {
175 !self.is_noncharacter()
176 }
177
178 /// Checks if the unicode scalar is a digit in the given radix.
179 ///
180 /// See also [`to_digit`][Self#method.to_digit].
181 #[must_use]
182 fn is_digit(self, radix: u32) -> bool;
183
184 /// Returns `true` if this unicode scalar has the general category for
185 /// control codes.
186 #[must_use]
187 fn is_control(self) -> bool;
188
189 /// Returns `true` if this unicode scalar is the nul character (`0x00`).
190 #[must_use]
191 fn is_nul(self) -> bool;
192
193 /// Returns `true` if this unicode scalar has the `Alphabetic` property.
194 #[must_use]
195 fn is_alphabetic(self) -> bool;
196
197 /// Returns `true` if this unicode scalar has one of the general categories
198 /// for numbers.
199 ///
200 /// If you want to parse ASCII decimal digits (0-9) or ASCII base-N,
201 /// use [`is_ascii_digit`][Self#method.is_ascii_digit] or
202 /// [`is_digit`][Self#method.is_digit] instead.
203 #[must_use]
204 fn is_numeric(self) -> bool;
205
206 /// Returns `true` if this unicode scalar satisfies either
207 /// [`is_alphabetic()`][Self#method.is_alphabetic] or
208 /// [`is_numeric()`][Self#method.is_numeric].
209 #[must_use]
210 fn is_alphanumeric(self) -> bool;
211
212 /// Returns `true` if this unicode scalar has the `Lowercase` property.
213 #[must_use]
214 fn is_lowercase(self) -> bool;
215
216 /// Returns `true` if this unicode scalar has the `Uppercase` property.
217 #[must_use]
218 fn is_uppercase(self) -> bool;
219
220 /// Returns `true` if this unicode scalar has the `White_Space` property.
221 #[must_use]
222 fn is_whitespace(self) -> bool;
223
224 /* ascii */
225
226 /// Checks if the value is within the ASCII range.
227 #[must_use]
228 fn is_ascii(self) -> bool;
229}