devela/text/ascii/char.rs
1// devela::text::ascii::char
2//
3//!
4//
5// Ported from:
6// - https://doc.rust-lang.org/stable/core/ascii/enum.Char.html
7// - WAIT: [ascii::Char](https://github.com/rust-lang/rust/issues/110998)
8
9#[cfg(feature = "unsafe_str")]
10use crate::transmute;
11use crate::{ConstDefault, _core::fmt};
12
/// One of 128 Unicode characters (`U+0000` to `U+007F`), the [ASCII] subset.
///
/// Officially, this is the first [block] in Unicode, _Basic Latin_.
/// For details, see the [*C0 Controls and Basic Latin*][chart] code chart.
///
/// This block was based on older 7-bit character code standards such as
/// ANSI X3.4-1977, ISO 646-1973, and [NIST FIPS 1-2].
///
/// # When to use this
/// The main advantage of this subset is that it's always valid UTF-8. As such,
/// the `&[ascii::AsciiChar]` -> `&str` conversion function (as well as other related
/// ones) are O(1): *no* runtime checks are needed.
///
/// If you're consuming strings, you should usually handle Unicode and thus
/// accept `str`s, not limit yourself to `ascii::AsciiChar`s.
///
/// However, certain formats are intentionally designed to produce ASCII-only
/// output in order to be 8-bit-clean. In those cases, it can be simpler and
/// faster to generate `ascii::AsciiChar`s instead of dealing with the variable width
/// properties of general UTF-8 encoded strings, while still allowing the result
/// to be used freely with other Rust things that deal in general `str`s.
///
/// For example, a UUID library might offer a way to produce the string
/// representation of a UUID as an `[ascii::AsciiChar; 36]` to avoid memory
/// allocation yet still allow it to be used as UTF-8 via `as_str` without
/// paying for validation (or needing `unsafe` code) the way it would if it
/// were provided as a `[u8; 36]`.
///
/// # Layout
/// This type is guaranteed to have a size and alignment of 1 byte.
///
/// # Names
/// The variants on this type are [Unicode names][NamesList] of the characters
/// in upper camel case, with a few tweaks:
/// - For `<control>` characters, the primary alias name is used.
/// - `LATIN` is dropped, as this block has no non-latin letters.
/// - `LETTER` is dropped, as `CAPITAL`/`SMALL` suffices in this block.
/// - `DIGIT`s use a single digit rather than writing out `ZERO`, `ONE`, etc.
///
/// [ASCII]: https://www.unicode.org/glossary/index.html#ASCII
/// [block]: https://www.unicode.org/glossary/index.html#block
/// [chart]: https://www.unicode.org/charts/PDF/U0000.pdf
/// [NIST FIPS 1-2]: https://nvlpubs.nist.gov/nistpubs/Legacy/FIPS/fipspub1-2-1977.pdf
/// [NamesList]: https://www.unicode.org/Public/15.0.0/ucd/NamesList.txt
#[derive(Debug, Copy, Clone, Default, Eq, PartialEq, Ord, PartialOrd, Hash)]
#[repr(u8)]
pub enum AsciiChar {
    /// U+0000: NUL (Default variant)
    #[default]
    Null = 0,
    /// U+0001: SOH
    StartOfHeading = 1,
    /// U+0002: STX
    StartOfText = 2,
    /// U+0003: ETX
    EndOfText = 3,
    /// U+0004: EOT
    EndOfTransmission = 4,
    /// U+0005: ENQ
    Enquiry = 5,
    /// U+0006: ACK
    Acknowledge = 6,
    /// U+0007: BEL
    Bell = 7,
    /// U+0008: BS
    Backspace = 8,
    /// U+0009: HT (horizontal tab)
    ///
    /// NOTE(review): `core::ascii::Char` names this variant `CharacterTabulation`;
    /// the `Ascii` prefix here looks like a leftover of a `Char` -> `AsciiChar`
    /// rename. Confirm before relying on the name.
    AsciiCharacterTabulation = 9,
    /// U+000A: LF
    LineFeed = 10,
    /// U+000B: VT (vertical tab)
    LineTabulation = 11,
    /// U+000C: FF
    FormFeed = 12,
    /// U+000D: CR
    CarriageReturn = 13,
    /// U+000E: SO
    ShiftOut = 14,
    /// U+000F: SI
    ShiftIn = 15,
    /// U+0010: DLE
    DataLinkEscape = 16,
    /// U+0011: DC1
    DeviceControlOne = 17,
    /// U+0012: DC2
    DeviceControlTwo = 18,
    /// U+0013: DC3
    DeviceControlThree = 19,
    /// U+0014: DC4
    DeviceControlFour = 20,
    /// U+0015: NAK
    NegativeAcknowledge = 21,
    /// U+0016: SYN
    SynchronousIdle = 22,
    /// U+0017: ETB
    EndOfTransmissionBlock = 23,
    /// U+0018: CAN
    Cancel = 24,
    /// U+0019: EM
    EndOfMedium = 25,
    /// U+001A: SUB
    Substitute = 26,
    /// U+001B: ESC
    Escape = 27,
    /// U+001C: FS (file separator)
    InformationSeparatorFour = 28,
    /// U+001D: GS (group separator)
    InformationSeparatorThree = 29,
    /// U+001E: RS (record separator)
    InformationSeparatorTwo = 30,
    /// U+001F: US (unit separator)
    InformationSeparatorOne = 31,
    /// U+0020: `' '` (space)
    Space = 32,
    /// U+0021: `!`
    ExclamationMark = 33,
    /// U+0022: `"`
    QuotationMark = 34,
    /// U+0023: `#`
    NumberSign = 35,
    /// U+0024: `$`
    DollarSign = 36,
    /// U+0025: `%`
    PercentSign = 37,
    /// U+0026: `&`
    Ampersand = 38,
    /// U+0027: `'`
    Apostrophe = 39,
    /// U+0028: `(`
    LeftParenthesis = 40,
    /// U+0029: `)`
    RightParenthesis = 41,
    /// U+002A: `*`
    Asterisk = 42,
    /// U+002B: `+`
    PlusSign = 43,
    /// U+002C: `,`
    Comma = 44,
    /// U+002D: `-`
    HyphenMinus = 45,
    /// U+002E: `.`
    FullStop = 46,
    /// U+002F: `/`
    Solidus = 47,
    /// U+0030: `0`
    Digit0 = 48,
    /// U+0031: `1`
    Digit1 = 49,
    /// U+0032: `2`
    Digit2 = 50,
    /// U+0033: `3`
    Digit3 = 51,
    /// U+0034: `4`
    Digit4 = 52,
    /// U+0035: `5`
    Digit5 = 53,
    /// U+0036: `6`
    Digit6 = 54,
    /// U+0037: `7`
    Digit7 = 55,
    /// U+0038: `8`
    Digit8 = 56,
    /// U+0039: `9`
    Digit9 = 57,
    /// U+003A: `:`
    Colon = 58,
    /// U+003B: `;`
    Semicolon = 59,
    /// U+003C: `<`
    LessThanSign = 60,
    /// U+003D: `=`
    EqualsSign = 61,
    /// U+003E: `>`
    GreaterThanSign = 62,
    /// U+003F: `?`
    QuestionMark = 63,
    /// U+0040: `@`
    CommercialAt = 64,
    /// U+0041: `A`
    CapitalA = 65,
    /// U+0042: `B`
    CapitalB = 66,
    /// U+0043: `C`
    CapitalC = 67,
    /// U+0044: `D`
    CapitalD = 68,
    /// U+0045: `E`
    CapitalE = 69,
    /// U+0046: `F`
    CapitalF = 70,
    /// U+0047: `G`
    CapitalG = 71,
    /// U+0048: `H`
    CapitalH = 72,
    /// U+0049: `I`
    CapitalI = 73,
    /// U+004A: `J`
    CapitalJ = 74,
    /// U+004B: `K`
    CapitalK = 75,
    /// U+004C: `L`
    CapitalL = 76,
    /// U+004D: `M`
    CapitalM = 77,
    /// U+004E: `N`
    CapitalN = 78,
    /// U+004F: `O`
    CapitalO = 79,
    /// U+0050: `P`
    CapitalP = 80,
    /// U+0051: `Q`
    CapitalQ = 81,
    /// U+0052: `R`
    CapitalR = 82,
    /// U+0053: `S`
    CapitalS = 83,
    /// U+0054: `T`
    CapitalT = 84,
    /// U+0055: `U`
    CapitalU = 85,
    /// U+0056: `V`
    CapitalV = 86,
    /// U+0057: `W`
    CapitalW = 87,
    /// U+0058: `X`
    CapitalX = 88,
    /// U+0059: `Y`
    CapitalY = 89,
    /// U+005A: `Z`
    CapitalZ = 90,
    /// U+005B: `[`
    LeftSquareBracket = 91,
    /// U+005C: `\`
    ReverseSolidus = 92,
    /// U+005D: `]`
    RightSquareBracket = 93,
    /// U+005E: `^`
    CircumflexAccent = 94,
    /// U+005F: `_`
    LowLine = 95,
    /// U+0060: `` ` ``
    GraveAccent = 96,
    /// U+0061: `a`
    SmallA = 97,
    /// U+0062: `b`
    SmallB = 98,
    /// U+0063: `c`
    SmallC = 99,
    /// U+0064: `d`
    SmallD = 100,
    /// U+0065: `e`
    SmallE = 101,
    /// U+0066: `f`
    SmallF = 102,
    /// U+0067: `g`
    SmallG = 103,
    /// U+0068: `h`
    SmallH = 104,
    /// U+0069: `i`
    SmallI = 105,
    /// U+006A: `j`
    SmallJ = 106,
    /// U+006B: `k`
    SmallK = 107,
    /// U+006C: `l`
    SmallL = 108,
    /// U+006D: `m`
    SmallM = 109,
    /// U+006E: `n`
    SmallN = 110,
    /// U+006F: `o`
    SmallO = 111,
    /// U+0070: `p`
    SmallP = 112,
    /// U+0071: `q`
    SmallQ = 113,
    /// U+0072: `r`
    SmallR = 114,
    /// U+0073: `s`
    SmallS = 115,
    /// U+0074: `t`
    SmallT = 116,
    /// U+0075: `u`
    SmallU = 117,
    /// U+0076: `v`
    SmallV = 118,
    /// U+0077: `w`
    SmallW = 119,
    /// U+0078: `x`
    SmallX = 120,
    /// U+0079: `y`
    SmallY = 121,
    /// U+007A: `z`
    SmallZ = 122,
    /// U+007B: `{`
    LeftCurlyBracket = 123,
    /// U+007C: `|`
    VerticalLine = 124,
    /// U+007D: `}`
    RightCurlyBracket = 125,
    /// U+007E: `~`
    Tilde = 126,
    /// U+007F: DEL
    Delete = 127,
}
318
319impl AsciiChar {
320 /// Creates an ascii character from the byte `b`,
321 /// or returns `None` if it's too large.
322 #[must_use]
323 pub const fn from_u8(b: u8) -> Option<Self> {
324 match b {
325 0 => Some(Self::Null),
326 1 => Some(Self::StartOfHeading),
327 2 => Some(Self::StartOfText),
328 3 => Some(Self::EndOfText),
329 4 => Some(Self::EndOfTransmission),
330 5 => Some(Self::Enquiry),
331 6 => Some(Self::Acknowledge),
332 7 => Some(Self::Bell),
333 8 => Some(Self::Backspace),
334 9 => Some(Self::AsciiCharacterTabulation),
335 10 => Some(Self::LineFeed),
336 11 => Some(Self::LineTabulation),
337 12 => Some(Self::FormFeed),
338 13 => Some(Self::CarriageReturn),
339 14 => Some(Self::ShiftOut),
340 15 => Some(Self::ShiftIn),
341 16 => Some(Self::DataLinkEscape),
342 17 => Some(Self::DeviceControlOne),
343 18 => Some(Self::DeviceControlTwo),
344 19 => Some(Self::DeviceControlThree),
345 20 => Some(Self::DeviceControlFour),
346 21 => Some(Self::NegativeAcknowledge),
347 22 => Some(Self::SynchronousIdle),
348 23 => Some(Self::EndOfTransmissionBlock),
349 24 => Some(Self::Cancel),
350 25 => Some(Self::EndOfMedium),
351 26 => Some(Self::Substitute),
352 27 => Some(Self::Escape),
353 28 => Some(Self::InformationSeparatorFour),
354 29 => Some(Self::InformationSeparatorThree),
355 30 => Some(Self::InformationSeparatorTwo),
356 31 => Some(Self::InformationSeparatorOne),
357 32 => Some(Self::Space),
358 33 => Some(Self::ExclamationMark),
359 34 => Some(Self::QuotationMark),
360 35 => Some(Self::NumberSign),
361 36 => Some(Self::DollarSign),
362 37 => Some(Self::PercentSign),
363 38 => Some(Self::Ampersand),
364 39 => Some(Self::Apostrophe),
365 40 => Some(Self::LeftParenthesis),
366 41 => Some(Self::RightParenthesis),
367 42 => Some(Self::Asterisk),
368 43 => Some(Self::PlusSign),
369 44 => Some(Self::Comma),
370 45 => Some(Self::HyphenMinus),
371 46 => Some(Self::FullStop),
372 47 => Some(Self::Solidus),
373 48 => Some(Self::Digit0),
374 49 => Some(Self::Digit1),
375 50 => Some(Self::Digit2),
376 51 => Some(Self::Digit3),
377 52 => Some(Self::Digit4),
378 53 => Some(Self::Digit5),
379 54 => Some(Self::Digit6),
380 55 => Some(Self::Digit7),
381 56 => Some(Self::Digit8),
382 57 => Some(Self::Digit9),
383 58 => Some(Self::Colon),
384 59 => Some(Self::Semicolon),
385 60 => Some(Self::LessThanSign),
386 61 => Some(Self::EqualsSign),
387 62 => Some(Self::GreaterThanSign),
388 63 => Some(Self::QuestionMark),
389 64 => Some(Self::CommercialAt),
390 65 => Some(Self::CapitalA),
391 66 => Some(Self::CapitalB),
392 67 => Some(Self::CapitalC),
393 68 => Some(Self::CapitalD),
394 69 => Some(Self::CapitalE),
395 70 => Some(Self::CapitalF),
396 71 => Some(Self::CapitalG),
397 72 => Some(Self::CapitalH),
398 73 => Some(Self::CapitalI),
399 74 => Some(Self::CapitalJ),
400 75 => Some(Self::CapitalK),
401 76 => Some(Self::CapitalL),
402 77 => Some(Self::CapitalM),
403 78 => Some(Self::CapitalN),
404 79 => Some(Self::CapitalO),
405 80 => Some(Self::CapitalP),
406 81 => Some(Self::CapitalQ),
407 82 => Some(Self::CapitalR),
408 83 => Some(Self::CapitalS),
409 84 => Some(Self::CapitalT),
410 85 => Some(Self::CapitalU),
411 86 => Some(Self::CapitalV),
412 87 => Some(Self::CapitalW),
413 88 => Some(Self::CapitalX),
414 89 => Some(Self::CapitalY),
415 90 => Some(Self::CapitalZ),
416 91 => Some(Self::LeftSquareBracket),
417 92 => Some(Self::ReverseSolidus),
418 93 => Some(Self::RightSquareBracket),
419 94 => Some(Self::CircumflexAccent),
420 95 => Some(Self::LowLine),
421 96 => Some(Self::GraveAccent),
422 97 => Some(Self::SmallA),
423 98 => Some(Self::SmallB),
424 99 => Some(Self::SmallC),
425 100 => Some(Self::SmallD),
426 101 => Some(Self::SmallE),
427 102 => Some(Self::SmallF),
428 103 => Some(Self::SmallG),
429 104 => Some(Self::SmallH),
430 105 => Some(Self::SmallI),
431 106 => Some(Self::SmallJ),
432 107 => Some(Self::SmallK),
433 108 => Some(Self::SmallL),
434 109 => Some(Self::SmallM),
435 110 => Some(Self::SmallN),
436 111 => Some(Self::SmallO),
437 112 => Some(Self::SmallP),
438 113 => Some(Self::SmallQ),
439 114 => Some(Self::SmallR),
440 115 => Some(Self::SmallS),
441 116 => Some(Self::SmallT),
442 117 => Some(Self::SmallU),
443 118 => Some(Self::SmallV),
444 119 => Some(Self::SmallW),
445 120 => Some(Self::SmallX),
446 121 => Some(Self::SmallY),
447 122 => Some(Self::SmallZ),
448 123 => Some(Self::LeftCurlyBracket),
449 124 => Some(Self::VerticalLine),
450 125 => Some(Self::RightCurlyBracket),
451 126 => Some(Self::Tilde),
452 127 => Some(Self::Delete),
453 _ => None,
454 }
455 }
456
457 /// Creates an ASCII character from the byte `b`,
458 /// without checking whether it's valid.
459 /// # Safety
460 /// `b` must be in `0..=127`, or else this is UB.
461 #[must_use]
462 #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
463 #[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "unsafe_str")))]
464 pub const unsafe fn from_u8_unchecked(b: u8) -> Self {
465 // SAFETY: Our safety precondition is that `b` is in-range.
466 unsafe { transmute(b) }
467 }
468
469 /// When passed the *number* `0`, `1`, …, `9`, returns the *character*
470 /// `'0'`, `'1'`, …, `'9'` respectively.
471 ///
472 /// If `d >= 10`, returns `None`.
473 ///
474 /// # Features
475 /// Uses `unsafe_hint` for performance optimizations.
476 #[must_use]
477 pub const fn digit(d: u8) -> Option<Self> {
478 if d < 10 {
479 let sum = {
480 #[cfg(any(feature = "safe_text", not(feature = "unsafe_hint")))]
481 {
482 b'0' + d
483 }
484 #[cfg(all(not(feature = "safe_text"), feature = "unsafe_hint"))]
485 // SAFETY: we've checked d < 10
486 unsafe {
487 b'0'.unchecked_add(d)
488 }
489 };
490 Self::from_u8(sum)
491 } else {
492 None
493 }
494 }
495
496 /// When passed the *number* `0`, `1`, …, `9`, returns the *character*
497 /// `'0'`, `'1'`, …, `'9'` respectively, without checking that it's in-range.
498 ///
499 /// # Safety
500 /// This is immediate UB if called with `d > 64`.
501 ///
502 /// If `d >= 10` and `d <= 64`, this is allowed to return any value or panic.
503 /// Notably, it should not be expected to return hex digits, or any other
504 /// reasonable extension of the decimal digits.
505 ///
506 /// (This lose safety condition is intended to simplify soundness proofs
507 /// when writing code using this method, since the implementation doesn't
508 /// need something really specific, not to make those other arguments do
509 /// something useful. It might be tightened before stabilization.)
510 #[must_use]
511 #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
512 #[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "unsafe_str")))]
513 pub const unsafe fn digit_unchecked(d: u8) -> Self {
514 debug_assert!(d < 10);
515
516 // SAFETY: `'0'` through `'9'` are U+00030 through U+0039,
517 // so because `d` must be 64 or less the addition can return at most
518 // 112 (0x70), which doesn't overflow and is within the ASCII range.
519 unsafe {
520 let byte = b'0'.unchecked_add(d);
521 Self::from_u8_unchecked(byte)
522 }
523 }
524
525 /// Gets this ASCII character as a byte.
526 #[must_use]
527 pub const fn as_u8(self) -> u8 {
528 self as u8
529 }
530
531 /// Gets this ASCII character as a `char` Unicode Scalar Value.
532 #[must_use]
533 pub const fn as_char(self) -> char {
534 self as u8 as char
535 }
536
537 /// Views this ASCII character as a one-code-unit UTF-8 `str`.
538 #[must_use]
539 #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
540 #[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "unsafe_str")))]
541 pub const fn as_str(&self) -> &str {
542 Self::slice_as_str(core::slice::from_ref(self))
543 }
544}
545
546impl AsciiChar {
547 /// Views a slice of ASCII characters as a UTF-8 `str`.
548 #[must_use]
549 #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
550 #[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "unsafe_str")))]
551 pub const fn slice_as_str(slice: &[AsciiChar]) -> &str {
552 let ascii_ptr: *const [AsciiChar] = slice;
553 let str_ptr = ascii_ptr as *const str;
554 // SAFETY: Each ASCII codepoint in UTF-8 is encoded as one single-byte
555 // code unit having the same value as the ASCII byte.
556 unsafe { &*str_ptr }
557 }
558
559 /// Views a slice of ASCII characters as a slice of `u8` bytes.
560 #[must_use]
561 #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
562 #[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "unsafe_str")))]
563 pub const fn slice_as_bytes(slice: &[AsciiChar]) -> &[u8] {
564 AsciiChar::slice_as_str(slice).as_bytes()
565 }
566}
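// NOTE: the block below is kept for reference only. An inherent impl on the
// slice type `[AsciiChar]` cannot be written outside of `core`, so the same
// functionality is exposed above as the associated functions
// `AsciiChar::slice_as_str` and `AsciiChar::slice_as_bytes`.
//
// impl [AsciiChar] {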
568// /// Views this slice of ASCII characters as a UTF-8 `str`.
569// #[must_use]
570// pub const fn as_str(&self) -> &str {
571// let ascii_ptr: *const Self = self;
572// let str_ptr = ascii_ptr as *const str;
573// // SAFETY: Each ASCII codepoint in UTF-8 is encoded as one single-byte
574// // code unit having the same value as the ASCII byte.
575// unsafe { &*str_ptr }
576// }
577//
578// /// Views this slice of ASCII characters as a slice of `u8` bytes.
579// #[must_use]
580// pub const fn as_bytes(&self) -> &[u8] {
581// self.as_str().as_bytes()
582// }
583// }
584
585impl fmt::Display for AsciiChar {
586 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
587 fmt::Display::fmt(&self.as_char(), f)
588 }
589}
590
591impl ConstDefault for AsciiChar {
592 const DEFAULT: Self = AsciiChar::Null;
593}
594
// Every variant's discriminant lies in `0..=127`, which fits in 7 bits.
// NOTE(review): presumably `bit_sized!` registers that 7-bit width for
// `AsciiChar`; confirm against the macro's definition.
#[cfg(feature = "bit")]
crate::bit_sized![= 7; for AsciiChar];