devela/text/char/impls/
c16.rs

1// devela::text::char::impls::char16
2//
3//!
4//
5
6use super::*;
7#[cfg(feature = "ascii")]
8use crate::AsciiChar;
9use crate::{text::char::NonSurrogateU16, Char, DataOverflow};
10
11impl char16 {
12    /* private helper fns */
13
14    // SAFETY: this is not marked as unsafe because it's only used privately
15    // by this module for a few selected operations.
16    #[must_use]
17    const fn from_char_unchecked(c: char) -> char16 {
18        char16::new_unchecked(c as u32 as u16)
19    }
20
21    // useful because Option::<T>::unwrap is not yet stable as const fn
22    #[must_use]
23    const fn new_unchecked(value: u16) -> char16 {
24        #[cfg(any(feature = "safe_text", not(feature = "unsafe_niche")))]
25        if let Some(c) = NonSurrogateU16::new(value) {
26            char16(c)
27        } else {
28            unreachable![]
29        }
30        #[cfg(all(not(feature = "safe_text"), feature = "unsafe_niche"))]
31        unsafe {
32            char16(NonSurrogateU16::new_unchecked(value))
33        }
34    }
35
36    /* constants */
37
38    /// The lowest unicode scalar a `char16` can represent, `'\u{00}'`.
39    pub const MIN: char16 = char16::new_unchecked(0x0000);
40
41    /// The highest unicode scalar a `char16` can represent, `'\u{FFFF}'`.
42    ///
43    /// Note that `'\u{FFFF}'` is a *noncharacter*.
44    pub const MAX: char16 = char16::new_unchecked(0xFFFF);
45
46    /// `U+FFFD REPLACEMENT CHARACTER (�)` is used in Unicode to represent a decoding error.
47    pub const REPLACEMENT_CHARACTER: char16 =
48        char16::new_unchecked(char::REPLACEMENT_CHARACTER as u32 as u16);
49
50    /* conversions */
51
52    /// Converts an `AsciiChar` to `char16`.
53    #[must_use]
54    #[cfg(feature = "ascii")]
55    #[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "ascii")))]
56    pub const fn from_ascii_char(c: AsciiChar) -> char16 {
57        char16::new_unchecked(c as u8 as u16)
58    }
59
60    /// Converts a `char7` to `char16`.
61    #[must_use]
62    #[cfg(feature = "_char7")]
63    #[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "_char7")))]
64    pub const fn from_char7(c: char7) -> char16 {
65        char16::new_unchecked(c.0.get() as u16)
66    }
67    /// Converts a `char8` to `char16`.
68    #[must_use]
69    #[cfg(feature = "_char8")]
70    #[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "_char8")))]
71    pub const fn from_char8(c: char8) -> char16 {
72        char16::new_unchecked(c.0 as u16)
73    }
74    /// Tries to convert a `char` to `char16`.
75    ///
76    /// # Errors
77    /// Returns [`DataOverflow`] if the character can't fit in 16 bits.
78    pub const fn try_from_char(c: char) -> Result<char16, DataOverflow> {
79        if Char::byte_len(c as u32) <= 2 {
80            Ok(char16::new_unchecked(c as u32 as u16))
81        } else {
82            Err(DataOverflow(Some(c as u32 as usize)))
83        }
84    }
85
86    //
87
88    /// Tries to convert this `char16` to `AsciiChar`.
89    ///
90    /// # Errors
91    /// Returns [`DataOverflow`] if `self` can't fit in 7 bits.
92    ///
93    /// # Features
94    /// Makes use of the `unsafe_niche` feature if enabled.
95    #[cfg(feature = "ascii")]
96    #[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "ascii")))]
97    pub const fn try_to_ascii_char(self) -> Result<AsciiChar, DataOverflow> {
98        if Char::is_7bit(self.to_u32()) {
99            #[cfg(any(feature = "safe_text", not(feature = "unsafe_niche")))]
100            if let Some(c) = AsciiChar::from_u8(self.0.get() as u8) {
101                return Ok(c);
102            } else {
103                unreachable![]
104            }
105
106            #[cfg(all(not(feature = "safe_text"), feature = "unsafe_niche"))]
107            // SAFETY: we've already checked it's in range.
108            return Ok(unsafe { AsciiChar::from_u8_unchecked(self.0.get() as u8) });
109        }
110        Err(DataOverflow(Some(self.to_u32() as usize)))
111    }
112
113    /// Tries to convert this `char16` to `char7`.
114    ///
115    /// # Errors
116    /// Returns [`DataOverflow`] if `self` can't fit in 7 bits.
117    #[cfg(feature = "_char7")]
118    #[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "_char7")))]
119    pub const fn try_to_char7(self) -> Result<char7, DataOverflow> {
120        char7::try_from_char16(self)
121    }
122    /// Tries to convert this `char16` to `char8`.
123    ///
124    /// # Errors
125    /// Returns [`DataOverflow`] if `self` can't fit in 8 bits.
126    #[cfg(feature = "_char8")]
127    #[cfg_attr(feature = "nightly_doc", doc(cfg(feature = "_char8")))]
128    pub const fn try_to_char8(self) -> Result<char8, DataOverflow> {
129        char8::try_from_char16(self)
130    }
131    /// Converts this `char16` to `char`.
132    #[must_use]
133    #[rustfmt::skip]
134    pub const fn to_char(self) -> char {
135        // #[cfg(any(feature = "safe_text", not(feature = "unsafe_niche")))]
136        if let Some(c) = char::from_u32(self.0.get() as u32) { c } else { unreachable![] }
137
138        // WAIT: [stable const](https://github.com/rust-lang/rust/issues/89259)
139        // #[cfg(all(not(feature = "safe_text"), feature = "unsafe_niche"))]
140        // SAFETY: we've already checked we contain a valid char.
141        // return unsafe { char::from_u32_unchecked(self.0 as u32) };
142    }
143    /// Converts this `char16` to `u32`.
144    #[must_use]
145    pub const fn to_u32(self) -> u32 {
146        self.0.get() as u32
147    }
148
149    /// Converts this `char16` to an UTF-8 encoded sequence of bytes.
150    ///
151    /// Note that this function always returns a 3-byte array, but the actual
152    /// UTF-8 sequence may be shorter. The unused bytes are set to 0.
153    //
154    // https://en.wikipedia.org/wiki/UTF-8#Encoding
155    #[must_use]
156    #[allow(clippy::unusual_byte_groupings)]
157    pub const fn to_utf8_bytes(self) -> [u8; 3] {
158        let c = self.0.get();
159        match c {
160            // From 0x0000 to 0x007F:
161            // the UTF-8 encoding is the same as the scalar value.
162            0x0000..=0x007F => [c as u8, 0, 0],
163
164            // from 0x0080 to 0x07FF:
165            // the UTF-8 encoding is 110xxxxx 10xxxxxx,
166            // where xxxxx and xxxxxx are the bits of the scalar value.
167            0x0080..=0x07FF => {
168                let y = 0b10_000000 | (0b0011_1111 & (c as u8));
169                let x = 0b110_00000 | ((c >> 6) as u8);
170                [x, y, 0]
171            }
172
173            // From from 0x0800 to 0xFFFF:
174            // the UTF-8 encoding is 1110xxxx 10xxxxxx 10xxxxxx.
175            0x0800..=0xFFFF => {
176                let z = 0b10_000000 | (0b0011_1111 & (c as u8));
177                let y = 0b10_000000 | ((c >> 6) & 0b0011_1111) as u8;
178                let x = 0b1110_0000 | ((c >> 12) as u8);
179                [x, y, z]
180            }
181        }
182    }
183
184    //
185
186    /// Makes a copy of the value in its ASCII upper case equivalent.
187    ///
188    /// ASCII letters ‘a’ to ‘z’ are mapped to ‘A’ to ‘Z’, but non-ASCII letters
189    /// are unchanged.
190    #[must_use]
191    pub const fn to_ascii_uppercase(self) -> char16 {
192        Self::from_char_unchecked(char::to_ascii_uppercase(&self.to_char()))
193    }
194
195    /// Makes a copy of the value in its ASCII lower case equivalent.
196    ///
197    /// ASCII letters ‘A’ to ‘Z’ are mapped to ‘a’ to ‘z’, but non-ASCII letters
198    /// are unchanged.
199    #[must_use]
200    pub const fn to_ascii_lowercase(self) -> char16 {
201        Self::from_char_unchecked(char::to_ascii_lowercase(&self.to_char()))
202    }
203
204    /* queries */
205
206    /// Returns `true` if this unicode scalar is a [noncharacter][0].
207    ///
208    /// [0]: https://www.unicode.org/glossary/#noncharacter
209    #[must_use]
210    pub const fn is_noncharacter(self) -> bool {
211        Char::is_noncharacter(self.0.get() as u32)
212    }
213
214    /// Returns `true` if this unicode scalar is an [abstract character][0].
215    ///
216    /// [0]: https://www.unicode.org/glossary/#abstract_character
217    #[must_use]
218    pub const fn is_character(self) -> bool {
219        !self.is_noncharacter()
220    }
221
222    /// Checks if the value is within the ASCII range.
223    #[must_use]
224    pub const fn is_ascii(self) -> bool {
225        self.0.get() <= 0x7F
226    }
227}