devela/text/char/namespace.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
// devela::text::char::namespace
//
//! `Char` namespace.
//
/// A namespace for Unicode scalar related operations.
///
/// This is a unit struct that carries no data; the operations visible in
/// this file are exposed as associated `const fn`s on it.
///
/// See also [`ExtMem`][crate::ExtMem].
pub struct Char;
impl Char {
/// Returns how many bytes are required to hold the unicode scalar `code` as a raw value.
///
/// Values up to `0xFF` need 1 byte, values up to `0xFFFF` need 2 bytes,
/// and every larger value is reported as 3 bytes.
///
/// Note that this is the width of the numeric value, not the length of its UTF-8 encoding.
#[must_use]
pub const fn byte_len(code: u32) -> usize {
    if code <= 0xFF {
        1
    } else if code <= 0xFFFF {
        2
    } else {
        3
    }
}
/// Checks whether the unicode scalar `code` lies in the 7-bit ASCII range (`0..=0x7F`).
#[must_use]
pub const fn is_7bit(code: u32) -> bool {
    matches!(code, 0..=0x7F)
}
/// Returns `true` if the given unicode scalar `code` is a [noncharacter][0].
///
/// The following values are reported:
/// - the contiguous range `U+FDD0..=U+FDEF` (32 code points);
/// - the last two code points of every plane (`U+nFFFE` and `U+nFFFF`);
/// - the range `U+2FE0..=U+2FEF`, which this function additionally treats
///   as potential noncharacters.
///
/// Surrogates (`0xD800..=0xDFFF`) are not reported, and `code` is not checked
/// against the upper bound of the Unicode code space.
///
/// [0]: https://www.unicode.org/glossary/#noncharacter
#[must_use]
// FIXME: make a version that checks for surrogates
pub const fn is_noncharacter(code: u32) -> bool {
    // The plane-final pair is identified by the low 16 bits (the offset inside
    // the plane). Testing only the low 8 bits would also match ordinary
    // assigned scalars such as U+1F4FF or U+100FE.
    let plane_offset = code & 0xFFFF;

    // sub-block of 32 non-characters:
    (code >= 0xFDD0 && code <= 0xFDEF)
        // 2× non-characters at the end of each plane:
        || plane_offset == 0xFFFE
        || plane_offset == 0xFFFF
        // unallocated range (16 potential non-characters):
        || (code >= 0x2FE0 && code <= 0x2FEF)
    // surrogates (0xD800..=0xDFFF) are already filtered out in `char`.
}
/// Returns the number of bytes needed to store the given unicode scalar `code`,
/// already UTF-8 encoded in 2 bytes.
///
/// `code[0]` is taken as the leading byte and always counts. `code[1]` counts
/// only when it is a UTF-8 continuation byte (`0b10xx_xxxx`), so a zero
/// padding byte yields a length of 1.
///
/// The input is expected to hold a single encoded scalar starting at index 0,
/// with any unused trailing byte set to 0; it is not otherwise validated.
#[must_use]
pub const fn utf8_2bytes_len(code: [u8; 2]) -> u8 {
    // A continuation byte carries the `10` marker in its two top bits.
    // Padding (0) never matches, so no separate non-zero check is needed.
    1 + (code[1] & 0b1100_0000 == 0b1000_0000) as u8
}
/// Returns the number of bytes needed to store the given unicode scalar `code`,
/// already UTF-8 encoded in 3 bytes.
///
/// `code[0]` is taken as the leading byte and always counts. Each of `code[1]`
/// and `code[2]` counts only when it is a UTF-8 continuation byte
/// (`0b10xx_xxxx`), so zero padding bytes are not counted.
///
/// The input is expected to hold a single encoded scalar starting at index 0,
/// with any unused trailing bytes set to 0; it is not otherwise validated.
#[must_use]
pub const fn utf8_3bytes_len(code: [u8; 3]) -> u8 {
    // A continuation byte carries the `10` marker in its two top bits.
    // Padding (0) never matches, so no separate non-zero check is needed.
    1 + (code[1] & 0b1100_0000 == 0b1000_0000) as u8
        + (code[2] & 0b1100_0000 == 0b1000_0000) as u8
}
/// Returns the number of bytes needed to store the given unicode scalar `code`,
/// already UTF-8 encoded in 4 bytes.
///
/// `code[0]` is taken as the leading byte and always counts. Each of `code[1]`,
/// `code[2]` and `code[3]` counts only when it is a UTF-8 continuation byte
/// (`0b10xx_xxxx`), so zero padding bytes are not counted.
///
/// The input is expected to hold a single encoded scalar starting at index 0,
/// with any unused trailing bytes set to 0; it is not otherwise validated.
#[must_use]
pub const fn utf8_4bytes_len(code: [u8; 4]) -> u8 {
    // A continuation byte carries the `10` marker in its two top bits.
    // Padding (0) never matches, so no separate non-zero check is needed.
    1 + (code[1] & 0b1100_0000 == 0b1000_0000) as u8
        + (code[2] & 0b1100_0000 == 0b1000_0000) as u8
        + (code[3] & 0b1100_0000 == 0b1000_0000) as u8
}
/// Computes how many bytes the UTF-8 encoding of the unicode scalar `code` occupies.
///
/// The result is always between 1 and 4 (inclusive).
#[must_use] #[rustfmt::skip]
pub const fn len_to_utf8(code: char) -> usize {
    match code as u32 {
        0..=0x7F => 1,
        0x80..=0x7FF => 2,
        0x800..=0xFFFF => 3,
        _ => 4,
    }
}
/// Encodes the given `char` as UTF-8 into a fixed 4-byte array.
///
/// The encoded sequence starts at index 0 and takes between 1 and 4 bytes;
/// any trailing bytes not used by the encoding are left as 0.
///
/// See also [`char::encode_utf8`].
#[must_use]
// The binary literals are grouped to mirror the UTF-8 bit layout (marker | payload).
#[allow(clippy::unusual_byte_groupings)]
pub const fn to_utf8_bytes(c: char) -> [u8; 4] {
    // Marker of a continuation byte (`10xxxxxx`) and the mask for its 6 payload bits.
    const CONT: u8 = 0b10_000000;
    const LOW6: u32 = 0b0011_1111;

    let scalar = c as u32;

    // Successive 6-bit groups of the scalar, lowest first, tagged as continuation bytes.
    let g0 = CONT | (scalar & LOW6) as u8;
    let g1 = CONT | ((scalar >> 6) & LOW6) as u8;
    let g2 = CONT | ((scalar >> 12) & LOW6) as u8;

    if scalar <= 0x007F {
        // 0x0000..=0x007F: the encoding is the scalar value itself.
        [scalar as u8, 0, 0, 0]
    } else if scalar <= 0x07FF {
        // 0x0080..=0x07FF: 110xxxxx 10xxxxxx
        [0b110_00000 | (scalar >> 6) as u8, g0, 0, 0]
    } else if scalar <= 0xFFFF {
        // 0x0800..=0xFFFF: 1110xxxx 10xxxxxx 10xxxxxx
        [0b1110_0000 | (scalar >> 12) as u8, g1, g0, 0]
    } else {
        // 0x10000..=0x10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        [0b11110_000 | (scalar >> 18) as u8, g2, g1, g0]
    }
}
}