codepoint.fz
# This file is part of the Fuzion language implementation.
#
# The Fuzion language implementation is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 3 of the License.
#
# The Fuzion language implementation is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
# License for more details.
#
# You should have received a copy of the GNU General Public License along with The
# Fuzion language implementation. If not, see <https://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------
#
# Tokiwa Software GmbH, Germany
#
# Source code of Fuzion standard library feature codepoint
#
# Author: Fridtjof Siebert (siebert@tokiwa.software)
#
# -----------------------------------------------------------------------
# codepoint -- represents a unicode codepoint
#
# A codepoint is a String whose utf8 representation is the UTF-8 encoding of
# the single unicode scalar value `val`.
#
# The preconditions require `val` to be within `codepoint.range` and not to be
# a utf16 surrogate.
#
public codepoint(public val u32) : String
  pre
    debug: (codepoint.range.contains val)
    debug: !codepoint.utf16_surrogate.contains val
is

  # the utf8 encoded bytes for the string representation
  # of this codepoint
  #
  # The result consists of one to four bytes, depending on which of the
  # `utf8_encoded_in_*_byte(s)` ranges contains `val`.
  #
  # The final else-branch can only be reached if `val` is outside of
  # `codepoint.range`, i.e., when the preconditions were not checked.
  #
  public redef utf8 Sequence u8 =>
    if      codepoint.utf8_encoded_in_one_byte   .contains val then [ val.low8bits ]
    else if codepoint.utf8_encoded_in_two_bytes  .contains val then [ (u32 0xc0 | (val >>  6) & 0x1f).low8bits,   # 110xxxxx
                                                                       (u32 0x80 |  val        & 0x3f).low8bits ]  # 10xxxxxx
    # the lead byte of a three byte sequence is 1110xxxx, it carries only 4 payload bits
    else if codepoint.utf8_encoded_in_three_bytes.contains val then [ (u32 0xe0 | (val >> 12) & 0x0f).low8bits,   # 1110xxxx
                                                                       (u32 0x80 | (val >>  6) & 0x3f).low8bits,   # 10xxxxxx
                                                                       (u32 0x80 |  val        & 0x3f).low8bits ]  # 10xxxxxx
    else if codepoint.utf8_encoded_in_four_bytes .contains val then [ (u32 0xf0 | (val >> 18) & 0x07).low8bits,   # 11110xxx
                                                                       (u32 0x80 | (val >> 12) & 0x3f).low8bits,   # 10xxxxxx
                                                                       (u32 0x80 | (val >>  6) & 0x3f).low8bits,   # 10xxxxxx
                                                                       (u32 0x80 |  val        & 0x3f).low8bits ]  # 10xxxxxx
    else
      # report the numeric value instead of `codepoint.this`: creating the
      # message from this codepoint itself would require its utf8 bytes and
      # hence end up in this failing branch again.
      fuzion.std.panic "failed to encode code point {val}"


  # is this an ASCII code encoded in one byte
  #
  public redef is_ascii => codepoint.ascii.contains val


  # range of permitted values for a codepoint
  #
  public type.range => u32 0 .. 0x10ffff


  # range of ASCII values, these are the values encoded in one byte
  #
  public type.ascii => u32 0 .. 0x7f


  # 0 to 9
  public type.ascii_digit => u32 0x30 .. 0x39


  # A to Z (uppercase only)
  public type.A_to_Z => u32 0x41 .. 0x5A


  # a to z (lowercase only)
  public type.a_to_z => u32 0x61 .. 0x7A


  # a-z and A-Z
  # https://en.wikipedia.org/wiki/ISO_basic_Latin_alphabet
  public type.latin_alphabet => codepoint.A_to_Z.concat_sequences a_to_z  # NYI It would be better to use union of those two sets


  # range of values encoded in one byte
  #
  public type.utf8_encoded_in_one_byte => ascii


  # range of values encoded in two bytes
  #
  public type.utf8_encoded_in_two_bytes => u32 0x80 .. 0x7ff


  # range of values encoded in three bytes
  #
  public type.utf8_encoded_in_three_bytes => u32 0x800 .. 0xffff


  # range of values encoded in four bytes
  #
  public type.utf8_encoded_in_four_bytes => u32 0x10000 .. 0x10ffff


  # range reserved for utf16 surrogate pairs
  #
  public type.utf16_surrogate => u32 0xd800 .. 0xdfff


  # values guaranteed never to be a legal unicode character
  #
  public type.not_a_character => u32 0xfffe .. 0xffff


  # return the number of bytes of a utf-8 encoded character
  # by examining its first byte
  #
  # The result is 1, 2, 3 or 4 for a first byte in 0x00..0x7F, 0xC2..0xDF,
  # 0xE0..0xEF or 0xF0..0xF4, respectively.  For any other value, e.g.,
  # 0x80..0xC1 or 0xF5..0xFF, the result is an error.
  #
  # NYI: implement num_utf8_bytes(first_byte u8) => (~first_byte).leading_zeroes+1.
  module type.num_utf8_bytes (first_byte u8) outcome i32 =>
    if      ((u8    0)..0x7F).contains first_byte
      1
    else if ((u8 0xC2)..0xDF).contains first_byte
      2
    else if ((u8 0xE0)..0xEF).contains first_byte
      3
    else if ((u8 0xF0)..0xF4).contains first_byte
      4
    else
      error "first byte is not the start of utf-8 character."


  # compare two codepoints for equality
  #
  # result is true iff the codepoints have the same value
  #
  fixed type.equality(a, b codepoint) => a.val = b.val


  # compare two codepoints
  #
  # This defines a total order over codepoints given by their numeric value,
  # which is unrelated to alphabetic order.
  #
  fixed type.lteq(a, b codepoint) => a.val <= b.val
last changed: 2024-03-07