# This file is part of the Fuzion language implementation.
# The Fuzion language implementation is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 3 of the License.
# The Fuzion language implementation is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
# License for more details.
# You should have received a copy of the GNU General Public License along with The
# Fuzion language implementation. If not, see <https://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------
# Tokiwa Software GmbH, Germany
# Source code of Fuzion standard library feature codepoint
# Author: Fridtjof Siebert
# -----------------------------------------------------------------------
# codepoint -- represents a unicode codepoint
#
# val is the numeric value of this codepoint.  A codepoint is a String.
#
# The two `debug:`-qualified conditions below require val to lie within
# codepoint.range and outside of codepoint.utf16_surrogate.
#
# NOTE(review): no `pre` / `is` keywords are visible around these conditions
# in this copy of the file -- confirm against the upstream source.
#
public codepoint(public val u32) : String
debug: (codepoint.range.contains val)
debug: !codepoint.utf16_surrogate.contains val
# the utf8 encoded bytes for the string representation
# of this codepoint
#
# The length of the encoding is selected by the range `val` falls into.  The
# lead byte consists of a length marker plus the topmost payload bits, every
# following byte consists of the marker 0x80 plus six payload bits:
#
#   one byte     0xxxxxxx
#   two bytes    110xxxxx 10xxxxxx
#   three bytes  1110xxxx 10xxxxxx 10xxxxxx
#   four bytes   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
#
# The lead byte of the three byte form carries four payload bits, so it is
# masked with 0x0f.
#
# returns the encoded bytes, panics if `val` is in none of the four ranges.
#
public redef utf8 Sequence u8 =>
  if      codepoint.utf8_encoded_in_one_byte   .contains val then [ val.low8bits ]
  else if codepoint.utf8_encoded_in_two_bytes  .contains val then [ (u32 0xc0 | (val >>  6) & 0x1f).low8bits,
                                                                    (u32 0x80 |  val        & 0x3f).low8bits ]
  else if codepoint.utf8_encoded_in_three_bytes.contains val then [ (u32 0xe0 | (val >> 12) & 0x0f).low8bits,
                                                                    (u32 0x80 | (val >>  6) & 0x3f).low8bits,
                                                                    (u32 0x80 |  val        & 0x3f).low8bits ]
  else if codepoint.utf8_encoded_in_four_bytes .contains val then [ (u32 0xf0 | (val >> 18) & 0x07).low8bits,
                                                                    (u32 0x80 | (val >> 12) & 0x3f).low8bits,
                                                                    (u32 0x80 | (val >>  6) & 0x3f).low8bits,
                                                                    (u32 0x80 |  val        & 0x3f).low8bits ]
  else
    # NOTE(review): the message interpolates codepoint.this, whose string form
    # presumably needs utf8 again -- confirm that this cannot recurse.
    fuzion.std.panic "failed to encode code point {codepoint.this}"
# does the value of this codepoint lie within codepoint.ascii, i.e.,
# is it an ASCII code that is encoded in a single byte?
#
public redef is_ascii =>
  codepoint.ascii.contains val
# all values permitted for a codepoint: 0 up to and including 0x10ffff
public type.range => (u32 0)..0x10ffff

# the ASCII values 0 up to and including 0x7f, encoded in one byte
public type.ascii => (u32 0)..0x7f

# the ASCII digits 0 to 9
public type.ascii_digit => (u32 0x30)..0x39

# the uppercase letters A to Z
public type.A_to_Z => (u32 0x41)..0x5a

# the lowercase letters a to z
public type.a_to_z => (u32 0x61)..0x7a

# the letters A-Z followed by the letters a-z
public type.latin_alphabet => codepoint.A_to_Z.concat_sequences codepoint.a_to_z # NYI It would be better to use union of those two sets
# values whose utf8 encoding is a single byte, same as codepoint.ascii
public type.utf8_encoded_in_one_byte => codepoint.ascii

# values whose utf8 encoding consists of two bytes
public type.utf8_encoded_in_two_bytes => (u32 0x80)..0x7ff

# values whose utf8 encoding consists of three bytes
public type.utf8_encoded_in_three_bytes => (u32 0x800)..0xffff

# values whose utf8 encoding consists of four bytes
public type.utf8_encoded_in_four_bytes => (u32 0x10000)..0x10ffff

# values set aside for utf16 surrogate pairs
public type.utf16_surrogate => (u32 0xd800)..0xdfff

# the two values 0xfffe and 0xffff, which are guaranteed never to be
# a legal unicode character
public type.not_a_character => (u32 0xfffe)..0xffff
# return the number of bytes of this utf-8 character
# by examining the first byte
#
# first_byte is the first (lead) byte of an encoded character.
#
# The result is 1 for 0x00..0x7F, 2 for 0xC2..0xDF, 3 for 0xE0..0xEF and
# 4 for 0xF0..0xF4.  Any other value (a continuation byte 0x80..0xBF, the
# overlong lead bytes 0xC0 and 0xC1, or a value above 0xF4) results in an
# error.
#
# NYI: implement num_utf8_bytes(first_byte u8) => (~first_byte).leading_zeroes+1.
#
module type.num_utf8_bytes (first_byte u8) outcome i32 =>
  if      ((u8 0)..0x7F).contains first_byte
    1
  else if ((u8 0xC2)..0xDF).contains first_byte
    2
  else if ((u8 0xE0)..0xEF).contains first_byte
    3
  else if ((u8 0xF0)..0xF4).contains first_byte
    4
  else
    error "first byte is not the start of utf-8 character."
# equality of two codepoints
#
# a and b are equal iff their numeric values `val` are the same
#
fixed type.equality(a, b codepoint) =>
  a.val = b.val
# is codepoint a less than or equal to codepoint b?
#
# The comparison is done on the numeric values `val`, which defines a
# total order over codepoints that is unrelated to alphabetic order.
#
fixed type.lteq(a, b codepoint) =>
  a.val <= b.val
# last changed: 2024-03-07