String.fz
# This file is part of the Fuzion language implementation.
#
# The Fuzion language implementation is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 3 of the License.
#
# The Fuzion language implementation is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
# License for more details.
#
# You should have received a copy of the GNU General Public License along with The
# Fuzion language implementation. If not, see <https://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------
#
# Tokiwa Software GmbH, Germany
#
# Source code of Fuzion standard library feature string
#
# Author: Fridtjof Siebert (siebert@tokiwa.software)
#
# -----------------------------------------------------------------------
# string -- immutable sequences of utf8 encoded unicode characters
#
public String ref : property.equatable, property.hashable, property.orderable is
# converting a string to a string is just returning string.this
public redef as_string String => String.this
# any concrete string must implement utf8
public utf8 Sequence u8 => abstract
# is this string empty?
public is_empty => utf8.is_empty
# returns true if string is empty or contains whitespace only
public is_blank => utf8 ∀ u -> u.is_ascii_white_space
# returns true if string contains whitespace
public contains_whitespace => utf8 ∃ u -> u.is_ascii_white_space
# length of this string in bytes
public byte_length => utf8.count
# length of this string in codepoints
public codepoint_length => as_codepoints.count
# concatenate string with string representation of another object
public infix + (other Any) String =>
String.concat String.this other.as_string
# repeat string given number of times
public infix * (n i32) ref : String
pre
n ≥ 0
# NYI String.this.utf8.finite, Strings where utf8 is a list are currently always infinite.
is
redef utf8 Sequence u8 =>
bytes := String.this.utf8
bytes.cycle.take (bytes.count * n)
# equality: compare two strings byte-by-byte
#
# result is true iff the strings have the same number of utf8 bytes and those
# bytes are equal.
#
public fixed type.equality(a, b String) =>
((a.utf8.zip b.utf8 aa,bb->aa=bb) ∀ x->x)
& a.utf8.count = b.utf8.count
# is `a` less than or equal to `b` when comparing their utf8 bytes?
#
# This defines a total order over strings that is unrelated to alphabetic order.
#
fixed type.lteq(a, b String) =>
a.utf8
.zip b.utf8 (c,d)->(c,d)
.filter (x ->
(c, d) := x
c != d)
.map (x ->
(c, d) := x
c ≤ d)
# if all bytes are equal lengths of strings might still differ
.first a.utf8.count≤b.utf8.count
# create hash code from a string
#
public type.hash_code(a String.this) u64 =>
sh_l := u64 13
sh_r := u64 51
for
h u64 := 0, ((h << sh_l) | (h >> sh_r)) ^ b.as_u64;
b in a.utf8
while true
else
h
# internal helper to create error for failed parsing
#
private parse_error(msg String) => error "failed to parse '{String.this}': $msg"
# parse this string as a signed 32-bit integer value
#
public parse_i32 => parse_i32 10
public parse_i32_binary => parse_i32 2
public parse_i32_octal => parse_i32 8
public parse_i32_hex => parse_i32 16
public parse_i32 (base i32) outcome i32
pre 1 < base ≤ 36
=>
parse_integer i32 base
# parse this string as an unsigned 32-bit integer value
#
public parse_u32 => parse_u32 10
public parse_u32_binary => parse_u32 2
public parse_u32_octal => parse_u32 8
public parse_u32_hex => parse_u32 16
public parse_u32 (base u32) outcome u32
pre u32 1 < base ≤ (u32 36)
=>
parse_integer u32 base
# parse this string as a signed 64-bit integer value
#
public parse_i64 => parse_i64 10
public parse_i64_binary => parse_i64 2
public parse_i64_octal => parse_i64 8
public parse_i64_hex => parse_i64 16
public parse_i64 (base i64) outcome i64
pre i64 1 < base ≤ (i64 36)
=>
parse_integer i64 base
# parse this string as an unsigned 64-bit integer value
#
public parse_u64 => parse_u64 10
public parse_u64_binary => parse_u64 2
public parse_u64_octal => parse_u64 8
public parse_u64_hex => parse_u64 16
public parse_u64 (base u64) outcome u64
pre u64 1 < base ≤ (u64 36)
=>
parse_integer u64 base
# parse this string as an int value of arbitrary size
#
public parse_int => parse_int int(10)
public parse_int_binary => parse_int int(2)
public parse_int_octal => parse_int int(8)
public parse_int_hex => parse_int int(16)
public parse_int (base int) outcome int
pre (int 1) < base ≤ int 36
=>
parse_integer int base
# parse this string as a integer value given as type parameter
#
public parse_integer(
# the integer type
T type : integer,
# base gives the base of the integer, must be between 2 and 36, inclusive.
base T
) outcome T
pre T.one < base ≤ T.from_u32 36
=>
match utf8.as_list
nil => parse_error "empty string"
c Cons =>
negate := c.head = String.minus_char
d := if (negate || c.head = String.plus_char) String.zero_char else c.head
parse_integer T base negate T.zero d c.tail
# recursive helper for parse_integer T
#
private parse_integer(
# the integer type
T type : integer,
# base gives the base, between 2 and 36
base T,
# do we parse a negative number?
neg bool,
# the value of the highest digits already parsed
hi num_option T,
# the current character to be parsed
c u8,
# the remaining characters to be parsed
s list u8
) outcome T
=>
d outcome u8 :=
if (String.zero_char ≤ c ≤ String.nine_char ) c - String.zero_char
else if (String.a_char ≤ c ≤ String.z_char ) c - String.a_char + 10
else if (String.cap_a_char ≤ c ≤ String.cap_z_char) c - String.cap_a_char + 10
else parse_error "non-digit found"
d.bind T b->
t := parse_integer.this.T.from_u32 b.as_u32 # i converted to T
if t ≥ base
parse_error "invalid integer digit for base $base"
else
hi := hi *? base;
v := if (neg) hi -? t
else hi +? t
match s
c Cons =>
parse_integer T base neg v c.head c.tail
nil =>
v ? nil => parse_error "numerical overflow"
| u T => u
# convert this string into an array of codepoints.
#
codepoint_array => as_codepoints.as_array
# convert this string into a Sequence of codepoint and errors for encoding problems
# found in the underlying utf8 bytes
#
public as_codepoints Sequence codepoint =>
codepoints_and_errors
.map (x ->
match x
c codepoint => c
e error => codepoint 0xFFFD # 'REPLACEMENT CHARACTER' (U+FFFD)
)
# replaces invalid UTF-8 byte sequences in this string with the Unicode
# replacement character (U+FFFD).
to_valid_utf8 String =>
to_valid_utf8 (codepoint 0xFFFD) # 'REPLACEMENT CHARACTER'
# replaces invalid UTF-8 byte sequences in this string with the given
# string.
to_valid_utf8(replacement String) String =>
codepoints_and_errors
.reduce "" r,x->
match x
c codepoint => r + c
e error => r + replacement
# convert this string into a list of codepoint and errors for encoding problems
# found in the underlying utf8 bytes
#
public codepoints_and_errors list (outcome codepoint) =>
codepoints_and_errors utf8.as_list
# the list instance returned by as_codepoints
#
private codepoints_and_errors(l list u8) list (outcome codepoint) =>
match l
nil => nil
c1 Cons =>
# return list of c and rest
ret(c outcome codepoint, rest list u8) list (outcome codepoint) =>
ref : Cons (outcome codepoint) (list (outcome codepoint))
head => c
tail => codepoints_and_errors rest
p := codepoint.type
e(msg String) => error "Bad UTF8 encoding found: cannot decode $msg"
b1 := c1.head
e1(msg String) => ret (e "$b1: $msg") c1.tail
# UTF-8 definition taken from https://en.wikipedia.org/wiki/UTF-8
if b1.as_u32 ∈ p.utf8_encoded_in_one_byte # ASCII
ret (codepoint b1.as_u32) c1.tail
else if 0xc0 ≤ b1 ≤ 0xf4
match c1.tail
nil => e1 "end of String, expected continuation byte"
c2 Cons =>
b2 := c2.head
e2(msg String) => ret (e "$b1, $b2: $msg") c2.tail
if (b2 & 0xc0) != 0x80
e2 "expected continuation byte in the range 0x80..0xbf."
else if 0xc0 ≤ b1 ≤ 0xdf # 0x0080..0x7ff encoded in 2 bytes
res := (b1.as_u32 & 0x1f) << 6 | (b2.as_u32 & 0x3f)
if res ∉ p.utf8_encoded_in_two_bytes
e2 "codepoint $res uses overlong 2-byte encoding, allowed for range {p.utf8_encoded_in_two_bytes}."
else
ret (codepoint res) c2.tail
else if u8 0xe0 ≤ b1
match c2.tail
nil => e2 "end of String, expected continuation byte"
c3 Cons =>
b3 := c3.head
e3(msg String) => ret (e "$b1, $b2, $b3: $msg") c3.tail
if (b3 & 0xc0) != 0x80
e3 "expected two continuation bytes in the range 0x80..0xbf."
else if b1 ≤ 0xef # 0x0800..0xffff encoded in 3 bytes
res := (((b1.as_u32 & 0x0f) << 12) |
((b2.as_u32 & 0x3f) << 6) |
((b3.as_u32 & 0x3f) ) )
if res ∉ p.utf8_encoded_in_three_bytes
e3 "codepoint $res uses overlong 3-byte encoding, allowed for range {p.utf8_encoded_in_two_bytes}."
else if res ∈ p.utf16_surrogate
e3 "codepoint $res is invalid, values in the range {p.utf16_surrogate} are reserved for UTF-16 surrogate halves."
else if res ∈ p.not_a_character
e3 "codepoint $res is not a valid unicode character {p.not_a_character}."
else
ret (codepoint res) c3.tail
else # 0x010000..0x10ffff encoded in 4 bytes
match c3.tail
nil => e3 "end of String, expected continuation byte"
c4 Cons =>
b4 := c4.head
e4(msg String) => ret (e "$b1, $b2, $b3, $b4: $msg") c4.tail
if (b4 & 0xc0) != 0x80
e4 "expected three continuation bytes in the range 0x80..0xbf."
else
res := (((b1.as_u32 & 0x07) << 18) |
((b2.as_u32 & 0x3f) << 12) |
((b3.as_u32 & 0x3f) << 6) |
((b4.as_u32 & 0x3f) ) )
if res ∉ p.utf8_encoded_in_four_bytes
e4 "codepoint $res uses overlong 4-byte encoding."
else if res > (u32 0x10ffff)
e4 "codepoint $res is outside of the allowed range for codepoints 0x000000..0x10ffff."
else
ret (codepoint res) c4.tail
else fuzion.std.panic "String.codepoints_and_errors: missing case for $b1"
else if 0x80 ≤ b1 ≤ 0xbf then e1 "stray continuation byte without preceding leading byte."
else if 0xf5 ≤ b1 ≤ 0xfd then e1 "codes 0xf5..0xfd are undefined."
else if 0xfe ≤ b1 ≤ 0xff then e1 "codes 0xfe and 0xff are undefined, used for endianess checking."
else
fuzion.std.panic "String.codepoints_and_errors: missing case for $b1"
# create substring of this string consisting of bytes from (inclusive) .. to (exclusive).
#
public substring(from, to i32) String
pre
debug: 0 ≤ from ≤ to ≤ String.this.byte_length
# NYI check if from/to are valid start/end bytes?
=>
String.from_bytes (String.this.utf8.slice from to)
# create substring of this string consisting of bytes from (inclusive) .. byte_length (exclusive).
#
public substring(from i32) String
pre
debug: 0 ≤ from ≤ byte_length
=>
substring from byte_length
# create substring of this string consisting of codepoints from (inclusive) .. to (exclusive).
#
public substring_codepoint(from, to i32) String
pre
debug: 0 ≤ from ≤ to ≤ String.this.codepoint_length
=>
codepoint_array
.slice from to
.map String c->c # NYI: this should maybe not be needed since codepoint is a string
.fold String.concat
# create substring of this string consisting of codepoints from (inclusive) .. codepoint_length (exclusive).
#
public substring_codepoint(from i32) String
pre
debug: 0 ≤ from ≤ codepoint_length
=>
substring from codepoint_length
# check if this string starts with given prefix
#
public starts_with(prefx String) =>
(container.searchable_sequence utf8).starts_with prefx.utf8
# check if this string ends with given suffix
#
public ends_with(suffix String) =>
l := byte_length
sl := suffix.byte_length
end := utf8.drop l-sl
(container.searchable_sequence end).starts_with suffix.utf8
# find (utf8-byte-) index of 'substring' witin this string.
#
public find(substring String) =>
(container.searchable_sequence utf8).find substring.utf8
# find (utf8-byte-) index of 'substring' witin this string.
#
public find(substring String,
# start search at this index
from i32)
=>
(container.searchable_sequence (utf8.drop from)).find substring.utf8
# find (utf8-byte-) index of last occurrence of 'substring'
# within this string.
public find_last(substring String) option i32 =>
find_last substring nil
# find (utf8-byte-) index of last occurrence of 'substring'
# within this string.
private find_last(substring String, found option i32) option i32 =>
match find substring
nil => found >>= (pos -> pos - substring.byte_length)
idx i32 =>
skip := idx + substring.byte_length
s := String.from_bytes (utf8.drop skip)
s.find_last substring (skip + (found.get 0))
# replace all occurrences of old by new
#
public replace (old, new String) => String.from_bytes ((container.searchable_sequence utf8).replace old.utf8 new.utf8)
# replace the first n occurrences of old by new
public replace(old, new String, n u64) => String.from_bytes ((container.searchable_sequence utf8).replace old.utf8 new.utf8 n)
# does this string contain the given 'substring'
#
public contains (substring String) => find(substring).exists
# count number of occurrences of given 'substring' in this string
#
public count (substring String) =>
(container.searchable_sequence utf8).count_matches substring.utf8
# Split string separated by (ASCII) white space
#
# Leading and trailing white space is ignored, repeated white space is treated
# like a single white space
#
# The result is a, possibly empty, list of separate non-empty strings.
#
public split list String =>
l := utf8.as_list.drop_while (c -> c.is_ascii_white_space)
if l.is_empty
nil
else
h := String.from_bytes (l.take_while (c -> !c.is_ascii_white_space)).as_array
t := (String.from_bytes (l.drop_while (c -> !c.is_ascii_white_space))).split
ref : Cons String (list String)
head => h
tail => t
# split string at s
#
public split(s String) list String
pre
!s.is_empty
=>
split0 s nil false
# split string after s, that is do the same thing as split but
# include the separator s in the resulting strings
#
public split_after(s String) list String
pre
!s.is_empty
=>
split0 s nil true
# split string at s, for at most n occurrences of s
#
# if s occurs in the string less than n times, the resulting list will have
# less than n elements
#
public split_n(s String, n u32) list String
pre
debug: !s.is_empty
debug: n > (u32 0)
=>
split0 s n false
# split string after s, for at most n occurrences of s
#
# if s occurs in the string less than n times, the resulting list will have
# less than n elements
#
public split_after_n(s String, n u32) list String
pre
!s.is_empty
n > (u32 0)
=>
split0 s n true
# split string at s, if there is no limit, otherwise if limit is an integer n,
# for at most n occurrences of s
#
# if split_after is true, all but the last element of the resulting list include
# the separator
#
# helper feature which unifies the code of the different split features in one
#
private split0(s String, limit option u32, split_after bool) list String
pre
debug: !s.is_empty
debug: match limit
nil => true
n u32 => n > (u32 0)
=>
(container.searchable_sequence utf8).split0 s.utf8 limit split_after
.map_to_list x->(String.from_bytes x)
# remove leading and trailing white space from this string
#
public trim String =>
s0 := utf8
s1 := (s0.drop_while c->c.is_ascii_white_space).reverse
s2 := (s1.drop_while c->c.is_ascii_white_space).reverse
String.from_bytes s2
# remove leading white space from this string
#
trim_start =>
String.from_bytes (utf8.drop_while c->c.is_ascii_white_space)
# remove trailing white space from this string
#
trim_end =>
String.from_bytes (utf8.as_list.reverse.drop_while c->c.is_ascii_white_space).reverse
# pad this string at the end with spaces such that its `codepoint_length` is at least `n`.
#
public pad(n i32)
pre
debug: n >= 0
post
debug: result.codepoint_length >= n
debug: ((result.codepoint_length != n): result = String.this)
=>
pad " " n
# pad this string at the beginning with spaces such that its `codepoint_length` is at least `n`.
#
public pad_left(n i32)
pre
debug: n >= 0
post
debug: result.codepoint_length >= n
debug: ((result.codepoint_length != n): result = String.this)
=>
pad_left " " n
# pad this string at the beginning and at the end with spaces such that its `codepoint_length` is at least `n`.
#
# In case the required number of codepoints to add is odd, the padding at the end will be longer.
#
public pad_center(n i32)
pre
debug: n >= 0
post
debug: result.codepoint_length >= n
debug: ((result.codepoint_length != n): result = String.this)
=>
pad_center " " n
# Helper for pad, pad_left and pad_right to determine minimum number of copies
# of `p` needed to pad String.this to length >= n.
#
private pad_count(p String, n i32) =>
pl := p.codepoint_length
add_cps := max 0 n-codepoint_length
(add_cps + pl - 1) / pl
# pad this string at the end with `p` such that its `codepoint_length` is at least `n`.
#
public pad(p String, n i32)
pre
safety: p.codepoint_length > 0
debug: n >= 0
post
debug: result.codepoint_length >= n
debug: ((result.codepoint_length >= n+p.codepoint_length): result = String.this)
=>
String.this + p * (pad_count p n)
# pad this string at the beginning with `p` such that its `codepoint_length` is at least `n`.
#
public pad_left(p String, n i32)
pre
safety: p.codepoint_length > 0
debug: n >= 0
post
debug: result.codepoint_length >= n
debug: ((result.codepoint_length >= n+p.codepoint_length): result = String.this)
=>
p * (pad_count p n) + String.this
# pad this string at the beginning and at the end with `p` such that its `codepoint_length` is at least `n`.
#
# In case the required number of copies of `p` is odd, the padding at the end will be longer.
#
public pad_center(p String, n i32)
pre
safety: p.codepoint_length > 0
debug: n >= 0
post
debug: result.codepoint_length >= n
debug: ((result.codepoint_length >= n+p.codepoint_length): result = String.this)
=>
pn := pad_count p n
l := pn/2
r := pn-l
p*l + String.this + p*r
# is this part of given set
#
element_of(s container.Set String) => s.contains String.this
public infix ∈ (s container.Set String) => String.this.element_of s
# is this not part of given set
#
not_element_of(s container.Set String) => !element_of s
public infix ∉ (s container.Set String) => String.this.not_element_of s
# return string of at least length l by
# padding codepoint s to start of string
#
public pad_codepoint_start(l i32, s String) String
pre s.codepoint_length = 1
=>
missing := l - codepoint_length
if missing ≤ 0
String.this
else
(s * missing) + String.this
# Splits this string at codepoints where p is true and returns the result as a
# list of strings. In case multiple, neighboring codepoints in the string are
# evaluated to be true by p, this does not cause empty strings to be added to
# the result list, rather this case is being treated as being one big separator.
#
public fields_func(p codepoint -> bool) list String =>
state_wrapper(l list String, in_run bool, start_pos, current_pos i32) is
i := state_wrapper (list String).empty false 0 0
last_state := as_codepoints.reduce state_wrapper i ((r, c) ->
match p c
TRUE =>
match r.in_run
TRUE => state_wrapper r.l true r.current_pos (r.current_pos + 1)
FALSE => state_wrapper (r.l ++ [(substring_codepoint r.start_pos (r.current_pos))].as_list) true r.current_pos (r.current_pos + 1)
FALSE =>
match r.in_run
TRUE => state_wrapper r.l false r.current_pos (r.current_pos + 1)
FALSE => state_wrapper r.l false r.start_pos (r.current_pos + 1))
match last_state.in_run
TRUE => last_state.l
FALSE => last_state.l ++ [(substring_codepoint last_state.start_pos)]
# Cuts out the first appearance of the string sep from this string, in other words,
# returns a tuple of two strings and a bool, the first string is the substring before
# the first appreance of sep, the second string is the substring after the first
# appearance of sep. The bool result is true iff sep appears in this string.
#
# If sep does not appear in this string at all, return this string as the first string,
# the empty string as the second, and false as the bool.
#
public cut(sep String) tuple String String bool =>
match find sep
nil => (String.this.as_string, "", false)
i i32 =>
l := byte_length
sepl := sep.byte_length
before := String.from_bytes (utf8.slice 0 i)
after := String.from_bytes (utf8.slice (i + sepl) l)
(before, after, true)
# convert this string to upper case
public upper_case String =>
String.from_codepoints (as_codepoints.map (c -> character_encodings.unicode.mappings.upper_case[c.val].get c))
# convert this string to lower case
public lower_case String =>
String.from_codepoints (as_codepoints.map (c -> character_encodings.unicode.mappings.lower_case[c.val].get c))
# Does this String consist of nothing but ascii codepoints?
#
public is_ascii =>
as_codepoints ∀ cp -> cp.is_ascii
# monoid of strings with infix + operation.
#
public type.concat : Monoid String is
redef infix ∙ (a, b String) => a + b
redef e => ""
# monoid of strings with infix '+ sep +' operation, i.e., concatenate with
# given separator
#
public type.concat(sep String) : Monoid String is
redef infix ∙ (a, b String) String => if (a.is_empty || b.is_empty) a + b else a + sep + b
redef e => ""
# concat strings a and b by
# concatenating their byte sequences.
#
public type.concat(a, b String) String =>
ref : String
utf8 Sequence u8 => a.utf8 ++ b.utf8
# Takes a sequence of strings and concatenates its elements, while adding the separator
# sep in between its elements. In case an empty sequence is given, returns the empty string.
#
public type.join(elems Sequence String, sep String) String =>
(elems.as_list.intersperse sep).fold concat
# Takes a sequence of strings and concatenates its elements.
# In case an empty sequence is given, returns the empty string.
#
public type.join(elems Sequence String) String =>
elems.fold concat
# create string by concatenating the results of $a[a.indices].
#
# This uses a growing array if further strings are appended using 'infix +',
# so it avoids quadratic runtime caused if each 'infix +' would create its
# own concatenation-string.
#
# The performance of creating a string a0+a1+a2+...+a<n> is in O(n) since the
# backing array is shared and doubled in size when full (so the final array size
# is less than 2n in size and the sum of all arrays is less than 4n = 2n + n +
# n/2+n/4+...).
#
# The performance of iterating the utf8 bytes of a string is O(l+n) for an
# array of length l created by concatenating n sub-strings.
#
public type.from_mutable_array(a Mutable_Array Any) ref : String is // NYI: Remove?
redef utf8 Sequence u8 =>
a.flat_map u8 ai->ai.as_string.utf8
redef infix + (other Any) String =>
a.add other
from_mutable_array a
# create string from the given utf8 bytes
#
public type.from_bytes(utf8_bytes Sequence u8) String =>
ref : String
redef utf8 := utf8_bytes.as_array_backed
# create string from the given codepoints
#
public type.from_codepoints(codepoints Sequence codepoint) String =>
ref : String
utf8 Sequence u8 =>
codepoints
.flat_map u8 x->x.utf8
.as_array_backed
# NYI: remove the convenience functions when Fuzion supports char literals
#
public type.minus_char => "-".utf8.first
public type.plus_char => "+".utf8.first
public type.zero_char => "0".utf8.first
public type.nine_char => "9".utf8.first
public type.a_char => "a".utf8.first
public type.z_char => "z".utf8.first
public type.cap_a_char => "A".utf8.first
public type.cap_z_char => "Z".utf8.first