Fuzion • APIs • Module nom • nom/parsers/xml.fz
nom/parsers/xml.fz


# This file is part of the Fuzion language implementation.
#
# The Fuzion language implementation is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 3 of the License.
#
# The Fuzion language implementation is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
# License for more details.
#
# You should have received a copy of the GNU General Public License along with The
# Fuzion language implementation.  If not, see <https://www.gnu.org/licenses/>.


# -----------------------------------------------------------------------
#
#  Tokiwa Software GmbH, Germany
#
#  Source code of a simple (incomplete) XML parser built with nom
#
# -----------------------------------------------------------------------


# a deserialized XML node
#
public xml_node(public name String, public attributes container.Map String String, public data Sequence (choice nom.parsers.xml_node String))
  pre
    # tag names and attribute names must not contain whitespace
    debug: !name.contains_whitespace
    debug: attributes.items ∀ kv -> !kv.0.contains_whitespace
    # text children must already be trimmed and must not be blank
    debug: data ∀ child->
      match child
        text String => text=text.trim && !text.is_blank
        * => true
is


  # NYI: UNDER DEVELOPMENT:
  # public access(seq Sequence (choice String i32)) option (Sequence xml_node)
  #   pre
  #     debug: !seq.is_empty
  # =>


  # render this node, including all of its children, as a String
  # starting at indentation level 0
  #
  public redef as_string String =>
    as_string 0


  # render this node as a String
  #
  # `indent` is the nesting level of this node, every level is
  # rendered as two spaces. Children are rendered one level deeper,
  # one child per line.
  #
  as_string(indent i32) String =>

    # leading whitespace of this node's own opening and closing tag
    pad => "  " * indent

    # leading whitespace of text children
    child_pad => "  " * (indent+1)

    # all attributes in the form key="value", separated by a single space
    attr_text =>
      attributes
        .items
        .map kv->
          "{kv.0}=\"{kv.1}\""
        .as_string " "

    # all children, one per line
    body_text =>
      data
        .map child->
          match child
            node nom.parsers.xml_node => node.as_string (indent+1)
            text String =>
              # text is emitted unescaped, so it must not contain markup characters
              check debug: !(text.contains "<")
              check debug: !(text.contains ">")
              "{child_pad}{text}"
        .as_string "\n"

    # opening tag, without a trailing space if there are no attributes
    open_tag => if attributes.is_empty then "<{name}>" else "<{name} {attr_text}>"

    "{pad}{open_tag}\n{body_text}\n{pad}</{name}>"



# xml parser
#
# NYI: UNDER DEVELOPMENT: incomplete! Can only parse a small subset of XML.
#
# usage : NYI: UNDER DEVELOPMENT:
#
# Specification: https://www.w3.org/TR/REC-xml/
#
public xml Parser String (Sequence codepoint) (Sequence nom.parsers.xml_node) =>

  # parser for <? ... ?>
  #
  # matches the leading `<?`, the remainder up to and including `?>`
  # is handled by `anything_but_question_end`
  #
  parser_question_mark =>
    preceded (tag "<?") anything_but_question_end

  # NYI
  # # <! ...  >
  # #
  # parser_exclamation_mark =>
  #   delimited (tag "<!")

  # parser for <!-- ...  -->
  #
  # matches the leading `<!--`, the remainder up to and including `-->`
  # is handled by `anything_but_comment_end`
  #
  parser_comment =>
    preceded (tag "<!--") anything_but_comment_end


  # parser matching anything until -->
  #
  # on success, the rest of the input starts right after the first `-->`
  # and the output is `unit`. If the input contains no `-->`, the
  # result is an error.
  #
  anything_but_comment_end =>
    # NYI: technically incorrect, consider e.g. strings
    parser (Sequence codepoint) (Sequence codepoint) unit input->
      match input.find "-->".codepoints
        nil => (error "--> not found")
        # n is where `-->` was found, +3 additionally drops the `-->` itself
        n i32 => success (input.drop n+3) unit


  # parser matching anything until ?>
  #
  # on success, the rest of the input starts right after the first `?>`
  # and the output is `unit`. If the input contains no `?>`, the
  # result is an error.
  #
  anything_but_question_end =>
    # NYI: technically incorrect, consider e.g. strings
    parser (Sequence codepoint) (Sequence codepoint) unit input->
      match input.find "?>".codepoints
        nil => (error "?> not found")
        # n is where `?>` was found, +2 additionally drops the `?>` itself
        n i32 => success (input.drop n+2) unit


  # parser for a name+attributes of a tag
  #
  # the output is a tuple of the name and the attributes
  #
  parser_name_and_attributes =>
    tuple2 parser_name parser_attributes


  # parse and discard ignored stuff like comments
  #
  # currently these are comments `<!-- ... -->` and `<? ... ?>`,
  # each optionally surrounded by whitespace
  #
  ignored =>
    many0 <| alt [
                  delimited whitespace0 parser_comment whitespace0,
                  delimited whitespace0 parser_question_mark whitespace0
                  # delimited whitespace0 parser_exclamation_mark whitespace0
                 ]


  # parser for an opening tag
  #
  # example: <some_name attri1="hello" attri2="world">
  #
  parser_opening_tag =>
    delimited (tag "<") parser_name_and_attributes (tag ">")


  # parser for a closing tag
  #
  # example: </some_name >
  #
  # the output is a tuple whose first element is the name of the tag
  #
  parser_closing_tag =>
    delimited (tag "</") (tuple2 parser_name whitespace0) (tag ">")


  # parser for a self-closing tag (tag that has no children)
  #
  # example: <some_name attri1="hello" attri2="world"/>
  #
  # the output is an xml_node with the parsed name and attributes
  # and without any children
  #
  parser_self_closing_tag =>
    delimited (tag "<") parser_name_and_attributes (tag "/>")
      .map o->
        nom.parsers.xml_node o.0 o.1 []


  # parser for a tag including its children
  #
  # parses an opening tag, then any sequence of texts and nested tags,
  # then a closing tag that may be preceded by whitespace.
  #
  # The name of the closing tag must be equal to the name of the opening
  # tag (XML well-formedness constraint "Element Type Match"), otherwise
  # the result is an error.
  #
  parser_full_tag Parser (Sequence codepoint) (Sequence codepoint) nom.parsers.xml_node =>
    parse(input Sequence codepoint) parse_result (Sequence codepoint) nom.parsers.xml_node =>
      # s1: opening tag, s2: children, s3: closing tag
      parser_opening_tag.call input .bind s1->
        xml_or_text_seq.call s1.rest .bind s2->
          (preceded whitespace0 parser_closing_tag).call s2.rest .bind s3->
            # `id` is used to make both branches result in `parse_result`
            if s3.out.0 = s1.out.0
              id (parse_result (Sequence codepoint) nom.parsers.xml_node) (success s3.rest (nom.parsers.xml_node s1.out.0 s1.out.1 s2.out))
            else
              id (parse_result (Sequence codepoint) nom.parsers.xml_node) (error "closing tag </{s3.out.0}> does not match opening tag <{s1.out.0}>")
    parser (Sequence codepoint) _ _ parse


  # parser for a name (attribute name or tag name)
  #
  # a name is the longest non-empty prefix of the input that consists of
  # ASCII letters, ASCII digits and the characters `:`, `_`, `-` and `.`
  #
  # NYI: UNDER DEVELOPMENT: this does not enforce the stricter rule for the
  # first character of a name and does not accept non-ASCII name characters.
  #
  parser_name =>
    parser (Sequence codepoint) (Sequence codepoint) String  input->
      name := input.take_while (x -> x.is_ascii_alpha_num || x = ":" || x = "_" || x = "-" || x = ".")
      if name.is_empty
        error "empty name"
      else
        success (input.drop name.count) (String.from name)


  # parser for attributes of a tag
  #
  # parses zero or more attributes, each optionally surrounded by
  # whitespace, and collects them in a map from name to value
  #
  parser_attributes =>
    many0 (delimited whitespace0 parser_attribute whitespace0)
      .map (o -> container.map_of o.as_array)


  # parser for one attribute of a tag
  #
  # example: attr="hello"
  #
  # the output is a tuple of name and value, the `=` is dropped
  #
  parser_attribute =>
    tuple3 parser_name (tag "=") parser_xml_string
      .map t->
        (t.0,t.2)


  # parser for a string enclosed in double quotes
  #
  # the output is the text between the quotes, which may be empty
  #
  parser_xml_string =>
    delimited (tag "\"") (take_while0 codepoint (c -> c != "\"")) (tag "\"")
      .map (s -> String.from s)


  # parser for text, ends at first <
  #
  # the output is the trimmed text
  #
  parser_text =>
    take_while1 codepoint (c -> c != "<")
      .map o->
        String.from o .trim


  # parser for sequence of texts/xml_nodes
  #
  # every alternative is mapped to a one-element sequence of
  # `choice nom.parsers.xml_node String` such that all alternatives
  # have the same type, the sequences are flattened at the end
  #
  xml_or_text_seq =>
    (many0 (preceded whitespace0 (alt [
      parser_text.map (Sequence (choice nom.parsers.xml_node String)) o->[id (choice nom.parsers.xml_node String) o],
      parser_self_closing_tag.map (Sequence (choice nom.parsers.xml_node String)) o->[id (choice nom.parsers.xml_node String) o],
      parser_full_tag.map (Sequence (choice nom.parsers.xml_node String)) o->[id (choice nom.parsers.xml_node String) o]
    ]))).map o->
      o.flat_map id


  # parse xml tags
  #
  # whitespace in front of a tag is skipped, such that input that starts
  # with whitespace or that has whitespace between top-level tags is
  # not cut short
  #
  xml_seq =>
    many0 (preceded whitespace0 (alt [
        parser_self_closing_tag,
        parser_full_tag
      ]))


  # map parser to take string input instead of Sequence of codepoints
  #
  to_string_input (preceded ignored xml_seq)
last changed: 2026-02-23