diff options
| author | San Jacobs | 2025-12-11 20:44:02 +0100 |
|---|---|---|
| committer | San Jacobs | 2025-12-11 20:44:02 +0100 |
| commit | cb7ef81fd339199c69eccd93105c13d2a1f41f71 (patch) | |
| tree | 7109109bd690e91ff74ef2c09605e5626c1d9179 /src/wav/xml/xml_reader.odin | |
| parent | 00121d7c14a3bfa03c5eeb6c28b5edf060baf029 (diff) | |
| download | better-report-cb7ef81fd339199c69eccd93105c13d2a1f41f71.tar.gz better-report-cb7ef81fd339199c69eccd93105c13d2a1f41f71.tar.bz2 better-report-cb7ef81fd339199c69eccd93105c13d2a1f41f71.zip | |
We can now generate reports straight from the WAVs! No CSV needed!
Diffstat (limited to 'src/wav/xml/xml_reader.odin')
| -rw-r--r-- | src/wav/xml/xml_reader.odin | 628 |
1 files changed, 628 insertions, 0 deletions
// File: src/wav/xml/xml_reader.odin (added as a new file in this commit; blob c19cbf6).
package encoding_xml
/*
	An XML 1.0 / 1.1 parser

	2021-2022 Jeroen van Rijn <nom@duclavier.com>.
	available under Odin's license.

	List of contributors:
	- Jeroen van Rijn: Initial implementation.
*/

import "core:bytes"
import "core:encoding/entity"
import "base:intrinsics"
import "core:mem"
import "core:os"
import "core:strings"
import "base:runtime"

// Branch-prediction hint used on the hottest token check in `parse_bytes`.
likely :: intrinsics.expect

// Options used when the caller doesn't supply any: unsupported `<!...>` constructs are skipped, no DOCTYPE is enforced.
DEFAULT_OPTIONS :: Options{
	flags            = {.Ignore_Unsupported},
	expected_doctype = "",
}

Option_Flag :: enum {
	// If the caller says that input may be modified, we can perform in-situ parsing.
	// If this flag isn't provided, the XML parser first duplicates the input so that it can.
	Input_May_Be_Modified,

	// Document MUST start with `<?xml` prologue.
	Must_Have_Prolog,

	// Document MUST have a `<!DOCTYPE`.
	Must_Have_DocType,

	// By default we skip comments. Use this option to intern a comment on a parented Element.
	Intern_Comments,

	// How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
	// These two flags are mutually exclusive; see `validate_options`.
	Error_on_Unsupported,
	Ignore_Unsupported,

	// By default CDATA tags are passed-through as-is.
	// This option unwraps them when encountered.
	Unbox_CDATA,

	// By default SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed-through as-is.
	// This option decodes them when encountered.
	Decode_SGML_Entities,

	// If a tag body has a comment, it will be stripped unless this option is given.
	Keep_Tag_Body_Comments,
}
Option_Flags :: bit_set[Option_Flag; u16]

// A parsed XML document. Create one with `parse` / `load_from_file`, release it with `destroy`.
Document :: struct {
	// Flat element storage. Elements refer to each other by index (`Element_ID`); index 0 is the root.
	elements:      [dynamic]Element `fmt:"v,element_count"`,
	element_count: Element_ID,

	// Attributes of the `<?xml ... ?>` prologue, and the encoding derived from them.
	prologue: Attributes,
	encoding: Encoding,

	doctype: struct {
		// We only scan the <!DOCTYPE IDENT part and skip the rest.
		ident: string,
		rest:  string,
	},

	// If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
	// Otherwise they'll be in the element tree.
	comments: [dynamic]string `fmt:"-"`,

	// Internal
	tokenizer: ^Tokenizer    `fmt:"-"`,
	allocator: mem.Allocator `fmt:"-"`,

	// Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
	input:           []u8            `fmt:"-"`,
	// Strings produced by entity decoding; they don't alias `input` and are released in `destroy`.
	strings_to_free: [dynamic]string `fmt:"-"`,
}

// One node of the document tree: either a regular element or an interned comment.
Element :: struct {
	ident:   string,
	// Body text and child elements, in document order.
	value:   [dynamic]Value,
	attribs: Attributes,

	kind: enum {
		Element = 0,
		Comment,
	},
	// Index of the parent element. The root element is its own parent.
	parent: Element_ID,
}

// A piece of an element's content: a run of text, or the ID of a child element.
Value :: union {
	string,
	Element_ID,
}

Attribute :: struct {
	key: string,
	val: string,
}

Attributes :: [dynamic]Attribute

Options :: struct {
	flags:            Option_Flags,
	// When non-empty, the DOCTYPE (if present) and the root tag must both equal this.
	expected_doctype: string,
}

Encoding :: enum {
	Unknown,

	UTF_8,
	ISO_8859_1,

	// Aliases
	LATIN_1 = ISO_8859_1,
}

Error :: enum {
	// General return values.
	None = 0,
	General_Error,
	Unexpected_Token,
	Invalid_Token,

	// Couldn't find, open or read file.
	File_Error,

	// File too short.
	Premature_EOF,

	// XML-specific errors.
	No_Prolog,
	Invalid_Prolog,
	Too_Many_Prologs,

	No_DocType,
	Too_Many_DocTypes,
	DocType_Must_Preceed_Elements,

	// If a DOCTYPE is present _or_ the caller
	// asked for a specific DOCTYPE and the DOCTYPE
	// and root tag don't match, we return `.Invalid_DocType`.
	Invalid_DocType,

	Invalid_Tag_Value,
	Mismatched_Closing_Tag,

	Unclosed_Comment,
	Comment_Before_Root_Element,
	Invalid_Sequence_In_Comment,

	Unsupported_Version,
	Unsupported_Encoding,

	// <!FOO are usually skipped.
	Unhandled_Bang,

	Duplicate_Attribute,
	Conflicting_Options,
}

/*
	Parse an XML document held in memory.

	Inputs:
	- data:          The document bytes. Cloned first unless `.Input_May_Be_Modified` is set; with that flag
	                 the buffer is stored in the document as-is and `destroy` will `delete` it.
	- options:       Parser flags and optional expected DOCTYPE. Checked by `validate_options`.
	- path:          Forwarded to the tokenizer's `init`.
	- error_handler: Forwarded to the tokenizer's `init`.
	- allocator:     Used for the document and everything it owns.

	Returns:
	- doc: The document. It is `nil` only when the options are rejected; on every later error the partially
	       built document is returned and should still be passed to `destroy`.
	- err: `.None` on success. Malformed constructs without a dedicated error report `.Unexpected_Token`.
*/
parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
	data := data
	context.allocator = allocator

	opts := validate_options(options) or_return

	// If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place.
	if .Input_May_Be_Modified not_in opts.flags {
		data = bytes.clone(data)
	}

	t := new(Tokenizer)
	init(t, string(data), path, error_handler)

	doc = new(Document)
	doc.allocator = allocator
	doc.tokenizer = t
	doc.input     = data

	// NOTE: The array starts out with 1024 zeroed slots, so `len(doc.elements)` is NOT the number of
	// parsed elements. Use `doc.element_count` for that. It is trimmed to size before a successful return.
	doc.elements = make([dynamic]Element, 1024, 1024, allocator)

	// Bare `return` statements below rely on this default.
	err = .Unexpected_Token
	element, parent: Element_ID
	open: Token

	// If a DOCTYPE is present, the root tag has to match.
	// If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
	expected_doctype := options.expected_doctype

	loop: for {
		skip_whitespace(t)
		switch t.ch {
		case '<':
			// Consume peeked `<`
			advance_rune(t)

			open = scan(t)
			// NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
			if likely(open.kind, Token_Kind.Ident) == .Ident {
				// e.g. <odin - Start of new element.
				element = new_element(doc)
				if element == 0 { // First Element
					parent = element
				} else {
					append(&doc.elements[parent].value, element)
				}

				doc.elements[element].parent = parent
				doc.elements[element].ident  = open.text

				parse_attributes(doc, &doc.elements[element].attribs) or_return

				// If a DOCTYPE is present _or_ the caller
				// asked for a specific DOCTYPE and the DOCTYPE
				// and root tag don't match, we return `.Invalid_DocType`.
				if element == 0 { // Root tag?
					if len(expected_doctype) > 0 && expected_doctype != open.text {
						//error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
						return doc, .Invalid_DocType
					}
				}

				// One of these should follow:
				// - `>`,  which means we've just opened this tag and expect a later element to close it.
				// - `/>`, which means this is an 'empty' or self-closing tag.
				end_token := scan(t)
				#partial switch end_token.kind {
				case .Gt:
					// We're now the new parent.
					parent = element

				case .Slash:
					// Empty tag. Close it.
					expect(t, .Gt) or_return
					parent  = doc.elements[element].parent
					element = parent

				case:
					//error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
					return
				}

			} else if open.kind == .Slash {
				// Close tag.
				ident := expect(t, .Ident) or_return
				_      = expect(t, .Gt)    or_return

				if doc.elements[element].ident != ident.text {
					//error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
					return doc, .Mismatched_Closing_Tag
				}
				parent  = doc.elements[element].parent
				element = parent

			} else if open.kind == .Exclaim {
				// <!
				next := scan(t)
				#partial switch next.kind {
				case .Ident:
					switch next.text {
					case "DOCTYPE":
						if len(doc.doctype.ident) > 0 {
							return doc, .Too_Many_DocTypes
						}
						if doc.element_count > 0 {
							return doc, .DocType_Must_Preceed_Elements
						}
						parse_doctype(doc) or_return

						if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
							//error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
							return doc, .Invalid_DocType
						}
						// From here on the root tag has to match the DOCTYPE we just read.
						expected_doctype = doc.doctype.ident

					case:
						if .Error_on_Unsupported in opts.flags {
							//error(t, t.offset, "Unhandled: <!%v\n", next.text)
							return doc, .Unhandled_Bang
						}
						skip_element(t) or_return
					}

				case .Dash:
					// Comment: <!-- -->.
					// The grammar does not allow a comment to end in --->
					// A comment opens with two dashes; `<!-` followed by anything else is malformed.
					expect(t, .Dash) or_return
					comment := scan_comment(t) or_return

					if .Intern_Comments in opts.flags {
						// `doc.elements` is pre-sized, so its length can't tell us whether an element
						// exists yet. `element_count` can.
						if doc.element_count == 0 {
							// No root element yet: keep the comment on the document itself.
							append(&doc.comments, comment)
						} else {
							el := new_element(doc)
							doc.elements[el].parent = element
							doc.elements[el].kind   = .Comment
							append(&doc.elements[el].value, comment)
							append(&doc.elements[element].value, el)
						}
					}

				case .Open_Bracket:
					// This could be a CDATA tag part of a tag's body. Unread the `<![`
					t.offset -= 3

					// Instead of calling `parse_body` here, we could also `continue loop`
					// and fall through to the `case:` at the bottom of the outer loop.
					// This makes the intent clearer.
					parse_body(doc, element, opts) or_return

				case:
					//error(t, t.offset, "Unexpected Token after <!: %#v", next)
				}

			} else if open.kind == .Question {
				// <?xml
				next := scan(t)
				#partial switch next.kind {
				case .Ident:
					if len(next.text) == 3 && strings.equal_fold(next.text, "xml") {
						if len(doc.prologue) > 0 {
							// We've already seen a prologue.
							return doc, .Too_Many_Prologs
						}
						parse_prologue(doc) or_return
					} else {
						// Could be `<?xml-stylesheet`, etc. Ignore it, whether or not a prologue preceded it.
						skip_element(t) or_return
					}
				case:
					//error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
					return
				}

			} else {
				//error(t, t.offset, "Invalid Token after <: %#v\n", open)
				return
			}

		case -1:
			// End of file.
			break loop

		case:
			// This should be a tag's body text.
			parse_body(doc, element, opts) or_return
		}
	}

	if .Must_Have_Prolog in opts.flags && len(doc.prologue) == 0 {
		return doc, .No_Prolog
	}

	if .Must_Have_DocType in opts.flags && len(doc.doctype.ident) == 0 {
		return doc, .No_DocType
	}

	// Trim the pre-sized element array down to the elements actually parsed.
	resize(&doc.elements, int(doc.element_count))
	return doc, .None
}

/*
	Parse an XML document held in a string. Thin wrapper around `parse_bytes`.

	The string's bytes are reinterpreted, not copied, before being handed over; `parse_bytes` clones them
	unless `.Input_May_Be_Modified` is set. With that flag the string's memory is stored in the document
	and `destroy` will `delete` it, so only set the flag for memory the document may own.
*/
parse_string :: proc(data: string, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
	_data := transmute([]u8)data

	return parse_bytes(_data, options, path, error_handler, allocator)
}

parse :: proc { parse_string, parse_bytes }

/*
	Load an XML file and parse it.

	The file is read with `allocator` and parsed in place (`.Input_May_Be_Modified` is forced on), so the
	returned document owns the file buffer and `destroy` releases it.

	Returns `nil, .File_Error` if the file can't be read. Otherwise returns whatever `parse_bytes` returns.
*/
load_from_file :: proc(filename: string, options := DEFAULT_OPTIONS, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
	context.allocator = allocator
	options := options

	data, data_ok := os.read_entire_file(filename)
	if !data_ok { return nil, .File_Error }

	options.flags += { .Input_May_Be_Modified }

	doc, err = parse_bytes(data, options, filename, error_handler, allocator)
	if doc == nil {
		// The parser bailed before taking ownership of the buffer (e.g. conflicting options),
		// so nobody else is going to free it.
		delete(data)
	}
	return
}

/*
	Release a document and everything it owns: elements, attributes, comments, decoded strings,
	the input buffer and the tokenizer. Passing `nil` is a no-op.

	Memory that isn't stored in a dynamic array is freed with the allocator recorded in the document
	at parse time, falling back to `context.allocator` if none was recorded.
*/
destroy :: proc(doc: ^Document) {
	if doc == nil { return }

	// Free with the allocator the document was created with, not whatever happens to be ambient.
	if doc.allocator.procedure != nil {
		context.allocator = doc.allocator
	}

	for el in doc.elements {
		delete(el.attribs)
		delete(el.value)
	}
	delete(doc.elements)

	delete(doc.prologue)
	delete(doc.comments)
	delete(doc.input)

	for s in doc.strings_to_free {
		delete(s)
	}
	delete(doc.strings_to_free)

	free(doc.tokenizer)
	free(doc)
}

/*
	Helpers.
*/

// Reject contradictory option sets. Currently only `.Error_on_Unsupported` + `.Ignore_Unsupported` conflict.
validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
	validated = options

	if .Error_on_Unsupported in validated.flags && .Ignore_Unsupported in validated.flags {
		return options, .Conflicting_Options
	}
	return validated, .None
}

// Scan the next token and require it to be of `kind`.
// The scanned token is returned either way; `err` is `.Unexpected_Token` on a mismatch.
expect :: proc(t: ^Tokenizer, kind: Token_Kind, multiline_string := false) -> (tok: Token, err: Error) {
	tok = scan(t, multiline_string=multiline_string)
	if tok.kind == kind { return tok, .None }

	//error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
	return tok, .Unexpected_Token
}

/*
	Parse a single `key="value"` attribute.

	The value is run through `entity.decode_xml` with whitespace normalization. If that succeeds, the decoded
	copy is used and registered in `doc.strings_to_free`; if it fails, the raw token text is kept.

	Returns the attribute and `offset`, the approximate tokenizer offset at which the key started
	(intended for diagnostics such as duplicate-attribute reports).
*/
parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: Error) {
	assert(doc != nil)
	context.allocator = doc.allocator
	t := doc.tokenizer

	key    := expect(t, .Ident) or_return
	// NOTE(review): assumes `t.offset` sits just past the identifier after scanning it - confirm against the tokenizer.
	offset  = t.offset - len(key.text)

	_       = expect(t, .Eq) or_return
	value  := expect(t, .String, multiline_string=true) or_return

	normalized, normalize_err := entity.decode_xml(value.text, {.Normalize_Whitespace}, doc.allocator)
	if normalize_err == .None {
		append(&doc.strings_to_free, normalized)
		value.text = normalized
	}

	attr.key = key.text
	attr.val = value.text

	err = .None
	return
}

// Return `.Duplicate_Attribute` if `attr.key` already occurs in `attribs`.
// `t` and `offset` are only needed by the (currently disabled) diagnostic.
check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attribute, offset: int) -> (err: Error) {
	for a in attribs {
		if attr.key == a.key {
			//error(t, offset, "Duplicate attribute: %v\n", attr.key)
			return .Duplicate_Attribute
		}
	}
	return .None
}

// Parse attributes for as long as the next token is an identifier, appending each to `attribs`.
// Fails on malformed or duplicate attributes. Trailing whitespace is consumed.
parse_attributes :: proc(doc: ^Document, attribs: ^Attributes) -> (err: Error) {
	assert(doc != nil)
	context.allocator = doc.allocator
	t := doc.tokenizer

	for peek(t).kind == .Ident {
		attr, offset := parse_attribute(doc) or_return
		check_duplicate_attributes(t, attribs^, attr, offset) or_return
		append(attribs, attr)
	}
	skip_whitespace(t)
	return .None
}

/*
	Parse the remainder of a `<?xml ... ?>` prologue, i.e. everything after the `xml` identifier.

	The attributes are stored in `doc.prologue`. A recognized `encoding` sets `doc.encoding`;
	unrecognized versions and encodings are tolerated silently. The closing `?>` is required.
*/
parse_prologue :: proc(doc: ^Document) -> (err: Error) {
	assert(doc != nil)
	context.allocator = doc.allocator
	t := doc.tokenizer

	parse_attributes(doc, &doc.prologue) or_return

	for attr in doc.prologue {
		switch attr.key {
		case "version":
			switch attr.val {
			case "1.0", "1.1":
			case:
				//error(t, t.offset, "[parse_prologue] Warning: Unhandled XML version: %v\n", attr.val)
			}

		case "encoding":
			// The lower-cased copy only has to live for the duration of this comparison.
			runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
			switch strings.to_lower(attr.val, context.temp_allocator) {
			case "utf-8", "utf8":
				doc.encoding = .UTF_8

			case "latin-1", "latin1", "iso-8859-1":
				doc.encoding = .LATIN_1

			case:
				// Unrecognized encoding, assume UTF-8.
				//error(t, t.offset, "[parse_prologue] Warning: Unrecognized encoding: %v\n", attr.val)
			}

		case:
			// Ignored.
		}
	}

	_ = expect(t, .Question) or_return
	_ = expect(t, .Gt)       or_return

	return .None
}

// Skip tokens up to and including the `>` that closes the construct we're currently inside,
// keeping track of nested `<` ... `>` pairs. Returns `.Premature_EOF` if the input ends first.
skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
	// We're already inside one `<`, so one `>` is owed.
	close := 1

	loop: for {
		tok := scan(t)
		#partial switch tok.kind {
		case .EOF:
			//error(t, t.offset, "[skip_element] Premature EOF\n")
			return .Premature_EOF

		case .Lt:
			close += 1

		case .Gt:
			close -= 1
			if close == 0 {
				break loop
			}

		case:

		}
	}
	return .None
}

// Parse a DOCTYPE declaration, positioned just after the `DOCTYPE` identifier.
// Stores the document type name in `doc.doctype.ident` and the unparsed remainder in `doc.doctype.rest`.
parse_doctype :: proc(doc: ^Document) -> (err: Error) {
	/*
		<!DOCTYPE greeting SYSTEM "hello.dtd">

		<!DOCTYPE greeting [
			<!ELEMENT greeting (#PCDATA)>
		]>
	*/
	assert(doc != nil)
	context.allocator = doc.allocator
	t := doc.tokenizer

	tok := expect(t, .Ident) or_return
	doc.doctype.ident = tok.text

	skip_whitespace(t)
	offset := t.offset
	skip_element(t) or_return

	// -1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
	doc.doctype.rest = string(t.src[offset : t.offset - 1])
	return .None
}

/*
	Scan a run of body text and append it to `element`'s values.

	Without `.Unbox_CDATA` or `.Decode_SGML_Entities` the text is appended as scanned. Otherwise it is passed
	through `entity.decode_xml` with options derived from `opts`; the decoded copy is appended and registered
	in `doc.strings_to_free`. If decoding fails, the raw text is appended instead.
*/
parse_body :: proc(doc: ^Document, element: Element_ID, opts: Options) -> (err: Error) {
	assert(doc != nil)
	context.allocator = doc.allocator
	t := doc.tokenizer

	body_text := scan_string(t, t.offset) or_return
	needs_processing := .Unbox_CDATA          in opts.flags
	needs_processing |= .Decode_SGML_Entities in opts.flags

	if !needs_processing {
		append(&doc.elements[element].value, body_text)
		return
	}

	decode_opts := entity.XML_Decode_Options{}
	if .Keep_Tag_Body_Comments not_in opts.flags {
		decode_opts += { .Comment_Strip }
	}

	if .Decode_SGML_Entities not_in opts.flags {
		decode_opts += { .No_Entity_Decode }
	}

	if .Unbox_CDATA in opts.flags {
		decode_opts += { .Unbox_CDATA }
		if .Decode_SGML_Entities in opts.flags {
			decode_opts += { .Decode_CDATA }
		}
	}

	decoded, decode_err := entity.decode_xml(body_text, decode_opts)
	if decode_err == .None {
		append(&doc.elements[element].value, decoded)
		append(&doc.strings_to_free, decoded)
	} else {
		append(&doc.elements[element].value, body_text)
	}

	return
}

// Index of an element in `Document.elements`.
Element_ID :: u32

// Reserve the next element slot and return its ID, growing `doc.elements` when it is full.
// Growth doubles the array up to 65536 slots and adds 65536 at a time after that.
new_element :: proc(doc: ^Document) -> (id: Element_ID) {
	element_space := len(doc.elements)

	// Need to resize
	if int(doc.element_count) + 1 > element_space {
		if element_space == 0 {
			// Doubling zero would get us nowhere.
			element_space = 1024
		} else if element_space < 65536 {
			element_space *= 2
		} else {
			element_space += 65536
		}
		resize(&doc.elements, element_space)
	}

	cur := doc.element_count
	doc.element_count += 1
	return cur
}
\ No newline at end of file |