Diffstat (limited to 'src/wav/xml/tokenizer.odin')
-rw-r--r--   src/wav/xml/tokenizer.odin   415
1 file changed, 415 insertions, 0 deletions
diff --git a/src/wav/xml/tokenizer.odin b/src/wav/xml/tokenizer.odin
new file mode 100644
index 0000000..f4c9c8a
--- /dev/null
+++ b/src/wav/xml/tokenizer.odin
@@ -0,0 +1,415 @@
+package encoding_xml
+
+/*
+	An XML 1.0 / 1.1 parser
+
+	Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
+	Made available under Odin's license.
+
+	A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
+
+	List of contributors:
+		Jeroen van Rijn: Initial implementation.
+*/
+
+import "core:fmt"
+import "core:unicode"
+import "core:unicode/utf8"
+import "core:strings"
+
+Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)
+
+Token :: struct {
+	kind: Token_Kind,
+	text: string,
+	pos:  Pos,
+}
+
+Pos :: struct {
+	file:   string,
+	offset: int, // starting at 0
+	line:   int, // starting at 1
+	column: int, // starting at 1
+}
+
+Token_Kind :: enum {
+	Invalid,
+
+	Ident,
+	Literal,
+	Rune,
+	String,
+
+	Double_Quote,  // "
+	Single_Quote,  // '
+	Colon,         // :
+
+	Eq,       // =
+	Lt,       // <
+	Gt,       // >
+	Exclaim,  // !
+	Question, // ?
+	Hash,     // #
+	Slash,    // /
+	Dash,     // -
+
+	Open_Bracket,  // [
+	Close_Bracket, // ]
+
+	EOF,
+}
+
+CDATA_START :: "<![CDATA["
+CDATA_END   :: "]]>"
+
+COMMENT_START :: "<!--"
+COMMENT_END   :: "-->"
+
+Tokenizer :: struct {
+	// Immutable data
+	path: string,
+	src:  string,
+	err:  Error_Handler,
+
+	// Tokenizing state
+	ch:          rune,
+	offset:      int,
+	read_offset: int,
+	line_offset: int,
+	line_count:  int,
+
+	// Mutable data
+	error_count: int,
+}
+
+init :: proc(t: ^Tokenizer, src: string, path: string, err: Error_Handler = default_error_handler) {
+	t.src = src
+	t.err = err
+	t.ch = ' '
+	t.offset = 0
+	t.read_offset = 0
+	t.line_offset = 0
+	t.line_count = len(src) > 0 ? 1 : 0
+	t.error_count = 0
+	t.path = path
+
+	advance_rune(t)
+	if t.ch == utf8.RUNE_BOM {
+		advance_rune(t)
+	}
+}
+
+@(private)
+offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
+	line := t.line_count
+	column := offset - t.line_offset + 1
+
+	return Pos {
+		file   = t.path,
+		offset = offset,
+		line   = line,
+		column = column,
+	}
+}
+
+default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
+	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
+	fmt.eprintf(msg, ..args)
+	fmt.eprintf("\n")
+}
+
+error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
+	pos := offset_to_pos(t, offset)
+	if t.err != nil {
+		t.err(pos=pos, fmt=msg, args=args)
+	}
+	t.error_count += 1
+}
+
+@(optimization_mode="favor_size")
+advance_rune :: proc(t: ^Tokenizer) {
+	#no_bounds_check {
+		/*
+			Bounds are checked manually below, so the implicit checks can safely be elided.
+		*/
+		if t.read_offset < len(t.src) {
+			t.offset = t.read_offset
+			if t.ch == '\n' {
+				t.line_offset = t.offset
+				t.line_count += 1
+			}
+			r, w := rune(t.src[t.read_offset]), 1
+			switch {
+			case r == 0:
+				//error(t, t.offset, "illegal character NUL")
+			case r >= utf8.RUNE_SELF:
+				r, w = #force_inline utf8.decode_rune_in_string(t.src[t.read_offset:])
+				if r == utf8.RUNE_ERROR && w == 1 {
+					//error(t, t.offset, "illegal UTF-8 encoding")
+				} else if r == utf8.RUNE_BOM && t.offset > 0 {
+					//error(t, t.offset, "illegal byte order mark")
+				}
+			}
+			t.read_offset += w
+			t.ch = r
+		} else {
+			t.offset = len(t.src)
+			if t.ch == '\n' {
+				t.line_offset = t.offset
+				t.line_count += 1
+			}
+			t.ch = -1
+		}
+	}
+}
+
+peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
+	if t.read_offset+offset < len(t.src) {
+		#no_bounds_check return t.src[t.read_offset+offset]
+	}
+	return 0
+}
+
+@(optimization_mode="favor_size")
+skip_whitespace :: proc(t: ^Tokenizer) {
+	for {
+		switch t.ch {
+		case ' ', '\t', '\r', '\n':
+			advance_rune(t)
+		case:
+			return
+		}
+	}
+}
+
+@(optimization_mode="favor_size")
+is_letter :: proc(r: rune) -> bool {
+	if r < utf8.RUNE_SELF {
+		switch r {
+		case '_':
+			return true
+		case 'A'..='Z', 'a'..='z':
+			return true
+		}
+	}
+	return unicode.is_letter(r)
+}
+
+is_valid_identifier_rune :: proc(r: rune) -> bool {
+	if r < utf8.RUNE_SELF {
+		switch r {
+		case '_', '-', ':':        return true
+		case 'A'..='Z', 'a'..='z': return true
+		case '0'..='9':            return true
+		case -1:                   return false
+		}
+	}
+
+	if unicode.is_letter(r) || unicode.is_digit(r) {
+		return true
+	}
+	return false
+}
+
+scan_identifier :: proc(t: ^Tokenizer) -> string {
+	offset := t.offset
+	namespaced := false
+
+	for is_valid_identifier_rune(t.ch) {
+		advance_rune(t)
+		if t.ch == ':' {
+			// A namespaced identifier has at most two parts: `namespace:ident`.
+			if namespaced {
+				break
+			}
+			namespaced = true
+		}
+	}
+	return string(t.src[offset : t.offset])
+}
+
+/*
+	A comment ends when we see -->, preceded by a character that's not a dash.
+
+	"For compatibility, the string "--" (double-hyphen) must not occur within comments."
+	See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment
+
+	Because the comment opener is four bytes long, there is always enough lookback
+	for the `t.offset - 1` check, and the peek at the next byte verifies that the
+	character following `--` is a `>`.
+*/
+scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
+	offset := t.offset
+
+	for {
+		advance_rune(t)
+		ch := t.ch
+
+		if ch < 0 {
+			//error(t, offset, "[parse] Comment was not terminated\n")
+			return "", .Unclosed_Comment
+		}
+
+		if string(t.src[t.offset - 1:][:2]) == "--" {
+			if peek_byte(t) == '>' {
+				break
+			} else {
+				//error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
+				return "", .Invalid_Sequence_In_Comment
+			}
+		}
+	}
+
+	expect(t, .Dash)
+	expect(t, .Gt)
+
+	return string(t.src[offset : t.offset - 1]), .None
+}
+
+// Skip a CDATA section, if one starts at the current offset.
+skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
+	if s := string(t.src[t.offset:]); !strings.has_prefix(s, CDATA_START) {
+		return .None
+	}
+
+	t.read_offset += len(CDATA_START)
+	offset := t.offset
+
+	cdata_scan: for {
+		advance_rune(t)
+		if t.ch < 0 {
+			//error(t, offset, "[scan_string] CDATA was not terminated\n")
+			return .Premature_EOF
+		}
+
+		// Scan until the CDATA terminator `]]>`.
+		if s := string(t.src[t.read_offset:]); strings.has_prefix(s, CDATA_END) {
+			t.read_offset += len(CDATA_END)
+			break cdata_scan
+		}
+	}
+	return .None
+}
+
+@(optimization_mode="favor_size")
+scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) {
+	err = .None
+
+	loop: for {
+		ch := t.ch
+
+		switch ch {
+		case -1:
+			//error(t, t.offset, "[scan_string] Premature end of file.\n")
+			return "", .Premature_EOF
+
+		case '<':
+			if peek_byte(t) == '!' {
+				if peek_byte(t, 1) == '[' {
+					// Might be the start of a CDATA tag.
+					skip_cdata(t) or_return
+				} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
+					// Comment start. Consume the comment.
+					t.read_offset += 3
+					_ = scan_comment(t) or_return
+				}
+			}
+
+		case '\n':
+			if !multiline {
+				//error(t, offset, string(t.src[offset : t.offset]))
+				//error(t, offset, "[scan_string] Not terminated\n")
+				err = .Invalid_Tag_Value
+				break loop
+			}
+		}
+
+		if t.ch == close {
+			// If it's not a CDATA section or comment, it's the end of this body.
+			break loop
+		}
+		advance_rune(t)
+	}
+
+	// Strip trailing whitespace.
+	lit := string(t.src[offset : t.offset])
+
+	end := len(lit)
+	eat: for ; end > 0; end -= 1 {
+		ch := lit[end - 1]
+		switch ch {
+		case ' ', '\t', '\r', '\n':
+		case:
+			break eat
+		}
+	}
+	lit = lit[:end]
+
+	if consume_close {
+		advance_rune(t)
+	}
+	return lit, err
+}
+
+peek :: proc(t: ^Tokenizer) -> (token: Token) {
+	old := t^
+	token = scan(t)
+	t^ = old
+	return token
+}
+
+scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token {
+	skip_whitespace(t)
+
+	offset := t.offset
+
+	kind: Token_Kind
+	err:  Error
+	lit:  string
+	pos := offset_to_pos(t, offset)
+
+	switch ch := t.ch; true {
+	case is_letter(ch):
+		lit = scan_identifier(t)
+		kind = .Ident
+
+	case:
+		advance_rune(t)
+		switch ch {
+		case -1:
+			kind = .EOF
+
+		case '<': kind = .Lt
+		case '>': kind = .Gt
+		case '!': kind = .Exclaim
+		case '?': kind = .Question
+		case '=': kind = .Eq
+		case '#': kind = .Hash
+		case '/': kind = .Slash
+		case '-': kind = .Dash
+		case ':': kind = .Colon
+		case '[': kind = .Open_Bracket
+		case ']': kind = .Close_Bracket
+
+		case '"', '\'':
+			kind = .Invalid
+
+			lit, err = scan_string(t, t.offset, ch, true, multiline_string)
+			if err == .None {
+				kind = .String
+			}
+
+		case '\n':
+			lit = "\n"
+
+		case:
+			kind = .Invalid
+		}
+	}
+
+	if kind != .String && lit == "" {
+		lit = string(t.src[offset : t.offset])
+	}
+	return Token{kind, lit, pos}
+}
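
For reference, below is a minimal sketch of how this tokenizer might be driven from calling code. The import path is hypothetical (adjust it to wherever src/wav/xml is mounted as a collection in this repository), and it assumes the package's `Error` enum and the `expect` helper referenced above live in sibling files of the same `encoding_xml` package:

package main

import "core:fmt"
import xml "src:wav/xml" // hypothetical collection path; adjust to this repo's layout

main :: proc() {
	doc := `<greeting lang="en">Hello, world!</greeting>`

	t: xml.Tokenizer
	xml.init(&t, doc, "example.xml")

	// Pull raw tokens until EOF. A full parser switches on `tok.kind`,
	// using `scan_string` to read attribute values and element bodies.
	for {
		tok := xml.scan(&t)
		if tok.kind == .EOF {
			break
		}
		fmt.printf("%d:%d %v %q\n", tok.pos.line, tok.pos.column, tok.kind, tok.text)
	}
}

On the input above this prints one line per structural token (.Lt, .Ident, .Eq, .String, and so on). Note that comment and CDATA handling only kick in inside scan_string, which is how the parser layer reads tag bodies; the raw scan loop sees `<!--` simply as .Lt, .Exclaim, .Dash, .Dash.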