summaryrefslogtreecommitdiff
path: root/src/reader.rs
blob: d2cae927459b1222fa5f667bbfe1bb503d393ff8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/// An object that provides characters to the tokenizer.
///
/// See [`crate::Tokenizer::new`] for more information.
pub trait Reader {
    /// Return the next character from the input stream, or `None` once the input is exhausted.
    ///
    /// Newlines have to be normalized as described in [Preprocessing the input
    /// stream](https://html.spec.whatwg.org/#preprocessing-the-input-stream), however error
    /// emission is done within the tokenizer.
    fn read_char(&mut self) -> Option<char>;

    /// Attempt to read an entire string at once, either case-insensitively or not.
    ///
    /// `case_sensitive=false` means that characters of the input stream should be compared while
    /// ignoring ASCII-casing.
    ///
    /// It can be assumed that this function is never called with a string that contains `\r` or
    /// `\n`.
    ///
    /// If the next characters are equal to `s`, this function consumes the respective characters
    /// from the input stream and returns `true`. If not, it does nothing and returns `false`.
    fn try_read_string(&mut self, s: &str, case_sensitive: bool) -> bool;
}

/// An object that can be converted into a [`crate::Reader`].
///
/// For example, any utf8-string can be converted into a `StringReader`, such that
/// `Tokenizer::new("mystring")` and `Tokenizer::new(&String::from("foo"))` work.
pub trait Readable<'a> {
    /// The reader type to which this type should be converted.
    type Reader: Reader + 'a;

    /// Convert self to some sort of reader.
    ///
    /// Takes `self` by value and returns the associated [`Readable::Reader`] type.
    fn to_reader(self) -> Self::Reader;
}

/// Every [`Reader`] is trivially [`Readable`]: converting it yields the reader itself, so an
/// already-constructed reader can be passed wherever a `Readable` is expected.
impl<'a, R> Readable<'a> for R
where
    R: Reader + 'a,
{
    type Reader = R;

    fn to_reader(self) -> R {
        self
    }
}

/// A cursor over a borrowed string that hands out its characters one at a time. Used by the
/// tokenizer to read HTML from strings.
///
/// NOTE(review): this was previously described as seeking "forwards and backwards"; the methods
/// visible in this file only ever move the position forwards — confirm nothing else rewinds it.
pub struct StringReader<'a> {
    /// The complete input string being read.
    input: &'a str,
    /// Byte offset into `input` of the next character to be read.
    pos: usize,
}

impl<'a> StringReader<'a> {
    fn new(input: &'a str) -> Self {
        StringReader { input, pos: 0 }
    }

    fn peek_char(&self) -> Option<char> {
        self.input.get(self.pos..)?.chars().next()
    }
}

impl<'a> Reader for StringReader<'a> {
    /// Read the next character and advance past it, returning `None` at the end of the input.
    ///
    /// Newlines are normalized while reading: a lone `\r` and a `\r\n` pair are each returned as
    /// a single `\n`.
    fn read_char(&mut self) -> Option<char> {
        // At the end of the input the position is left untouched, so reading past the end any
        // number of times leaves the reader in the same state.
        let c = self.peek_char()?;
        self.pos += c.len_utf8();

        if c != '\r' {
            return Some(c);
        }

        // Swallow the `\n` of a `\r\n` pair so that the pair yields only one newline.
        if self.peek_char() == Some('\n') {
            self.pos += '\n'.len_utf8();
        }

        Some('\n')
    }

    /// Consume `s1` from the input if the upcoming characters match it.
    ///
    /// With `case_sensitive == false` the comparison ignores ASCII case. Returns `true` and
    /// advances past the matched text on success; otherwise returns `false` and leaves the
    /// position unchanged.
    ///
    /// No newline normalization happens here: the comparison is made against the raw input.
    fn try_read_string(&mut self, s1: &str, case_sensitive: bool) -> bool {
        // `str::get` yields `None` when the requested range is out of bounds or would split a
        // multi-byte character; neither case can be a match.
        let candidate = match self
            .input
            .get(self.pos..)
            .and_then(|rest| rest.get(..s1.len()))
        {
            Some(candidate) => candidate,
            None => return false,
        };

        let matched = if case_sensitive {
            candidate == s1
        } else {
            candidate.eq_ignore_ascii_case(s1)
        };

        if matched {
            self.pos += s1.len();
        }

        matched
    }
}

/// String slices are read in place: the resulting [`StringReader`] borrows the slice for `'a`.
impl<'a> Readable<'a> for &'a str {
    type Reader = StringReader<'a>;

    fn to_reader(self) -> StringReader<'a> {
        let input: &'a str = self;
        StringReader::new(input)
    }
}

/// Borrowed `String`s are read through a [`StringReader`] over their contents, without copying.
impl<'a> Readable<'a> for &'a String {
    type Reader = StringReader<'a>;

    fn to_reader(self) -> StringReader<'a> {
        let contents: &'a str = self;
        StringReader::new(contents)
    }
}