quick_xml/reader/
mod.rs

1//! Contains high-level interface for a pull-based XML parser.
2
3#[cfg(feature = "encoding")]
4use encoding_rs::Encoding;
5use std::ops::Range;
6
7use crate::encoding::Decoder;
8use crate::errors::{Error, Result};
9use crate::events::Event;
10use crate::reader::parser::Parser;
11
12use memchr;
13
// Generates the builder-style setters that configure the parser of a reader.
//
// The optional `$holder` identifier names a field of `self` through which the
// parser is reached (`self.$holder.parser`); without it the setters write to
// `self.parser` directly. Every setter returns `&mut Self`, so calls can be chained.
macro_rules! configure_methods {
    ($($holder:ident)?) => {
        /// Changes whether empty elements should be split into a `Start` and an `End` event.
        ///
        /// When set to `true`, all [`Empty`] events produced by a self-closing tag like `<tag/>` are
        /// expanded into a [`Start`] event followed by an [`End`] event. When set to `false` (the
        /// default), those tags are represented by an [`Empty`] event instead.
        ///
        /// Note that setting this to `true` leads to additional allocations that are
        /// needed to store the tag name for an [`End`] event. However, if [`check_end_names`]
        /// is also set, only one additional allocation is performed, which supports
        /// both of these options.
        ///
        /// (`false` by default)
        ///
        /// [`Empty`]: Event::Empty
        /// [`Start`]: Event::Start
        /// [`End`]: Event::End
        /// [`check_end_names`]: Self::check_end_names
        pub fn expand_empty_elements(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .parser.expand_empty_elements = val;
            self
        }

        /// Changes whether whitespace before and after character data should be removed.
        ///
        /// When set to `true`, all [`Text`] events are trimmed.
        /// If the event is empty after that, it will not be pushed.
        ///
        /// Changing this option also changes the [`trim_text_end`] option: both the
        /// leading and the trailing trimming flags are set to `val`.
        ///
        /// (`false` by default).
        ///
        /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
        ///
        /// WARNING: With this option every text event will be trimmed, which is
        /// incorrect behavior when text events are delimited by comments, processing
        /// instructions or CDATA sections. To trim data correctly, manually apply
        /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
        /// only to the necessary events.
        /// </div>
        ///
        /// [`Text`]: Event::Text
        /// [`trim_text_end`]: Self::trim_text_end
        /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
        /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
        pub fn trim_text(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .parser.trim_text_start = val;
            self $(.$holder)? .parser.trim_text_end = val;
            self
        }

        /// Changes whether whitespace after character data should be removed.
        ///
        /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
        /// If the event is empty after that, it will not be pushed.
        ///
        /// (`false` by default).
        ///
        /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
        ///
        /// WARNING: With this option every text event will be trimmed, which is
        /// incorrect behavior when text events are delimited by comments, processing
        /// instructions or CDATA sections. To trim data correctly, manually apply
        /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
        /// only to the necessary events.
        /// </div>
        ///
        /// [`Text`]: Event::Text
        /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
        /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
        pub fn trim_text_end(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .parser.trim_text_end = val;
            self
        }

        /// Changes whether trailing whitespace after the markup name is trimmed in closing tags
        /// such as `</a >`.
        ///
        /// If `true`, the emitted [`End`] event is stripped of trailing whitespace after the
        /// markup name.
        ///
        /// Note that if this is set to `false` and `check_end_names` is `true`, the comparison of
        /// markup names is going to fail erroneously if a closing tag contains trailing whitespace.
        ///
        /// (`true` by default)
        ///
        /// [`End`]: Event::End
        pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .parser.trim_markup_names_in_closing_tags = val;
            self
        }

        /// Changes whether mismatched closing tag names should be detected.
        ///
        /// Note that start and end tags [should match literally][spec]; they cannot
        /// have different prefixes even if both prefixes resolve to the same namespace.
        /// The XML
        ///
        /// ```xml
        /// <outer xmlns="namespace" xmlns:p="namespace">
        /// </p:outer>
        /// ```
        ///
        /// is not valid, even though semantically the start tag is the same as the
        /// end tag. The reason is that namespaces are an extension of the original
        /// XML specification (without namespaces) and it should be backward-compatible.
        ///
        /// When set to `false`, the reader won't check whether a closing tag matches the
        /// corresponding opening tag. For example, `<mytag></different_tag>` will be permitted.
        ///
        /// If the XML is known to be sane (already processed, etc.) this saves extra time.
        ///
        /// Note that the emitted [`End`] event will not be modified if this is disabled, i.e. it
        /// will contain the data of the mismatched end tag.
        ///
        /// Note that setting this to `true` leads to additional allocations that are
        /// needed to store the tag name for an [`End`] event. However, if [`expand_empty_elements`]
        /// is also set, only one additional allocation is performed, which supports
        /// both of these options.
        ///
        /// (`true` by default)
        ///
        /// [spec]: https://www.w3.org/TR/xml11/#dt-etag
        /// [`End`]: Event::End
        /// [`expand_empty_elements`]: Self::expand_empty_elements
        pub fn check_end_names(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .parser.check_end_names = val;
            self
        }

        /// Changes whether comments should be validated.
        ///
        /// When set to `true`, every [`Comment`] event will be checked for not containing `--`, which
        /// is not allowed in XML comments. Most of the time we don't want comments at all so we don't
        /// really care about comment correctness, thus the default value is `false` to improve
        /// performance.
        ///
        /// (`false` by default)
        ///
        /// [`Comment`]: Event::Comment
        pub fn check_comments(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .parser.check_comments = val;
            self
        }
    };
}
160
// Shared body of the event-reading methods: drives the parser state machine
// until one event (or an error) is produced and evaluates to that result.
//
// Parameters:
// - `$self`: the reader whose `parser` field holds the configuration and state
// - `$buf`: buffer passed to the reading methods. When `$read_until_open`
//   produces no event it hands the buffer back (inner `Err`), and the buffer
//   is reused on the next iteration of the loop
// - `$reader`: expression for the underlying source, used here for BOM handling
//   and encoding detection
// - `$read_until_open`, `$read_until_close`: names of the methods of `$self`
//   that perform the actual reading
// - `$await`: optional identifier appended as `.$await` to every call
//   (presumably `await`, for the async readers; confirm at the call sites)
//
// Once an error or `Event::Eof` has been produced, the parser is moved to
// `ParseState::Exit`, in which every further invocation yields `Event::Eof`.
macro_rules! read_event_impl {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_until_open:ident,
        $read_until_close:ident
        $(, $await:ident)?
    ) => {{
        let event = loop {
            match $self.parser.state {
                ParseState::Init => { // Go to OpenedTag state
                    // If the encoding was set explicitly, we do not need to detect it. For
                    // example, explicit UTF-8 is set automatically if the Reader was created
                    // using `from_str`. But we still need to remove the BOM for consistency
                    // with the code path used when the `encoding` feature is disabled
                    #[cfg(feature = "encoding")]
                    if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
                        if $self.parser.encoding.can_be_refined() {
                            $self.parser.encoding = crate::reader::EncodingRef::BomDetected(encoding);
                        }
                    }

                    // Removes UTF-8 BOM if it is present
                    #[cfg(not(feature = "encoding"))]
                    $reader.remove_utf8_bom() $(.$await)? ?;

                    // Go to OpenedTag state
                    match $self.$read_until_open($buf) $(.$await)? {
                        // An event (text or EOF) was produced: return it
                        Ok(Ok(ev)) => break Ok(ev),
                        // No event: take the buffer back and continue with the next state
                        Ok(Err(b)) => $buf = b,
                        Err(err)   => break Err(err),
                    }
                },
                ParseState::ClosedTag => { // Go to OpenedTag state
                    match $self.$read_until_open($buf) $(.$await)? {
                        // An event (text or EOF) was produced: return it
                        Ok(Ok(ev)) => break Ok(ev),
                        // No event: take the buffer back and continue with the next state
                        Ok(Err(b)) => $buf = b,
                        Err(err)   => break Err(err),
                    }
                },
                // Go to ClosedTag state in next two arms
                ParseState::OpenedTag => break $self.$read_until_close($buf) $(.$await)?,
                ParseState::Empty => break $self.parser.close_expanded_empty(),
                ParseState::Exit => break Ok(Event::Eof),
            };
        };
        // Errors and EOF are terminal: stay in the `Exit` state from now on
        match event {
            Err(_) | Ok(Event::Eof) => $self.parser.state = ParseState::Exit,
            _ => {}
        }
        event
    }};
}
214
/// Reads the text that precedes the next markup, or skips the `<` symbol if the
/// markup starts immediately.
///
/// The parser is moved to the `OpenedTag` state first. Then, if
/// [`Parser::trim_text_start`] is `true`, leading whitespace is skipped. After that:
///
/// - if the current byte is `<`, it is consumed and the *enclosing function*
///   returns `Ok(Err($buf))`: no event is produced and the buffer is handed
///   back so that the caller's parsing loop can continue with it;
/// - otherwise bytes are read up to the next `<` and passed to `emit_text`,
///   whose event is returned wrapped in an inner `Ok`. If the source reports
///   that nothing is left to read, `Event::Eof` is returned instead.
///
/// This code is executed in two cases:
/// - after start of parsing just after skipping BOM if it is present
/// - after parsing `</tag>` or `<tag>`
///
/// The `$read_event` argument is accepted but not used by the expansion.
/// The optional `$await` identifier is appended as `.$await` to every call on `$reader`.
macro_rules! read_until_open {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_event:ident
        $(, $await:ident)?
    ) => {{
        $self.parser.state = ParseState::OpenedTag;

        if $self.parser.trim_text_start {
            $reader.skip_whitespace(&mut $self.parser.offset) $(.$await)? ?;
        }

        // If we are already at the `<` symbol, do not try to return an empty Text event
        if $reader.skip_one(b'<', &mut $self.parser.offset) $(.$await)? ? {
            // Pass $buf to the next iteration of the parsing loop
            return Ok(Err($buf));
        }

        match $reader
            .read_bytes_until(b'<', $buf, &mut $self.parser.offset)
            $(.$await)?
        {
            // Return Text event with `bytes` content
            Ok(Some(bytes)) => $self.parser.emit_text(bytes).map(Ok),
            // Input is exhausted
            Ok(None) => Ok(Ok(Event::Eof)),
            Err(e) => Err(e),
        }
    }};
}
254
/// Reads bytes up to the `>` symbol and skips it. This macro is expected to be
/// used after the `<` symbol was seen and skipped. It inspects the next (current)
/// symbol without consuming it and returns an appropriate [`Event`]:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!`    |[`Comment`], [`CData`] or [`DocType`]
/// |`/`    |[`End`]
/// |`?`    |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// Moves the parser to the `ClosedTag` state before reading anything.
///
/// If the input ends before a symbol can be peeked, or the source returns `None`
/// while reading the markup, [`Event::Eof`] is returned. Errors reported by the
/// source or by the `emit_*` methods of the parser are passed through unchanged.
///
/// The optional `$await` identifier is appended as `.$await` to every call on `$reader`.
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
    (
        $self:ident, $buf:ident,
        $reader:expr
        $(, $await:ident)?
    ) => {{
        $self.parser.state = ParseState::ClosedTag;

        match $reader.peek_one() $(.$await)? {
            // `<!` - comment, CDATA or DOCTYPE declaration
            Ok(Some(b'!')) => match $reader
                .read_bang_element($buf, &mut $self.parser.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some((bang_type, bytes))) => $self.parser.emit_bang(bang_type, bytes),
                Err(e) => Err(e),
            },
            // `</` - closing tag
            Ok(Some(b'/')) => match $reader
                .read_bytes_until(b'>', $buf, &mut $self.parser.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some(bytes)) => $self.parser.emit_end(bytes),
                Err(e) => Err(e),
            },
            // `<?` - processing instruction
            Ok(Some(b'?')) => match $reader
                .read_bytes_until(b'>', $buf, &mut $self.parser.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some(bytes)) => $self.parser.emit_question_mark(bytes),
                Err(e) => Err(e),
            },
            // `<...` - opening or self-closed tag
            Ok(Some(_)) => match $reader
                .read_element($buf, &mut $self.parser.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some(bytes)) => $self.parser.emit_start(bytes),
                Err(e) => Err(e),
            },
            // Nothing follows the `<` symbol: the input is exhausted
            Ok(None) => Ok(Event::Eof),
            Err(e) => Err(e),
        }
    }};
}
325
/// Generalization of the `read_to_end` method for buffered and borrowed readers.
///
/// Reads events with `$self.$read_event($buf)` until the `End` event that closes
/// the element named `$end` is found. The macro evaluates to the range of
/// positions (as reported by `buffer_position()`) that starts where reading began
/// and ends at the position recorded just before that `End` event was read.
///
/// `Start` events with the same name increase a depth counter and matching
/// `End` events decrease it; only an `End` event seen at depth zero finishes the
/// loop. All other events, including `Empty`, are ignored.
///
/// If reading fails, the *enclosing function* returns that error. If `Eof` is
/// reached first, the enclosing function returns `Error::UnexpectedEof` with a
/// message built from the decoded `$end` name.
///
/// # Parameters
/// - `$self`: the reader to read events from
/// - `$end`: name of the element whose end is searched for
/// - `$buf`: buffer expression passed to `$read_event` on every iteration
/// - `$read_event`: name of the method of `$self` that reads one event
/// - `$clear`: block executed at the start of every iteration
/// - `$await`: optional identifier appended as `.$await` to the `$read_event` call
macro_rules! read_to_end {
    (
        $self:expr, $end:expr, $buf:expr,
        $read_event:ident,
        // Code block that performs clearing of internal buffer after read of each event
        $clear:block
        $(, $await:ident)?
    ) => {{
        let start = $self.buffer_position();
        // Number of currently open nested elements that have the same name as `$end`
        let mut depth = 0;
        loop {
            $clear
            // Position before the event is read; it becomes the end of the
            // resulting span if that event is the searched `End`
            let end = $self.buffer_position();
            match $self.$read_event($buf) $(.$await)? {
                Err(e) => return Err(e),

                Ok(Event::Start(e)) if e.name() == $end => depth += 1,
                Ok(Event::End(e)) if e.name() == $end => {
                    if depth == 0 {
                        break start..end;
                    }
                    depth -= 1;
                }
                Ok(Event::Eof) => {
                    let name = $self.decoder().decode($end.as_ref());
                    return Err(Error::UnexpectedEof(format!("</{:?}>", name)));
                }
                _ => (),
            }
        }
    }};
}
359
360#[cfg(feature = "async-tokio")]
361mod async_tokio;
362mod buffered_reader;
363mod ns_reader;
364mod parser;
365mod slice_reader;
366
367pub use ns_reader::NsReader;
368
/// Range of byte offsets in the input that corresponds to some piece of XML.
///
/// Being a [`Range`], it includes the start offset and excludes the end offset.
pub type Span = Range<usize>;
371
372////////////////////////////////////////////////////////////////////////////////////////////////////
373
/// Possible reader states. The state transition diagram (`true` and `false` shows
/// value of [`Reader::expand_empty_elements()`] option):
///
/// ```mermaid
/// flowchart LR
///   subgraph _
///     direction LR
///
///     Init      -- "(no event)"\n                                       --> OpenedTag
///     OpenedTag -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> ClosedTag
///     ClosedTag -- "#lt;false#gt;\n(no event)"\nText                    --> OpenedTag
///   end
///   ClosedTag -- "#lt;true#gt;"\nStart --> Empty
///   Empty     -- End                   --> ClosedTag
///   _ -. Eof .-> Exit
/// ```
///
/// The enum is a plain tag without payload, so it is cheap to copy and compare.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ParseState {
    /// Initial state in which the reader stays after creation. Reading from that
    /// state could produce a `Text`, `Decl`, `Comment` or `Start` event. The next
    /// state is always `OpenedTag`. The reader will never return to this state.
    /// The event emitted during the transition to `OpenedTag` is a `Text` event
    /// if the first symbol is not `<`, otherwise no event is emitted.
    Init,
    /// State after seeing the `<` symbol. Depending on the next symbol all other
    /// events could be generated.
    ///
    /// After generating one event the reader moves to the `ClosedTag` state.
    OpenedTag,
    /// State in which the reader searches for the `<` symbol of a markup. All bytes
    /// before that symbol will be returned in the [`Event::Text`] event. After that
    /// the reader moves to the `OpenedTag` state.
    ClosedTag,
    /// This state is used only if option [`expand_empty_elements`] is set to `true`.
    /// Reader enters this state when it is in a `ClosedTag` state and emits an
    /// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
    /// after which the reader returns to the `ClosedTag` state.
    ///
    /// [`expand_empty_elements`]: Parser::expand_empty_elements
    Empty,
    /// Reader enters this state when an `Eof` event is generated or an error occurs.
    /// This is the last state; the reader stays in it forever.
    Exit,
}
418
/// A reference to an encoding together with information about how it was retrieved.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy, Debug)]
enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be refined
    /// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It can be changed
    /// neither by BOM nor by parsing the XML declaration (`<?xml encoding=... ?>`)
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
    BomDetected(&'static Encoding),
    /// Encoding was detected using XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding regardless of how it was obtained.
    #[inline]
    fn encoding(&self) -> &'static Encoding {
        match *self {
            Self::Implicit(enc)
            | Self::Explicit(enc)
            | Self::BomDetected(enc)
            | Self::XmlDetected(enc) => enc,
        }
    }

    /// Returns `true` if the encoding was only assumed or detected from a BOM,
    /// and `false` if it was set explicitly or taken from the XML declaration.
    #[inline]
    fn can_be_refined(&self) -> bool {
        match *self {
            Self::Explicit(_) | Self::XmlDetected(_) => false,
            Self::Implicit(_) | Self::BomDetected(_) => true,
        }
    }
}
465
466////////////////////////////////////////////////////////////////////////////////////////////////////
467
/// A low level encoding-agnostic XML event reader.
///
/// Consumes bytes and streams XML [`Event`]s.
///
/// This reader does not manage namespace declarations and is not able to resolve
/// prefixes. If you want these features, use the [`NsReader`].
///
/// # Examples
///
/// ```
/// use quick_xml::events::Event;
/// use quick_xml::reader::Reader;
///
/// let xml = r#"<tag1 att1 = "test">
///                 <tag2><!--Test comment-->Test</tag2>
///                 <tag2>Test 2</tag2>
///              </tag1>"#;
/// let mut reader = Reader::from_str(xml);
/// reader.trim_text(true);
///
/// let mut count = 0;
/// let mut txt = Vec::new();
/// let mut buf = Vec::new();
///
/// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s)
/// loop {
///     // NOTE: this is the generic case when we don't know about the input BufRead.
///     // when the input is a &str or a &[u8], we don't actually need to use another
///     // buffer, we could directly call `reader.read_event()`
///     match reader.read_event_into(&mut buf) {
///         Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
///         // exits the loop when reaching end of file
///         Ok(Event::Eof) => break,
///
///         Ok(Event::Start(e)) => {
///             match e.name().as_ref() {
///                 b"tag1" => println!("attributes values: {:?}",
///                                     e.attributes().map(|a| a.unwrap().value)
///                                     .collect::<Vec<_>>()),
///                 b"tag2" => count += 1,
///                 _ => (),
///             }
///         }
///         Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
///
///         // There are several other `Event`s we do not consider here
///         _ => (),
///     }
///     // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low
///     buf.clear();
/// }
/// ```
///
/// [`NsReader`]: crate::reader::NsReader
#[derive(Clone)]
pub struct Reader<R> {
    /// Source of the data to parse
    reader: R,
    /// Configuration and current parse state
    parser: Parser,
}
529
530/// Builder methods
531impl<R> Reader<R> {
532    /// Creates a `Reader` that reads from a given reader.
533    pub fn from_reader(reader: R) -> Self {
534        Self {
535            reader,
536            parser: Parser::default(),
537        }
538    }
539
540    configure_methods!();
541}
542
543/// Getters
544impl<R> Reader<R> {
545    /// Consumes `Reader` returning the underlying reader
546    ///
547    /// Can be used to compute line and column of a parsing error position
548    ///
549    /// # Examples
550    ///
551    /// ```
552    /// # use pretty_assertions::assert_eq;
553    /// use std::{str, io::Cursor};
554    /// use quick_xml::events::Event;
555    /// use quick_xml::reader::Reader;
556    ///
557    /// let xml = r#"<tag1 att1 = "test">
558    ///                 <tag2><!--Test comment-->Test</tag2>
559    ///                 <tag3>Test 2</tag3>
560    ///              </tag1>"#;
561    /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
562    /// let mut buf = Vec::new();
563    ///
564    /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
565    ///     let end_pos = reader.buffer_position();
566    ///     let mut cursor = reader.into_inner();
567    ///     let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
568    ///         .expect("can't make a string");
569    ///     let mut line = 1;
570    ///     let mut column = 0;
571    ///     for c in s.chars() {
572    ///         if c == '\n' {
573    ///             line += 1;
574    ///             column = 0;
575    ///         } else {
576    ///             column += 1;
577    ///         }
578    ///     }
579    ///     (line, column)
580    /// }
581    ///
582    /// loop {
583    ///     match reader.read_event_into(&mut buf) {
584    ///         Ok(Event::Start(ref e)) => match e.name().as_ref() {
585    ///             b"tag1" | b"tag2" => (),
586    ///             tag => {
587    ///                 assert_eq!(b"tag3", tag);
588    ///                 assert_eq!((3, 22), into_line_and_column(reader));
589    ///                 break;
590    ///             }
591    ///         },
592    ///         Ok(Event::Eof) => unreachable!(),
593    ///         _ => (),
594    ///     }
595    ///     buf.clear();
596    /// }
597    /// ```
598    pub fn into_inner(self) -> R {
599        self.reader
600    }
601
602    /// Gets a reference to the underlying reader.
603    pub fn get_ref(&self) -> &R {
604        &self.reader
605    }
606
607    /// Gets a mutable reference to the underlying reader.
608    pub fn get_mut(&mut self) -> &mut R {
609        &mut self.reader
610    }
611
612    /// Gets the current byte position in the input data.
613    ///
614    /// Useful when debugging errors.
615    pub fn buffer_position(&self) -> usize {
616        // when internal state is OpenedTag, we have actually read until '<',
617        // which we don't want to show
618        if let ParseState::OpenedTag = self.parser.state {
619            self.parser.offset - 1
620        } else {
621            self.parser.offset
622        }
623    }
624
625    /// Get the decoder, used to decode bytes, read by this reader, to the strings.
626    ///
627    /// If `encoding` feature is enabled, the used encoding may change after
628    /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
629    ///
630    /// If `encoding` feature is enabled and no encoding is specified in declaration,
631    /// defaults to UTF-8.
632    #[inline]
633    pub fn decoder(&self) -> Decoder {
634        self.parser.decoder()
635    }
636}
637
/// Private sync reading methods
impl<R> Reader<R> {
    /// Reads the next event.
    ///
    /// Text is read into the given buffer, and the returned event borrows from
    /// either that buffer or from the input itself, based on the type of the
    /// reader. The body is generated by the `read_event_impl!` macro, which
    /// dispatches to [`Self::read_until_open`] and [`Self::read_until_close`].
    fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>>
    where
        R: XmlSource<'i, B>,
    {
        read_event_impl!(self, buf, self.reader, read_until_open, read_until_close)
    }

    /// Reads until `<` is found, moves the reader to the `OpenedTag` state and
    /// returns a `Text` event for the bytes in front of it.
    ///
    /// Returns inner `Ok` if the parsing loop should be broken and an event returned.
    /// Returns inner `Err` with the same `buf` when no event was produced; the buffer
    /// is handed back because Rust borrowck stumbles upon this case in particular.
    fn read_until_open<'i, B>(&mut self, buf: B) -> Result<std::result::Result<Event<'i>, B>>
    where
        R: XmlSource<'i, B>,
    {
        // NOTE: the macro expansion contains a `return`, so it must stay the
        // whole body of this function
        read_until_open!(self, buf, self.reader, read_event_impl)
    }

    /// Private function to read until `>` is found. This function expects that
    /// it was called just after encountering a `<` symbol.
    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
    where
        R: XmlSource<'i, B>,
    {
        read_until_close!(self, buf, self.reader)
    }
}
670
671////////////////////////////////////////////////////////////////////////////////////////////////////
672
673/// Represents an input for a reader that can return borrowed data.
674///
675/// There are two implementors of this trait: generic one that read data from
676/// `Self`, copies some part of it into a provided buffer of type `B` and then
677/// returns data that borrow from that buffer.
678///
679/// The other implementor is for `&[u8]` and instead of copying data returns
680/// borrowed data from `Self` instead. This implementation allows zero-copy
681/// deserialization.
682///
683/// # Parameters
684/// - `'r`: lifetime of a buffer from which events will borrow
685/// - `B`: a type of a buffer that can be used to store data read from `Self` and
686///   from which events can borrow
687trait XmlSource<'r, B> {
688    /// Removes UTF-8 BOM if it is present
689    #[cfg(not(feature = "encoding"))]
690    fn remove_utf8_bom(&mut self) -> Result<()>;
691
692    /// Determines encoding from the start of input and removes BOM if it is present
693    #[cfg(feature = "encoding")]
694    fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>>;
695
696    /// Read input until `byte` is found or end of input is reached.
697    ///
698    /// Returns a slice of data read up to `byte`, which does not include into result.
699    /// If input (`Self`) is exhausted, returns `None`.
700    ///
701    /// # Example
702    ///
703    /// ```ignore
704    /// let mut position = 0;
705    /// let mut input = b"abc*def".as_ref();
706    /// //                    ^= 4
707    ///
708    /// assert_eq!(
709    ///     input.read_bytes_until(b'*', (), &mut position).unwrap(),
710    ///     Some(b"abc".as_ref())
711    /// );
712    /// assert_eq!(position, 4); // position after the symbol matched
713    /// ```
714    ///
715    /// # Parameters
716    /// - `byte`: Byte for search
717    /// - `buf`: Buffer that could be filled from an input (`Self`) and
718    ///   from which [events] could borrow their data
719    /// - `position`: Will be increased by amount of bytes consumed
720    ///
721    /// [events]: crate::events::Event
722    fn read_bytes_until(
723        &mut self,
724        byte: u8,
725        buf: B,
726        position: &mut usize,
727    ) -> Result<Option<&'r [u8]>>;
728
729    /// Read input until comment, CDATA or processing instruction is finished.
730    ///
731    /// This method expect that `<` already was read.
732    ///
733    /// Returns a slice of data read up to end of comment, CDATA or processing
734    /// instruction (`>`), which does not include into result.
735    ///
736    /// If input (`Self`) is exhausted and nothing was read, returns `None`.
737    ///
738    /// # Parameters
739    /// - `buf`: Buffer that could be filled from an input (`Self`) and
740    ///   from which [events] could borrow their data
741    /// - `position`: Will be increased by amount of bytes consumed
742    ///
743    /// [events]: crate::events::Event
744    fn read_bang_element(
745        &mut self,
746        buf: B,
747        position: &mut usize,
748    ) -> Result<Option<(BangType, &'r [u8])>>;
749
750    /// Read input until XML element is closed by approaching a `>` symbol.
751    /// Returns `Some(buffer)` that contains a data between `<` and `>` or
752    /// `None` if end-of-input was reached and nothing was read.
753    ///
754    /// Derived from `read_until`, but modified to handle XML attributes
755    /// using a minimal state machine.
756    ///
757    /// Attribute values are [defined] as follows:
758    /// ```plain
759    /// AttValue := '"' (([^<&"]) | Reference)* '"'
760    ///           | "'" (([^<&']) | Reference)* "'"
761    /// ```
762    /// (`Reference` is something like `&quot;`, but we don't care about
763    /// escaped characters at this level)
764    ///
765    /// # Parameters
766    /// - `buf`: Buffer that could be filled from an input (`Self`) and
767    ///   from which [events] could borrow their data
768    /// - `position`: Will be increased by amount of bytes consumed
769    ///
770    /// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue
771    /// [events]: crate::events::Event
772    fn read_element(&mut self, buf: B, position: &mut usize) -> Result<Option<&'r [u8]>>;
773
774    /// Consume and discard all the whitespace until the next non-whitespace
775    /// character or EOF.
776    ///
777    /// # Parameters
778    /// - `position`: Will be increased by amount of bytes consumed
779    fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>;
780
    /// Consumes and discards one character if it matches the given byte.
    /// Returns `true` if it matched.
    ///
    /// # Parameters
    /// - `byte`: The byte that the next character of the input is compared with
    /// - `position`: Will be increased by 1 if the byte is matched
    fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool>;
787
    /// Returns one character without consuming it, so that future `read_*` calls
    /// will still include it. On EOF, returns `None`.
    fn peek_one(&mut self) -> Result<Option<u8>>;
791}
792
793/// Possible elements started with `<!`
794#[derive(Debug, PartialEq)]
795enum BangType {
796    /// <![CDATA[...]]>
797    CData,
798    /// <!--...-->
799    Comment,
800    /// <!DOCTYPE...>
801    DocType,
802}
803impl BangType {
804    #[inline(always)]
805    fn new(byte: Option<u8>) -> Result<Self> {
806        Ok(match byte {
807            Some(b'[') => Self::CData,
808            Some(b'-') => Self::Comment,
809            Some(b'D') | Some(b'd') => Self::DocType,
810            Some(b) => return Err(Error::UnexpectedBang(b)),
811            None => return Err(Error::UnexpectedEof("Bang".to_string())),
812        })
813    }
814
815    /// If element is finished, returns its content up to `>` symbol and
816    /// an index of this symbol, otherwise returns `None`
817    ///
818    /// # Parameters
819    /// - `buf`: buffer with data consumed on previous iterations
820    /// - `chunk`: data read on current iteration and not yet consumed from reader
821    #[inline(always)]
822    fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
823        for i in memchr::memchr_iter(b'>', chunk) {
824            match self {
825                // Need to read at least 6 symbols (`!---->`) for properly finished comment
826                // <!----> - XML comment
827                //  012345 - i
828                Self::Comment if buf.len() + i > 4 => {
829                    if chunk[..i].ends_with(b"--") {
830                        // We cannot strip last `--` from the buffer because we need it in case of
831                        // check_comments enabled option. XML standard requires that comment
832                        // will not end with `--->` sequence because this is a special case of
833                        // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
834                        return Some((&chunk[..i], i + 1)); // +1 for `>`
835                    }
836                    // End sequence `-|->` was splitted at |
837                    //        buf --/   \-- chunk
838                    if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
839                        return Some((&chunk[..i], i + 1)); // +1 for `>`
840                    }
841                    // End sequence `--|>` was splitted at |
842                    //         buf --/   \-- chunk
843                    if i == 0 && buf.ends_with(b"--") {
844                        return Some((&[], i + 1)); // +1 for `>`
845                    }
846                }
847                Self::Comment => {}
848                Self::CData => {
849                    if chunk[..i].ends_with(b"]]") {
850                        return Some((&chunk[..i], i + 1)); // +1 for `>`
851                    }
852                    // End sequence `]|]>` was splitted at |
853                    //        buf --/   \-- chunk
854                    if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
855                        return Some((&chunk[..i], i + 1)); // +1 for `>`
856                    }
857                    // End sequence `]]|>` was splitted at |
858                    //         buf --/   \-- chunk
859                    if i == 0 && buf.ends_with(b"]]") {
860                        return Some((&[], i + 1)); // +1 for `>`
861                    }
862                }
863                Self::DocType => {
864                    let content = &chunk[..i];
865                    let balance = memchr::memchr2_iter(b'<', b'>', content)
866                        .map(|p| if content[p] == b'<' { 1i32 } else { -1 })
867                        .sum::<i32>();
868                    if balance == 0 {
869                        return Some((content, i + 1)); // +1 for `>`
870                    }
871                }
872            }
873        }
874        None
875    }
876    #[inline]
877    fn to_err(&self) -> Error {
878        let bang_str = match self {
879            Self::CData => "CData",
880            Self::Comment => "Comment",
881            Self::DocType => "DOCTYPE",
882        };
883        Error::UnexpectedEof(bang_str.to_string())
884    }
885}
886
887/// State machine for the [`XmlSource::read_element`]
888#[derive(Clone, Copy)]
889enum ReadElementState {
890    /// The initial state (inside element, but outside of attribute value)
891    Elem,
892    /// Inside a single-quoted attribute value
893    SingleQ,
894    /// Inside a double-quoted attribute value
895    DoubleQ,
896}
897impl ReadElementState {
898    /// Changes state by analyzing part of input.
899    /// Returns a tuple with part of chunk up to element closing symbol `>`
900    /// and a position after that symbol or `None` if such symbol was not found
901    #[inline(always)]
902    fn change<'b>(&mut self, chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
903        for i in memchr::memchr3_iter(b'>', b'\'', b'"', chunk) {
904            *self = match (*self, chunk[i]) {
905                // only allowed to match `>` while we are in state `Elem`
906                (Self::Elem, b'>') => return Some((&chunk[..i], i + 1)),
907                (Self::Elem, b'\'') => Self::SingleQ,
908                (Self::Elem, b'\"') => Self::DoubleQ,
909
910                // the only end_byte that gets us out if the same character
911                (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Elem,
912
913                // all other bytes: no state change
914                _ => *self,
915            };
916        }
917        None
918    }
919}
920
/// Returns `true` if the byte is one of the whitespace bytes: blank (`' '`),
/// carriage return (`'\r'`), new line (`'\n'`) or tab (`'\t'`)
#[inline]
pub(crate) const fn is_whitespace(b: u8) -> bool {
    match b {
        b' ' | b'\t' | b'\n' | b'\r' => true,
        _ => false,
    }
}
926
927////////////////////////////////////////////////////////////////////////////////////////////////////
928
929#[cfg(test)]
930mod test {
931    /// Checks the internal implementation of the various reader methods
932    macro_rules! check {
933        (
934            #[$test:meta]
935            $read_event:ident,
936            $read_until_close:ident,
937            // constructor of the XML source on which internal functions will be called
938            $source:path,
939            // constructor of the buffer to which read data will stored
940            $buf:expr
941            $(, $async:ident, $await:ident)?
942        ) => {
943            mod read_bytes_until {
944                use super::*;
945                // Use Bytes for printing bytes as strings for ASCII range
946                use crate::utils::Bytes;
947                use pretty_assertions::assert_eq;
948
949                /// Checks that search in the empty buffer returns `None`
950                #[$test]
951                $($async)? fn empty() {
952                    let buf = $buf;
953                    let mut position = 0;
954                    let mut input = b"".as_ref();
955                    //                ^= 0
956
957                    assert_eq!(
958                        $source(&mut input)
959                            .read_bytes_until(b'*', buf, &mut position)
960                            $(.$await)?
961                            .unwrap()
962                            .map(Bytes),
963                        None
964                    );
965                    assert_eq!(position, 0);
966                }
967
968                /// Checks that search in the buffer non-existent value returns entire buffer
969                /// as a result and set `position` to `len()`
970                #[$test]
971                $($async)? fn non_existent() {
972                    let buf = $buf;
973                    let mut position = 0;
974                    let mut input = b"abcdef".as_ref();
975                    //                      ^= 6
976
977                    assert_eq!(
978                        $source(&mut input)
979                            .read_bytes_until(b'*', buf, &mut position)
980                            $(.$await)?
981                            .unwrap()
982                            .map(Bytes),
983                        Some(Bytes(b"abcdef"))
984                    );
985                    assert_eq!(position, 6);
986                }
987
988                /// Checks that search in the buffer an element that is located in the front of
989                /// buffer returns empty slice as a result and set `position` to one symbol
990                /// after match (`1`)
991                #[$test]
992                $($async)? fn at_the_start() {
993                    let buf = $buf;
994                    let mut position = 0;
995                    let mut input = b"*abcdef".as_ref();
996                    //                 ^= 1
997
998                    assert_eq!(
999                        $source(&mut input)
1000                            .read_bytes_until(b'*', buf, &mut position)
1001                            $(.$await)?
1002                            .unwrap()
1003                            .map(Bytes),
1004                        Some(Bytes(b""))
1005                    );
1006                    assert_eq!(position, 1); // position after the symbol matched
1007                }
1008
1009                /// Checks that search in the buffer an element that is located in the middle of
1010                /// buffer returns slice before that symbol as a result and set `position` to one
1011                /// symbol after match
1012                #[$test]
1013                $($async)? fn inside() {
1014                    let buf = $buf;
1015                    let mut position = 0;
1016                    let mut input = b"abc*def".as_ref();
1017                    //                    ^= 4
1018
1019                    assert_eq!(
1020                        $source(&mut input)
1021                            .read_bytes_until(b'*', buf, &mut position)
1022                            $(.$await)?
1023                            .unwrap()
1024                            .map(Bytes),
1025                        Some(Bytes(b"abc"))
1026                    );
1027                    assert_eq!(position, 4); // position after the symbol matched
1028                }
1029
1030                /// Checks that search in the buffer an element that is located in the end of
1031                /// buffer returns slice before that symbol as a result and set `position` to one
1032                /// symbol after match (`len()`)
1033                #[$test]
1034                $($async)? fn in_the_end() {
1035                    let buf = $buf;
1036                    let mut position = 0;
1037                    let mut input = b"abcdef*".as_ref();
1038                    //                       ^= 7
1039
1040                    assert_eq!(
1041                        $source(&mut input)
1042                            .read_bytes_until(b'*', buf, &mut position)
1043                            $(.$await)?
1044                            .unwrap()
1045                            .map(Bytes),
1046                        Some(Bytes(b"abcdef"))
1047                    );
1048                    assert_eq!(position, 7); // position after the symbol matched
1049                }
1050            }
1051
1052            mod read_bang_element {
1053                use super::*;
1054
1055                /// Checks that reading CDATA content works correctly
1056                mod cdata {
1057                    use super::*;
1058                    use crate::errors::Error;
1059                    use crate::reader::BangType;
1060                    use crate::utils::Bytes;
1061                    use pretty_assertions::assert_eq;
1062
1063                    /// Checks that if input begins like CDATA element, but CDATA start sequence
1064                    /// is not finished, parsing ends with an error
1065                    #[$test]
1066                    #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
1067                    $($async)? fn not_properly_start() {
1068                        let buf = $buf;
1069                        let mut position = 0;
1070                        let mut input = b"![]]>other content".as_ref();
1071                        //                ^= 0
1072
1073                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1074                            Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1075                            x => assert!(
1076                                false,
1077                                r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1078                                x
1079                            ),
1080                        }
1081                        assert_eq!(position, 0);
1082                    }
1083
1084                    /// Checks that if CDATA startup sequence was matched, but an end sequence
1085                    /// is not found, parsing ends with an error
1086                    #[$test]
1087                    $($async)? fn not_closed() {
1088                        let buf = $buf;
1089                        let mut position = 0;
1090                        let mut input = b"![CDATA[other content".as_ref();
1091                        //                ^= 0
1092
1093                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1094                            Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1095                            x => assert!(
1096                                false,
1097                                r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1098                                x
1099                            ),
1100                        }
1101                        assert_eq!(position, 0);
1102                    }
1103
1104                    /// Checks that CDATA element without content inside parsed successfully
1105                    #[$test]
1106                    $($async)? fn empty() {
1107                        let buf = $buf;
1108                        let mut position = 0;
1109                        let mut input = b"![CDATA[]]>other content".as_ref();
1110                        //                           ^= 11
1111
1112                        assert_eq!(
1113                            $source(&mut input)
1114                                .read_bang_element(buf, &mut position)
1115                                $(.$await)?
1116                                .unwrap()
1117                                .map(|(ty, data)| (ty, Bytes(data))),
1118                            Some((BangType::CData, Bytes(b"![CDATA[]]")))
1119                        );
1120                        assert_eq!(position, 11);
1121                    }
1122
1123                    /// Checks that CDATA element with content parsed successfully.
1124                    /// Additionally checks that sequences inside CDATA that may look like
1125                    /// a CDATA end sequence do not interrupt CDATA parsing
1126                    #[$test]
1127                    $($async)? fn with_content() {
1128                        let buf = $buf;
1129                        let mut position = 0;
1130                        let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref();
1131                        //                                            ^= 28
1132
1133                        assert_eq!(
1134                            $source(&mut input)
1135                                .read_bang_element(buf, &mut position)
1136                                $(.$await)?
1137                                .unwrap()
1138                                .map(|(ty, data)| (ty, Bytes(data))),
1139                            Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]")))
1140                        );
1141                        assert_eq!(position, 28);
1142                    }
1143                }
1144
1145                /// Checks that reading XML comments works correctly. According to the [specification],
1146                /// comment data can contain any sequence except `--`:
1147                ///
1148                /// ```peg
                /// comment = '<!--' (!'--' char)* '-->';
1150                /// char = [#x1-#x2C]
1151                ///      / [#x2E-#xD7FF]
1152                ///      / [#xE000-#xFFFD]
1153                ///      / [#x10000-#x10FFFF]
1154                /// ```
1155                ///
                /// This limitation, however, is simply a consequence of a poorly designed specification
                /// (maybe made for the purpose of building an LL(1) XML parser), and quick-xml does not check
                /// for the presence of these sequences by default. These tests allow such content.
1159                ///
1160                /// [specification]: https://www.w3.org/TR/xml11/#dt-comment
1161                mod comment {
1162                    use super::*;
1163                    use crate::errors::Error;
1164                    use crate::reader::BangType;
1165                    use crate::utils::Bytes;
1166                    use pretty_assertions::assert_eq;
1167
1168                    #[$test]
1169                    #[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
1170                    $($async)? fn not_properly_start() {
1171                        let buf = $buf;
1172                        let mut position = 0;
1173                        let mut input = b"!- -->other content".as_ref();
1174                        //                ^= 0
1175
1176                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1177                            Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1178                            x => assert!(
1179                                false,
1180                                r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1181                                x
1182                            ),
1183                        }
1184                        assert_eq!(position, 0);
1185                    }
1186
1187                    #[$test]
1188                    $($async)? fn not_properly_end() {
1189                        let buf = $buf;
1190                        let mut position = 0;
1191                        let mut input = b"!->other content".as_ref();
1192                        //                ^= 0
1193
1194                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1195                            Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1196                            x => assert!(
1197                                false,
1198                                r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1199                                x
1200                            ),
1201                        }
1202                        assert_eq!(position, 0);
1203                    }
1204
1205                    #[$test]
1206                    $($async)? fn not_closed1() {
1207                        let buf = $buf;
1208                        let mut position = 0;
1209                        let mut input = b"!--other content".as_ref();
1210                        //                ^= 0
1211
1212                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1213                            Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1214                            x => assert!(
1215                                false,
1216                                r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1217                                x
1218                            ),
1219                        }
1220                        assert_eq!(position, 0);
1221                    }
1222
1223                    #[$test]
1224                    $($async)? fn not_closed2() {
1225                        let buf = $buf;
1226                        let mut position = 0;
1227                        let mut input = b"!-->other content".as_ref();
1228                        //                ^= 0
1229
1230                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1231                            Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1232                            x => assert!(
1233                                false,
1234                                r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1235                                x
1236                            ),
1237                        }
1238                        assert_eq!(position, 0);
1239                    }
1240
1241                    #[$test]
1242                    $($async)? fn not_closed3() {
1243                        let buf = $buf;
1244                        let mut position = 0;
1245                        let mut input = b"!--->other content".as_ref();
1246                        //                ^= 0
1247
1248                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1249                            Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1250                            x => assert!(
1251                                false,
1252                                r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1253                                x
1254                            ),
1255                        }
1256                        assert_eq!(position, 0);
1257                    }
1258
1259                    #[$test]
1260                    $($async)? fn empty() {
1261                        let buf = $buf;
1262                        let mut position = 0;
1263                        let mut input = b"!---->other content".as_ref();
1264                        //                      ^= 6
1265
1266                        assert_eq!(
1267                            $source(&mut input)
1268                                .read_bang_element(buf, &mut position)
1269                                $(.$await)?
1270                                .unwrap()
1271                                .map(|(ty, data)| (ty, Bytes(data))),
1272                            Some((BangType::Comment, Bytes(b"!----")))
1273                        );
1274                        assert_eq!(position, 6);
1275                    }
1276
1277                    #[$test]
1278                    $($async)? fn with_content() {
1279                        let buf = $buf;
1280                        let mut position = 0;
1281                        let mut input = b"!--->comment<--->other content".as_ref();
1282                        //                                 ^= 17
1283
1284                        assert_eq!(
1285                            $source(&mut input)
1286                                .read_bang_element(buf, &mut position)
1287                                $(.$await)?
1288                                .unwrap()
1289                                .map(|(ty, data)| (ty, Bytes(data))),
1290                            Some((BangType::Comment, Bytes(b"!--->comment<---")))
1291                        );
1292                        assert_eq!(position, 17);
1293                    }
1294                }
1295
1296                /// Checks that reading DOCTYPE definition works correctly
1297                mod doctype {
1298                    use super::*;
1299
1300                    mod uppercase {
1301                        use super::*;
1302                        use crate::errors::Error;
1303                        use crate::reader::BangType;
1304                        use crate::utils::Bytes;
1305                        use pretty_assertions::assert_eq;
1306
1307                        #[$test]
1308                        $($async)? fn not_properly_start() {
1309                            let buf = $buf;
1310                            let mut position = 0;
1311                            let mut input = b"!D other content".as_ref();
1312                            //                ^= 0
1313
1314                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1315                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1316                                x => assert!(
1317                                    false,
1318                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1319                                    x
1320                                ),
1321                            }
1322                            assert_eq!(position, 0);
1323                        }
1324
1325                        #[$test]
1326                        $($async)? fn without_space() {
1327                            let buf = $buf;
1328                            let mut position = 0;
1329                            let mut input = b"!DOCTYPEother content".as_ref();
1330                            //                ^= 0
1331
1332                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1333                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1334                                x => assert!(
1335                                    false,
1336                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1337                                    x
1338                                ),
1339                            }
1340                            assert_eq!(position, 0);
1341                        }
1342
1343                        #[$test]
1344                        $($async)? fn empty() {
1345                            let buf = $buf;
1346                            let mut position = 0;
1347                            let mut input = b"!DOCTYPE>other content".as_ref();
1348                            //                         ^= 9
1349
1350                            assert_eq!(
1351                                $source(&mut input)
1352                                    .read_bang_element(buf, &mut position)
1353                                    $(.$await)?
1354                                    .unwrap()
1355                                    .map(|(ty, data)| (ty, Bytes(data))),
1356                                Some((BangType::DocType, Bytes(b"!DOCTYPE")))
1357                            );
1358                            assert_eq!(position, 9);
1359                        }
1360
1361                        #[$test]
1362                        $($async)? fn not_closed() {
1363                            let buf = $buf;
1364                            let mut position = 0;
1365                            let mut input = b"!DOCTYPE other content".as_ref();
1366                            //                ^= 0
1367
1368                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1369                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1370                                x => assert!(
1371                                    false,
1372                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1373                                    x
1374                                ),
1375                            }
1376                            assert_eq!(position, 0);
1377                        }
1378                    }
1379
1380                    mod lowercase {
1381                        use super::*;
1382                        use crate::errors::Error;
1383                        use crate::reader::BangType;
1384                        use crate::utils::Bytes;
1385                        use pretty_assertions::assert_eq;
1386
1387                        #[$test]
1388                        $($async)? fn not_properly_start() {
1389                            let buf = $buf;
1390                            let mut position = 0;
1391                            let mut input = b"!d other content".as_ref();
1392                            //                ^= 0
1393
1394                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1395                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1396                                x => assert!(
1397                                    false,
1398                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1399                                    x
1400                                ),
1401                            }
1402                            assert_eq!(position, 0);
1403                        }
1404
1405                        #[$test]
1406                        $($async)? fn without_space() {
1407                            let buf = $buf;
1408                            let mut position = 0;
1409                            let mut input = b"!doctypeother content".as_ref();
1410                            //                ^= 0
1411
1412                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1413                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1414                                x => assert!(
1415                                    false,
1416                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1417                                    x
1418                                ),
1419                            }
1420                            assert_eq!(position, 0);
1421                        }
1422
1423                        #[$test]
1424                        $($async)? fn empty() {
1425                            let buf = $buf;
1426                            let mut position = 0;
1427                            let mut input = b"!doctype>other content".as_ref();
1428                            //                         ^= 9
1429
1430                            assert_eq!(
1431                                $source(&mut input)
1432                                    .read_bang_element(buf, &mut position)
1433                                    $(.$await)?
1434                                    .unwrap()
1435                                    .map(|(ty, data)| (ty, Bytes(data))),
1436                                Some((BangType::DocType, Bytes(b"!doctype")))
1437                            );
1438                            assert_eq!(position, 9);
1439                        }
1440
1441                        #[$test]
1442                        $($async)? fn not_closed() {
1443                            let buf = $buf;
1444                            let mut position = 0;
1445                            let mut input = b"!doctype other content".as_ref();
1446                            //                ^= 0
1447
1448                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1449                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1450                                x => assert!(
1451                                    false,
1452                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1453                                    x
1454                                ),
1455                            }
1456                            assert_eq!(position, 0);
1457                        }
1458                    }
1459                }
1460            }
1461
1462            mod read_element {
1463                use super::*;
1464                use crate::utils::Bytes;
1465                use pretty_assertions::assert_eq;
1466
1467                /// Checks that nothing was read from empty buffer
1468                #[$test]
1469                $($async)? fn empty() {
1470                    let buf = $buf;
1471                    let mut position = 0;
1472                    let mut input = b"".as_ref();
1473                    //                ^= 0
1474
1475                    assert_eq!(
1476                        $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1477                        None
1478                    );
1479                    assert_eq!(position, 0);
1480                }
1481
1482                mod open {
1483                    use super::*;
1484                    use crate::utils::Bytes;
1485                    use pretty_assertions::assert_eq;
1486
1487                    #[$test]
1488                    $($async)? fn empty_tag() {
1489                        let buf = $buf;
1490                        let mut position = 0;
1491                        let mut input = b">".as_ref();
1492                        //                 ^= 1
1493
1494                        assert_eq!(
1495                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1496                            Some(Bytes(b""))
1497                        );
1498                        assert_eq!(position, 1);
1499                    }
1500
1501                    #[$test]
1502                    $($async)? fn normal() {
1503                        let buf = $buf;
1504                        let mut position = 0;
1505                        let mut input = b"tag>".as_ref();
1506                        //                    ^= 4
1507
1508                        assert_eq!(
1509                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1510                            Some(Bytes(b"tag"))
1511                        );
1512                        assert_eq!(position, 4);
1513                    }
1514
1515                    #[$test]
1516                    $($async)? fn empty_ns_empty_tag() {
1517                        let buf = $buf;
1518                        let mut position = 0;
1519                        let mut input = b":>".as_ref();
1520                        //                  ^= 2
1521
1522                        assert_eq!(
1523                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1524                            Some(Bytes(b":"))
1525                        );
1526                        assert_eq!(position, 2);
1527                    }
1528
1529                    #[$test]
1530                    $($async)? fn empty_ns() {
1531                        let buf = $buf;
1532                        let mut position = 0;
1533                        let mut input = b":tag>".as_ref();
1534                        //                     ^= 5
1535
1536                        assert_eq!(
1537                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1538                            Some(Bytes(b":tag"))
1539                        );
1540                        assert_eq!(position, 5);
1541                    }
1542
1543                    #[$test]
1544                    $($async)? fn with_attributes() {
1545                        let buf = $buf;
1546                        let mut position = 0;
1547                        let mut input = br#"tag  attr-1=">"  attr2  =  '>'  3attr>"#.as_ref();
1548                        //                                                        ^= 38
1549
1550                        assert_eq!(
1551                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1552                            Some(Bytes(br#"tag  attr-1=">"  attr2  =  '>'  3attr"#))
1553                        );
1554                        assert_eq!(position, 38);
1555                    }
1556                }
1557
1558                mod self_closed {
1559                    use super::*;
1560                    use crate::utils::Bytes;
1561                    use pretty_assertions::assert_eq;
1562
1563                    #[$test]
1564                    $($async)? fn empty_tag() {
1565                        let buf = $buf;
1566                        let mut position = 0;
1567                        let mut input = b"/>".as_ref();
1568                        //                  ^= 2
1569
1570                        assert_eq!(
1571                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1572                            Some(Bytes(b"/"))
1573                        );
1574                        assert_eq!(position, 2);
1575                    }
1576
1577                    #[$test]
1578                    $($async)? fn normal() {
1579                        let buf = $buf;
1580                        let mut position = 0;
1581                        let mut input = b"tag/>".as_ref();
1582                        //                     ^= 5
1583
1584                        assert_eq!(
1585                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1586                            Some(Bytes(b"tag/"))
1587                        );
1588                        assert_eq!(position, 5);
1589                    }
1590
1591                    #[$test]
1592                    $($async)? fn empty_ns_empty_tag() {
1593                        let buf = $buf;
1594                        let mut position = 0;
1595                        let mut input = b":/>".as_ref();
1596                        //                   ^= 3
1597
1598                        assert_eq!(
1599                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1600                            Some(Bytes(b":/"))
1601                        );
1602                        assert_eq!(position, 3);
1603                    }
1604
1605                    #[$test]
1606                    $($async)? fn empty_ns() {
1607                        let buf = $buf;
1608                        let mut position = 0;
1609                        let mut input = b":tag/>".as_ref();
1610                        //                      ^= 6
1611
1612                        assert_eq!(
1613                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1614                            Some(Bytes(b":tag/"))
1615                        );
1616                        assert_eq!(position, 6);
1617                    }
1618
1619                    #[$test]
1620                    $($async)? fn with_attributes() {
1621                        let buf = $buf;
1622                        let mut position = 0;
1623                        let mut input = br#"tag  attr-1="/>"  attr2  =  '/>'  3attr/>"#.as_ref();
1624                        //                                                           ^= 41
1625
1626                        assert_eq!(
1627                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1628                            Some(Bytes(br#"tag  attr-1="/>"  attr2  =  '/>'  3attr/"#))
1629                        );
1630                        assert_eq!(position, 41);
1631                    }
1632                }
1633            }
1634
1635            mod issue_344 {
1636                use crate::errors::Error;
1637                use crate::reader::Reader;
1638
1639                #[$test]
1640                $($async)? fn cdata() {
1641                    let mut reader = Reader::from_str("![]]>");
1642
1643                    match reader.$read_until_close($buf) $(.$await)? {
1644                        Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1645                        x => assert!(
1646                            false,
1647                            r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1648                            x
1649                        ),
1650                    }
1651                }
1652
1653                #[$test]
1654                $($async)? fn comment() {
1655                    let mut reader = Reader::from_str("!- -->");
1656
1657                    match reader.$read_until_close($buf) $(.$await)? {
1658                        Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1659                        x => assert!(
1660                            false,
1661                            r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1662                            x
1663                        ),
1664                    }
1665                }
1666
1667                #[$test]
1668                $($async)? fn doctype_uppercase() {
1669                    let mut reader = Reader::from_str("!D>");
1670
1671                    match reader.$read_until_close($buf) $(.$await)? {
1672                        Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1673                        x => assert!(
1674                            false,
1675                            r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1676                            x
1677                        ),
1678                    }
1679                }
1680
1681                #[$test]
1682                $($async)? fn doctype_lowercase() {
1683                    let mut reader = Reader::from_str("!d>");
1684
1685                    match reader.$read_until_close($buf) $(.$await)? {
1686                        Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1687                        x => assert!(
1688                            false,
1689                            r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1690                            x
1691                        ),
1692                    }
1693                }
1694            }
1695
1696            /// Ensures, that no empty `Text` events are generated
1697            mod $read_event {
1698                use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
1699                use crate::reader::Reader;
1700                use pretty_assertions::assert_eq;
1701
1702                /// When `encoding` feature is enabled, encoding should be detected
1703                /// from BOM (UTF-8) and BOM should be stripped.
1704                ///
1705                /// When `encoding` feature is disabled, UTF-8 is assumed and BOM
1706                /// character should be stripped for consistency
1707                #[$test]
1708                $($async)? fn bom_from_reader() {
1709                    let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());
1710
1711                    assert_eq!(
1712                        reader.$read_event($buf) $(.$await)? .unwrap(),
1713                        Event::Text(BytesText::from_escaped("\u{feff}"))
1714                    );
1715
1716                    assert_eq!(
1717                        reader.$read_event($buf) $(.$await)? .unwrap(),
1718                        Event::Eof
1719                    );
1720                }
1721
1722                /// When parsing from &str, encoding is fixed (UTF-8), so
1723                /// - when `encoding` feature is disabled, the behavior the
1724                ///   same as in `bom_from_reader` text
1725                /// - when `encoding` feature is enabled, the behavior should
1726                ///   stay consistent, so the first BOM character is stripped
1727                #[$test]
1728                $($async)? fn bom_from_str() {
1729                    let mut reader = Reader::from_str("\u{feff}\u{feff}");
1730
1731                    assert_eq!(
1732                        reader.$read_event($buf) $(.$await)? .unwrap(),
1733                        Event::Text(BytesText::from_escaped("\u{feff}"))
1734                    );
1735
1736                    assert_eq!(
1737                        reader.$read_event($buf) $(.$await)? .unwrap(),
1738                        Event::Eof
1739                    );
1740                }
1741
1742                #[$test]
1743                $($async)? fn declaration() {
1744                    let mut reader = Reader::from_str("<?xml ?>");
1745
1746                    assert_eq!(
1747                        reader.$read_event($buf) $(.$await)? .unwrap(),
1748                        Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
1749                    );
1750                }
1751
1752                #[$test]
1753                $($async)? fn doctype() {
1754                    let mut reader = Reader::from_str("<!DOCTYPE x>");
1755
1756                    assert_eq!(
1757                        reader.$read_event($buf) $(.$await)? .unwrap(),
1758                        Event::DocType(BytesText::from_escaped("x"))
1759                    );
1760                }
1761
1762                #[$test]
1763                $($async)? fn processing_instruction() {
1764                    let mut reader = Reader::from_str("<?xml-stylesheet?>");
1765
1766                    assert_eq!(
1767                        reader.$read_event($buf) $(.$await)? .unwrap(),
1768                        Event::PI(BytesText::from_escaped("xml-stylesheet"))
1769                    );
1770                }
1771
1772                #[$test]
1773                $($async)? fn start() {
1774                    let mut reader = Reader::from_str("<tag>");
1775
1776                    assert_eq!(
1777                        reader.$read_event($buf) $(.$await)? .unwrap(),
1778                        Event::Start(BytesStart::new("tag"))
1779                    );
1780                }
1781
1782                #[$test]
1783                $($async)? fn end() {
1784                    let mut reader = Reader::from_str("</tag>");
1785                    // Because we expect invalid XML, do not check that
1786                    // the end name paired with the start name
1787                    reader.check_end_names(false);
1788
1789                    assert_eq!(
1790                        reader.$read_event($buf) $(.$await)? .unwrap(),
1791                        Event::End(BytesEnd::new("tag"))
1792                    );
1793                }
1794
1795                #[$test]
1796                $($async)? fn empty() {
1797                    let mut reader = Reader::from_str("<tag/>");
1798
1799                    assert_eq!(
1800                        reader.$read_event($buf) $(.$await)? .unwrap(),
1801                        Event::Empty(BytesStart::new("tag"))
1802                    );
1803                }
1804
1805                #[$test]
1806                $($async)? fn text() {
1807                    let mut reader = Reader::from_str("text");
1808
1809                    assert_eq!(
1810                        reader.$read_event($buf) $(.$await)? .unwrap(),
1811                        Event::Text(BytesText::from_escaped("text"))
1812                    );
1813                }
1814
1815                #[$test]
1816                $($async)? fn cdata() {
1817                    let mut reader = Reader::from_str("<![CDATA[]]>");
1818
1819                    assert_eq!(
1820                        reader.$read_event($buf) $(.$await)? .unwrap(),
1821                        Event::CData(BytesCData::new(""))
1822                    );
1823                }
1824
1825                #[$test]
1826                $($async)? fn comment() {
1827                    let mut reader = Reader::from_str("<!---->");
1828
1829                    assert_eq!(
1830                        reader.$read_event($buf) $(.$await)? .unwrap(),
1831                        Event::Comment(BytesText::from_escaped(""))
1832                    );
1833                }
1834
1835                #[$test]
1836                $($async)? fn eof() {
1837                    let mut reader = Reader::from_str("");
1838
1839                    assert_eq!(
1840                        reader.$read_event($buf) $(.$await)? .unwrap(),
1841                        Event::Eof
1842                    );
1843                }
1844            }
1845        };
1846    }
1847
1848    /// Tests for https://github.com/tafia/quick-xml/issues/469
1849    macro_rules! small_buffers {
1850        (
1851            #[$test:meta]
1852            $read_event:ident: $BufReader:ty
1853            $(, $async:ident, $await:ident)?
1854        ) => {
1855            mod small_buffers {
1856                use crate::events::{BytesCData, BytesDecl, BytesStart, BytesText, Event};
1857                use crate::reader::Reader;
1858                use pretty_assertions::assert_eq;
1859
1860                #[$test]
1861                $($async)? fn decl() {
1862                    let xml = "<?xml ?>";
1863                    //         ^^^^^^^ data that fit into buffer
1864                    let size = xml.match_indices("?>").next().unwrap().0 + 1;
1865                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1866                    let mut reader = Reader::from_reader(br);
1867                    let mut buf = Vec::new();
1868
1869                    assert_eq!(
1870                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1871                        Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
1872                    );
1873                    assert_eq!(
1874                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1875                        Event::Eof
1876                    );
1877                }
1878
1879                #[$test]
1880                $($async)? fn pi() {
1881                    let xml = "<?pi?>";
1882                    //         ^^^^^ data that fit into buffer
1883                    let size = xml.match_indices("?>").next().unwrap().0 + 1;
1884                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1885                    let mut reader = Reader::from_reader(br);
1886                    let mut buf = Vec::new();
1887
1888                    assert_eq!(
1889                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1890                        Event::PI(BytesText::new("pi"))
1891                    );
1892                    assert_eq!(
1893                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1894                        Event::Eof
1895                    );
1896                }
1897
1898                #[$test]
1899                $($async)? fn empty() {
1900                    let xml = "<empty/>";
1901                    //         ^^^^^^^ data that fit into buffer
1902                    let size = xml.match_indices("/>").next().unwrap().0 + 1;
1903                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1904                    let mut reader = Reader::from_reader(br);
1905                    let mut buf = Vec::new();
1906
1907                    assert_eq!(
1908                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1909                        Event::Empty(BytesStart::new("empty"))
1910                    );
1911                    assert_eq!(
1912                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1913                        Event::Eof
1914                    );
1915                }
1916
1917                #[$test]
1918                $($async)? fn cdata1() {
1919                    let xml = "<![CDATA[cdata]]>";
1920                    //         ^^^^^^^^^^^^^^^ data that fit into buffer
1921                    let size = xml.match_indices("]]>").next().unwrap().0 + 1;
1922                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1923                    let mut reader = Reader::from_reader(br);
1924                    let mut buf = Vec::new();
1925
1926                    assert_eq!(
1927                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1928                        Event::CData(BytesCData::new("cdata"))
1929                    );
1930                    assert_eq!(
1931                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1932                        Event::Eof
1933                    );
1934                }
1935
1936                #[$test]
1937                $($async)? fn cdata2() {
1938                    let xml = "<![CDATA[cdata]]>";
1939                    //         ^^^^^^^^^^^^^^^^ data that fit into buffer
1940                    let size = xml.match_indices("]]>").next().unwrap().0 + 2;
1941                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1942                    let mut reader = Reader::from_reader(br);
1943                    let mut buf = Vec::new();
1944
1945                    assert_eq!(
1946                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1947                        Event::CData(BytesCData::new("cdata"))
1948                    );
1949                    assert_eq!(
1950                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1951                        Event::Eof
1952                    );
1953                }
1954
1955                #[$test]
1956                $($async)? fn comment1() {
1957                    let xml = "<!--comment-->";
1958                    //         ^^^^^^^^^^^^ data that fit into buffer
1959                    let size = xml.match_indices("-->").next().unwrap().0 + 1;
1960                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1961                    let mut reader = Reader::from_reader(br);
1962                    let mut buf = Vec::new();
1963
1964                    assert_eq!(
1965                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1966                        Event::Comment(BytesText::new("comment"))
1967                    );
1968                    assert_eq!(
1969                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1970                        Event::Eof
1971                    );
1972                }
1973
1974                #[$test]
1975                $($async)? fn comment2() {
1976                    let xml = "<!--comment-->";
1977                    //         ^^^^^^^^^^^^^ data that fit into buffer
1978                    let size = xml.match_indices("-->").next().unwrap().0 + 2;
1979                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1980                    let mut reader = Reader::from_reader(br);
1981                    let mut buf = Vec::new();
1982
1983                    assert_eq!(
1984                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1985                        Event::Comment(BytesText::new("comment"))
1986                    );
1987                    assert_eq!(
1988                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1989                        Event::Eof
1990                    );
1991                }
1992            }
1993        };
1994    }
1995
1996    // Export macros for the child modules:
1997    // - buffered_reader
1998    // - slice_reader
1999    pub(super) use check;
2000    pub(super) use small_buffers;
2001}