calamine/
ods.rs

1//! A module to parse Open Document Spreadsheets
2//!
3//! # Reference
//! OASIS Open Document Format for Office Applications 1.2 (ODF 1.2)
5//! http://docs.oasis-open.org/office/v1.2/OpenDocument-v1.2.pdf
6
7use std::borrow::Cow;
8use std::collections::{BTreeMap, HashMap};
9use std::io::{BufReader, Read, Seek};
10
11use quick_xml::events::attributes::Attributes;
12use quick_xml::events::Event;
13use quick_xml::name::QName;
14use quick_xml::Reader as XmlReader;
15use zip::read::{ZipArchive, ZipFile};
16use zip::result::ZipError;
17
18use crate::vba::VbaProject;
19use crate::{DataType, Metadata, Range, Reader, Sheet, SheetType, SheetVisible};
20use std::marker::PhantomData;
21
/// Expected content of the archive's `mimetype` entry for a spreadsheet document.
const MIMETYPE: &[u8] = b"application/vnd.oasis.opendocument.spreadsheet";

/// XML reader over a buffered entry of the zip archive.
type OdsReader<'a> = XmlReader<BufReader<ZipFile<'a>>>;
25
/// An enum for ods specific errors
#[derive(Debug)]
pub enum OdsError {
    /// Io error
    Io(std::io::Error),
    /// Zip error
    Zip(zip::result::ZipError),
    /// Xml error
    Xml(quick_xml::Error),
    /// Xml attribute error
    XmlAttr(quick_xml::events::attributes::AttrError),
    /// Error while parsing string
    Parse(std::string::ParseError),
    /// Error while parsing integer
    ParseInt(std::num::ParseIntError),
    /// Error while parsing float
    ParseFloat(std::num::ParseFloatError),
    /// Error while parsing bool
    ParseBool(std::str::ParseBoolError),

    /// Invalid MIME: holds the bytes read from the `mimetype` entry
    InvalidMime(Vec<u8>),
    /// File not found: holds the name of the missing archive entry
    FileNotFound(&'static str),
    /// Unexpected end of file: holds the name of the node that was expected
    Eof(&'static str),
    /// An xml event other than the expected one was found
    Mismatch {
        /// Expected
        expected: &'static str,
        /// Found
        found: String,
    },
}
60
// NOTE(review): `from_err!` is defined outside this file; presumably it
// generates `From<$source> for OdsError` using the given variant, which is
// what lets `?` convert these error types below. Confirm against the macro.
from_err!(std::io::Error, OdsError, Io);
from_err!(zip::result::ZipError, OdsError, Zip);
from_err!(quick_xml::Error, OdsError, Xml);
from_err!(std::string::ParseError, OdsError, Parse);
from_err!(std::num::ParseFloatError, OdsError, ParseFloat);
66
67impl std::fmt::Display for OdsError {
68    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
69        match self {
70            OdsError::Io(e) => write!(f, "I/O error: {}", e),
71            OdsError::Zip(e) => write!(f, "Zip error: {:?}", e),
72            OdsError::Xml(e) => write!(f, "Xml error: {}", e),
73            OdsError::XmlAttr(e) => write!(f, "Xml attribute error: {}", e),
74            OdsError::Parse(e) => write!(f, "Parse string error: {}", e),
75            OdsError::ParseInt(e) => write!(f, "Parse integer error: {}", e),
76            OdsError::ParseFloat(e) => write!(f, "Parse float error: {}", e),
77            OdsError::ParseBool(e) => write!(f, "Parse bool error: {}", e),
78            OdsError::InvalidMime(mime) => write!(f, "Invalid MIME type: {:?}", mime),
79            OdsError::FileNotFound(file) => write!(f, "'{}' file not found in archive", file),
80            OdsError::Eof(node) => write!(f, "Expecting '{}' node, found end of xml file", node),
81            OdsError::Mismatch { expected, found } => {
82                write!(f, "Expecting '{}', found '{}'", expected, found)
83            }
84        }
85    }
86}
87
88impl std::error::Error for OdsError {
89    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
90        match self {
91            OdsError::Io(e) => Some(e),
92            OdsError::Zip(e) => Some(e),
93            OdsError::Xml(e) => Some(e),
94            OdsError::Parse(e) => Some(e),
95            OdsError::ParseInt(e) => Some(e),
96            OdsError::ParseFloat(e) => Some(e),
97            _ => None,
98        }
99    }
100}
101
/// An OpenDocument Spreadsheet document parser
///
/// # Reference
/// OASIS Open Document Format for Office Applications 1.2 (ODF 1.2)
/// <http://docs.oasis-open.org/office/v1.2/OpenDocument-v1.2.pdf>
pub struct Ods<RS> {
    /// Cell values and cell formulas of every sheet, keyed by sheet name.
    sheets: BTreeMap<String, (Range<DataType>, Range<String>)>,
    /// Sheet descriptions and defined names collected while parsing.
    metadata: Metadata,
    /// Ties the parser to its reader type `RS` without storing a reader.
    marker: PhantomData<RS>,
    /// `(extension, raw bytes)` of the pictures found in the archive, if any.
    #[cfg(feature = "picture")]
    pictures: Option<Vec<(String, Vec<u8>)>>,
}
114
115impl<RS> Reader<RS> for Ods<RS>
116where
117    RS: Read + Seek,
118{
119    type Error = OdsError;
120
121    fn new(reader: RS) -> Result<Self, OdsError> {
122        let mut zip = ZipArchive::new(reader)?;
123
124        // check mimetype
125        match zip.by_name("mimetype") {
126            Ok(mut f) => {
127                let mut buf = [0u8; 46];
128                f.read_exact(&mut buf)?;
129                if &buf[..] != MIMETYPE {
130                    return Err(OdsError::InvalidMime(buf.to_vec()));
131                }
132            }
133            Err(ZipError::FileNotFound) => return Err(OdsError::FileNotFound("mimetype")),
134            Err(e) => return Err(OdsError::Zip(e)),
135        }
136
137        #[cfg(feature = "picture")]
138        let pictures = read_pictures(&mut zip)?;
139
140        let Content {
141            sheets,
142            sheets_metadata,
143            defined_names,
144        } = parse_content(zip)?;
145        let metadata = Metadata {
146            sheets: sheets_metadata,
147            names: defined_names,
148        };
149
150        Ok(Ods {
151            marker: PhantomData,
152            metadata,
153            sheets,
154            #[cfg(feature = "picture")]
155            pictures,
156        })
157    }
158
159    /// Gets `VbaProject`
160    fn vba_project(&mut self) -> Option<Result<Cow<'_, VbaProject>, OdsError>> {
161        None
162    }
163
164    /// Read sheets from workbook.xml and get their corresponding path from relationships
165    fn metadata(&self) -> &Metadata {
166        &self.metadata
167    }
168
169    /// Read worksheet data in corresponding worksheet path
170    fn worksheet_range(&mut self, name: &str) -> Option<Result<Range<DataType>, OdsError>> {
171        self.sheets.get(name).map(|r| Ok(r.0.to_owned()))
172    }
173
174    fn worksheets(&mut self) -> Vec<(String, Range<DataType>)> {
175        self.sheets
176            .iter()
177            .map(|(name, (range, _formula))| (name.to_owned(), range.clone()))
178            .collect()
179    }
180
181    /// Read worksheet data in corresponding worksheet path
182    fn worksheet_formula(&mut self, name: &str) -> Option<Result<Range<String>, OdsError>> {
183        self.sheets.get(name).map(|r| Ok(r.1.to_owned()))
184    }
185
186    #[cfg(feature = "picture")]
187    fn pictures(&self) -> Option<Vec<(String, Vec<u8>)>> {
188        self.pictures.to_owned()
189    }
190}
191
/// Everything extracted from `content.xml` by `parse_content`.
struct Content {
    /// Cell values and cell formulas of every sheet, keyed by sheet name.
    sheets: BTreeMap<String, (Range<DataType>, Range<String>)>,
    /// One entry per sheet, pushed in the order the sheets are parsed.
    sheets_metadata: Vec<Sheet>,
    /// `(name, formula)` pairs read from `table:named-expressions`.
    defined_names: Vec<(String, String)>,
}
197
198/// Parses content.xml and store the result in `self.content`
199fn parse_content<RS: Read + Seek>(mut zip: ZipArchive<RS>) -> Result<Content, OdsError> {
200    let mut reader = match zip.by_name("content.xml") {
201        Ok(f) => {
202            let mut r = XmlReader::from_reader(BufReader::new(f));
203            r.check_end_names(false)
204                .trim_text(false)
205                .check_comments(false)
206                .expand_empty_elements(true);
207            r
208        }
209        Err(ZipError::FileNotFound) => return Err(OdsError::FileNotFound("content.xml")),
210        Err(e) => return Err(OdsError::Zip(e)),
211    };
212    let mut buf = Vec::with_capacity(1024);
213    let mut sheets = BTreeMap::new();
214    let mut defined_names = Vec::new();
215    let mut sheets_metadata = Vec::new();
216    let mut styles = HashMap::new();
217    let mut style_name: Option<String> = None;
218    loop {
219        match reader.read_event_into(&mut buf) {
220            Ok(Event::Start(ref e)) if e.name() == QName(b"style:style") => {
221                style_name = e
222                    .try_get_attribute(b"style:name")?
223                    .map(|a| a.decode_and_unescape_value(&reader))
224                    .transpose()
225                    .map_err(OdsError::Xml)?
226                    .map(|x| x.to_string())
227            }
228            Ok(Event::Start(ref e))
229                if style_name.clone().is_some() && e.name() == QName(b"style:table-properties") =>
230            {
231                let visible = match e.try_get_attribute(b"table:display")? {
232                    Some(a) => match a
233                        .decode_and_unescape_value(&reader)
234                        .map_err(OdsError::Xml)?
235                        .parse()
236                        .map_err(OdsError::ParseBool)?
237                    {
238                        true => SheetVisible::Visible,
239                        false => SheetVisible::Hidden,
240                    },
241                    None => SheetVisible::Visible,
242                };
243                styles.insert(style_name.clone(), visible);
244            }
245            Ok(Event::Start(ref e)) if e.name() == QName(b"table:table") => {
246                let visible = styles
247                    .get(
248                        &e.try_get_attribute(b"table:style-name")?
249                            .map(|a| a.decode_and_unescape_value(&reader))
250                            .transpose()
251                            .map_err(OdsError::Xml)?
252                            .map(|x| x.to_string()),
253                    )
254                    .map(|v| v.to_owned())
255                    .unwrap_or(SheetVisible::Visible);
256                if let Some(ref a) = e
257                    .attributes()
258                    .filter_map(|a| a.ok())
259                    .find(|a| a.key == QName(b"table:name"))
260                {
261                    let name = a
262                        .decode_and_unescape_value(&reader)
263                        .map_err(OdsError::Xml)?
264                        .to_string();
265                    let (range, formulas) = read_table(&mut reader)?;
266                    sheets_metadata.push(Sheet {
267                        name: name.clone(),
268                        typ: SheetType::WorkSheet,
269                        visible,
270                    });
271                    sheets.insert(name, (range, formulas));
272                }
273            }
274            Ok(Event::Start(ref e)) if e.name() == QName(b"table:named-expressions") => {
275                defined_names = read_named_expressions(&mut reader)?;
276            }
277            Ok(Event::Eof) => break,
278            Err(e) => return Err(OdsError::Xml(e)),
279            _ => (),
280        }
281        buf.clear();
282    }
283    Ok(Content {
284        sheets,
285        sheets_metadata,
286        defined_names,
287    })
288}
289
290fn read_table(reader: &mut OdsReader<'_>) -> Result<(Range<DataType>, Range<String>), OdsError> {
291    let mut cells = Vec::new();
292    let mut rows_repeats = Vec::new();
293    let mut formulas = Vec::new();
294    let mut cols = Vec::new();
295    let mut buf = Vec::with_capacity(1024);
296    let mut row_buf = Vec::with_capacity(1024);
297    let mut cell_buf = Vec::with_capacity(1024);
298    cols.push(0);
299    loop {
300        match reader.read_event_into(&mut buf) {
301            Ok(Event::Start(ref e)) if e.name() == QName(b"table:table-row") => {
302                let row_repeats = match e.try_get_attribute(b"table:number-rows-repeated")? {
303                    Some(c) => c
304                        .decode_and_unescape_value(reader)
305                        .map_err(OdsError::Xml)?
306                        .parse()
307                        .map_err(OdsError::ParseInt)?,
308                    None => 1,
309                };
310                read_row(
311                    reader,
312                    &mut row_buf,
313                    &mut cell_buf,
314                    &mut cells,
315                    &mut formulas,
316                )?;
317                cols.push(cells.len());
318                rows_repeats.push(row_repeats);
319            }
320            Ok(Event::End(ref e)) if e.name() == QName(b"table:table") => break,
321            Err(e) => return Err(OdsError::Xml(e)),
322            Ok(_) => (),
323        }
324        buf.clear();
325    }
326    Ok((
327        get_range(cells, &cols, &rows_repeats),
328        get_range(formulas, &cols, &rows_repeats),
329    ))
330}
331
/// Tells whether every cell of `row` equals the default value of `T`.
///
/// A row without any cell is empty.
fn is_empty_row<T: Default + Clone + PartialEq>(row: &[T]) -> bool {
    let blank = T::default();
    !row.iter().any(|cell| *cell != blank)
}
335
336fn get_range<T: Default + Clone + PartialEq>(
337    mut cells: Vec<T>,
338    cols: &[usize],
339    rows_repeats: &[usize],
340) -> Range<T> {
341    // find smallest area with non empty Cells
342    let mut row_min = None;
343    let mut row_max = 0;
344    let mut col_min = usize::MAX;
345    let mut col_max = 0;
346    let mut first_empty_rows_repeated = 0;
347    {
348        for (i, w) in cols.windows(2).enumerate() {
349            let row = &cells[w[0]..w[1]];
350            if let Some(p) = row.iter().position(|c| c != &T::default()) {
351                if row_min.is_none() {
352                    row_min = Some(i);
353                    first_empty_rows_repeated =
354                        rows_repeats.iter().take(i).sum::<usize>().saturating_sub(i);
355                }
356                row_max = i;
357                if p < col_min {
358                    col_min = p;
359                }
360                if let Some(p) = row.iter().rposition(|c| c != &T::default()) {
361                    if p > col_max {
362                        col_max = p;
363                    }
364                }
365            }
366        }
367    }
368    let row_min = match row_min {
369        Some(min) => min,
370        _ => return Range::default(),
371    };
372
373    // rebuild cells into its smallest non empty area
374    let cells_len = (row_max + 1 - row_min) * (col_max + 1 - col_min);
375    {
376        let mut new_cells = Vec::with_capacity(cells_len);
377        let empty_cells = vec![T::default(); col_max + 1];
378        let mut empty_row_repeats = 0;
379        for (w, row_repeats) in cols
380            .windows(2)
381            .skip(row_min)
382            .take(row_max + 1)
383            .zip(rows_repeats.iter().skip(row_min).take(row_max + 1))
384        {
385            let row = &cells[w[0]..w[1]];
386            let row_repeats = *row_repeats;
387
388            if is_empty_row(row) {
389                empty_row_repeats = row_repeats;
390                continue;
391            }
392
393            if empty_row_repeats > 0 {
394                row_max = row_max + empty_row_repeats - 1;
395                for _ in 0..empty_row_repeats {
396                    new_cells.extend_from_slice(&empty_cells);
397                }
398                empty_row_repeats = 0;
399            };
400
401            if row_repeats > 1 {
402                row_max = row_max + row_repeats - 1;
403            };
404
405            for _ in 0..row_repeats {
406                match row.len().cmp(&(col_max + 1)) {
407                    std::cmp::Ordering::Less => {
408                        new_cells.extend_from_slice(&row[col_min..]);
409                        new_cells.extend_from_slice(&empty_cells[row.len()..]);
410                    }
411                    std::cmp::Ordering::Equal => {
412                        new_cells.extend_from_slice(&row[col_min..]);
413                    }
414                    std::cmp::Ordering::Greater => {
415                        new_cells.extend_from_slice(&row[col_min..=col_max]);
416                    }
417                }
418            }
419        }
420        cells = new_cells;
421    }
422    let row_min = row_min + first_empty_rows_repeated;
423    let row_max = row_max + first_empty_rows_repeated;
424    Range {
425        start: (row_min as u32, col_min as u32),
426        end: (row_max as u32, col_max as u32),
427        inner: cells,
428    }
429}
430
431fn read_row(
432    reader: &mut OdsReader<'_>,
433    row_buf: &mut Vec<u8>,
434    cell_buf: &mut Vec<u8>,
435    cells: &mut Vec<DataType>,
436    formulas: &mut Vec<String>,
437) -> Result<(), OdsError> {
438    let mut empty_col_repeats = 0;
439    loop {
440        row_buf.clear();
441        match reader.read_event_into(row_buf) {
442            Ok(Event::Start(ref e))
443                if e.name() == QName(b"table:table-cell")
444                    || e.name() == QName(b"table:covered-table-cell") =>
445            {
446                let mut repeats = 1;
447                for a in e.attributes() {
448                    let a = a.map_err(OdsError::XmlAttr)?;
449                    if a.key == QName(b"table:number-columns-repeated") {
450                        repeats = reader
451                            .decoder()
452                            .decode(&a.value)?
453                            .parse()
454                            .map_err(OdsError::ParseInt)?;
455                        break;
456                    }
457                }
458
459                let (value, formula, is_closed) = get_datatype(reader, e.attributes(), cell_buf)?;
460
461                for _ in 0..empty_col_repeats {
462                    cells.push(DataType::Empty);
463                    formulas.push("".to_string());
464                }
465                empty_col_repeats = 0;
466
467                if value.is_empty() && formula.is_empty() {
468                    empty_col_repeats = repeats;
469                } else {
470                    for _ in 0..repeats {
471                        cells.push(value.clone());
472                        formulas.push(formula.clone());
473                    }
474                }
475                if !is_closed {
476                    reader.read_to_end_into(e.name(), cell_buf)?;
477                }
478            }
479            Ok(Event::End(ref e)) if e.name() == QName(b"table:table-row") => break,
480            Err(e) => return Err(OdsError::Xml(e)),
481            Ok(e) => {
482                return Err(OdsError::Mismatch {
483                    expected: "table-cell",
484                    found: format!("{:?}", e),
485                });
486            }
487        }
488    }
489    Ok(())
490}
491
492/// Converts table-cell element into a `DataType`
493///
494/// ODF 1.2-19.385
495fn get_datatype(
496    reader: &mut OdsReader<'_>,
497    atts: Attributes<'_>,
498    buf: &mut Vec<u8>,
499) -> Result<(DataType, String, bool), OdsError> {
500    let mut is_string = false;
501    let mut is_value_set = false;
502    let mut val = DataType::Empty;
503    let mut formula = String::new();
504    for a in atts {
505        let a = a.map_err(OdsError::XmlAttr)?;
506        match a.key {
507            QName(b"office:value") if !is_value_set => {
508                let v = reader.decoder().decode(&a.value)?;
509                val = DataType::Float(v.parse().map_err(OdsError::ParseFloat)?);
510                is_value_set = true;
511            }
512            QName(b"office:string-value" | b"office:date-value" | b"office:time-value")
513                if !is_value_set =>
514            {
515                let attr = a
516                    .decode_and_unescape_value(reader)
517                    .map_err(OdsError::Xml)?
518                    .to_string();
519                val = match a.key {
520                    QName(b"office:date-value") => DataType::DateTimeIso(attr),
521                    QName(b"office:time-value") => DataType::DurationIso(attr),
522                    _ => DataType::String(attr),
523                };
524                is_value_set = true;
525            }
526            QName(b"office:boolean-value") if !is_value_set => {
527                let b = &*a.value == b"TRUE" || &*a.value == b"true";
528                val = DataType::Bool(b);
529                is_value_set = true;
530            }
531            QName(b"office:value-type") if !is_value_set => is_string = &*a.value == b"string",
532            QName(b"table:formula") => {
533                formula = a
534                    .decode_and_unescape_value(reader)
535                    .map_err(OdsError::Xml)?
536                    .to_string();
537            }
538            _ => (),
539        }
540    }
541    if !is_value_set && is_string {
542        // If the value type is string and the office:string-value attribute
543        // is not present, the element content defines the value.
544        let mut s = String::new();
545        let mut first_paragraph = true;
546        loop {
547            buf.clear();
548            match reader.read_event_into(buf) {
549                Ok(Event::Text(ref e)) => {
550                    s.push_str(&e.unescape()?);
551                }
552                Ok(Event::End(ref e))
553                    if e.name() == QName(b"table:table-cell")
554                        || e.name() == QName(b"table:covered-table-cell") =>
555                {
556                    return Ok((DataType::String(s), formula, true));
557                }
558                Ok(Event::Start(ref e)) if e.name() == QName(b"text:p") => {
559                    if first_paragraph {
560                        first_paragraph = false;
561                    } else {
562                        s.push('\n');
563                    }
564                }
565                Ok(Event::Start(ref e)) if e.name() == QName(b"text:s") => {
566                    let count = match e.try_get_attribute("text:c")? {
567                        Some(c) => c
568                            .decode_and_unescape_value(reader)
569                            .map_err(OdsError::Xml)?
570                            .parse()
571                            .map_err(OdsError::ParseInt)?,
572                        None => 1,
573                    };
574                    for _ in 0..count {
575                        s.push(' ');
576                    }
577                }
578                Err(e) => return Err(OdsError::Xml(e)),
579                Ok(Event::Eof) => return Err(OdsError::Eof("table:table-cell")),
580                _ => (),
581            }
582        }
583    } else {
584        Ok((val, formula, false))
585    }
586}
587
588fn read_named_expressions(reader: &mut OdsReader<'_>) -> Result<Vec<(String, String)>, OdsError> {
589    let mut defined_names = Vec::new();
590    let mut buf = Vec::with_capacity(512);
591    loop {
592        buf.clear();
593        match reader.read_event_into(&mut buf) {
594            Ok(Event::Start(ref e))
595                if e.name() == QName(b"table:named-range")
596                    || e.name() == QName(b"table:named-expression") =>
597            {
598                let mut name = String::new();
599                let mut formula = String::new();
600                for a in e.attributes() {
601                    let a = a.map_err(OdsError::XmlAttr)?;
602                    match a.key {
603                        QName(b"table:name") => {
604                            name = a
605                                .decode_and_unescape_value(reader)
606                                .map_err(OdsError::Xml)?
607                                .to_string();
608                        }
609                        QName(b"table:cell-range-address" | b"table:expression") => {
610                            formula = a
611                                .decode_and_unescape_value(reader)
612                                .map_err(OdsError::Xml)?
613                                .to_string();
614                        }
615                        _ => (),
616                    }
617                }
618                defined_names.push((name, formula));
619            }
620            Ok(Event::End(ref e))
621                if e.name() == QName(b"table:named-range")
622                    || e.name() == QName(b"table:named-expression") => {}
623            Ok(Event::End(ref e)) if e.name() == QName(b"table:named-expressions") => break,
624            Err(e) => return Err(OdsError::Xml(e)),
625            Ok(e) => {
626                return Err(OdsError::Mismatch {
627                    expected: "table:named-expressions",
628                    found: format!("{:?}", e),
629                });
630            }
631        }
632    }
633    Ok(defined_names)
634}
635
/// Read pictures
///
/// Returns the `(extension, raw bytes)` of every archive entry whose name
/// starts with `Pictures` and ends with a known picture extension, or `None`
/// when there is no such entry.
#[cfg(feature = "picture")]
fn read_pictures<RS: Read + Seek>(
    zip: &mut ZipArchive<RS>,
) -> Result<Option<Vec<(String, Vec<u8>)>>, OdsError> {
    const PICTURE_EXTENSIONS: [&str; 12] = [
        "emf", "wmf", "pict", "jpeg", "jpg", "png", "dib", "gif", "tiff", "eps", "bmp", "wpg",
    ];

    let mut found = Vec::new();
    for index in 0..zip.len() {
        let mut entry = zip.by_index(index)?;
        // no Thumbnails
        if !entry.name().starts_with("Pictures") {
            continue;
        }
        // Whatever follows the last dot of the name (the whole name if none).
        let extension = match entry.name().rsplit('.').next() {
            Some(ext) if PICTURE_EXTENSIONS.contains(&ext) => ext.to_owned(),
            _ => continue,
        };
        let mut data: Vec<u8> = Vec::new();
        entry.read_to_end(&mut data)?;
        found.push((extension, data));
    }
    Ok(if found.is_empty() { None } else { Some(found) })
}