# Copyright 2015-2021 Akretion France # @author: Alexis de Lattre # Copyright 2022 Camptocamp SA # @author: Simone Orsi # License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl). import logging from io import BytesIO from struct import error as StructError from lxml import etree try: from PyPDF2.errors import PdfReadError except ImportError: from PyPDF2.utils import PdfReadError from odoo.tools.pdf import OdooPdfFileReader _logger = logging.getLogger(__name__) class PDFParser: def __init__(self, pdf_file): self.pdf_file = pdf_file def get_xml_files(self): """Parse PDF files to extract XML content. :param pdf_file: binary PDF file content :returns: a dict like {$filename: $parsed_xml_file_obj}. """ res = {} with BytesIO(self.pdf_file) as buffer: pdf_reader = OdooPdfFileReader(buffer, strict=False) # Process embedded files. for xml_name, content in pdf_reader.getAttachments(): try: res[xml_name] = etree.fromstring(content) except Exception: _logger.debug("Non XML file found in PDF") if res: _logger.debug("Valid XML files found in PDF: %s", list(res.keys())) return res def get_xml_files_swallable_exceptions(self): return (NotImplementedError, StructError, PdfReadError)