mirror of
https://github.com/bringout/oca-edi.git
synced 2026-04-18 07:32:02 +02:00
48 lines
1.4 KiB
Python
48 lines
1.4 KiB
Python
# Copyright 2015-2021 Akretion France
|
|
# @author: Alexis de Lattre <alexis.delattre@akretion.com>
|
|
# Copyright 2022 Camptocamp SA
|
|
# @author: Simone Orsi <simahawk@gmail.com>
|
|
# License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl).
|
|
|
|
import logging
|
|
from io import BytesIO
|
|
from struct import error as StructError
|
|
|
|
from lxml import etree
|
|
|
|
try:
|
|
from PyPDF2.errors import PdfReadError
|
|
except ImportError:
|
|
from PyPDF2.utils import PdfReadError
|
|
|
|
from odoo.tools.pdf import OdooPdfFileReader
|
|
|
|
_logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PDFParser:
|
|
def __init__(self, pdf_file):
|
|
self.pdf_file = pdf_file
|
|
|
|
def get_xml_files(self):
|
|
"""Parse PDF files to extract XML content.
|
|
|
|
:param pdf_file: binary PDF file content
|
|
:returns: a dict like {$filename: $parsed_xml_file_obj}.
|
|
"""
|
|
res = {}
|
|
with BytesIO(self.pdf_file) as buffer:
|
|
pdf_reader = OdooPdfFileReader(buffer, strict=False)
|
|
|
|
# Process embedded files.
|
|
for xml_name, content in pdf_reader.getAttachments():
|
|
try:
|
|
res[xml_name] = etree.fromstring(content)
|
|
except Exception:
|
|
_logger.debug("Non XML file found in PDF")
|
|
if res:
|
|
_logger.debug("Valid XML files found in PDF: %s", list(res.keys()))
|
|
return res
|
|
|
|
def get_xml_files_swallable_exceptions(self):
|
|
return (NotImplementedError, StructError, PdfReadError)
|