19.0 vanilla

2026-04-20 19:12:06 +02:00 · 2025-10-03 18:07:25 +02:00 · 2025-10-03 18:07:25 +02:00 · 991d2234ca
commit 991d2234ca
parent 0a7ae8db93
416 changed files with 646602 additions and 300844 deletions
--- a/odoo-bringout-oca-ocb-base/odoo/tools/mail.py
+++ b/odoo-bringout-oca-ocb-base/odoo/tools/mail.py
@ -12,12 +12,22 @@ import time
 import email.utils
 from email.utils import getaddresses as orig_getaddresses
 from urllib.parse import urlparse
+from typing import Literal
 import html as htmllib

 import idna
 import markupsafe
 from lxml import etree, html
-from lxml.html import clean, defs
+from lxml.html import (
+    XHTML_NAMESPACE,
+    _contains_block_level_tag,
+    _looks_like_full_html_bytes,
+    _looks_like_full_html_unicode,
+    clean,
+    defs,
+    document_fromstring,
+    html_parser,
+)
 from werkzeug import urls

 from odoo.tools import misc
@ -66,7 +76,11 @@ safe_attrs = defs.safe_attrs | frozenset(
     'data-publish', 'data-id', 'data-res_id', 'data-interval', 'data-member_id', 'data-scroll-background-ratio', 'data-view-id',
     'data-class', 'data-mimetype', 'data-original-src', 'data-original-id', 'data-gl-filter', 'data-quality', 'data-resize-width',
     'data-shape', 'data-shape-colors', 'data-file-name', 'data-original-mimetype',
+     'data-attachment-id', 'data-format-mimetype',
+     'data-ai-field', 'data-ai-record-id',
+     'data-heading-link-id',
     'data-mimetype-before-conversion',
+     'data-language-id', 'data-syntax-highlighting-value'
     ])
 SANITIZE_TAGS = {
    # allow new semantic HTML5 tags
@ -262,6 +276,76 @@ def tag_quote(el):
        el.set('data-o-mail-quote', '1')


+def fromstring(html_, base_url=None, parser=None, **kw):
+    """
+    This function mimics lxml.html.fromstring. It not only returns the parsed
+    element/document but also a flag indicating whether the input is for a
+    a single body element or not.
+
+    This tries to minimally parse the chunk of text, without knowing if it
+    is a fragment or a document.
+
+    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
+    """
+    if parser is None:
+        parser = html_parser
+    if isinstance(html_, bytes):
+        is_full_html = _looks_like_full_html_bytes(html_)
+    else:
+        is_full_html = _looks_like_full_html_unicode(html_)
+    doc = document_fromstring(html_, parser=parser, base_url=base_url, **kw)
+    if is_full_html:
+        return doc, False
+    # otherwise, lets parse it out...
+    bodies = doc.findall('body')
+    if not bodies:
+        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
+    if bodies:
+        body = bodies[0]
+        if len(bodies) > 1:
+            # Somehow there are multiple bodies, which is bad, but just
+            # smash them into one body
+            for other_body in bodies[1:]:
+                if other_body.text:
+                    if len(body):
+                        body[-1].tail = (body[-1].tail or '') + other_body.text
+                    else:
+                        body.text = (body.text or '') + other_body.text
+                body.extend(other_body)
+                # We'll ignore tail
+                # I guess we are ignoring attributes too
+                other_body.drop_tree()
+    else:
+        body = None
+    heads = doc.findall('head')
+    if not heads:
+        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
+    if heads:
+        # Well, we have some sort of structure, so lets keep it all
+        head = heads[0]
+        if len(heads) > 1:
+            for other_head in heads[1:]:
+                head.extend(other_head)
+                # We don't care about text or tail in a head
+                other_head.drop_tree()
+        return doc, False
+    if body is None:
+        return doc, False
+    if (len(body) == 1 and (not body.text or not body.text.strip())
+        and (not body[-1].tail or not body[-1].tail.strip())):
+        # The body has just one element, so it was probably a single
+        # element passed in
+        return body[0], True
+    # Now we have a body which represents a bunch of tags which have the
+    # content that was passed in.  We will create a fake container, which
+    # is the body tag, except <body> implies too much structure.
+    if _contains_block_level_tag(body):
+        body.tag = 'div'
+    else:
+        body.tag = 'span'
+    return body, False
+
+
 def html_normalize(src, filter_callback=None, output_method="html"):
    """ Normalize `src` for storage as an html field value.

@ -282,7 +366,7 @@ def html_normalize(src, filter_callback=None, output_method="html"):
        return src

    # html: remove encoding attribute inside tags
-    src = re.sub(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', "", src, flags=re.IGNORECASE | re.DOTALL)
+    src = re.sub(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', "", src)

    src = src.replace('--!>', '-->')
    src = re.sub(r'(<!-->|<!--->)', '<!-- -->', src)
@ -291,7 +375,7 @@ def html_normalize(src, filter_callback=None, output_method="html"):
    src = re.sub(r'</?o:.*?>', '', src)

    try:
-        doc = html.fromstring(src)
+        doc, single_body_element = fromstring(src)
    except etree.ParserError as e:
        # HTML comment only string, whitespace only..
        if 'empty' in str(e):
@ -299,18 +383,23 @@ def html_normalize(src, filter_callback=None, output_method="html"):
        raise

    # perform quote detection before cleaning and class removal
-    if doc is not None:
-        for el in doc.iter(tag=etree.Element):
-            tag_quote(el)
+    for el in doc.iter(tag=etree.Element):
+        tag_quote(el)

    if filter_callback:
        doc = filter_callback(doc)

    src = html.tostring(doc, encoding='unicode', method=output_method)

-    # this is ugly, but lxml/etree tostring want to put everything in a
-    # 'div' that breaks the editor -> remove that
-    if src.startswith('<div>') and src.endswith('</div>'):
+    if not single_body_element and src.startswith('<div>') and src.endswith('</div>'):
+        # the <div></div> may come from 2 places
+        # 1. the src is parsed as multiple body elements
+        #    <div></div> wraps all elements.
+        # 2. the src is parsed as not only body elements
+        #    <html></html> wraps all elements.
+        #    then the Cleaner as the filter_callback which has 'html' in its
+        #    'remove_tags' will write <html></html> to <div></div> since it
+        #    cannot directly drop the parent-most tag
        src = src[5:-6]

    # html considerations so real html content match database value
@ -393,19 +482,20 @@ def validate_url(url):
    return url


-def is_html_empty(html_content):
+def is_html_empty(html_content: str | markupsafe.Markup | Literal[False] | None) -> bool:
    """Check if a html content is empty. If there are only formatting tags with style
    attributes or a void content  return True. Famous use case if a
    '<p style="..."><br></p>' added by some web editor.

-    :param str html_content: html content, coming from example from an HTML field
-    :returns: bool, True if no content found or if containing only void formatting tags
+    :param html_content: html content, coming from example from an HTML field
+    :returns: True if no content found or if containing only void formatting tags
    """
    if not html_content:
        return True
    icon_re = r'<\s*(i|span)\b(\s+[A-Za-z_-][A-Za-z0-9-_]*(\s*=\s*[\'"][^"\']*[\'"])?)*\s*\bclass\s*=\s*["\'][^"\']*\b(fa|fab|fad|far|oi)\b'
    tag_re = r'<\s*\/?(?:p|div|section|span|br|b|i|font)\b(?:(\s+[A-Za-z_-][A-Za-z0-9-_]*(\s*=\s*[\'"][^"\']*[\'"]))*)(?:\s*>|\s*\/\s*>)'
-    return not bool(re.sub(tag_re, '', html_content).strip()) and not re.search(icon_re, html_content)
+    text_content = htmllib.unescape(re.sub(tag_re, '', html_content))
+    return not bool(text_content.strip()) and not re.search(icon_re, html_content)


 def html_keep_url(text):
@ -432,16 +522,21 @@ def html_to_inner_content(html):
    processed = re.sub(HTML_NEWLINES_REGEX, ' ', html)
    processed = re.sub(HTML_TAGS_REGEX, '', processed)
    processed = re.sub(r' {2,}|\t', ' ', processed)
+    processed = processed.replace("\xa0", " ")
    processed = htmllib.unescape(processed)
-    processed = processed.strip()
-    return processed
+    return processed.strip()


 def create_link(url, label):
    return f'<a href="{url}" target="_blank" rel="noreferrer noopener">{label}</a>'


-def html2plaintext(html, body_id=None, encoding='utf-8', include_references=True):
+def html2plaintext(
+    html: str | markupsafe.Markup | Literal[False] | None,
+    body_id: str | None = None,
+    encoding: str = 'utf-8',
+    include_references: bool = True
+) -> str:
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
@ -518,19 +613,19 @@ def html2plaintext(html, body_id=None, encoding='utf-8', include_references=True

    return html.strip()

-def plaintext2html(text, container_tag=None):
+
+def plaintext2html(text: str, container_tag: str | None = None, with_paragraph: bool = True) -> markupsafe.Markup:
    r"""Convert plaintext into html. Content of the text is escaped to manage
    html entities, using :func:`~odoo.tools.misc.html_escape`.

    - all ``\n``, ``\r`` are replaced by ``<br/>``
-    - enclose content into ``<p>``
    - convert url into clickable link
-    - 2 or more consecutive ``<br/>`` are considered as paragraph breaks

-    :param str text: plaintext to convert
-    :param str container_tag: container of the html; by default the content is
+    :param text: plaintext to convert
+    :param container_tag: container of the html; by default the content is
        embedded into a ``<div>``
-    :rtype: markupsafe.Markup
+    :param with_paragraph: whether or not considering 2 or more consecutive ``<br/>``
+        as paragraph breaks and enclosing content in ``<p>``
    """
    assert isinstance(text, str)
    text = misc.html_escape(text)
@ -542,13 +637,15 @@ def plaintext2html(text, container_tag=None):
    text = html_keep_url(text)

    # 3-4: form paragraphs
-    idx = 0
-    final = '<p>'
-    br_tags = re.compile(r'(([<]\s*[bB][rR]\s*/?[>]\s*){2,})')
-    for item in re.finditer(br_tags, text):
-        final += text[idx:item.start()] + '</p><p>'
-        idx = item.end()
-    final += text[idx:] + '</p>'
+    final = text
+    if with_paragraph:
+        idx = 0
+        final = '<p>'
+        br_tags = re.compile(r'(([<]\s*[bB][rR]\s*/?[>]\s*){2,})')
+        for item in re.finditer(br_tags, text):
+            final += text[idx:item.start()] + '</p><p>'
+            idx = item.end()
+        final += text[idx:] + '</p>'

    # 5. container
    if container_tag: # FIXME: validate that container_tag is just a simple tag?
@ -682,19 +779,23 @@ def email_split_tuples(text):

    return list(map(_parse_based_on_spaces, valid_pairs))

+
 def email_split(text):
    """ Return a list of the email addresses found in ``text`` """
-    if not text:
-        return []
    return [email for (name, email) in email_split_tuples(text)]

+
 def email_split_and_format(text):
    """ Return a list of email addresses found in ``text``, formatted using
    formataddr. """
-    if not text:
-        return []
    return [formataddr((name, email)) for (name, email) in email_split_tuples(text)]

+
+def email_split_and_normalize(text):
+    """ Same as 'email_split' but normalized email """
+    return [(name, _normalize_email(email)) for (name, email) in email_split_tuples(text)]
+
+
 def email_split_and_format_normalize(text):
    """ Same as 'email_split_and_format' but normalizing email. """
    return [
@ -737,7 +838,6 @@ def email_normalize(text, strict=True):
    emails = email_split(text)
    if not emails or (strict and len(emails) != 1):
        return False
-
    return _normalize_email(emails[0])

 def email_normalize_all(text):
@ -749,8 +849,6 @@ def email_normalize_all(text):

    :return list: list of normalized emails found in text
    """
-    if not text:
-        return []
    emails = email_split(text)
    return list(filter(None, [_normalize_email(email) for email in emails]))

@ -782,6 +880,7 @@ def _normalize_email(email):
        pass
    else:
        local_part = local_part.lower()
+
    return local_part + at + domain.lower()

 def email_anonymize(normalized_email, *, redact_domain=False):