19.0 vanilla

This commit is contained in:
Ernad Husremovic 2025-10-03 18:07:25 +02:00
parent 0a7ae8db93
commit 991d2234ca
416 changed files with 646602 additions and 300844 deletions

View file

@ -12,12 +12,22 @@ import time
import email.utils
from email.utils import getaddresses as orig_getaddresses
from urllib.parse import urlparse
from typing import Literal
import html as htmllib
import idna
import markupsafe
from lxml import etree, html
from lxml.html import clean, defs
from lxml.html import (
XHTML_NAMESPACE,
_contains_block_level_tag,
_looks_like_full_html_bytes,
_looks_like_full_html_unicode,
clean,
defs,
document_fromstring,
html_parser,
)
from werkzeug import urls
from odoo.tools import misc
@ -66,7 +76,11 @@ safe_attrs = defs.safe_attrs | frozenset(
'data-publish', 'data-id', 'data-res_id', 'data-interval', 'data-member_id', 'data-scroll-background-ratio', 'data-view-id',
'data-class', 'data-mimetype', 'data-original-src', 'data-original-id', 'data-gl-filter', 'data-quality', 'data-resize-width',
'data-shape', 'data-shape-colors', 'data-file-name', 'data-original-mimetype',
'data-attachment-id', 'data-format-mimetype',
'data-ai-field', 'data-ai-record-id',
'data-heading-link-id',
'data-mimetype-before-conversion',
'data-language-id', 'data-syntax-highlighting-value'
])
SANITIZE_TAGS = {
# allow new semantic HTML5 tags
@ -262,6 +276,76 @@ def tag_quote(el):
el.set('data-o-mail-quote', '1')
def fromstring(html_, base_url=None, parser=None, **kw):
"""
This function mimics lxml.html.fromstring. It not only returns the parsed
element/document but also a flag indicating whether the input is for a
a single body element or not.
This tries to minimally parse the chunk of text, without knowing if it
is a fragment or a document.
base_url will set the document's base_url attribute (and the tree's docinfo.URL)
"""
if parser is None:
parser = html_parser
if isinstance(html_, bytes):
is_full_html = _looks_like_full_html_bytes(html_)
else:
is_full_html = _looks_like_full_html_unicode(html_)
doc = document_fromstring(html_, parser=parser, base_url=base_url, **kw)
if is_full_html:
return doc, False
# otherwise, lets parse it out...
bodies = doc.findall('body')
if not bodies:
bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
if bodies:
body = bodies[0]
if len(bodies) > 1:
# Somehow there are multiple bodies, which is bad, but just
# smash them into one body
for other_body in bodies[1:]:
if other_body.text:
if len(body):
body[-1].tail = (body[-1].tail or '') + other_body.text
else:
body.text = (body.text or '') + other_body.text
body.extend(other_body)
# We'll ignore tail
# I guess we are ignoring attributes too
other_body.drop_tree()
else:
body = None
heads = doc.findall('head')
if not heads:
heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
if heads:
# Well, we have some sort of structure, so lets keep it all
head = heads[0]
if len(heads) > 1:
for other_head in heads[1:]:
head.extend(other_head)
# We don't care about text or tail in a head
other_head.drop_tree()
return doc, False
if body is None:
return doc, False
if (len(body) == 1 and (not body.text or not body.text.strip())
and (not body[-1].tail or not body[-1].tail.strip())):
# The body has just one element, so it was probably a single
# element passed in
return body[0], True
# Now we have a body which represents a bunch of tags which have the
# content that was passed in. We will create a fake container, which
# is the body tag, except <body> implies too much structure.
if _contains_block_level_tag(body):
body.tag = 'div'
else:
body.tag = 'span'
return body, False
def html_normalize(src, filter_callback=None, output_method="html"):
""" Normalize `src` for storage as an html field value.
@ -282,7 +366,7 @@ def html_normalize(src, filter_callback=None, output_method="html"):
return src
# html: remove encoding attribute inside tags
src = re.sub(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', "", src, flags=re.IGNORECASE | re.DOTALL)
src = re.sub(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', "", src)
src = src.replace('--!>', '-->')
src = re.sub(r'(<!-->|<!--->)', '<!-- -->', src)
@ -291,7 +375,7 @@ def html_normalize(src, filter_callback=None, output_method="html"):
src = re.sub(r'</?o:.*?>', '', src)
try:
doc = html.fromstring(src)
doc, single_body_element = fromstring(src)
except etree.ParserError as e:
# HTML comment only string, whitespace only..
if 'empty' in str(e):
@ -299,18 +383,23 @@ def html_normalize(src, filter_callback=None, output_method="html"):
raise
# perform quote detection before cleaning and class removal
if doc is not None:
for el in doc.iter(tag=etree.Element):
tag_quote(el)
for el in doc.iter(tag=etree.Element):
tag_quote(el)
if filter_callback:
doc = filter_callback(doc)
src = html.tostring(doc, encoding='unicode', method=output_method)
# this is ugly, but lxml/etree tostring want to put everything in a
# 'div' that breaks the editor -> remove that
if src.startswith('<div>') and src.endswith('</div>'):
if not single_body_element and src.startswith('<div>') and src.endswith('</div>'):
# the <div></div> may come from 2 places
# 1. the src is parsed as multiple body elements
# <div></div> wraps all elements.
# 2. the src is parsed as not only body elements
# <html></html> wraps all elements.
# then the Cleaner as the filter_callback which has 'html' in its
# 'remove_tags' will write <html></html> to <div></div> since it
# cannot directly drop the parent-most tag
src = src[5:-6]
# html considerations so real html content match database value
@ -393,19 +482,20 @@ def validate_url(url):
return url
def is_html_empty(html_content):
def is_html_empty(html_content: str | markupsafe.Markup | Literal[False] | None) -> bool:
"""Check if a html content is empty. If there are only formatting tags with style
attributes or a void content return True. Famous use case if a
'<p style="..."><br></p>' added by some web editor.
:param str html_content: html content, coming from example from an HTML field
:returns: bool, True if no content found or if containing only void formatting tags
:param html_content: html content, coming from example from an HTML field
:returns: True if no content found or if containing only void formatting tags
"""
if not html_content:
return True
icon_re = r'<\s*(i|span)\b(\s+[A-Za-z_-][A-Za-z0-9-_]*(\s*=\s*[\'"][^"\']*[\'"])?)*\s*\bclass\s*=\s*["\'][^"\']*\b(fa|fab|fad|far|oi)\b'
tag_re = r'<\s*\/?(?:p|div|section|span|br|b|i|font)\b(?:(\s+[A-Za-z_-][A-Za-z0-9-_]*(\s*=\s*[\'"][^"\']*[\'"]))*)(?:\s*>|\s*\/\s*>)'
return not bool(re.sub(tag_re, '', html_content).strip()) and not re.search(icon_re, html_content)
text_content = htmllib.unescape(re.sub(tag_re, '', html_content))
return not bool(text_content.strip()) and not re.search(icon_re, html_content)
def html_keep_url(text):
@ -432,16 +522,21 @@ def html_to_inner_content(html):
processed = re.sub(HTML_NEWLINES_REGEX, ' ', html)
processed = re.sub(HTML_TAGS_REGEX, '', processed)
processed = re.sub(r' {2,}|\t', ' ', processed)
processed = processed.replace("\xa0", " ")
processed = htmllib.unescape(processed)
processed = processed.strip()
return processed
return processed.strip()
def create_link(url, label):
return f'<a href="{url}" target="_blank" rel="noreferrer noopener">{label}</a>'
def html2plaintext(html, body_id=None, encoding='utf-8', include_references=True):
def html2plaintext(
html: str | markupsafe.Markup | Literal[False] | None,
body_id: str | None = None,
encoding: str = 'utf-8',
include_references: bool = True
) -> str:
""" From an HTML text, convert the HTML to plain text.
If @param body_id is provided then this is the tag where the
body (not necessarily <body>) starts.
@ -518,19 +613,19 @@ def html2plaintext(html, body_id=None, encoding='utf-8', include_references=True
return html.strip()
def plaintext2html(text, container_tag=None):
def plaintext2html(text: str, container_tag: str | None = None, with_paragraph: bool = True) -> markupsafe.Markup:
r"""Convert plaintext into html. Content of the text is escaped to manage
html entities, using :func:`~odoo.tools.misc.html_escape`.
- all ``\n``, ``\r`` are replaced by ``<br/>``
- enclose content into ``<p>``
- convert url into clickable link
- 2 or more consecutive ``<br/>`` are considered as paragraph breaks
:param str text: plaintext to convert
:param str container_tag: container of the html; by default the content is
:param text: plaintext to convert
:param container_tag: container of the html; by default the content is
embedded into a ``<div>``
:rtype: markupsafe.Markup
:param with_paragraph: whether or not considering 2 or more consecutive ``<br/>``
as paragraph breaks and enclosing content in ``<p>``
"""
assert isinstance(text, str)
text = misc.html_escape(text)
@ -542,13 +637,15 @@ def plaintext2html(text, container_tag=None):
text = html_keep_url(text)
# 3-4: form paragraphs
idx = 0
final = '<p>'
br_tags = re.compile(r'(([<]\s*[bB][rR]\s*/?[>]\s*){2,})')
for item in re.finditer(br_tags, text):
final += text[idx:item.start()] + '</p><p>'
idx = item.end()
final += text[idx:] + '</p>'
final = text
if with_paragraph:
idx = 0
final = '<p>'
br_tags = re.compile(r'(([<]\s*[bB][rR]\s*/?[>]\s*){2,})')
for item in re.finditer(br_tags, text):
final += text[idx:item.start()] + '</p><p>'
idx = item.end()
final += text[idx:] + '</p>'
# 5. container
if container_tag: # FIXME: validate that container_tag is just a simple tag?
@ -682,19 +779,23 @@ def email_split_tuples(text):
return list(map(_parse_based_on_spaces, valid_pairs))
def email_split(text):
""" Return a list of the email addresses found in ``text`` """
if not text:
return []
return [email for (name, email) in email_split_tuples(text)]
def email_split_and_format(text):
""" Return a list of email addresses found in ``text``, formatted using
formataddr. """
if not text:
return []
return [formataddr((name, email)) for (name, email) in email_split_tuples(text)]
def email_split_and_normalize(text):
""" Same as 'email_split' but normalized email """
return [(name, _normalize_email(email)) for (name, email) in email_split_tuples(text)]
def email_split_and_format_normalize(text):
""" Same as 'email_split_and_format' but normalizing email. """
return [
@ -737,7 +838,6 @@ def email_normalize(text, strict=True):
emails = email_split(text)
if not emails or (strict and len(emails) != 1):
return False
return _normalize_email(emails[0])
def email_normalize_all(text):
@ -749,8 +849,6 @@ def email_normalize_all(text):
:return list: list of normalized emails found in text
"""
if not text:
return []
emails = email_split(text)
return list(filter(None, [_normalize_email(email) for email in emails]))
@ -782,6 +880,7 @@ def _normalize_email(email):
pass
else:
local_part = local_part.lower()
return local_part + at + domain.lower()
def email_anonymize(normalized_email, *, redact_domain=False):