mirror of
https://github.com/bringout/oca-ocb-core.git
synced 2026-04-20 11:52:04 +02:00
18.0 vanilla
This commit is contained in:
parent
d72e748793
commit
0a7ae8db93
337 changed files with 399651 additions and 232598 deletions
|
|
@ -3,11 +3,11 @@
|
|||
|
||||
import base64
|
||||
import collections
|
||||
import itertools
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
import socket
|
||||
import threading
|
||||
import time
|
||||
import email.utils
|
||||
from email.utils import getaddresses as orig_getaddresses
|
||||
|
|
@ -20,10 +20,25 @@ from lxml import etree, html
|
|||
from lxml.html import clean, defs
|
||||
from werkzeug import urls
|
||||
|
||||
import odoo
|
||||
from odoo.loglevels import ustr
|
||||
from odoo.tools import misc
|
||||
|
||||
__all__ = [
|
||||
"email_domain_extract",
|
||||
"email_domain_normalize",
|
||||
"email_normalize",
|
||||
"email_normalize_all",
|
||||
"email_split",
|
||||
"encapsulate_email",
|
||||
"formataddr",
|
||||
"html2plaintext",
|
||||
"html_normalize",
|
||||
"html_sanitize",
|
||||
"is_html_empty",
|
||||
"parse_contact_from_email",
|
||||
"plaintext2html",
|
||||
"single_email_re",
|
||||
]
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
|
@ -44,12 +59,13 @@ else:
|
|||
safe_attrs = defs.safe_attrs | frozenset(
|
||||
['style',
|
||||
'data-o-mail-quote', 'data-o-mail-quote-node', # quote detection
|
||||
'data-oe-model', 'data-oe-id', 'data-oe-field', 'data-oe-type', 'data-oe-expression', 'data-oe-translation-initial-sha', 'data-oe-nodeid',
|
||||
'data-last-history-steps', 'data-oe-protected', 'data-oe-transient-content', 'data-width', 'data-height', 'data-scale-x', 'data-scale-y', 'data-x', 'data-y',
|
||||
'data-oe-model', 'data-oe-id', 'data-oe-field', 'data-oe-type', 'data-oe-expression', 'data-oe-translation-source-sha', 'data-oe-nodeid',
|
||||
'data-last-history-steps', 'data-oe-protected', 'data-embedded', 'data-embedded-editable', 'data-embedded-props', 'data-oe-version',
|
||||
'data-oe-transient-content', 'data-behavior-props', 'data-prop-name', 'data-width', 'data-height', 'data-scale-x', 'data-scale-y', 'data-x', 'data-y', # legacy editor
|
||||
'data-oe-role', 'data-oe-aria-label',
|
||||
'data-publish', 'data-id', 'data-res_id', 'data-interval', 'data-member_id', 'data-scroll-background-ratio', 'data-view-id',
|
||||
'data-class', 'data-mimetype', 'data-original-src', 'data-original-id', 'data-gl-filter', 'data-quality', 'data-resize-width',
|
||||
'data-shape', 'data-shape-colors', 'data-file-name', 'data-original-mimetype',
|
||||
'data-behavior-props', 'data-prop-name', # knowledge commands
|
||||
'data-mimetype-before-conversion',
|
||||
])
|
||||
SANITIZE_TAGS = {
|
||||
|
|
@ -68,10 +84,12 @@ class _Cleaner(clean.Cleaner):
|
|||
_style_whitelist = [
|
||||
'font-size', 'font-family', 'font-weight', 'font-style', 'background-color', 'color', 'text-align',
|
||||
'line-height', 'letter-spacing', 'text-transform', 'text-decoration', 'text-decoration', 'opacity',
|
||||
'float', 'vertical-align', 'display',
|
||||
'float', 'vertical-align', 'display', 'object-fit',
|
||||
'padding', 'padding-top', 'padding-left', 'padding-bottom', 'padding-right',
|
||||
'margin', 'margin-top', 'margin-left', 'margin-bottom', 'margin-right',
|
||||
'white-space',
|
||||
# appearance
|
||||
'background-image', 'background-position', 'background-size', 'background-repeat', 'background-origin',
|
||||
# box model
|
||||
'border', 'border-color', 'border-radius', 'border-style', 'border-width', 'border-top', 'border-bottom',
|
||||
'height', 'width', 'max-width', 'min-width', 'min-height',
|
||||
|
|
@ -86,6 +104,7 @@ class _Cleaner(clean.Cleaner):
|
|||
|
||||
strip_classes = False
|
||||
sanitize_style = False
|
||||
conditional_comments = True
|
||||
|
||||
def __call__(self, doc):
|
||||
super(_Cleaner, self).__call__(doc)
|
||||
|
|
@ -118,6 +137,24 @@ class _Cleaner(clean.Cleaner):
|
|||
else:
|
||||
del el.attrib['style']
|
||||
|
||||
def kill_conditional_comments(self, doc):
|
||||
"""Override the default behavior of lxml.
|
||||
|
||||
https://github.com/lxml/lxml/blob/e82c9153c4a7d505480b94c60b9a84d79d948efb/src/lxml/html/clean.py#L501-L510
|
||||
|
||||
In some use cases, e.g. templates used for mass mailing,
|
||||
we send emails containing conditional comments targeting Microsoft Outlook,
|
||||
to give special styling instructions.
|
||||
https://github.com/odoo/odoo/pull/119325/files#r1301064789
|
||||
|
||||
Within these conditional comments, unsanitized HTML can lie.
|
||||
However, in modern browser, these comments are considered as simple comments,
|
||||
their content is not executed.
|
||||
https://caniuse.com/sr_ie-features
|
||||
"""
|
||||
if self.conditional_comments:
|
||||
super().kill_conditional_comments(doc)
|
||||
|
||||
|
||||
def tag_quote(el):
|
||||
def _create_new_node(tag, text, tail=None, attrs=None):
|
||||
|
|
@ -209,13 +246,23 @@ def tag_quote(el):
|
|||
# remove single node
|
||||
el.set('data-o-mail-quote-node', '1')
|
||||
el.set('data-o-mail-quote', '1')
|
||||
if el.getparent() is not None and (el.getparent().get('data-o-mail-quote') or el.getparent().get('data-o-mail-quote-container')) and not el.getparent().get('data-o-mail-quote-node'):
|
||||
el.set('data-o-mail-quote', '1')
|
||||
if el.getparent() is not None and not el.getparent().get('data-o-mail-quote-node'):
|
||||
if el.getparent().get('data-o-mail-quote'):
|
||||
el.set('data-o-mail-quote', '1')
|
||||
# only quoting the elements following the first quote in the container
|
||||
# avoids issues with repeated calls to html_normalize
|
||||
elif el.getparent().get('data-o-mail-quote-container'):
|
||||
if (first_sibling_quote := el.getparent().find("*[@data-o-mail-quote]")) is not None:
|
||||
siblings = el.getparent().getchildren()
|
||||
quote_index = siblings.index(first_sibling_quote)
|
||||
element_index = siblings.index(el)
|
||||
if quote_index < element_index:
|
||||
el.set('data-o-mail-quote', '1')
|
||||
if el.getprevious() is not None and el.getprevious().get('data-o-mail-quote') and not el.text_content().strip():
|
||||
el.set('data-o-mail-quote', '1')
|
||||
|
||||
|
||||
def html_normalize(src, filter_callback=None):
|
||||
def html_normalize(src, filter_callback=None, output_method="html"):
|
||||
""" Normalize `src` for storage as an html field value.
|
||||
|
||||
The string is parsed as an html tag soup, made valid, then decorated for
|
||||
|
|
@ -228,27 +275,27 @@ def html_normalize(src, filter_callback=None):
|
|||
:param filter_callback: optional callable taking a single `etree._Element`
|
||||
document parameter, to be called during normalization in order to
|
||||
filter the output document
|
||||
:param output_method: defines the output method to pass to `html.tostring`.
|
||||
It defaults to 'html', but can also be 'xml' for xhtml output.
|
||||
"""
|
||||
|
||||
if not src:
|
||||
return src
|
||||
|
||||
src = ustr(src, errors='replace')
|
||||
# html: remove encoding attribute inside tags
|
||||
doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
|
||||
src = doctype.sub(u"", src)
|
||||
src = re.sub(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', "", src, flags=re.IGNORECASE | re.DOTALL)
|
||||
|
||||
src = src.replace('--!>', '-->')
|
||||
src = re.sub(r'(<!-->|<!--->)', '<!-- -->', src)
|
||||
# On the specific case of Outlook desktop it adds unnecessary '<o:.*></o:.*>' tags which are parsed
|
||||
# in '<p></p>' which may alter the appearance (eg. spacing) of the mail body
|
||||
src = re.sub(r'</?o:.*?>', '', src)
|
||||
|
||||
try:
|
||||
src = src.replace('--!>', '-->')
|
||||
src = re.sub(r'(<!-->|<!--->)', '<!-- -->', src)
|
||||
# On the specific case of Outlook desktop it adds unnecessary '<o:.*></o:.*>' tags which are parsed
|
||||
# in '<p></p>' which may alter the appearance (eg. spacing) of the mail body
|
||||
src = re.sub(r'</?o:.*?>', '', src)
|
||||
doc = html.fromstring(src)
|
||||
except etree.ParserError as e:
|
||||
# HTML comment only string, whitespace only..
|
||||
if 'empty' in str(e):
|
||||
return u""
|
||||
return ""
|
||||
raise
|
||||
|
||||
# perform quote detection before cleaning and class removal
|
||||
|
|
@ -259,7 +306,7 @@ def html_normalize(src, filter_callback=None):
|
|||
if filter_callback:
|
||||
doc = filter_callback(doc)
|
||||
|
||||
src = html.tostring(doc, encoding='unicode')
|
||||
src = html.tostring(doc, encoding='unicode', method=output_method)
|
||||
|
||||
# this is ugly, but lxml/etree tostring want to put everything in a
|
||||
# 'div' that breaks the editor -> remove that
|
||||
|
|
@ -272,7 +319,7 @@ def html_normalize(src, filter_callback=None):
|
|||
return src
|
||||
|
||||
|
||||
def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=False, sanitize_style=False, sanitize_form=True, strip_style=False, strip_classes=False):
|
||||
def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=False, sanitize_style=False, sanitize_form=True, sanitize_conditional_comments=True, strip_style=False, strip_classes=False, output_method="html"):
|
||||
if not src:
|
||||
return src
|
||||
|
||||
|
|
@ -286,6 +333,7 @@ def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=Fals
|
|||
'forms': sanitize_form, # True = remove form tags
|
||||
'remove_unknown_tags': False,
|
||||
'comments': False,
|
||||
'conditional_comments': sanitize_conditional_comments, # True = remove conditional comments
|
||||
'processing_instructions': False
|
||||
}
|
||||
if sanitize_tags:
|
||||
|
|
@ -311,7 +359,7 @@ def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=Fals
|
|||
return doc
|
||||
|
||||
try:
|
||||
sanitized = html_normalize(src, filter_callback=sanitize_handler)
|
||||
sanitized = html_normalize(src, filter_callback=sanitize_handler, output_method=output_method)
|
||||
except etree.ParserError:
|
||||
if not silent:
|
||||
raise
|
||||
|
|
@ -329,7 +377,8 @@ def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=Fals
|
|||
# HTML/Text management
|
||||
# ----------------------------------------------------------
|
||||
|
||||
URL_REGEX = r'(\bhref=[\'"](?!mailto:|tel:|sms:)([^\'"]+)[\'"])'
|
||||
URL_SKIP_PROTOCOL_REGEX = r'mailto:|tel:|sms:'
|
||||
URL_REGEX = rf'''(\bhref=['"](?!{URL_SKIP_PROTOCOL_REGEX})([^'"]+)['"])'''
|
||||
TEXT_URL_REGEX = r'https?://[\w@:%.+&~#=/-]+(?:\?\S+)?'
|
||||
# retrieve inner content of the link
|
||||
HTML_TAG_URL_REGEX = URL_REGEX + r'([^<>]*>([^<>]+)<\/)?'
|
||||
|
|
@ -392,20 +441,24 @@ def create_link(url, label):
|
|||
return f'<a href="{url}" target="_blank" rel="noreferrer noopener">{label}</a>'
|
||||
|
||||
|
||||
def html2plaintext(html, body_id=None, encoding='utf-8'):
|
||||
def html2plaintext(html, body_id=None, encoding='utf-8', include_references=True):
|
||||
""" From an HTML text, convert the HTML to plain text.
|
||||
If @param body_id is provided then this is the tag where the
|
||||
body (not necessarily <body>) starts.
|
||||
:param include_references: If False, numbered references and
|
||||
URLs for links and images will not be included.
|
||||
"""
|
||||
## (c) Fry-IT, www.fry-it.com, 2007
|
||||
## <peter@fry-it.com>
|
||||
## download here: http://www.peterbe.com/plog/html2plaintext
|
||||
|
||||
html = ustr(html)
|
||||
|
||||
if not html.strip():
|
||||
if not (html and html.strip()):
|
||||
return ''
|
||||
|
||||
if isinstance(html, bytes):
|
||||
html = html.decode(encoding)
|
||||
else:
|
||||
assert isinstance(html, str), f"expected str got {html.__class__.__name__}"
|
||||
|
||||
tree = etree.fromstring(html, parser=etree.HTMLParser())
|
||||
|
||||
if body_id is not None:
|
||||
|
|
@ -416,28 +469,25 @@ def html2plaintext(html, body_id=None, encoding='utf-8'):
|
|||
tree = source[0]
|
||||
|
||||
url_index = []
|
||||
i = 0
|
||||
for link in tree.findall('.//a'):
|
||||
url = link.get('href')
|
||||
if url:
|
||||
i += 1
|
||||
link.tag = 'span'
|
||||
link.text = '%s [%s]' % (link.text, i)
|
||||
url_index.append(url)
|
||||
linkrefs = itertools.count(1)
|
||||
if include_references:
|
||||
for link in tree.findall('.//a'):
|
||||
if url := link.get('href'):
|
||||
link.tag = 'span'
|
||||
link.text = f'{link.text} [{next(linkrefs)}]'
|
||||
url_index.append(url)
|
||||
|
||||
for img in tree.findall('.//img'):
|
||||
src = img.get('src')
|
||||
if src:
|
||||
i += 1
|
||||
img.tag = 'span'
|
||||
if src.startswith('data:'):
|
||||
img_name = None # base64 image
|
||||
else:
|
||||
img_name = re.search(r'[^/]+(?=\.[a-zA-Z]+(?:\?|$))', src)
|
||||
img.text = '%s [%s]' % (img_name.group(0) if img_name else 'Image', i)
|
||||
url_index.append(src)
|
||||
for img in tree.findall('.//img'):
|
||||
if src := img.get('src'):
|
||||
img.tag = 'span'
|
||||
if src.startswith('data:'):
|
||||
img_name = None # base64 image
|
||||
else:
|
||||
img_name = re.search(r'[^/]+(?=\.[a-zA-Z]+(?:\?|$))', src)
|
||||
img.text = '%s [%s]' % (img_name[0] if img_name else 'Image', next(linkrefs))
|
||||
url_index.append(src)
|
||||
|
||||
html = ustr(etree.tostring(tree, encoding=encoding))
|
||||
html = etree.tostring(tree, encoding="unicode")
|
||||
# \r char is converted into , must remove it
|
||||
html = html.replace(' ', '')
|
||||
|
||||
|
|
@ -461,10 +511,10 @@ def html2plaintext(html, body_id=None, encoding='utf-8'):
|
|||
html = '\n'.join([x.strip() for x in html.splitlines()])
|
||||
html = html.replace('\n' * 2, '\n')
|
||||
|
||||
for i, url in enumerate(url_index):
|
||||
if i == 0:
|
||||
html += '\n\n'
|
||||
html += ustr('[%s] %s\n') % (i + 1, url)
|
||||
if url_index:
|
||||
html += '\n\n'
|
||||
for i, url in enumerate(url_index, start=1):
|
||||
html += f'[{i}] {url}\n'
|
||||
|
||||
return html.strip()
|
||||
|
||||
|
|
@ -482,7 +532,8 @@ def plaintext2html(text, container_tag=None):
|
|||
embedded into a ``<div>``
|
||||
:rtype: markupsafe.Markup
|
||||
"""
|
||||
text = misc.html_escape(ustr(text))
|
||||
assert isinstance(text, str)
|
||||
text = misc.html_escape(text)
|
||||
|
||||
# 1. replace \n and \r
|
||||
text = re.sub(r'(\r\n|\r|\n)', '<br/>', text)
|
||||
|
|
@ -508,7 +559,7 @@ def append_content_to_html(html, content, plaintext=True, preserve=False, contai
|
|||
""" Append extra content at the end of an HTML snippet, trying
|
||||
to locate the end of the HTML document (</body>, </html>, or
|
||||
EOF), and converting the provided content in html unless ``plaintext``
|
||||
is False.
|
||||
is ``False``.
|
||||
|
||||
Content conversion can be done in two ways:
|
||||
|
||||
|
|
@ -529,17 +580,16 @@ def append_content_to_html(html, content, plaintext=True, preserve=False, contai
|
|||
:param str container_tag: tag to wrap the content into, defaults to `div`.
|
||||
:rtype: markupsafe.Markup
|
||||
"""
|
||||
html = ustr(html)
|
||||
if plaintext and preserve:
|
||||
content = u'\n<pre>%s</pre>\n' % misc.html_escape(ustr(content))
|
||||
content = '\n<pre>%s</pre>\n' % misc.html_escape(content)
|
||||
elif plaintext:
|
||||
content = '\n%s\n' % plaintext2html(content, container_tag)
|
||||
else:
|
||||
content = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', content)
|
||||
content = u'\n%s\n' % ustr(content)
|
||||
content = '\n%s\n' % content
|
||||
# Force all tags to lowercase
|
||||
html = re.sub(r'(</?)(\w+)([ >])',
|
||||
lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
|
||||
lambda m: '%s%s%s' % (m[1], m[2].lower(), m[3]), html)
|
||||
insert_location = html.find('</body>')
|
||||
if insert_location == -1:
|
||||
insert_location = html.find('</html>')
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue