mirror of
https://github.com/bringout/oca-ocb-core.git
synced 2026-04-20 09:12:04 +02:00
19.0 vanilla
This commit is contained in:
parent
d1963a3c3a
commit
2d3ee4855a
7430 changed files with 2687981 additions and 2965473 deletions
108
odoo-bringout-oca-ocb-mail/mail/tools/link_preview.py
Normal file
108
odoo-bringout-oca-ocb-mail/mail/tools/link_preview.py
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Part of Odoo. See LICENSE file for full copyright and licensing details.
|
||||
import re
|
||||
from lxml import html
|
||||
import chardet
|
||||
import requests
|
||||
from urllib3.exceptions import LocationParseError
|
||||
|
||||
|
||||
def get_link_preview_from_url(url, request_session=None):
    """
    Get the Open Graph properties of an url. (https://ogp.me/)
    If the url leads directly to an image mimetype, return
    the url as preview image else retrieve the properties from
    the html page.

    Using a stream request to prevent loading the whole page
    as those properties are declared in the <head> tag.

    The request session is optional as in some cases using
    a session could be beneficial performance wise
    (e.g. a lot of url could have the same domain).

    :param url: address to fetch the preview for
    :param request_session: optional object exposing a ``get`` method with
        the same signature as ``requests.get``; ``requests`` itself is used
        when it is not given
    :return: a dict describing the preview, or ``False`` when the request
        fails, the status is not ok, the ``Content-Type`` header is missing
        or the content is neither an image nor an html page
    """
    # Some websites are blocking non browser user agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0',
        'Odoo-Link-Preview': 'True',  # Used to identify coming from the link previewer
    }
    http = request_session or requests
    try:
        response = http.get(url, timeout=3, headers=headers, allow_redirects=True, stream=True)
    except (requests.exceptions.RequestException, LocationParseError):
        return False

    # With stream=True the body is not consumed automatically, so the
    # connection stays checked out until the response is closed. Close it
    # explicitly on every exit path.
    try:
        raw_content_type = response.headers.get('Content-Type')
        if not response.ok or not raw_content_type:
            return False
        # Content-Type header can return a charset, but we just need the
        # mimetype (eg: image/jpeg;charset=ISO-8859-1)
        mimetype = raw_content_type.split(';', 1)[0].strip()
        # Media types are case-insensitive: compare on a lowered copy.
        mimetype_key = mimetype.lower()
        if mimetype_key.startswith('image/'):
            return {
                'image_mimetype': mimetype,
                'og_image': url,  # If the url mimetype is already an image type, set url as preview image
                'source_url': url,
            }
        if mimetype_key.startswith('text/html'):
            return get_link_preview_from_html(url, response)
        return False
    finally:
        response.close()
|
||||
|
||||
def get_link_preview_from_html(url, response):
    """
    Retrieve the Open Graph properties from the html page. (https://ogp.me/)
    Load the page with chunks of 8kb to prevent loading the whole
    html when we only need the <head> tag content.
    Fallback on the <title> tag if the html doesn't have
    any Open Graph title property.

    :param url: address the response was fetched from, echoed back as
        ``source_url``
    :param response: streamed response exposing ``iter_content`` and
        ``encoding``
    :return: a dict with the ``og_*`` properties and ``source_url``, or
        ``False`` when the body is empty/blank or has neither an
        ``og:title`` property nor a ``<title>`` tag
    """
    chunk_size = 8192
    # Search the last two chunks so that a closing tag split across a chunk
    # boundary is still detected.
    lookback = chunk_size * 2
    closing_tag = b'</head>'

    # NOTE(review): nothing bounds the download when the page never emits a
    # closing head tag; consider a size cap if this is exposed to hostile urls.
    buffer = bytearray()
    for chunk in response.iter_content(chunk_size=chunk_size):
        buffer += chunk
        tail_start = max(len(buffer) - lookback, 0)
        # Tag names are case-insensitive in html, so match on a lowered tail.
        pos = buffer[tail_start:].lower().find(closing_tag)
        # Stop reading once all the <head> data is found
        if pos != -1:
            del buffer[tail_start + pos + len(closing_tag):]
            break
    content = bytes(buffer)

    if not content:
        return False

    encoding = response.encoding or chardet.detect(content).get("encoding") or "utf-8"
    try:
        decoded_content = content.decode(encoding)
    except (LookupError, UnicodeDecodeError, TypeError):
        # LookupError: the advertised charset is unknown to Python.
        decoded_content = content.decode("utf-8", errors="ignore")

    # A blank document holds no metadata and is rejected by the parser.
    if not decoded_content.strip():
        return False

    try:
        tree = html.fromstring(decoded_content)
    except ValueError:
        # Retry without a leading XML declaration, which the parser does not
        # accept on already-decoded text.
        decoded_content = re.sub(
            r"^<\?xml[^>]+\?>\s*", "", decoded_content, flags=re.IGNORECASE
        )
        tree = html.fromstring(decoded_content)

    def first_meta(prop):
        """Return the content of the first <meta property=prop>, else None."""
        values = tree.xpath('//meta[@property="%s"]/@content' % prop)
        return values[0] if values else None

    og_title = first_meta('og:title')
    if og_title is None:
        title_node = tree.find('.//title')
        if title_node is None:
            return False
        # Fallback on the <title> tag if it exists
        og_title = title_node.text

    return {
        'og_description': first_meta('og:description'),
        'og_image': first_meta('og:image'),
        'og_mimetype': first_meta('og:image:type'),
        'og_title': og_title,
        'og_type': first_meta('og:type'),
        'og_site_name': first_meta('og:site_name'),
        'source_url': url,
    }
|
||||
Loading…
Add table
Add a link
Reference in a new issue