image-width-limit/image-width-limit.py

90 lines
2.5 KiB
Python
Raw Normal View History

2021-09-15 17:27:01 +02:00
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from pathlib import Path
from sys import argv
from typing import Dict
from bs4 import BeautifulSoup, ResultSet
def main():
    """
    Command-line entry point: limit the size of images in an HTML5 file.

    Usage: <input_file> [<output_file>]

    Reads the input file, passes its content through limit_image_width()
    and writes the result to the output file. If no output file is given,
    the input file is overwritten in place.

    Python3 dependencies:
    -> beautifulsoup4 (pacman -S python-beautifulsoup4)
    -> html5lib (pacman -S python-html5lib)

    :raises SystemExit: if the number of command-line arguments is wrong
    """
    if len(argv) < 2 or len(argv) > 3:
        # A plain string cannot be raised in Python 3 (it results in a
        # TypeError), so exit with the usage message instead.
        raise SystemExit("usage: <input_file> [<output_file>]")

    input_file = Path(argv[1])
    # Without an explicit output file, the input file is modified in place.
    output_file = Path(argv[2]) if len(argv) == 3 else input_file

    # Read input file (explicit encoding, so the result does not depend on the locale)
    with open(input_file, encoding='utf-8') as f:
        html_str = f.read()

    modified_html_str = limit_image_width(html_str)

    # Write to new file -> overwrite if already existent!
    with open(output_file, mode='w', encoding='utf-8') as f:
        f.write(modified_html_str)
def limit_image_width(html_str) -> str:
    """
    Add a size-limiting style attribute to plain HTML5 image tags.

    When converting HTML5 to other formats, e.g. PDF, it
    may happen that too wide images get cropped off.
    If there are HTML5 image tags which only contain
    the 'src' and 'alt' attributes, then this function adds
    a style attribute to limit their size. Originally this was:

        max-width:100%;height:auto;

    Update: As images may also be too tall (and get split
    up over multiple pages), the style attribute is now:

        max-width:100%;height:25em;

    An image tag without an 'alt' attribute gets one, set to the value
    of its 'src' attribute.

    :param html_str: source HTML5 string
    :returns: modified HTML5 with the style attribute added to all img tags
              that carry no attributes other than 'src' and 'alt'
    :raises ValueError: if an img tag has no 'src' attribute
    """
    html_parser = 'html5lib'
    soup = BeautifulSoup(markup=html_str, features=html_parser)

    tag_name = 'img'
    img_tags: ResultSet = soup.find_all(name=tag_name, recursive=True)

    for img_tag in img_tags:
        attrs: Dict = img_tag.attrs

        if 'src' not in attrs:
            # A plain string cannot be raised in Python 3; use a real exception.
            raise ValueError('src attr missing!')

        if 'alt' not in attrs:
            attrs['alt'] = attrs['src']

        # Any other attrs apart from 'src' and 'alt' may specify the image size and position.
        # If such attrs do already exist, we continue with the next image.
        # Otherwise: We add an attribute to fit the image to the screen/page.
        if len(attrs) > 2:
            continue

        # Prevent too wide images.
        # Sources:
        # -> https://www.smashingmagazine.com/2020/03/setting-height-width-images-important-again/
        # -> https://www.w3schools.com/tags/att_style.asp
        attrs['style'] = 'max-width:100%;height:25em;'

    return str(soup)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()