#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Add a size-limiting style attribute to plain <img> tags of an HTML5 file.

Python3 dependencies:
  -> beautifulsoup4 (pacman -S python-beautifulsoup4)
  -> html5lib (pacman -S python-html5lib)
"""

from pathlib import Path
from sys import argv
from typing import Dict

from bs4 import BeautifulSoup, ResultSet

# Style applied to images that carry no attributes besides 'src' and 'alt'.
DEFAULT_IMG_STYLE = 'max-width:100%;height:25em;'


def main():
    """
    Command line entry point.

    Reads the HTML file given as first argument, runs limit_image_width()
    on its content and writes the result to the file given as second
    argument. If no second argument is given, the input file is overwritten.

    :raises SystemExit: if the number of command line arguments is wrong
    """
    if not 2 <= len(argv) <= 3:
        # Raising a plain string is a TypeError in Python 3; SystemExit prints
        # the message to stderr and terminates with a non-zero exit code.
        script_name = Path(argv[0]).name if argv else '<script>'
        raise SystemExit(f'usage: {script_name} <input_file> [<output_file>]')

    input_file = Path(argv[1])
    # Without an explicit output file the input file is modified in place.
    output_file = Path(argv[2]) if len(argv) == 3 else input_file

    # Read input file completely before (possibly) overwriting it below.
    # The encoding is fixed to UTF-8 instead of relying on the platform locale.
    with open(input_file, encoding='utf-8') as f:
        html_str = f.read()

    modified_html_str = limit_image_width(html_str)

    # Write to new file -> overwrite if already existent!
    with open(output_file, mode='w', encoding='utf-8') as f:
        f.write(modified_html_str)


def limit_image_width(html_str: str, style: str = DEFAULT_IMG_STYLE) -> str:
    """
    Add a size-limiting style attribute to all plain img tags.

    When converting HTML5 to other formats, e.g. PDF, it may happen that
    too wide images get cropped off. If there are HTML5 image tags which
    contain only the 'src' and 'alt' attributes, then this function adds a
    style attribute to them. Originally this was:

        max-width:100%;height:auto;

    Update: As images may also be too tall (and get split up over multiple
    pages), the default style attribute now is:

        max-width:100%;height:25em;

    An img tag without 'alt' attribute gets its 'src' value as 'alt' text.
    Image tags with any further attributes are left untouched.

    :param html_str: source HTML5 string
    :param style: value of the style attribute to add
    :returns: modified HTML5 with the style attribute added to all img tags
              that have no attributes other than 'src' and 'alt'
    :raises ValueError: if an img tag has no 'src' attribute
    """
    soup = BeautifulSoup(markup=html_str, features='html5lib')
    img_tags: ResultSet = soup.find_all(name='img', recursive=True)

    for img_tag in img_tags:
        attrs: Dict = img_tag.attrs

        if 'src' not in attrs:
            # A plain string cannot be raised in Python 3 -> use a real exception.
            raise ValueError('src attr missing!')

        if 'alt' not in attrs:
            attrs['alt'] = attrs['src']

        # Any other attrs apart from 'src' and 'alt' may specify the image
        # size and position. If such attrs do already exist, we continue
        # with the next image. Otherwise: We add an attribute to fit the
        # image to the screen/page.
        if len(attrs) > 2:
            continue

        # Prevent too wide images.
        # Sources:
        #  -> https://www.smashingmagazine.com/2020/03/setting-height-width-images-important-again/
        #  -> https://www.w3schools.com/tags/att_style.asp
        attrs['style'] = style

    return str(soup)


if __name__ == '__main__':
    main()