commit 33d8557b07ff508c97c5669858e7f0ff7b424e57 Author: Daniel Langbein Date: Tue Jan 23 14:15:27 2024 +0100 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..85e7c1d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/.idea/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..878e577 --- /dev/null +++ b/README.md @@ -0,0 +1,56 @@ +# PDF Replace + +About replacing text and links in PDF files. + +## PDF - Print all links + +See [pdf_replace/print_links.py](pdf_replace/print_links.py) + +## PDF - Print text from all pages + +See [pdf_replace/print_text.py](pdf_replace/print_text.py) + +## PDF - Replace text or links + +Source: https://gist.github.com/Nezteb/e761bb85ced6ce965e37d54ceb04635d + +1) Uncompress the PDF file + +```shell +nix-shell -p qpdf +qpdf --qdf --object-streams=disable input.pdf uncompressed.pdf +``` + +or + +```shell +nix-shell -p pdftk +pdftk input.pdf output uncompressed.pdf uncompress +``` + +2) Edit the PDF as a "plain text" file + +Works: + +- With `LC_ALL=C sed` + - `LC_ALL=C sed -e 's|some text||g' uncompressed.pdf > uncompressed-output.pdf` + +Does **not** work: + +- With `nano` by pressing `ALT` + `r` (search and replace). The resulting PDF is distorted - some text is missing or misaligned. + +Untested: + +- With a text editor using "search and replace". Warning: on large files this might be laggy. 
def main():
    """Print every link found in ``pdf.pdf`` (relative to the working directory)."""
    pdf_file = Path("pdf.pdf")
    print_links(pdf_file)


def print_links(pdf_file: Path):
    """
    Print all links found in the given PDF file.

    For every page a ``Page <index>`` header is printed (the index starts at
    0), followed by one tab-indented line per URI link found on that page.

    Pages without annotations, annotations without an action, and actions
    without a URI (for example in-document jumps) are skipped instead of
    aborting the whole run with a ``TypeError``/``AttributeError``.

    :param pdf_file: Path of the PDF file to read.
    """
    pdf_reader = pdfrw.PdfReader(pdf_file)
    pages: list = pdf_reader.pages
    page: PdfDict
    for page_num, page in enumerate(pages):
        print(f"Page {page_num}")
        # Links are in /Annots. pdfrw returns None for a missing key, so a
        # page without any annotation must not be iterated directly.
        annots: PdfArray | None = page['/Annots']
        for annot in annots or ():
            # Only annotations that carry an action (/A) with a /URI entry
            # are external links; anything else has nothing to print.
            action = annot['/A']
            if action is None:
                continue
            uri = action['/URI']
            if uri is None:
                continue
            # The links are inside brackets, e.g. (https://example.com)
            # But after decoding them, the brackets are gone
            uri_str: str = uri.decode()
            print(f'\t{uri_str}')
def main():
    """Write the extracted text of each page of ``pdf.pdf`` to stdout."""
    source = Path("pdf.pdf")
    document = PdfReader(source)

    # Pages are handled lazily, one at a time and in document order:
    # extract the text of a page, print it, then move on to the next page.
    for page_text in (pdf_page.extract_text() for pdf_page in document.pages):
        print(page_text)
"README.md" + +[tool.poetry.dependencies] +python = "^3.11" +pdfrw = "^0.4" +pypdf = "^3.17.4" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/shell.nix b/shell.nix new file mode 100644 index 0000000..679ebf4 --- /dev/null +++ b/shell.nix @@ -0,0 +1,8 @@ +{ pkgs ? import <nixpkgs> {} }: + +pkgs.mkShell { + buildInputs = [ + pkgs.python3 + pkgs.poetry + ]; +} diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29