init

2025-03-30 16:36:39 +02:00 · 2024-01-23 14:15:27 +01:00 · 2024-01-23 14:15:27 +01:00 · 33d8557b07
commit 33d8557b07
10 changed files with 181 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+/.idea/
--- a/README.md
+++ b/README.md
@@ -0,0 +1,56 @@
+# PDF Replace
+
+Notes and scripts for printing and replacing text and links in PDF files.
+
+## PDF - Print all links
+
+See [pdf_replace/print_links.py](pdf_replace/print_links.py)
+
+## PDF - Print text from all pages
+
+See [pdf_replace/print_text.py](pdf_replace/print_text.py)
+
+## PDF - Replace text or links
+
+Source: https://gist.github.com/Nezteb/e761bb85ced6ce965e37d54ceb04635d
+
+1) Uncompress the PDF file
+
+```shell
+nix-shell -p qpdf
+qpdf --qdf --object-streams=disable input.pdf uncompressed.pdf
+```
+
+or
+
+```shell
+nix-shell -p pdftk
+pdftk input.pdf output uncompressed.pdf uncompress
+```
+
+2) Edit the PDF as a "plain text" file
+
+Works:
+
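+- With `LC_ALL=C sed` (the `C` locale makes `sed` process the file byte-wise, so binary data in the PDF does not trip up multibyte character handling)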
+  - `LC_ALL=C sed -e 's|some text||g' uncompressed.pdf > uncompressed-output.pdf`
+
+Does **not** work:
+
+- With `nano` by pressing `ALT` + `r` (search and replace). The resulting PDF is distorted - some text is missing or misaligned.
+
+Untested:
+
+- With a text editor using "search and replace". Warning: this might be laggy on large files.
+
+3) Compress the PDF
+
+```shell
+qpdf uncompressed-output.pdf output.pdf
+```
+
+or
+
+```shell
+pdftk uncompressed-output.pdf output output.pdf compress
+```
--- a/pdf-replace.iml
+++ b/pdf-replace.iml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="GENERAL_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/pdf_replace" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/tests" isTestSource="true" />
+      <excludeFolder url="file://$MODULE_DIR$/venv" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
--- a/pdf_replace/__init__.py
+++ b/pdf_replace/__init__.py
--- a/pdf_replace/print_links.py
+++ b/pdf_replace/print_links.py
@@ -0,0 +1,32 @@
+from pathlib import Path
+
+import pdfrw
+from pdfrw import PdfDict, PdfArray, PdfString
+
+
+def main():
+    pdf_file = Path("pdf.pdf")
+    print_links(pdf_file)
+
+
+def print_links(pdf_file: Path):
+    """
+    Print all links found in the given PDF file.
+    """
+    pdf_reader = pdfrw.PdfReader(pdf_file)
+    pages: list = pdf_reader.pages
+    page: PdfDict
+    for page_num, page in enumerate(pages):
+        print(f"Page {page_num}")
+        # Links are in /Annots
+        annots: PdfArray = page['/Annots']
+        for annot in annots:
+            # The links are inside brackets, e.g. (https://example.com)
+            uri: PdfString = annot['/A']['/URI']
+            # But after decoding them, the brackets are gone
+            uri_str: str = uri.decode()
+            print(f'\t{uri_str}')
+
+
+if __name__ == '__main__':  # run main() only when executed as a script, not on import
+    main()
--- a/pdf_replace/print_text.py
+++ b/pdf_replace/print_text.py
@@ -0,0 +1,19 @@
+from pathlib import Path
+
+from pypdf import PdfReader
+
+
+def main():
+    """
+    Print text from all pages of PDF file.
+    """
+    file = Path("pdf.pdf")
+
+    # Print text of all PDF pages.
+    reader = PdfReader(file)
+    for page in reader.pages:
+        print(page.extract_text())
+
+
+if __name__ == '__main__':  # run main() only when executed as a script, not on import
+    main()
--- a/poetry.lock
+++ b/poetry.lock
@@ -0,0 +1,35 @@
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
+
+[[package]]
+name = "pdfrw"
+version = "0.4"
+description = "PDF file reader/writer library"
+optional = false
+python-versions = "*"
+files = [
+    {file = "pdfrw-0.4-py2.py3-none-any.whl", hash = "sha256:758289edaa3b672e9a1a67504be73c18ec668d4e5b9d5ac9cbc0dc753d8d196b"},
+    {file = "pdfrw-0.4.tar.gz", hash = "sha256:0dc0494a0e6561b268542b28ede2280387c2728114f117d3bb5d8e4787b93ef4"},
+]
+
+[[package]]
+name = "pypdf"
+version = "3.17.4"
+description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "pypdf-3.17.4-py3-none-any.whl", hash = "sha256:6aa0f61b33779b64486de3f42835d3668badd48dac4a536aeb87da187a5eacd2"},
+    {file = "pypdf-3.17.4.tar.gz", hash = "sha256:ec96e2e4fc9648ac609d19c00d41e9d606e0ae2ce5a0bbe7691426f5f157166a"},
+]
+
+[package.extras]
+crypto = ["PyCryptodome", "cryptography"]
+dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "pytest-socket", "pytest-timeout", "pytest-xdist", "wheel"]
+docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"]
+full = ["Pillow (>=8.0.0)", "PyCryptodome", "cryptography"]
+image = ["Pillow (>=8.0.0)"]
+
+[metadata]
+lock-version = "2.0"
+python-versions = "^3.11"
+content-hash = "0d691806e3e0b2213110872fa0616b5012f0a30f6fc0a29001cb48a739f77980"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,16 @@
+[tool.poetry]
+name = "pdf-replace"
+version = "0.1.0"
+description = "Print or replace links and text in PDF files"
+authors = ["Daniel Langbein <daniel@systemli.org>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.11"
+pdfrw = "^0.4"
+pypdf = "^3.17.4"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
--- a/shell.nix
+++ b/shell.nix
@@ -0,0 +1,8 @@
+{ pkgs ? import <nixpkgs> {} }:
+
+pkgs.mkShell {
+  buildInputs = [
+    pkgs.python3
+    pkgs.poetry
+  ];
+}
--- a/tests/__init__.py
+++ b/tests/__init__.py