init

2025-04-02 17:05:59 +02:00 · 2024-01-23 14:15:27 +01:00 · 2024-01-23 14:15:27 +01:00 · 33d8557b07
commit 33d8557b07
10 changed files with 181 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
 /.idea/
--- a/README.md
+++ b/README.md
@ -0,0 +1,56 @@
 # PDF Replace
 Notes and scripts for printing and replacing text and links in PDF files.
 ## PDF - Print all links
 See [pdf_replace/print_links.py](pdf_replace/print_links.py)
 ## PDF - Print text from all pages
 See [pdf_replace/print_text.py](pdf_replace/print_text.py)
 ## PDF - Replace text or links
 Source: https://gist.github.com/Nezteb/e761bb85ced6ce965e37d54ceb04635d
 1) Uncompress the PDF file
 ```shell
 nix-shell -p qpdf
 qpdf --qdf --object-streams=disable input.pdf uncompressed.pdf
 ```
 or
 ```shell
 nix-shell -p pdftk
 pdftk input.pdf output uncompressed.pdf uncompress
 ```
 2) Edit the PDF as a "plain text" file
 Works:
 - With `LC_ALL=C sed`
  - `LC_ALL=C sed -e 's|some text||g' uncompressed.pdf > uncompressed-output.pdf`
 Does **not** work:
 - With `nano` by pressing `ALT` + `r` (search and replace). The resulting PDF is distorted - some text is missing or misaligned.
 Untested:
 - With a text editor using "search and replace". Warning: this might be laggy on large files.
 3) Compress the PDF
 ```shell
 qpdf uncompressed-output.pdf output.pdf
 ```
 or
 ```shell
 pdftk uncompressed-output.pdf output output.pdf compress
 ```
--- a/pdf-replace.iml
+++ b/pdf-replace.iml
@ -0,0 +1,14 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="GENERAL_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="true">
    <exclude-output />
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/pdf_replace" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/tests" isTestSource="true" />
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/pdf_replace/init.py
+++ b/pdf_replace/init.py
--- a/pdf_replace/print_links.py
+++ b/pdf_replace/print_links.py
@ -0,0 +1,32 @@
 from pathlib import Path
 import pdfrw
 from pdfrw import PdfDict, PdfArray, PdfString
 def main():
    """
    Print all links of the hard-coded input file ``pdf.pdf``.
    """
    print_links(Path("pdf.pdf"))
 def print_links(pdf_file: Path):
    """
    Print all link URIs found in the given PDF file.

    For every page a ``Page <n>`` header (0-based) is printed, followed by
    one tab-indented line per URI link annotation of that page.

    Pages without annotations, annotations without an action (``/A``) and
    actions without a ``/URI`` entry are skipped instead of raising.

    :param pdf_file: Path of the PDF file to read.
    """
    pdf_reader = pdfrw.PdfReader(pdf_file)
    pages: list = pdf_reader.pages

    page: PdfDict
    for page_num, page in enumerate(pages):
        print(f"Page {page_num}")

        # Links are in /Annots. pdfrw returns None for a missing key, so a
        # page without any annotation must not be iterated.
        annots: PdfArray = page['/Annots']
        if annots is None:
            continue

        for annot in annots:
            # Not every annotation carries an action (e.g. plain text notes).
            action = annot['/A']
            if action is None:
                continue

            # Not every action is a URI action (e.g. jumps inside the document).
            # The links are inside brackets, e.g. (https://example.com)
            uri: PdfString = action['/URI']
            if uri is None:
                continue

            # But after decoding them, the brackets are gone
            uri_str: str = uri.decode()
            print(f'\t{uri_str}')
 # Run main() only when this file is executed as a script, not when imported.
 if __name__ == '__main__':
    main()
--- a/pdf_replace/print_text.py
+++ b/pdf_replace/print_text.py
@ -0,0 +1,19 @@
 from pathlib import Path
 from pypdf import PdfReader
 def main():
    """
    Extract the text of every page of ``pdf.pdf`` and print it page by page.
    """
    pdf_path = Path("pdf.pdf")

    # Walk the pages in document order; each page's text goes to stdout.
    for pdf_page in PdfReader(pdf_path).pages:
        page_text = pdf_page.extract_text()
        print(page_text)
 # Run main() only when this file is executed as a script, not when imported.
 if __name__ == '__main__':
    main()
--- a/poetry.lock
+++ b/poetry.lock
@ -0,0 +1,35 @@
 # This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
 [[package]]
 name = "pdfrw"
 version = "0.4"
 description = "PDF file reader/writer library"
 optional = false
 python-versions = "*"
 files = [
    {file = "pdfrw-0.4-py2.py3-none-any.whl", hash = "sha256:758289edaa3b672e9a1a67504be73c18ec668d4e5b9d5ac9cbc0dc753d8d196b"},
    {file = "pdfrw-0.4.tar.gz", hash = "sha256:0dc0494a0e6561b268542b28ede2280387c2728114f117d3bb5d8e4787b93ef4"},
 ]
 [[package]]
 name = "pypdf"
 version = "3.17.4"
 description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files"
 optional = false
 python-versions = ">=3.6"
 files = [
    {file = "pypdf-3.17.4-py3-none-any.whl", hash = "sha256:6aa0f61b33779b64486de3f42835d3668badd48dac4a536aeb87da187a5eacd2"},
    {file = "pypdf-3.17.4.tar.gz", hash = "sha256:ec96e2e4fc9648ac609d19c00d41e9d606e0ae2ce5a0bbe7691426f5f157166a"},
 ]
 [package.extras]
 crypto = ["PyCryptodome", "cryptography"]
 dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "pytest-socket", "pytest-timeout", "pytest-xdist", "wheel"]
 docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"]
 full = ["Pillow (>=8.0.0)", "PyCryptodome", "cryptography"]
 image = ["Pillow (>=8.0.0)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
 content-hash = "0d691806e3e0b2213110872fa0616b5012f0a30f6fc0a29001cb48a739f77980"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,16 @@
 [tool.poetry]
 name = "pdf-replace"
 version = "0.1.0"
 description = "Print or replace links and text in PDF files"
 authors = ["Daniel Langbein <daniel@systemli.org>"]
 readme = "README.md"
 [tool.poetry.dependencies]
 python = "^3.11"
 pdfrw = "^0.4"
 pypdf = "^3.17.4"
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
--- a/shell.nix
+++ b/shell.nix
@ -0,0 +1,8 @@
 # Shell environment for this project: makes Python 3 and Poetry available.
 # `pkgs` defaults to the <nixpkgs> channel when the caller does not pass one.
 { pkgs ? import <nixpkgs> {} }:
 pkgs.mkShell {
  buildInputs = [
    pkgs.python3
    pkgs.poetry
  ];
 }
--- a/tests/init.py
+++ b/tests/init.py