This commit is contained in:
Daniel Langbein 2024-01-23 14:15:27 +01:00
commit 33d8557b07
Signed by: langfingaz
GPG Key ID: 6C47C753F0823002
10 changed files with 181 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/.idea/

56
README.md Normal file
View File

@ -0,0 +1,56 @@
# PDF Replace
Notes and scripts for printing and replacing text and links in PDF files.
## PDF - Print all links
See [pdf_replace/print_links.py](pdf_replace/print_links.py)
## PDF - Print text from all pages
See [pdf_replace/print_text.py](pdf_replace/print_text.py)
## PDF - Replace text or links
Source: https://gist.github.com/Nezteb/e761bb85ced6ce965e37d54ceb04635d
1) Uncompress the PDF file
```shell
nix-shell -p qpdf
qpdf --qdf --object-streams=disable input.pdf uncompressed.pdf
```
or
```shell
nix-shell -p pdftk
pdftk input.pdf output uncompressed.pdf uncompress
```
2) Edit the PDF as a "plain text" file
Works:
- With `LC_ALL=C sed`
- `LC_ALL=C sed -e 's|some text||g' uncompressed.pdf > uncompressed-output.pdf`
Does **not** work:
- With `nano` by pressing `ALT` + `r` (search and replace). The resulting PDF is distorted - some text is missing or misaligned.
Untested:
- With a text editor using "search and replace". Warning: this might be laggy on large files.
3) Compress the PDF
```shell
qpdf uncompressed-output.pdf output.pdf
```
or
```shell
pdftk uncompressed-output.pdf output output.pdf compress
```

14
pdf-replace.iml Normal file
View File

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="GENERAL_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/pdf_replace" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/tests" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

0
pdf_replace/__init__.py Normal file
View File

View File

@ -0,0 +1,32 @@
from pathlib import Path
import pdfrw
from pdfrw import PdfDict, PdfArray, PdfString
def main():
    """Script entry point: print every link found in ``pdf.pdf``.

    The file name is hard-coded and, being a relative path, is resolved
    against the current working directory.
    """
    print_links(Path("pdf.pdf"))
def print_links(pdf_file: Path):
    """
    Print all links found in the given PDF file.

    For every page a ``Page <n>`` header is printed (``n`` is zero-based),
    followed by one tab-indented line per URI found among that page's
    annotations.

    Pages without annotations, annotations without an action (``/A``) and
    actions without a ``/URI`` entry are skipped instead of raising.

    :param pdf_file: Path of the PDF file to inspect.
    """
    pdf_reader = pdfrw.PdfReader(pdf_file)
    pages: list = pdf_reader.pages

    page: PdfDict
    for page_num, page in enumerate(pages):
        print(f"Page {page_num}")

        # Links are in /Annots. pdfrw yields None for a missing key, so a
        # page without any annotation must not be iterated directly.
        annots: PdfArray = page['/Annots'] or []
        for annot in annots:
            # Not every annotation carries an action (e.g. text notes).
            action = annot['/A']
            if action is None:
                continue

            # Not every action is a URI action (e.g. internal GoTo links).
            # The links are inside brackets, e.g. (https://example.com)
            uri: PdfString = action['/URI']
            if uri is None:
                continue

            # But after decoding them, the brackets are gone
            uri_str: str = uri.decode()
            print(f'\t{uri_str}')
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

19
pdf_replace/print_text.py Normal file
View File

@ -0,0 +1,19 @@
from pathlib import Path
from pypdf import PdfReader
def main():
    """
    Extract the text of every page of ``pdf.pdf`` and print it.

    The file name is hard-coded and, being a relative path, is resolved
    against the current working directory. Pages are handled in document
    order, one ``print`` call per page.
    """
    pdf_path = Path("pdf.pdf")
    document = PdfReader(pdf_path)

    # Lazy generator: each page is extracted right before it is printed.
    page_texts = (pdf_page.extract_text() for pdf_page in document.pages)
    for page_text in page_texts:
        print(page_text)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

35
poetry.lock generated Normal file
View File

@ -0,0 +1,35 @@
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
[[package]]
name = "pdfrw"
version = "0.4"
description = "PDF file reader/writer library"
optional = false
python-versions = "*"
files = [
{file = "pdfrw-0.4-py2.py3-none-any.whl", hash = "sha256:758289edaa3b672e9a1a67504be73c18ec668d4e5b9d5ac9cbc0dc753d8d196b"},
{file = "pdfrw-0.4.tar.gz", hash = "sha256:0dc0494a0e6561b268542b28ede2280387c2728114f117d3bb5d8e4787b93ef4"},
]
[[package]]
name = "pypdf"
version = "3.17.4"
description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files"
optional = false
python-versions = ">=3.6"
files = [
{file = "pypdf-3.17.4-py3-none-any.whl", hash = "sha256:6aa0f61b33779b64486de3f42835d3668badd48dac4a536aeb87da187a5eacd2"},
{file = "pypdf-3.17.4.tar.gz", hash = "sha256:ec96e2e4fc9648ac609d19c00d41e9d606e0ae2ce5a0bbe7691426f5f157166a"},
]
[package.extras]
crypto = ["PyCryptodome", "cryptography"]
dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "pytest-socket", "pytest-timeout", "pytest-xdist", "wheel"]
docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"]
full = ["Pillow (>=8.0.0)", "PyCryptodome", "cryptography"]
image = ["Pillow (>=8.0.0)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "0d691806e3e0b2213110872fa0616b5012f0a30f6fc0a29001cb48a739f77980"

16
pyproject.toml Normal file
View File

@ -0,0 +1,16 @@
[tool.poetry]
name = "pdf-replace"
version = "0.1.0"
description = "Print or replace links and text in PDF files"
authors = ["Daniel Langbein <daniel@systemli.org>"]
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.11"
pdfrw = "^0.4"
pypdf = "^3.17.4"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

8
shell.nix Normal file
View File

@ -0,0 +1,8 @@
# Development shell for this project.
# `pkgs` defaults to the <nixpkgs> channel but can be overridden by the caller.
{ pkgs ? import <nixpkgs> {} }:
pkgs.mkShell {
  # Tools made available inside the shell: a Python 3 interpreter and Poetry.
  buildInputs = [
    pkgs.python3
    pkgs.poetry
  ];
}

0
tests/__init__.py Normal file
View File