mirror of
https://codeberg.org/privacy1st/pdf-replace
synced 2024-12-03 23:55:05 +01:00
init
This commit is contained in:
commit
33d8557b07
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
/.idea/
|
56
README.md
Normal file
56
README.md
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
# PDF Replace
|
||||||
|
|
||||||
|
Notes and scripts for printing and replacing text and links in PDF files.
|
||||||
|
|
||||||
|
## PDF - Print all links
|
||||||
|
|
||||||
|
See [pdf_replace/print_links.py](pdf_replace/print_links.py)
|
||||||
|
|
||||||
|
## PDF - Print text from all pages
|
||||||
|
|
||||||
|
See [pdf_replace/print_text.py](pdf_replace/print_text.py)
|
||||||
|
|
||||||
|
## PDF - Replace text or links
|
||||||
|
|
||||||
|
Source: https://gist.github.com/Nezteb/e761bb85ced6ce965e37d54ceb04635d
|
||||||
|
|
||||||
|
1) Uncompress the PDF file
|
||||||
|
|
||||||
|
```shell
|
||||||
|
nix-shell -p qpdf
|
||||||
|
qpdf --qdf --object-streams=disable input.pdf uncompressed.pdf
|
||||||
|
```
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
```shell
|
||||||
|
nix-shell -p pdftk
|
||||||
|
pdftk input.pdf output uncompressed.pdf uncompress
|
||||||
|
```
|
||||||
|
|
||||||
|
2) Edit the PDF as a "plain text" file
|
||||||
|
|
||||||
|
Works:
|
||||||
|
|
||||||
|
- With `LC_ALL=C sed`
|
||||||
|
- `LC_ALL=C sed -e 's|some text||g' uncompressed.pdf > uncompressed-output.pdf`
|
||||||
|
|
||||||
|
Does **not** work:
|
||||||
|
|
||||||
|
- With `nano` by pressing `ALT` + `r` (search and replace). The resulting PDF is distorted - some text is missing or misaligned.
|
||||||
|
|
||||||
|
Untested:
|
||||||
|
|
||||||
|
- With a text editor using "search and replace". Warning: this might be laggy on large files.
|
||||||
|
|
||||||
|
3) Compress the PDF
|
||||||
|
|
||||||
|
```shell
|
||||||
|
qpdf uncompressed-output.pdf output.pdf
|
||||||
|
```
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
```shell
|
||||||
|
pdftk uncompressed-output.pdf output output.pdf compress
|
||||||
|
```
|
14
pdf-replace.iml
Normal file
14
pdf-replace.iml
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="GENERAL_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
||||||
|
<exclude-output />
|
||||||
|
<content url="file://$MODULE_DIR$">
|
||||||
|
<sourceFolder url="file://$MODULE_DIR$/pdf_replace" isTestSource="false" />
|
||||||
|
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
|
||||||
|
<sourceFolder url="file://$MODULE_DIR$/tests" isTestSource="true" />
|
||||||
|
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||||
|
</content>
|
||||||
|
<orderEntry type="inheritedJdk" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
0
pdf_replace/__init__.py
Normal file
0
pdf_replace/__init__.py
Normal file
32
pdf_replace/print_links.py
Normal file
32
pdf_replace/print_links.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pdfrw
|
||||||
|
from pdfrw import PdfDict, PdfArray, PdfString
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Print all links of the PDF file ``pdf.pdf``.
    """
    print_links(Path("pdf.pdf"))
|
||||||
|
|
||||||
|
|
||||||
|
def print_links(pdf_file: Path):
    """
    Print all links found in the given PDF file.

    For every page a ``Page <index>`` header is printed (the index starts
    at 0 and follows the order of ``pdf_reader.pages``), followed by one
    tab-indented line per URI found in the page's annotations.

    Pages without an ``/Annots`` entry, and annotations that have no
    ``/A`` action or whose action has no ``/URI`` entry, are skipped
    instead of raising.

    :param pdf_file: Path of the PDF file to read.
    """
    pdf_reader = pdfrw.PdfReader(pdf_file)
    pages: list = pdf_reader.pages
    page: PdfDict
    for page_num, page in enumerate(pages):
        print(f"Page {page_num}")
        # Links are in /Annots. pdfrw yields None for a missing key, so fall
        # back to an empty sequence for pages that have no annotations.
        annots: PdfArray = page['/Annots'] or ()
        for annot in annots:
            # Not every annotation is a link: only those with an action
            # (/A) that carries a /URI entry have something to print.
            action = annot['/A']
            if action is None:
                continue
            # The links are inside brackets, e.g. (https://example.com)
            uri: PdfString = action['/URI']
            if uri is None:
                continue
            # But after decoding them, the brackets are gone
            uri_str: str = uri.decode()
            print(f'\t{uri_str}')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
19
pdf_replace/print_text.py
Normal file
19
pdf_replace/print_text.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pypdf import PdfReader
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Print the extracted text of every page of the PDF file ``pdf.pdf``.
    """
    pdf_path = Path("pdf.pdf")

    # Extract lazily so each page's text is printed right after it is read.
    page_texts = (page.extract_text() for page in PdfReader(pdf_path).pages)
    for text in page_texts:
        print(text)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
35
poetry.lock
generated
Normal file
35
poetry.lock
generated
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pdfrw"
|
||||||
|
version = "0.4"
|
||||||
|
description = "PDF file reader/writer library"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "pdfrw-0.4-py2.py3-none-any.whl", hash = "sha256:758289edaa3b672e9a1a67504be73c18ec668d4e5b9d5ac9cbc0dc753d8d196b"},
|
||||||
|
{file = "pdfrw-0.4.tar.gz", hash = "sha256:0dc0494a0e6561b268542b28ede2280387c2728114f117d3bb5d8e4787b93ef4"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pypdf"
|
||||||
|
version = "3.17.4"
|
||||||
|
description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.6"
|
||||||
|
files = [
|
||||||
|
{file = "pypdf-3.17.4-py3-none-any.whl", hash = "sha256:6aa0f61b33779b64486de3f42835d3668badd48dac4a536aeb87da187a5eacd2"},
|
||||||
|
{file = "pypdf-3.17.4.tar.gz", hash = "sha256:ec96e2e4fc9648ac609d19c00d41e9d606e0ae2ce5a0bbe7691426f5f157166a"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
crypto = ["PyCryptodome", "cryptography"]
|
||||||
|
dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "pytest-socket", "pytest-timeout", "pytest-xdist", "wheel"]
|
||||||
|
docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"]
|
||||||
|
full = ["Pillow (>=8.0.0)", "PyCryptodome", "cryptography"]
|
||||||
|
image = ["Pillow (>=8.0.0)"]
|
||||||
|
|
||||||
|
[metadata]
|
||||||
|
lock-version = "2.0"
|
||||||
|
python-versions = "^3.11"
|
||||||
|
content-hash = "0d691806e3e0b2213110872fa0616b5012f0a30f6fc0a29001cb48a739f77980"
|
16
pyproject.toml
Normal file
16
pyproject.toml
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
[tool.poetry]
|
||||||
|
name = "pdf-replace"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Print or replace links and text in PDF files"
|
||||||
|
authors = ["Daniel Langbein <daniel@systemli.org>"]
|
||||||
|
readme = "README.md"
|
||||||
|
|
||||||
|
[tool.poetry.dependencies]
|
||||||
|
python = "^3.11"
|
||||||
|
pdfrw = "^0.4"
|
||||||
|
pypdf = "^3.17.4"
|
||||||
|
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["poetry-core"]
|
||||||
|
build-backend = "poetry.core.masonry.api"
|
8
shell.nix
Normal file
8
shell.nix
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
{ pkgs ? import <nixpkgs> {} }:
|
||||||
|
|
||||||
|
pkgs.mkShell {
|
||||||
|
buildInputs = [
|
||||||
|
pkgs.python3
|
||||||
|
pkgs.poetry
|
||||||
|
];
|
||||||
|
}
|
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
Loading…
Reference in New Issue
Block a user