mirror of
https://codeberg.org/privacy1st/pdf-replace
synced 2024-12-02 23:45:03 +01:00
init
This commit is contained in:
commit
33d8557b07
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
/.idea/
|
56
README.md
Normal file
56
README.md
Normal file
@ -0,0 +1,56 @@
|
||||
# PDF Replace
|
||||
|
||||
About replacing text and links in PDF files.
|
||||
|
||||
## PDF - Print all links
|
||||
|
||||
See [pdf_replace/print_links.py](pdf_replace/print_links.py)
|
||||
|
||||
## PDF - Print text from all pages
|
||||
|
||||
See [pdf_replace/print_text.py](pdf_replace/print_text.py)
|
||||
|
||||
## PDF - Replace text or links
|
||||
|
||||
Source: https://gist.github.com/Nezteb/e761bb85ced6ce965e37d54ceb04635d
|
||||
|
||||
1) Uncompress the PDF file
|
||||
|
||||
```shell
|
||||
nix-shell -p qpdf
|
||||
qpdf --qdf --object-streams=disable input.pdf uncompressed.pdf
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```shell
|
||||
nix-shell -p pdftk
|
||||
pdftk input.pdf output uncompressed.pdf uncompress
|
||||
```
|
||||
|
||||
2) Edit the PDF as a "plain text" file
|
||||
|
||||
Works:
|
||||
|
||||
- With `LC_ALL=C sed`
|
||||
- `LC_ALL=C sed -e 's|some text||g' uncompressed.pdf > uncompressed-output.pdf`
|
||||
|
||||
Does **not** work:
|
||||
|
||||
- With `nano` by pressing `ALT` + `r` (search and replace). The resulting PDF is distorted - some text is missing or misaligned.
|
||||
|
||||
Untested:
|
||||
|
||||
- With a text editor using "search and replace". Warning: this might be laggy on large files.
|
||||
|
||||
3) Compress the PDF
|
||||
|
||||
```shell
|
||||
qpdf uncompressed-output.pdf output.pdf
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```shell
|
||||
pdftk uncompressed-output.pdf output output.pdf compress
|
||||
```
|
14
pdf-replace.iml
Normal file
14
pdf-replace.iml
Normal file
@ -0,0 +1,14 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="GENERAL_MODULE" version="4">
|
||||
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
||||
<exclude-output />
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<sourceFolder url="file://$MODULE_DIR$/pdf_replace" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/tests" isTestSource="true" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
0
pdf_replace/__init__.py
Normal file
0
pdf_replace/__init__.py
Normal file
32
pdf_replace/print_links.py
Normal file
32
pdf_replace/print_links.py
Normal file
@ -0,0 +1,32 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pdfrw
|
||||
from pdfrw import PdfDict, PdfArray, PdfString
|
||||
|
||||
|
||||
def main():
    """
    Print the links of ``pdf.pdf``, resolved relative to the working directory.
    """
    print_links(Path("pdf.pdf"))
|
||||
|
||||
|
||||
def print_links(pdf_file: Path):
    """
    Print all links found in the given PDF file.

    For every page a ``Page <index>`` header is printed (the index is
    zero-based, as produced by ``enumerate``), followed by one tab-indented
    line per link URI found in the page's annotations.

    Pages without annotations, and annotations without a URI action, are
    skipped instead of raising.

    :param pdf_file: Path of the PDF file to read.
    """
    pdf_reader = pdfrw.PdfReader(pdf_file)
    pages: list = pdf_reader.pages
    page: PdfDict
    for page_num, page in enumerate(pages):
        print(f"Page {page_num}")
        # Links are in /Annots. pdfrw yields None for a missing key, so a
        # page without any annotation has nothing to iterate over.
        annots: PdfArray = page['/Annots']
        if annots is None:
            continue
        for annot in annots:
            # Not every annotation is a web link: the action (/A) or its
            # /URI entry may be missing, e.g. for comments or internal jumps.
            action = annot['/A']
            if action is None:
                continue
            # The links are inside brackets, e.g. (https://example.com)
            uri: PdfString = action['/URI']
            if uri is None:
                continue
            # But after decoding them, the brackets are gone
            uri_str: str = uri.decode()
            print(f'\t{uri_str}')
|
||||
|
||||
|
||||
# Only run when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
19
pdf_replace/print_text.py
Normal file
19
pdf_replace/print_text.py
Normal file
@ -0,0 +1,19 @@
|
||||
from pathlib import Path
|
||||
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
def main():
    """
    Extract the text of every page of ``pdf.pdf`` and print it.
    """
    pdf_path = Path("pdf.pdf")
    document = PdfReader(pdf_path)

    # Pages are handled one at a time, in document order.
    for page in document.pages:
        page_text = page.extract_text()
        print(page_text)
|
||||
|
||||
|
||||
# Only run when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
35
poetry.lock
generated
Normal file
35
poetry.lock
generated
Normal file
@ -0,0 +1,35 @@
|
||||
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "pdfrw"
|
||||
version = "0.4"
|
||||
description = "PDF file reader/writer library"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "pdfrw-0.4-py2.py3-none-any.whl", hash = "sha256:758289edaa3b672e9a1a67504be73c18ec668d4e5b9d5ac9cbc0dc753d8d196b"},
|
||||
{file = "pdfrw-0.4.tar.gz", hash = "sha256:0dc0494a0e6561b268542b28ede2280387c2728114f117d3bb5d8e4787b93ef4"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pypdf"
|
||||
version = "3.17.4"
|
||||
description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "pypdf-3.17.4-py3-none-any.whl", hash = "sha256:6aa0f61b33779b64486de3f42835d3668badd48dac4a536aeb87da187a5eacd2"},
|
||||
{file = "pypdf-3.17.4.tar.gz", hash = "sha256:ec96e2e4fc9648ac609d19c00d41e9d606e0ae2ce5a0bbe7691426f5f157166a"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
crypto = ["PyCryptodome", "cryptography"]
|
||||
dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "pytest-socket", "pytest-timeout", "pytest-xdist", "wheel"]
|
||||
docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"]
|
||||
full = ["Pillow (>=8.0.0)", "PyCryptodome", "cryptography"]
|
||||
image = ["Pillow (>=8.0.0)"]
|
||||
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.11"
|
||||
content-hash = "0d691806e3e0b2213110872fa0616b5012f0a30f6fc0a29001cb48a739f77980"
|
16
pyproject.toml
Normal file
16
pyproject.toml
Normal file
@ -0,0 +1,16 @@
|
||||
[tool.poetry]
|
||||
name = "pdf-replace"
|
||||
version = "0.1.0"
|
||||
description = "Print or replace links and text in PDF files"
|
||||
authors = ["Daniel Langbein <daniel@systemli.org>"]
|
||||
readme = "README.md"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.11"
|
||||
pdfrw = "^0.4"
|
||||
pypdf = "^3.17.4"
|
||||
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
8
shell.nix
Normal file
8
shell.nix
Normal file
@ -0,0 +1,8 @@
|
||||
{ pkgs ? import <nixpkgs> {} }:
|
||||
|
||||
pkgs.mkShell {
|
||||
buildInputs = [
|
||||
pkgs.python3
|
||||
pkgs.poetry
|
||||
];
|
||||
}
|
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
Loading…
Reference in New Issue
Block a user