2 Star 2 Fork 2

LTCTM/PDF合并与拆分

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
pdf_ltctm.py 5.35 KB
一键复制 编辑 原始数据 按行查看 历史
bluequilt 提交于 3年前 . 修复bug
from pathlib import Path
import subprocess
import re
import json
from functools import reduce
from os import system, environ as ENVIRON
import tempfile
import fire
# Append the current working directory to PATH (";" is the Windows PATH
# separator) -- presumably so that mutool/pdftk binaries placed there are found.
ENVIRON["PATH"] += ";%s" % Path.cwd().absolute()

# First run of digits in a string, with any non-digit text around it.
NUM_MATCH = re.compile(r"\D*(\d+)\D*")

# One line of a tab-indented outline: leading tabs, title, a tab, page number.
TOCS_TAB_MATCH = re.compile(r"(\t*)(.+)\t(\d+)")

# One bookmark record (title, level, page number) in pdftk's data dump format.
TOCS_PDFTK_MATCH = re.compile(
    r"BookmarkBegin\nBookmarkTitle:\s*(.+)\nBookmarkLevel:\s*(\d+)\nBookmarkPageNumber:\s*(\d+)(?:$|\n)"
)
def _asn(f: Path):
    """Return the first number in the stem of *f*, or the stem itself.

    The stem is matched against ``NUM_MATCH``. When it contains a run of
    digits, the first run is returned as an ``int``; otherwise the unchanged
    stem string is returned.
    """
    found = NUM_MATCH.match(f.stem)
    if found is None:
        return f.stem
    return int(found.group(1))
def _sort_and_rename(path):
    """Rename the numbered entries of *path* to zero-padded five-digit stems.

    Every entry whose stem contains a number (per ``NUM_MATCH``) is scheduled
    to be renamed to ``<number padded to 5 digits><original suffix>``, e.g.
    ``p3.jpg`` -> ``00003.jpg``. Entries without a number are ignored.

    Nothing is renamed when:
      * any target name already exists (either everything is already
        normalized, or only part of it is and the right action is unclear), or
      * two entries would map to the same target name, because renaming them
        would fail midway or overwrite one file with the other.

    Args:
        path: ``pathlib.Path`` of the directory whose entries are renamed.
    """
    planned = []
    for entry in path.glob("*"):
        found = NUM_MATCH.match(entry.stem)
        if found is None:
            continue
        # Only a missing match disqualifies an entry: a file numbered 0 is a
        # valid numbered file and must not be dropped by a truthiness test.
        number = int(found.group(1))
        planned.append((entry, entry.with_stem("{:0>5d}".format(number))))
    planned.sort(key=lambda pair: pair[1])

    targets = [new_name for _, new_name in planned]
    if len(set(targets)) != len(targets):
        # Colliding targets: leave the directory untouched.
        return
    if any(new_name.exists() for new_name in targets):
        # Fully or partially normalized already: leave the directory untouched.
        return
    for old_name, new_name in planned:
        old_name.rename(new_name)
# ======== Table of contents (bookmarks) ========
def _write_tocs_pdftk(tocs):
    """Serialize bookmark entries into pdftk's bookmark record text format.

    Args:
        tocs: Iterable of ``(level, title, page)`` triples.

    Returns:
        The concatenation of one four-line ``BookmarkBegin`` record per entry
        (each line terminated by ``"\\n"``), or ``""`` when *tocs* is empty.
    """
    # A single join avoids the repeated string copying of a reduce-based
    # concatenation and needs no helper lambda.
    return "".join(
        "BookmarkBegin\n"
        "BookmarkTitle: %s\n"
        "BookmarkLevel: %s\n"
        "BookmarkPageNumber: %s\n" % (title, level, page)
        for level, title, page in tocs
    )
def _read_tocs_pdftk(text):
    """Extract bookmark entries from pdftk data-dump text.

    Args:
        text: Text searched for records matching ``TOCS_PDFTK_MATCH``.

    Returns:
        A list of ``(level, title, page)`` tuples in order of appearance, with
        ``level`` and ``page`` converted to ``int``.
    """
    entries = []
    for found in TOCS_PDFTK_MATCH.finditer(text):
        title, level, page = found.groups()
        entries.append((int(level), title, int(page)))
    return entries
def _read_tocs_tab(text):
    """Parse a tab-indented outline into bookmark entries.

    Each line matching ``TOCS_TAB_MATCH`` yields ``[level, title, page]``,
    where ``level`` is the number of leading tabs plus one and ``page`` is an
    ``int``. Lines that do not match are skipped.

    Args:
        text: Outline text, one entry per ``"\\n"``-separated line.

    Returns:
        A list of ``[level, title, page]`` lists in input order.
    """
    hits = (TOCS_TAB_MATCH.match(row) for row in text.split("\n"))
    return [
        [len(hit.group(1)) + 1, hit.group(2), int(hit.group(3))]
        for hit in hits
        if hit
    ]
# ======== Public API ========
def combine_pics(indir, outpdf, auto_search_num=False):
    """Start ``mutool convert`` to build one PDF from the files in *indir*.

    Args:
        indir: Directory whose entries (passed as the wildcard ``indir/*``)
            are the inputs.
        outpdf: Path of the PDF to write; its parent directory is created.
        auto_search_num: When true, the entries of *indir* are first renamed
            by ``_sort_and_rename``.

    Returns:
        The ``subprocess.Popen`` object of the started command; the call does
        not wait for it to finish.
    """
    src_dir = Path(indir)
    dst_pdf = Path(outpdf)
    dst_pdf.parent.mkdir(parents=True, exist_ok=True)
    if auto_search_num:
        _sort_and_rename(src_dir)
    # Both paths are wrapped in double quotes inside the command line.
    quoted_out = '"%s"' % dst_pdf.absolute()
    quoted_in = '"%s"' % src_dir.joinpath("*").absolute()
    cmd_str = "mutool convert -o %s -F PDF -O compress %s" % (quoted_out, quoted_in)
    # CREATE_NO_WINDOW is a Windows-only subprocess flag.
    return subprocess.Popen(cmd_str, creationflags=subprocess.CREATE_NO_WINDOW)
def extract_pics(inpdf, outdir):
    """Start ``mutool extract`` on *inpdf* with *outdir* as working directory.

    Args:
        inpdf: Path of the PDF handed to ``mutool extract``.
        outdir: Directory created if missing and used as the command's working
            directory (presumably where mutool writes the extracted files).

    Returns:
        The ``subprocess.Popen`` object of the started command; the call does
        not wait for it to finish.
    """
    src_pdf, dst_dir = Path(inpdf), Path(outdir)
    dst_dir.mkdir(parents=True, exist_ok=True)
    quoted_pdf = '"%s"' % src_pdf.absolute()
    cmd_str = "mutool extract %s" % quoted_pdf
    # CREATE_NO_WINDOW is a Windows-only subprocess flag.
    return subprocess.Popen(
        cmd_str,
        cwd=dst_dir.absolute(),
        creationflags=subprocess.CREATE_NO_WINDOW,
    )
def page2PNG(inpdf, outdir, dpi=300):
    """Start ``mutool convert`` to render *inpdf* to PNG files in *outdir*.

    Args:
        inpdf: Path of the PDF to render.
        outdir: Directory for the images; created if missing. The output
            pattern passed to mutool is ``outdir/%05d.png``.
        dpi: Value passed as mutool's ``resolution`` option.

    Returns:
        The ``subprocess.Popen`` object of the started command; the call does
        not wait for it to finish.
    """
    src_pdf, dst_dir = Path(inpdf), Path(outdir)
    dst_dir.mkdir(parents=True, exist_ok=True)
    quoted_pattern = '"%s"' % dst_dir.joinpath("%05d.png").absolute()
    quoted_pdf = '"%s"' % src_pdf.absolute()
    cmd_str = "mutool convert -o %s -F PNG -O resolution=%s %s" % (
        quoted_pattern,
        dpi,
        quoted_pdf,
    )
    # CREATE_NO_WINDOW is a Windows-only subprocess flag.
    return subprocess.Popen(cmd_str, creationflags=subprocess.CREATE_NO_WINDOW)
def combine_pdfs(indir, outpdf, auto_search_num=False):
    """Start ``pdftk ... cat`` to concatenate the PDFs in *indir*.

    Args:
        indir: Directory whose PDFs (passed as the wildcard ``indir/*.pdf``)
            are the inputs.
        outpdf: Path of the merged PDF; its parent directory is created.
        auto_search_num: When true, the entries of *indir* are first renamed
            by ``_sort_and_rename``.

    Returns:
        The ``subprocess.Popen`` object of the started command; the call does
        not wait for it to finish.
    """
    src_dir = Path(indir)
    dst_pdf = Path(outpdf)
    dst_pdf.parent.mkdir(parents=True, exist_ok=True)
    if auto_search_num:
        _sort_and_rename(src_dir)
    quoted_out = '"%s"' % dst_pdf.absolute()
    quoted_in = '"%s"' % src_dir.joinpath("*.pdf").absolute()
    cmd_str = "pdftk %s cat output %s" % (quoted_in, quoted_out)
    # CREATE_NO_WINDOW is a Windows-only subprocess flag.
    return subprocess.Popen(cmd_str, creationflags=subprocess.CREATE_NO_WINDOW)
def set_tocs(inpdf, tocs, outpdf):
    """Write the bookmarks described by *tocs* into a copy of *inpdf*.

    Args:
        inpdf: Path of the source PDF.
        tocs: Path of the bookmark description, read as UTF-8. A ``.json``
            file is loaded with ``json.loads``; a ``.txt`` file is parsed by
            ``_read_tocs_tab``. The suffix is matched case-insensitively.
        outpdf: Path of the PDF to write; its parent directory is created.

    Raises:
        ValueError: If *tocs* has a suffix other than ``.json`` or ``.txt``.
    """
    inpdf, tocs, outpdf = Path(inpdf), Path(tocs), Path(outpdf)
    # Reject unsupported inputs up front, before anything is created on disk,
    # instead of failing later on an unbound variable.
    suffix = tocs.suffix.lower()
    if suffix not in (".json", ".txt"):
        raise ValueError(
            "unsupported bookmark file type %r; expected .json or .txt" % tocs.suffix
        )
    outpdf.parent.mkdir(parents=True, exist_ok=True)
    toc_text = tocs.read_text("utf-8")
    if suffix == ".json":
        tocs_array = json.loads(toc_text)
    else:
        tocs_array = _read_tocs_tab(toc_text)
    # The info file handed to pdftk must not have Chinese characters in its
    # name, so the records go through a temporary file.
    tf = tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", newline="\n", delete=False
    )
    temp_path = Path(tf.name)
    try:
        # Close the file before pdftk reads it.
        with tf:
            tf.write(_write_tocs_pdftk(tocs_array))
        cmd_str = "pdftk %s update_info_utf8 %s output %s" % (
            '"%s"' % inpdf.absolute(),
            '"%s"' % temp_path,
            '"%s"' % outpdf.absolute(),
        )
        system(cmd_str)
    finally:
        # Remove the temporary file even when writing or the command fails.
        temp_path.unlink()
def output_tocs(inpdf, outtocs):
    """Export the bookmarks of *inpdf* to a JSON file.

    ``pdftk ... dump_data_utf8`` writes its report to a temporary file, the
    bookmark records in it are parsed by ``_read_tocs_pdftk``, and the result
    is written to *outtocs* as UTF-8 JSON (non-ASCII characters are kept
    as-is).

    Args:
        inpdf: Path of the PDF to inspect.
        outtocs: Path of the JSON file to write; its parent directory is
            created.
    """
    inpdf, outtocs = Path(inpdf), Path(outtocs)
    outtocs.parent.mkdir(parents=True, exist_ok=True)
    # The file pdftk writes to must not have Chinese characters in its name,
    # so a temporary file is reserved for the dump.
    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", newline="\n", delete=False
    ) as tf:
        temp_path = Path(tf.name)
    try:
        cmd_str = "pdftk %s dump_data_utf8 output %s" % (
            '"%s"' % inpdf.absolute(),
            '"%s"' % tf.name,
        )
        system(cmd_str)
        tocs = _read_tocs_pdftk(temp_path.read_text("utf-8"))
    finally:
        # The temporary file was created with delete=False, so it has to be
        # removed explicitly or it is left behind on every call.
        temp_path.unlink()
    outtocs.write_text(json.dumps(tocs, ensure_ascii=False), "utf-8")
def combine_pics_in_subfolders(path, auto_search_num=False):
    """Run ``combine_pics`` once for every direct subdirectory of *path*.

    Each subdirectory ``path/<name>`` is combined into ``path/<name>.pdf``.

    Args:
        path: Directory whose direct subdirectories are processed.
        auto_search_num: Forwarded to ``combine_pics``. Defaults to ``False``,
            matching the default of ``combine_pics`` and ``combine_pdfs``.

    Returns:
        A list with the return value of each ``combine_pics`` call; empty when
        *path* has no subdirectories.
    """
    root = Path(path)
    out_root = root.absolute()
    return [
        combine_pics(sub, out_root.joinpath("%s.pdf" % sub.name), auto_search_num)
        for sub in root.glob("*")
        if sub.is_dir()
    ]
if __name__ == "__main__":
    # With command-line use in mind, every function accepts plain strings for
    # its arguments.
    fire.Fire()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/ltctm/pdf_merge_and_split.git
git@gitee.com:ltctm/pdf_merge_and_split.git
ltctm
pdf_merge_and_split
PDF合并与拆分
master

搜索帮助