import fitz # PyMuPDF
"""
由于DeepL限制文件大小为20M,某些特别大的PDF无法被处理
1、将PDF转换为TXT
"""
class Totxt:
    """Convert the body text of a PDF file into a UTF-8 plain-text file.

    Text blocks whose first span lies in the top or bottom margin of a page
    are treated as running headers/footers and skipped. Only spans whose
    PyMuPDF ``flags`` value equals ``_BODY_TEXT_FLAGS`` are written; every
    other span is reported on stdout as garbled text.
    """

    # Span ``flags`` value accepted as body text.
    # NOTE(review): per the PyMuPDF docs bit 2 (value 4) marks a serifed
    # font, so an exact match with 4 means "serifed, not bold/italic/mono/
    # superscript". The original author used this to filter out garbled
    # spans -- confirm it fits the PDFs being processed.
    _BODY_TEXT_FLAGS = 4

    # Full stop that marks the end of a sentence/paragraph in the output.
    _SENTENCE_END = '。'

    def __init__(self, pdf_source_path, txt_file_path):
        """Store the input and output paths.

        Args:
            pdf_source_path: Path of the PDF file to read.
            txt_file_path: Path of the TXT file to create or overwrite.
        """
        self.pdf_file = pdf_source_path
        self.txt_file = txt_file_path

    def is_header_or_footer(self, block, page_height, margin=15, margin2=20):
        """Return True if *block* starts inside the top or bottom page margin.

        The decision is based on the y coordinate of the origin of the first
        span of the first line of the block.

        Args:
            block: A text block dict as found in
                ``page.get_text("dict")["blocks"]``.
            page_height: Height of the page the block belongs to.
            margin: Height of the header zone measured from the page top.
            margin2: Height of the footer zone measured from the page bottom.

        Returns:
            True when the block lies strictly inside one of the two margin
            zones. False otherwise, including for blocks that have no lines
            or whose first line has no spans (their position is unknown).
        """
        try:
            y = block['lines'][0]['spans'][0]['origin'][1]
        except (KeyError, IndexError):
            # Nothing to position: do not classify it as header/footer.
            return False
        return y < margin or y > page_height - margin2

    def _write_block_text(self, block, output_file):
        """Write the accepted spans of *block* and return the written texts."""
        written = []
        for line in block['lines']:
            for span in line['spans']:
                text = span['text'].strip()
                if span['flags'] != self._BODY_TEXT_FLAGS:
                    # Rejected span: report it instead of writing it.
                    print("乱码", text)
                    continue
                if text:
                    output_file.write(text)
                    written.append(text)
        return written

    def _write_page_text(self, page, output_file):
        """Write all body-text blocks of *page* to *output_file*."""
        page_height = page.rect.height
        last_written = ''
        for block in page.get_text("dict")["blocks"]:
            # Image blocks carry no "lines" key; only text blocks do.
            if "lines" not in block:
                continue
            if self.is_header_or_footer(block, page_height):
                continue
            texts = self._write_block_text(block, output_file)
            if not texts:
                continue
            last_written = texts[-1]
            # A block ending with a full stop closes a paragraph.
            if last_written.endswith(self._SENTENCE_END):
                output_file.write('\n')
        # Separate pages with an extra newline, but only when the page ended
        # on a complete sentence, so sentences spanning pages stay joined.
        if last_written.endswith(self._SENTENCE_END):
            output_file.write('\n')

    def to_txt(self):
        """Extract the PDF text into the TXT file and print a confirmation.

        The PDF is opened before the output file, so a failure to open the
        PDF does not create or truncate the TXT file. The document is always
        closed, even if extraction fails part-way.
        """
        pdf_document = fitz.open(self.pdf_file)
        try:
            with open(self.txt_file, 'w', encoding='utf-8') as output_file:
                for page in pdf_document:
                    self._write_page_text(page, output_file)
        finally:
            pdf_document.close()
        print(f"PDF内容已成功保存到 {self.txt_file}")
# NOTE: the lines below are residue from the web page this snippet was copied
# from; they are not part of the program.
# This site is built on Heroku
# Join the ranks of developers at Salesforce, Airbase, DEV, and more who deploy their mission critical applications on Heroku. Sign up today and launch your first app!
# For further actions, you may consider blocking this person and/or reporting abuse
# Top comments (0)