DEV Community

drake
drake

Posted on

将PDF格式的电子书转换为TXT

import fitz  # PyMuPDF

"""
由于DeepL限制文件大小为20M,某些特别大的PDF无法被处理
1、将PDF转换为TXT

"""

class Totxt:
    """Convert a PDF e-book into a plain-text file using PyMuPDF (``fitz``).

    Text blocks that sit in the header or footer band of a page are skipped,
    and only spans whose ``flags`` value equals 4 are written to the output.
    """

    def __init__(self, pdf_source_path, txt_file_path):
        """Remember where to read the PDF from and where to write the text.

        Args:
            pdf_source_path: Path of the PDF file to convert.
            txt_file_path: Path of the TXT file to create (or overwrite).
        """
        # Path of the source PDF file.
        self.pdf_file = pdf_source_path
        # Path of the TXT file that receives the extracted text.
        self.txt_file = txt_file_path

    def is_header_or_footer(self, block, page_height, margin=15, margin2=20):
        """Tell whether a text block lies in the page's header or footer band.

        The decision is based on the y coordinate of the ``origin`` of the
        first span of the block's first line.

        Args:
            block: A text block dict with a ``'lines'`` list, each line
                holding a ``'spans'`` list whose items carry an ``'origin'``.
            page_height: Height of the page the block belongs to.
            margin: Blocks whose y is smaller than this count as header.
            margin2: Blocks whose y is larger than ``page_height - margin2``
                count as footer.

        Returns:
            True if the block is in the header or footer band, else False.
            A block without any line or without any span in its first line
            has no position to test and is reported as False.
        """
        lines = block.get('lines')
        if not lines:
            return False
        first_line_spans = lines[0].get('spans')
        if not first_line_spans:
            return False
        y = first_line_spans[0]['origin'][1]
        return y < margin or y > page_height - margin2

    def to_txt(self):
        """Extract the body text of every page and write it to the TXT file.

        The PDF is opened first, so a missing or unreadable PDF does not
        leave an empty output file behind. Both the PDF document and the
        output file are closed when the method returns, even on error.
        Spans whose ``flags`` differ from 4 are not written; their text is
        printed to stdout instead.
        """
        with fitz.open(self.pdf_file) as pdf_document, \
                open(self.txt_file, 'w', encoding='utf-8') as output_file:
            for page in pdf_document:
                page_height = page.rect.height

                # Text blocks of the page; image blocks carry no "lines" key.
                blocks = page.get_text("dict")["blocks"]
                the_current_page_lastline = ''

                for block in blocks:
                    if "lines" not in block:
                        continue
                    # Header and footer blocks are dropped from the output.
                    if self.is_header_or_footer(block, page_height):
                        continue

                    # A block's text is split into fragments (spans).
                    spans = []
                    for line in block["lines"]:
                        for span in line["spans"]:
                            text = span["text"].strip()
                            # Keep only body text, not garbled spans.
                            # NOTE(review): in PyMuPDF's span flag bits the
                            # value 4 appears to mean "serifed only" --
                            # confirm this filter suits the target PDFs.
                            if span['flags'] != 4:
                                print("乱码", text)
                                continue
                            if text:
                                output_file.write(text)
                                spans.append(text)
                                the_current_page_lastline = text

                    # Intended to tidy the output with a line break.
                    # NOTE(review): a single character never equals '', so
                    # this newline is never written; the character to
                    # compare against was probably lost -- confirm intent.
                    if spans and '' == spans[-1][-1]:
                        output_file.write('\n')

                # Intended to add a newline between pages.
                # NOTE(review): same never-true comparison as above.
                if the_current_page_lastline:
                    if '' == the_current_page_lastline[-1]:
                        output_file.write('\n')

        print(f"PDF内容已成功保存到 {self.txt_file}")

Enter fullscreen mode Exit fullscreen mode

Heroku

This site is built on Heroku

Join the ranks of developers at Salesforce, Airbase, DEV, and more who deploy their mission-critical applications on Heroku. Sign up today and launch your first app!

Get Started

Top comments (0)

Billboard image

The Next Generation Developer Platform

Coherence is the first Platform-as-a-Service you can control. Unlike "black-box" platforms that are opinionated about the infra you can deploy, Coherence is powered by CNC, the open-source IaC framework, which offers limitless customization.

Learn more

👋 Kindness is contagious

Please leave a ❤️ or a friendly comment on this post if you found it helpful!

Okay