DEV Community

drake
drake

Posted on

Playwright 自动下载 PDF 文件

import asyncio
from pathlib import Path
from patchright.async_api import async_playwright

async def handle_pdf_route(route):
    """Intercept PDF requests and force the browser to download them.

    Requests whose URL ends with ``.pdf`` are fetched once, logged, and
    fulfilled with a ``Content-Disposition: attachment`` header so the
    response is treated as a download named ``document.pdf``. All other
    requests are passed through unchanged.

    Args:
        route: The intercepted route. It must provide ``request.url``,
            ``fetch()``, ``fulfill()`` and ``continue_()``.
    """
    # Guard clause: anything that is not a PDF goes straight through.
    if not route.request.url.endswith('.pdf'):
        await route.continue_()
        return

    # Fetch the upstream response exactly once; a second fetch would download
    # the whole document again for no benefit.
    response = await route.fetch()
    binary_data = await response.body()  # raw bytes of the PDF
    print(f"获取到 PDF 文档,大小: {len(binary_data)} bytes")
    print(f"PDF 文档内容: {binary_data[:100]}...")  # first 100 bytes only

    # Drop any existing disposition header regardless of its casing, otherwise
    # the dict could carry both e.g. "content-disposition: inline" and our
    # "Content-Disposition: attachment" and the outcome would be ambiguous.
    headers = {
        name: value
        for name, value in response.headers.items()
        if name.lower() != 'content-disposition'
    }
    headers['Content-Disposition'] = 'attachment; filename="document.pdf"'
    await route.fulfill(response=response, headers=headers)

async def main():
    """Open the PDF URL in Chromium and save the forced download to disk.

    The route handler is installed before navigating so that the PDF request
    is actually intercepted, and the function waits for the resulting
    download to finish before closing the browser. The file is written to the
    current working directory under the filename suggested by the download.
    """
    # Local import keeps this function self-contained; the module only
    # imports async_playwright at the top level.
    from patchright.async_api import Error as PlaywrightError

    url = "https://www.fatf-gafi.org/content/dam/fatf-gafi/guidance/Second-12-Month-Review-Revised-FATF-Standards-Virtual-Assets-VASPS.pdf"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)  # visible window for debugging
        try:
            context = await browser.new_context(accept_downloads=True)
            page = await context.new_page()

            # Routes only apply to requests issued after registration, so this
            # must happen before page.goto().
            await page.route("**/*", handle_pdf_route)

            async with page.expect_download() as download_info:
                try:
                    await page.goto(url)
                except PlaywrightError:
                    # A navigation that turns into a download is reported as
                    # an aborted navigation. If the failure was something
                    # else, expect_download() times out and raises on exit,
                    # so the error is not silently lost.
                    pass

            download = await download_info.value
            target = Path.cwd() / download.suggested_filename
            await download.save_as(target)
            print(f"Saved PDF to {target}")
        finally:
            # Always release the browser, even if the download failed.
            await browser.close()

# Run the async entry point only when executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())

Enter fullscreen mode Exit fullscreen mode

Top comments (0)