DEV Community

Tahsin Abrar
Tahsin Abrar

Posted on

PDF Upload and OCR

Tech Stack

  • Frontend: HTML, CSS, Bootstrap
  • Backend: Node.js, Express.js, cors, Multer, Tesseract.js, pdf-lib, pdf-poppler
  • OCR Library: Tesseract.js
  • PDF Libraries: pdf-lib, pdf-poppler (pdf-poppler converts the PDF pages to PNG images for OCR)

Setup Instructions

1. Install Dependencies

Backend

Navigate to the backend directory and install the required packages:

cd backend
npm install
Enter fullscreen mode Exit fullscreen mode

Frontend

You can use the same directory for serving static files or set up a separate frontend directory. No additional dependencies are needed for the provided HTML.

2. Configure the Backend

Ensure that you have the following packages installed:

npm install express cors multer pdf-lib tesseract.js pdf-poppler
Enter fullscreen mode Exit fullscreen mode

3. Start the Backend Server

Run the server:

node server.js
Enter fullscreen mode Exit fullscreen mode

4. Access the Application

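Open index.html in your web browser. You should see the frontend UI where you can upload a PDF file; the page sends the file to the backend at http://localhost:3000/ocr. Note that the server.js shown below does not serve the HTML page itself: navigating to http://localhost:3000 only returns the "Hello World!" sample response.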

How It Works

  1. Frontend:

    • Users select a PDF file using the file input.
    • Upon form submission, the file is sent to the backend via a POST request.
    • A loading spinner is displayed while the PDF is being processed.
  2. Backend:

    • Receives the PDF file, converts its pages to PNG images with pdf-poppler, and performs OCR on each image using Tesseract.js.
    • Extracts text from the PDF images and returns the result to the frontend.

Frontend Code

Here is the index.html file used for the frontend:

<!DOCTYPE html>
<!--
    Single-page upload UI for the OCR backend.
    The form posts the selected PDF to http://localhost:3000/ocr and shows the
    extracted text (or an error message) in the #result element.
-->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>PDF Upload and OCR</title>
    <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css" rel="stylesheet">
    <style>
        .container {
            max-width: 600px;
            margin: auto;
            padding: 20px;
        }
        .file-label {
            display: block;
            margin: 10px 0;
        }
        .submit-btn {
            margin-top: 10px;
        }
        /* Hidden until an upload is in flight; toggled from the submit handler. */
        #loading {
            display: none;
            text-align: center;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1 class="text-center">Upload a PDF for OCR</h1>
        <form id="uploadForm" enctype="multipart/form-data">
            <div class="form-group">
                <label for="file" class="file-label">Select PDF</label>
                <input type="file" id="file" name="file" class="form-control-file" accept=".pdf" required>
            </div>
            <button type="submit" class="btn btn-primary submit-btn">Upload</button>
        </form>

        <div id="loading">
            <div class="spinner-border text-primary" role="status">
                <span class="sr-only">Loading...</span>
            </div>
            <p>Processing...</p>
        </div>

        <div id="result-container" class="mt-4">
            <h2>OCR Result</h2>
            <pre id="result"></pre>
        </div>
    </div>

    <script src="https://code.jquery.com/jquery-3.5.1.slim.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.5.4/dist/umd/popper.min.js"></script>
    <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/js/bootstrap.min.js"></script>
    <script>
        // Send the selected PDF to the OCR endpoint and render the returned text.
        document.getElementById('uploadForm').addEventListener('submit', async function(event) {
            event.preventDefault(); // Prevent default form submission

            const resultElement = document.getElementById('result');
            const loadingElement = document.getElementById('loading');
            // `this` is the form element because the listener is a regular function.
            const submitButton = this.querySelector('button[type="submit"]');
            const file = document.getElementById('file').files[0];

            // The input is `required`, but guard anyway so we never post an empty form.
            if (!file) {
                resultElement.textContent = 'Error: Please select a PDF file.';
                return;
            }

            const formData = new FormData();
            // The field name must match upload.single("file") on the backend.
            formData.append("file", file);

            const requestOptions = {
                method: "POST",
                body: formData,
                redirect: "follow"
            };

            // Clear the previous result so stale text is not mistaken for the new one,
            // and block re-submission while this upload is being processed.
            resultElement.textContent = '';
            submitButton.disabled = true;
            loadingElement.style.display = 'block';

            try {
                const response = await fetch("http://localhost:3000/ocr", requestOptions);
                if (response.ok) {
                    const result = await response.json();
                    resultElement.textContent = result.text;
                } else {
                    // The backend puts its explanation in the response body;
                    // statusText can be empty, so use it only as a fallback.
                    const detail = (await response.text()) || response.statusText || `HTTP ${response.status}`;
                    resultElement.textContent = 'Error: ' + detail;
                }
            } catch (error) {
                resultElement.textContent = 'Error: ' + error.message;
            } finally {
                // Restore the UI whether the request succeeded or failed.
                loadingElement.style.display = 'none';
                submitButton.disabled = false;
            }
        });
    </script>
</body>
</html>
Enter fullscreen mode Exit fullscreen mode

Backend Code

Here is the server.js file used for the backend:

const crypto = require("crypto");
const fs = require("fs");
const path = require("path");

const express = require("express");
const cors = require("cors");
const multer = require("multer");
const { PDFDocument } = require("pdf-lib");
const Tesseract = require("tesseract.js");
const pdfPoppler = require("pdf-poppler");

// Port the HTTP server binds to, and the Express application itself.
const port = 3000;
const app = express();

// Accept cross-origin requests from any origin.
app.use(cors());

// Uploads are held in memory; route handlers read them from req.file.buffer.
const storage = multer.memoryStorage();
const upload = multer({ storage });

/**
 * Convert a PDF file to PNG images with pdf-poppler and return their paths.
 *
 * Images are written to the "output" directory next to this file. Each call
 * uses its own random file-name prefix so that only the images produced by
 * this call are returned, even when other conversions run concurrently or
 * older images were left behind in the directory.
 *
 * @param {string} pdfPath - Path of the PDF file to convert.
 * @returns {Promise<string[]>} Paths of the generated PNG files, sorted by
 *   file name with numeric ordering so pages stay in sequence.
 * @throws Rethrows any error raised by the conversion after logging it.
 */
async function convertPdfToImages(pdfPath) {
  const outputDir = path.join(__dirname, "output");
  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync(outputDir, { recursive: true });

  // Unique per call: the output directory is shared by every request.
  const prefix = `page-${crypto.randomUUID()}`;

  const options = {
    format: "png",
    out_dir: outputDir,
    out_prefix: prefix,
    page_range: "1-",
  };

  try {
    await pdfPoppler.convert(pdfPath, options);
    return fs
      .readdirSync(outputDir)
      .filter((file) => file.startsWith(prefix) && file.endsWith(".png"))
      // Directory listing order is not guaranteed; sort so that page 2
      // comes before page 10 regardless of zero padding.
      .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
      .map((file) => path.join(outputDir, file));
  } catch (error) {
    console.error("Error converting PDF to images:", error);
    throw error;
  }
}

// Root route: answers with a fixed greeting.
app.get("/", function handleRoot(req, res) {
  res.send("Hello World!");
});

// POST route for OCR
//
// Expects a multipart upload with the PDF in the "file" field. The PDF is
// written to a temporary file, converted to page images, and each image is
// run through Tesseract ("eng"). Responds with JSON `{ text }` where the
// per-image texts are joined with newlines.
//
// Responses:
//   400 "No file uploaded."      - the request carried no "file" field
//   500 "Error processing file." - writing, conversion or OCR failed
app.post("/ocr", upload.single("file"), async (req, res) => {
  if (!req.file) {
    return res.status(400).send("No file uploaded.");
  }

  // Unique name per request so concurrent uploads never overwrite each other.
  const tempPdfPath = path.join(__dirname, `temp-${crypto.randomUUID()}.pdf`);
  // Declared outside `try` so the `finally` block can clean up whatever was created.
  let imagePaths = [];

  try {
    fs.writeFileSync(tempPdfPath, req.file.buffer);

    imagePaths = await convertPdfToImages(tempPdfPath);

    const texts = await Promise.all(
      imagePaths.map(async (imagePath) => {
        const {
          data: { text },
        } = await Tesseract.recognize(imagePath, "eng", {
          logger: (info) => console.log(info),
        });
        return text;
      })
    );

    res.json({ text: texts.join("\n") });
  } catch (error) {
    console.error(error);
    res.status(500).send("Error processing file.");
  } finally {
    // Always remove temporary files, including after a failure. A cleanup
    // problem is logged rather than thrown because the response is already sent.
    for (const filePath of [tempPdfPath, ...imagePaths]) {
      try {
        fs.unlinkSync(filePath);
      } catch (cleanupError) {
        // ENOENT just means the file was never created or is already gone.
        if (cleanupError.code !== "ENOENT") {
          console.error("Error removing temporary file:", filePath, cleanupError);
        }
      }
    }
  }
});

// Catch-all registered after the routes: any request that reaches it gets a 404.
app.use(function handleNotFound(req, res) {
  res.status(404);
  res.send("Not Found");
});

// Start the HTTP server and log its local URL once it is listening.
app.listen(port, function onListening() {
  console.log(`Server running at http://localhost:${port}`);
});
Enter fullscreen mode Exit fullscreen mode

Top comments (0)