Introduction
This article shows how to compare the contents of an Excel file with the names and contents of PDF files, so that you can automatically determine, using Node.js, whether a name in the Excel file appears in a PDF's file name or in its contents.
Prerequisites
nodejs
Most of the commands below are Linux commands, so if you use Windows you can use WSL to run the same commands.
An example Excel file that we will compare against the PDFs: excel file
An example PDF folder to compare against: example pdf
Step 1 - Let's download all the dependencies and see what's inside the PDF folder and what's inside the Excel file
npm install -g yarn
yarn add xlsx
This is the content of the Excel file that will be compared against the PDF files
- This is the content of the PDF folder
- This is the content of the PDF file cat.pdf
So the goal here is to compare the Excel data with the name and the contents of each PDF, so that we know which PDFs correspond to each Excel entry.
Step 2 - Let's code
First of all, create an index.mjs file (because we are going to use the ES6 import syntax), then copy and paste the code below:
import XLSX from 'xlsx';
import fs from 'fs';
import path from 'path';
import pdf from './pdf-parse/index.js';
// Load the Excel workbook whose rows are compared against the PDF files.
const workbook = XLSX.readFile('./example_excel.xlsx');
// ES modules have no built-in __dirname. path.resolve() with no arguments
// returns the current working directory, so this is the directory the script
// is started from (not necessarily the directory that contains this file).
const __dirname = path.resolve();
/**
 * Collects the names to look for from the first column of a worksheet.
 *
 * The first row is treated as a header and ignored. Rows whose first cell is
 * missing or blank are skipped: there is nothing to compare, and an empty
 * search string would "match" every PDF.
 *
 * @param {object} sheet - Worksheet object taken from `workbook.Sheets`.
 * @returns {Array<{name: *, pdf_file: null}>} One entry per usable row; `name`
 *   keeps the cell value exactly as read from the sheet.
 */
function readSheetNames(sheet) {
  // header: 1 makes every row an array of cell values.
  const rows = XLSX.utils.sheet_to_json(sheet, { header: 1 });
  const entries = [];
  // slice (not splice) so the parsed rows are left untouched.
  for (const row of rows.slice(1)) {
    const name = row[0];
    if (name === undefined || name === null || String(name).trim() === '') {
      continue;
    }
    entries.push({ name, pdf_file: null });
  }
  return entries;
}

/**
 * Reads every `.pdf` file in a directory and extracts its text.
 *
 * Entries without a `.pdf` extension (case-insensitive) are ignored so that a
 * stray file in the folder does not abort the whole run inside the PDF parser.
 * Files are parsed one after another.
 *
 * @param {string} pdfDirectory - Directory that contains the PDF files.
 * @returns {Promise<Array<{name: string, text: string}>>} Lower-cased file name
 *   and lower-cased text (line breaks replaced by spaces) for each PDF.
 */
async function loadPdfEntries(pdfDirectory) {
  const pdfEntries = [];
  for (const fileName of fs.readdirSync(pdfDirectory)) {
    if (path.extname(fileName).toLowerCase() !== '.pdf') {
      continue;
    }
    const dataBuffer = fs.readFileSync(path.join(pdfDirectory, fileName));
    const parsed = await pdf(dataBuffer);
    pdfEntries.push({
      name: fileName.toLowerCase(),
      text: parsed.text.trim().split('\n').join(' ').toLowerCase(),
    });
  }
  return pdfEntries;
}

/**
 * Finds, for every name, the PDFs whose file name or text contains it.
 *
 * The comparison is a case-insensitive substring match. Names are converted
 * with String() first so that numeric cells can be compared as well.
 *
 * @param {Array<{name: *}>} entries - Names read from the sheet.
 * @param {Array<{name: string, text: string}>} pdfEntries - Already lower-cased
 *   PDF names and texts.
 * @returns {Array<{name: *, pdf_file: (string[]|null)}>} New entries where
 *   `pdf_file` lists the matching PDF names, or is null when nothing matched.
 */
function matchNamesToPdfs(entries, pdfEntries) {
  return entries.map((entry) => {
    const needle = String(entry.name).toLowerCase();
    const matches = pdfEntries
      .filter((pdfEntry) => pdfEntry.name.includes(needle) || pdfEntry.text.includes(needle))
      .map((pdfEntry) => pdfEntry.name);
    return { name: entry.name, pdf_file: matches.length > 0 ? matches : null };
  });
}

/**
 * Compares the names in the first column of the workbook's worksheet with the
 * PDFs in the `example_pdf` folder and writes the result to `compared_pdf.xlsx`.
 *
 * The worksheet named "Sheet1" is used; if the workbook has no such sheet, the
 * first worksheet is used instead. The output file has one sheet,
 * "compared_pdf", with a `name` column and a `pdf_file` column holding the
 * JSON-encoded list of matching PDF file names ("null" when nothing matched).
 *
 * @param {object} workbook - Workbook as returned by `XLSX.readFile`.
 * @returns {Promise<void>} Resolves once the output file has been written;
 *   rejects if reading the folder, parsing a PDF, or writing the file fails.
 */
let to_json = async function to_json(workbook) {
  let sheet = workbook.Sheets["Sheet1"];
  if (!sheet) {
    sheet = workbook.Sheets[workbook.SheetNames[0]];
  }

  const entries = readSheetNames(sheet);
  const pdfEntries = await loadPdfEntries(path.join(__dirname, 'example_pdf'));
  const matched = matchNamesToPdfs(entries, pdfEntries);

  // A cell cannot hold an array, so the list of matches is stored as JSON text.
  const outputRows = matched.map((entry) => ({
    name: entry.name,
    pdf_file: JSON.stringify(entry.pdf_file),
  }));

  const outputWorkbook = XLSX.utils.book_new();
  const outputSheet = XLSX.utils.json_to_sheet(outputRows);
  XLSX.utils.book_append_sheet(outputWorkbook, outputSheet, "compared_pdf");
  XLSX.writeFile(outputWorkbook, "compared_pdf.xlsx");
};
// to_json returns a promise: handle a rejection explicitly so that a failure
// (missing folder, unreadable PDF, write error, ...) is printed and the process
// ends with a non-zero exit code instead of leaving the promise floating.
to_json(workbook).catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Run it using node index.mjs
You will get a new xlsx file named compared_pdf.xlsx, which lists the PDF files that correspond to each entry from the Excel file, like this:
this is an image before we compare to our pdf
this is an image after we compare it to our pdf folder
As you can see, the pdf_file column contains an array of PDF file names. It is an array because several PDFs may have the same name or content as an Excel entry; each match is pushed to that array so that we can later analyze what's inside those PDFs as well.
Conclusion
With this, we have learned how to compare PDF files to an Excel file using Node.js. For this example I just used simple PDF files with simple text in them; you can add regex or some other text processing to get better results. You can clone the GitHub repository below:
gakpenting / compare-excel-to-pdf
this repo will compare your pdf to excel automatically
I create article here to explain the code https://dev.to/spiritbro1/how-to-check-for-a-text-in-an-excel-file-with-nodejs-127j
Top comments (0)