Puppeteer is awesome for web scraping, automation, and testing. I wanted to get some practice with it, so I made an app that fetches public job ads from kamuilan.sbb.gov.tr and lists them.
Let me explain some puppeteer methods:
launch: Open the browser
newPage: Creates a tab in the browser.
goto(url, { waitUntil: 'networkidle0' }): Navigates to a URL. (waitUntil: 'networkidle0' means navigation is considered finished when there have been no network connections for at least 500 ms.)
waitForSelector: Waits for the selector to appear in the page.
$(): Like document.querySelector(), it selects the first element that matches the selector string.
$$(): Like document.querySelectorAll(), it selects all elements that match the selector string.
$eval(): Selects the first matching element and passes it to the callback function, where we can work with that element.
data:image/s3,"s3://crabby-images/51d6d/51d6d34ad861a8377ab041cd906dfeb90da7ebc6" alt="Image description"
Firstly, let's review the markup of the website that we want to crawl. There is a slider, which is a ul element with the class name slides. The ads (li elements) are inside it. We can grab them.
// Select every `li` inside the `ul.slides` slider; each one is an ad.
const slides = await newPage.$$('ul.slides li');
Backend
index.js
// Load the variables from .env into process.env before anything reads them.
require('dotenv').config();

const express = require('express');
const cors = require('cors');

// Fall back to 3000 when PORT is not set (or is empty) in the environment.
const PORT = process.env.PORT || 3000;
const app = express();

app.use(cors());
app.use(express.json());
// Static files are served from ./public.
app.use(express.static('public'));

// The ads router calls next() without an argument when scraping fails, so the
// handler mounted right after it is what turns that into a 500 response.
// Any other /ads request the router does not answer also ends up here.
app.use(
  '/ads',
  require('./routes/ads'),
  (req, res) => res.status(500).send('Internal server error'),
);

// No space after the colon, so the logged address is a usable URL.
app.listen(PORT, () => console.log(`App started on http://localhost:${PORT}`));
routes/ads.js
We want to grab these elements.
const puppeteer = require('puppeteer');
const router = require('express').Router();
/**
 * Read the fields of one job ad out of a single slider item.
 *
 * The five lookups are independent of each other, so they are issued together.
 * `$eval` rejects when its selector matches nothing, which makes the whole
 * request fail instead of returning a partially filled ad.
 *
 * @param {import('puppeteer').ElementHandle} slide - Handle of an `li` inside `ul.slides`.
 * @returns {Promise<{avatar: string, organasation: string, title: string, dateRange: string, link: string}>}
 */
async function scrapeAd(slide) {
  const [avatar, organasation, title, dateRange, link] = await Promise.all([
    slide.$eval('.avatar img', (el) => el.src),
    slide.$eval('h5 span.black', (el) => el.innerText),
    slide.$eval('h5 span.patrol', (el) => el.innerText),
    slide.$eval('h5 span.h5date', (el) => el.innerText),
    slide.$eval('a', (el) => el.href),
  ]);

  return { avatar, organasation, title, dateRange, link };
}

/**
 * Open the ads page in a fresh browser and collect every ad in the slider.
 *
 * @param {string} adsUrl - Address of the page that contains `ul.slides`.
 * @returns {Promise<Array<object>>} One entry per `li`, in page order.
 */
async function scrapeAds(adsUrl) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(adsUrl, { waitUntil: 'networkidle0' });
    await page.waitForSelector('ul.slides');

    const slides = await page.$$('ul.slides li');
    const ads = [];
    // `const` keeps the loop variable local to this call; a shared global
    // could be overwritten by another request while this one is awaiting.
    for (const slide of slides) {
      ads.push(await scrapeAd(slide));
    }
    return ads;
  } finally {
    // Always release the browser, including when navigation or scraping throws.
    await browser.close();
  }
}

/**
 * GET / - respond with the scraped ads as a JSON array.
 */
router.get('/', async (req, res, next) => {
  try {
    const ads = await scrapeAds(process.env.PUBLIC_ADS_URL);
    return res.status(200).json(ads);
  } catch (err) {
    console.error('Failed to scrape public ads:', err);
    // index.js mounts a plain handler after this router that answers with
    // status 500, so next() is deliberately called without an argument.
    next();
  }
});
module.exports = router;
Results
I use the Thunder Client extension for VS Code.
Frontend
index.html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Public Ads</title>
<style>
/* Links are plain black text that turns red on hover. */
a {
  text-decoration: none;
  color: #000;
}

a:hover {
  color: #ff0000;
}

#publicAdsTable {
  /* `collapse` merges adjacent cell borders; `true` is not a valid value
     for this property, so the declaration used to be ignored. */
  border-collapse: collapse;
  width: 90%;
}

/* Both selectors are scoped to the ads table; a bare `td` would also match
   cells of any other table on the page. */
#publicAdsTable th,
#publicAdsTable td {
  padding: 1rem;
  border: 1px solid #ccc;
}
</style>
</head>
<body>
<h1 id="loading">Loading...</h1>
<table id="publicAdsTable">
  <thead>
    <!-- Header cells have to sit inside a row to be valid table markup. -->
    <tr>
      <!-- The first two columns (row number and image) have no heading. -->
      <th></th>
      <th></th>
      <th>Organasation</th>
      <th>Job Ad Title</th>
      <th>Date Range</th>
      <th>Go</th>
    </tr>
  </thead>
  <!-- Filled in by the page script once the ads have been fetched. -->
  <tbody></tbody>
</table>
<script>
const ADS_ENDPOINT = 'http://localhost:5000/ads';
const publicAdsTable = document.getElementById('publicAdsTable');
const loadingEl = document.getElementById('loading');

/** Tell the user that the ads could not be loaded. */
function alertError() {
  alert('Something went wrong! Please Try Again Later.');
}

/**
 * Build a table cell around a piece of content.
 *
 * @param {string|Node} content - Text (inserted as a text node, never parsed
 *   as HTML) or an already built element.
 * @returns {HTMLTableCellElement}
 */
function createCell(content) {
  const cell = document.createElement('td');
  cell.append(content);
  return cell;
}

/**
 * Build the table row for one ad.
 *
 * The ad fields are scraped from a third-party page, so they are assigned
 * through DOM properties and text nodes instead of being spliced into markup.
 *
 * @param {{avatar: string, organasation: string, title: string, dateRange: string, link: string}} ad
 * @param {number} index - Zero-based position of the ad in the response.
 * @returns {HTMLTableRowElement}
 */
function createAdRow(ad, index) {
  const logo = document.createElement('img');
  logo.src = ad.avatar;
  logo.alt = ad.title;

  const link = document.createElement('a');
  link.href = ad.link;
  // `_blank` opens a new tab for every click; plain `blank` is just a window
  // name, so every ad would be loaded into the same tab.
  link.target = '_blank';
  link.rel = 'noopener noreferrer';
  link.textContent = 'Open Ad';

  const row = document.createElement('tr');
  row.append(
    createCell(String(index + 1)),
    createCell(logo),
    createCell(ad.organasation),
    createCell(ad.title),
    createCell(ad.dateRange),
    createCell(link),
  );
  return row;
}

/**
 * Fetch the ads and render one row per ad into the table body.
 * Any failure (network, non-2xx status, invalid JSON) ends in alertError().
 */
async function loadAds() {
  try {
    const response = await fetch(ADS_ENDPOINT);
    if (!response.ok) {
      throw new Error(`Unexpected response status ${response.status}`);
    }
    const ads = await response.json();

    // Collect the rows in a fragment so the table is updated once instead of
    // re-parsing the whole body for every ad.
    const rows = document.createDocumentFragment();
    ads.forEach((ad, index) => rows.append(createAdRow(ad, index)));
    publicAdsTable.querySelector('tbody').append(rows);
  } catch (err) {
    console.error(err);
    alertError();
  } finally {
    // Hide the banner on failure too, so the page does not claim to be
    // loading forever.
    loadingEl.style.display = 'none';
  }
}

// loadAds() handles its own errors, so the returned promise never rejects.
loadAds();
</script>
</body>
</html>
Results
.env file
# Port the Express server listens on.
PORT=5000
# Page that the /ads route opens with Puppeteer.
PUBLIC_ADS_URL=https://kamuilan.sbb.gov.tr/
package.json
{
"name": "publicads",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"dev": "nodemon index"
},
"author": "",
"license": "ISC",
"dependencies": {
"cors": "^2.8.5",
"dotenv": "^10.0.0",
"express": "^4.17.1",
"puppeteer": "^10.4.0"
}
}
Top comments (0)