Puppeteer is awesome for web scraping, automation, and testing. I wanted to get some practice with it, so I made an app that fetches public job ads from kamuilan.sbb.gov.tr and lists them.
Let me explain some puppeteer methods:
launch: Open the browser
newPage: Creates a tab in the browser.
goto(url, { waitUntil: 'networkidle0' }): Navigates to a URL. (waitUntil: 'networkidle0' means navigation is considered finished when there have been no more than 0 network connections for at least 500 ms.)
waitForSelector: Waits for the selector to appear in the page.
$(): Like document.querySelector(), it selects the first element that matches the selector string.
$$(): Like document.querySelectorAll(), it selects all elements that match the selector string.
$eval(): Selects the first matching element and passes it to the given function, so we can work with that element inside the function.
First, let's review the markup of the website that we want to crawl. There is a slider, which is a ul element with the class name slides, and the ads (li elements) are inside it. We can grab them.
// Select every <li> (one per ad) inside the <ul class="slides"> slider.
const slides = await newPage.$$('ul.slides li');
Backend
index.js
// Entry point: loads environment variables, configures the Express app
// and starts the HTTP server.
require('dotenv').config();

const express = require('express');
const cors = require('cors');

// Port is taken from the environment (set in .env); 3000 is the fallback.
const PORT = process.env.PORT || 3000;

const app = express();

app.use(cors());
app.use(express.json());
app.use(express.static('public'));

app.use(
  '/ads',
  require('./routes/ads'),
  // Reached when the router calls next() without an error, which is how
  // routes/ads.js reports a failed scrape.
  (req, res) => res.status(500).send('Internal server error'),
  // Error-handling middleware (four arguments): covers errors forwarded with
  // next(err) or thrown synchronously, so both paths answer the same way.
  (err, req, res, next) => {
    console.error(err);
    if (res.headersSent) {
      // A response is already in flight; let Express finish/abort it.
      return next(err);
    }
    return res.status(500).send('Internal server error');
  }
);

// No space after the colon, so the logged address is a usable URL.
app.listen(PORT, () => console.log(`App started on http://localhost:${PORT}`));
routes/ads.js
We want to grab these elements.
const puppeteer = require('puppeteer');
const router = require('express').Router();

/**
 * Reads the fields of a single ad out of one slide element.
 *
 * @param {object} slide - Element handle for one `ul.slides li` item.
 * @returns {Promise<{avatar: string, organasation: string, title: string, dateRange: string, link: string}>}
 *   The ad data. The `organasation` key is kept as-is because index.html reads it.
 */
async function extractAd(slide) {
  // The five lookups are independent, so they are issued together.
  const [avatar, organasation, title, dateRange, link] = await Promise.all([
    slide.$eval('.avatar img', el => el.src),
    slide.$eval('h5 span.black', el => el.innerText),
    slide.$eval('h5 span.patrol', el => el.innerText),
    slide.$eval('h5 span.h5date', el => el.innerText),
    slide.$eval('a', el => el.href),
  ]);

  return { avatar, organasation, title, dateRange, link };
}

/**
 * Opens the ads page in a headless browser and collects every ad in the slider.
 *
 * @param {string} adsUrl - Address of the page that contains `ul.slides`.
 * @returns {Promise<object[]>} One object per `ul.slides li` element.
 */
async function scrapeAds(adsUrl) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(adsUrl, { waitUntil: 'networkidle0' });
    await page.waitForSelector('ul.slides');

    const slides = await page.$$('ul.slides li');
    const ads = [];
    // `const` keeps the loop variable local; an undeclared one would be a
    // global shared by concurrent requests.
    for (const slide of slides) {
      ads.push(await extractAd(slide));
    }
    return ads;
  } finally {
    // Always release the browser process, even when scraping fails.
    await browser.close();
  }
}

// GET / -> JSON array of the ads found on PUBLIC_ADS_URL.
router.get('/', async (req, res, next) => {
  try {
    const ads = await scrapeAds(process.env.PUBLIC_ADS_URL);
    return res.status(200).json(ads);
  } catch (err) {
    // Record the cause instead of dropping it silently.
    console.error('Failed to fetch public ads:', err);
    // Fall through to the next handler; index.js mounts one right after this
    // router that answers with a 500.
    next();
  }
});

module.exports = router;
Results
I use the Thunder Client extension for VS Code.
Frontend
index.html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Public Ads</title>
<style>
/* Links: plain black text that turns red on hover. */
a {
  text-decoration: none;
  color: #000;
}

a:hover {
  color: #ff0000;
}

/* border-collapse takes `collapse` or `separate`; `collapse` merges the
   borders of adjacent cells into a single line. */
#publicAdsTable {
  border-collapse: collapse;
  width: 90%;
}

/* Both selectors are scoped to the table so the rule cannot leak to other
   cells on the page. */
#publicAdsTable th,
#publicAdsTable td {
  padding: 1rem;
  border: 1px solid #ccc;
}
</style>
</head>
<body>
<h1 id="loading">Loading...</h1>
<!-- Results table: the header row is static; the page script fills <tbody>
     with one row per ad. -->
<table id="publicAdsTable">
  <thead>
    <!-- Header cells must live inside a row. -->
    <tr>
      <th></th>
      <th></th>
      <th>Organization</th>
      <th>Job Ad Title</th>
      <th>Date Range</th>
      <th>Go</th>
    </tr>
  </thead>
  <tbody></tbody>
</table>
<script>
const publicAdsTable = document.getElementById('publicAdsTable');
const loadingEl = document.getElementById('loading');

/** Tells the user that the ads could not be loaded. */
function alertError() {
  alert('Something went wrong! Please Try Again Later.');
}

/**
 * Builds a table cell.
 * @param {string|Node} content - Text or element to place in the cell.
 *   Strings are inserted as text nodes, never parsed as HTML.
 * @returns {HTMLTableCellElement}
 */
function createCell(content) {
  const cell = document.createElement('td');
  cell.append(content);
  return cell;
}

/**
 * Builds the table row for one ad.
 * @param {{avatar: string, organasation: string, title: string, dateRange: string, link: string}} ad
 * @param {number} index - Zero-based position of the ad in the list.
 * @returns {HTMLTableRowElement}
 */
function createAdRow(ad, index) {
  const avatar = document.createElement('img');
  avatar.src = ad.avatar;
  avatar.alt = ad.title;

  const link = document.createElement('a');
  link.href = ad.link;
  // `_blank` opens a new tab; `noopener noreferrer` keeps the opened page
  // from reaching back into this one.
  link.target = '_blank';
  link.rel = 'noopener noreferrer';
  link.textContent = 'Open Ad';

  const row = document.createElement('tr');
  row.append(
    createCell(String(index + 1)),
    createCell(avatar),
    createCell(ad.organasation),
    createCell(ad.title),
    createCell(ad.dateRange),
    createCell(link)
  );
  return row;
}

/**
 * Fetches the ads from the backend and renders them into the table.
 * Any failure (network, non-2xx status, invalid JSON) ends in alertError().
 */
async function loadAds() {
  try {
    const response = await fetch('http://localhost:5000/ads');
    if (!response.ok) {
      throw new Error(`Request failed with status ${response.status}`);
    }
    const ads = await response.json();

    loadingEl.style.display = 'none';
    // Rows are built as DOM nodes and appended in one call, so scraped text
    // is never interpreted as markup and the table is not re-parsed per row.
    publicAdsTable.querySelector('tbody').append(...ads.map(createAdRow));
  } catch (err) {
    console.error(err);
    // Do not leave "Loading..." on screen after a failure.
    loadingEl.style.display = 'none';
    alertError();
  }
}

loadAds();
</script>
</body>
</html>
Results
.env file
# Port the Express server listens on (read in index.js as process.env.PORT).
PORT=5000
# Page that routes/ads.js opens with Puppeteer to collect the ads.
PUBLIC_ADS_URL = 'https://kamuilan.sbb.gov.tr/'
package.json
{
  "name": "publicads",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "start": "node index",
    "dev": "nodemon index"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "cors": "^2.8.5",
    "dotenv": "^10.0.0",
    "express": "^4.17.1",
    "puppeteer": "^10.4.0"
  },
  "devDependencies": {
    "nodemon": "^2.0.12"
  }
}
Top comments (0)