Puppeteer is a Node.js library which provides a high-level API to control Chrome over the DevTools Protocol.
It can do a lot:
- Crawl data from webapps
- Generate screenshots and PDFs of pages
- Create an up-to-date, automated testing environment. Run your tests directly in the latest version of Chrome using the latest JavaScript and browser features.
- Automate form submission, UI testing, keyboard input, etc.
Let's walk through a tutorial on automation and crawling.
Puppeteer setup
It helps to see what a basic script looks like before using the library. Puppeteer relies on promises, so everything is used with async/await:
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://example.com');
    await page.screenshot({ path: 'example.png' });
    await browser.close();
})();
We call the puppeteer.launch() method to open a browser for us,
then ask it to open a new page where we'll do all our work,
and use goto() to navigate to a website.
Most of the methods we'll use are provided by the page object.
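For instance, the same page object can also save a page as a PDF or fill in a form. Here is a minimal sketch; the '#search' and '#submit' selectors are hypothetical placeholders, not from a real site, so those lines are commented out:

const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://example.com');
    // Save the page as an A4 PDF (PDF generation works in headless mode)
    await page.pdf({ path: 'example.pdf', format: 'A4' });
    // Form automation would look like this (hypothetical selectors):
    // await page.type('#search', 'puppeteer');
    // await page.click('#submit');
    await browser.close();
})();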
Crawling data
If we want to crawl data, we need to know what we're dealing with: we work with the DOM, so we select elements by their IDs, tags, classes, or attributes, as shown in the sketch below.
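For example, Puppeteer's page.$eval() and page.$$eval() run a CSS selector inside the page and hand the matched element(s) to a callback. A minimal sketch against example.com; the 'h1' and 'a[href]' selectors just illustrate selecting by tag and by attribute, and ids or classes work the same way:

const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://example.com');
    // Select by tag: the text of the first <h1> on the page
    const heading = await page.$eval('h1', el => el.textContent);
    // Select by attribute: the href of every <a href="..."> on the page
    const links = await page.$$eval('a[href]', els => els.map(a => a.href));
    // Selecting by id or class works the same way, e.g. page.$eval('#title', ...) or page.$eval('.price', ...)
    console.log(heading, links);
    await browser.close();
})();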
Now let's set up the environment:
mkdir book-scraper
cd book-scraper
npm init -y
npm install --save puppeteer
Set up the browser instance
When you open a browser you can interact with it by clicking, typing, and so on. Headless Chrome does the same things, but programmatically.
We will need four .js files:
- browser.js
- main.js
- pageScraper.js
- pageController.js
Let's start with browser.js:
const pupt = require('puppeteer');

async function startBrowser(){
    let browser;
    try {
        browser = await pupt.launch({
            headless: false, // show the browser UI (the default is headless: true)
            ignoreHTTPSErrors: true
        });
    } catch (err) {
        console.error(err);
    }
    return browser;
}

module.exports = {
    startBrowser
};
Setting headless: false means you will see the browser interface while your script is running, so you can watch what it does.
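While debugging with the UI visible, you can also pass Puppeteer's slowMo launch option to pause between operations so the actions are easier to follow. For example, the launch() call inside startBrowser() in browser.js could become (100 ms is an arbitrary value):

browser = await pupt.launch({
    headless: false,        // show the browser UI
    slowMo: 100,            // pause ~100 ms between Puppeteer operations (debugging only)
    ignoreHTTPSErrors: true
});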
Now let's write the main.js file to run our browser instance:
const browser = require('./browser');
const scraperController = require('./pageController'); // we will create it later

let browserInstance = browser.startBrowser();
// We want to control this browser, so we pass the instance
// to a controller function that we'll write next.
scraperController(browserInstance);
Next, let's write the pageController.js file to see how we interact with our browser:
const pageScraper = require('./pageScraper');

async function scrapeAll(browserInstance){
    let browser;
    try {
        browser = await browserInstance;
        await pageScraper.scraper(browser);
    } catch (err) {
        console.error("There's a problem with browser Instance => ", err);
    }
}

module.exports = (browserInstance) => scrapeAll(browserInstance);
We export a function that takes the browserInstance as an argument.
Now let's create the pageScraper.js file to scrape the data:
const scraperObject = {
    url: 'http://books.toscrape.com',
    async scraper(browser){
        let page = await browser.newPage();
        console.log(`Navigating to ${this.url}...`);
        await page.goto(this.url);
    }
}

module.exports = scraperObject;
This is the basic structure for building your web crawler.
You can clone it from GitHub: pupt starter
Scrape the data from a page
First, if you inspect the source code for the homepage using the Dev Tools inside your browser, you will notice that the page lists each book’s data under a section tag. Inside the section tag every book is under a list (li) tag, and it is here that you find the link to the book’s dedicated page, the price, and the in-stock availability.
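In selector terms, that structure maps to the selectors we'll use in the code below:

// Selectors used below, matching the structure described above
// 'section ol > li'        -> one <li> per book inside the listing section
// 'h3 > a'                 -> the link to the book's dedicated page
// '.price_color'           -> the book's price
// '.instock.availability'  -> the in-stock availability text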
Let's edit our files. Reopen the pageScraper.js file:
const scraperObject = {
    url: 'http://books.toscrape.com',
    async scraper(browser){
        let page = await browser.newPage();
        console.log(`Navigating to ${this.url}...`);
        await page.goto(this.url);
        // Wait for the DOM to render
        await page.waitForSelector('.page_inner');
        // Get the links to all the listed books
        let urls = await page.$$eval('section ol > li', links => {
            // Keep only the books that are in stock
            links = links.filter(link => link.querySelector('.instock.availability').textContent.includes('In stock'));
            // Extract the link to each book's page
            links = links.map(el => el.querySelector('h3 > a').href);
            return links;
        });
        console.log(urls);
    }
}

module.exports = scraperObject;
Run this command:
node main.js
You should see the list of book URLs printed in the console.
Now let's scrape all the data about the books.
Modify the pageScraper.js file:
const scraperObject = {
    url: 'http://books.toscrape.com',
    async scraper(browser){
        let page = await browser.newPage();
        console.log(`Navigating to ${this.url}...`);
        // Navigate to the selected page
        await page.goto(this.url);
        // Wait for the required DOM to be rendered
        await page.waitForSelector('.page_inner');
        // Get the link to all the required books
        let urls = await page.$$eval('section ol > li', links => {
            // Make sure the book to be scraped is in stock
            links = links.filter(link => link.querySelector('.instock.availability').textContent.includes('In stock'));
            // Extract the links from the data
            links = links.map(el => el.querySelector('h3 > a').href);
            return links;
        });
        // Loop through each of those links, open a new page instance and get the relevant data from it
        let pagePromise = (link) => new Promise(async (resolve, reject) => {
            let dataObj = {};
            let newPage = await browser.newPage();
            await newPage.goto(link);
            dataObj['bookTitle'] = await newPage.$eval('.product_main > h1', text => text.textContent);
            dataObj['bookPrice'] = await newPage.$eval('.price_color', text => text.textContent);
            dataObj['noAvailable'] = await newPage.$eval('.instock.availability', text => {
                // Strip new line and tab spaces
                text = text.textContent.replace(/(\r\n\t|\n|\r|\t)/gm, "");
                // Get the number of copies available
                let regexp = /^.*\((.*)\).*$/i;
                let stockAvailable = regexp.exec(text)[1].split(' ')[0];
                return stockAvailable;
            });
            dataObj['imageUrl'] = await newPage.$eval('#product_gallery img', img => img.src);
            dataObj['bookDescription'] = await newPage.$eval('#product_description', div => div.nextSibling.nextSibling.textContent);
            dataObj['upc'] = await newPage.$eval('.table.table-striped > tbody > tr > td', table => table.textContent);
            // Close the page before resolving with the scraped data
            await newPage.close();
            resolve(dataObj);
        });
        for (const url of urls){
            let currentPageData = await pagePromise(url);
            // scrapedData.push(currentPageData);
            console.log(currentPageData);
        }
        await browser.close();
    }
}

module.exports = scraperObject;
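If you want to persist the results instead of only logging them, here is one possible sketch (an addition for illustration, not part of the code above): have scraper() push every dataObj into an array and return it, then write that array to a JSON file from pageController.js:

// pageController.js: assumes scraper() is changed to return an array of book objects
const fs = require('fs');
const pageScraper = require('./pageScraper');

async function scrapeAll(browserInstance){
    try {
        const browser = await browserInstance;
        const scrapedData = await pageScraper.scraper(browser);
        // Save the scraped data as pretty-printed JSON
        fs.writeFile('data.json', JSON.stringify(scrapedData, null, 2), (err) => {
            if (err) return console.error(err);
            console.log('Data written to data.json');
        });
    } catch (err) {
        console.error("There's a problem with the browser instance => ", err);
    }
}

module.exports = (browserInstance) => scrapeAll(browserInstance);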
That covers the basics of crawling data from a webpage.
You can find the full code on GitHub: Full code