What will be scraped
Full code
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
const searchString = "star wars"; // what we want to search
// Shared request settings read by the scraping functions below.
const requestParams = {
  baseURL: `https://www.youtube.com`,
  // encodeURIComponent (unlike encodeURI) also escapes "&", "=", "+", "#" and "?",
  // so such characters in the query cannot break or truncate the search_query parameter.
  encodedQuery: encodeURIComponent(searchString),
};
/**
 * Scrolls every element matching `scrollElements` into view, one by one, and
 * repeats for as long as new matching elements keep appearing on the page.
 *
 * @param {object} page - Puppeteer page; `page.evaluate()` is required,
 *   `page.waitForTimeout()` is used when it exists.
 * @param {string} scrollElements - CSS selector of the elements to scroll through.
 * @param {object} [options]
 * @param {number} [options.stepDelay=200] - pause (ms) before each element is scrolled into view.
 * @param {number} [options.settleDelay=5000] - pause (ms) before re-counting the elements.
 * @returns {Promise<void>} resolves once a pass ends without the element count changing.
 */
async function scrollPage(page, scrollElements, { stepDelay = 200, settleDelay = 5000 } = {}) {
  // page.waitForTimeout() was removed in Puppeteer v22, so fall back to a plain timer.
  const wait = (ms) =>
    typeof page.waitForTimeout === "function"
      ? page.waitForTimeout(ms)
      : new Promise((resolve) => setTimeout(resolve, ms));
  const countElements = () =>
    page.evaluate((selector) => document.querySelectorAll(selector).length, scrollElements);

  let currentElement = 0;
  let elementsLength = await countElements();
  while (true) {
    // currentElement is never reset, so each pass only visits elements added since the last one.
    for (; currentElement < elementsLength; currentElement++) {
      await wait(stepDelay);
      await page.evaluate(
        (index, selector) => {
          // The node may have disappeared between counting and scrolling.
          document.querySelectorAll(selector)[index]?.scrollIntoView();
        },
        currentElement,
        scrollElements
      );
    }
    await wait(settleDelay);
    const newElementsLength = await countElements();
    if (newElementsLength === elementsLength) break;
    elementsLength = newElementsLength;
  }
}
/**
 * Collects the data of every video result currently rendered on the search page.
 *
 * A field whose element is not found is left `undefined`; links are only built
 * when the anchor actually has an href.
 *
 * @param {object} page - Puppeteer page (only `page.evaluate()` is used).
 * @returns {Promise<object[]>} one object per `#contents > ytd-video-renderer` element.
 */
async function fillDataFromPage(page) {
  const dataFromPage = await page.evaluate((baseURL) => {
    // The helpers are declared inside the callback because page.evaluate() runs it in the page.
    const getText = (node) => node?.textContent.trim();
    const getLink = (node) => {
      const href = node?.getAttribute("href");
      // Without this check a missing anchor produced "https://www.youtube.comundefined".
      return href ? new URL(href, baseURL).href : undefined;
    };
    return Array.from(document.querySelectorAll("#contents > ytd-video-renderer")).map((el) => {
      const metadata = el.querySelectorAll("#metadata-line > span");
      return {
        title: getText(el.querySelector("a#video-title")),
        link: getLink(el.querySelector("a#thumbnail")),
        channel: {
          name: getText(el.querySelector("#channel-info #channel-name a")),
          link: getLink(el.querySelector("#channel-info > a")),
          thumbnail: el.querySelector("#channel-info > a #img")?.getAttribute("src"),
        },
        publishedDate: getText(metadata[1]),
        views: getText(metadata[0]),
        length: getText(el.querySelector("span.ytd-thumbnail-overlay-time-status-renderer")),
        description: getText(el.querySelector(".metadata-snippet-container > yt-formatted-string")),
        extensions: Array.from(el.querySelectorAll("#badges .badge")).map((badge) => getText(badge.querySelector("span"))),
        thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
      };
    });
  }, requestParams.baseURL);
  return dataFromPage;
}
/**
 * Launches the browser, opens the YouTube search page for the configured query,
 * scrolls through the results and returns the parsed videos.
 *
 * @returns {Promise<object[]>} the value returned by fillDataFromPage().
 * @throws rethrows any navigation or scraping error after the browser has been closed.
 */
async function getYoutubeOrganicResults() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  try {
    const page = await browser.newPage();
    const URL = `${requestParams.baseURL}/results?search_query=${requestParams.encodedQuery}`;
    page.setDefaultNavigationTimeout(60000);
    await page.goto(URL);
    const scrollElements = "#contents > ytd-video-renderer";
    await page.waitForSelector(scrollElements);
    await scrollPage(page, scrollElements);
    // Extra pause kept from the original flow (presumably lets late-loading content settle).
    // page.waitForTimeout() no longer exists in Puppeteer v22+, so fall back to a plain timer.
    if (typeof page.waitForTimeout === "function") {
      await page.waitForTimeout(10000);
    } else {
      await new Promise((resolve) => setTimeout(resolve, 10000));
    }
    return await fillDataFromPage(page);
  } finally {
    // Always release the browser process, even when a step above throws.
    await browser.close();
  }
}
// Print the results; report a failed run instead of leaving the rejection unhandled.
getYoutubeOrganicResults()
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
Preparation
First, we need to create a Node.js* project and add npm
packages puppeteer
, puppeteer-extra
and puppeteer-extra-plugin-stealth
to control Chromium (or Chrome, or Firefox, but now we work only with Chromium which is used by default) over the DevTools Protocol in headless or non-headless mode.
To do this, in the directory with our project, open the command line and enter npm init -y
, and then npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth
.
*If you don't have Node.js installed, you can download it from nodejs.org and follow the installation documentation.
📌Note: you can also use puppeteer
without any extensions, but I strongly recommend using it with puppeteer-extra
and puppeteer-extra-plugin-stealth
to prevent websites from detecting that you are using headless Chromium or a web driver. You can check this on the Chrome headless tests website. The screenshot below shows the difference.
Process
The SelectorGadget Chrome extension was used to grab CSS selectors by clicking on the desired element in the browser. If you have trouble understanding this, we have a dedicated Web Scraping with CSS Selectors blog post at SerpApi.
The Gif below illustrates the approach of selecting different parts of the results.
Code explanation
Declare constants from required libraries:
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
Code | Explanation |
---|---|
puppeteer |
Chromium control library |
StealthPlugin |
library that prevents websites from detecting that you are using a web driver |
Next, we tell puppeteer
to use StealthPlugin
and define what we want to search for:
// Register the stealth plugin on the puppeteer-extra instance before any browser is launched.
puppeteer.use(StealthPlugin());
// The text we want to search for on YouTube.
const searchString = "star wars";
Next, we write down a function for scrolling page:
/**
 * Scrolls every element matching `scrollElements` into view, one by one, and
 * repeats for as long as new matching elements keep appearing on the page.
 *
 * @param {object} page - Puppeteer page; `page.evaluate()` is required,
 *   `page.waitForTimeout()` is used when it exists.
 * @param {string} scrollElements - CSS selector of the elements to scroll through.
 * @param {object} [options]
 * @param {number} [options.stepDelay=200] - pause (ms) before each element is scrolled into view.
 * @param {number} [options.settleDelay=5000] - pause (ms) before re-counting the elements.
 * @returns {Promise<void>} resolves once a pass ends without the element count changing.
 */
async function scrollPage(page, scrollElements, { stepDelay = 200, settleDelay = 5000 } = {}) {
  // page.waitForTimeout() was removed in Puppeteer v22, so fall back to a plain timer.
  const wait = (ms) =>
    typeof page.waitForTimeout === "function"
      ? page.waitForTimeout(ms)
      : new Promise((resolve) => setTimeout(resolve, ms));
  const countElements = () =>
    page.evaluate((selector) => document.querySelectorAll(selector).length, scrollElements);

  let currentElement = 0;
  let elementsLength = await countElements();
  while (true) {
    // currentElement is never reset, so each pass only visits elements added since the last one.
    for (; currentElement < elementsLength; currentElement++) {
      await wait(stepDelay);
      await page.evaluate(
        (index, selector) => {
          // The node may have disappeared between counting and scrolling.
          document.querySelectorAll(selector)[index]?.scrollIntoView();
        },
        currentElement,
        scrollElements
      );
    }
    await wait(settleDelay);
    const newElementsLength = await countElements();
    if (newElementsLength === elementsLength) break;
    elementsLength = newElementsLength;
  }
}
Code | Explanation |
---|---|
elementsLength |
number of elements on the page before scrolling |
page.evaluate() |
runs code from the brackets in the browser console and returns the result |
document.querySelectorAll(scrollElements) |
returns a static NodeList of the document's elements that match the CSS selector passed in the scrollElements argument |
page.waitForTimeout(200) |
waits 200 ms before continuing |
newElementsLength |
number of elements on the page after scrolling |
Next, we write down a function for getting organic results data from the search page:
/**
 * Collects the data of every video result currently rendered on the search page.
 *
 * A field whose element is not found is left `undefined`; links are only built
 * when the anchor actually has an href.
 *
 * @param {object} page - Puppeteer page (only `page.evaluate()` is used).
 * @returns {Promise<object[]>} one object per `#contents > ytd-video-renderer` element.
 */
async function fillDataFromPage(page) {
  const dataFromPage = await page.evaluate((baseURL) => {
    // The helpers are declared inside the callback because page.evaluate() runs it in the page.
    const getText = (node) => node?.textContent.trim();
    const getLink = (node) => {
      const href = node?.getAttribute("href");
      // Without this check a missing anchor produced "https://www.youtube.comundefined".
      return href ? new URL(href, baseURL).href : undefined;
    };
    return Array.from(document.querySelectorAll("#contents > ytd-video-renderer")).map((el) => {
      const metadata = el.querySelectorAll("#metadata-line > span");
      return {
        title: getText(el.querySelector("a#video-title")),
        link: getLink(el.querySelector("a#thumbnail")),
        channel: {
          name: getText(el.querySelector("#channel-info #channel-name a")),
          link: getLink(el.querySelector("#channel-info > a")),
          thumbnail: el.querySelector("#channel-info > a #img")?.getAttribute("src"),
        },
        publishedDate: getText(metadata[1]),
        views: getText(metadata[0]),
        length: getText(el.querySelector("span.ytd-thumbnail-overlay-time-status-renderer")),
        description: getText(el.querySelector(".metadata-snippet-container > yt-formatted-string")),
        extensions: Array.from(el.querySelectorAll("#badges .badge")).map((badge) => getText(badge.querySelector("span"))),
        thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
      };
    });
  }, requestParams.baseURL);
  return dataFromPage;
}
Code | Explanation |
---|---|
Array.from() |
this method creates a new, shallow-copied Array instance from an iterable or array-like object |
el.querySelector("a#video-title") |
returns the first descendant of the el HTML element that matches the selector a#video-title |
.textContent |
gets the raw text of html element |
.trim() |
removes whitespace from both ends of a string |
.getAttribute("href") |
gets the href attribute value of the html element |
And finally, a function to control the browser, and get information:
/**
 * Launches the browser, opens the YouTube search page for the configured query,
 * scrolls through the results and returns the parsed videos.
 *
 * @returns {Promise<object[]>} the value returned by fillDataFromPage().
 * @throws rethrows any navigation or scraping error after the browser has been closed.
 */
async function getYoutubeOrganicResults() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  try {
    const page = await browser.newPage();
    const URL = `${requestParams.baseURL}/results?search_query=${requestParams.encodedQuery}`;
    page.setDefaultNavigationTimeout(60000);
    await page.goto(URL);
    const scrollElements = "#contents > ytd-video-renderer";
    await page.waitForSelector(scrollElements);
    await scrollPage(page, scrollElements);
    // Extra pause kept from the original flow (presumably lets late-loading content settle).
    // page.waitForTimeout() no longer exists in Puppeteer v22+, so fall back to a plain timer.
    if (typeof page.waitForTimeout === "function") {
      await page.waitForTimeout(10000);
    } else {
      await new Promise((resolve) => setTimeout(resolve, 10000));
    }
    return await fillDataFromPage(page);
  } finally {
    // Always release the browser process, even when a step above throws.
    await browser.close();
  }
}
// Print the results; report a failed run instead of leaving the rejection unhandled.
getYoutubeOrganicResults()
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
Code | Explanation |
---|---|
puppeteer.launch({options}) |
this method launches a new instance of the Chromium browser with current options
|
headless |
defines which mode to use: headless (by default) or non-headless |
args |
an array with arguments which is used with Chromium |
["--no-sandbox", "--disable-setuid-sandbox"] |
we use these arguments to allow the browser process to launch in the online IDE |
browser.newPage() |
this method opens a new page (tab) in the browser |
page.setDefaultNavigationTimeout(60000) |
changes the default navigation timeout (30 sec) to 60000 ms (1 min), to allow for a slow internet connection |
page.goto(URL) |
navigation to URL which is defined above |
browser.close() |
finally, we close the browser instance |
Now we can launch our parser. To do this enter node YOUR_FILE_NAME
in your command line. Where YOUR_FILE_NAME
is the name of your .js
file.
Output
[
{
"title": "Star Wars Battlefront 2 - Funny Moments Order #66",
"link": "https://www.youtube.com/watch?v=LquShRk_3sw",
"channel": {
"name": "Jongo Phett",
"link": "https://www.youtube.com/c/JongoPhett",
"thumbnail": "https://yt3.ggpht.com/ytc/AKedOLR-k_Ubr0aJgzNu91jAQCc-vnCOpyIkASWxIbm7rQ=s68-c-k-c0x00ffffff-no-rj"
},
"publishedDate": "16 hours ago",
"views": "12K views",
"length": "10:39",
"description": "episode 66 of Star Wars Battlefront 2 Funny Moments, a montage of the funniest star wars clips in battlefront II. edited together by ...",
"extensions": ["New"],
"thumbnail": "https://i.ytimg.com/vi/LquShRk_3sw/hq720.jpg?sqp=-oaymwEcCOgCEMoBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLAFNjI-rEeq5mmDL6I4nXgxZyId3Q"
},
{
"title": "The Most Powerful Character In All of Star Wars",
"link": "https://www.youtube.com/watch?v=JTTv8mmxoTE",
"channel": {
"name": "The Stupendous Wave",
"link": "https://www.youtube.com/c/TheStupendousWave",
"thumbnail": "https://yt3.ggpht.com/ytc/AKedOLQ0T0u6VqryQ-Z5efb1qVTcUHthiH8EamJMKDAE=s68-c-k-c0x00ffffff-no-rj"
},
"publishedDate": "10 hours ago",
"views": "29K views",
"length": "12:22",
"description": "For all sponsorship and business inquiries please contact: thestupendousscrub@gmail.com Business: ...",
"extensions": ["New"],
"thumbnail": "https://i.ytimg.com/vi/JTTv8mmxoTE/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBdHe_wWjGruLfxz1acr-3jP0YltA"
}
...and other results
]
YouTube Video Results API
Alternatively, you can use the YouTube Video Results API from SerpApi.
The difference is that you get the same results without browser automation, which saves time. You also don't need to write a parser from scratch or maintain CSS selectors, which can change over time, and you avoid the risk that at some point a request gets blocked as suspicious. Instead, you just need to iterate over the structured JSON and get the data you want.
First, we need to install google-search-results-nodejs
. To do this you need to enter in your console: npm i google-search-results-nodejs
const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com
const searchString = "star wars"; // the text we want to search for
// Request parameters handed to search.json(); getResults() later stores the
// pagination token of the next page on this same object as `sp`.
const params = {
engine: "youtube", // which SerpApi search engine to query
search_query: searchString, // the search query itself
};
/**
 * Promise adapter around the callback-style `search.json()`.
 *
 * @returns {Promise<object>} resolves with whatever `search.json()` passes to its callback
 *   for the current `params`.
 */
const getJson = () => {
  const pendingJson = new Promise((onJson) => search.json(params, onJson));
  return pendingJson;
};
/**
 * Fetches every page of video results by following `serpapi_pagination.next_page_token`.
 *
 * getJson() reads the shared `params` object, so the token of the next page is
 * written to `params.sp` while paging and the previous state of `params.sp` is
 * restored afterwards; otherwise a second call would start from the last page.
 *
 * @returns {Promise<object[]>} the `video_results` of all pages, in page order.
 */
const getResults = async () => {
  const allVideos = [];
  const hadToken = "sp" in params;
  const initialToken = params.sp;
  try {
    while (true) {
      const json = await getJson();
      if (!json.video_results) break;
      allVideos.push(...json.video_results);
      const nextPageToken = json.serpapi_pagination?.next_page_token;
      if (!nextPageToken) break;
      params.sp = nextPageToken;
    }
  } finally {
    // Leave the shared request parameters exactly as we found them.
    if (hadToken) {
      params.sp = initialToken;
    } else {
      delete params.sp;
    }
  }
  return allVideos;
};
// Print the results; report a failed run instead of leaving the rejection unhandled.
getResults()
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
Code explanation
Declare constants from required libraries:
const SerpApi = require("google-search-results-nodejs");
// The key is read from the API_KEY environment variable, as in the full listing;
// a bare API_KEY identifier is not declared anywhere and would throw a ReferenceError.
const search = new SerpApi.GoogleSearch(process.env.API_KEY); // your API key from serpapi.com
Code | Explanation |
---|---|
SerpApi |
SerpApi Node.js library |
search |
new instance of GoogleSearch class |
API_KEY |
your API key from SerpApi |
Next, we write down what we want to search and the necessary parameters for making a request:
// The text we want to search for.
const searchString = "star wars";
// Request parameters handed to search.json().
const params = {
// which SerpApi search engine to query
engine: "youtube",
// the search query itself
search_query: searchString,
};
Code | Explanation |
---|---|
searchString |
what we want to search |
engine |
search engine |
search_query |
search query |
Next, we wrap the search method from the SerpApi library in a promise to further work with the search results:
/**
 * Promise adapter around the callback-style `search.json()`.
 *
 * @returns {Promise<object>} resolves with whatever `search.json()` passes to its callback
 *   for the current `params`.
 */
const getJson = () => {
  const pendingJson = new Promise((onJson) => search.json(params, onJson));
  return pendingJson;
};
And finally, we declare and run the function getResults
that gets the video info from all pages and returns it:
/**
 * Fetches every page of video results by following `serpapi_pagination.next_page_token`.
 *
 * getJson() reads the shared `params` object, so the token of the next page is
 * written to `params.sp` while paging and the previous state of `params.sp` is
 * restored afterwards; otherwise a second call would start from the last page.
 *
 * @returns {Promise<object[]>} the `video_results` of all pages, in page order.
 */
const getResults = async () => {
  const allVideos = [];
  const hadToken = "sp" in params;
  const initialToken = params.sp;
  try {
    while (true) {
      const json = await getJson();
      if (!json.video_results) break;
      allVideos.push(...json.video_results);
      const nextPageToken = json.serpapi_pagination?.next_page_token;
      if (!nextPageToken) break;
      params.sp = nextPageToken;
    }
  } finally {
    // Leave the shared request parameters exactly as we found them.
    if (hadToken) {
      params.sp = initialToken;
    } else {
      delete params.sp;
    }
  }
  return allVideos;
};
// Print the results; report a failed run instead of leaving the rejection unhandled.
getResults()
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
Code | Explanation |
---|---|
allVideos |
an array with videos info from all pages |
allVideos.push(...json.video_results) |
here we use spread syntax to expand the video_results array from the result returned by the getJson function into individual elements and append them to the end of the allVideos array |
Output
[
{
"position_on_page": 1,
"title": "LEGO Star Wars Sets I would DIE FOR! (Part 6)",
"link": "https://www.youtube.com/watch?v=CGAYy_lqyHk",
"channel": {
"name": "LEGO Empire",
"link": "https://www.youtube.com/c/LEGOEmpireOfficial",
"thumbnail": "https://yt3.ggpht.com/wJNsooqsZmzGudM2Z0jJ4umj6I9aeHdUfBI8J3d4uC98qKbU5UT6isxzQ-yw5bQ9rBBSo_SY0Eg=s68-c-k-c0x00ffffff-no-rj"
},
"published_date": "2 days ago",
"views": 10024,
"length": "5:40",
"description": "LEGO Star Wars Sets I would DIE FOR, Part 6, is HERE! These sets are 18+ LEGO star wars fans dreams come true! Credits to ...",
"extensions": ["New"],
"thumbnail": {
"static": "https://i.ytimg.com/vi/CGAYy_lqyHk/hq720.jpg?sqp=-oaymwEcCOgCEMoBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLAudls7QsD4J6_dHDclXgm9Z0lyVg"
}
},
{
"position_on_page": 2,
"title": "LEGO Star Wars 75335 BD-1 Review! (2022)",
"link": "https://www.youtube.com/watch?v=Gega7LfS658",
"channel": {
"name": "MandRproductions",
"link": "https://www.youtube.com/c/MandRproductions",
"verified": true,
"thumbnail": "https://yt3.ggpht.com/Jrnvzkoi6Hga08KaZ5Z1N99lmBMFWDMtEGgCHGV6O-m3IYdlJN2gKZV68OVBYVQM2YT8Fq3L=s68-c-k-c0x00ffffff-no-rj"
},
"published_date": "3 days ago",
"views": 45876,
"length": "7:38",
"description": "LEGO Star Wars Jedi: Fallen Order sets were thought to be impossible. Enter the 75335 BD-1 Buildable Character Summer 2022 ...",
"extensions": ["New"],
"thumbnail": {
"static": "https://i.ytimg.com/vi/Gega7LfS658/hq720.jpg?sqp=-oaymwEcCOgCEMoBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLCX1tTbkRxhZvSw-O5i5Prsp4x8Hw",
"rich": "https://i.ytimg.com/an_webp/Gega7LfS658/mqdefault_6s.webp?du=3000&sqp=CM6FhJcG&rs=AOn4CLCGISNuZXlgmZVlZmnlEIAZmGE8jA"
}
},
...and other results
]
Links
If you want to see some projects made with SerpApi, please write me a message.
Add a Feature Request💫 or a Bug🐛
Top comments (0)