Mikhail Zub for SerpApi

Posted on Dec 8, 2022

Web scraping Yelp Organic And Ads Results with Nodejs

#career #watercooler

What will be scraped

DIY code

If you don't need an explanation, have a look at the full code example in the online IDE

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

puppeteer.use(StealthPlugin());

const serchQuery = "kfc"; //Parameter defines the query you want to search
const location = "Seattle, WA"; //Parameter defines from where you want the search to originate
const resultsLimit = 50; // hardcoded limit for demonstration purpose

const searchParams = {
  query: encodeURI(serchQuery),
  location: encodeURI(location),
};

const URL = `https://www.yelp.com/search?find_desc=${searchParams.query}&find_loc=${searchParams.location}`;

async function getResultsFromPage(page) {
  return await page.evaluate(() => {
    let isAds = false;
    const adsResults = [];
    const organicResults = [];
    Array.from(document.querySelectorAll("ul > li > div")).forEach((el) => {
      const title = el.querySelector(":scope > h2")?.textContent;
      const result = el.querySelector("[data-testid='serp-ia-card']");
      if (title && title.includes("Sponsored")) isAds = true;
      if (title && !title.includes("Sponsored")) isAds = false;
      if (!title && result) {
        const link = `https://www.yelp.com${result.querySelector("h3 a").getAttribute("href")}`;
        const fullAddress = result.querySelector(`p > span:${isAds ? "last-child" : "nth-child(3)"}`)?.textContent.split(", ");
        const thumbnails = result.querySelector(":scope > div > div:nth-child(1) a > img").getAttribute("srcset").split(", ");
        const bestResolutionThumbnail = thumbnails[thumbnails.length - 1].split(" ")[0];
        const categories = Array.from(result.querySelectorAll("p > span:nth-child(1) > a")).map((el) => {
          if (el.textContent === "more") return null;
          return {
            title: el.textContent,
            link: `https://www.yelp.com${el.getAttribute("href")}`,
          };
        });
        const rating = result.querySelector("span > div[role='img']")?.getAttribute("aria-label").split(" ")?.[0];
        if (isAds) {
          adsResults.push({
            title: result.querySelector("h3 a").textContent,
            link,
            reviewsLink: `${decodeURIComponent(link.slice(link.indexOf("redirect_url") + 13, link.indexOf("&request_id")))}#reviews`,
            categories: categories.filter((el) => el),
            rating,
            reviews: rating && result.querySelector("div > span:nth-child(2)")?.textContent,
            address: fullAddress?.[0],
            neighborhoods: fullAddress?.[1],
            snippet: (
              result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(2) p") ||
              result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(3) p")
            )?.textContent
              .trim()
              .slice(0, -6),
            serviceOptions: Array.from(result.querySelectorAll(":scope > div > div:nth-child(2) > div:last-child li > div > div")).reduce(
              (result, el) => {
                return {
                  ...result,
                  [`${el.querySelector("span > p")?.textContent}`]: el
                    .querySelector("div > span[role='img']")
                    .classList.contains("icon--16-checkmark-v2"),
                };
              },
              {}
            ),
            thumbnail: bestResolutionThumbnail,
          });
        } else {
          organicResults.push({
            title: result.querySelector("h3 a").textContent,
            link,
            reviewsLink: `${link}#reviews`,
            categories: categories.filter((el) => el),
            price: result.querySelector("p > span:nth-child(2)").textContent,
            rating,
            reviews: rating && result.querySelector("div > span:nth-child(2)")?.textContent,
            address: fullAddress?.[0],
            neighborhoods: fullAddress?.[1],
            snippet: (
              result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(2) p") ||
              result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(3) p")
            )?.textContent
              .trim()
              .slice(0, -6),
            serviceOptions: Array.from(result.querySelectorAll(":scope > div > div:nth-child(2) > div:last-child li > div > div")).reduce(
              (result, el) => {
                return {
                  ...result,
                  [`${el.querySelector("span > p")?.textContent}`]: el
                    .querySelector("div > span[role='img']")
                    .classList.contains("icon--16-checkmark-v2"),
                };
              },
              {}
            ),
            thumbnail: bestResolutionThumbnail,
          });
        }
      }
    });
    return { adsResults, organicResults };
  });
}

async function getOrganicResults() {
  const browser = await puppeteer.launch({
    headless: false, // if you want to see what the browser is doing, you need to change this option to "false"
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  const page = await browser.newPage();

  await page.setDefaultNavigationTimeout(60000);
  await page.goto(URL);

  const adsResults = [];
  const organicResults = [];

  while (true) {
    await page.waitForSelector("[data-testid='serp-ia-card']");
    const resultsFromPage = await getResultsFromPage(page);
    adsResults.push(...resultsFromPage.adsResults);
    organicResults.push(...resultsFromPage.organicResults);
    const isNextPage = await page.$("a[aria-label='Next']");
    if (!isNextPage || adsResults.length + organicResults.length >= resultsLimit) break;
    await page.click("a[aria-label='Next']");
    await page.waitForTimeout(3000);
  }

  await browser.close();

  return { adsResults, organicResults };
}

getOrganicResults().then((result) => console.dir(result, { depth: null }));

Preparation

First, we need to create a Node.js* project and add npm packages puppeteer, puppeteer-extra and puppeteer-extra-plugin-stealth to control Chromium (or Chrome, or Firefox, but now we work only with Chromium which is used by default) over the DevTools Protocol in headless or non-headless mode.

To do this, in the directory with our project, open the command line and enter:

$ npm init -y

And then:

$ npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth

*If you don't have Node.js installed, you can download it from nodejs.org and follow the installation documentation.

📌Note: also, you can use puppeteer without any extensions, but I strongly recommended use it with puppeteer-extra with puppeteer-extra-plugin-stealth to prevent website detection that you are using headless Chromium or that you are using web driver. You can check it on Chrome headless tests website. The screenshot below shows you a difference.

Process

We need to extract data from HTML elements. The process of getting the right CSS selectors is fairly easy via SelectorGadget Chrome extension which able us to grab CSS selectors by clicking on the desired element in the browser. However, it is not always working perfectly, especially when the website is heavily used by JavaScript.

We have a dedicated Web Scraping with CSS Selectors blog post at SerpApi if you want to know a little bit more about them.

The Gif below illustrates the approach of selecting different parts of the results using SelectorGadget.

Code explanation

Declare puppeteer to control Chromium browser from puppeteer-extra library and StealthPlugin to prevent website detection that you are using web driver from puppeteer-extra-plugin-stealth library:

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

Next, we "say" to puppeteer use StealthPlugin, write what we want to search (serchQuery constant), search location, set how many results we want to receive (reviewsLimit constant), search URL and make search parameters with encodeURI method:

puppeteer.use(StealthPlugin());

const serchQuery = "kfc"; //Parameter defines the query you want to search
const location = "Seattle, WA"; //Parameter defines from where you want the search to originate
const resultsLimit = 50; // hardcoded limit for demonstration purpose

const searchParams = {
  query: encodeURI(serchQuery),
  location: encodeURI(location),
};

const URL = `https://www.yelp.com/search?find_desc=${searchParams.query}&find_loc=${searchParams.location}`;

Next, we write a function to get organic results from the page:

async function getResultsFromPage(page) {
  ...
}

Then, we get information from the page context (using evaluate() method) and save it in the returned object:

return await page.evaluate(() => ({
    ...
}));

Next, we need to declare isAds variable to differ organic results and ads results, then we declare adsResults and organicResults empty arrays, then we make and iterate (forEach() method) over the new array (Array.from() method) from all "ul > li > div" selectors (querySelectorAll()):

let isAds = false;
const adsResults = [];
const organicResults = [];
Array.from(document.querySelectorAll("ul > li > div")).forEach((el) => {
    ...
});

Because the results elements titles and results are placed in the same HTML elements on the page, we need to get the title, the result (using querySelector() method) and write a few conditions to set isAds variable true or false (using includes() method) and if the current element, not title and have result we move on.

const title = el.querySelector(":scope > h2")?.textContent;
const result = el.querySelector("[data-testid='serp-ia-card']");
if (title && title.includes("Sponsored")) isAds = true;
if (title && !title.includes("Sponsored")) isAds = false;
if (!title && result) {
    ...
}

To make returned result object we need to define link, fullAddress, categories, rating and get thumbnails in all resolutions. Then we get the last resolution link - it's the best one:

const link = `https://www.yelp.com${result.querySelector("h3 a").getAttribute("href")}`;
const fullAddress = result.querySelector(`p > span:${isAds ? "last-child" : "nth-child(3)"}`)?.textContent.split(", ");
const thumbnails = result.querySelector(":scope > div > div:nth-child(1) a > img").getAttribute("srcset").split(", ");
const bestResolutionThumbnail = thumbnails[thumbnails.length - 1].split(" ")[0];
const categories = Array.from(result.querySelectorAll("p > span:nth-child(1) > a")).map((el) => {
  if (el.textContent === "more") return null;
  return {
    title: el.textContent,
    link: `https://www.yelp.com${el.getAttribute("href")}`,
  };
});
const rating = result.querySelector("span > div[role='img']")?.getAttribute("aria-label").split(" ")?.[0];

Next, we need to check if current element isAds we get and add (push() method) the different parts of the page to the adsResults array, otherwise we add results to organicResults. We can get the results using next methods:

if (isAds) {
  adsResults.push({
    title: result.querySelector("h3 a").textContent,
    link,
    reviewsLink: `${decodeURIComponent(link.slice(link.indexOf("redirect_url") + 13, link.indexOf("&request_id")))}#reviews`,
    categories: categories.filter((el) => el),
    rating,
    reviews: rating && result.querySelector("div > span:nth-child(2)")?.textContent,
    address: fullAddress?.[0],
    neighborhoods: fullAddress?.[1],
    snippet: (
      result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(2) p") ||
      result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(3) p")
    )?.textContent
      .trim()
      .slice(0, -6),
    serviceOptions: Array.from(result.querySelectorAll(":scope > div > div:nth-child(2) > div:last-child li > div > div")).reduce((result, el) => {
      return {
        ...result,
        [`${el.querySelector("span > p")?.textContent}`]: el.querySelector("div > span[role='img']").classList.contains("icon--16-checkmark-v2"),
      };
    }, {}),
    thumbnail: bestResolutionThumbnail,
  });
} else {
  organicResults.push({
    title: result.querySelector("h3 a").textContent,
    link,
    reviewsLink: `${link}#reviews`,
    categories: categories.filter((el) => el),
    price: result.querySelector("p > span:nth-child(2)").textContent,
    rating,
    reviews: rating && result.querySelector("div > span:nth-child(2)")?.textContent,
    address: fullAddress?.[0],
    neighborhoods: fullAddress?.[1],
    snippet: (
      result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(2) p") ||
      result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(3) p")
    )?.textContent
      .trim()
      .slice(0, -6),
    serviceOptions: Array.from(result.querySelectorAll(":scope > div > div:nth-child(2) > div:last-child li > div > div")).reduce((result, el) => {
      return {
        ...result,
        [`${el.querySelector("span > p")?.textContent}`]: el.querySelector("div > span[role='img']").classList.contains("icon--16-checkmark-v2"),
      };
    }, {}),
    thumbnail: bestResolutionThumbnail,
  });
}

Next, write a function to control the browser, and get information:

async function getOrganicResults() {
  ...
}

In this function first we need to define browser using puppeteer.launch({options}) method with current options, such as headless: true and args: ["--no-sandbox", "--disable-setuid-sandbox"].

These options mean that we use headless mode and array with arguments which we use to allow the launch of the browser process in the online IDE. And then we open a new page:

const browser = await puppeteer.launch({
  headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
  args: ["--no-sandbox", "--disable-setuid-sandbox"],
});

const page = await browser.newPage();

Next, we change default (30 sec) time for waiting for selectors to 60000 ms (1 min) for slow internet connection with .setDefaultNavigationTimeout() method, go to URL with .goto() method and define the results array:

await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);

const results = [];

Next, we use while loop (while) in which we use .waitForSelector() method to wait until the selector is load, add results from the page to adsResults and organicResults array (using spread syntax and filter()), check if the next page button is present on the page ($ method) and the number of results is less then resultsLimit we click (click() method) on the next page button element, wait 3 seconds (using waitForTimeout method), otherwise we stop the loop (using break).

while (true) {
  await page.waitForSelector("[data-testid='serp-ia-card']");
  const resultsFromPage = await getResultsFromPage(page);
  adsResults.push(...resultsFromPage.adsResults);
  organicResults.push(...resultsFromPage.organicResults);
  const isNextPage = await page.$("a[aria-label='Next']");
  if (!isNextPage || adsResults.length + organicResults.length >= resultsLimit) break;
  await page.click("a[aria-label='Next']");
  await page.waitForTimeout(3000);
}

And finally, we close the browser, and return the received data:

await browser.close();

return { adsResults, organicResults };

Now we can launch our parser:

$ node YOUR_FILE_NAME # YOUR_FILE_NAME is the name of your .js file

Output

{
  "adsResults": [
    {
        "title":"Pizza Hut",
        "link":"https://www.yelp.com/adredir?ad_business_id=LuJTYRXHOuNBvmH2q_Pnhw&campaign_id=6a6Jneapwf8y0J5-2OI0UQ&click_origin=search_results&placement=above_search&placement_slot=0&redirect_url=https%3A%2F%2Fwww.yelp.com%2Fbiz%2Fpizza-hut-seattle-5&request_id=ed15d90f0b858297&signature=9a5b8ff57fd33b93fa11a7ccd12cfbf4268e7325970c0ac43238160bf2fe4f50&slot=0",
        "reviewsLink":"https://www.yelp.com/biz/pizza-hut-seattle-5#reviews",
        "categories":[
            {
              "title":"Chicken Wings",
              "link":"https://www.yelp.com/search?cflt=chicken_wings&find_loc=Seattle%2C+WA"
            },
            {
              "title":"Pizza",
              "link":"https://www.yelp.com/search?cflt=pizza&find_loc=Seattle%2C+WA"
            },
            {
              "title":"Fast Food",
              "link":"https://www.yelp.com/search?cflt=hotdogs&find_loc=Seattle%2C+WA"
            }
        ],
        "address":"",
        "snippet":"“We ordered a pizza and wings for carry out tonight with special instructions for how to cook the wings. When my husband picked up the order, the wings weren't right. I want to give…",
        "serviceOptions":{
            "Delivery":true,
            "Takeout":true
        },
        "thumbnail":"https://s3-media0.fl.yelpcdn.com/bphoto/c-BNJn-PnEQedrtt4NPvYw/1000s.jpg"
    },
    ...and other results
  ],
  "organicResults": [
    {
        "title":"KFC",
        "link":"https://www.yelp.com/biz/kfc-seattle-18?osq=kfc",
        "reviewsLink":"https://www.yelp.com/biz/kfc-seattle-18?osq=kfc#reviews",
        "categories":[
            {
                "title":"Fast Food",
                "link":"https://www.yelp.com/search?cflt=hotdogs&find_loc=Seattle%2C+WA"
            },
            {
                "title":"Chicken Wings",
                "link":"https://www.yelp.com/search?cflt=chicken_wings&find_loc=Seattle%2C+WA"
            },
            {
                "title":"Chicken Shop",
                "link":"https://www.yelp.com/search?cflt=chickenshop&find_loc=Seattle%2C+WA"
            },
            {
                "title":"more",
                "link":"https://www.yelp.com/biz/kfc-seattle-18?hrid=wRKhf8md_ru2OgAz1mrpRg&osq=kfc"
            }
        ],
        "price":"$",
        "rating":"2",
        "reviews":"54",
        "address":"KFC - Taco Bell",
        "neighborhoods":"Lower Queen Anne",
        "snippet":"“I have tried KFC much in India and it was first time in WA, Usa. It was good taste however not as good as Indian taste of KFC.",
        "serviceOptions":{
            "Delivery":true,
            "Takeout":true,
            "Curbside Pickup":true
        },
        "thumbnail":"https://s3-media0.fl.yelpcdn.com/bphoto/jrHdimlo2BO8wL49HXNDcQ/1000s.jpg"
    },
    ...and other results
  ]
}

Using Yelp Organic and Ads Results API from SerpApi

This section is to show the comparison between the DIY solution and our solution.

The biggest difference is that you don't need to create the parser from scratch and maintain it.

There's also a chance that the request might be blocked at some point from Google, we handle it on our backend so there's no need to figure out how to do it yourself or figure out which CAPTCHA, proxy provider to use.

First, we need to install google-search-results-nodejs:

npm i google-search-results-nodejs

Here's the full code example, if you don't need an explanation:

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com

const resultsLimit = 50; // hardcoded limit for demonstration purpose

const params = {
  engine: "yelp", // search engine
  device: "desktop", //Parameter defines the device to use to get the results. It can be set to "desktop" (default), "tablet", or "mobile"
  find_loc: "Seattle, WA", //Parameter defines from where you want the search to originate.
  find_desc: "kfc", // Parameter defines the query you want to search
};

const getJson = () => {
  return new Promise((resolve) => {
    search.json(params, resolve);
  });
};

const getResults = async () => {
  const adsResults = [];
  const organicResults = [];
  while (true) {
    const json = await getJson();
    if (json.organic_results) {
      if (json.ads_results) {
        adsResults.push(...json.ads_results);
      }
      organicResults.push(...json.organic_results);
      params.start ? (params.start += 10) : (params.start = 10);
    } else break;
    if (adsResults.length + organicResults.length >= resultsLimit) break;
  }
  return { adsResults, organicResults };
};

getResults().then((result) => console.dir(result, { depth: null }));

Code explanation

First, we need to declare SerpApi from google-search-results-nodejs library and define new search instance with your API key from SerpApi:

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(API_KEY);

Next, we write the necessary parameters for making a request and set how many results we want to receive (resultsLimit constant):

const resultsLimit = 50; // hardcoded limit for demonstration purpose

const params = {
  engine: "yelp", // search engine
  device: "desktop", //Parameter defines the device to use to get the results. It can be set to "desktop" (default), "tablet", or "mobile"
  find_loc: "Seattle, WA", //Parameter defines from where you want the search to originate.
  find_desc: "kfc", // Parameter defines the query you want to search
};

Next, we wrap the search method from the SerpApi library in a promise to further work with the search results:

const getJson = () => {
  return new Promise((resolve) => {
    search.json(params, resolve);
  });
};

And finally, we declare the function getResult that gets data from the page and return it:

const getResults = async () => {
  ...
};

In this function we need to declare empty adsResults and organicResults arrays and using while loop get json, add ads_results and organic_results results from each page and set next page start index (to params.start value). If there is no more results on the page or if the number of received results more thanresultsLimit we stop the loop (using break) and return object with results:

const adsResults = [];
const organicResults = [];
while (true) {
  const json = await getJson();
  if (json.organic_results) {
    if (json.ads_results) {
      adsResults.push(...json.ads_results);
    }
    organicResults.push(...json.organic_results);
    params.start ? (params.start += 10) : (params.start = 10);
  } else break;
  if (adsResults.length + organicResults.length >= resultsLimit) break;
}
return { adsResults, organicResults };

After, we run the getResults function and print all the received information in the console with the console.dir method, which allows you to use an object with the necessary parameters to change default output options:

getResults().then((result) => console.dir(result, { depth: null }));

Output

{
  "adsResults": [
    {
      "block_position":"top",
      "place_ids":[
          "ThGZdWIyNOXUeTqMWRmVlw",
          "dudez-woodfired-pizza-seattle"
      ],
      "title":"DUDE’Z woodfired pizza",
      "link":"https://www.yelp.com/adredir?ad_business_id=ThGZdWIyNOXUeTqMWRmVlw&amp;campaign_id=KjSNa2u5Q-4tz8JKZAiYvg&amp;click_origin=search_results&amp;placement=above_search&amp;placement_slot=0&amp;redirect_url=https%3A%2F%2Fwww.yelp.com%2Fbiz%2Fdudez-woodfired-pizza-seattle&amp;request_id=e3869f3c027b5193&amp;signature=bbf93e3aaaae7762d2435d05e5fefee95c31e33c01f9b2b8aad85c660d7d5cfc&amp;slot=0",
      "reviews_link":"https://serpapi.com/search.json?engine=yelp_reviews&place_id=ThGZdWIyNOXUeTqMWRmVlw",
      "categories":[
          {
            "title":"Pizza",
            "link":"https://www.yelp.com/search?cflt=pizza&amp;find_loc=Seattle%2C+WA"
          }
      ],
      "rating":5,
      "reviews":1,
      "neighborhoods":"Cottage Grove",
      "phone":"(360) 803-1616",
      "snippet":"These pizzas are so delicious! The guys really take care of their customers. We ordered the Carne Asada Pizza and the awesome G Pop pizza (think jalapeño poppers in the shape of a…",
      "service_options":{
          "outdoor_seating":false,
          "delivery":true,
          "takeout":true
      },
      "thumbnail":"https://s3-media0.fl.yelpcdn.com/bphoto/cJynIXUZp0OWhSdW3AUoaw/348s.jpg"
    },
    ...and other results
  ],
  "organicResults":
  [
    {
      "position": 1,
      "place_ids": ["UON0MxZGG0cgsU5LYPjJbg", "kfc-seattle-18"],
      "title": "KFC",
      "link": "https://www.yelp.com/biz/kfc-seattle-18?osq=kfc",
      "reviews_link": "https://serpapi.com/search.json?engine=yelp_reviews&place_id=UON0MxZGG0cgsU5LYPjJbg",
      "categories": [
        {
          "title": "Fast Food",
          "link": "https://www.yelp.com/search?cflt=hotdogs&amp;find_loc=Seattle%2C+WA"
        },
        {
          "title": "Chicken Wings",
          "link": "https://www.yelp.com/search?cflt=chicken_wings&amp;find_loc=Seattle%2C+WA"
        },
        {
          "title": "Chicken Shop",
          "link": "https://www.yelp.com/search?cflt=chickenshop&amp;find_loc=Seattle%2C+WA"
        }
      ],
      "price": "$",
      "rating": 2,
      "reviews": 54,
      "address": "210 W Mercer St",
      "neighborhoods": "Lower Queen Anne",
      "phone": "(206) 283-7575",
      "snippet": "I have tried KFC much in India and it was first time in WA, Usa. It was good taste however not as good as Indian taste of KFC.",
      "service_options": {
        "delivery": true,
        "takeout": true,
        "curbside_pickup": true
      },
      "thumbnail": "https://s3-media0.fl.yelpcdn.com/bphoto/jrHdimlo2BO8wL49HXNDcQ/348s.jpg"
    },
    ...and other results
  ]
}