DEV Community

Cover image for How to scrape product characteristics on Walmart?
Mikhail Zub
Mikhail Zub

Posted on

How to scrape product characteristics on Walmart?

Intro

In this post, I want to explain how to scrape product characteristics on Walmart with Node.js. I will show you several ways to do this.

Preparation

First, we need to create a Node.js project and add the npm packages "Puppeteer" and "Puppeteer stealth plugin". To do this, open the command line in the project directory and enter:
npm init -y
then:
npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth

What will be scraped

Walmart organic results
Organic results
then the characteristics of each product
Characteristics

Process

SelectorGadget Chrome extension was used to grab CSS selectors.
The Gif below illustrates the approach of selecting different parts of the organic results.
Grab CSS

Code

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

// Start page for the scraper; also used as the prefix when the captcha helpers
// build the search-result URLs they expect to land on.
// NOTE(review): this constant shadows the global `URL` constructor for the rest of the module.
const URL = "https://www.walmart.com/";
// Human-readable search phrase.
const searchString = "laptop";
// URI-encoded form of the phrase; this is the value handed to getAllSpecs() at the bottom of the script.
// NOTE(review): the encoded string is later typed into the search box as-is, so a phrase
// containing spaces would be typed with literal "%20" — confirm this is intended.
const encodedString = encodeURI(searchString);

// Register the stealth plugin on puppeteer-extra before any browser is launched.
puppeteer.use(StealthPlugin());

/**
 * Handles the "#px-captcha" challenge when the browser did not end up on the requested URL.
 *
 * If the page's current location equals `url`, nothing is done. Otherwise the function
 * waits for the captcha element, pauses, moves focus with two Tab presses, and presses
 * Enter with a 10 second delay between key-down and key-up before pausing again.
 *
 * @param {object} page - Puppeteer page to operate on.
 * @param {string} url - Address the page was expected to load.
 * @returns {Promise<void>}
 */
async function solveCaptcha(page, url) {
  const landedOn = await page.evaluate(() => location.href);
  const isExpectedPage = landedOn === url;
  if (isExpectedPage) {
    return;
  }

  await page.waitForSelector("#px-captcha");
  await page.waitForTimeout(3000);

  // Two Tab presses move keyboard focus before Enter is pressed.
  for (let tabCount = 0; tabCount < 2; tabCount += 1) {
    await page.keyboard.press("Tab");
  }

  await page.keyboard.press("Enter", { delay: 10000 });
  await page.waitForTimeout(5000);
}

/**
 * Runs the keyboard sequence used against the "#px-captcha" element on the search page:
 * wait for the element, pause, Tab twice, press Enter with a 10 second delay,
 * release Enter, pause again.
 *
 * @param {object} page - Puppeteer page showing the captcha.
 * @returns {Promise<void>}
 */
async function holdEnterOnSearchCaptcha(page) {
  await page.waitForSelector("#px-captcha");
  await page.waitForTimeout(3000);
  await page.keyboard.press("Tab");
  await page.keyboard.press("Tab");
  await page.keyboard.press("Enter", { delay: 10000 });
  await page.keyboard.up("Enter");
  await page.waitForTimeout(5000);
}

/**
 * Handles the captcha that may appear after submitting the search form.
 *
 * The current location is read once. For each known search-URL shape ("?query=" and "?q=")
 * that the location contains, the location is compared with the exact URL expected for
 * `searchQuery`; on a mismatch the captcha key sequence is executed.
 *
 * @param {object} page - Puppeteer page that just submitted the search.
 * @param {string} url - Site root including the trailing slash (used as URL prefix).
 * @param {string} searchQuery - Query string exactly as it should appear in the URL.
 * @returns {Promise<void>}
 */
async function solveSecondCaptcha(page, url, searchQuery) {
  const currentPage = await page.evaluate(() => location.href);

  // [marker that identifies the URL shape, exact URL expected for that shape]
  const expectedUrlByMarker = [
    ["?query=", `${url}search/?query=${searchQuery}`],
    ["?q=", `${url}search?q=${searchQuery}`],
  ];

  for (const [marker, expectedUrl] of expectedUrlByMarker) {
    if (currentPage.includes(marker) && currentPage !== expectedUrl) {
      await holdEnterOnSearchCaptcha(page);
    }
  }
}

/**
 * Launches the browser, searches Walmart for `searchQuery` through the site's search box
 * and returns the product tiles found on the first results page.
 *
 * Side effect: the launched browser is stored on `globalThis.browser`, because
 * getProductsSpecs() and getAllSpecs() read a global named `browser`.
 *
 * @param {string} url - Address of the page that contains the search box.
 * @param {string} searchQuery - Text typed into the search box.
 * @returns {Promise<Array<{link: string, title: (string|null), price: (string|null)}>>}
 *   One entry per product tile that has a link; `title`/`price` are null when the tile
 *   does not contain the corresponding element.
 */
async function getOrganicResults(url, searchQuery) {
  const launchedBrowser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  // Explicit global instead of an undeclared assignment, which throws in strict mode / ES modules.
  globalThis.browser = launchedBrowser;

  const page = await launchedBrowser.newPage();

  await page.setDefaultNavigationTimeout(60000);
  await page.goto(url);
  await solveCaptcha(page, url);
  await page.waitForSelector("input[name='q']");
  await page.type("input[name='q']", searchQuery, { delay: 10 });
  await page.keyboard.press("Enter");
  await page.waitForTimeout(10000);
  await solveSecondCaptcha(page, url, searchQuery);
  await page.waitForSelector("div[data-item-id]");
  await page.waitForTimeout(5000);

  // The callback is executed inside the page, so it must not reference variables from this scope.
  const products = await page.evaluate(() => {
    const tiles = document.querySelectorAll("div[data-stack-index] div[data-item-id]");
    return Array.from(tiles, (tile) => {
      const href = tile.querySelector("a[link-identifier]")?.getAttribute("href");
      if (!href) {
        // A tile without a link cannot be visited later; drop it instead of failing the whole scrape.
        return null;
      }
      return {
        // Only site-relative hrefs need the origin; absolute ones are kept unchanged.
        link: /^https?:\/\//i.test(href) ? href : `https://www.walmart.com${href}`,
        title: tile.querySelectorAll(".lh-title")[1]?.innerText ?? null,
        price: tile.querySelector(".f4-l")?.innerText ?? null,
      };
    }).filter((product) => product !== null);
  });
  return products;
}

/**
 * Collects the innerText of every element in `page` that matches `selector`.
 *
 * @param {object} page - Puppeteer page to query.
 * @param {string} selector - CSS selector evaluated inside the page.
 * @returns {Promise<string[]>}
 */
function readInnerTexts(page, selector) {
  // The selector is passed as an argument because the callback runs inside the page.
  return page.evaluate(
    (sel) => Array.from(document.querySelectorAll(sel), (el) => el.innerText),
    selector,
  );
}

/**
 * Opens a product page in a new tab of the global `browser` and reads its specifications.
 *
 * Links containing "https://wrd.walmart.com/" are skipped. Specification names and values
 * are read from ".product-specification-table" when it has more than one cell, otherwise
 * from the ".nt1 .pb2" headings and spans. Names and values are paired by position; a
 * missing or empty value is reported as "no spec". The result is also logged to the console.
 *
 * The tab is closed even when navigation or scraping throws.
 *
 * @param {{link: string, title: *, price: *}} product - Entry produced by getOrganicResults().
 * @returns {Promise<({link: string, title: *, price: *, specifications: Object<string, string>}|undefined)>}
 *   The product with its specifications, or undefined for skipped links.
 */
async function getProductsSpecs(product) {
  if (product.link.toLowerCase().includes("https://wrd.walmart.com/")) {
    return undefined;
  }

  const url = product.link;
  const page = await browser.newPage();
  try {
    await page.setDefaultNavigationTimeout(60000);
    await page.goto(url);
    await solveCaptcha(page, url);
    await page.waitForTimeout(10000);

    const tableCells = await page.$$(".product-specification-table td");
    const hasSpecTable = tableCells.length > 1;

    const keys = await readInnerTexts(
      page,
      hasSpecTable ? ".product-specification-table td:first-child" : ".nt1 .pb2 h3",
    );
    const values = await readInnerTexts(
      page,
      hasSpecTable ? ".product-specification-table td:last-child" : ".nt1 .pb2 p span",
    );

    const specifications = {};
    keys.forEach((key, index) => {
      specifications[key] = values[index] ? values[index].trim() : "no spec";
    });

    const productInfo = {
      link: url,
      title: product.title,
      price: product.price,
      specifications,
    };

    console.log(productInfo);
    return productInfo;
  } finally {
    await page.close();
  }
}

/**
 * Runs the whole scrape: collects the search results, then visits every product to read
 * its specifications, and finally closes the browser.
 *
 * Products are visited one after another (each visit is awaited before the next starts).
 * Entries for which getProductsSpecs() returns nothing (skipped links) are left out.
 * The browser is closed even when one of the steps throws.
 *
 * @param {string} url - Address of the page that contains the search box.
 * @param {string} searchQuery - Text to search for.
 * @returns {Promise<object[]>} Specifications of every product that was visited.
 */
async function getAllSpecs(url, searchQuery) {
  try {
    const mainProductInfo = await getOrganicResults(url, searchQuery);
    const products = [];
    for (const product of mainProductInfo) {
      const productSpecs = await getProductsSpecs(product);
      if (productSpecs) {
        products.push(productSpecs);
      }
    }
    return products;
  } finally {
    // Read through globalThis so a failed launch (no `browser` global yet) does not raise a ReferenceError here.
    await globalThis.browser?.close();
  }
}

// Start the scrape; report a failure and mark the process as failed instead of leaving the rejection unhandled.
getAllSpecs(URL, encodedString).catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Enter fullscreen mode Exit fullscreen mode

Output

{
  link: 'https://www.walmart.com/ip/HP-15-Pentium-4GB-128GB-Laptop-Scarlet-Red/307924252',
  title: 'HP 15 Pentium 4GB/128GB Laptop-Scarlet Red',
  price: '$299.00',
  specifications: {
    'Processor Brand': 'Intel',
    'Processor Type': 'Intel',
    'Hard Drive Capacity': '128 GB',
    'Processor Core Type': 'Quad-Core',
    Manufacturer: 'HP',
    'Product Line': 'HP',
    'Processor Speed': '1.1 GHz',
    'Manufacturer Part Number': '1A493UA#ABA',
    'Wireless Technology': '802.11b',
    'RAM Memory': '4 GB',
    'Maximum RAM Supported': '4 GB',
    'Operating System': 'Windows 10',
    'Battery Life': '10.45 h',
    Model: '15-dw0083wm',
    'Screen Size': '15.6 in',
    Brand: 'HP',
    Features: 'No features description available',
    'Assembled Product Dimensions (L x W x H)': '22.00 x 4.00 x 14.00 Inches'
  }
}
Enter fullscreen mode Exit fullscreen mode

Using Walmart Search Engine Results API

SerpApi offers a free plan with 100 searches per month. If you need more searches, there are paid plans.

The difference is that all you need to do is iterate over ready-made, structured JSON instead of coding everything from scratch and picking the correct selectors, which can be time-consuming.

First we need to install "google-search-results-nodejs". To do this you need to enter:
npm i google-search-results-nodejs

Code

const util = require("util");

const { GoogleSearch } = require("google-search-results-nodejs");
// SerpApi client, created with the key taken from the API_KEY environment variable.
const search = new GoogleSearch(process.env.API_KEY); //Your API key

// Bind the method to the client so it can be called as a standalone function.
const getJson = search.json.bind(search);

// util.promisify() returns whatever function is stored under util.promisify.custom,
// so this wrapper defines how getJson is turned into a promise: `resolve` is passed as
// the second argument and `reject` as the third.
// NOTE(review): confirm that search.json actually calls a third (error) callback; if it
// does not, a failed request will never reject this promise.
getJson[util.promisify.custom] = (params) => {
  return new Promise((resolve, reject) => {
    getJson(params, resolve, reject);
  });
};

// Promise-returning version of getJson used by getMainInfo().
const promisifiedGetJson = util.promisify(getJson);

/**
 * Searches Walmart for "laptop" through the "walmart" engine, then requests the
 * "walmart_product" engine for every organic result that carries an identifier
 * (us_item_id, upc or product_id, in that order of preference) and builds a summary
 * of each product. The summaries are logged to the console and returned.
 *
 * Organic results without any identifier are skipped. Product requests are issued in
 * parallel; if any of them rejects, the whole call rejects.
 *
 * @returns {Promise<Array<{link: *, title: *, price: *, specifications: (Object<string, *>|string)}>>}
 *   One entry per requested product. `specifications` maps each highlight's display_name
 *   to its value, or is the string "no detailed specifications" when the product result
 *   has no specification_highlights.
 */
async function getMainInfo() {
  const paramsSearch = {
    engine: "walmart",
    query: "laptop",
  };

  const data = await promisifiedGetJson(paramsSearch);
  const organicResults = data.organic_results ?? [];

  // Results without an identifier cannot be looked up, so they are dropped before any request is made.
  const productIds = organicResults
    .map((organicResult) => organicResult.us_item_id || organicResult.upc || organicResult.product_id)
    .filter(Boolean);

  const productResults = await Promise.all(
    productIds.map((productId) =>
      promisifiedGetJson({
        engine: "walmart_product",
        product_id: productId,
      }),
    ),
  );

  const fullProductsSpecs = productResults.map((el) => {
    const productResult = el.product_result ?? {};
    const specificationHighlights = productResult.specification_highlights;

    return {
      link: el.search_metadata?.walmart_product_url,
      title: productResult.title,
      price: productResult.price_map?.price,
      specifications: specificationHighlights
        ? Object.fromEntries(specificationHighlights.map(({ display_name, value }) => [display_name, value]))
        : "no detailed specifications",
    };
  });

  console.log(fullProductsSpecs);
  return fullProductsSpecs;
}
// Run the example; report a failure and mark the process as failed instead of leaving the rejection unhandled.
getMainInfo().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Enter fullscreen mode Exit fullscreen mode

Output

{
  link: 'https://www.walmart.com/ip/Refurbished-Apple-MacBook-Pro-13-3-Laptop-LED-Intel-i5-3210M-2-5GHz-4GB-500GB-MD101LLA/708321782',
  title: 'Refurbished Apple MacBook Pro 13.3 Laptop LED Intel i5 3210M 2.5GHz 4GB 500GB - MD101LLA',
  price: 279.97,
  specifications: {
    'Laptop Computer Type': 'MacBooks',
    'Screen Size': '13.3 in',
    'RAM Memory': '4 KB',
    'Operating System': 'Windows',
    'Hard Drive Capacity': '500 KB',
    'Battery Life': '10 h'
  }
}
Enter fullscreen mode Exit fullscreen mode

Links

Code in the online IDE • SerpApi Playground

Outro

If you want to see how to scrape something using Node.js that I didn't write about yet or you want to see some project made with SerpApi, please write me a message.

Oldest comments (0)