Anuoluwapo Balogun

Extracting data from e-commerce websites

Basic web scraping is one of the essential skills for a data analyst. The ability to gather your own data for a project is an undervalued skill.

I recently scraped data from four big art shops' websites in Nigeria, and I would like to share the code (written with some help from ChatGPT) for learning purposes, for other data analysts who might find it useful.

The first website is Crafts Village; I scraped the Art Tools category.

Here is the code for scraping the website:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Initialize lists to store the data
product_names = []
prices = []

# Scrape all 6 pages (the category is paginated as /page/{n}/)
for page in range(1, 7):
    url = f"https://craftsvillage.com.ng/product-category/art-tools/page/{page}/"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the relevant HTML elements for product information
    products = soup.find_all("li", class_="product")

    # Extract data from each product element
    for product in products:
        # Product name
        name_element = product.find("a", class_="woocommerce-LoopProduct-link")
        name = name_element.text.replace("\n", "").strip()
        name = re.sub(r"[₦\,|–]", "", name)  # Remove unwanted characters
        product_names.append(name)


        # Price
        price_element = product.find("bdi")
        price = price_element.text if price_element else None
        prices.append(price)

# Create a Pandas DataFrame from the scraped data
data = {
    "Product Name": product_names,
    "Price": prices
}
df = pd.DataFrame(data)

# Remove "\n\n\n\n\n" from "Product Name" column
df["Product Name"] = df["Product Name"].str.replace("\n", "")

# Display the Data Frame
print(df)

To find the class of the name element, I inspected it in my browser: hover over the product name, right-click, and choose Inspect.


I did the same for the price.


The code above extracts the product names and prices from all six pages of the Art Tools category.
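
The prices come back as text such as "₦1,500.00", so a small cleaning step makes them easier to analyse. Here is a minimal sketch, assuming the ₦ prefix and comma thousands separators shown on the site; the CSV file name is just an example:

import pandas as pd

# Strip the currency symbol and thousands separators, then convert to numbers.
# Anything unexpected (e.g. a price range) becomes NaN instead of raising an error.
df["Price"] = pd.to_numeric(
    df["Price"]
    .str.replace("₦", "", regex=False)
    .str.replace(",", "", regex=False),
    errors="coerce",
)

# Save the cleaned data for later analysis (file name is just an example)
df.to_csv("craftsvillage_art_tools.csv", index=False)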

Here is how I scraped information from Crafties Hobbies:

import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = 'https://craftieshobbycraft.com/product-category/painting-drawing/page/{}/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Create lists to store data
categories = []
product_names = []
product_prices = []

# Iterate over each page
for page in range(1, 8):
    url = base_url.format(page)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    category_elements = soup.find_all('p', class_='category uppercase is-smaller no-text-overflow product-cat op-7')
    product_names_elements = soup.find_all('a', class_='woocommerce-LoopProduct-link woocommerce-loop-product__link')
    product_prices_elements = soup.find_all('bdi')

    for category_element, product_name_element, product_price_element in zip(category_elements, product_names_elements, product_prices_elements):
        category = category_element.get_text(strip=True)
        product_name = product_name_element.get_text(strip=True)
        product_price = product_price_element.get_text(strip=True)

        categories.append(category)
        product_names.append(product_name)
        product_prices.append(product_price)

# Create a pandas DataFrame
data = {
    'Category': categories,
    'Product Name': product_names,
    'Product Price': product_prices
}
df = pd.DataFrame(data)

# Print the DataFrame
print(df)

Here is how I scraped data from the Kaenves store:

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Create empty lists to store the data
product_names = []
prices = []

# Iterate through each page
for page in range(1, 4):
    # Send a GET request to the page
    url = f"https://www.kaenves.store/collections/floating-wood-frame?page={page}"
    response = requests.get(url)

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all span elements with the specified class
    price_elements = soup.find_all('span', class_='price-item price-item--regular')
    name_elements = soup.find_all('h3', class_='card__heading h5')

    # Extract the prices and product names
    for price_element, name_element in zip(price_elements, name_elements):
        price = price_element.get_text(strip=True)
        name = name_element.get_text(strip=True)
        product_names.append(name)
        prices.append(price)

# Create a pandas DataFrame
data = {'Product Name': product_names, 'Price': prices}
df = pd.DataFrame(data)

# Save the DataFrame as a CSV file
df.to_csv('paperandboard.csv', index=False)

Here is how I scraped data from Art Easy:

import requests
from bs4 import BeautifulSoup
import pandas as pd

prices = []
product_names = []

# Iterate over both pages
for page_num in range(1, 3):
    url = f"https://arteasy.com.ng/product-category/canvas-surfaces/page/{page_num}/"

    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all the span elements with class "price"
    product_prices = [span.get_text(strip=True) for span in soup.find_all("span", class_="price")]

    # Find all the h3 elements with class "product-title"
    product_names += [product_name.get_text(strip=True) for product_name in soup.find_all("h3", class_="product-title")]

    # Add the prices to the list
    prices += product_prices

# Check if the lengths of product_names and prices are equal
if len(product_names) == len(prices):
    # Create a pandas DataFrame
    data = {"Product Name": product_names, "Price": prices}
    df = pd.DataFrame(data)

    # Print the DataFrame
    print(df)
else:
    print("Error: The lengths of product_names and prices are not equal.")
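The length check above is there because zipping two separate find_all() lists can silently misalign names and prices when one element is missing. A more defensive pattern is to walk each product container and read the name and price from inside it. Here is a minimal sketch, assuming the same li.product containers as the Crafts Village example; confirm the classes in the inspector before relying on them:

import requests
from bs4 import BeautifulSoup
import pandas as pd

rows = []
for page_num in range(1, 3):
    url = f"https://arteasy.com.ng/product-category/canvas-surfaces/page/{page_num}/"
    soup = BeautifulSoup(requests.get(url).text, "html.parser")

    # Walk each product container so a missing price cannot shift the pairing
    # ("li.product" is the usual WooCommerce list-item class -- verify it in the inspector)
    for product in soup.find_all("li", class_="product"):
        name_el = product.find("h3", class_="product-title")
        price_el = product.find("span", class_="price")
        rows.append({
            "Product Name": name_el.get_text(strip=True) if name_el else None,
            "Price": price_el.get_text(strip=True) if price_el else None,
        })

df = pd.DataFrame(rows)
print(df)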

If you want to reuse this code, make sure to change the URL to your preferred e-commerce website and update the class names to match that site's product name and product price elements.
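
To make that reuse easier, here is a minimal, generic sketch. The URL in the example call is a placeholder, and the selectors shown are the WooCommerce ones used above; inspect your own target site to find the right values:

import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_category(base_url, product_selector, name_selector, price_selector,
                    pages, headers=None):
    """Scrape product names and prices from a paginated category page.

    base_url needs a {} placeholder for the page number; the three selectors
    are CSS selectors you find by inspecting the target site in the browser.
    """
    rows = []
    for page in range(1, pages + 1):
        response = requests.get(base_url.format(page), headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")
        # Read name and price from inside each product container
        for product in soup.select(product_selector):
            name_el = product.select_one(name_selector)
            price_el = product.select_one(price_selector)
            rows.append({
                "Product Name": name_el.get_text(strip=True) if name_el else None,
                "Price": price_el.get_text(strip=True) if price_el else None,
            })
    return pd.DataFrame(rows)

# Example call (placeholder URL -- the selectors match the WooCommerce sites above)
# df = scrape_category(
#     "https://example-shop.com/product-category/art-tools/page/{}/",
#     product_selector="li.product",
#     name_selector="a.woocommerce-LoopProduct-link",
#     price_selector="bdi",
#     pages=6,
# )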

The scraped information can be used for the following:

  • Price comparison: You can use the scraped data to compare prices of products across different websites. This can help you find the best deal on the product you are looking for (see the sketch after this list).

  • Product research: You can use the scraped data to research products. This can help you learn more about a product's features, specifications, and reviews.

  • Market analysis: You can use the scraped data to analyze the market for a particular product. This can help you identify trends and opportunities.

  • Product recommendations: You can use the scraped data to recommend products to users. This can help you increase sales and improve customer satisfaction.
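
As a quick illustration of the price-comparison idea, here is a minimal sketch that combines scraped CSVs (the file names are illustrative), converts the price text to numbers, and lists the cheapest matches for a keyword:

import pandas as pd

# Load the scraped data (file names are illustrative examples)
stores = {
    "Crafts Village": "craftsvillage_art_tools.csv",
    "Kaenves": "paperandboard.csv",
}

frames = []
for store, path in stores.items():
    store_df = pd.read_csv(path)
    store_df["Store"] = store
    frames.append(store_df)

combined = pd.concat(frames, ignore_index=True)

# Convert price strings like "₦1,500.00" to numbers
combined["Price"] = pd.to_numeric(
    combined["Price"]
    .astype(str)
    .str.replace("₦", "", regex=False)
    .str.replace(",", "", regex=False),
    errors="coerce",
)

# Example: find the cheapest listings containing a keyword
keyword = "canvas"
matches = combined[combined["Product Name"].str.contains(keyword, case=False, na=False)]
print(matches.sort_values("Price").head(10))

Keep in mind that different shops describe the same product differently, so a keyword match like this is only a starting point; a proper comparison needs some manual matching of like-for-like products.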
