<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DEV Community: Priyanshu Choudhary</title>
    <description>The latest articles on DEV Community by Priyanshu Choudhary (@priyanshu_choudhary_b6ac1).</description>
    <link>https://dev.to/priyanshu_choudhary_b6ac1</link>
    <image>
      <url>https://media2.dev.to/dynamic/image/width=90,height=90,fit=cover,gravity=auto,format=auto/https:%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F1697850%2F36dfefc0-9fdb-44c7-9351-43c08221f93a.png</url>
      <title>DEV Community: Priyanshu Choudhary</title>
      <link>https://dev.to/priyanshu_choudhary_b6ac1</link>
    </image>
    <atom:link rel="self" type="application/rss+xml" href="https://dev.to/feed/priyanshu_choudhary_b6ac1"/>
    <language>en</language>
    <item>
      <title>Amazon/Flipkart Web Scraping Working in Local but not Working on Deployment ? WHY ?</title>
      <dc:creator>Priyanshu Choudhary</dc:creator>
      <pubDate>Fri, 28 Jun 2024 20:01:42 +0000</pubDate>
      <link>https://dev.to/priyanshu_choudhary_b6ac1/amazonflipkart-web-scrapping-working-in-local-but-not-working-on-deployment-why--2d31</link>
      <guid>https://dev.to/priyanshu_choudhary_b6ac1/amazonflipkart-web-scrapping-working-in-local-but-not-working-on-deployment-why--2d31</guid>
      <description>&lt;p&gt;Hey, I am not able to receive the scraped data from Amazon/Flipkart on my deployed FastAPI link. The FastAPI app works fine locally. I have used BeautifulSoup for scraping and Vercel to deploy the app. Can anyone help me with it? &lt;/p&gt;

&lt;p&gt;Localhost:&lt;br&gt;
&lt;a href="https://media.dev.to/cdn-cgi/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fsf6ar8emb7e92sr734jh.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media.dev.to/cdn-cgi/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fsf6ar8emb7e92sr734jh.png" alt="Image description" width="800" height="335"&gt;&lt;/a&gt;&lt;br&gt;
Deployed Vercel link:&lt;br&gt;
&lt;a href="https://media.dev.to/cdn-cgi/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2F8enoy873sy7i17hkkolo.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media.dev.to/cdn-cgi/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2F8enoy873sy7i17hkkolo.png" alt="Image description" width="800" height="275"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Below is the FastAPI app code:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, HttpUrl
import requests
from bs4 import BeautifulSoup
from fastapi.middleware.cors import CORSMiddleware
import time
import random
import logging

app = FastAPI()

origins = [
    "http://localhost",
    "http://localhost:3000",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)

class ProductURLs(BaseModel):
    url1: HttpUrl
    url2: HttpUrl

user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
]

logging.basicConfig(level=logging.INFO)

def scrape_amazon_product_highlights(url):
    max_retries = 5
    for attempt in range(max_retries):
        try:
            headers = {
                "User-Agent": random.choice(user_agents)
            }
            page = requests.get(url, headers=headers)
            page.raise_for_status()  # Raises an HTTPError for bad responses

            soup = BeautifulSoup(page.text, "html.parser")
            titles = soup.find_all("span", class_="a-size-large product-title-word-break")
            specs = soup.find_all("ul", class_="a-unordered-list a-vertical a-spacing-mini")    

            highlights = [title.text for title in titles] + [spec.text for spec in specs] 

            if highlights:
                return highlights

            logging.info(f"No highlights found on attempt {attempt + 1} for URL {url}")
            if attempt &amp;lt; max_retries - 1:
                time.sleep(2 ** attempt)  

        except requests.RequestException as e:
            logging.error(f"RequestException occurred on attempt {attempt + 1} for URL {url}: {str(e)}")
            if attempt &amp;lt; max_retries - 1:
                time.sleep(2 ** attempt)  
                continue
            else:
                raise HTTPException(status_code=400, detail=str(e))
        except Exception as e:
            logging.error(f"Exception occurred on attempt {attempt + 1} for URL {url}: {str(e)}")
            raise HTTPException(status_code=500, detail=str(e))

    raise HTTPException(status_code=500, detail="Unable to retrieve product highlights after multiple attempts")

def scrape_flipkart_product_highlights(url):
    try:
        page = requests.get(url)
        page.raise_for_status()  

        soup = BeautifulSoup(page.text, "html.parser")
        titles = soup.find_all("span", class_="VU-ZEz")
        specs = soup.find_all("li", class_="_7eSDEz")


        highlights = [title.text for title in titles] + [spec.text for spec in specs]  # Extract the text for each spec

        return highlights

    except requests.RequestException as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/")
async def read_root():
    return {"message": "Welcome to the Product Comparison API"}

@app.post("/")
async def compare_products(urls: ProductURLs):
    try:
        if urls.url1.host and  "amazon" in urls.url1.host:
            product1_highlights = scrape_amazon_product_highlights(urls.url1)
        elif urls.url1.host and "flipkart" in urls.url1.host:
            product1_highlights = scrape_flipkart_product_highlights(urls.url1)
        else:
            raise HTTPException(status_code=400, detail="Unsupported URL1")

        if urls.url2.host and "amazon" in urls.url2.host:
            product2_highlights = scrape_amazon_product_highlights(urls.url2)
        elif urls.url2.host and "flipkart" in urls.url2.host:
            product2_highlights = scrape_flipkart_product_highlights(urls.url2)
        else:
            raise HTTPException(status_code=400, detail="Unsupported URL2")
    except HTTPException as e:
        raise e
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

    return {
        "product1": {
            "url": urls.url1,
            "highlights": product1_highlights
        },
        "product2": {
            "url": urls.url2,
            "highlights": product2_highlights
        }
    }

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



</description>
      <category>webscraping</category>
      <category>fastapi</category>
      <category>bs4</category>
      <category>webdev</category>
    </item>
  </channel>
</rss>
