Scraping Online Learning Outcomes: Course Completion and Reviews

#python #tutorial #webdev #programming

Scraping Online Learning Outcomes: Course Completion and Reviews

The online education market is massive, but choosing the right course is a gamble. What if you could scrape actual completion rates, student reviews, and outcome data to make data-driven learning decisions?

What Data Matters

Student ratings and review text
Enrollment numbers and completion signals
Instructor track records
Skill outcomes mentioned in reviews

Setup

import requests
from bs4 import BeautifulSoup
import json
import re
from datetime import datetime
from collections import Counter

# Endpoint every page fetch is routed through; the real target page is passed
# to it as the "url" query parameter.
PROXY_URL = "https://api.scraperapi.com"
# Placeholder credential sent as the "api_key" query parameter; replace it
# with your own key before running.
API_KEY = "YOUR_SCRAPERAPI_KEY"

Education platforms use aggressive anti-scraping measures. ScraperAPI handles the JavaScript rendering and bot detection.

Scraping Udemy Course Data

def scrape_udemy_course(course_slug):
    """Fetch a Udemy course page through the proxy and extract headline stats.

    Args:
        course_slug: Path segment identifying the course, as it appears in
            ``https://www.udemy.com/course/<slug>/``.

    Returns:
        A dict with the keys ``platform``, ``title``, ``rating``, ``students``
        and ``url``. ``title`` is ``""`` when no ``<h1>`` is found, ``rating``
        is ``None`` when the rating element is missing or holds no number,
        and ``students`` is ``0`` when no enrollment figure is found.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            proxy answers with an HTTP error status.
    """
    course_url = f"https://www.udemy.com/course/{course_slug}/"
    params = {
        "api_key": API_KEY,
        "url": course_url,
        "render": "true",
    }
    # Rendered fetches can be slow, so the timeout is generous, but without
    # one a stalled connection would block the caller indefinitely.
    response = requests.get(PROXY_URL, params=params, timeout=70)
    # Surface HTTP failures instead of parsing an error page into a result
    # that looks like a course with no title, rating, or students.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    title = soup.select_one("h1")
    rating = soup.select_one("[data-purpose='rating-number']")
    students = soup.select_one("[data-purpose='enrollment']")

    # Take the first number in the rating element so surrounding text
    # cannot make float() raise.
    rating_match = re.search(r"\d+(?:\.\d+)?", rating.text) if rating else None

    return {
        "platform": "udemy",
        "title": title.text.strip() if title else "",
        "rating": float(rating_match.group()) if rating_match else None,
        "students": parse_number(students.text if students else "0"),
        "url": course_url,
    }

def parse_number(text):
    """Return the first integer found in *text*, ignoring thousands commas.

    Args:
        text: Free-form text such as ``"12,345 students"``. ``None`` and the
            empty string are accepted.

    Returns:
        The first run of digits (with any embedded commas removed) as an
        ``int``, or ``0`` when *text* contains no digits.
    """
    # Require the match to start on a digit: a bare "[\d,]+" would match a
    # lone comma first (e.g. in "Hello, 1,234") and int("") would then raise.
    match = re.search(r"\d[\d,]*", text or "")
    return int(match.group().replace(",", "")) if match else 0

Analyzing Learning Outcomes from Reviews

# Maps each outcome label to the phrases whose presence in a review counts
# as evidence of that outcome.
OUTCOME_KEYWORDS = {
    "got_job": ["got a job", "hired", "landed a role", "career change"],
    "built_project": ["built", "created", "deployed", "launched"],
    "skill_gained": ["learned", "understand", "confident"],
    "disappointed": ["waste", "outdated", "poor quality", "refund"]
}

def analyze_outcomes(reviews):
    """Count how many reviews mention each outcome in OUTCOME_KEYWORDS.

    Matching is a case-insensitive substring test. A review contributes at
    most once per outcome, but may count toward several outcomes.

    Args:
        reviews: Sequence of dicts. Each review's text is read from its
            ``"text"`` key; reviews where that key is missing or ``None``
            still count toward the total but match no outcome.

    Returns:
        A dict with ``total_reviews`` (the number of reviews) and
        ``outcomes``, mapping each outcome that matched at least once to
        ``{"count": ..., "pct": ...}``, where ``pct`` is the share of all
        reviews rounded to one decimal place. Outcomes with no matches are
        omitted.
    """
    outcome_counts = Counter()
    for review in reviews:
        # Tolerate reviews with a missing or None "text" rather than raising.
        text = (review.get("text") or "").lower()
        for outcome, keywords in OUTCOME_KEYWORDS.items():
            if any(kw in text for kw in keywords):
                outcome_counts[outcome] += 1

    total = len(reviews)
    return {
        "total_reviews": total,
        # Safe for an empty review list: no counts exist then, so the
        # division below is never evaluated.
        "outcomes": {
            k: {"count": v, "pct": round(v / total * 100, 1)}
            for k, v in outcome_counts.items()
        },
    }

Course Comparison

from statistics import mean

def compare_courses(course_slugs, delay=5):
    """Scrape, score, and rank several Udemy courses.

    For each slug, the course page and three pages of reviews are fetched,
    the reviews are summarized with ``analyze_outcomes``, and the summary and
    a ``value_score`` are attached to the course dict.

    NOTE(review): ``scrape_course_reviews`` is not defined in this section;
    it must be provided elsewhere and, judging by how its result is passed to
    ``analyze_outcomes``, presumably returns a list of review dicts.

    Args:
        course_slugs: Iterable of Udemy course slugs.
        delay: Seconds to pause between consecutive courses. Defaults to 5.

    Returns:
        The course dicts, each extended with ``outcomes`` and
        ``value_score``, sorted by ``value_score`` from highest to lowest.
    """
    # Imported once here instead of on every loop iteration.
    import time

    comparisons = []
    for index, slug in enumerate(course_slugs):
        # Pause only between courses; sleeping after the last one just
        # delays the result.
        if index:
            time.sleep(delay)

        course = scrape_udemy_course(slug)
        reviews = scrape_course_reviews(slug, pages=3)
        outcomes = analyze_outcomes(reviews)

        course["outcomes"] = outcomes
        course["value_score"] = calculate_value(course, outcomes)
        comparisons.append(course)

    return sorted(comparisons, key=lambda x: x["value_score"], reverse=True)

def calculate_value(course, outcomes):
    """Combine a course's rating and review outcomes into one score.

    The score is twice the rating (a missing rating counts as 0), plus half
    the ``got_job`` percentage, minus 0.3 times the ``disappointed``
    percentage, rounded to two decimal places. Outcomes absent from the
    summary contribute 0.
    """
    rating_points = (course["rating"] or 0) * 2
    by_outcome = outcomes["outcomes"]

    def pct_of(label):
        # Percentage recorded for one outcome label, or 0 if it never matched.
        return by_outcome.get(label, {}).get("pct", 0)

    job_bonus = pct_of("got_job") * 0.5
    letdown_penalty = pct_of("disappointed") * 0.3
    return round(rating_points + job_bonus - letdown_penalty, 2)

Infrastructure

ScraperAPI — JavaScript rendering for dynamic course pages
ThorData — residential proxies to browse like a real student
ScrapeOps — track scraping performance across education platforms

Conclusion

Scraping learning outcomes transforms course selection from guesswork into data science. Compare courses on actual student results, not just marketing copy. This pipeline helps learners and EdTech companies make data-driven decisions.