Students miss out on billions of dollars in unclaimed scholarships every year simply because they don't know the awards exist. Let's build a scholarship matcher that scrapes award databases and matches students to opportunities they qualify for.
The Scholarship Discovery Problem
There are over 1.7 million private scholarships worth $7.4 billion annually. Most students only apply to a handful because finding relevant ones is tedious. A scholarship matcher solves this by aggregating data from hundreds of sources and using NLP to match student profiles to the awards they are most likely to qualify for.
Data Sources
Key scholarship databases to scrape:
- Fastweb, Scholarships.com, Cappex
- College Board Scholarship Search
- State-specific scholarship portals
- Professional association awards
- Community foundation directories
Setting Up
pip install requests beautifulsoup4 pandas scikit-learn schedule
Using ScraperAPI for JavaScript-heavy scholarship sites:
import requests
from bs4 import BeautifulSoup
import json
SCRAPER_KEY = "YOUR_SCRAPERAPI_KEY"


def fetch(url):
    """Fetch ``url`` through ScraperAPI and return the parsed HTML.

    The request asks ScraperAPI to render the page (``render=true``)
    before returning it.

    Args:
        url: Address of the page to retrieve.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the proxy answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or when the
            60 second timeout is exceeded.
    """
    resp = requests.get(
        # HTTPS so the API key in the query string is not sent in clear text.
        "https://api.scraperapi.com",
        params={"api_key": SCRAPER_KEY, "url": url, "render": "true"},
        timeout=60,
    )
    # Without this an error page would be parsed like a normal listing page
    # and the caller would silently see "0 scholarships".
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")
Scraping Scholarship Listings
def scrape_scholarships_com(category, page=1):
    """Scrape one listing page of a Scholarships.com category.

    Args:
        category: Category slug inserted into the listing URL.
        page: Page number inserted into the listing URL (default 1).

    Returns:
        A list of dicts, one per listing that has a name element, with the
        keys ``name``, ``amount``, ``deadline``, ``description``, ``url``
        and ``source``. Missing amount/deadline become ``"Varies"``;
        missing description/url become ``""``.
    """
    from urllib.parse import urljoin

    url = f"https://www.scholarships.com/financial-aid/college-scholarships/scholarships-by-type/{category}/page/{page}"
    soup = fetch(url)
    scholarships = []
    for item in soup.select(".scholarship-item, .award-listing"):
        name = item.select_one(".scholarship-name, h3")
        if not name:
            # A listing without a name cannot be shown to the student.
            continue
        amount = item.select_one(".award-amount, .amount")
        deadline = item.select_one(".deadline")
        desc = item.select_one(".description, .snippet")
        link = item.select_one("a")
        # An <a> without an href attribute must not raise KeyError.
        href = link.get("href", "") if link else ""
        scholarships.append({
            "name": name.text.strip(),
            "amount": amount.text.strip() if amount else "Varies",
            "deadline": deadline.text.strip() if deadline else "Varies",
            "description": desc.text.strip() if desc else "",
            # Resolve relative links against the listing page so the stored
            # URL is usable on its own; absolute links pass through as-is.
            "url": urljoin(url, href) if href else "",
            "source": "scholarships.com",
        })
    return scholarships
# Crawl the first page of each category and pool the results.
categories = [
    "merit-scholarships",
    "need-based-scholarships",
    "minority-scholarships",
    "stem-scholarships",
    "creative-arts-scholarships",
]

all_scholarships = []
for category_slug in categories:
    found = scrape_scholarships_com(category_slug)
    all_scholarships += found
    # Per-category count makes it obvious when a selector stops matching.
    print(f"{category_slug}: {len(found)} scholarships")
Extracting Eligibility Criteria
import re
def extract_eligibility(description):
    """Parse eligibility criteria from free-text description.

    Args:
        description: Scholarship description. Non-string values (``None``,
            or the NaN pandas uses for empty CSV cells) are treated as
            "no information".

    Returns:
        A dict with the keys ``min_gpa`` (float or ``None``), ``max_age``
        (always ``None`` here), ``citizenship``, ``majors``, ``states`` and
        ``demographics`` (lists; only ``citizenship`` and ``majors`` are
        populated by this function).
    """
    criteria = {
        "min_gpa": None,
        "max_age": None,
        "citizenship": [],
        "majors": [],
        "states": [],
        "demographics": []
    }
    if not isinstance(description, str):
        return criteria

    # All keyword checks run against one lower-cased copy, so every needle
    # below must itself be lower case.
    text = description.lower()

    # GPA requirements, e.g. "3.5 GPA" / "3.5 gpa"
    gpa_match = re.search(r"(\d\.\d+)\s*GPA", description, re.IGNORECASE)
    if gpa_match:
        criteria["min_gpa"] = float(gpa_match.group(1))

    # Citizenship: accepts "US citizen" and "U.S. citizen"; the word
    # boundary keeps e.g. "campus citizen" from matching.
    if re.search(r"\bu\.?s\.?\s+citizen", text):
        criteria["citizenship"].append("US")
    if "permanent resident" in text:
        criteria["citizenship"].append("PR")

    # Major keywords (plain substring match)
    major_keywords = (
        "engineering", "computer science", "biology",
        "business", "nursing", "education", "mathematics",
    )
    criteria["majors"] = [major for major in major_keywords if major in text]
    return criteria
Building the Student Profile Matcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
class ScholarshipMatcher:
    """Rank scholarships against a student profile.

    Descriptions are vectorised with TF-IDF once at construction time; each
    call to :meth:`match` scores the student's profile text against them
    with cosine similarity and then drops rows the student is not eligible
    for.
    """

    def __init__(self, scholarships_df):
        """Fit the TF-IDF model on the ``description`` column.

        Args:
            scholarships_df: DataFrame with at least a ``description``
                column. A ``criteria`` column of dicts (with an optional
                ``min_gpa`` key) enables GPA filtering in :meth:`match`.
        """
        self.df = scholarships_df
        self.vectorizer = TfidfVectorizer(
            stop_words="english",
            max_features=5000
        )
        # Empty CSV cells arrive as NaN, which the vectorizer cannot
        # tokenise; treat them as empty descriptions instead.
        descriptions = self.df["description"].fillna("").astype(str)
        self.tfidf_matrix = self.vectorizer.fit_transform(descriptions)

    def match(self, student_profile, top_n=20):
        """Find best matching scholarships for a student.

        Args:
            student_profile: Dict describing the student; see
                :meth:`_profile_to_text` and :meth:`_apply_filters` for the
                keys that are read.
            top_n: Maximum number of rows to return.

        Returns:
            A new DataFrame with a ``match_score`` column, sorted by score
            (highest first) and limited to ``top_n`` eligible rows. The
            frame passed to the constructor is left untouched.
        """
        profile_text = self._profile_to_text(student_profile)
        profile_vec = self.vectorizer.transform([profile_text])
        scores = cosine_similarity(
            profile_vec, self.tfidf_matrix
        ).flatten()
        # assign() builds a new frame, so the caller's DataFrame does not
        # silently grow a column on every call.
        scored = self.df.assign(match_score=scores)
        results = scored.sort_values("match_score", ascending=False)
        # Filter by hard eligibility criteria
        results = self._apply_filters(results, student_profile)
        return results.head(top_n)

    def _profile_to_text(self, profile):
        """Convert student profile to searchable text.

        Joins ``major``, ``interests``, ``background``, ``career_goals``
        and the ``activities`` list with single spaces. Missing or ``None``
        entries contribute an empty string.
        """
        text_fields = ("major", "interests", "background", "career_goals")
        parts = [str(profile.get(field) or "") for field in text_fields]
        parts.append(" ".join(profile.get("activities") or []))
        return " ".join(parts)

    def _apply_filters(self, df, profile):
        """Filter out scholarships student doesn't qualify for.

        Only the minimum-GPA requirement is enforced. A student without a
        ``gpa`` entry is treated as having 4.0. Rows whose criteria are not
        a dict, or that carry no ``min_gpa``, are kept. If ``df`` has no
        ``criteria`` column it is returned unfiltered.
        """
        if "criteria" not in df.columns:
            return df

        student_gpa = profile.get("gpa")
        if student_gpa is None:
            student_gpa = 4.0

        def meets_gpa(criteria):
            min_gpa = criteria.get("min_gpa") if isinstance(criteria, dict) else None
            return min_gpa is None or min_gpa <= student_gpa

        # astype(bool) keeps this a boolean mask even for an empty frame,
        # where apply() would otherwise hand back a non-boolean Series that
        # pandas interprets as a list of column labels.
        mask = df["criteria"].apply(meets_gpa).astype(bool)
        return df[mask]
Usage Example
# Example applicant used to demonstrate the matcher.
student = dict(
    name="Alex Chen",
    gpa=3.8,
    major="computer science",
    interests="artificial intelligence machine learning",
    background="first-generation college student",
    career_goals="AI research in healthcare",
    activities=[
        "robotics club president",
        "hospital volunteer",
        "coding bootcamp mentor",
    ],
    state="CA",
    citizenship="US",
)

# Read the stored listings; the code below needs at least the columns
# "name", "amount" and "description".
df = pd.read_csv("scholarships.csv")
# Derive the structured eligibility dict the matcher filters on.
df["criteria"] = df["description"].apply(extract_eligibility)

matcher = ScholarshipMatcher(df)
matches = matcher.match(student, top_n=15)

print("Top Scholarship Matches:")
for title, award, score in zip(
    matches["name"], matches["amount"], matches["match_score"]
):
    print(f" {title} - {award} ({score:.0%} match)")
Keeping Data Fresh
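Scholarship deadlines change every year. Set up weekly scraping to keep your database current. Use ThorData for rotating proxies across scholarship sites, and ScrapeOps to monitor which scrapers need maintenance. The snippet below assumes you have defined `SCHOLARSHIP_SOURCES` (the list of sources to crawl), `scrape_source` (one scraper call per source), and `update_database` (your storage layer) yourself.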
import schedule
def weekly_refresh():
    """Re-scrape every configured source and store the results.

    Iterates ``SCHOLARSHIP_SOURCES``, calling ``scrape_source`` and then
    ``update_database`` for each entry. Those three names are expected to
    be defined elsewhere; they are not part of this snippet.

    A failure in one source is reported and does not stop the remaining
    sources from being refreshed.
    """
    for source in SCHOLARSHIP_SOURCES:
        try:
            new_data = scrape_source(source)
            update_database(new_data)
            print(f"Updated {source}: {len(new_data)} scholarships")
        except Exception as e:
            # Deliberately broad: this is the top-level boundary of the
            # job, and one broken scraper must not abort the whole run.
            print(f"Failed {source}: {e}")


schedule.every().monday.at("03:00").do(weekly_refresh)

if __name__ == "__main__":
    import time

    # Registering a job only queues it. ``schedule`` runs jobs solely when
    # run_pending() is called, so the script has to keep polling.
    while True:
        schedule.run_pending()
        time.sleep(60)
Conclusion
A scholarship matcher can help students discover thousands of dollars in funding they'd otherwise miss. The combination of web scraping for data collection and NLP for matching creates a powerful tool. Start with 2-3 major databases, build the matching engine, then expand your sources over time.
The $7.4 billion in annual scholarships is there for the taking. The hard part isn't qualifying. It's knowing they exist.
Top comments (0)