This was my first build2learn program, where I worked with a team to build a project within a short period of time.
My team members were Selvakumar Duraipadiyan, Susan Mary, and me.
My part of the project was to scrape the website content using Python and turn the scraped content into a dataset.
Our team head, Selvakumar Duraipadiyan, has 10 years of experience in the IT field. He explained the project details in a very understandable way, and we supported him in completing the project.
Venue: Entrans Technologies Private Limited
*Website we used for scraping the content:*
http://www.sangathamizh.com/18keezh-kanakku/18keezh-kanakku-innanatpathu-%E0%AE%87%E0%AE%A9%E0%AF%8D%E0%AE%A9%E0%AE%BE%E0%AE%A8%E0%AE%BE%E0%AE%B1%E0%AF%8D%E0%AE%AA%E0%AE%A4%E0%AF%81.html
Python scraping program:
import requests
from bs4 import BeautifulSoup
import json
url = "http://www.sangathamizh.com/10paddu/10paddu-thirumurugatrupadai-%E0%AE%A4%E0%AE%BF%E0%AE%B0%E0%AF%81%E0%AE%AE%E0%AF%81%E0%AE%B0%E0%AF%81%E0%AE%95%E0%AE%BE%E0%AE%B1%E0%AF%8D%E0%AE%B1%E0%AF%81%E0%AE%AA%E0%AF%8D%E0%AE%AA%E0%AE%9F%E0%AF%88.html"
response = requests.get(url)
response.encoding = "utf-8" # ensure Tamil text is read properly
soup = BeautifulSoup(response.text, "html.parser")
data = []
# loop through all centerContent divs
for div in soup.select("div#centerSection div#centerContent"):
    # title: text of the sub-header div, with any <a> links removed
    sub_header = div.select_one("div#sub-header")
    if not sub_header:
        continue
    for a in sub_header.find_all("a"):
        a.decompose()
    title = sub_header.get_text(strip=True)

    # content: the verse text inside div#p1, joined with spaces
    content_div = div.select_one("div#p1")
    if content_div:
        content = content_div.get_text(" ", strip=True)
    else:
        content = ""

    # explanation: the <p> that follows the "பொருளுரை:" heading
    explanation = ""
    porulurai_h4 = div.find("h4", string="பொருளுரை:")
    if porulurai_h4:
        p_tag = porulurai_h4.find_next("p")
        if p_tag:
            explanation = p_tag.get_text(" ", strip=True)

    if title or content or explanation:
        data.append({
            "title": title,
            "content": content,
            "explanation": explanation
        })

# save to json
with open("output.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("Scraping complete! Data saved to output.json")
Program to convert the scraped JSON into a JSONL dataset:
import json
# Input and output file names
input_file = "output.json" # Replace with your JSON file
output_file = "thirumurukaatrupadai_modified.jsonl"
# Load JSON data
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Write to JSONL format
with open(output_file, "w", encoding="utf-8") as f_out:
    # If the JSON is a list of objects
    if isinstance(data, list):
        for entry in data:
            f_out.write(json.dumps(entry, ensure_ascii=False) + "\n")
    # If the JSON is a dictionary with lists inside
    elif isinstance(data, dict):
        for key, value in data.items():
            f_out.write(json.dumps({key: value}, ensure_ascii=False) + "\n")
print(f"Converted {input_file} to {output_file}")
Program to push the dataset to a dataset hosting site:
To store the dataset, we used Hugging Face.
from datasets import Dataset
from huggingface_hub import login
# 🔑 Login (safer to run `huggingface-cli login` once in terminal, then skip token here)
login("") # or just login() if you already did CLI login
# Load your JSONL file into a Hugging Face Dataset
dataset = Dataset.from_json("thirumurukaatrupadai_modified.jsonl")
# Push to Hub
dataset.push_to_hub(
    repo_id="selvakumarramesh/Thirumurukaatrupadai",
    private=True  # change to False if you want public
)
print("✅ Dataset pushed to Hugging Face Hub!")
Images for memories: