DEV Community

Greeshma Pogula
Greeshma Pogula

Posted on

Udemy Problem

Task-1

df = spark.read.format("csv").option("header","true").load(input_path)
df = df.select("course_id", "price", "num_subscribers", "num_reviews", "level",
"content_duration", "published_timestamp", "subject")
df.withColumn("price",df.price.cast("int"))
df.withColumn("num_subscribers",df.num_subscribers.cast("int"))
df.withColumn("num_reviews",df.num_reviews.cast("int"))
df.withColumn("level",df.level.cast("string"))
df.withColumn("content_duration",df.content_duration.cast("float"))
df.withColumn("published_timestamp",df.published_timestamp.cast("timestamp"))
df.withColumn("subject",df.subject.cast("string"))
return df

Task-2

price_df = spark.read.load(transformed_data_path)
price_df =
price_df.groupBy(col("level"),col("subject")).agg(sum("price").alias("Total_price")).select("level","
subject","Total_price")
price_df = price_df.sort(desc("Total_price"))
return price_df

Task-3

sub_df = spark.read.load(transformed_data_path)
sub_df = sub_df.filter(sub_df.level == "Expert Level")
sub_df = sub_df.withColumn("sub_level",when((sub_df.num_subscribers>1000),lit("High")).
when((sub_df.num_subscribers>500)&(sub_df.num_subscribers<1000),lit("Medium")).
when((sub_df.num_subscribers <500),lit("Low")))
sub_df = sub_df.select("course_id","num_subscribers","level","sub_level")
return sub_df

Task-4

years_df = spark.read.load(transformed_data_path)
years_df = years_df.select("course_id","price","level","published_timestamp")
years_df = years_df.filter(years_df.price > 100)
years_df.show(5)
years_df = years_df.filter((years_df.published_timestamp>=("2012-01-01T00:00:00Z")) &
(years_df.published_timestamp<("2015-01-01T00:00:00Z")))
years_df.show(5)
years_df = years_df.sort(desc("price"))
return years_df

Top comments (0)