import geojson
import geopandas
import pandas as pd
import numpy as np
import statistics as st
import matplotlib as plt
import folium
from folium.features import CustomIcon
import csv
from IPython.display import display, HTML
from pathlib import Path
import os
from datetime import date, time, datetime
from dateutil import tz #for dealing with timezone
from geopy import distance
from collections import defaultdict
from django.contrib.gis.geos import GEOSGeometry, Point
import math
from django.db import models
from django.contrib.gis.db import models
Step 1.) EXTRACT Download the raw csv file from Inside Airbnb and load them into a dataframe
def load_files(ia):
#load and create dataframes
raw_df_02_22 = pd.read_csv(ia)
print(raw_df_02_22.head())
#write to file so we can work on it in next function
raw_df_02_22.to_csv("../suwa_maps/airbnb_jupyter/raw_file.csv")
return(raw_df_02_22)
raw_df_02_22 = load_files("../suwa_maps/airbnb_jupyter/listings_feb_2022.csv")
*Step 2.) TRANSFORM Run the processing script on the raw file dataframe to clean and create new variables
*
`def clean_file(raw_df, raw_file):
#clean file
#drop columns
raw_df.drop(columns=["scrape_id", "bathrooms_text","host_verifications", "host_neighbourhood","property_type", "amenities", "host_verifications", "neighbourhood", "host_listings_count", "picture_url","host_url", "description", "neighborhood_overview", "host_about", "host_response_time", "host_response_rate", 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_has_profile_pic', 'host_identity_verified', 'calendar_updated', 'has_availability', 'calendar_last_scraped', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'neighbourhood_group_cleansed', 'beds', 'maximum_nights_avg_ntm', 'bathrooms', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms'], inplace=True)
raw_df.to_csv("../suwa_maps/airbnb_jupyter/clean_file.csv")
#adjust price column
raw_df['price'] = raw_df['price'].str.replace("$", "").str.replace(',', '')
raw_df['price'] = pd.to_numeric(raw_df['price'])
cleaned_df = raw_df
# print(cleaned_df.head)
cleaned_df.to_csv("../suwa_maps/airbnb_jupyter/cleaned_file.csv")
return(cleaned_df)
cleaned_df_02_22 = clean_file(raw_df_02_22, "../suwa_maps/airbnb_jupyter/raw_file.csv")`
`def new_variables(cleaned_file_input, file_output, cleaned_dataframe):
#convert categorical to numeric
all = []
with open(cleaned_file_input, 'r') as file:
with open(file_output, 'w', encoding="utf-8") as newfile:
csvwriter = csv.writer(newfile,lineterminator='\n')
csvreader = csv.reader(file)
#extract headers
header = next(csvreader)
new_headers = ['days_rented_ltm', 'rounded_revenue_ltm', 'occupancy_rate_approx', 'is_hotel', 'is_entire', 'many_listings', 'only_1_listing', 'only_2_listings', 'host_florence', 'has_liscense', 'is_instant_bookable', 'dist_duomo', 'buffer_zone', 'is_centro', 'is_gavinana', 'is_isolotto', 'is_rifredi', 'is_campo', 'listing_revenue_exceed_LTR', 'effected_by_policy_1', 'effected_by_policy_2', 'effected_by_policy_3', 'commercial', 'very_likely_commercial', 'tourist_tax', 'unpaid_tourist_tax', 'geom']
for item in new_headers:
header.append(item)
all.append(header)
for row in csvreader:
min_night_avg = float(row[17]) #minimum_nights_avg_ntm
name_of_listing = row[4]
#filter out long term and inactive listings len(row[24]) > 0 is saying reviews for last 30days are not null, buthow about if reviews for last year are not 0?
if int(row[23]) > 0 and min_night_avg < 29:
listing_row_id = int(row[0])
listing_id = int(row[1])
#year and month of data scraped
date_scraped = row[3]
year = date_scraped.split('-')[0]
month = date_scraped.split('-')[1]
day = date_scraped.split('-')[2]
#dist_duomo lat, long
duomo_coords = (43.7731, 11.2560)
#listing_coords = lat, long
listing_coords = (row[11], row[12]) # ia_df_clean['latitude'], ia_df_clean['longitude']
##dist_duomo = duomo_coords - listing_coords
dist_duomo = (distance.distance(duomo_coords, listing_coords).m)
#buffer zone circle 5
# buffer_zone = 0
if dist_duomo <= 1500:
buffer_zone = 1
elif dist_duomo > 1500 and dist_duomo <= 3000:
buffer_zone = 2
elif dist_duomo > 3000 and dist_duomo <= 4500:
buffer_zone = 3
elif dist_duomo > 4500 and dist_duomo <= 6000:
buffer_zone = 4
elif dist_duomo > 6000 and dist_duomo <= 7500:
buffer_zone = 5
# geom = ((row[11], row[12]), srid=4326)
# pnt = GEOSGeometry('POINT(5 23)') # WKT
# gedjango is long, lat
coords_wkt = (round(float(row[12]),4) , round(float(row[11]),4) )
coords_wkt = Point(coords_wkt)
geom = coords_wkt
#policy considerations
#has_liscense
if len(row[25]) > 0:
has_liscense = 1
else:
has_liscense = 0
#is_instant_bookable
if row[26] == 't':
is_instant_bookable = 1
else:
is_instant_bookable = 0
#is in florence host
if ("Florence" or "Firenze") in row[8]: #row[8] ia_df_clean['host_location']
host_florence = 1
else:
host_florence = 0
#print(host_florence)
#to calculate rent burden from statistica averages, refine
centro_neighborhood_rent = 1238.70 * 12
campo_neighborhood_rent = 1078.00 * 12
rif_neighborhood_rent = 954.80 * 12
iso_neighborhood_rent = 939.40 * 12
gav_neighborhood_rent = 1070.30 * 12
#which neighborhoods
#centro
if "Centro" in row[10]: #row[10] ia_df_clean['neighbourhood_cleansed']
is_centro = 1
else:
is_centro = 0
#campo
if "Campo" in row[10]:
is_campo = 1
else:
is_campo = 0
#rifredi
if "Rifredi" in row[10]:
is_rifredi = 1
else:
is_rifredi = 0
#isolotto
if "Isolotto" in row[10]:
is_isolotto = 1
else:
is_isolotto = 0
#gavinana
if "Gavinana" in row[10]:
is_gavinana = 1
else:
is_gavinana = 0
#occupancy rate from SF model
#did reviews a year divided by how likely someone gives a review (airbnb data) * avg stay in florence (2.6)
review_rate = .50
avg_length = 3.2
if len(row[24]) > 0: #if number of reviews in last month more than 0 then it's active or there was an error
# print(row[23])
num_reviews = float(row[23]) #number_of_reviews_ltm
#test here for long term
if min_night_avg > 29:
print("min_night_avg > 29: is {} on row {}".format(min_night_avg,listing_row_id))
elif min_night_avg > avg_length:
avg_days = min_night_avg
else:
avg_days = avg_length
num_bookings = review_rate * num_reviews
days_rented_ltm = num_bookings * avg_days
#test for days
#adjusting to make occupancy rate 70 to be conservative according to Murray's suggestions
if days_rented_ltm > 255:
#print("days_rented_ltm > 365: is {} on row {}".format(days_rented_ltm,listing_row_id))
days_rented_ltm = 255
tourist_tax = (days_rented_ltm * 3) *2 #3 euro which is lower end, 3 start hotel is 3.80 euro and 2 persons conservative
occupancy_rate_approx = (days_rented_ltm/365) * 100
if occupancy_rate_approx > 70.00:
print("occupancy_rate_approx > 70: is {} on row {}".format(occupancy_rate_approx,listing_row_id))
occupancy_rate_approx = 70.00
#revenue #have to strip the string for price then turn into float
price_stripped_dollar = row[16].strip('$') #price
price_replaced_comma = price_stripped_dollar.replace(',','')
new_price = float(price_replaced_comma)
if new_price < 1.0:
print("new_price is basically zero: is {} on row {}".format(new_price,listing_row_id))
#yearly revenue
revenue_ltm = days_rented_ltm * new_price
rounded_revenue_ltm = round(revenue_ltm, 2)
if rounded_revenue_ltm >0.0 and rounded_revenue_ltm < new_price:
print("rounded_revenue_ltm < {}: and is {} on row {}".format(new_price, rounded_revenue_ltm,listing_row_id))
# print(rounded_revenue_ltm)
else:
days_rented_ltm = 'n/a'
rounded_revenue_ltm = 'n/a'
# listing_revenue_exceed_LTR = 'n/a'
if is_centro == 1:
if rounded_revenue_ltm == 'n/a':
listing_revenue_exceed_LTR = 'n/a'
elif rounded_revenue_ltm > centro_neighborhood_rent:
listing_revenue_exceed_LTR = 1
else:
listing_revenue_exceed_LTR = 0
elif is_gavinana == 1:
if rounded_revenue_ltm == 'n/a':
listing_revenue_exceed_LTR = 'n/a'
elif rounded_revenue_ltm > gav_neighborhood_rent:
listing_revenue_exceed_LTR = 1
else:
listing_revenue_exceed_LTR = 0
elif is_isolotto == 1:
if rounded_revenue_ltm == 'n/a':
listing_revenue_exceed_LTR = 'n/a'
elif rounded_revenue_ltm > iso_neighborhood_rent:
listing_revenue_exceed_LTR = 1
else:
listing_revenue_exceed_LTR = 0
elif is_rifredi ==1:
if rounded_revenue_ltm == 'n/a':
listing_revenue_exceed_LTR = 'n/a'
elif rounded_revenue_ltm > rif_neighborhood_rent:
listing_revenue_exceed_LTR = 1
else:
listing_revenue_exceed_LTR = 0
elif is_campo == 1:
if rounded_revenue_ltm == 'n/a':
listing_revenue_exceed_LTR = 'n/a'
elif rounded_revenue_ltm > campo_neighborhood_rent:
listing_revenue_exceed_LTR = 1
else:
listing_revenue_exceed_LTR = 0
# listing_revenue_exceed_LTR = 'n/a'
#many_listings
# print(type(row[9]))
try:
if float(row[9]) > 2: # ia_df_clean['host_total_listings_count']
many_listings = 1
else:
many_listings = 0
#only_1 success
if float((row[9])) == 1:
only_1_listing = 1
else:
only_1_listing = 0
#only_2 success
if float((row[9])) == 2:
only_2_listings = 1
else:
only_2_listings = 0
except ValueError:
many_listings = 'na'
only_1_listing = 'na'
only_2_listings = 'na'
#is_entire success
if row[13] == 'Entire home/apt': # ia_df_clean['room_type']
is_entire = 1
else:
is_entire = 0
#is_hotel success
if row[13] == 'Hotel room':
is_hotel = 1
else:
is_hotel = 0
#commercial
if cleaned_dataframe.loc[listing_row_id, 'host_total_listings_count'] > 4:
commercial = 1
elif host_florence == 0:
commercial = 1
else:
commercial = 0
#very_likely_commercial
if cleaned_dataframe.loc[listing_row_id, 'host_total_listings_count'] > 2 and cleaned_dataframe.loc[listing_row_id, 'instant_bookable'] == 't' and cleaned_dataframe.loc[listing_row_id, 'room_type']== 'Entire home/apt':
very_likely_commercial = 1
else:
very_likely_commercial = 0
#likely unpaid touist tax for non liscenced, tourist tax was calculated by (3*days rented ltm)
if has_liscense == 0:
unpaid_tourist_tax = tourist_tax
#effected_by_policy_x (1-liscense,2- -is entire, 3-2 listing max, 4-day limit 90) #switched 2 and 4 from origingal
if has_liscense == 0:
effected_by_policy_1 = 1
else:
effected_by_policy_1 = 0
if is_entire == 1:
effected_by_policy_2 = 1
else:
effected_by_policy_2 = 0
#more than 2 listings means many
if many_listings == 1:
effected_by_policy_3 = 1
else:
effected_by_policy_3 = 0
new_variables = [days_rented_ltm, rounded_revenue_ltm, occupancy_rate_approx, is_hotel, is_entire, many_listings, only_1_listing, only_2_listings, host_florence, has_liscense, is_instant_bookable, dist_duomo, buffer_zone, is_centro, is_gavinana, is_isolotto, is_rifredi, is_campo, listing_revenue_exceed_LTR, effected_by_policy_1, effected_by_policy_2, effected_by_policy_3, commercial, very_likely_commercial, tourist_tax, unpaid_tourist_tax, geom]
for item in new_variables:
row.append(item)
#adding entire row to all
all.append(row)
else:
# print("either inactive or long term listing so removing {} from file".format(name_of_listing))
pass
#writing all rows to new out file
csvwriter.writerows(all)
return csvwriter
new_var_df = new_variables('../suwa_maps/airbnb_jupyter/cleaned_file.csv', '../suwa_maps/airbnb_jupyter/new_variables_file.csv', cleaned_df_02_22)`
*Step 3.) LOAD Migrate the csv data into postgis spatial database. It was very important that we made a GEOM point column from the lat long attributes in the last step. The next code block won't run since environment variables need to be set up but I provided it to show how the database schema is set up to store the airbnb listing data we just processed. Then run python manage.py migrate
and python manage.py check
and finally the script to load our data into the database table we just created with the django migration function. *
`#models.py file
This is an auto-generated Django model module created by ogrinspect.
class AirbnbListings(models.Model): #issues here ...
Regular Django fields corresponding to the attributes in the IA csv file
id = models.IntegerField(primary_key=True)
listing_url = models.CharField(max_length=1000)
last_scraped = models.DateField(max_length=1000)
name = models.CharField(max_length=1000)
host_id = models.IntegerField()
host_name = models.CharField(max_length=1000)
host_since = models.CharField(max_length=1000)
host_location = models.CharField(max_length=1000)
host_total_listings_count = models.IntegerField()
neighbourhood_cleansed = models.CharField(max_length=1000)
latitude = models.FloatField()
longitude = models.FloatField()
room_type = models.CharField(max_length=1000)
accommodates = models.IntegerField()
bedrooms = models.IntegerField()
price = models.FloatField()
minimum_nights_avg_ntm = models.FloatField()
availability_30 = models.IntegerField()
availability_60 = models.IntegerField()
availability_90 = models.IntegerField()
availability_365 = models.IntegerField()
number_of_reviews = models.IntegerField()
number_of_reviews_ltm = models.IntegerField()
number_of_reviews_l30d = models.IntegerField()
license = models.CharField(max_length=1000)
instant_bookable = models.CharField(max_length=1000)
calculated_host_listings_count = models.IntegerField()
reviews_per_month = models.FloatField()
days_rented_ltm = models.FloatField()
rounded_revenue_ltm = models.FloatField()
occupancy_rate_approx = models.FloatField()
is_hotel = models.IntegerField()
is_entire = models.IntegerField()
many_listings = models.IntegerField()
only_1_listing = models.IntegerField()
only_2_listings = models.IntegerField()
host_florence = models.IntegerField()
has_liscense = models.IntegerField()
is_instant_bookable = models.IntegerField()
dist_duomo = models.FloatField()
is_centro = models.IntegerField()
is_gavinana = models.IntegerField()
is_isolotto = models.IntegerField()
is_rifredi = models.IntegerField()
is_campo = models.IntegerField()
listing_revenue_exceed_LTR = models.IntegerField()
effected_by_policy_1 = models.IntegerField()
effected_by_policy_2 = models.IntegerField()
effected_by_policy_3 = models.IntegerField()
commercial = models.IntegerField()
# editing to see if this is the problem JULY not so putting back
geom = models.PointField(verbose_name='geo',srid = 4326) # # GeoDjango-specific: a geometry field (MultiPolygonField)
#could be points, line, polygon, multipolygon, raster etc
# Returns the string representation of the model. maybe add this because it seems to be in docs.
# def __str__(self):
# return self.name
Auto-generated LayerMapping
dictionary for AirbnbListings model
augmented_airbnblistings_mapping = {
'id': 'id',
'listing_url': 'listing_url',
'last_scraped': 'last_scraped',
'name': 'name',
'host_id': 'host_id',
'host_name': 'host_name',
'host_since': 'host_since',
'host_location': 'host_location',
'host_total_listings_count': 'host_total_listings_count',
'neighbourhood_cleansed': 'neighbourhood_cleansed',
'latitude': 'latitude',
'longitude': 'longitude',
'room_type': 'room_type',
'accommodates': 'accommodates',
'bedrooms': 'bedrooms',
'price': 'price',
'minimum_nights_avg_ntm': 'minimum_nights_avg_ntm',
'availability_30': 'availability_30',
'availability_60': 'availability_60',
'availability_90': 'availability_90',
'availability_365': 'availability_365',
'number_of_reviews':'number_of_reviews',
'number_of_reviews_ltm': 'number_of_reviews_ltm',
'number_of_reviews_l30d': 'number_of_reviews_l30d',
'license': 'license',
'instant_bookable': 'instant_bookable',
'calculated_host_listings_count': 'calculated_host_listings_count',
'reviews_per_month': 'reviews_per_month',
'days_rented_ltm':'days_rented_ltm',
'rounded_revenue_ltm':'rounded_revenue_ltm',
'occupancy_rate_approx':'occupancy_rate_approx',
'is_hotel':'is_hotel',
'is_entire':'is_entire',
'many_listings':'many_listings',
'only_1_listing':'only_1_listing',
'only_2_listings':'only_2_listings',
'host_florence':'host_florence',
'has_liscense':'has_liscense',
'is_instant_bookable':'is_instant_bookable',
'dist_duomo':'dist_duomo',
'is_centro':'is_centro',
'is_gavinana':'is_gavinana',
'is_isolotto':'is_isolotto',
'is_rifredi':'is_rifredi',
'is_campo':'is_campo',
'listing_revenue_exceed_LTR':'listing_revenue_exceed_LTR',
'effected_by_policy_1':'effected_by_policy_1',
'effected_by_policy_2':'effected_by_policy_2',
'effected_by_policy_3':'effected_by_policy_3',
'commercial':'commercial',
'geom': ['latitude', 'longitude'],
}`
`#script to load up data in database adapted from https://towardsdatascience.com/use-python-scripts-to-insert-csv-data-into-django-databases-72eee7c6a433
def load_csv_db():
with open('"../suwa_maps/airbnb_jupyter/cleaned_file.csv"') as file:
reader = csv.reader(file)
next(reader) # Advance past the header
AirbnbListings.objects.all().delete()
for row in reader:
print(row)
listing = AirbnbListings(
id=row[0],
listing_url=row[1],
last_scraped=row[2],
name=row[3],
host_id=row[4],
host_name=row[5],
host_since=row[6],
host_location=row[7],
host_total_listings_count=row[8],
neighbourhood_cleansed=row[9],
latitude=row[10],
longitude=row[11],
room_type=row[12],
accommodates=row[13],
bedrooms=row[14],
price=row[15],
minimum_nights_avg_ntm=row[16],
availability_30=row[17],
availability_60=row[18],
availability_90=row[19],
availability_365=row[20],
number_of_reviews=row[21],
number_of_reviews_ltm=row[22],
number_of_reviews_l30d=row[23],
license=row[24],
instant_bookable=row[25],
calculated_host_listings_count=row[26],
reviews_per_month=row[27],
days_rented_ltm=row[28],
rounded_revenue_ltm=row[29],
occupancy_rate_approx=row[30],
is_hotel=row[31],
is_entire=row[32],
many_listings =row[33],
only_1_listing=row[34],
only_2_listings=row[35],
host_florence=row[36],
has_liscense=row[37],
is_instant_bookable=row[38],
dist_duomo=row[39],
is_centro=row[40],
is_gavinana=row[41],
is_isolotto=row[42],
is_rifredi=row[43],
is_campo=row[44],
listing_revenue_exceed_LTR=row[45],
effected_by_policy_1=row[46],
effected_by_policy_2=row[47],
effected_by_policy_3=row[48],
commercial=row[49],
geom=row[50],
)
listing.save()`
NOTE: These next steps normally would call the data from the database
Step 4.) EXPLORE with summary statistics in pandas
`new_variables_df = pd.read_csv('../suwa_maps/airbnb_jupyter/new_variables_file.csv')
quant_labels = ['calculated_host_listings_count','accommodates', 'bedrooms', 'days_rented_ltm', 'rounded_revenue_ltm', 'occupancy_rate_approx', 'tourist_tax', 'unpaid_tourist_tax']
df_quant = df.loc[:, quant_labels]
qual_labels = ['id', 'listing_url', 'name', 'host_id', 'host_name', 'neighbourhood_cleansed', 'host_location', 'room_type', 'license', 'instant_bookable', 'is_hotel', 'is_entire', 'many_listings', 'only_1_listing', 'only_2_listings', 'host_florence', 'has_liscense', 'is_instant_bookable', 'effected_by_policy_1', 'effected_by_policy_2', 'effected_by_policy_3', 'commercial', 'very_likely_commercial',]
df_qual = df.loc[:, qual_labels].astype("category")
continuous_quant_labels = ['host_since', 'latitude', 'longitude', 'price', 'dist_duomo']
df_continuous = df.loc[:, continuous_quant_labels]`
df_quant.describe().transpose()
df_qual.describe().transpose()
pivot_discrete_neighbourhood = pd.pivot_table(data=new_variables_df, index=["neighbourhood_cleansed"],columns=["room_type"], values='host_total_listings_count', aggfunc='mean')
print(pivot_discrete_neighbourhood)
`def original_airbnb_map(mapdf, filetitle):
#create bubble map
bub_map = folium.Map(location=[mapdf.latitude.mean(),mapdf.longitude.mean()], zoom_start=16, tiles='CartoDB Positron', control_scale=True)
folium.raster_layers.TileLayer('Stamen Toner').add_to(bub_map)
folium.raster_layers.TileLayer('Stamen Terrain').add_to(bub_map)
folium.raster_layers.TileLayer('CartoDB Positron').add_to(bub_map)
folium.LayerControl().add_to(bub_map)
for index, location_info in mapdf.iterrows():
# popup = [str(location_info["host_name"]), "http://www.airbnb.com/users/show/" + str(location_info["host_id"]), str(location_info["license"]), str(location_info["host_total_listings_count"]), str(location_info["global_total_listings"]), str(location_info["listing_url"]), str(location_info["id"]), str(location_info["name"]), str(location_info["neighbourhood_cleansed"]), str(location_info["room_type"]), str(location_info["bedrooms"]), str(location_info["rounded_revenue"]), str((location_info["rounded_revenue"])/12), str(location_info["price"]), str(location_info["days_rented"]),str(location_info["reviews_per_month"])]
popup = 'tbd'
tooltip = 'tbd'
folium.Circle([location_info["latitude"],location_info["longitude"]], radius=2, color="blue", opacity=.2, fill=True, fill_opacity=0.1, fill_color ="blue", popup=popup, tooltip=tooltip).add_to(bub_map)
fill_opacity=0.5,
bub_map.save('Out_Map/' + filetitle + '.html')
pass
return bub_map
original_airbnb_map(mapdf, datadf, tileinfo)
bub_map = original_airbnb_map(cleaned_df_02_22, 'original_airbnb_map')
bub_map`
Top comments (0)