if(!require("dplyr")){
install.packages("dplyr",dependencies = T)
library(dplyr) #For Data Manipulation; Essential 'Pipe' Operator
}
if(!require("readr")){
install.packages("readr",dependencies = T)
library(readr) #Fast and friendly way to read Rectangular Data
}
if(!require("tidyr")){
install.packages("tidyr",dependencies = T)
library(tidyr) #Required for tidying and cleaning data to facilitate analysis
}
if(!require("ggplot2")){
install.packages("ggplot2",dependencies = T)
library(ggplot2) #Used for Visualizations and Graphics
}
if(!require("gbm")){
install.packages("gbm",dependencies = T)
library(gbm) #Used for fitting a Gradient Boost Model
}
if(!require("mice")){
install.packages("mice",dependencies = T)
library(mice) # Multivariate Imputation by Chained Equations. Deals with missing data.
}
if(!require("plotly")){
install.packages("plotly",dependencies = T)
library(plotly) #Create interactive web-based graphs
}
list.data <- read_csv("C:/Users/Swagatam/Desktop/CapitalOne/listings.csv")
zillow.data<-read_csv("C:/Users/Swagatam/Desktop/CapitalOne/Zip_Zhvi_2bedroom.csv")
I am consulting for a real estate company that has a niche in purchasing properties to rent out short-term as part of their business model specifically within New York City. The real estate company has already concluded that two bedroom properties are the most profitable; however, they do not know which zip codes are the best to invest in. My objectives for this analysis would be finding out the Zip Codes and Neighborhoods that would have the best Annual Return Rates based on investments made in the properties and the Revenue from Airbnb. Secondly, I will be analyzing various factors like Room Type, Cancellation Policies, etc. and the relationship with Annual Return Rate and Total Annual Returns.
#Removing Text data based columns from Airbnb Data
#Filtering bedrooms(less than 2). This contains more than 95% of the observations and will be required later for only 2 bedrooms
list.data<-list.data%>%
filter(bedrooms<=2)%>%
select(-c(listing_url,scrape_id,experiences_offered,thumbnail_url,medium_url,
picture_url,xl_picture_url,host_url,host_name,host_about,host_thumbnail_url,
host_picture_url))
#Filtering for New York City. Note that filter has been done through Metro to get more observations. While joining 'New York' city will be used automatically for the Merge
zillow.data<-zillow.data%>%
filter(Metro=='New York')
#Creating the Time Series Data by using 'dplyr' functions
#Average Growth Rate has been considered for every year based on Monthly Growth every year
time.series.data<-zillow.data%>%
select(-c(1,3:7))%>%
group_by(RegionName)%>%
gather(key=YearMonth,value=Price,-RegionName)%>%
arrange(RegionName)%>%
separate(YearMonth, c("Year","Month"),sep="-")%>%
select(-Month)%>%
group_by(RegionName,Year) %>%
dplyr::summarize(Year.Price = mean(Price, na.rm=TRUE))%>%
group_by(RegionName) %>%
mutate(Growth = (Year.Price - lag(Year.Price))/lag(Year.Price))%>%
select(-Year,-Year.Price)%>%
group_by(RegionName) %>%
dplyr::summarize(Avg.Growth.Rate = mean(Growth, na.rm=TRUE))
#Average Growth Rate column is being merged with Zillow Data to create a New Column of Growth Rates
merge.zillow.data<-merge(zillow.data,time.series.data,by="RegionName",all = TRUE)
#Further, New Column for predicted price in June 2019 has been created by squaring the Growth Rate for 2017 and taking its product with the 2017 price
merge.zillow.data1<-merge.zillow.data%>%
mutate(Avg.Growth.Rate=Avg.Growth.Rate+1)%>%
mutate(June.2019.predicted.price=(`2017-06`)*(Avg.Growth.Rate)^2)%>%
select(-c(8:262))
#New Zillow Data is being merged with Airbnb data
final.data<-merge(list.data, merge.zillow.data1, by.x="zipcode", by.y="RegionName",all.x=TRUE)
#Removing redundant columns
final.data<-final.data%>%
select(-c(State,Metro))
#Calculating Total Missing Values in New Price column
sum(!is.na(final.data$June.2019.predicted.price))
## [1] 8429
#Selecting Relevant columns that might influence the Price of a Property
#Important columns like Neighbourhood, Overall Scores/Location scores by previous customers, Zipcode of the location, SizeRank or Population Metric of that area
#Cleaning Done by removing missing values
final.data.model<-final.data%>%
mutate_if(is.character, as.factor)%>%
select(c(neighbourhood_cleansed,neighbourhood_group_cleansed,city,
zipcode,SizeRank,review_scores_location,review_scores_value,June.2019.predicted.price))%>%
filter(!is.na(June.2019.predicted.price))
final.data.model<-unique(final.data.model)
#This is help us find the importantance of predictors and rank them based on Node Purity
model.boost<- gbm(June.2019.predicted.price~., data = final.data.model, distribution = "gaussian", n.trees = 1000, shrinkage = 0.01, interaction.depth = 8)
summary(model.boost)
## var rel.inf
## zipcode zipcode 7.958414e+01
## neighbourhood_cleansed neighbourhood_cleansed 1.397552e+01
## SizeRank SizeRank 6.183416e+00
## city city 1.578422e-01
## neighbourhood_group_cleansed neighbourhood_group_cleansed 9.854045e-02
## review_scores_value review_scores_value 5.179075e-04
## review_scores_location review_scores_location 2.187905e-05
#Converting Character Fields/Columns to Factors using 'mutate_if' function
na.model<-final.data%>%
mutate_if(is.character, as.factor)%>%
select(c(neighbourhood_cleansed,neighbourhood_group_cleansed,city,
zipcode,SizeRank,review_scores_location,review_scores_value,June.2019.predicted.price))
set.seed(123)
miceMod <- mice(na.model, method='cart',m=1,maxit=1) # perform mice imputation, based on CART/Decision Tree Algorithm. Number of iterations here is one
##
## iter imp variable
## 1 1 city zipcode SizeRank review_scores_location review_scores_value June.2019.predicted.price
miceOutput <- complete(miceMod) #New Data is generated with imputed values for missing observations
#Checking for missing values
anyNA(miceOutput)
## [1] FALSE
#Final Append with original Merged Airbnb Data
final.imputed<-cbind(final.data,miceOutput)
#Removing Redundant columns
final.imputed.clean<-final.imputed[-c(89:96)]
#Review Analysis
#Mean Reivew Scores for different Neighbourhoods
final.imputed.clean%>%
select(neighbourhood_group_cleansed,review_scores_value)%>%
group_by(neighbourhood_group_cleansed)%>%
summarise(Mean_Score=mean(review_scores_value,na.rm=T))%>%
arrange(desc(Mean_Score))
## # A tibble: 5 x 2
## neighbourhood_group_cleansed Mean_Score
## <chr> <dbl>
## 1 Staten Island 9.51
## 2 Brooklyn 9.38
## 3 Bronx 9.38
## 4 Queens 9.37
## 5 Manhattan 9.30
#Price Analysis
#Mean Price for different Neighbourhoods
final.imputed.clean%>%
select(neighbourhood_group_cleansed,June.2019.predicted.price)%>%
group_by(neighbourhood_group_cleansed)%>%
summarise(Mean_Price=mean(June.2019.predicted.price,na.rm=T))%>%
arrange(desc(Mean_Price))
## # A tibble: 5 x 2
## neighbourhood_group_cleansed Mean_Price
## <chr> <dbl>
## 1 Manhattan 2281060.
## 2 Brooklyn 1454485.
## 3 Queens 419987.
## 4 Bronx 414516.
## 5 Staten Island 406374.
#Neighbourhood Analyis
#Count of Airbnb hotels/residences in different Neighbourhoods
final.imputed.clean%>%
select(neighbourhood_group_cleansed,zipcode)%>%
group_by(neighbourhood_group_cleansed)%>%
count(zipcode)%>%
group_by(neighbourhood_group_cleansed)%>%
summarise(Count_Airbnb=sum(n))%>%
arrange(desc(Count_Airbnb))
## # A tibble: 5 x 2
## neighbourhood_group_cleansed Count_Airbnb
## <chr> <int>
## 1 Manhattan 18403
## 2 Brooklyn 15831
## 3 Queens 3632
## 4 Bronx 625
## 5 Staten Island 224
#Checking Price Column
class(final.imputed.clean$monthly_price) # It is not of Numeric Type
## [1] "character"
#Converting Price columns to Numeric using apply function
#Removing dollar symbol too
strip_dollars = function(x) {as.numeric(gsub("\\$", "",x)) }
final.imputed.clean[, 49:53] <- sapply(final.imputed.clean[, 49:53], strip_dollars )
#Establishing Criteria for Rating based on Average Review Score Range
#Above 9 will be 'Excellent'
#Between 8 and 9 will be 'Good'
#Below 8 will be 'Needs Improvement'
final.imputed.clean<-final.imputed.clean%>%
mutate(Avg.Review.Score=rowMeans(.[,69:74],na.rm=T))%>%
mutate(Rating=ifelse(Avg.Review.Score>=9 & Avg.Review.Score<=10,"Excellent",
ifelse(Avg.Review.Score>=8 & Avg.Review.Score<9,"Good","Needs Improvement")))
final.imputed.clean$Rating[which(is.na(final.imputed.clean$Rating))]<-"Needs Improvement"
#Occupancy Rates based on Rating and Location conditions
#Generally good Neighbourhoods and well Rated accomodations have better Occupancy Rates
#Here there are 4 different Rates: 0.75, 0.70, 0.65, 0.55
final.imputed.clean<-final.imputed.clean%>%
mutate(OccupancyRate=ifelse(Rating=="Excellent"&
(neighbourhood_group_cleansed=="Manhattan"|
neighbourhood_group_cleansed=="Brooklyn"),0.75,
ifelse(Rating=="Excellent"&
neighbourhood_group_cleansed=="Queens",0.70,ifelse(Rating=="Good"&
(neighbourhood_group_cleansed=="Manhattan"|
neighbourhood_group_cleansed=="Brooklyn"),0.65,0.55))))
#75% Occupancy Rate
sum(final.imputed.clean$OccupancyRate==0.75)
## [1] 22775
#70% Occupancy Rate
sum(final.imputed.clean$OccupancyRate==0.70)
## [1] 2383
#65% Occupancy Rate
sum(final.imputed.clean$OccupancyRate==0.65)
## [1] 2444
#55% Occupancy Rate
sum(final.imputed.clean$OccupancyRate==0.55)
## [1] 11113
#Monthly Rate of Return based on certain factors
#If there is a Monthly Rate given, we consider that and calculate total for the year by multiplying with 12
#If there is no Monthly Rate, we take the daily Rate and multiply by 365 to get total for year
#Product of Total and Occupancy Rate is taken to Calculate Yearly Earnings
#Yearly Return Rate is the Yearly Returns divided by the total invested amount for the property
final.imputed.clean<-final.imputed.clean%>%
mutate(Yearly_Earnings=ifelse(is.na(monthly_price),365*price*OccupancyRate,
12*monthly_price*OccupancyRate))%>%
mutate(Yearly_Return_Rate=Yearly_Earnings/June.2019.predicted.price)
#Considering Important Numerical and Categorical Data for visualizing.
#Eg: Return Rate, Rating Score, Total Annual Returns, Zipcode, etc
viz.data<-final.imputed.clean%>%
select(c(1,18,20,30,31,37:41,42:44,60:63,78:81,87:94))
head(viz.data)
## zipcode host_response_rate host_is_superhost
## 1 07310 100% FALSE
## 2 10000 78% FALSE
## 3 10001 100% FALSE
## 4 10001 60% FALSE
## 5 10001 100% TRUE
## 6 10001 N/A FALSE
## neighbourhood_group_cleansed city latitude longitude
## 1 Manhattan Jersey City 40.72670 -74.01076
## 2 Manhattan New York 40.75980 -73.99374
## 3 Manhattan New York 40.74842 -74.00235
## 4 Manhattan New York 40.75133 -73.99965
## 5 Manhattan New York 40.74402 -73.99272
## 6 Manhattan New York 40.74799 -73.99993
## is_location_exact property_type room_type accommodates bathrooms
## 1 TRUE Apartment Private room 2 1
## 2 FALSE Apartment Private room 2 1
## 3 TRUE Apartment Entire home/apt 1 1
## 4 TRUE Apartment Entire home/apt 2 1
## 5 TRUE Apartment Entire home/apt 4 1
## 6 TRUE Apartment Entire home/apt 4 2
## bedrooms availability_30 availability_60 availability_90
## 1 2 0 0 2
## 2 1 20 47 77
## 3 1 0 0 0
## 4 1 29 59 89
## 5 0 4 10 29
## 6 2 0 0 0
## availability_365 instant_bookable cancellation_policy
## 1 2 FALSE flexible
## 2 346 TRUE moderate
## 3 0 FALSE flexible
## 4 179 FALSE moderate
## 5 101 FALSE strict
## 6 0 FALSE moderate
## require_guest_profile_picture require_guest_phone_verification SizeRank
## 1 FALSE FALSE NA
## 2 FALSE FALSE NA
## 3 FALSE FALSE NA
## 4 FALSE FALSE NA
## 5 FALSE FALSE NA
## 6 FALSE FALSE NA
## Avg.Growth.Rate June.2019.predicted.price Avg.Review.Score
## 1 NA 2278585 NaN
## 2 NA 2278585 NaN
## 3 NA 2278585 NaN
## 4 NA 2278585 10.000000
## 5 NA 2278585 9.833333
## 6 NA 2278585 9.666667
## Rating OccupancyRate Yearly_Earnings Yearly_Return_Rate
## 1 Needs Improvement 0.55 13450.25 0.005902897
## 2 Needs Improvement 0.55 40150.00 0.017620588
## 3 Needs Improvement 0.55 60225.00 0.026430882
## 4 Excellent 0.75 75281.25 0.033038602
## 5 Excellent 0.75 95538.75 0.041928989
## 6 Excellent 0.75 123187.50 0.054063167
viz.data.rating<-viz.data%>%
filter(bedrooms==2)%>%
select(Rating,Yearly_Return_Rate,neighbourhood_group_cleansed)%>%
group_by(neighbourhood_group_cleansed,Rating)%>%
summarise(Mean_Return=mean(Yearly_Return_Rate,na.rm=T))
ggplot(viz.data.rating,aes(x=Rating,y=Mean_Return*100,fill=as.factor(Rating)))+geom_bar(stat="identity")+
facet_wrap(~neighbourhood_group_cleansed)+coord_flip()+
labs(title="Effect of Ratings on Return",
x="Rating",
y="Annual Return Rate(%)",
fill="Rating")
viz.data%>%
filter(bedrooms==2)%>%
select(neighbourhood_group_cleansed,Yearly_Return_Rate)%>%
group_by(neighbourhood_group_cleansed)%>%
summarise(Mean=mean(Yearly_Return_Rate,na.rm=T))%>%
arrange(desc(Mean))
## # A tibble: 5 x 2
## neighbourhood_group_cleansed Mean
## <chr> <dbl>
## 1 Queens 0.0860
## 2 Bronx 0.0591
## 3 Staten Island 0.0568
## 4 Brooklyn 0.0319
## 5 Manhattan 0.0304
#BoxPlot
ggplot(viz.data, aes(x=neighbourhood_group_cleansed, y=Yearly_Earnings/1000, fill=neighbourhood_group_cleansed)) +
geom_boxplot()+scale_y_continuous(limits = c(0, 150))+
labs(title="Box Plot for Yearly Earnings",
x="Neighbourhood",
y="Annual Earnings",
fill="Neighbourhood")
strip_perc = function(x) {as.numeric(sub("%","",x))/100}
viz.data[,2] <- sapply(viz.data[, 2], strip_perc )
viz.data%>%
filter(bedrooms==2)%>%
select(zipcode,host_response_rate,neighbourhood_group_cleansed)%>%
group_by(neighbourhood_group_cleansed,zipcode)%>%
summarise(Mean_Response_Score=mean(host_response_rate,na.rm = T))%>%
arrange(desc(Mean_Response_Score))%>%
top_n(5)
## # A tibble: 41 x 3
## # Groups: neighbourhood_group_cleansed [5]
## neighbourhood_group_cleansed zipcode Mean_Response_Score
## <chr> <chr> <dbl>
## 1 Bronx 10451 1
## 2 Bronx 10462 1
## 3 Bronx 10465 1
## 4 Bronx 10466 1
## 5 Bronx 10469 1
## 6 Bronx 10475 1
## 7 Brooklyn 11204 1
## 8 Brooklyn 11214 1
## 9 Brooklyn 11223 1
## 10 Manhattan 07310 1
## # ... with 31 more rows
#Plot 2.1-Density Plot
theme_set(theme_classic())
g <- ggplot(data=viz.data, aes(Yearly_Return_Rate*100))
g.g<-g + geom_density(aes(fill=factor(neighbourhood_group_cleansed)), alpha=0.8) +
labs(title="Density Plot",
subtitle="Annual Return Rate Grouped by Neighbourhood",
caption="Source: Airbnb Data",
x="Annual Return Rate(%)",
y="Density",
fill="Neighbourhood Area")+scale_x_continuous(limits = c(0, 10))
p <- ggplotly(g.g)
p
#Plot 2.2-Boxplot
ggplot(viz.data, aes(x=neighbourhood_group_cleansed, y=Yearly_Return_Rate*100, fill=neighbourhood_group_cleansed)) +
geom_boxplot()+scale_y_continuous(limits = c(0, 20))+
labs(title="Box Plot",
subtitle="Annual Return Rate Grouped by Neighbourhood",
caption="Source: Airbnb Data",
x="Neighbourhood",
y="Annual Return Rate(%)",
fill="Neighbourhood Area")
#### Analysis/Conclusions:
viz.data%>%
filter(bedrooms==2)%>%
group_by(room_type)%>%
tally()
## # A tibble: 2 x 2
## room_type n
## <chr> <int>
## 1 Entire home/apt 4593
## 2 Private room 301
viz.data%>%
filter(bedrooms==2)%>%
plot_ly(x = ~Yearly_Earnings,color = ~room_type) %>%
add_histogram()%>%
layout(title="Room Type Analysis",
scene = list(
xaxis = list(title = "Yearly Earnings"),
yaxis = list(title = "Count")))
viz.data%>%
filter(bedrooms==2)%>%
ggplot(aes(x=Yearly_Earnings/1000))+geom_area(stat ="bin", color="darkblue",
fill="lightblue")+facet_wrap(~cancellation_policy)+
ggtitle("Cancellation Policy Analysis")+xlab("Yearly Earnings in $1000s") +ylab("Count")
viz.data.1<-viz.data%>%
filter(bedrooms==2)%>%
filter(!is.na(zipcode))%>%
select(zipcode,neighbourhood_group_cleansed,Yearly_Return_Rate)%>%
group_by(zipcode,neighbourhood_group_cleansed)%>%
summarise(Mean_Return_Rate=mean(Yearly_Return_Rate,na.rm=T))%>%
arrange(Mean_Return_Rate)
viz.data.1<-viz.data.1%>%
select(zipcode,neighbourhood_group_cleansed,Mean_Return_Rate)%>%
tail(25)
ggplot(data=viz.data.1,aes(x=reorder(zipcode,-Mean_Return_Rate),y=Mean_Return_Rate*100,fill=neighbourhood_group_cleansed))+
geom_bar(stat="identity")+
#scale_fill_manual(values=c("red", "yellow","dark green","blue","grey"))+
coord_flip()+ggtitle("Top 25 Zipcodes for Best Return")+xlab("Zipcode") +ylab("Average Return Rate(%)") + labs(fill = "Neighbourhood Area")