I chose this data set based on the dream of a future vacation to Hawaii. I was hoping to glean insight into peak periods to visit and popular islands.
The primary data file consists of AirBnB listings, each with several features and aggregate review statistics. A separate file containing the individual reviews and the date each was submitted was used to support trends over time.
Inspections and summarizations of the data were performed to support the visualizations and analysis, and are contained in this section's source code.
This data was retrieved from: https://www.dropbox.com/sh/mtenj2oa98sepw2/AADsnuJfyv9-GQNgFKnwUDota/Airbnb%20Listings%20(pick%20any%20one%20city%20unless%20comparing%20two)/Hawaii?dl=0&subfolder_nav_tracking=1
The data was last modified on: Jan 6, 2022
# {r, fig.width=9, fig.height=9, warning=FALSE, echo=FALSE}
library(data.table)
library(lubridate)
library(ggplot2)
library(scales)
library(plyr)
library(dplyr)
library(leaflet)
library(ggthemes)
library(RColorBrewer)
library(plotly)
# NOTE(review): setwd() ties the script to one machine's directory layout.
# It is kept so the relative paths below resolve exactly as before.
setwd("C:/Users/mishabella/OneDrive/Documents/")

### Filenames per location (relative to the working directory set above)
listings_hi <- "HI_airbnb/listings.csv"
neighbourhoods_hi <- "HI_airbnb/neighbourhoods.csv"
neighbourhoods_json_hi <- "HI_airbnb/neighbourhoods.geojson"
reviews_hi <- "HI_airbnb/reviews.csv"

### Dataframes per file
# The missing-value markers are spelled as literal strings. The previous
# c(NA, "") passed a logical NA and relied on implicit coercion to character.
df_listings <- fread(listings_hi, na.strings = c("NA", ""))
df_neighbourhoods <- fread(neighbourhoods_hi, na.strings = c("NA", ""))
df_reviews <- fread(reviews_hi, na.strings = c("NA", ""))
### generate month, year columns for df_listings
# Parse the date column once and reuse the parsed vector, instead of calling
# ymd() again for every derived column.
listing_last_review <- ymd(df_listings$last_review)
df_listings$year <- year(listing_last_review)
df_listings$month <- month(listing_last_review)

### generate day, month, year columns for df_reviews
review_dates <- ymd(df_reviews$date)
df_reviews$day <- day(review_dates)
df_reviews$month <- month(review_dates)
df_reviews$year <- year(review_dates)
#unique(df_reviews$month)
#length(unique(df_reviews$month)) # see how many bins we need
#unique(df_reviews$year)
#length(unique(df_reviews$year)) # see how many bins we need
#table(df_reviews$year)
# Summarise the number of reviews per season.
# Result: a data.frame with one row per season and columns `season` and
# `totreviews` (the number of review rows whose month falls in that season).
season_tot <- df_reviews %>%
  # Rows whose month is NA (missing or unparseable date) are dropped here.
  # The previous catch-all branch would have counted them as "Summer".
  filter(!is.na(month)) %>%
  mutate(
    season = case_when(
      month %in% c(12, 1, 2) ~ "Winter",
      month %in% c(3, 4, 5) ~ "Spring",
      month %in% c(6, 7, 8) ~ "Summer",
      month %in% c(9, 10, 11) ~ "Fall"
    )
  ) %>%
  group_by(season) %>%
  summarise(totreviews = n()) %>%
  data.frame()
# Count listings for every room type / neighbourhood group combination.
# The "Honolulu" group is relabelled "Oahu" in `island_group`, and
# `percent_of_total` is each count's share (in %) within its room type.
islands_df <- df_listings %>%
  group_by(room_type, neighbourhood_group) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(room_type) %>%
  mutate(
    island_group = case_when(
      neighbourhood_group == "Honolulu" ~ "Oahu",
      TRUE ~ neighbourhood_group
    ),
    percent_of_total = round(100 * n / sum(n), 1)
  ) %>%
  ungroup() %>%
  data.frame()
# Number of reviews submitted on each date: one row per distinct `date`,
# with the row count in `n`.
date_df <- df_reviews %>%
  group_by(date) %>%
  summarise(n = n(), .groups = "drop") %>%
  data.frame()
# For every room type / neighbourhood group pair, keep the single listing
# with the largest number_of_reviews (first one wins on ties).
df_topReviews <- data.frame(
  df_listings %>%
    group_by(room_type, neighbourhood_group) %>%
    slice(which.max(number_of_reviews))
)
# Show the columns available in each table (rendered output kept below).
print(names(df_listings))
## [1] "id" "name"
## [3] "host_id" "host_name"
## [5] "neighbourhood_group" "neighbourhood"
## [7] "latitude" "longitude"
## [9] "room_type" "price"
## [11] "minimum_nights" "number_of_reviews"
## [13] "last_review" "reviews_per_month"
## [15] "calculated_host_listings_count" "availability_365"
## [17] "number_of_reviews_ltm" "license"
## [19] "year" "month"
print(names(df_reviews))
## [1] "listing_id" "date" "day" "month" "year"
This chart was built to show the growth trend of AirBnB over the years and, specifically, to see whether an impact from COVID-19 would be evident. The findings show a significant impact on AirBnB travel in 2020.
# Scatter plot of the daily review count against the review date, with the
# title and subtitle centred.
ggplot(data = date_df, mapping = aes(x = date, y = n)) +
  geom_point(colour = "darkorchid4") +
  theme_bw(base_size = 12) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  labs(
    x = "Date",
    y = "Number of Reviews per Day",
    title = "Total Reviews Over Time",
    subtitle = "Inspecting impact of Covid-19"
  )
# Font shared by the title and labels of the donut chart.
font_set <- list(
  family = "Times New Roman, monospace",
  size = 14,
  color = "RebeccaPurple"
)

# Donut chart of listing counts per island, with the overall listing total
# shown as an annotation.
plot_ly(
  islands_df,
  labels = ~island_group,
  values = ~n,
  texttemplate = "%{label}: <br> (%{percent})"
) %>%
  add_pie(hole = 0.6) %>%
  layout(title = "Hawaii AirBnB Listings per Island", font = font_set) %>%
  layout(
    annotations = list(
      text = paste0(
        "Total Listings: \n",
        scales::comma(sum(islands_df$n))
      ),
      # Spelled out as FALSE: the `F` alias is an ordinary variable that can
      # be reassigned.
      showarrow = FALSE
    ),
    showlegend = TRUE
  )
# Font used by the room-type pie grid.
font_set2 <- list(
  family = "Times New Roman, monospace",
  size = 12
)

# texttemplate="%{label}: %{percent}"
# One pie per island, placed on a 2x2 grid. Each pie shows the room-type mix
# for that island. The pies are added in the order listed, starting from a
# base figure that carries the shared label/value mappings.
Reduce(
  function(fig, cell) {
    add_pie(
      fig,
      data = islands_df[islands_df$island_group == cell$island, ],
      name = cell$island,
      title = paste0(cell$island, "\n "),
      domain = list(row = cell$row, column = cell$column)
    )
  },
  list(
    list(island = "Oahu", row = 0, column = 0),
    list(island = "Maui", row = 0, column = 1),
    list(island = "Kauai", row = 1, column = 0),
    list(island = "Hawaii", row = 1, column = 1)
  ),
  plot_ly(
    textposition = "auto",
    automargin = TRUE,
    labels = ~room_type,
    values = ~n
  )
) %>%
  layout(
    title = "AirBnB Room Type Listings per Island",
    showlegend = TRUE,
    font = font_set2,
    grid = list(rows = 2, columns = 2)
  )
# generate map
# Plots the most-reviewed listing of every room type / neighbourhood group
# pair (rows of df_topReviews) as circles coloured by room type. Each circle
# has a popup with the listing name, room type, nightly price and review count.

# Circle colour per room type; the same vector drives the legend so the two
# cannot drift apart.
room_type_colors <- c(
  "Entire home/apt" = "green",
  "Hotel room" = "blue",
  "Private room" = "orange",
  "Shared room" = "red"
)

map <- leaflet() %>%
  addProviderTiles(providers$Esri)

# One circle layer per room type, added in the same order as before.
for (room in names(room_type_colors)) {
  # which() drops NA comparisons, matching subset() semantics.
  top_rows <- df_topReviews[which(df_topReviews$room_type == room), ]
  if (nrow(top_rows) == 0L) {
    # Nothing to draw for this room type; skip it so a zero-length
    # coordinate vector is never paired with a length-one popup string.
    next
  }
  map <- addCircles(
    map,
    lng = top_rows$longitude,
    lat = top_rows$latitude,
    # Opacity is a fraction in [0, 1]; the previous value of 10 was out of
    # range. 1 means fully opaque.
    opacity = 1,
    color = room_type_colors[[room]],
    popup = paste(
      top_rows$name,
      top_rows$room_type,
      paste0("Price per Night: ", dollar(top_rows$price)),
      paste0("Total Reviews: ", top_rows$number_of_reviews),
      sep = "<br/>"
    )
  )
}

map <- map %>%
  addLegend(
    "bottomleft",
    colors = unname(room_type_colors),
    labels = names(room_type_colors),
    title = "Most Reviewed Air BnB Rooms per Room Type",
    opacity = .5
  )
map
I learned that the peak season to visit is the summer, while the least popular period (and potentially the best time to avoid crowds) is the fall. More specifically, July and August have the highest number of visits, while October and November have the lowest.
Learned that the two most popular islands are Oahu and Maui, and Kauai seemed to be the least popular.
The “room types” in the AirBnB listings were overwhelmingly of the “Entire home/apt” type.
This analysis also revealed a significant impact on visits in the year 2020, which aligns with the COVID-19 pandemic.
Finally, this analysis highlighted the highest reviewed AirBnB listings per island and per room_type, to aid in future planning.
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# R Markdown template example: print summary statistics for each column of
# the `cars` data set (rendered output kept below).
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.