This part of the code includes steps taken to clean Airbnb listing data. Data was obtained from insideairbnb.com. This data includes airbnb detailed listings data in Los Angeles, California, United States, compiled as at 9 September, 2022.
install.packages("tidyverse")
library("tidyverse")
airbnb_df <- read_csv("Listings Cleaned- LA.csv")
#add new dataframe which only includes 20 columns instead of 30
airbnb_df_v2 <-airbnb_df[c("id","name","host_id","host_response_time","host_response_rate","host_acceptance_rate","host_is_superhost","host_total_listings_count","property_type","room_type","bedrooms","price","number_of_reviews","review_scores_rating","review_scores_accuracy","review_scores_cleanliness","review_scores_checkin","review_scores_communication","review_scores_location","availability_365")]
head(airbnb_df_v2)
## # A tibble: 6 × 20
## id name host_id host_…¹ host_…² host_…³ host_…⁴ host_…⁵ prope…⁶ room_…⁷
## <dbl> <chr> <dbl> <chr> <chr> <chr> <lgl> <dbl> <chr> <chr>
## 1 183319 Panora… 867995 within… 100% 100% FALSE 3 Entire… Entire…
## 2 51307 Spanis… 235568 N/A N/A N/A FALSE 2 Entire… Entire…
## 3 184314 Boho C… 884031 N/A N/A N/A FALSE 2 Entire… Entire…
## 4 51498 Guest … 236758 within… 100% 88% FALSE 2 Entire… Entire…
## 5 109 Amazin… 521 N/A N/A N/A FALSE 3 Entire… Entire…
## 6 185536 Lovely… 890778 within… 100% 80% FALSE 2 Privat… Privat…
## # … with 10 more variables: bedrooms <dbl>, price <chr>,
## # number_of_reviews <dbl>, review_scores_rating <dbl>,
## # review_scores_accuracy <dbl>, review_scores_cleanliness <dbl>,
## # review_scores_checkin <dbl>, review_scores_communication <dbl>,
## # review_scores_location <dbl>, availability_365 <dbl>, and abbreviated
## # variable names ¹host_response_time, ²host_response_rate,
## # ³host_acceptance_rate, ⁴host_is_superhost, ⁵host_total_listings_count, …
summary(airbnb_df_v2)
## id name host_id host_response_time
## Min. :1.090e+02 Length:45815 Min. : 521 Length:45815
## 1st Qu.:2.392e+07 Class :character 1st Qu.: 24132476 Class :character
## Median :4.521e+07 Mode :character Median : 99696114 Mode :character
## Mean :1.641e+17 Mean :149639418
## 3rd Qu.:5.493e+17 3rd Qu.:250312632
## Max. :7.118e+17 Max. :478604871
##
## host_response_rate host_acceptance_rate host_is_superhost
## Length:45815 Length:45815 Mode :logical
## Class :character Class :character FALSE:32203
## Mode :character Mode :character TRUE :13514
## NA's :98
##
##
##
## host_total_listings_count property_type room_type
## Min. : 1.0 Length:45815 Length:45815
## 1st Qu.: 2.0 Class :character Class :character
## Median : 4.0 Mode :character Mode :character
## Mean : 116.9
## 3rd Qu.: 18.0
## Max. :20000.0
## NA's :13
## bedrooms price number_of_reviews review_scores_rating
## Min. : 1.000 Length:45815 Min. : 0.0 Min. :0.000
## 1st Qu.: 1.000 Class :character 1st Qu.: 1.0 1st Qu.:4.670
## Median : 1.000 Mode :character Median : 6.0 Median :4.880
## Mean : 1.865 Mean : 35.2 Mean :4.701
## 3rd Qu.: 2.000 3rd Qu.: 35.0 3rd Qu.:5.000
## Max. :24.000 Max. :1702.0 Max. :5.000
## NA's :3760 NA's :10579
## review_scores_accuracy review_scores_cleanliness review_scores_checkin
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:4.740 1st Qu.:4.640 1st Qu.:4.860
## Median :4.910 Median :4.860 Median :4.970
## Mean :4.767 Mean :4.704 Mean :4.841
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000
## NA's :10848 NA's :10847 NA's :10854
## review_scores_communication review_scores_location availability_365
## Min. :0.000 Min. :0.000 Min. : 0.0
## 1st Qu.:4.830 1st Qu.:4.760 1st Qu.: 61.0
## Median :4.970 Median :4.920 Median :180.0
## Mean :4.823 Mean :4.799 Mean :192.8
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:341.0
## Max. :5.000 Max. :5.000 Max. :365.0
## NA's :10848 NA's :10857
glimpse(airbnb_df_v2)
## Rows: 45,815
## Columns: 20
## $ id <dbl> 183319, 51307, 184314, 51498, 109, 185536,…
## $ name <chr> "Panoramic Ocean View Venice Beach", "Span…
## $ host_id <dbl> 867995, 235568, 884031, 236758, 521, 89077…
## $ host_response_time <chr> "within a day", "N/A", "N/A", "within an h…
## $ host_response_rate <chr> "100%", "N/A", "N/A", "100%", "N/A", "100%…
## $ host_acceptance_rate <chr> "100%", "N/A", "N/A", "88%", "N/A", "80%",…
## $ host_is_superhost <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ host_total_listings_count <dbl> 3, 2, 2, 2, 3, 2, 3, 1, 2, 2, 5, 1, 10, 1,…
## $ property_type <chr> "Entire rental unit", "Entire bungalow", "…
## $ room_type <chr> "Entire home/apt", "Entire home/apt", "Ent…
## $ bedrooms <dbl> 2, 1, 1, NA, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ price <chr> "$152.00", "$75.00", "$125.00", "$189.00",…
## $ number_of_reviews <dbl> 3, 138, 30, 378, 2, 13, 37, 188, 12, 24, 1…
## $ review_scores_rating <dbl> 5.00, 4.64, 4.88, 4.73, 4.00, 4.42, 4.84, …
## $ review_scores_accuracy <dbl> 4.67, 4.77, 4.93, 4.83, 5.00, 4.38, 4.81, …
## $ review_scores_cleanliness <dbl> 4.67, 4.55, 4.63, 4.80, 5.00, 3.77, 4.92, …
## $ review_scores_checkin <dbl> 5.00, 4.89, 4.78, 4.94, 3.00, 4.77, 5.00, …
## $ review_scores_communication <dbl> 5.00, 4.85, 4.93, 4.90, 4.00, 4.85, 4.97, …
## $ review_scores_location <dbl> 5.00, 4.95, 4.89, 4.83, 5.00, 4.77, 5.00, …
## $ availability_365 <dbl> 0, 224, 0, 348, 139, 358, 202, 250, 357, 3…
#drop rows will null values for bedrooms, reviews, host response and acceptance rate
airbnb_df_v3 <- airbnb_df_v2 %>% drop_na(host_response_time) %>% drop_na(host_response_rate) %>% drop_na(host_acceptance_rate) %>% drop_na(review_scores_rating) %>% drop_na(review_scores_accuracy) %>% drop_na(review_scores_cleanliness) %>% drop_na(review_scores_checkin) %>% drop_na(review_scores_communication) %>% drop_na(review_scores_location) %>% drop_na(bedrooms) %>% drop_na(price)
glimpse(airbnb_df_v3)
## Rows: 31,910
## Columns: 20
## $ id <dbl> 183319, 51307, 184314, 109, 185536, 2708, …
## $ name <chr> "Panoramic Ocean View Venice Beach", "Span…
## $ host_id <dbl> 867995, 235568, 884031, 521, 890778, 3008,…
## $ host_response_time <chr> "within a day", "N/A", "N/A", "N/A", "with…
## $ host_response_rate <chr> "100%", "N/A", "N/A", "N/A", "100%", "100%…
## $ host_acceptance_rate <chr> "100%", "N/A", "N/A", "N/A", "80%", "100%"…
## $ host_is_superhost <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, T…
## $ host_total_listings_count <dbl> 3, 2, 2, 3, 2, 3, 1, 2, 2, 5, 10, 1, 2, 10…
## $ property_type <chr> "Entire rental unit", "Entire bungalow", "…
## $ room_type <chr> "Entire home/apt", "Entire home/apt", "Ent…
## $ bedrooms <dbl> 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ price <chr> "$152.00", "$75.00", "$125.00", "$115.00",…
## $ number_of_reviews <dbl> 3, 138, 30, 2, 13, 37, 188, 12, 24, 196, 2…
## $ review_scores_rating <dbl> 5.00, 4.64, 4.88, 4.00, 4.42, 4.84, 4.92, …
## $ review_scores_accuracy <dbl> 4.67, 4.77, 4.93, 5.00, 4.38, 4.81, 4.92, …
## $ review_scores_cleanliness <dbl> 4.67, 4.55, 4.63, 5.00, 3.77, 4.92, 4.87, …
## $ review_scores_checkin <dbl> 5.00, 4.89, 4.78, 3.00, 4.77, 5.00, 4.98, …
## $ review_scores_communication <dbl> 5.00, 4.85, 4.93, 4.00, 4.85, 4.97, 4.99, …
## $ review_scores_location <dbl> 5.00, 4.95, 4.89, 5.00, 4.77, 5.00, 4.96, …
## $ availability_365 <dbl> 0, 224, 0, 139, 358, 202, 250, 357, 323, 3…
install.packages("skimr")
install.packages("janitor")
install.packages("dplyr")
library(skimr)
library(janitor)
library(dplyr)
#Check unique values
unique(airbnb_df_v3$room_type)
## [1] "Entire home/apt" "Private room" "Hotel room" "Shared room"
#show top 5 duplicated rows and the frequency
row_occur <- data.frame(table(airbnb_df_v3$id))
duplicate_rows <- row_occur[row_occur$Freq > 1,]
head(duplicate_rows,5)
## Var1 Freq
## 25706 5.41261e+17 2
## 25714 5.42518e+17 2
## 25755 5.44831e+17 2
## 25808 5.47362e+17 2
## 25866 5.49182e+17 2
#number of duplicated rows
dim(airbnb_df_v3[duplicated(airbnb_df_v3$id),])[1]
## [1] 225
#drop all rows with duplicated listing id
airbnb_df_v4 <- airbnb_df_v3 %>% distinct(id, .keep_all = TRUE)
#Drop rows with host response rate of "N/A"
airbnb_df_v5 <- subset(airbnb_df_v4,host_response_rate!="N/A")
#Create new column with % remove in host response rate
airbnb_df_v5$new_host_response_rate <- gsub("%$","",airbnb_df_v5$host_response_rate)
#convert new column from character to numeric
airbnb_df_v5$new_host_response_rate <- as.integer(airbnb_df_v5$new_host_response_rate)
library(dplyr)
glimpse(airbnb_df_v5)
## Rows: 26,727
## Columns: 21
## $ id <dbl> 183319, 185536, 2708, 51546, 185557, 2732,…
## $ name <chr> "Panoramic Ocean View Venice Beach", "Love…
## $ host_id <dbl> 867995, 890778, 3008, 237114, 890778, 3041…
## $ host_response_time <chr> "within a day", "within a few hours", "wit…
## $ host_response_rate <chr> "100%", "100%", "100%", "100%", "100%", "1…
## $ host_acceptance_rate <chr> "100%", "80%", "100%", "100%", "80%", "37%…
## $ host_is_superhost <lgl> FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, TR…
## $ host_total_listings_count <dbl> 3, 2, 3, 1, 2, 2, 5, 10, 1, 2, 10, 14, 14,…
## $ property_type <chr> "Entire rental unit", "Private room in hom…
## $ room_type <chr> "Entire home/apt", "Private room", "Privat…
## $ bedrooms <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ price <chr> "$152.00", "$85.00", "$93.00", "$100.00", …
## $ number_of_reviews <dbl> 3, 13, 37, 188, 12, 24, 196, 23, 130, 85, …
## $ review_scores_rating <dbl> 5.00, 4.42, 4.84, 4.92, 4.92, 4.41, 4.82, …
## $ review_scores_accuracy <dbl> 4.67, 4.38, 4.81, 4.92, 5.00, 4.26, 4.71, …
## $ review_scores_cleanliness <dbl> 4.67, 3.77, 4.92, 4.87, 4.75, 4.58, 4.89, …
## $ review_scores_checkin <dbl> 5.00, 4.77, 5.00, 4.98, 5.00, 4.39, 4.93, …
## $ review_scores_communication <dbl> 5.00, 4.85, 4.97, 4.99, 5.00, 4.48, 4.89, …
## $ review_scores_location <dbl> 5.00, 4.77, 5.00, 4.96, 5.00, 4.91, 4.88, …
## $ availability_365 <dbl> 0, 358, 202, 250, 357, 323, 365, 215, 259,…
## $ new_host_response_rate <int> 100, 100, 100, 100, 100, 100, 95, 88, 100,…
#Create new column with $ , . removed in price
airbnb_df_v5$new_price <- gsub("[$,]","",airbnb_df_v5$price)
#convert new column from character to double
airbnb_df_v5$new_price <- as.double(airbnb_df_v5$new_price)
#Arrange data from ascending id
airbnb_df_v6 <- arrange(airbnb_df_v5,id)
head(airbnb_df_v6,5)
## # A tibble: 5 × 22
## id name host_id host_…¹ host_…² host_…³ host_…⁴ host_…⁵ prope…⁶ room_…⁷
## <dbl> <chr> <dbl> <chr> <chr> <chr> <lgl> <dbl> <chr> <chr>
## 1 2708 Runyon … 3008 within… 100% 100% TRUE 3 Privat… Privat…
## 2 2732 Zen Lif… 3041 within… 100% 37% FALSE 2 Privat… Privat…
## 3 3021 Furnish… 3415 within… 88% 78% FALSE 10 Entire… Entire…
## 4 5728 Tiny Ho… 9171 within… 100% 76% TRUE 14 Privat… Privat…
## 5 5729 Zen Roo… 9171 within… 100% 76% TRUE 14 Privat… Privat…
## # … with 12 more variables: bedrooms <dbl>, price <chr>,
## # number_of_reviews <dbl>, review_scores_rating <dbl>,
## # review_scores_accuracy <dbl>, review_scores_cleanliness <dbl>,
## # review_scores_checkin <dbl>, review_scores_communication <dbl>,
## # review_scores_location <dbl>, availability_365 <dbl>,
## # new_host_response_rate <int>, new_price <dbl>, and abbreviated variable
## # names ¹host_response_time, ²host_response_rate, ³host_acceptance_rate, …
#rename number of reviews to bookings
colnames(airbnb_df_v6)[13] = "bookings"
colnames(airbnb_df_v6)
## [1] "id" "name"
## [3] "host_id" "host_response_time"
## [5] "host_response_rate" "host_acceptance_rate"
## [7] "host_is_superhost" "host_total_listings_count"
## [9] "property_type" "room_type"
## [11] "bedrooms" "price"
## [13] "bookings" "review_scores_rating"
## [15] "review_scores_accuracy" "review_scores_cleanliness"
## [17] "review_scores_checkin" "review_scores_communication"
## [19] "review_scores_location" "availability_365"
## [21] "new_host_response_rate" "new_price"
install.packages("ggplot2")
library(ggplot2)
install.packages("ggcorrplot")
library(ggcorrplot)
install.packages("corrplot")
library(corrplot)
airbnb_df_v8 <- airbnb_df_v6[c("new_price","review_scores_rating","review_scores_cleanliness","availability_365","new_host_response_rate")]
names(airbnb_df_v8) <- c("Price","Overall_Rating","Cleanliness","Availability_365","Host Reponse Rate")
correlation_df <- cor(airbnb_df_v8)
corrplot(correlation_df, method="color", addCoef.col="black")
#cor(airbnb_df_v6[sapply(airbnb_df_v6,is.numeric)])
The image above illustrates the correlation matrix for multiple variables in the Airbnb data. The matrix shows that cleanliness of the airbnb has a positive relationship with the overall rating of the stay. Besides that, it can be seen that other factors have less correlation between each other.
airbnb_df_v7 <- airbnb_df_v6 %>% filter(bookings < 1000)
chart1 <- ggplot(data = airbnb_df_v7) + geom_point(mapping = aes(x = new_host_response_rate, y = bookings)) + labs(title="Host Response Rate vs. Bookings", y="Bookings", x="Host Response Rate (%)")
print(chart1)
The scatter plot above shows the relationship between host response rate and the number of bookings per host. While it may not necessary be true that host with low response rate has lesser bookings, but there is a gentle trend which shows that host with higher response rate tend to have a slight higher number of bookings. With this understanding, Airbnb could work on methods or ways to encourage host to respond more frequently. One suggestion include adding notifications 20 minutes after receiving booking to remind host to respond.
chart2 <- ggplot(airbnb_df_v6, aes(x = factor(room_type), fill = factor(room_type))) +
geom_bar() +
geom_text(aes(label = ..count..), stat = "count", vjust = 0, colour = "Black") + labs(title="Number of listing each Room Type", y="Number of Listing", x="Room Type")
print(chart2)
The bar chart above shows the number of listings as at 9 September 2022 base on each room type. Unsurprisingly, the most number of listing under Airbnb is the rental of home or apartment. From the chart, it is evident that there is a small percentage of listing that comes from hotel room and shared room. Airbnb could look into why hotel rooms are listed under Airbnb and investigate for possible target market of liasing with hotels to list their rooms.
airbnb_df_v9 <- airbnb_df_v6 %>% group_by(room_type) %>% summarise( avg_price = mean(new_price))
chart3 <- ggplot(aes(x = room_type, y = avg_price, fill = room_type), data = airbnb_df_v9) + geom_bar(stat = "identity") + labs(title="Average price for each Room Type", y="Average Price ($)", x="Room Type") + geom_text(aes(label = sprintf("%0.2f", round(avg_price, digits = 2))), vjust = 0) + scale_fill_manual(values=c("#c63287",
"#d44c9c",
"#e366b1",
"#f17fc5"))
print(chart3)
The bar chart above shows the average price listed on Airbnb base on room type. It can be seen that the rental of entire home or apartment charges the highest price out of the other three. Shared rooms on the other hand are the cheapest at $41. At such a low cost, Airbnb could market their rooms as affordable especially for solo travelers. Solo travelers looking for a place to stay can consider shared rooms listed on Airbnb for cheap stay as well as an opportunity to make friends.
airbnb_df_v10 <- airbnb_df_v6 %>% filter(host_is_superhost != "NA")
chart4 <-ggplot(airbnb_df_v10, aes(host_is_superhost,availability_365)) + geom_boxplot(fill = "#94C973", color = "black") + labs(title="Availability of rooms vs. Host Status", y="Availability out of 365 days", x="Is host a Superhost?")
print(chart4)
The image above shows a box plot of host with and without superhost status against the total number of days the listing is available for during the year. Out of the 365 days, you can see that median difference between host status is about 70 days, where superhost are likely to have more days booked. From the box plot, both superhost and non-superhost has similar lower quartile levels. Airbnb could possibly look into ways to encourage host to be a superhost to decrease availability of bookings.
install.packages("RSQLite")
library(RSQLite)
install.packages("sqldf")
library(sqldf)
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
sqldf('SELECT id, host_response_rate AS Respone_Rate, host_total_listings_count AS Total_Listing, room_type AS Room_Type, review_scores_rating AS Rating, new_price AS Price, availability_365 AS Available FROM airbnb_df_v6 WHERE availability_365 < 365 ORDER BY review_scores_rating ASC LIMIT 10')
## id Respone_Rate Total_Listing Room_Type Rating Price Available
## 1 1433775 57% 2 Private room 0 90 345
## 2 19784306 100% 26 Entire home/apt 1 1000 364
## 3 20959427 50% 2 Entire home/apt 1 358 180
## 4 35727088 100% 4342 Entire home/apt 1 196 311
## 5 35764429 96% 55 Private room 1 236 3
## 6 35933694 96% 55 Private room 1 221 41
## 7 36590008 100% 6 Entire home/apt 1 564 266
## 8 37104345 100% 179 Entire home/apt 1 496 221
## 9 38948064 91% 2 Entire home/apt 1 260 364
## 10 39029905 90% 16 Private room 1 869 126
sqldf('SELECT property_type AS Property_Type, AVG(new_price) AS Total_Price FROM airbnb_df_v6 GROUP BY property_type ORDER BY Total_Price DESC LIMIT 5')
## Property_Type Total_Price
## 1 Entire villa 1043.778
## 2 Castle 1020.200
## 3 Tower 900.000
## 4 Entire chalet 850.000
## 5 Lighthouse 627.000
sqldf('SELECT availability_365 AS Vacant_Days, review_scores_accuracy AS Accuracy, review_scores_cleanliness AS Cleanliness, review_scores_checkin AS Check_In, review_scores_communication AS Communication, review_scores_location AS Location FROM airbnb_df_v6 ORDER BY availability_365 DESC LIMIT 10')
## Vacant_Days Accuracy Cleanliness Check_In Communication Location
## 1 365 4.92 4.46 5.00 5.00 4.95
## 2 365 4.85 4.89 4.92 4.89 4.84
## 3 365 3.82 3.73 4.82 4.64 4.36
## 4 365 4.93 4.98 4.95 4.90 4.84
## 5 365 4.84 4.93 4.95 4.90 4.89
## 6 365 4.68 4.70 4.91 4.95 4.79
## 7 365 4.50 4.38 4.75 4.50 4.75
## 8 365 4.82 4.45 4.64 4.73 4.64
## 9 365 4.40 4.30 4.50 4.70 4.80
## 10 365 4.84 4.89 4.95 4.87 4.93