Cleaning data

This part of the code includes steps taken to clean Airbnb listing data. Data was obtained from insideairbnb.com. This data includes airbnb detailed listings data in Los Angeles, California, United States, compiled as at 9 September, 2022.

install.packages("tidyverse") 
library("tidyverse")
airbnb_df <- read_csv("Listings Cleaned- LA.csv")
#add new dataframe which only includes 20 columns instead of 30

airbnb_df_v2 <-airbnb_df[c("id","name","host_id","host_response_time","host_response_rate","host_acceptance_rate","host_is_superhost","host_total_listings_count","property_type","room_type","bedrooms","price","number_of_reviews","review_scores_rating","review_scores_accuracy","review_scores_cleanliness","review_scores_checkin","review_scores_communication","review_scores_location","availability_365")]
head(airbnb_df_v2)
## # A tibble: 6 × 20
##       id name    host_id host_…¹ host_…² host_…³ host_…⁴ host_…⁵ prope…⁶ room_…⁷
##    <dbl> <chr>     <dbl> <chr>   <chr>   <chr>   <lgl>     <dbl> <chr>   <chr>  
## 1 183319 Panora…  867995 within… 100%    100%    FALSE         3 Entire… Entire…
## 2  51307 Spanis…  235568 N/A     N/A     N/A     FALSE         2 Entire… Entire…
## 3 184314 Boho C…  884031 N/A     N/A     N/A     FALSE         2 Entire… Entire…
## 4  51498 Guest …  236758 within… 100%    88%     FALSE         2 Entire… Entire…
## 5    109 Amazin…     521 N/A     N/A     N/A     FALSE         3 Entire… Entire…
## 6 185536 Lovely…  890778 within… 100%    80%     FALSE         2 Privat… Privat…
## # … with 10 more variables: bedrooms <dbl>, price <chr>,
## #   number_of_reviews <dbl>, review_scores_rating <dbl>,
## #   review_scores_accuracy <dbl>, review_scores_cleanliness <dbl>,
## #   review_scores_checkin <dbl>, review_scores_communication <dbl>,
## #   review_scores_location <dbl>, availability_365 <dbl>, and abbreviated
## #   variable names ¹​host_response_time, ²​host_response_rate,
## #   ³​host_acceptance_rate, ⁴​host_is_superhost, ⁵​host_total_listings_count, …
summary(airbnb_df_v2)
##        id                name              host_id          host_response_time
##  Min.   :1.090e+02   Length:45815       Min.   :      521   Length:45815      
##  1st Qu.:2.392e+07   Class :character   1st Qu.: 24132476   Class :character  
##  Median :4.521e+07   Mode  :character   Median : 99696114   Mode  :character  
##  Mean   :1.641e+17                      Mean   :149639418                     
##  3rd Qu.:5.493e+17                      3rd Qu.:250312632                     
##  Max.   :7.118e+17                      Max.   :478604871                     
##                                                                               
##  host_response_rate host_acceptance_rate host_is_superhost
##  Length:45815       Length:45815         Mode :logical    
##  Class :character   Class :character     FALSE:32203      
##  Mode  :character   Mode  :character     TRUE :13514      
##                                          NA's :98         
##                                                           
##                                                           
##                                                           
##  host_total_listings_count property_type       room_type        
##  Min.   :    1.0           Length:45815       Length:45815      
##  1st Qu.:    2.0           Class :character   Class :character  
##  Median :    4.0           Mode  :character   Mode  :character  
##  Mean   :  116.9                                                
##  3rd Qu.:   18.0                                                
##  Max.   :20000.0                                                
##  NA's   :13                                                     
##     bedrooms         price           number_of_reviews review_scores_rating
##  Min.   : 1.000   Length:45815       Min.   :   0.0    Min.   :0.000       
##  1st Qu.: 1.000   Class :character   1st Qu.:   1.0    1st Qu.:4.670       
##  Median : 1.000   Mode  :character   Median :   6.0    Median :4.880       
##  Mean   : 1.865                      Mean   :  35.2    Mean   :4.701       
##  3rd Qu.: 2.000                      3rd Qu.:  35.0    3rd Qu.:5.000       
##  Max.   :24.000                      Max.   :1702.0    Max.   :5.000       
##  NA's   :3760                                          NA's   :10579       
##  review_scores_accuracy review_scores_cleanliness review_scores_checkin
##  Min.   :0.000          Min.   :0.000             Min.   :0.000        
##  1st Qu.:4.740          1st Qu.:4.640             1st Qu.:4.860        
##  Median :4.910          Median :4.860             Median :4.970        
##  Mean   :4.767          Mean   :4.704             Mean   :4.841        
##  3rd Qu.:5.000          3rd Qu.:5.000             3rd Qu.:5.000        
##  Max.   :5.000          Max.   :5.000             Max.   :5.000        
##  NA's   :10848          NA's   :10847             NA's   :10854        
##  review_scores_communication review_scores_location availability_365
##  Min.   :0.000               Min.   :0.000          Min.   :  0.0   
##  1st Qu.:4.830               1st Qu.:4.760          1st Qu.: 61.0   
##  Median :4.970               Median :4.920          Median :180.0   
##  Mean   :4.823               Mean   :4.799          Mean   :192.8   
##  3rd Qu.:5.000               3rd Qu.:5.000          3rd Qu.:341.0   
##  Max.   :5.000               Max.   :5.000          Max.   :365.0   
##  NA's   :10848               NA's   :10857
glimpse(airbnb_df_v2)
## Rows: 45,815
## Columns: 20
## $ id                          <dbl> 183319, 51307, 184314, 51498, 109, 185536,…
## $ name                        <chr> "Panoramic Ocean View Venice Beach", "Span…
## $ host_id                     <dbl> 867995, 235568, 884031, 236758, 521, 89077…
## $ host_response_time          <chr> "within a day", "N/A", "N/A", "within an h…
## $ host_response_rate          <chr> "100%", "N/A", "N/A", "100%", "N/A", "100%…
## $ host_acceptance_rate        <chr> "100%", "N/A", "N/A", "88%", "N/A", "80%",…
## $ host_is_superhost           <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ host_total_listings_count   <dbl> 3, 2, 2, 2, 3, 2, 3, 1, 2, 2, 5, 1, 10, 1,…
## $ property_type               <chr> "Entire rental unit", "Entire bungalow", "…
## $ room_type                   <chr> "Entire home/apt", "Entire home/apt", "Ent…
## $ bedrooms                    <dbl> 2, 1, 1, NA, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ price                       <chr> "$152.00", "$75.00", "$125.00", "$189.00",…
## $ number_of_reviews           <dbl> 3, 138, 30, 378, 2, 13, 37, 188, 12, 24, 1…
## $ review_scores_rating        <dbl> 5.00, 4.64, 4.88, 4.73, 4.00, 4.42, 4.84, …
## $ review_scores_accuracy      <dbl> 4.67, 4.77, 4.93, 4.83, 5.00, 4.38, 4.81, …
## $ review_scores_cleanliness   <dbl> 4.67, 4.55, 4.63, 4.80, 5.00, 3.77, 4.92, …
## $ review_scores_checkin       <dbl> 5.00, 4.89, 4.78, 4.94, 3.00, 4.77, 5.00, …
## $ review_scores_communication <dbl> 5.00, 4.85, 4.93, 4.90, 4.00, 4.85, 4.97, …
## $ review_scores_location      <dbl> 5.00, 4.95, 4.89, 4.83, 5.00, 4.77, 5.00, …
## $ availability_365            <dbl> 0, 224, 0, 348, 139, 358, 202, 250, 357, 3…
#drop rows will null values for bedrooms, reviews, host response and acceptance rate

airbnb_df_v3 <- airbnb_df_v2 %>% drop_na(host_response_time) %>% drop_na(host_response_rate) %>% drop_na(host_acceptance_rate) %>% drop_na(review_scores_rating) %>% drop_na(review_scores_accuracy) %>% drop_na(review_scores_cleanliness) %>% drop_na(review_scores_checkin) %>% drop_na(review_scores_communication) %>% drop_na(review_scores_location) %>% drop_na(bedrooms) %>% drop_na(price) 
glimpse(airbnb_df_v3) 
## Rows: 31,910
## Columns: 20
## $ id                          <dbl> 183319, 51307, 184314, 109, 185536, 2708, …
## $ name                        <chr> "Panoramic Ocean View Venice Beach", "Span…
## $ host_id                     <dbl> 867995, 235568, 884031, 521, 890778, 3008,…
## $ host_response_time          <chr> "within a day", "N/A", "N/A", "N/A", "with…
## $ host_response_rate          <chr> "100%", "N/A", "N/A", "N/A", "100%", "100%…
## $ host_acceptance_rate        <chr> "100%", "N/A", "N/A", "N/A", "80%", "100%"…
## $ host_is_superhost           <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, T…
## $ host_total_listings_count   <dbl> 3, 2, 2, 3, 2, 3, 1, 2, 2, 5, 10, 1, 2, 10…
## $ property_type               <chr> "Entire rental unit", "Entire bungalow", "…
## $ room_type                   <chr> "Entire home/apt", "Entire home/apt", "Ent…
## $ bedrooms                    <dbl> 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ price                       <chr> "$152.00", "$75.00", "$125.00", "$115.00",…
## $ number_of_reviews           <dbl> 3, 138, 30, 2, 13, 37, 188, 12, 24, 196, 2…
## $ review_scores_rating        <dbl> 5.00, 4.64, 4.88, 4.00, 4.42, 4.84, 4.92, …
## $ review_scores_accuracy      <dbl> 4.67, 4.77, 4.93, 5.00, 4.38, 4.81, 4.92, …
## $ review_scores_cleanliness   <dbl> 4.67, 4.55, 4.63, 5.00, 3.77, 4.92, 4.87, …
## $ review_scores_checkin       <dbl> 5.00, 4.89, 4.78, 3.00, 4.77, 5.00, 4.98, …
## $ review_scores_communication <dbl> 5.00, 4.85, 4.93, 4.00, 4.85, 4.97, 4.99, …
## $ review_scores_location      <dbl> 5.00, 4.95, 4.89, 5.00, 4.77, 5.00, 4.96, …
## $ availability_365            <dbl> 0, 224, 0, 139, 358, 202, 250, 357, 323, 3…
install.packages("skimr")
install.packages("janitor")
install.packages("dplyr")
library(skimr)
library(janitor)
library(dplyr)
#Check unique values

unique(airbnb_df_v3$room_type)
## [1] "Entire home/apt" "Private room"    "Hotel room"      "Shared room"
#show top 5 duplicated rows and the frequency 

row_occur <- data.frame(table(airbnb_df_v3$id))
duplicate_rows <- row_occur[row_occur$Freq > 1,]
head(duplicate_rows,5)
##              Var1 Freq
## 25706 5.41261e+17    2
## 25714 5.42518e+17    2
## 25755 5.44831e+17    2
## 25808 5.47362e+17    2
## 25866 5.49182e+17    2
#number of duplicated rows 

dim(airbnb_df_v3[duplicated(airbnb_df_v3$id),])[1]
## [1] 225
#drop all rows with duplicated listing id 

airbnb_df_v4 <- airbnb_df_v3 %>% distinct(id, .keep_all = TRUE)
#Drop rows with host response rate of "N/A" 

airbnb_df_v5 <- subset(airbnb_df_v4,host_response_rate!="N/A")

#Create new column with % remove in host response rate

airbnb_df_v5$new_host_response_rate <- gsub("%$","",airbnb_df_v5$host_response_rate)

#convert new column from character to numeric

airbnb_df_v5$new_host_response_rate <- as.integer(airbnb_df_v5$new_host_response_rate)

library(dplyr)
glimpse(airbnb_df_v5)
## Rows: 26,727
## Columns: 21
## $ id                          <dbl> 183319, 185536, 2708, 51546, 185557, 2732,…
## $ name                        <chr> "Panoramic Ocean View Venice Beach", "Love…
## $ host_id                     <dbl> 867995, 890778, 3008, 237114, 890778, 3041…
## $ host_response_time          <chr> "within a day", "within a few hours", "wit…
## $ host_response_rate          <chr> "100%", "100%", "100%", "100%", "100%", "1…
## $ host_acceptance_rate        <chr> "100%", "80%", "100%", "100%", "80%", "37%…
## $ host_is_superhost           <lgl> FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, TR…
## $ host_total_listings_count   <dbl> 3, 2, 3, 1, 2, 2, 5, 10, 1, 2, 10, 14, 14,…
## $ property_type               <chr> "Entire rental unit", "Private room in hom…
## $ room_type                   <chr> "Entire home/apt", "Private room", "Privat…
## $ bedrooms                    <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ price                       <chr> "$152.00", "$85.00", "$93.00", "$100.00", …
## $ number_of_reviews           <dbl> 3, 13, 37, 188, 12, 24, 196, 23, 130, 85, …
## $ review_scores_rating        <dbl> 5.00, 4.42, 4.84, 4.92, 4.92, 4.41, 4.82, …
## $ review_scores_accuracy      <dbl> 4.67, 4.38, 4.81, 4.92, 5.00, 4.26, 4.71, …
## $ review_scores_cleanliness   <dbl> 4.67, 3.77, 4.92, 4.87, 4.75, 4.58, 4.89, …
## $ review_scores_checkin       <dbl> 5.00, 4.77, 5.00, 4.98, 5.00, 4.39, 4.93, …
## $ review_scores_communication <dbl> 5.00, 4.85, 4.97, 4.99, 5.00, 4.48, 4.89, …
## $ review_scores_location      <dbl> 5.00, 4.77, 5.00, 4.96, 5.00, 4.91, 4.88, …
## $ availability_365            <dbl> 0, 358, 202, 250, 357, 323, 365, 215, 259,…
## $ new_host_response_rate      <int> 100, 100, 100, 100, 100, 100, 95, 88, 100,…
#Create new column with $ , . removed in price

airbnb_df_v5$new_price <- gsub("[$,]","",airbnb_df_v5$price)

#convert new column from character to double

airbnb_df_v5$new_price <- as.double(airbnb_df_v5$new_price)
#Arrange data from ascending id

airbnb_df_v6 <- arrange(airbnb_df_v5,id)
head(airbnb_df_v6,5)
## # A tibble: 5 × 22
##      id name     host_id host_…¹ host_…² host_…³ host_…⁴ host_…⁵ prope…⁶ room_…⁷
##   <dbl> <chr>      <dbl> <chr>   <chr>   <chr>   <lgl>     <dbl> <chr>   <chr>  
## 1  2708 Runyon …    3008 within… 100%    100%    TRUE          3 Privat… Privat…
## 2  2732 Zen Lif…    3041 within… 100%    37%     FALSE         2 Privat… Privat…
## 3  3021 Furnish…    3415 within… 88%     78%     FALSE        10 Entire… Entire…
## 4  5728 Tiny Ho…    9171 within… 100%    76%     TRUE         14 Privat… Privat…
## 5  5729 Zen Roo…    9171 within… 100%    76%     TRUE         14 Privat… Privat…
## # … with 12 more variables: bedrooms <dbl>, price <chr>,
## #   number_of_reviews <dbl>, review_scores_rating <dbl>,
## #   review_scores_accuracy <dbl>, review_scores_cleanliness <dbl>,
## #   review_scores_checkin <dbl>, review_scores_communication <dbl>,
## #   review_scores_location <dbl>, availability_365 <dbl>,
## #   new_host_response_rate <int>, new_price <dbl>, and abbreviated variable
## #   names ¹​host_response_time, ²​host_response_rate, ³​host_acceptance_rate, …
#rename number of reviews to bookings 

colnames(airbnb_df_v6)[13] = "bookings"
colnames(airbnb_df_v6)
##  [1] "id"                          "name"                       
##  [3] "host_id"                     "host_response_time"         
##  [5] "host_response_rate"          "host_acceptance_rate"       
##  [7] "host_is_superhost"           "host_total_listings_count"  
##  [9] "property_type"               "room_type"                  
## [11] "bedrooms"                    "price"                      
## [13] "bookings"                    "review_scores_rating"       
## [15] "review_scores_accuracy"      "review_scores_cleanliness"  
## [17] "review_scores_checkin"       "review_scores_communication"
## [19] "review_scores_location"      "availability_365"           
## [21] "new_host_response_rate"      "new_price"

Data visualisation in R

install.packages("ggplot2")
library(ggplot2)
install.packages("ggcorrplot")
library(ggcorrplot)
install.packages("corrplot")
library(corrplot)

Correlation Matrix

airbnb_df_v8 <- airbnb_df_v6[c("new_price","review_scores_rating","review_scores_cleanliness","availability_365","new_host_response_rate")]

names(airbnb_df_v8) <- c("Price","Overall_Rating","Cleanliness","Availability_365","Host Reponse Rate")

correlation_df <- cor(airbnb_df_v8)

corrplot(correlation_df, method="color", addCoef.col="black")

#cor(airbnb_df_v6[sapply(airbnb_df_v6,is.numeric)])

The image above illustrates the correlation matrix for multiple variables in the Airbnb data. The matrix shows that cleanliness of the airbnb has a positive relationship with the overall rating of the stay. Besides that, it can be seen that other factors have less correlation between each other.

Scatter Plot

airbnb_df_v7 <- airbnb_df_v6 %>% filter(bookings < 1000)


chart1 <- ggplot(data = airbnb_df_v7) + geom_point(mapping = aes(x = new_host_response_rate, y = bookings)) + labs(title="Host Response Rate vs. Bookings", y="Bookings", x="Host Response Rate (%)")

print(chart1)

The scatter plot above shows the relationship between host response rate and the number of bookings per host. While it may not necessary be true that host with low response rate has lesser bookings, but there is a gentle trend which shows that host with higher response rate tend to have a slight higher number of bookings. With this understanding, Airbnb could work on methods or ways to encourage host to respond more frequently. One suggestion include adding notifications 20 minutes after receiving booking to remind host to respond.

Bar Chart

chart2 <- ggplot(airbnb_df_v6, aes(x = factor(room_type), fill = factor(room_type))) +
  geom_bar() +
  geom_text(aes(label = ..count..), stat = "count", vjust = 0, colour = "Black") + labs(title="Number of listing each Room Type", y="Number of Listing", x="Room Type")

print(chart2)

The bar chart above shows the number of listings as at 9 September 2022 base on each room type. Unsurprisingly, the most number of listing under Airbnb is the rental of home or apartment. From the chart, it is evident that there is a small percentage of listing that comes from hotel room and shared room. Airbnb could look into why hotel rooms are listed under Airbnb and investigate for possible target market of liasing with hotels to list their rooms.

Bar Chart

airbnb_df_v9 <- airbnb_df_v6 %>% group_by(room_type) %>% summarise( avg_price = mean(new_price))

chart3 <- ggplot(aes(x = room_type, y = avg_price, fill = room_type), data = airbnb_df_v9) + geom_bar(stat = "identity") + labs(title="Average price for each Room Type", y="Average Price ($)", x="Room Type") + geom_text(aes(label = sprintf("%0.2f", round(avg_price, digits = 2))), vjust = 0) + scale_fill_manual(values=c("#c63287",
                             "#d44c9c",
                             "#e366b1",
                             "#f17fc5"))

print(chart3)

The bar chart above shows the average price listed on Airbnb base on room type. It can be seen that the rental of entire home or apartment charges the highest price out of the other three. Shared rooms on the other hand are the cheapest at $41. At such a low cost, Airbnb could market their rooms as affordable especially for solo travelers. Solo travelers looking for a place to stay can consider shared rooms listed on Airbnb for cheap stay as well as an opportunity to make friends.

Line Chart

airbnb_df_v10 <- airbnb_df_v6 %>% filter(host_is_superhost != "NA")

chart4 <-ggplot(airbnb_df_v10, aes(host_is_superhost,availability_365)) + geom_boxplot(fill = "#94C973", color = "black") + labs(title="Availability of rooms vs. Host Status", y="Availability out of 365 days", x="Is host a Superhost?") 

print(chart4)

The image above shows a box plot of host with and without superhost status against the total number of days the listing is available for during the year. Out of the 365 days, you can see that median difference between host status is about 70 days, where superhost are likely to have more days booked. From the box plot, both superhost and non-superhost has similar lower quartile levels. Airbnb could possibly look into ways to encourage host to be a superhost to decrease availability of bookings.

SQL

install.packages("RSQLite")
library(RSQLite)
install.packages("sqldf")
library(sqldf)
## Warning in fun(libname, pkgname): couldn't connect to display ":0"

Listings with lowest airbnb ratings

sqldf('SELECT id, host_response_rate AS Respone_Rate, host_total_listings_count AS Total_Listing, room_type AS Room_Type, review_scores_rating AS Rating, new_price AS Price, availability_365 AS Available FROM airbnb_df_v6 WHERE availability_365 < 365 ORDER BY review_scores_rating ASC LIMIT 10')
##          id Respone_Rate Total_Listing       Room_Type Rating Price Available
## 1   1433775          57%             2    Private room      0    90       345
## 2  19784306         100%            26 Entire home/apt      1  1000       364
## 3  20959427          50%             2 Entire home/apt      1   358       180
## 4  35727088         100%          4342 Entire home/apt      1   196       311
## 5  35764429          96%            55    Private room      1   236         3
## 6  35933694          96%            55    Private room      1   221        41
## 7  36590008         100%             6 Entire home/apt      1   564       266
## 8  37104345         100%           179 Entire home/apt      1   496       221
## 9  38948064          91%             2 Entire home/apt      1   260       364
## 10 39029905          90%            16    Private room      1   869       126

Property type with highest average price

sqldf('SELECT property_type AS Property_Type, AVG(new_price) AS Total_Price FROM airbnb_df_v6 GROUP BY property_type ORDER BY Total_Price DESC LIMIT 5')
##   Property_Type Total_Price
## 1  Entire villa    1043.778
## 2        Castle    1020.200
## 3         Tower     900.000
## 4 Entire chalet     850.000
## 5    Lighthouse     627.000

Listing with most days vacant in a year

sqldf('SELECT availability_365 AS Vacant_Days, review_scores_accuracy AS Accuracy, review_scores_cleanliness AS Cleanliness, review_scores_checkin AS Check_In, review_scores_communication AS Communication, review_scores_location AS Location FROM airbnb_df_v6 ORDER BY availability_365 DESC LIMIT 10')
##    Vacant_Days Accuracy Cleanliness Check_In Communication Location
## 1          365     4.92        4.46     5.00          5.00     4.95
## 2          365     4.85        4.89     4.92          4.89     4.84
## 3          365     3.82        3.73     4.82          4.64     4.36
## 4          365     4.93        4.98     4.95          4.90     4.84
## 5          365     4.84        4.93     4.95          4.90     4.89
## 6          365     4.68        4.70     4.91          4.95     4.79
## 7          365     4.50        4.38     4.75          4.50     4.75
## 8          365     4.82        4.45     4.64          4.73     4.64
## 9          365     4.40        4.30     4.50          4.70     4.80
## 10         365     4.84        4.89     4.95          4.87     4.93