Introduction

This report presents an analysis of Airbnb listings in Seattle, encompassing a range of data including listing and host names, neighborhoods, pricing, number of reviews, comments, and availability from June 24, 2024, to June 24, 2025. The insights derived from this analysis can be used to assist current and prospective Airbnb hosts in making informed decisions and enhancing their offerings. Additionally, it provides valuable information for clients in selecting the most suitable Airbnb accommodations.

Key Points

Data Visualisation & Analysis

Import Data

# Load the three raw CSV exports (calendar, listings, reviews).
# NOTE(review): the paths are specific to the author's machine. The object name
# `calender` (sic) is kept because the rest of the script refers to it.
calender <- read.csv(file = "~/Case Studies/Seattle Air BnB/calendar.csv")
listings <- read.csv(file = "~/Case Studies/Seattle Air BnB/listings.csv")
reviews <- read.csv(file = "~/Case Studies/Seattle Air BnB/reviews.csv")

Import Libraries

library('dplyr')
library('magrittr')
library('ggplot2')
library('wordcloud')
library('tm')
library('RColorBrewer')
library('lubridate')
library('patchwork')

Data Cleaning

# check for different variations of spelling 
unique(listings$neighbourhood_group)
##  [1] "Other neighborhoods" "West Seattle"        "Ballard"            
##  [4] "Magnolia"            "Queen Anne"          "Downtown"           
##  [7] "Cascade"             "Capitol Hill"        "Beacon Hill"        
## [10] "Lake City"           "Rainier Valley"      "Central Area"       
## [13] "University District" "Delridge"            "Northgate"          
## [16] "Seward Park"         "Interbay"
unique(listings$neighbourhood)
##  [1] "Wallingford"               "Georgetown"               
##  [3] "Fairmount Park"            "Whittier Heights"         
##  [5] "Sunset Hill"               "Fremont"                  
##  [7] "Phinney Ridge"             "Crown Hill"               
##  [9] "Lawton Park"               "Alki"                     
## [11] "North Queen Anne"          "West Queen Anne"          
## [13] "First Hill"                "Eastlake"                 
## [15] "Broadway"                  "Stevens"                  
## [17] "North Admiral"             "Portage Bay"              
## [19] "International District"    "Green Lake"               
## [21] "North Beacon Hill"         "Greenwood"                
## [23] "Cedar Park"                "Columbia City"            
## [25] "Mount Baker"               "Mann"                     
## [27] "Genesee"                   "Ravenna"                  
## [29] "Belltown"                  "University District"      
## [31] "Harrison/Denny-Blaine"     "South Delridge"           
## [33] "Broadview"                 "Maple Leaf"               
## [35] "East Queen Anne"           "Atlantic"                 
## [37] "Highland Park"             "West Woodland"            
## [39] "Laurelhurst"               "Madison Park"             
## [41] "Fauntleroy"                "Madrona"                  
## [43] "Loyal Heights"             "Gatewood"                 
## [45] "Haller Lake"               "Dunlap"                   
## [47] "Leschi"                    "Adams"                    
## [49] "North Beach/Blue Ridge"    "North Delridge"           
## [51] "Bryant"                    "Seward Park"              
## [53] "Pioneer Square"            "Pike-Market"              
## [55] "High Point"                "Central Business District"
## [57] "Yesler Terrace"            "Bitter Lake"              
## [59] "Windermere"                "Lower Queen Anne"         
## [61] "Minor"                     "Rainier Beach"            
## [63] "Seaview"                   "Victory Heights"          
## [65] "Roosevelt"                 "Matthews Beach"           
## [67] "Southeast Magnolia"        "Olympic Hills"            
## [69] "Mid-Beacon Hill"           "Brighton"                 
## [71] "South Lake Union"          "Briarcliff"               
## [73] "Montlake"                  "North College Park"       
## [75] "View Ridge"                "Riverview"                
## [77] "Pinehurst"                 "Interbay"                 
## [79] "Wedgwood"                  "Rainier View"             
## [81] "Meadowbrook"               "South Beacon Hill"        
## [83] "Industrial District"       "South Park"               
## [85] "Westlake"                  "Arbor Heights"            
## [87] "Roxhill"                   "Holly Park"
unique(listings$room_type)
## [1] "Entire home/apt" "Private room"    "Shared room"     "Hotel room"
unique(calender$available)
## [1] "f" "t"
# see column names 
colnames(listings)
##  [1] "id"                             "name"                          
##  [3] "host_id"                        "host_name"                     
##  [5] "neighbourhood_group"            "neighbourhood"                 
##  [7] "latitude"                       "longitude"                     
##  [9] "room_type"                      "price"                         
## [11] "minimum_nights"                 "number_of_reviews"             
## [13] "last_review"                    "reviews_per_month"             
## [15] "calculated_host_listings_count" "availability_365"              
## [17] "number_of_reviews_ltm"          "license"
colnames(reviews)
## [1] "listing_id"    "id"            "date"          "reviewer_id"  
## [5] "reviewer_name" "comments"
colnames(calender)
## [1] "listing_id"     "date"           "available"      "price"         
## [5] "adjusted_price" "minimum_nights" "maximum_nights"
# Rename `id` to `listing_id` so that listings uses the same key column name
# as the reviews and calendar tables (both already have `listing_id`).
listings <- rename(listings, listing_id = id)
# check for duplicates 
sum(duplicated(listings))
## [1] 0
sum(duplicated(reviews))
## [1] 0
# Third dataset: the calendar table (listings was already checked above).
sum(duplicated(calender))
## [1] 0
# check for NAs in listings data set 
sum(is.na(listings))
## [1] 1272
colnames(listings)[colSums(is.na(listings)) >0]
## [1] "price"             "reviews_per_month"
# view the specific rows with NAs 
head(listings[is.na(listings$price), ], 5)
##    listing_id                                               name host_id
## 5        9596         the down home , spacious, central and fab!   14942
## 7       37234 Your Home Away From Home.  Private Parking onsite.  160789
## 20     340706                     Charming Wallingford Apartment 1015653
## 23     356248        Amazing Waterview Condo near UW + Eastlake!  306615
## 26     368403                 Large Luxury Lakeside Seattle Home 4186078
##    host_name neighbourhood_group neighbourhood latitude longitude
## 5      Joyce Other neighborhoods   Wallingford 47.65608 -122.3360
## 7    Darrell             Ballard   Sunset Hill 47.68897 -122.3942
## 20     David Other neighborhoods   Wallingford 47.65431 -122.3336
## 23      Tara        Capitol Hill   Portage Bay 47.65047 -122.3202
## 26     Trung Other neighborhoods    Green Lake 47.68528 -122.3312
##          room_type price minimum_nights number_of_reviews last_review
## 5  Entire home/apt    NA             30                96  2020-09-28
## 7  Entire home/apt    NA              3                 6  2023-08-26
## 20 Entire home/apt    NA              3               213  2024-06-19
## 23 Entire home/apt    NA             30                30  2024-06-06
## 26 Entire home/apt    NA              4                46  2023-04-02
##    reviews_per_month calculated_host_listings_count availability_365
## 5               0.61                              2               16
## 7               0.06                              2               92
## 20              1.42                              1                1
## 23              0.32                              2                1
## 26              0.35                              1              337
##    number_of_reviews_ltm             license
## 5                      0 STR -OPLI-19-002622
## 7                      2  STR-OPLI-19-002333
## 20                    13  STR-OPLI-24-000114
## 23                     2                    
## 26                     0  STR-OPLI-19-001130
head(listings[is.na(listings$reviews_per_month), ], 5)
##     listing_id                                   name  host_id host_name
## 21      340738         Victorian home on Capitol Hill  1729224    Marlow
## 273    4630355            Luxury Penthouse in Seattle 23167869    Bhuwan
## 542   10834487                            Corner room 48441443  Savannah
## 555   11254431           Master suite in Ballard home 19984963     Sarah
## 639   13081598 Urban Classic Lake Union & City center  1543665     Kaela
##     neighbourhood_group   neighbourhood latitude longitude       room_type
## 21         Capitol Hill         Stevens 47.62376 -122.3049 Entire home/apt
## 273          Queen Anne East Queen Anne 47.63353 -122.3475 Entire home/apt
## 542        Central Area           Minor 47.61179 -122.3135    Private room
## 555             Ballard   West Woodland 47.66669 -122.3663    Private room
## 639          Queen Anne East Queen Anne 47.63156 -122.3451 Entire home/apt
##     price minimum_nights number_of_reviews last_review reviews_per_month
## 21    495            100                 0                            NA
## 273   125             30                 0                            NA
## 542    90             30                 0                            NA
## 555   140            365                 0                            NA
## 639   138             30                 0                            NA
##     calculated_host_listings_count availability_365 number_of_reviews_ltm
## 21                               4              167                     0
## 273                              1               58                     0
## 542                              5              317                     0
## 555                              1              365                     0
## 639                              3              365                     0
##     license
## 21         
## 273        
## 542        
## 555        
## 639
# Replace missing listing prices with the average price of listings in the
# same neighbourhood (rounded to 2 decimals).
avg_price_neighbourhood <- listings %>%
  group_by(neighbourhood) %>%
  summarise(avg_price = round(mean(price, na.rm = TRUE), 2))

# Fallback for neighbourhoods in which no listing has a price: their mean is
# NaN, which would otherwise leave the price missing after imputation.
overall_avg_price <- round(mean(listings$price, na.rm = TRUE), 2)

listings <- listings %>%
  left_join(avg_price_neighbourhood, by = "neighbourhood") %>%
  mutate(
    avg_price = ifelse(is.na(avg_price), overall_avg_price, avg_price),
    price = ifelse(is.na(price), avg_price, price)
  ) %>%
  select(-avg_price)
# Missing reviews-per-month values become 0 (the NA rows inspected above all
# have number_of_reviews == 0).
listings$reviews_per_month <- replace(
  listings$reviews_per_month,
  is.na(listings$reviews_per_month),
  0
)
# check for NAs in reviews dataset
sum(is.na(reviews))
## [1] 21
colnames(reviews)[colSums(is.na(reviews)) >0]
## [1] "comments"
# Missing review comments are stored as the placeholder string "NIL".
reviews$comments <- replace(reviews$comments, is.na(reviews$comments), "NIL")
# check for NAs in calender dataset 
sum(is.na(calender))
## [1] 2350880
colnames(calender)[colSums(is.na(calender)) >0]
## [1] "adjusted_price" "minimum_nights" "maximum_nights"
# Zero-fill the three calendar columns that contain NAs.
# NOTE(review): the summary of the joined calendar data shows adjusted_price
# as 0 throughout, i.e. the column appears to carry no information -- confirm
# whether it should be dropped instead.
na_to_zero_cols <- c("adjusted_price", "minimum_nights", "maximum_nights")
calender[na_to_zero_cols] <- lapply(
  calender[na_to_zero_cols],
  function(column) replace(column, is.na(column), 0)
)
# see structure of dataset 
str(calender)
## 'data.frame':    2350878 obs. of  7 variables:
##  $ listing_id    : num  6606 6606 6606 6606 6606 ...
##  $ date          : chr  "2024-06-24" "2024-06-25" "2024-06-26" "2024-06-27" ...
##  $ available     : chr  "f" "f" "f" "f" ...
##  $ price         : chr  "$90.00" "$90.00" "$90.00" "$90.00" ...
##  $ adjusted_price: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ minimum_nights: num  30 30 30 30 30 30 30 30 30 30 ...
##  $ maximum_nights: num  1125 1125 1125 1125 1125 ...
str(listings)
## 'data.frame':    6442 obs. of  18 variables:
##  $ listing_id                    : num  6606 9419 9531 9534 9596 ...
##  $ name                          : chr  "Fab, private seattle urban cottage!" "Glorious sun room w/ memory foambed" "The Adorable Sweet Orange Craftsman" "The Coolest Tangerine Dream MIL!" ...
##  $ host_id                       : int  14942 30559 31481 31481 14942 102684 160789 601600 601266 2438665 ...
##  $ host_name                     : chr  "Joyce" "Angielena" "Cassie" "Cassie" ...
##  $ neighbourhood_group           : chr  "Other neighborhoods" "Other neighborhoods" "West Seattle" "West Seattle" ...
##  $ neighbourhood                 : chr  "Wallingford" "Georgetown" "Fairmount Park" "Fairmount Park" ...
##  $ latitude                      : num  47.7 47.6 47.6 47.6 47.7 ...
##  $ longitude                     : num  -122 -122 -122 -122 -122 ...
##  $ room_type                     : chr  "Entire home/apt" "Private room" "Entire home/apt" "Entire home/apt" ...
##  $ price                         : num  99 76 189 125 246 ...
##  $ minimum_nights                : int  30 2 3 2 30 2 3 3 30 30 ...
##  $ number_of_reviews             : int  160 196 97 77 96 1056 6 554 34 76 ...
##  $ last_review                   : chr  "2023-08-05" "2024-06-09" "2024-06-16" "2023-12-27" ...
##  $ reviews_per_month             : num  0.88 1.16 0.64 0.51 0.61 6.12 0.06 3.49 0.28 0.62 ...
##  $ calculated_host_listings_count: int  2 10 2 2 2 1 2 2 1 4 ...
##  $ availability_365              : int  147 337 133 1 16 96 92 314 173 43 ...
##  $ number_of_reviews_ltm         : int  1 15 23 2 0 61 2 28 4 1 ...
##  $ license                       : chr  "str-opli-19-002622" "Exempt" "STR-OPLI-19-002182" "STR-OPLI-19-002182" ...
str(reviews)
## 'data.frame':    481350 obs. of  6 variables:
##  $ listing_id   : num  6606 6606 6606 6606 6606 ...
##  $ id           : num  5664 338761 467904 480017 487278 ...
##  $ date         : chr  "2009-07-17" "2011-06-27" "2011-08-22" "2011-08-27" ...
##  $ reviewer_id  : int  18085 434031 976182 997921 206901 552477 1110380 2354750 1845181 1821528 ...
##  $ reviewer_name: chr  "Vivian" "Elliott" "Allegra" "Brittney" ...
##  $ comments     : chr  "The Urban Cottage is comfortable, beautiful, fun and really convenient!  Joyce is an amazing host and super fri"| __truncated__ "Joyce was a wonderful host and the urban cottage is a such an awesome place to stay (quiet, clean, comfortable,"| __truncated__ "Beautiful cottage and warm hospitality from Joyce. Even though we never got a chance to see each other I felt w"| __truncated__ "Joyce is a wonderful host! She is warm, helpful and fun to visit with. The cottage is cozy, bright and has all "| __truncated__ ...
# Encode availability as numeric: "t" -> 1, "f" -> 0; any other value -> NA.
calender$available <- unname(c(f = 0, t = 1)[calender$available])

# Convert the price strings (e.g. "$90.00", "$1,250.00") to numeric.
# Both the dollar sign AND the thousands separator must be removed: stripping
# only "$" turns every price of $1,000 or more into NA.
calender$price <- as.numeric(gsub("[$,]", "", calender$price))
# Strip leading/trailing whitespace from every character column; columns of
# any other type are returned untouched.
trim_character_columns <- function(df) {
  df[] <- lapply(df, function(column) {
    if (is.character(column)) {
      trimws(column)
    } else {
      column
    }
  })
  df
}

listings <- trim_character_columns(listings)
reviews <- trim_character_columns(reviews)
# Final analysis tables ------------------------------------------------------

# Listings without the licence / last-twelve-months / availability columns.
listingsdf <- listings %>%
  select(-license, -number_of_reviews_ltm, -availability_365)

# Reviews enriched with the listing's name, host and location.
reviews_with_listing <- left_join(reviews, listings, by = "listing_id")
reviewsdf <- reviews_with_listing %>%
  select(
    listing_id, id, name, host_id, host_name,
    neighbourhood, neighbourhood_group,
    date, reviewer_id, reviewer_name, comments
  )

# Calendar rows enriched with listing details. `price` and `minimum_nights`
# exist in both tables, so the join suffixes them; the calendar-side (`.x`)
# versions are kept and renamed back to their plain names while selecting.
calender_with_listing <- left_join(calender, listings, by = "listing_id")
calenderdf <- calender_with_listing %>%
  select(
    listing_id, name, host_id, host_name,
    neighbourhood, neighbourhood_group,
    date, available,
    price = price.x,
    adjusted_price,
    minimum_nights = minimum_nights.x,
    maximum_nights
  )

Visualisations

summary(listings)
##    listing_id            name              host_id           host_name        
##  Min.   :6.606e+03   Length:6442        Min.   :     4193   Length:6442       
##  1st Qu.:3.418e+07   Class :character   1st Qu.: 19877426   Class :character  
##  Median :6.144e+17   Mode  :character   Median : 80333752   Mode  :character  
##  Mean   :4.779e+17                      Mean   :156139594                     
##  3rd Qu.:9.319e+17                      3rd Qu.:255275523                     
##  Max.   :1.185e+18                      Max.   :584875397                     
##  neighbourhood_group neighbourhood         latitude       longitude     
##  Length:6442         Length:6442        Min.   :47.50   Min.   :-122.4  
##  Class :character    Class :character   1st Qu.:47.60   1st Qu.:-122.4  
##  Mode  :character    Mode  :character   Median :47.62   Median :-122.3  
##                                         Mean   :47.63   Mean   :-122.3  
##                                         3rd Qu.:47.66   3rd Qu.:-122.3  
##                                         Max.   :47.73   Max.   :-122.2  
##   room_type             price        minimum_nights   number_of_reviews
##  Length:6442        Min.   :  10.0   Min.   :  1.00   Min.   :   0.00  
##  Class :character   1st Qu.: 120.0   1st Qu.:  2.00   1st Qu.:   4.00  
##  Mode  :character   Median : 171.0   Median :  2.00   Median :  29.00  
##                     Mean   : 212.6   Mean   : 11.13   Mean   :  74.72  
##                     3rd Qu.: 250.0   3rd Qu.: 30.00   3rd Qu.:  97.00  
##                     Max.   :9000.0   Max.   :365.00   Max.   :1404.00  
##  last_review        reviews_per_month calculated_host_listings_count
##  Length:6442        Min.   :  0.000   Min.   :  1.00                
##  Class :character   1st Qu.:  0.300   1st Qu.:  1.00                
##  Mode  :character   Median :  1.410   Median :  2.00                
##                     Mean   :  1.969   Mean   : 29.11                
##                     3rd Qu.:  3.050   3rd Qu.: 11.00                
##                     Max.   :101.200   Max.   :340.00                
##  availability_365 number_of_reviews_ltm   license         
##  Min.   :  0.0    Min.   :  0.00        Length:6442       
##  1st Qu.: 81.0    1st Qu.:  1.00        Class :character  
##  Median :178.0    Median : 10.00        Mode  :character  
##  Mean   :183.3    Mean   : 18.68                          
##  3rd Qu.:295.0    3rd Qu.: 30.00                          
##  Max.   :365.0    Max.   :280.00
summary(reviewsdf)
##    listing_id              id                name              host_id         
##  Min.   :6.606e+03   Min.   :5.664e+03   Length:481350      Min.   :     4193  
##  1st Qu.:1.257e+07   1st Qu.:4.649e+08   Class :character   1st Qu.: 10162393  
##  Median :2.537e+07   Median :5.863e+17   Mode  :character   Median : 37101658  
##  Mean   :1.329e+17   Mean   :4.980e+17                      Mean   : 95677460  
##  3rd Qu.:5.057e+07   3rd Qu.:9.218e+17                      3rd Qu.:116740214  
##  Max.   :1.178e+18   Max.   :1.186e+18                      Max.   :581770696  
##   host_name         neighbourhood      neighbourhood_group     date          
##  Length:481350      Length:481350      Length:481350       Length:481350     
##  Class :character   Class :character   Class :character    Class :character  
##  Mode  :character   Mode  :character   Mode  :character    Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##   reviewer_id        reviewer_name        comments        
##  Min.   :       15   Length:481350      Length:481350     
##  1st Qu.: 30739500   Class :character   Class :character  
##  Median : 93728328   Mode  :character   Mode  :character  
##  Mean   :146998640                                        
##  3rd Qu.:219177054                                        
##  Max.   :584564834
summary(calenderdf)
##    listing_id            name              host_id           host_name        
##  Min.   :6.606e+03   Length:2350878     Min.   :     4193   Length:2350878    
##  1st Qu.:3.416e+07   Class :character   1st Qu.: 19862778   Class :character  
##  Median :6.144e+17   Mode  :character   Median : 80333752   Mode  :character  
##  Mean   :4.778e+17                      Mean   :156060758                     
##  3rd Qu.:9.319e+17                      3rd Qu.:255224226                     
##  Max.   :1.185e+18                      Max.   :584875397                     
##                                                                               
##  neighbourhood      neighbourhood_group     date             available     
##  Length:2350878     Length:2350878      Length:2350878     Min.   :0.0000  
##  Class :character   Class :character    Class :character   1st Qu.:0.0000  
##  Mode  :character   Mode  :character    Mode  :character   Median :1.0000  
##                                                            Mean   :0.5028  
##                                                            3rd Qu.:1.0000  
##                                                            Max.   :1.0000  
##                                                                            
##      price       adjusted_price minimum_nights   maximum_nights     
##  Min.   :  0.0   Min.   :0      Min.   :  0.00   Min.   :0.000e+00  
##  1st Qu.:100.0   1st Qu.:0      1st Qu.:  2.00   1st Qu.:3.650e+02  
##  Median :150.0   Median :0      Median :  3.00   Median :1.125e+03  
##  Mean   :207.1   Mean   :0      Mean   : 11.57   Mean   :2.017e+05  
##  3rd Qu.:250.0   3rd Qu.:0      3rd Qu.: 30.00   3rd Qu.:1.125e+03  
##  Max.   :999.0   Max.   :0      Max.   :365.00   Max.   :2.147e+09  
##  NA's   :94171

Top 10 Most Reviewed Listings

# Ten listings with the most reviews, highest first. Ties at the cut-off are
# kept, so more than ten rows are possible.
top_10_most_reviewed_listings <- listingsdf %>%
  slice_max(number_of_reviews, n = 10)

# Dark theme shared by both panels of this figure.
theme_top_10_reviewed <- theme(
  plot.background = element_rect(fill = "grey15"),
  panel.background = element_rect(fill = "grey9"),
  panel.grid = element_line(colour = "grey15"),
  legend.background = element_rect(fill = "grey15"),
  axis.text.x = element_text(
    angle = 55, hjust = 1, vjust = 1, color = "white", size = 8
  ),
  axis.text.y = element_text(color = "white"),
  plot.title = element_text(color = "white"),
  axis.title.x = element_text(color = "white"),
  axis.title.y = element_text(color = "white"),
  legend.text = element_text(color = "white"),
  legend.title = element_text(color = "white")
)

# Listings are ordered by review count (not alphabetically) so the ranking is
# visible; the price panel uses the same order so the two panels line up.
p1 <- ggplot(
  top_10_most_reviewed_listings,
  aes(
    x = reorder(name, -number_of_reviews),
    y = number_of_reviews,
    fill = neighbourhood
  )
) +
  geom_col() +
  labs(
    x = "Listing Name",
    y = "Number of Reviews",
    title = "Top 10 Most Reviewed Listings",
    fill = "Neighbourhood"
  ) +
  theme_top_10_reviewed

# The line colour is a fixed setting, so it belongs outside aes(); inside
# aes() the string "coral" is treated as data and the line is drawn in the
# default palette colour instead.
p2 <- ggplot(
  top_10_most_reviewed_listings,
  aes(x = reorder(name, -number_of_reviews), y = price)
) +
  geom_line(group = 1, color = "coral") +
  labs(
    x = "Listing Name",
    y = "Price",
    title = "Price of Top 10 Most Reviewed Listings"
  ) +
  theme_top_10_reviewed +
  theme(legend.position = "none")

combined1 <- p1 + p2 + plot_layout(ncol = 2)
combined1

Top 10 Most Reviewed Listings by Month

# Ten listings with the highest reviews-per-month rate, highest first. Ties at
# the cut-off are kept, so more than ten rows are possible.
top_10_most_reviewed_listings_month <- listingsdf %>%
  slice_max(reviews_per_month, n = 10)

# Dark theme shared by both panels of this figure.
theme_top_10_monthly <- theme(
  plot.background = element_rect(fill = "grey15"),
  panel.background = element_rect(fill = "grey9"),
  panel.grid = element_line(colour = "grey15"),
  legend.background = element_rect(fill = "grey15"),
  axis.text.x = element_text(
    angle = 70, hjust = 1, vjust = 1, color = "white", size = 8
  ),
  axis.text.y = element_text(color = "white"),
  plot.title = element_text(color = "white"),
  axis.title.x = element_text(color = "white"),
  axis.title.y = element_text(color = "white"),
  legend.text = element_text(color = "white"),
  legend.title = element_text(color = "white")
)

# Listings are ordered by monthly review rate (not alphabetically) so the
# ranking is visible; the price panel uses the same order.
p3 <- ggplot(
  top_10_most_reviewed_listings_month,
  aes(
    x = reorder(name, -reviews_per_month),
    y = reviews_per_month,
    fill = neighbourhood
  )
) +
  geom_col() +
  labs(
    x = "Listing Name",
    y = "Number of Monthly Reviews",
    title = "Top 10 Most Reviewed Listings by Month",
    fill = "Neighbourhood" # legend title previously misspelt "Neighbouhood"
  ) +
  theme_top_10_monthly

p4 <- ggplot(
  top_10_most_reviewed_listings_month,
  aes(x = reorder(name, -reviews_per_month), y = price)
) +
  geom_line(group = 1, color = "lightslateblue") +
  labs(
    x = "Listing Name",
    y = "Price",
    title = "Price of Top 10 Most Reviewed Listings by Month"
  ) +
  theme_top_10_monthly +
  theme(legend.position = "none")

combined2 <- p3 + p4 + plot_layout(ncol = 2)
combined2

Top 5 Hosts with The Most Listings

# Five hosts with the most listings, plus the total reviews across those
# listings. Grouping is dropped explicitly after summarising: a top-n step on
# a tibble still grouped by host_id would pick "the top 5" within each
# one-row group, i.e. filter nothing.
top_5_hosts_listings_num <- listingsdf %>%
  group_by(host_id, host_name) %>%
  summarise(
    listings_count = n(),
    number_of_reviews = sum(number_of_reviews),
    .groups = "drop"
  ) %>%
  arrange(desc(listings_count)) %>%
  head(5)

# Dark theme shared by both host panels.
theme_top_5_hosts <- theme(
  plot.background = element_rect(fill = "grey15"),
  panel.background = element_rect(fill = "grey9"),
  panel.grid = element_line(colour = "grey15"),
  legend.background = element_rect(fill = "grey15"),
  axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, color = "white"),
  axis.text.y = element_text(color = "white"),
  plot.title = element_text(color = "white"),
  axis.title.x = element_text(color = "white"),
  axis.title.y = element_text(color = "white"),
  legend.text = element_text(color = "white"),
  legend.title = element_text(color = "white")
)

# Left panel: number of listings per host.
p5 <- ggplot(
  top_5_hosts_listings_num,
  aes(x = host_name, y = listings_count, fill = host_name)
) +
  geom_col() +
  labs(
    x = "Host Name",
    y = "Listing Count",
    title = "Top 5 Hosts with The Highest Number of Listings",
    fill = "Host Name"
  ) +
  theme_top_5_hosts

# Right panel: total number of reviews for the same hosts.
p6 <- ggplot(
  top_5_hosts_listings_num,
  aes(x = host_name, y = number_of_reviews, fill = host_name)
) +
  geom_col() +
  labs(
    x = "Host Name",
    y = "Number of Reviews",
    title = "Number of Reviews of The Respective Hosts",
    fill = "Host Name"
  ) +
  theme_top_5_hosts

combined3 <- p5 + p6 + plot_layout(ncol = 2)
combined3

Top 5 Hosts with The Most Reviews

# Five hosts with the highest total review count across all their listings.
# `.groups = "drop"` returns an ungrouped tibble (the default would leave it
# grouped by host_id) and silences the summarise() grouping message.
top_5_hosts_listings_reviews <- listingsdf %>%
  group_by(host_id, host_name) %>%
  summarise(total_reviews = sum(number_of_reviews), .groups = "drop") %>%
  arrange(desc(total_reviews)) %>%
  head(5)
# Bar chart of total reviews for the five most-reviewed hosts (dark theme).
theme_top_5_host_reviews <- theme(
  plot.background = element_rect(fill = "grey15"),
  panel.background = element_rect(fill = "grey9"),
  panel.grid = element_line(colour = "grey15"),
  legend.background = element_rect(fill = "grey15"),
  axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, color = "white"),
  axis.text.y = element_text(color = "white"),
  plot.title = element_text(color = "white"),
  axis.title.x = element_text(color = "white"),
  axis.title.y = element_text(color = "white"),
  legend.text = element_text(color = "white"),
  legend.title = element_text(color = "white")
)

ggplot(
  top_5_hosts_listings_reviews,
  aes(x = host_name, y = total_reviews, fill = host_name)
) +
  geom_col() +
  labs(
    x = "Host Name",
    y = "Total Reviews Count",
    title = "Top 5 Hosts with The Most Reviews Count",
    fill = "Host Name"
  ) +
  theme_top_5_host_reviews

Top 5 Neighbourhoods with The Most Listings

# Listing count and mean price per neighbourhood, reduced to the five
# neighbourhoods with the most listings (ties kept), largest first.
neighbourhood_summary <- listingsdf %>%
  group_by(neighbourhood) %>%
  summarise(
    listings_count = n(),
    avg_price = round(mean(price), 2)
  )

top_5_neighbourhood_listings_num <- neighbourhood_summary %>%
  top_n(5, wt = listings_count) %>%
  arrange(desc(listings_count))

# Dark theme shared by both neighbourhood panels.
theme_top_5_neighbourhoods <- theme(
  plot.background = element_rect(fill = "grey15"),
  panel.background = element_rect(fill = "grey9"),
  panel.grid = element_line(colour = "grey15"),
  legend.background = element_rect(fill = "grey15"),
  axis.text.x = element_text(color = "white"),
  axis.text.y = element_text(color = "white"),
  plot.title = element_text(color = "white", size = 10),
  axis.title.x = element_text(color = "white"),
  axis.title.y = element_text(color = "white"),
  legend.text = element_text(color = "white"),
  legend.title = element_text(color = "white")
)

# Left panel: listing count per neighbourhood.
p7 <- ggplot(
  top_5_neighbourhood_listings_num,
  aes(x = neighbourhood, y = listings_count, fill = neighbourhood)
) +
  geom_col() +
  labs(
    x = "Neighbourhood",
    y = "Listing Count",
    title = "Top 5 Neighbourhoods by Listing Count",
    fill = "Neighbourhood"
  ) +
  theme_top_5_neighbourhoods

# Right panel: average price for the same neighbourhoods (no legend).
p8 <- ggplot(
  top_5_neighbourhood_listings_num,
  aes(x = neighbourhood, y = avg_price)
) +
  geom_line(group = 1, color = "magenta1") +
  labs(
    x = "Neighbourhood",
    y = "Average Price",
    title = "Average Price of The Top 5 Neighbourhoods by Listing Count"
  ) +
  theme_top_5_neighbourhoods +
  theme(legend.position = "none")

combined4 <- p7 + p8 + plot_layout(ncol = 2)
combined4

Top 5 Neighbourhood Groups with The Most Listings

top_5_neighbourhood_group_listings_num <- listings %>%  
    group_by(neighbourhood_group) %>% 
    summarise(listing_count = n(),
              avg_price = round(mean(price),2)) %>% 
    top_n(5, wt=listing_count) %>%  
    arrange(desc(listing_count))

# Bar chart of listing_count for each neighbourhood group held in
# top_5_neighbourhood_group_listings_num, one fill colour per group.
p9 <- top_5_neighbourhood_group_listings_num %>%
  ggplot(
    aes(
      x = neighbourhood_group,
      y = listing_count,
      fill = neighbourhood_group
    )
  ) +
  # geom_col() plots the y values as given (same as geom_bar(stat = "identity")).
  geom_col() +
  labs(
    title = "Top 5 Neighbourhood Groups by Listing Count",
    x = "Neighbourhood Group",
    y = "Listing Count",
    fill = "Neighbourhood Group"
  ) +
  # Dark surfaces: plot, panel and legend backgrounds plus grid lines.
  theme(
    panel.grid = element_line(colour = "grey15"),
    panel.background = element_rect(fill = "grey9"),
    plot.background = element_rect(fill = "grey15"),
    legend.background = element_rect(fill = "grey15")
  ) +
  # White text so labels stay readable on the dark background.
  theme(
    plot.title = element_text(colour = "white", size = 10),
    axis.title.x = element_text(colour = "white"),
    axis.title.y = element_text(colour = "white"),
    axis.text.x = element_text(colour = "white"),
    axis.text.y = element_text(colour = "white"),
    legend.title = element_text(colour = "white"),
    legend.text = element_text(colour = "white")
  )

# Line chart of avg_price for each neighbourhood group held in
# top_5_neighbourhood_group_listings_num.
p10 <- ggplot(top_5_neighbourhood_group_listings_num,
              aes(x = neighbourhood_group, y = avg_price)) +
        # The colour is a fixed setting, so it is passed to the geom. Inside
        # aes() the string 'salmon' was treated as a data value and the line
        # was drawn in the default discrete-scale colour instead of salmon.
        # group = 1 joins all categories into a single line.
        geom_line(group = 1, color = 'salmon') +
        labs(x = 'Neighbourhood Group', y = 'Average Price',
             title = "Average Price of Top 5 Neighbourhood Groups by Listing Count") +
        # Dark theme with white text; the legend is hidden.
        theme(plot.background = element_rect(fill = "grey15"),
              panel.background = element_rect(fill = "grey9"),
              panel.grid = element_line(colour = "grey15"),
              legend.background = element_rect(fill = "grey15"),
              axis.text.x = element_text(color = 'white'),
              axis.text.y = element_text(color = 'white'),
              plot.title = element_text(color = 'white', size = 10),
              axis.title.x = element_text(color = 'white'),
              axis.title.y = element_text(color = 'white'),
              legend.text = element_text(color = 'white'),
              legend.title = element_text(color = 'white'),
              legend.position = 'none')

# Place the count chart (p9) and the price chart (p10) side by side.
combined5 <- p9 + p10 + plot_layout(ncol = 2)
combined5

Top 5 Most Expensive Neighbourhoods

# Listing count and mean price (rounded to 2 dp) per neighbourhood, keeping the
# five neighbourhoods with the highest mean price.
#
# slice_max() replaces the arrange(desc()) + top_n() pair: top_n() is
# superseded, and slice_max() both keeps ties the same way and returns the rows
# already sorted by descending avg_price.
top_5_avg_price_neighbouhood <- listingsdf %>%
  group_by(neighbourhood) %>%
  summarise(
    listing_count = n(),
    avg_price = round(mean(price), 2)
  ) %>%
  slice_max(avg_price, n = 5)

# Bar chart of listing_count for each neighbourhood in
# top_5_avg_price_neighbouhood, one fill colour per neighbourhood.
p11 <- top_5_avg_price_neighbouhood %>%
  ggplot(aes(x = neighbourhood, y = listing_count, fill = neighbourhood)) +
  # geom_col() plots the y values as given (same as geom_bar(stat = "identity")).
  geom_col() +
  labs(
    title = "Listing Count of The Top 5 Most Expensive Neighbourhood",
    x = "Neighbourhood",
    y = "Listing Count",
    fill = "Neighbourhood"
  ) +
  # Dark surfaces: plot, panel and legend backgrounds plus grid lines.
  theme(
    panel.grid = element_line(colour = "grey15"),
    panel.background = element_rect(fill = "grey9"),
    plot.background = element_rect(fill = "grey15"),
    legend.background = element_rect(fill = "grey15")
  ) +
  # White text so labels stay readable on the dark background.
  theme(
    plot.title = element_text(colour = "white"),
    axis.title.x = element_text(colour = "white"),
    axis.title.y = element_text(colour = "white"),
    axis.text.x = element_text(colour = "white"),
    axis.text.y = element_text(colour = "white"),
    legend.title = element_text(colour = "white"),
    legend.text = element_text(colour = "white")
  )

# Line chart of avg_price for each neighbourhood in
# top_5_avg_price_neighbouhood.
p12 <- top_5_avg_price_neighbouhood %>%
  ggplot(aes(x = neighbourhood, y = avg_price)) +
  # group = 1 joins all categories into a single line.
  geom_line(group = 1, colour = "purple") +
  labs(
    title = "Top 5 Most Expensive Neighbourhood",
    x = "Neighbourhood",
    y = "Average Price"
  ) +
  # Dark surfaces: plot, panel and legend backgrounds plus grid lines.
  theme(
    panel.grid = element_line(colour = "grey15"),
    panel.background = element_rect(fill = "grey9"),
    plot.background = element_rect(fill = "grey15"),
    legend.background = element_rect(fill = "grey15")
  ) +
  # White text so labels stay readable on the dark background; legend hidden.
  theme(
    plot.title = element_text(colour = "white"),
    axis.title.x = element_text(colour = "white"),
    axis.title.y = element_text(colour = "white"),
    axis.text.x = element_text(colour = "white"),
    axis.text.y = element_text(colour = "white"),
    legend.title = element_text(colour = "white"),
    legend.text = element_text(colour = "white"),
    legend.position = "none"
  )

# Place the count chart (p11) and the price chart (p12) side by side.
combined6 <- p11 + p12 + plot_layout(ncol = 2)
combined6

Top 5 Most Expensive Neighbourhood Groups

# Listing count and mean price (rounded to 2 dp) per neighbourhood group,
# keeping the five groups with the highest mean price.
#
# slice_max() replaces the superseded top_n(). It keeps ties the same way and
# also returns the rows sorted by descending avg_price, which the original
# omitted here although the sibling "top 5" tables are sorted.
top_5_avg_price_neighbouhood_group <- listingsdf %>%
  group_by(neighbourhood_group) %>%
  summarise(
    listing_count = n(),
    avg_price = round(mean(price), 2)
  ) %>%
  slice_max(avg_price, n = 5)

# Bar chart of listing_count for each neighbourhood group in
# top_5_avg_price_neighbouhood_group, one fill colour per group.
p13 <- top_5_avg_price_neighbouhood_group %>%
  ggplot(
    aes(
      x = neighbourhood_group,
      y = listing_count,
      fill = neighbourhood_group
    )
  ) +
  # geom_col() plots the y values as given (same as geom_bar(stat = "identity")).
  geom_col() +
  labs(
    title = "Listing Count of The Top 5 Most Expensive Neighbourhood Groups",
    x = "Neighbourhood Group",
    y = "Listing Count",
    fill = "Neighbourhood Group"
  ) +
  # Dark surfaces: plot, panel and legend backgrounds plus grid lines.
  theme(
    panel.grid = element_line(colour = "grey15"),
    panel.background = element_rect(fill = "grey9"),
    plot.background = element_rect(fill = "grey15"),
    legend.background = element_rect(fill = "grey15")
  ) +
  # White text so labels stay readable on the dark background.
  theme(
    plot.title = element_text(colour = "white"),
    axis.title.x = element_text(colour = "white"),
    axis.title.y = element_text(colour = "white"),
    axis.text.x = element_text(colour = "white"),
    axis.text.y = element_text(colour = "white"),
    legend.title = element_text(colour = "white"),
    legend.text = element_text(colour = "white")
  )

# Line chart of avg_price for each neighbourhood group in
# top_5_avg_price_neighbouhood_group.
p14 <- top_5_avg_price_neighbouhood_group %>%
  ggplot(aes(x = neighbourhood_group, y = avg_price)) +
  # group = 1 joins all categories into a single line.
  geom_line(group = 1, colour = "green") +
  labs(
    title = "Top 5 Most Expensive Neighbourhood Groups",
    x = "Neighbourhood Group",
    y = "Average Price"
  ) +
  # Dark surfaces: plot, panel and legend backgrounds plus grid lines.
  theme(
    panel.grid = element_line(colour = "grey15"),
    panel.background = element_rect(fill = "grey9"),
    plot.background = element_rect(fill = "grey15"),
    legend.background = element_rect(fill = "grey15")
  ) +
  # White text so labels stay readable on the dark background; legend hidden.
  theme(
    plot.title = element_text(colour = "white"),
    axis.title.x = element_text(colour = "white"),
    axis.title.y = element_text(colour = "white"),
    axis.text.x = element_text(colour = "white"),
    axis.text.y = element_text(colour = "white"),
    legend.title = element_text(colour = "white"),
    legend.text = element_text(colour = "white"),
    legend.position = "none"
  )

# Place the count chart (p13) and the price chart (p14) side by side.
combined7 <- p13 + p14 + plot_layout(ncol = 2)
combined7

Listing Count & Average Price for Each Room Type

# One row per room type: number of listings and (unrounded) mean price.
room_type_count_price <- summarise(
  group_by(listingsdf, room_type),
  listing_count = n(),
  avg_price = mean(price)
)

# Most common room type first.
room_type_count_price <- arrange(room_type_count_price, desc(listing_count))

# Bar chart of listing_count per room type, one fill colour per room type.
p15 <- room_type_count_price %>%
  ggplot(aes(x = room_type, y = listing_count, fill = room_type)) +
  # geom_col() plots the y values as given (same as geom_bar(stat = "identity")).
  geom_col() +
  labs(
    title = "Listing Count for Each Room Type",
    x = "Room Type",
    y = "Listing Count",
    fill = "Room Type"
  ) +
  # Dark surfaces: plot, panel and legend backgrounds plus grid lines.
  theme(
    panel.grid = element_line(colour = "grey15"),
    panel.background = element_rect(fill = "grey9"),
    plot.background = element_rect(fill = "grey15"),
    legend.background = element_rect(fill = "grey15")
  ) +
  # White text; x tick labels are rotated 45 degrees and right-aligned.
  theme(
    plot.title = element_text(colour = "white"),
    axis.title.x = element_text(colour = "white"),
    axis.title.y = element_text(colour = "white"),
    axis.text.x = element_text(
      colour = "white", angle = 45, hjust = 1, vjust = 1
    ),
    axis.text.y = element_text(colour = "white"),
    legend.title = element_text(colour = "white"),
    legend.text = element_text(colour = "white")
  )

# Line chart of avg_price per room type.
p16 <- room_type_count_price %>%
  ggplot(aes(x = room_type, y = avg_price)) +
  # group = 1 joins all categories into a single line.
  geom_line(group = 1, colour = "orange") +
  labs(
    title = "Average Price for Each Room Type",
    x = "Room Type",
    y = "Average Price"
  ) +
  # Dark surfaces: plot, panel and legend backgrounds plus grid lines.
  theme(
    panel.grid = element_line(colour = "grey15"),
    panel.background = element_rect(fill = "grey9"),
    plot.background = element_rect(fill = "grey15"),
    legend.background = element_rect(fill = "grey15")
  ) +
  # White text; x tick labels are rotated 45 degrees and right-aligned.
  theme(
    plot.title = element_text(colour = "white"),
    axis.title.x = element_text(colour = "white"),
    axis.title.y = element_text(colour = "white"),
    axis.text.x = element_text(
      colour = "white", angle = 45, hjust = 1, vjust = 1
    ),
    axis.text.y = element_text(colour = "white"),
    legend.title = element_text(colour = "white"),
    legend.text = element_text(colour = "white")
  )

# Place the count chart (p15) and the price chart (p16) side by side.
combined8 <- p15 + p16 + plot_layout(ncol = 2)
combined8

Box Plot on Pricing for Room Types

# Box plot of individual listing prices per room type, with every listing
# overlaid as a white point.
ggplot(listingsdf, aes(x = room_type, y = price, fill = room_type)) +
    geom_boxplot() +
    # width = 0 keeps the points on each box's centre line. height = 0 is set
    # explicitly because geom_jitter() otherwise applies its default vertical
    # jitter, which would move each point away from its actual price.
    geom_jitter(width = 0, height = 0, color = 'white') +
    # The y axis shows raw listing prices, not averages, so it is labelled
    # 'Price'.
    labs(x = 'Room Type', y = 'Price', title = 'Box Plot of Room Type Pricing') +
    theme_minimal() +
    # Dark-theme overrides applied on top of theme_minimal().
    theme(plot.background = element_rect(fill = "grey15"),
          panel.background = element_rect(fill = "grey9"),
          panel.grid = element_line(colour = "grey15"),
          legend.background = element_rect(fill = "grey15"),
          axis.text.x = element_text(color = 'white'),
          axis.text.y = element_text(color = 'white'),
          plot.title = element_text(color = 'white'),
          axis.title.x = element_text(color = 'white'),
          axis.title.y = element_text(color = 'white'),
          legend.text = element_text(color = 'white'),
          legend.title = element_text(color = 'white'))

Availability of The Top 10 Most Reviewed Listings

# Calendar rows for the listings in top_10_most_reviewed_listings, with the
# date parsed and its month (as a labelled factor) and year split out.
listing_avail_df <- calenderdf %>%
    filter(listing_id %in% top_10_most_reviewed_listings$listing_id) %>%
    mutate(date = as.Date(date),
           month = month(date, label = TRUE),
           year = year(date))

# Mean availability per listing for each year/month.
# NOTE(review): mean(available) assumes `available` is already logical or
# numeric rather than raw "t"/"f" strings - confirm where calenderdf is built.
# .groups = "drop" returns an ungrouped result and silences the
# "summarise() has grouped output by ..." message.
avg_listing_avail_month <- listing_avail_df %>%
    group_by(year, month, listing_id, name) %>%
    summarise(avg_availability = mean(available), .groups = "drop")
# Monthly mean availability for each listing, one facet per listing name.
#
# The x axis is a real date (first day of each year/month) rather than the
# month label alone. The calendar runs from June 2024 to June 2025, so plotting
# by month only would put both Junes at the same x position and order the
# months Jan-Dec instead of chronologically.
ggplot(avg_listing_avail_month,
       aes(x = make_date(year, as.integer(month), 1L),
           y = avg_availability, color = name, group = name)) +
    geom_line() +
    facet_wrap(~ name, scales = 'free_y', ncol = 5) +
    labs(x = 'Month', y = 'Average Availability',
         title = 'Average Availability Through Each Month',
         color = 'Listing Name') +
    scale_x_date(date_breaks = '2 months', date_labels = '%b %y') +
    # Fixed 0-1 limits make the facets directly comparable.
    scale_y_continuous(limits = c(0, 1)) +
    # Dark theme with white text; x tick labels are small and rotated.
    theme(plot.background = element_rect(fill = "grey15"),
          panel.background = element_rect(fill = "grey9"),
          panel.grid = element_line(colour = "grey15"),
          legend.background = element_rect(fill = "grey15"),
          axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1,
                                     color = 'white', size = 7),
          axis.text.y = element_text(color = 'white'),
          plot.title = element_text(color = 'white'),
          axis.title.x = element_text(color = 'white'),
          axis.title.y = element_text(color = 'white'),
          legend.text = element_text(color = 'white'),
          legend.title = element_text(color = 'white'))

Word Cloud of Comments on Top 10 Most Reviewed Listings

# Review comments for the listings in top_10_most_reviewed_listings.
listingscomments <- reviewsdf %>%
    filter(listing_id %in% top_10_most_reviewed_listings$listing_id) %>%
    select(comments)

comments_vector <- unlist(listingscomments$comments)

# Build and clean the text corpus.
corpus <- Corpus(VectorSource(comments_vector))
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove common stop words such as "the" and "and". This runs BEFORE
# punctuation is stripped: the stop-word list contains contractions such as
# "don't" and "i'm", which no longer match once apostrophes are removed.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)

# Create a Term-Document Matrix (terms in rows, documents in columns)
dtm <- TermDocumentMatrix(corpus)

# Convert to a matrix, total each term across documents, then to a data frame
m <- as.matrix(dtm)
word_freq <- sort(rowSums(m), decreasing = TRUE)
word_freq_df <- data.frame(word = names(word_freq), freq = word_freq)

set.seed(1234)  # For reproducibility
# Draw up to 100 words, most frequent placed first.
wordcloud(words = word_freq_df$word, freq = word_freq_df$freq,
          min.freq = 1, max.words = 100,
          random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))

Word Cloud of Comments on the Top 5 Hosts with the Most Listings

# Review comments for listings whose host is in top_5_hosts_listings_reviews.
listingscomments2 <- reviewsdf %>%
    filter(host_id %in% top_5_hosts_listings_reviews$host_id) %>%
    select(comments)

# Pull the comments column out as a plain vector (the original unlisted the
# whole data frame, which yields the same text with generated element names).
comments_vector2 <- unlist(listingscomments2$comments)

# Build and clean the text corpus.
corpus2 <- Corpus(VectorSource(comments_vector2))
corpus2 <- tm_map(corpus2, content_transformer(tolower))
# Stop words are removed BEFORE punctuation is stripped: the stop-word list
# contains contractions such as "don't" and "i'm", which no longer match once
# apostrophes are removed.
corpus2 <- tm_map(corpus2, removeWords, stopwords("english"))
corpus2 <- tm_map(corpus2, removePunctuation)
corpus2 <- tm_map(corpus2, removeNumbers)

# Term-Document Matrix (terms in rows, documents in columns)
dtm2 <- TermDocumentMatrix(corpus2)

# Total each term across documents and sort by descending frequency
m2 <- as.matrix(dtm2)
word_freq2 <- sort(rowSums(m2), decreasing = TRUE)
word_freq_df2 <- data.frame(word = names(word_freq2), freq = word_freq2)

set.seed(1234)  # For reproducibility
# Draw up to 100 words, most frequent placed first.
wordcloud(words = word_freq_df2$word, freq = word_freq_df2$freq,
          min.freq = 1, max.words = 100,
          random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))