This report presents an analysis of Airbnb listings in Seattle, encompassing a range of data including listing and host names, neighborhoods, pricing, number of reviews, comments, and availability from June 24, 2024, to June 24, 2025. The insights derived from this analysis can be used to assist current and prospective Airbnb hosts in making informed decisions and enhancing their offerings. Additionally, it provides valuable information for clients in selecting the most suitable Airbnb accommodations.
calender <- read.csv('~/Case Studies/Seattle Air BnB/calendar.csv')
listings <- read.csv('~/Case Studies/Seattle Air BnB/listings.csv')
reviews <- read.csv('~/Case Studies/Seattle Air BnB/reviews.csv')
library('dplyr')
library('magrittr')
library('ggplot2')
library('wordcloud')
library('tm')
library('RColorBrewer')
library('lubridate')
library('patchwork')
# check for different variations of spelling
unique(listings$neighbourhood_group)
## [1] "Other neighborhoods" "West Seattle" "Ballard"
## [4] "Magnolia" "Queen Anne" "Downtown"
## [7] "Cascade" "Capitol Hill" "Beacon Hill"
## [10] "Lake City" "Rainier Valley" "Central Area"
## [13] "University District" "Delridge" "Northgate"
## [16] "Seward Park" "Interbay"
unique(listings$neighbourhood)
## [1] "Wallingford" "Georgetown"
## [3] "Fairmount Park" "Whittier Heights"
## [5] "Sunset Hill" "Fremont"
## [7] "Phinney Ridge" "Crown Hill"
## [9] "Lawton Park" "Alki"
## [11] "North Queen Anne" "West Queen Anne"
## [13] "First Hill" "Eastlake"
## [15] "Broadway" "Stevens"
## [17] "North Admiral" "Portage Bay"
## [19] "International District" "Green Lake"
## [21] "North Beacon Hill" "Greenwood"
## [23] "Cedar Park" "Columbia City"
## [25] "Mount Baker" "Mann"
## [27] "Genesee" "Ravenna"
## [29] "Belltown" "University District"
## [31] "Harrison/Denny-Blaine" "South Delridge"
## [33] "Broadview" "Maple Leaf"
## [35] "East Queen Anne" "Atlantic"
## [37] "Highland Park" "West Woodland"
## [39] "Laurelhurst" "Madison Park"
## [41] "Fauntleroy" "Madrona"
## [43] "Loyal Heights" "Gatewood"
## [45] "Haller Lake" "Dunlap"
## [47] "Leschi" "Adams"
## [49] "North Beach/Blue Ridge" "North Delridge"
## [51] "Bryant" "Seward Park"
## [53] "Pioneer Square" "Pike-Market"
## [55] "High Point" "Central Business District"
## [57] "Yesler Terrace" "Bitter Lake"
## [59] "Windermere" "Lower Queen Anne"
## [61] "Minor" "Rainier Beach"
## [63] "Seaview" "Victory Heights"
## [65] "Roosevelt" "Matthews Beach"
## [67] "Southeast Magnolia" "Olympic Hills"
## [69] "Mid-Beacon Hill" "Brighton"
## [71] "South Lake Union" "Briarcliff"
## [73] "Montlake" "North College Park"
## [75] "View Ridge" "Riverview"
## [77] "Pinehurst" "Interbay"
## [79] "Wedgwood" "Rainier View"
## [81] "Meadowbrook" "South Beacon Hill"
## [83] "Industrial District" "South Park"
## [85] "Westlake" "Arbor Heights"
## [87] "Roxhill" "Holly Park"
unique(listings$room_type)
## [1] "Entire home/apt" "Private room" "Shared room" "Hotel room"
unique(calender$available)
## [1] "f" "t"
# see column names
colnames(listings)
## [1] "id" "name"
## [3] "host_id" "host_name"
## [5] "neighbourhood_group" "neighbourhood"
## [7] "latitude" "longitude"
## [9] "room_type" "price"
## [11] "minimum_nights" "number_of_reviews"
## [13] "last_review" "reviews_per_month"
## [15] "calculated_host_listings_count" "availability_365"
## [17] "number_of_reviews_ltm" "license"
colnames(reviews)
## [1] "listing_id" "id" "date" "reviewer_id"
## [5] "reviewer_name" "comments"
colnames(calender)
## [1] "listing_id" "date" "available" "price"
## [5] "adjusted_price" "minimum_nights" "maximum_nights"
# replace column name in listings
listings <- listings %>%
rename(listing_id = id)
# check for duplicates
sum(duplicated(listings))
## [1] 0
sum(duplicated(reviews))
## [1] 0
sum(duplicated(listings))
## [1] 0
# check for NAs in listings data set
sum(is.na(listings))
## [1] 1272
colnames(listings)[colSums(is.na(listings)) >0]
## [1] "price" "reviews_per_month"
# view the specific rows with NAs
head(listings[is.na(listings$price), ], 5)
## listing_id name host_id
## 5 9596 the down home , spacious, central and fab! 14942
## 7 37234 Your Home Away From Home. Private Parking onsite. 160789
## 20 340706 Charming Wallingford Apartment 1015653
## 23 356248 Amazing Waterview Condo near UW + Eastlake! 306615
## 26 368403 Large Luxury Lakeside Seattle Home 4186078
## host_name neighbourhood_group neighbourhood latitude longitude
## 5 Joyce Other neighborhoods Wallingford 47.65608 -122.3360
## 7 Darrell Ballard Sunset Hill 47.68897 -122.3942
## 20 David Other neighborhoods Wallingford 47.65431 -122.3336
## 23 Tara Capitol Hill Portage Bay 47.65047 -122.3202
## 26 Trung Other neighborhoods Green Lake 47.68528 -122.3312
## room_type price minimum_nights number_of_reviews last_review
## 5 Entire home/apt NA 30 96 2020-09-28
## 7 Entire home/apt NA 3 6 2023-08-26
## 20 Entire home/apt NA 3 213 2024-06-19
## 23 Entire home/apt NA 30 30 2024-06-06
## 26 Entire home/apt NA 4 46 2023-04-02
## reviews_per_month calculated_host_listings_count availability_365
## 5 0.61 2 16
## 7 0.06 2 92
## 20 1.42 1 1
## 23 0.32 2 1
## 26 0.35 1 337
## number_of_reviews_ltm license
## 5 0 STR -OPLI-19-002622
## 7 2 STR-OPLI-19-002333
## 20 13 STR-OPLI-24-000114
## 23 2
## 26 0 STR-OPLI-19-001130
head(listings[is.na(listings$reviews_per_month), ], 5)
## listing_id name host_id host_name
## 21 340738 Victorian home on Capitol Hill 1729224 Marlow
## 273 4630355 Luxury Penthouse in Seattle 23167869 Bhuwan
## 542 10834487 Corner room 48441443 Savannah
## 555 11254431 Master suite in Ballard home 19984963 Sarah
## 639 13081598 Urban Classic Lake Union & City center 1543665 Kaela
## neighbourhood_group neighbourhood latitude longitude room_type
## 21 Capitol Hill Stevens 47.62376 -122.3049 Entire home/apt
## 273 Queen Anne East Queen Anne 47.63353 -122.3475 Entire home/apt
## 542 Central Area Minor 47.61179 -122.3135 Private room
## 555 Ballard West Woodland 47.66669 -122.3663 Private room
## 639 Queen Anne East Queen Anne 47.63156 -122.3451 Entire home/apt
## price minimum_nights number_of_reviews last_review reviews_per_month
## 21 495 100 0 NA
## 273 125 30 0 NA
## 542 90 30 0 NA
## 555 140 365 0 NA
## 639 138 30 0 NA
## calculated_host_listings_count availability_365 number_of_reviews_ltm
## 21 4 167 0
## 273 1 58 0
## 542 5 317 0
## 555 1 365 0
## 639 3 365 0
## license
## 21
## 273
## 542
## 555
## 639
# replace NAs in price column with avg price of listings in the neighbourhood
avg_price_neighbourhood <- listings %>%
group_by(neighbourhood) %>%
summarise(avg_price = round(mean(price, na.rm = TRUE),2))
listings <- listings %>%
left_join(avg_price_neighbourhood, by = "neighbourhood") %>%
mutate(price = ifelse(is.na(price), avg_price, price)) %>%
select(-avg_price)
# replace NAs in reviews per month with 0
listings$reviews_per_month[is.na(listings$reviews_per_month)] <- 0
# check for NAs in reviews dataset
sum(is.na(reviews))
## [1] 21
colnames(reviews)[colSums(is.na(reviews)) >0]
## [1] "comments"
# replace NAs in comments with NIL
reviews$comments[is.na(reviews$comments)] <- 'NIL'
# check for NAs in calender dataset
sum(is.na(calender))
## [1] 2350880
colnames(calender)[colSums(is.na(calender)) >0]
## [1] "adjusted_price" "minimum_nights" "maximum_nights"
# replace NAs in adjusted price, min and max nights with 0
calender$adjusted_price[is.na(calender$adjusted_price)] <- 0
calender$minimum_nights[is.na(calender$minimum_nights)] <- 0
calender$maximum_nights[is.na(calender$maximum_nights)] <- 0
# see structure of dataset
str(calender)
## 'data.frame': 2350878 obs. of 7 variables:
## $ listing_id : num 6606 6606 6606 6606 6606 ...
## $ date : chr "2024-06-24" "2024-06-25" "2024-06-26" "2024-06-27" ...
## $ available : chr "f" "f" "f" "f" ...
## $ price : chr "$90.00" "$90.00" "$90.00" "$90.00" ...
## $ adjusted_price: num 0 0 0 0 0 0 0 0 0 0 ...
## $ minimum_nights: num 30 30 30 30 30 30 30 30 30 30 ...
## $ maximum_nights: num 1125 1125 1125 1125 1125 ...
str(listings)
## 'data.frame': 6442 obs. of 18 variables:
## $ listing_id : num 6606 9419 9531 9534 9596 ...
## $ name : chr "Fab, private seattle urban cottage!" "Glorious sun room w/ memory foambed" "The Adorable Sweet Orange Craftsman" "The Coolest Tangerine Dream MIL!" ...
## $ host_id : int 14942 30559 31481 31481 14942 102684 160789 601600 601266 2438665 ...
## $ host_name : chr "Joyce" "Angielena" "Cassie" "Cassie" ...
## $ neighbourhood_group : chr "Other neighborhoods" "Other neighborhoods" "West Seattle" "West Seattle" ...
## $ neighbourhood : chr "Wallingford" "Georgetown" "Fairmount Park" "Fairmount Park" ...
## $ latitude : num 47.7 47.6 47.6 47.6 47.7 ...
## $ longitude : num -122 -122 -122 -122 -122 ...
## $ room_type : chr "Entire home/apt" "Private room" "Entire home/apt" "Entire home/apt" ...
## $ price : num 99 76 189 125 246 ...
## $ minimum_nights : int 30 2 3 2 30 2 3 3 30 30 ...
## $ number_of_reviews : int 160 196 97 77 96 1056 6 554 34 76 ...
## $ last_review : chr "2023-08-05" "2024-06-09" "2024-06-16" "2023-12-27" ...
## $ reviews_per_month : num 0.88 1.16 0.64 0.51 0.61 6.12 0.06 3.49 0.28 0.62 ...
## $ calculated_host_listings_count: int 2 10 2 2 2 1 2 2 1 4 ...
## $ availability_365 : int 147 337 133 1 16 96 92 314 173 43 ...
## $ number_of_reviews_ltm : int 1 15 23 2 0 61 2 28 4 1 ...
## $ license : chr "str-opli-19-002622" "Exempt" "STR-OPLI-19-002182" "STR-OPLI-19-002182" ...
str(reviews)
## 'data.frame': 481350 obs. of 6 variables:
## $ listing_id : num 6606 6606 6606 6606 6606 ...
## $ id : num 5664 338761 467904 480017 487278 ...
## $ date : chr "2009-07-17" "2011-06-27" "2011-08-22" "2011-08-27" ...
## $ reviewer_id : int 18085 434031 976182 997921 206901 552477 1110380 2354750 1845181 1821528 ...
## $ reviewer_name: chr "Vivian" "Elliott" "Allegra" "Brittney" ...
## $ comments : chr "The Urban Cottage is comfortable, beautiful, fun and really convenient! Joyce is an amazing host and super fri"| __truncated__ "Joyce was a wonderful host and the urban cottage is a such an awesome place to stay (quiet, clean, comfortable,"| __truncated__ "Beautiful cottage and warm hospitality from Joyce. Even though we never got a chance to see each other I felt w"| __truncated__ "Joyce is a wonderful host! She is warm, helpful and fun to visit with. The cottage is cozy, bright and has all "| __truncated__ ...
# replace t and f in available column
calender$available <- ifelse(calender$available == "f", 0, ifelse(calender$available == "t", 1, calender$available))
# change data type
calender$available <- as.numeric(calender$available)
calender$price <- as.numeric(gsub("\\$", "", calender$price))
# trim data
listings[] <- lapply(listings, function(x) if(is.character(x)) trimws(x) else x)
reviews[] <- lapply(reviews, function(x) if(is.character(x)) trimws(x) else x)
# finalise dataset
listingsdf <- subset(listings, select = -c(license, number_of_reviews_ltm, availability_365))
reviewsdf <- reviews %>%
left_join(listings, by = "listing_id") %>%
select(listing_id, id, name, host_id, host_name, neighbourhood, neighbourhood_group, date, reviewer_id, reviewer_name, comments)
calenderdf <- calender %>%
left_join(listings, by = "listing_id") %>%
select(listing_id, name, host_id, host_name, neighbourhood, neighbourhood_group, date, available, price.x, adjusted_price, minimum_nights.x, maximum_nights)
# rename columns
calenderdf <- calenderdf %>%
rename(price = price.x, minimum_nights = minimum_nights.x)
summary(listings)
## listing_id name host_id host_name
## Min. :6.606e+03 Length:6442 Min. : 4193 Length:6442
## 1st Qu.:3.418e+07 Class :character 1st Qu.: 19877426 Class :character
## Median :6.144e+17 Mode :character Median : 80333752 Mode :character
## Mean :4.779e+17 Mean :156139594
## 3rd Qu.:9.319e+17 3rd Qu.:255275523
## Max. :1.185e+18 Max. :584875397
## neighbourhood_group neighbourhood latitude longitude
## Length:6442 Length:6442 Min. :47.50 Min. :-122.4
## Class :character Class :character 1st Qu.:47.60 1st Qu.:-122.4
## Mode :character Mode :character Median :47.62 Median :-122.3
## Mean :47.63 Mean :-122.3
## 3rd Qu.:47.66 3rd Qu.:-122.3
## Max. :47.73 Max. :-122.2
## room_type price minimum_nights number_of_reviews
## Length:6442 Min. : 10.0 Min. : 1.00 Min. : 0.00
## Class :character 1st Qu.: 120.0 1st Qu.: 2.00 1st Qu.: 4.00
## Mode :character Median : 171.0 Median : 2.00 Median : 29.00
## Mean : 212.6 Mean : 11.13 Mean : 74.72
## 3rd Qu.: 250.0 3rd Qu.: 30.00 3rd Qu.: 97.00
## Max. :9000.0 Max. :365.00 Max. :1404.00
## last_review reviews_per_month calculated_host_listings_count
## Length:6442 Min. : 0.000 Min. : 1.00
## Class :character 1st Qu.: 0.300 1st Qu.: 1.00
## Mode :character Median : 1.410 Median : 2.00
## Mean : 1.969 Mean : 29.11
## 3rd Qu.: 3.050 3rd Qu.: 11.00
## Max. :101.200 Max. :340.00
## availability_365 number_of_reviews_ltm license
## Min. : 0.0 Min. : 0.00 Length:6442
## 1st Qu.: 81.0 1st Qu.: 1.00 Class :character
## Median :178.0 Median : 10.00 Mode :character
## Mean :183.3 Mean : 18.68
## 3rd Qu.:295.0 3rd Qu.: 30.00
## Max. :365.0 Max. :280.00
summary(reviewsdf)
## listing_id id name host_id
## Min. :6.606e+03 Min. :5.664e+03 Length:481350 Min. : 4193
## 1st Qu.:1.257e+07 1st Qu.:4.649e+08 Class :character 1st Qu.: 10162393
## Median :2.537e+07 Median :5.863e+17 Mode :character Median : 37101658
## Mean :1.329e+17 Mean :4.980e+17 Mean : 95677460
## 3rd Qu.:5.057e+07 3rd Qu.:9.218e+17 3rd Qu.:116740214
## Max. :1.178e+18 Max. :1.186e+18 Max. :581770696
## host_name neighbourhood neighbourhood_group date
## Length:481350 Length:481350 Length:481350 Length:481350
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## reviewer_id reviewer_name comments
## Min. : 15 Length:481350 Length:481350
## 1st Qu.: 30739500 Class :character Class :character
## Median : 93728328 Mode :character Mode :character
## Mean :146998640
## 3rd Qu.:219177054
## Max. :584564834
summary(calenderdf)
## listing_id name host_id host_name
## Min. :6.606e+03 Length:2350878 Min. : 4193 Length:2350878
## 1st Qu.:3.416e+07 Class :character 1st Qu.: 19862778 Class :character
## Median :6.144e+17 Mode :character Median : 80333752 Mode :character
## Mean :4.778e+17 Mean :156060758
## 3rd Qu.:9.319e+17 3rd Qu.:255224226
## Max. :1.185e+18 Max. :584875397
##
## neighbourhood neighbourhood_group date available
## Length:2350878 Length:2350878 Length:2350878 Min. :0.0000
## Class :character Class :character Class :character 1st Qu.:0.0000
## Mode :character Mode :character Mode :character Median :1.0000
## Mean :0.5028
## 3rd Qu.:1.0000
## Max. :1.0000
##
## price adjusted_price minimum_nights maximum_nights
## Min. : 0.0 Min. :0 Min. : 0.00 Min. :0.000e+00
## 1st Qu.:100.0 1st Qu.:0 1st Qu.: 2.00 1st Qu.:3.650e+02
## Median :150.0 Median :0 Median : 3.00 Median :1.125e+03
## Mean :207.1 Mean :0 Mean : 11.57 Mean :2.017e+05
## 3rd Qu.:250.0 3rd Qu.:0 3rd Qu.: 30.00 3rd Qu.:1.125e+03
## Max. :999.0 Max. :0 Max. :365.00 Max. :2.147e+09
## NA's :94171
top_10_most_reviewed_listings <- listingsdf %>%
top_n(10, wt=number_of_reviews) %>%
arrange(desc(number_of_reviews))
p1 <- ggplot(top_10_most_reviewed_listings, aes(x=name, y=number_of_reviews, fill=neighbourhood)) +
geom_bar(stat='identity') +
labs(x='Listing Name', y="Number of Reviews", title='Top 10 Most Reviewed Listings', fill='Neighbourhood') +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(angle = 55, hjust = 1, vjust = 1, color='white', size=8),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white'),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'))
p2 <- ggplot(top_10_most_reviewed_listings, aes(x=name, y=price, color="coral")) +
geom_line(group=1) +
labs(x='Listing Name', y="Price", title='Price of Top 10 Most Reviewed Listings') +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(angle = 55, hjust = 1, vjust = 1, color='white', size=8),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white'),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'),
legend.position = 'none')
combined1 <- p1 + p2 + plot_layout(ncol=2)
combined1
top_10_most_reviewed_listings_month <- listingsdf %>%
top_n(10, wt=reviews_per_month) %>%
arrange(desc(reviews_per_month))
p3 <- ggplot(top_10_most_reviewed_listings_month, aes(x=name, y=reviews_per_month, fill=neighbourhood)) +
geom_bar(stat='identity') +
labs(x='Listing Name', y='Number of Monthly Reviews', title='Top 10 Most Reviewed Listings by Month', fill='Neighbouhood') +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(angle = 70, hjust = 1, vjust = 1, color='white', size=8),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white'),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'))
p4 <- ggplot(top_10_most_reviewed_listings_month, aes(x=name, y=price)) +
geom_line(group=1, color='lightslateblue') +
labs(x='Listing Name', y='Price', title='Price of Top 10 Most Reviewed Listings by Month') +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(angle = 70, hjust = 1, vjust = 1, color='white', size=8),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white'),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'),
legend.position = 'none')
combined2 <- p3 + p4 + plot_layout(ncol = 2)
combined2
top_5_hosts_listings_num <- listingsdf %>%
group_by(host_id, host_name) %>%
summarise(listings_count = n(),
number_of_reviews = sum(number_of_reviews)) %>%
top_n(5, wt=listings_count) %>%
arrange(desc(listings_count))
## `summarise()` has grouped output by 'host_id'. You can override using the
## `.groups` argument.
top_5_hosts_listings_num <- head(top_5_hosts_listings_num, 5)
p5 <- ggplot(top_5_hosts_listings_num, aes(x=host_name, y=listings_count, fill=host_name)) +
geom_bar(stat='identity') +
labs(x='Host Name', y='Listing Count', title = 'Top 5 Hosts with The Highest Number of Listings', fill = 'Host Name') +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, color='white'),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white'),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'))
p6 <- ggplot(top_5_hosts_listings_num, aes(x=host_name, y=number_of_reviews, fill=host_name)) +
geom_bar(stat='identity') +
labs(x='Host Name', y='Number of Reviews', title = 'Number of Reviews of The Respective Hosts', fill = 'Host Name') +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, color='white'),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white'),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'))
combined3 <- p5 + p6 + plot_layout(ncol=2)
combined3
top_5_hosts_listings_reviews <- listingsdf %>%
group_by(host_id, host_name) %>%
summarise(total_reviews = sum(number_of_reviews)) %>%
arrange(desc(total_reviews)) %>%
head(5)
## `summarise()` has grouped output by 'host_id'. You can override using the
## `.groups` argument.
ggplot(top_5_hosts_listings_reviews, aes(x=host_name, y=total_reviews, fill=host_name)) +
geom_bar(stat='identity') +
labs(x='Host Name', y='Total Reviews Count', title='Top 5 Hosts with The Most Reviews Count',
fill='Host Name') +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, color='white'),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white'),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'))
top_5_neighbourhood_listings_num <- listingsdf %>%
group_by(neighbourhood) %>%
summarise(listings_count = n(),
avg_price = round(mean(price), 2)) %>%
top_n(5, wt=listings_count) %>%
arrange(desc(listings_count))
p7 <- ggplot(top_5_neighbourhood_listings_num, aes(x=neighbourhood, y=listings_count, fill=neighbourhood)) +
geom_bar(stat='identity') +
labs(x='Neighbourhood', y='Listing Count', title = 'Top 5 Neighbourhoods by Listing Count', fill = 'Neighbourhood') +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(color='white'),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white', size=10),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'))
p8 <- ggplot(top_5_neighbourhood_listings_num, aes(x=neighbourhood, y=avg_price)) +
geom_line(group=1, color='magenta1') +
labs(x='Neighbourhood', y='Average Price', title = 'Average Price of The Top 5 Neighbourhoods by Listing Count') +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(color='white'),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white', size=10),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'),
legend.position = 'none')
combined4 <- p7 + p8 +plot_layout(ncol=2)
combined4
top_5_neighbourhood_group_listings_num <- listings %>%
group_by(neighbourhood_group) %>%
summarise(listing_count = n(),
avg_price = round(mean(price),2)) %>%
top_n(5, wt=listing_count) %>%
arrange(desc(listing_count))
p9 <- ggplot(top_5_neighbourhood_group_listings_num, aes(x=neighbourhood_group, y=listing_count, fill = neighbourhood_group)) +
geom_bar(stat='identity') +
labs(x='Neighbourhood Group', y = 'Listing Count',
title='Top 5 Neighbourhood Groups by Listing Count', fill = 'Neighbourhood Group') +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(color='white'),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white', size=10),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'))
p10 <- ggplot(top_5_neighbourhood_group_listings_num, aes(x=neighbourhood_group, y=avg_price, color='salmon')) +
geom_line(group=1) +
labs(x='Neighbourhood Group', y='Average Price',
title = "Average Price of Top 5 Neighbourhood Groups by Listing Count") +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(color='white'),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white', size=10),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'),
legend.position = 'none')
combined5 <- p9 + p10 + plot_layout(ncol=2)
combined5
top_5_avg_price_neighbouhood <- listingsdf %>%
group_by(neighbourhood) %>%
summarise(listing_count = n(),
avg_price = round(mean(price), 2)) %>%
arrange(desc(avg_price)) %>%
top_n(5, wt=avg_price)
p11 <- ggplot(top_5_avg_price_neighbouhood, aes(x=neighbourhood, y=listing_count, fill=neighbourhood)) +
geom_bar(stat='identity') +
labs(x='Neighbourhood', y = 'Listing Count',
title='Listing Count of The Top 5 Most Expensive Neighbourhood',
fill = 'Neighbourhood') +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(color='white'),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white'),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'))
p12 <- ggplot(top_5_avg_price_neighbouhood,aes(x=neighbourhood, y=avg_price)) +
geom_line(group=1, colour='purple') +
labs(x='Neighbourhood', y='Average Price',
title = "Top 5 Most Expensive Neighbourhood") +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(color='white'),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white'),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'),
legend.position = 'none')
combined6 <- p11 + p12 +plot_layout(ncol=2)
combined6
top_5_avg_price_neighbouhood_group <- listingsdf %>%
group_by(neighbourhood_group) %>%
summarise(listing_count = n(),
avg_price = round(mean(price),2)) %>%
top_n(5, wt=avg_price)
p13 <- ggplot(top_5_avg_price_neighbouhood_group, aes(x=neighbourhood_group,
y=listing_count, fill=neighbourhood_group)) +
geom_bar(stat='identity') +
labs(x='Neighbourhood Group', y = 'Listing Count',
title='Listing Count of The Top 5 Most Expensive Neighbourhood Groups',
fill = 'Neighbourhood Group') +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(color='white'),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white'),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'))
p14 <- ggplot(top_5_avg_price_neighbouhood_group, aes(x=neighbourhood_group, y=avg_price)) +
geom_line(group=1, color='green') +
labs(x='Neighbourhood Group', y='Average Price',
title = "Top 5 Most Expensive Neighbourhood Groups") +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(color='white'),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white'),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'),
legend.position = 'none')
combined7 <- p13 + p14 + plot_layout(ncol = 2)
combined7
room_type_count_price <- listingsdf %>%
group_by(room_type) %>%
summarise(listing_count= n(),
avg_price=mean(price)) %>%
arrange(desc(listing_count))
p15 <- ggplot(room_type_count_price, aes(x=room_type, y=listing_count, fill=room_type)) +
geom_bar(stat='identity') +
labs(x='Room Type', y='Listing Count', title = 'Listing Count for Each Room Type',
fill='Room Type') +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, color='white'),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white'),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'))
p16 <- ggplot(room_type_count_price, aes(x=room_type, y=avg_price)) +
geom_line(group=1, color='orange') +
labs(x='Room Type', y='Average Price', title='Average Price for Each Room Type') +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, color='white'),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white'),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'))
combined8 <- p15 + p16 + plot_layout(ncol = 2)
combined8
ggplot(listingsdf, aes(x=room_type, y=price, fill=room_type)) +
geom_boxplot() +
geom_jitter(width=0, color='white') +
labs(x='Room Type', y='Average Price', title='Box Plot of Room Type Pricing') +
theme_minimal() +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(color='white'),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white'),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'))
listing_avail_df <- calenderdf %>%
filter(listing_id %in% top_10_most_reviewed_listings$listing_id)
listing_avail_df$date <- as.Date(listing_avail_df$date)
listing_avail_df <- listing_avail_df %>%
mutate(month = month(date, label=TRUE),
year = year(date))
avg_listing_avail_month <- listing_avail_df %>%
group_by(year, month, listing_id, name) %>%
summarise(avg_availability = mean(available))
## `summarise()` has grouped output by 'year', 'month', 'listing_id'. You can
## override using the `.groups` argument.
ggplot(avg_listing_avail_month, aes(x=month, y=avg_availability, color=name, group=name)) +
geom_line() +
facet_wrap(~ name, scales = 'free_y', ncol = 5) +
labs(x='Month', y='Average Availability', title='Average Availability Through Each Month',
color='Listing Name') +
scale_y_continuous(limits = c(0, 1)) +
theme(plot.background = element_rect(fill = "grey15"),
panel.background = element_rect(fill = "grey9"),
panel.grid = element_line(colour = "grey15"),
legend.background = element_rect(fill = "grey15"),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, color='white', size=7),
axis.text.y = element_text(color='white'),
plot.title = element_text(color='white'),
axis.title.x = element_text(color='white'),
axis.title.y = element_text(color='white'),
legend.text = element_text(color='white'),
legend.title = element_text(color='white'))
listingscomments <- reviewsdf %>%
filter(listing_id %in% top_10_most_reviewed_listings$listing_id) %>%
select(comments)
comments_vector <- unlist(listingscomments$comments)
corpus <- Corpus(VectorSource(comments_vector))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english")) # remove common stopwords such as "like", "the", etc.
# Create a Document-Term Matrix
dtm <- TermDocumentMatrix(corpus)
# Convert to matrix and then to a data frame
m <- as.matrix(dtm)
word_freq <- sort(rowSums(m), decreasing = TRUE)
word_freq_df <- data.frame(word = names(word_freq), freq = word_freq)
set.seed(1234) # For reproducibility
wordcloud(words = word_freq_df$word, freq = word_freq_df$freq,
min.freq = 1, max.words = 100,
random.order = FALSE, rot.per = 0.35,
colors = brewer.pal(8, "Dark2"))
listingscomments2 <- reviewsdf %>%
filter(host_id %in% top_5_hosts_listings_reviews$host_id) %>%
select(comments)
comments_vector2 <- unlist(listingscomments2)
corpus2 <- Corpus(VectorSource(comments_vector2))
corpus2 <- tm_map(corpus2, content_transformer(tolower))
corpus2 <- tm_map(corpus2, removePunctuation)
corpus2 <- tm_map(corpus2, removeNumbers)
corpus2 <- tm_map(corpus2, removeWords, stopwords("english"))
dtm2 <- TermDocumentMatrix(corpus2)
m2 <- as.matrix(dtm2)
word_freq2 <- sort(rowSums(m2), decreasing = TRUE)
word_freq_df2 <- data.frame(word= names(word_freq2), freq = word_freq2)
set.seed(1234)
wordcloud(words = word_freq_df2$word, freq = word_freq_df2$freq,
min.freq = 1, max.words = 100,
random.order = FALSE, rot.per = 0.35,
colors = brewer.pal(8, "Dark2"))