# Libraries
library(tidyverse)
library(ggplot2)
library(dplyr)
library(readxl)
DATA 110 - Assignment #5
Loading the libraries and Data
# Dataset
# Build the full path to the data file instead of calling setwd(), so loading
# the data does not change the session's working directory.
data_dir <- "C:/Users/kpeter81/OneDrive - montgomerycollege.edu/Datasets"
# NOTE(review): the file name ends in .csv but it is read with read_excel();
# the printed tibble shows it parsed, so presumably the file is really an Excel
# workbook -- confirm, or rename the file / switch to a CSV reader.
airbnb <- read_excel(file.path(data_dir, "airbnb_DC_25.csv"))
Studying the Dataset
head(airbnb)# A tibble: 6 × 18
id name host_id host_name neighbourhood_group neighbourhood latitude
<dbl> <chr> <dbl> <chr> <lgl> <chr> <dbl>
1 3686 Vita's Hid… 4645 Vita NA Historic Ana… 38.9
2 3943 Historic R… 5059 Vasa NA Edgewood, Bl… 38.9
3 4197 Capitol Hi… 5061 Sandra NA Capitol Hill… 38.9
4 4529 Bertina's … 5803 Bertina NA Eastland Gar… 38.9
5 5589 Cozy apt i… 6527 Ami NA Kalorama Hei… 38.9
6 7103 Lovely gue… 17633 Charlotte NA Spring Valle… 38.9
# ℹ 11 more variables: longitude <dbl>, room_type <chr>, price <dbl>,
# minimum_nights <dbl>, number_of_reviews <dbl>, last_review <dttm>,
# reviews_per_month <dbl>, calculated_host_listings_count <dbl>,
# availability_365 <dbl>, number_of_reviews_ltm <dbl>, license <chr>
str(airbnb)tibble [6,257 × 18] (S3: tbl_df/tbl/data.frame)
$ id : num [1:6257] 3686 3943 4197 4529 5589 ...
$ name : chr [1:6257] "Vita's Hideaway" "Historic Rowhouse Near Monuments" "Capitol Hill Bedroom walk to Metro" "Bertina's House Part One" ...
$ host_id : num [1:6257] 4645 5059 5061 5803 6527 ...
$ host_name : chr [1:6257] "Vita" "Vasa" "Sandra" "Bertina" ...
$ neighbourhood_group : logi [1:6257] NA NA NA NA NA NA ...
$ neighbourhood : chr [1:6257] "Historic Anacostia" "Edgewood, Bloomingdale, Truxton Circle, Eckington" "Capitol Hill, Lincoln Park" "Eastland Gardens, Kenilworth" ...
$ latitude : num [1:6257] 38.9 38.9 38.9 38.9 38.9 ...
$ longitude : num [1:6257] -77 -77 -77 -76.9 -77 ...
$ room_type : chr [1:6257] "Private room" "Private room" "Private room" "Private room" ...
$ price : num [1:6257] 60 63 128 64 NA 74 85 52 125 52 ...
$ minimum_nights : num [1:6257] 31 1 4 30 50 31 31 31 30 31 ...
$ number_of_reviews : num [1:6257] 84 534 64 102 96 91 415 120 38 106 ...
$ last_review : POSIXct[1:6257], format: "2023-08-30" "2025-02-19" ...
$ reviews_per_month : num [1:6257] 0.48 2.77 0.33 0.54 0.51 0.5 2.25 0.66 0.21 0.63 ...
$ calculated_host_listings_count: num [1:6257] 1 5 2 2 1 27 4 4 1 4 ...
$ availability_365 : num [1:6257] 1 349 352 179 158 310 194 218 39 257 ...
$ number_of_reviews_ltm : num [1:6257] 0 38 6 0 0 0 3 3 0 2 ...
$ license : chr [1:6257] NA "Hosted License: 5007242201001033" "Hosted License: 5007242201000749" "Exempt" ...
colSums(is.na(airbnb)) id name
0 0
host_id host_name
0 2
neighbourhood_group neighbourhood
6257 0
latitude longitude
0 0
room_type price
0 1488
minimum_nights number_of_reviews
0 0
last_review reviews_per_month
1236 1236
calculated_host_listings_count availability_365
0 0
number_of_reviews_ltm license
0 1560
Visualization
# Drop listings with a missing price and trim the high-price outliers by
# keeping only listings priced below 550 (US dollars).
airbnb1 <- airbnb |>
  filter(!is.na(price), price < 550)
head(airbnb1)# A tibble: 6 × 18
id name host_id host_name neighbourhood_group neighbourhood latitude
<dbl> <chr> <dbl> <chr> <lgl> <chr> <dbl>
1 3686 Vita's Hid… 4645 Vita NA Historic Ana… 38.9
2 3943 Historic R… 5059 Vasa NA Edgewood, Bl… 38.9
3 4197 Capitol Hi… 5061 Sandra NA Capitol Hill… 38.9
4 4529 Bertina's … 5803 Bertina NA Eastland Gar… 38.9
5 7103 Lovely gue… 17633 Charlotte NA Spring Valle… 38.9
6 11785 Sanctuary … 32015 Teresa NA Cathedral He… 38.9
# ℹ 11 more variables: longitude <dbl>, room_type <chr>, price <dbl>,
# minimum_nights <dbl>, number_of_reviews <dbl>, last_review <dttm>,
# reviews_per_month <dbl>, calculated_host_listings_count <dbl>,
# availability_365 <dbl>, number_of_reviews_ltm <dbl>, license <chr>
# Boxplots of price for each room type, one box per room_type.
# The four fill values are unnamed, so they are assigned to the room types in
# order; the second value (#f5ba78) is the one that differs, which singles out
# the "Hotel room" box while the other three types share #6bcdcf.
airbnb_box <- airbnb1 |>
  ggplot() +
  geom_boxplot(aes(y = price, x = room_type, fill = room_type)) +
  scale_fill_manual(values = c("#6bcdcf", "#f5ba78", "#6bcdcf", "#6bcdcf")) +
  theme_grey() +
  labs(
    # The data has four room types, so the title says "Four" (not "Three").
    title = "Price Distribution Across Four Airbnb Room Types",
    y = "Price (in US Dollars)", x = "Room Type",
    caption = "Data Retrieved from the DATASETS Google Drive Folder: \n https://drive.google.com/drive/u/1/folders/1AMRfddeMwKRaNidOV87JP1iCVn_z-Uv_"
  ) +
  # Swap the axes so the boxes are drawn horizontally.
  coord_flip()
airbnb_box
Write a short paragraph (3–5 sentences) describing:
the visualization you created, and
one key insight or pattern you observe in the plot.
I created a side-by-side boxplot that shows the differences in price distributions among four room types: entire home/apt, hotel room, private room, and shared room. The room type with the greatest median price was the hotel room, at around $250 per night, so I opted to highlight this difference by coloring the hotel room boxplot differently from the others. I observed many outliers in the dataset and had to filter the data to listings priced below $550 as a result, because the outliers made the graph extremely confusing to read. The cheapest Airbnb room type, by far, was the shared room.
Extra Code I decided not to use, but I thought was cool:
# One row per room type with two summary columns:
#   avg_price       - mean of price
#   avg_avalibility - mean of availability_365 (days available out of 365);
#                     the column name's spelling is kept as-is because the
#                     treemap call refers to it by this exact name.
airbnb2 <- airbnb1 |>
  group_by(room_type) |>
  summarise(
    avg_price = mean(price),
    avg_avalibility = mean(availability_365)
  )
head(airbnb2)# A tibble: 4 × 3
room_type avg_price avg_avalibility
<chr> <dbl> <dbl>
1 Entire home/apt 168. 176.
2 Hotel room 277. 169.
3 Private room 95.5 192.
4 Shared room 69.3 218.
library(RColorBrewer)
library(treemap)Warning: package 'treemap' was built under R version 4.5.2
# Treemap of the per-room-type summary: one tile per room type, sized by
# average availability and shaded by average price.
treemap(
  airbnb2,
  index = "room_type",
  vSize = "avg_avalibility", # tile size; column name spelled as in airbnb2
  vColor = "avg_price",      # average price determines the shade of blue
  type = "manual",           # colour the tiles from vColor using `palette`
  palette = "Blues",         # RColorBrewer palette name
  # Plot title (spelling of "Availability" corrected).
  title = "Average Price and Availability By Room Type",
  title.legend = "Average Price (in US Dollars)" # legend label
)