DATA 110 - Assignment #5

Author

Kalina Peterson

Loading the libraries and Data

# Libraries
library(tidyverse)
library(ggplot2)
library(dplyr)
library(readxl)
# Dataset
# Read the file through its full path instead of calling setwd(), so loading
# the data does not change the working directory for the rest of the session.
# NOTE(review): the file name ends in .csv but it is opened with read_excel();
# confirm it really is an Excel workbook, otherwise use readr::read_csv().
airbnb <- read_excel(
  file.path(
    "C:/Users/kpeter81/OneDrive - montgomerycollege.edu/Datasets",
    "airbnb_DC_25.csv"
  )
)

Studying the Dataset

# Preview the first rows to confirm the columns were read in as expected
head(airbnb)
# A tibble: 6 × 18
     id name        host_id host_name neighbourhood_group neighbourhood latitude
  <dbl> <chr>         <dbl> <chr>     <lgl>               <chr>            <dbl>
1  3686 Vita's Hid…    4645 Vita      NA                  Historic Ana…     38.9
2  3943 Historic R…    5059 Vasa      NA                  Edgewood, Bl…     38.9
3  4197 Capitol Hi…    5061 Sandra    NA                  Capitol Hill…     38.9
4  4529 Bertina's …    5803 Bertina   NA                  Eastland Gar…     38.9
5  5589 Cozy apt i…    6527 Ami       NA                  Kalorama Hei…     38.9
6  7103 Lovely gue…   17633 Charlotte NA                  Spring Valle…     38.9
# ℹ 11 more variables: longitude <dbl>, room_type <chr>, price <dbl>,
#   minimum_nights <dbl>, number_of_reviews <dbl>, last_review <dttm>,
#   reviews_per_month <dbl>, calculated_host_listings_count <dbl>,
#   availability_365 <dbl>, number_of_reviews_ltm <dbl>, license <chr>
# Inspect the structure: dimensions plus each column's type and first values
str(airbnb)
tibble [6,257 × 18] (S3: tbl_df/tbl/data.frame)
 $ id                            : num [1:6257] 3686 3943 4197 4529 5589 ...
 $ name                          : chr [1:6257] "Vita's Hideaway" "Historic Rowhouse Near Monuments" "Capitol Hill Bedroom walk to Metro" "Bertina's  House Part One" ...
 $ host_id                       : num [1:6257] 4645 5059 5061 5803 6527 ...
 $ host_name                     : chr [1:6257] "Vita" "Vasa" "Sandra" "Bertina" ...
 $ neighbourhood_group           : logi [1:6257] NA NA NA NA NA NA ...
 $ neighbourhood                 : chr [1:6257] "Historic Anacostia" "Edgewood, Bloomingdale, Truxton Circle, Eckington" "Capitol Hill, Lincoln Park" "Eastland Gardens, Kenilworth" ...
 $ latitude                      : num [1:6257] 38.9 38.9 38.9 38.9 38.9 ...
 $ longitude                     : num [1:6257] -77 -77 -77 -76.9 -77 ...
 $ room_type                     : chr [1:6257] "Private room" "Private room" "Private room" "Private room" ...
 $ price                         : num [1:6257] 60 63 128 64 NA 74 85 52 125 52 ...
 $ minimum_nights                : num [1:6257] 31 1 4 30 50 31 31 31 30 31 ...
 $ number_of_reviews             : num [1:6257] 84 534 64 102 96 91 415 120 38 106 ...
 $ last_review                   : POSIXct[1:6257], format: "2023-08-30" "2025-02-19" ...
 $ reviews_per_month             : num [1:6257] 0.48 2.77 0.33 0.54 0.51 0.5 2.25 0.66 0.21 0.63 ...
 $ calculated_host_listings_count: num [1:6257] 1 5 2 2 1 27 4 4 1 4 ...
 $ availability_365              : num [1:6257] 1 349 352 179 158 310 194 218 39 257 ...
 $ number_of_reviews_ltm         : num [1:6257] 0 38 6 0 0 0 3 3 0 2 ...
 $ license                       : chr [1:6257] NA "Hosted License: 5007242201001033" "Hosted License: 5007242201000749" "Exempt" ...
# Count the missing values in every column
airbnb |>
  is.na() |>
  colSums()
                            id                           name 
                             0                              0 
                       host_id                      host_name 
                             0                              2 
           neighbourhood_group                  neighbourhood 
                          6257                              0 
                      latitude                      longitude 
                             0                              0 
                     room_type                          price 
                             0                           1488 
                minimum_nights              number_of_reviews 
                             0                              0 
                   last_review              reviews_per_month 
                          1236                           1236 
calculated_host_listings_count               availability_365 
                             0                              0 
         number_of_reviews_ltm                        license 
                             0                           1560 

Visualization

# Drop listings with no price, then trim the extreme high-price outliers
airbnb1 <- airbnb |>
  filter(
    !is.na(price), # remove rows where the price is missing
    price < 550 # keep only prices below 550 (there are a lot of outliers)
  )
head(airbnb1)
# A tibble: 6 × 18
     id name        host_id host_name neighbourhood_group neighbourhood latitude
  <dbl> <chr>         <dbl> <chr>     <lgl>               <chr>            <dbl>
1  3686 Vita's Hid…    4645 Vita      NA                  Historic Ana…     38.9
2  3943 Historic R…    5059 Vasa      NA                  Edgewood, Bl…     38.9
3  4197 Capitol Hi…    5061 Sandra    NA                  Capitol Hill…     38.9
4  4529 Bertina's …    5803 Bertina   NA                  Eastland Gar…     38.9
5  7103 Lovely gue…   17633 Charlotte NA                  Spring Valle…     38.9
6 11785 Sanctuary …   32015 Teresa    NA                  Cathedral He…     38.9
# ℹ 11 more variables: longitude <dbl>, room_type <chr>, price <dbl>,
#   minimum_nights <dbl>, number_of_reviews <dbl>, last_review <dttm>,
#   reviews_per_month <dbl>, calculated_host_listings_count <dbl>,
#   availability_365 <dbl>, number_of_reviews_ltm <dbl>, license <chr>
# Boxplots of nightly price for each room type, drawn horizontally.
# The data contain four room types; "Hotel room" gets its own fill colour so
# that it stands out from the other three.
airbnb_box <- airbnb1 |>
  ggplot() +
  geom_boxplot(aes(y = price, x = room_type, fill = room_type)) +
  # Colours are matched by name so the highlight stays on "Hotel room" no
  # matter how the room types happen to be ordered.
  scale_fill_manual(values = c(
    "Entire home/apt" = "#6bcdcf",
    "Hotel room" = "#f5ba78",
    "Private room" = "#6bcdcf",
    "Shared room" = "#6bcdcf"
  )) +
  theme_grey() +
  labs(
    title = "Price Distribution Across Four Airbnb Room Types",
    y = "Price (in US Dollars)",
    x = "Room Type",
    caption = "Data Retrieved from the DATASETS Google Drive Folder: \n https://drive.google.com/drive/u/1/folders/1AMRfddeMwKRaNidOV87JP1iCVn_z-Uv_"
  ) +
  coord_flip() # swap the axes: room types run down the side, price along the bottom
airbnb_box

Write a short paragraph (3–5 sentences) describing:

  • the visualization you created, and

  • one key insight or pattern you observe in the plot.

I created a side-by-side boxplot that shows the differences in price distributions among four room types: entire home/apt, hotel room, private room, and shared room. The room type with the greatest median price was hotel rooms, at around $250 per night, so I opted to highlight this difference by coloring the hotel room boxplot differently from the others. I observed many outliers in the dataset and had to filter the price to be less than $550 as a result, because the outliers made the graph extremely hard to read. The cheapest Airbnb type, by far, was the shared room.

Extra Code I decided not to use, but I thought was cool:

# One row per room type with its mean price and mean availability
# (availability_365 is treated as days available out of the year).
# The column name avg_avalibility is kept as spelled because the treemap()
# call further down refers to it by that name.
airbnb2 <- airbnb1 |>
  group_by(room_type) |>
  summarise(
    avg_price = mean(price),
    avg_avalibility = mean(availability_365)
  )
head(airbnb2)
# A tibble: 4 × 3
  room_type       avg_price avg_avalibility
  <chr>               <dbl>           <dbl>
1 Entire home/apt     168.             176.
2 Hotel room          277.             169.
3 Private room         95.5            192.
4 Shared room          69.3            218.
library(RColorBrewer)
library(treemap)
Warning: package 'treemap' was built under R version 4.5.2
# Treemap of the per-room-type summary: one rectangle per room type, sized by
# average availability and shaded by average price.
treemap(
  airbnb2,
  index = "room_type", # one rectangle per room type
  vSize = "avg_avalibility", # rectangle size based on average availability
  vColor = "avg_price", # average price determines the shade of blue
  type = "manual", # colour comes from vColor mapped onto the palette below
  palette = "Blues", # use the RColorBrewer palette
  title = "Average Price and Availability By Room Type", # plot title
  title.legend = "Average Price (in US Dollars)" # legend label
)