# Load necessary libraries
library(ggplot2)
library(RColorBrewer)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Show the dplyr::filter help page only when a person is at the console, so
# that sourcing or knitting this script does not trigger a help lookup.
if (interactive()) {
  print(help("filter", package = "dplyr"))
}
library(scales)
# Read the NYC 2019 Airbnb listings CSV into a data frame
airbnb <- read.csv(
  file = "/Users/michellenguyen/Downloads/DATASETS 2/airbnb_ny19.csv"
)
# Preview the first six listings
head(airbnb, n = 6)
##     id                                             name host_id   host_name
## 1 2539               Clean & quiet apt home by the park    2787        John
## 2 2595                            Skylit Midtown Castle    2845    Jennifer
## 3 3647              THE VILLAGE OF HARLEM....NEW YORK !    4632   Elisabeth
## 4 3831                  Cozy Entire Floor of Brownstone    4869 LisaRoxanne
## 5 5022 Entire Apt: Spacious Studio/Loft by central park    7192       Laura
## 6 5099        Large Cozy 1 BR Apartment In Midtown East    7322       Chris
##   neighbourhood_group neighbourhood latitude longitude       room_type price
## 1            Brooklyn    Kensington 40.64749 -73.97237    Private room   149
## 2           Manhattan       Midtown 40.75362 -73.98377 Entire home/apt   225
## 3           Manhattan        Harlem 40.80902 -73.94190    Private room   150
## 4            Brooklyn  Clinton Hill 40.68514 -73.95976 Entire home/apt    89
## 5           Manhattan   East Harlem 40.79851 -73.94399 Entire home/apt    80
## 6           Manhattan   Murray Hill 40.74767 -73.97500 Entire home/apt   200
##   minimum_nights number_of_reviews last_review reviews_per_month
## 1              1                 9  10/19/2018              0.21
## 2              1                45   5/21/2019              0.38
## 3              3                 0                            NA
## 4              1               270    7/5/2019              4.64
## 5             10                 9  11/19/2018              0.10
## 6              3                74   6/22/2019              0.59
##   calculated_host_listings_count availability_365
## 1                              6              365
## 2                              2              355
## 3                              1              365
## 4                              1              194
## 5                              1                0
## 6                              1              129
# Collect the distinct room types that appear in the listings
room_type <- unique(airbnb[["room_type"]])
room_type
## [1] "Private room"    "Entire home/apt" "Shared room"
# Attach grouping by room type and neighbourhood group to the listings
by_room <- group_by(airbnb, room_type, neighbourhood_group)
head(by_room, n = 6)
## # A tibble: 6 × 16
## # Groups:   room_type, neighbourhood_group [4]
##      id name           host_id host_name neighbourhood_g… neighbourhood latitude
##   <int> <chr>            <int> <chr>     <chr>            <chr>            <dbl>
## 1  2539 Clean & quiet…    2787 John      Brooklyn         Kensington        40.6
## 2  2595 Skylit Midtow…    2845 Jennifer  Manhattan        Midtown           40.8
## 3  3647 THE VILLAGE O…    4632 Elisabeth Manhattan        Harlem            40.8
## 4  3831 Cozy Entire F…    4869 LisaRoxa… Brooklyn         Clinton Hill      40.7
## 5  5022 Entire Apt: S…    7192 Laura     Manhattan        East Harlem       40.8
## 6  5099 Large Cozy 1 …    7322 Chris     Manhattan        Murray Hill       40.7
## # … with 9 more variables: longitude <dbl>, room_type <chr>, price <int>,
## #   minimum_nights <int>, number_of_reviews <int>, last_review <chr>,
## #   reviews_per_month <dbl>, calculated_host_listings_count <int>,
## #   availability_365 <int>
# Average listing price for every room type / neighbourhood group pair
by_room_avg <- summarise(by_room, price = mean(price))
## `summarise()` has grouped output by 'room_type'. You can override using the
## `.groups` argument.
head(by_room_avg, n = 6)
## # A tibble: 6 × 3
## # Groups:   room_type [2]
##   room_type       neighbourhood_group price
##   <chr>           <chr>               <dbl>
## 1 Entire home/apt Bronx               128. 
## 2 Entire home/apt Brooklyn            178. 
## 3 Entire home/apt Manhattan           249. 
## 4 Entire home/apt Queens              147. 
## 5 Entire home/apt Staten Island       174. 
## 6 Private room    Bronx                66.8
# Dodged bar chart of average price per room type, with one bar per
# neighbourhood group, to compare which group is the most expensive
avg_price <- ggplot(
  data = by_room_avg,
  mapping = aes(x = room_type, y = price, fill = neighbourhood_group)
) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Price Per Room Type By Neighbourhood Group",
    x = "Room Type",
    y = "Price"
  ) +
  scale_fill_discrete(name = "Neighborhood Group")
avg_price

# Shows that Manhattan has the most expensive rooms, shared room vs. private room does not have that much of a difference in price, and entire home/apt is the most expensive option out of the three
# Keep only the entire home/apt listings that are located in Manhattan
airbnb_entireRoom <- filter(
  airbnb,
  room_type == "Entire home/apt" & neighbourhood_group == "Manhattan"
)
head(airbnb_entireRoom, n = 6)
##     id                                             name host_id host_name
## 1 2595                            Skylit Midtown Castle    2845  Jennifer
## 2 5022 Entire Apt: Spacious Studio/Loft by central park    7192     Laura
## 3 5099        Large Cozy 1 BR Apartment In Midtown East    7322     Chris
## 4 5238               Cute & Cozy Lower East Side 1 bdrm    7549       Ben
## 5 5295                 Beautiful 1br on Upper West Side    7702      Lena
## 6 6090                    West Village Nest - Superhost   11975     Alina
##   neighbourhood_group   neighbourhood latitude longitude       room_type price
## 1           Manhattan         Midtown 40.75362 -73.98377 Entire home/apt   225
## 2           Manhattan     East Harlem 40.79851 -73.94399 Entire home/apt    80
## 3           Manhattan     Murray Hill 40.74767 -73.97500 Entire home/apt   200
## 4           Manhattan       Chinatown 40.71344 -73.99037 Entire home/apt   150
## 5           Manhattan Upper West Side 40.80316 -73.96545 Entire home/apt   135
## 6           Manhattan    West Village 40.73530 -74.00525 Entire home/apt   120
##   minimum_nights number_of_reviews last_review reviews_per_month
## 1              1                45   5/21/2019              0.38
## 2             10                 9  11/19/2018              0.10
## 3              3                74   6/22/2019              0.59
## 4              1               160    6/9/2019              1.33
## 5              5                53   6/22/2019              0.43
## 6             90                27  10/31/2018              0.22
##   calculated_host_listings_count availability_365
## 1                              2              355
## 2                              1                0
## 3                              1              129
## 4                              4              188
## 5                              1                6
## 6                              1                0
# Plot the Manhattan entire home/apt listings by coordinates, with both the
# point colour and the point size encoding price
ggplot(
  data = airbnb_entireRoom,
  mapping = aes(x = longitude, y = latitude, color = price, size = price)
) +
  geom_point() +
  labs(
    title = "Prices for Entire Home/Apt Airbnbs in Manhattan",
    x = "Longitude",
    y = "Latitude"
  )

# Average listing price per host name (one row per distinct host_name value)
airbnb_hostidprice <- summarise(
  group_by(airbnb, host_name),
  price = mean(price)
)
head(airbnb_hostidprice, n = 6)
## # A tibble: 6 × 2
##   host_name                  price
##   <chr>                      <dbl>
## 1 ""                          111.
## 2 "​ Valéria"                   75 
## 3 "'Cil"                      120 
## 4 "(Ari) HENRY LEE"           140 
## 5 "(Email hidden by Airbnb)"  177.
## 6 "(Mary) Haiy"               126
# Count the listings published under each host name.
# NOTE(review): host_name does not look like a unique key (the data also has a
# host_id column, and the top names are common first names), so listings from
# different hosts who share a name are presumably pooled here -- confirm
# whether host_id should be the grouping key instead.
airbnb_hostidcount <- airbnb %>%
  count(host_name)
# Attach each host name's average price by joining on host_name instead of
# copying the column positionally, which silently depends on both tables
# having exactly the same row order.
airbnb_hostidcount <- airbnb_hostidcount %>%
  left_join(airbnb_hostidprice, by = "host_name")
# Keep the 10 host names with the most listings. min_rank() gives tied counts
# the same (lowest) rank, so hosts tied at the cutoff are kept; rank() averages
# tied ranks and can push every tied host past the cutoff.
host_airbnb <- airbnb_hostidcount %>%
  filter(min_rank(desc(n)) <= 10)
host_airbnb
##       host_name   n    price
## 1          Alex 279 188.3978
## 2    Blueground 232 303.1509
## 3        Daniel 226 146.2212
## 4         David 403 163.3846
## 5       Jessica 205 247.3024
## 6          John 294 142.4898
## 7         Maria 204 119.2549
## 8       Michael 417 160.4197
## 9         Sarah 227 129.4185
## 10 Sonder (NYC) 327 253.1957
# Average price across host names that have exactly one listing. This is the
# baseline for the question: do hosts with many listings tend to charge more
# than hosts with a single listing?
avg_1 <- airbnb_hostidcount %>%
  filter(n == 1) %>%
  summarise(price = mean(price))
# Append the baseline as an explicitly named and typed row, rather than relying
# on c() coercing a string, a number and a data frame into a positional list.
host_airbnb <- bind_rows(
  host_airbnb,
  data.frame(host_name = "1 Listing", n = 1L, price = avg_1$price)
)
# Bar chart of average price per host name, shaded by listing count. The y
# aesthetic is price, so the axis is labelled as a price rather than a count.
airbnb_countMax <- ggplot(
  data = host_airbnb,
  mapping = aes(x = host_name, y = price, fill = n)
) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(title = "Listing count vs. Price", fill = "Listing Count") +
  xlab("Host Name") +
  ylab("Average Price")
airbnb_countMax

Essay for Project 1

The data set I chose was provided by Airbnb and shows listings provided by hosts, which are identified by the variables “host_id” and “host_name”, the price of the listing, what kind of room the listing is, the count of listings each host has, the days available out of the year, and information about the reviews for the listing such as average reviews per month, date last reviewed, and number of reviews. This data set allows us to compare the statistics for a wide range of airbnbs with different price points, locations, and host experience, and I decided to tackle this project with the mindset of someone who is brand new to airbnb hosting and look at indicators of a successful listing.

For my first graph, I looked at the average prices per room type based on neighborhood group. When I chose the variables, I asked myself, “which room type should I choose and where?” I cleaned up this data by grouping the data set by room type and neighborhood group and finding the mean price for each room type in each neighborhood group. At first glance, the graph shows a significant difference in price between the entire home/apt listings and the shared room and private room listings. Shared room and private room listings were about the same price, but they were around half the price of entire home/apt listings. Paying attention to the neighborhood groups, Manhattan consistently had the most expensive listings regardless of room type, and the other 4 groups did not show any significant pattern. To sum up this graph, the most expensive airbnbs are located in Manhattan and are entire home/apt types.

For my second graph, I wanted to take my findings from my first graph further, so I created a scatterplot using the latitude and longitude variables for entire home/apt listings in Manhattan and looked at where in Manhattan the listings are more expensive. I cleaned up the data for this graph by filtering the data frame to keep only the entire home/apt listings in Manhattan. I really wanted to put the data points on top of a map of Manhattan to make the visualization a bit easier to digest, but I couldn’t figure out how to do that. Looking at the graph, you can see that the more expensive listings are in lower Manhattan, which makes sense because a lot of the stores/restaurants are there. I had a hard time making this graph more visually appealing because there were just so many data points. I opted to map both the size and the color of the points to price, but there are so many listings that are not that expensive that the more expensive listings are hard to see.

For my last graph, I wanted to see if having more listings/experience as an Airbnb host means that your listings are more expensive. I can only imagine that if you are a successful host, you are always on the lookout for nice apartments/homes to buy and rent out, and having a nicer listing means a higher price. This graph displays the top 10 hosts with the most listings and their average price points, as well as the average price for all hosts with only 1 listing. I cleaned the data for this graph by grouping the data frame by host name, computing the mean price for each host name, obtaining the count of listings for each host name, filtering the data to only get the top 10 counts, and appending to the data frame the average price for hosts with 1 listing, which is $155.41. Right off the bat, you can see that host experience does not equal higher price. There is a lot of variance, and the hosts with the most listings have roughly the same average price as hosts with 1 listing.

After finishing this project, I wish there was a way to find the number of days booked for each listing. When I took a look at the variable “availability_365”, I felt that it wasn’t a good indicator of a successful airbnb because the listing could be fully booked all year, or fully booked only for the times the host allowed it to be. I was also having a hard time choosing different types of graphs for this project. I really wanted to challenge myself and incorporate streamgraphs, alluvials, etc., but when I compared the data we used to display those graphs, it seemed very different from the data I was trying to display. I also had a lot of data points for my graphs, so I felt as though it would not have looked as nice.