#Importing data
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
Airbnb <- read.csv('Airbnb.csv')
#Making a copy of the data
airbnb_nyc <- Airbnb
#Getting rid of the datasets that I don’t need
rm(Airbnb)
#Loading dplyr and lubridate packages
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.3.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(extrafont)
## Registering fonts with R
#Checking the structure of each of columns of the data
str(airbnb_nyc)
## 'data.frame': 30478 obs. of 13 variables:
## $ Host.Id : int 5162530 33134899 39608626 500 500 1039 1783 2078 2339 2339 ...
## $ Host.Since : chr "" "" "" "6/26/2008" ...
## $ Name : chr "1 Bedroom in Prime Williamsburg" "Sunny, Private room in Bushwick" "Sunny Room in Harlem" "Gorgeous 1 BR with Private Balcony" ...
## $ Neighbourhood. : chr "Brooklyn" "Brooklyn" "Manhattan" "Manhattan" ...
## $ Property.Type : chr "Apartment" "Apartment" "Apartment" "Apartment" ...
## $ Review.Scores.Rating..bin.: int NA NA NA NA 95 100 100 90 90 95 ...
## $ Room.Type : chr "Entire home/apt" "Private room" "Private room" "Entire home/apt" ...
## $ Zipcode : int 11249 11206 10032 10024 10036 11222 10004 11201 10009 10009 ...
## $ Beds : int 1 1 1 3 3 1 1 1 2 2 ...
## $ Number.of.Records : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Number.Of.Reviews : int 0 1 1 0 39 4 9 80 95 23 ...
## $ Price : chr "145" "37" "28" "199" ...
## $ Review.Scores.Rating : int NA NA NA NA 96 100 100 94 90 96 ...
#Converting Zipcode from integer to character
airbnb_nyc$Zipcode <- as.character(airbnb_nyc$Zipcode)
#Converting Price from character to integer
airbnb_nyc$Price <- as.integer(airbnb_nyc$Price)
## Warning: NAs introduced by coercion
#Rechecking the structure of converted columns of the data
str(airbnb_nyc)
## 'data.frame': 30478 obs. of 13 variables:
## $ Host.Id : int 5162530 33134899 39608626 500 500 1039 1783 2078 2339 2339 ...
## $ Host.Since : chr "" "" "" "6/26/2008" ...
## $ Name : chr "1 Bedroom in Prime Williamsburg" "Sunny, Private room in Bushwick" "Sunny Room in Harlem" "Gorgeous 1 BR with Private Balcony" ...
## $ Neighbourhood. : chr "Brooklyn" "Brooklyn" "Manhattan" "Manhattan" ...
## $ Property.Type : chr "Apartment" "Apartment" "Apartment" "Apartment" ...
## $ Review.Scores.Rating..bin.: int NA NA NA NA 95 100 100 90 90 95 ...
## $ Room.Type : chr "Entire home/apt" "Private room" "Private room" "Entire home/apt" ...
## $ Zipcode : chr "11249" "11206" "10032" "10024" ...
## $ Beds : int 1 1 1 3 3 1 1 1 2 2 ...
## $ Number.of.Records : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Number.Of.Reviews : int 0 1 1 0 39 4 9 80 95 23 ...
## $ Price : int 145 37 28 199 549 149 250 90 270 290 ...
## $ Review.Scores.Rating : int NA NA NA NA 96 100 100 94 90 96 ...
#Cleaning messy data
airbnb_nyc$Neighbourhood <- ifelse((airbnb_nyc$Neighbourhood == "Brooklyn" & airbnb_nyc$Zipcode == 10009) | (airbnb_nyc$Neighbourhood == "Brooklyn" & airbnb_nyc$Zipcode == 10022) | (airbnb_nyc$Neighbourhood == "Bronx" & airbnb_nyc$Zipcode == 10032), "Manhattan", airbnb_nyc$Neighbourhood)
airbnb_nyc$Neighbourhood <- ifelse((airbnb_nyc$Neighbourhood == "Brooklyn" & airbnb_nyc$Zipcode == 11385) | (airbnb_nyc$Neighbourhood == "Brooklyn" & airbnb_nyc$Zipcode == 11416) | (airbnb_nyc$Neighbourhood == "Manhattan" & airbnb_nyc$Zipcode == 11419), "Queens", airbnb_nyc$Neighbourhood)
airbnb_nyc$Neighbourhood <- ifelse(airbnb_nyc$Neighbourhood == "Brooklyn" & airbnb_nyc$Zipcode == 7030, "Northeast New Jersey", airbnb_nyc$Neighbourhood)
airbnb_nyc$Neighbourhood <- ifelse(airbnb_nyc$Neighbourhood == "Manhattan" & airbnb_nyc$Zipcode == 7150, "North Inbound", airbnb_nyc$Neighbourhood)
#Renaming the columns for convenience
airbnb_nyc <- airbnb_nyc %>% rename(listing_name=Name, borough=Neighbourhood, property_type=Property.Type, room_type=Room.Type, zipcode=Zipcode, beds=Beds, no_of_reviews=Number.Of.Reviews, price=Price, rating=Review.Scores.Rating)
#Getting rid of the rows that I don’t need
airbnb_nyc <- airbnb_nyc %>% filter(borough=='Bronx' | borough=='Brooklyn' | borough=='Manhattan' | borough=='Queens' | borough=='Staten Island')
#Creating a new date column, a year column, and factor type borough and zipcode columns
airbnb_nyc <- airbnb_nyc %>% mutate(date=mdy(Host.Since), year=substr(date,1,4))
airbnb_nyc$borough_fac <- as.factor(airbnb_nyc$borough)
airbnb_nyc$zipcode_fac <- as.factor(airbnb_nyc$zipcode)
#Getting rid of the columns that I don’t need
airbnb_nyc <- airbnb_nyc %>% select(-Host.Id, -Host.Since, -Review.Scores.Rating..bin., -Number.of.Records)
#Renaming one of the categories of room type
airbnb_nyc$room_type[airbnb_nyc$room_type=="Entire home/apt"] <- "Entire home/apt."
#Analysis 1: A comparative look at the number of total rented rooms, total price, average price, and median price in the five boroughs of NYC from June 2008 to August 2015 and their changes over the years under this timeframe
#Comparing the number of total rented rooms, total price, average price, and median price in five boroughs of NYC (2008-15)
price_by_boro2 <- airbnb_nyc %>% group_by(borough) %>% summarize(total_price=sum(price, na.rm=TRUE), average_price=mean(price, na.rm=TRUE), median_price=median(price, na.rm=TRUE), total_rented_rooms=n())
#Comparing the Change in Number of total rented rooms, total price, average price, and median price in five boroughs of NYC (2008-15)
#Creating a new dataset for the convenience of the comparison
price_by_boro <- airbnb_nyc %>% group_by(borough, year) %>% summarize(total_price=sum(price, na.rm=TRUE), average_price=mean(price, na.rm=TRUE), median_price=median(price, na.rm=TRUE), total_rented_rooms=n())
## `summarise()` has grouped output by 'borough'. You can override using the
## `.groups` argument.
price_by_boro <- price_by_boro[complete.cases(price_by_boro$year), ]
options(scipen=99)
# Comparing Total rented rooms
price_by_boro %>% ggplot(aes(year, total_rented_rooms, group= borough, color= borough))+geom_line()+labs(title="Change in Total Rented Rooms in five Boroughs of NYC", subtitle="June 2008 - August 2015", x="Year", y="Total Rented Rooms")+theme_classic()
#Comparing total price
price_by_boro %>% ggplot(aes(year, total_price, group= borough, color= borough))+geom_line()+labs(title="Change in Total Price in five Boroughs of NYC", subtitle="June 2008 - August 2015", x="Year", y="Total Price")+theme_classic()
#Comparing average price
price_by_boro %>% ggplot(aes(year, average_price, group= borough, color= borough))+geom_line()+labs(title="Change in Average Price in five Boroughs of NYC", subtitle="June 2008 - August 2015", x="Year", y="Average Price")+theme_classic()
#Comparing median price
price_by_boro %>% ggplot(aes(year, median_price, group= borough, color= borough))+geom_line()+labs(title="Change in Median Price in five Boroughs of NYC", subtitle="June 2008 - August 2015", x="Year", y="Median Price")+theme_classic()
#Analysis 2: Comparing the total rented rooms, total price, average price, and median price based on room types in five boroughs of NYC (2008-15)
price_by_boro_rt <- airbnb_nyc %>% group_by(borough, room_type) %>% summarize(total_price=sum(price, na.rm=TRUE), average_price=mean(price, na.rm=TRUE), median_price=median(price, na.rm=TRUE), total_rented_rooms=n())
## `summarise()` has grouped output by 'borough'. You can override using the
## `.groups` argument.
#Bar Graphs of total rented rooms based on room types in five boroughs
airbnb_nyc %>% group_by(borough, room_type) %>% summarise(Count=n()) %>% ggplot(aes(reorder(borough, -Count), Count, fill=borough))+geom_col(show.legend=FALSE, color="black")+geom_text(aes(label=comma(Count)))+labs(title="Number of Rented Rooms Based on Room Type in each Borough", subtitle="June 2008 - August 2015", x="Borough Name", y="Total Number")+facet_wrap(~room_type)+theme_classic()
## `summarise()` has grouped output by 'borough'. You can override using the
## `.groups` argument.
#Analysis 3: Is room price affecting the Airbnb rating in NYC?
#Filter to low-rated rooms in a new dataset
airbnb_nyc_bad <- airbnb_nyc %>% filter(rating<60)
airbnb_nyc_bad %>% group_by(borough) %>% summarize(Count=n())
## # A tibble: 4 × 2
## borough Count
## <chr> <int>
## 1 Brooklyn 44
## 2 Manhattan 69
## 3 Queens 14
## 4 Staten Island 1
airbnb_nyc_bad1 <- airbnb_nyc_bad %>% group_by(borough) %>% summarize(Count=n())
sum(airbnb_nyc_bad1$Count)
## [1] 128
#Filter to Bronx, Brooklyn, Manhattan, Queens, and Staten Island in new datasets
airbnb_bx <- airbnb_nyc %>% filter(borough=='Bronx')
airbnb_bk <- airbnb_nyc %>% filter(borough=='Brooklyn')
airbnb_mn <- airbnb_nyc %>% filter(borough=='Manhattan')
airbnb_qn <- airbnb_nyc %>% filter(borough=='Queens')
airbnb_si <- airbnb_nyc %>% filter(borough=='Staten Island')
#Correlation between Room Price and Rating in NYC and its five boroughs
cor(airbnb_nyc$price, airbnb_nyc$rating, use='complete.obs')
## [1] 0.0875274
cor(airbnb_bx$price, airbnb_bx$rating, use='complete.obs')
## [1] 0.1176378
cor(airbnb_bk$price, airbnb_bk$rating, use='complete.obs')
## [1] 0.1252373
cor(airbnb_mn$price, airbnb_mn$rating, use='complete.obs')
## [1] 0.08765012
cor(airbnb_qn$price, airbnb_qn$rating, use='complete.obs')
## [1] 0.08084138
cor(airbnb_si$price, airbnb_si$rating, use='complete.obs')
## [1] 0.02259818
#Regression analysis between Room Price and Rating in five boroughs of NYC
airbnb_nyc %>% ggplot(aes(price, rating))+geom_point(color="blue")+geom_smooth(method="lm", color="red")+labs(x="Room Price", y="Rating", title="Room Price Vs Rating in five Boroughs of NYC", subtitle="Data from June 2008 - August 2015")+facet_wrap(~borough)+theme_test()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 8351 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 8351 rows containing missing values or values outside the scale range
## (`geom_point()`).
lm(rating~price, data=airbnb_nyc)
##
## Call:
## lm(formula = rating ~ price, data = airbnb_nyc)
##
## Coefficients:
## (Intercept) price
## 90.819486 0.007742
reg_nyc <- lm(rating~price, data=airbnb_nyc)
summary(reg_nyc)
##
## Call:
## lm(formula = rating ~ price, data = airbnb_nyc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -73.142 -3.245 1.665 6.703 9.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90.8194861 0.1074618 845.13 <0.0000000000000002 ***
## price 0.0077416 0.0005938 13.04 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.82 on 22014 degrees of freedom
## (8351 observations deleted due to missingness)
## Multiple R-squared: 0.007661, Adjusted R-squared: 0.007616
## F-statistic: 170 on 1 and 22014 DF, p-value: < 0.00000000000000022
lm(rating~price, data=airbnb_bx)
##
## Call:
## lm(formula = rating ~ price, data = airbnb_bx)
##
## Coefficients:
## (Intercept) price
## 89.84439 0.02439
reg_bx <- lm(rating~price, data=airbnb_bx)
summary(reg_bx)
##
## Call:
## lm(formula = rating ~ price, data = airbnb_bx)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.405 -3.412 1.314 6.729 9.912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 89.84439 1.19457 75.211 <0.0000000000000002 ***
## price 0.02439 0.01414 1.725 0.086 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.13 on 212 degrees of freedom
## (128 observations deleted due to missingness)
## Multiple R-squared: 0.01384, Adjusted R-squared: 0.009187
## F-statistic: 2.975 on 1 and 212 DF, p-value: 0.08602
lm(rating~price, data=airbnb_bk)
##
## Call:
## lm(formula = rating ~ price, data = airbnb_bk)
##
## Coefficients:
## (Intercept) price
## 90.60748 0.01387
reg_bk <- lm(rating~price, data=airbnb_bk)
summary(reg_bk)
##
## Call:
## lm(formula = rating ~ price, data = airbnb_bk)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72.133 -3.035 1.632 6.341 9.018
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90.607484 0.176910 512.17 <0.0000000000000002 ***
## price 0.013872 0.001198 11.58 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.606 on 8422 degrees of freedom
## (3185 observations deleted due to missingness)
## Multiple R-squared: 0.01568, Adjusted R-squared: 0.01557
## F-statistic: 134.2 on 1 and 8422 DF, p-value: < 0.00000000000000022
lm(rating~price, data=airbnb_mn)
##
## Call:
## lm(formula = rating ~ price, data = airbnb_mn)
##
## Coefficients:
## (Intercept) price
## 90.539047 0.007053
reg_mn <- lm(rating~price, data=airbnb_mn)
summary(reg_mn)
##
## Call:
## lm(formula = rating ~ price, data = airbnb_mn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72.655 -3.367 1.685 6.640 9.285
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90.5390468 0.1553710 582.728 <0.0000000000000002 ***
## price 0.0070527 0.0007416 9.511 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.815 on 11683 degrees of freedom
## (4292 observations deleted due to missingness)
## Multiple R-squared: 0.007683, Adjusted R-squared: 0.007598
## F-statistic: 90.45 on 1 and 11683 DF, p-value: < 0.00000000000000022
lm(rating~price, data=airbnb_qn)
##
## Call:
## lm(formula = rating ~ price, data = airbnb_qn)
##
## Coefficients:
## (Intercept) price
## 90.30490 0.01297
reg_qn <- lm(rating~price, data=airbnb_qn)
summary(reg_qn)
##
## Call:
## lm(formula = rating ~ price, data = airbnb_qn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72.251 -3.278 2.099 7.479 9.371
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90.304903 0.459463 196.54 < 0.0000000000000002 ***
## price 0.012975 0.004004 3.24 0.00122 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.853 on 1596 degrees of freedom
## (694 observations deleted due to missingness)
## Multiple R-squared: 0.006535, Adjusted R-squared: 0.005913
## F-statistic: 10.5 on 1 and 1596 DF, p-value: 0.001219
lm(rating~price, data=airbnb_si)
##
## Call:
## lm(formula = rating ~ price, data = airbnb_si)
##
## Coefficients:
## (Intercept) price
## 91.1984 0.0019
reg_si <- lm(rating~price, data=airbnb_si)
summary(reg_si)
##
## Call:
## lm(formula = rating ~ price, data = airbnb_si)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.3484 -3.8114 0.6117 5.7086 8.7352
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 91.198352 1.135015 80.350 <0.0000000000000002 ***
## price 0.001900 0.008714 0.218 0.828
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.562 on 93 degrees of freedom
## (52 observations deleted due to missingness)
## Multiple R-squared: 0.0005107, Adjusted R-squared: -0.01024
## F-statistic: 0.04752 on 1 and 93 DF, p-value: 0.8279
#Analysis 4: What price categories people are more prone to?
#Number of rented rooms based on price categories
#Making a summary of price to see how it can be broken down into few categories
summary(airbnb_nyc$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 10.0 80.0 125.0 153.3 192.0 999.0 181
#breaking down price into three categories: low, medium low and medium
breaks <- c(-Inf, 50, 100, 200, 300, 400, Inf)
labels <- c("Low", "Medium-low", "Reasonable", "Medium", "Medium-high", "High")
airbnb_nyc <- airbnb_nyc %>% mutate(price_cat=cut(price, breaks=breaks, labels=labels))
#Checking the datatype of the new column ‘price_cat’ to see if it is factor-type or not
str(airbnb_nyc)
## 'data.frame': 30367 obs. of 15 variables:
## $ listing_name : chr "1 Bedroom in Prime Williamsburg" "Sunny, Private room in Bushwick" "Sunny Room in Harlem" "Gorgeous 1 BR with Private Balcony" ...
## $ Neighbourhood.: chr "Brooklyn" "Brooklyn" "Manhattan" "Manhattan" ...
## $ property_type : chr "Apartment" "Apartment" "Apartment" "Apartment" ...
## $ room_type : chr "Entire home/apt." "Private room" "Private room" "Entire home/apt." ...
## $ zipcode : chr "11249" "11206" "10032" "10024" ...
## $ beds : int 1 1 1 3 3 1 1 1 2 2 ...
## $ no_of_reviews : int 0 1 1 0 39 4 9 80 95 23 ...
## $ price : int 145 37 28 199 549 149 250 90 270 290 ...
## $ rating : int NA NA NA NA 96 100 100 94 90 96 ...
## $ borough : chr "Brooklyn" "Brooklyn" "Manhattan" "Manhattan" ...
## $ date : Date, format: NA NA ...
## $ year : chr NA NA NA "2008" ...
## $ borough_fac : Factor w/ 5 levels "Bronx","Brooklyn",..: 2 2 3 3 3 2 3 2 3 3 ...
## $ zipcode_fac : Factor w/ 186 levels "10001","10002",..: 135 102 31 22 35 118 4 98 8 8 ...
## $ price_cat : Factor w/ 6 levels "Low","Medium-low",..: 3 1 1 3 6 3 4 2 4 4 ...
#Number of hired rooms in each price category
summary(airbnb_nyc$price_cat)
## Low Medium-low Reasonable Medium Medium-high High
## 1941 10136 12349 3837 1012 911
## NA's
## 181
Price_category <- c("Low", "Medium-low", "Reasonable", "Medium", "Medium-high", "High")
Range <- c("0-50","51-100", "101-200", "201-300", "301-400", "500-999")
Total_hired_rooms <- c(1942, 10136, 12349, 3837, 1012, 911)
price_hired_rooms <- data.frame(Price_category, Range, Total_hired_rooms)
#Filter to reasonably priced ‘very good-rated’ rooms in a new dataset
airbnb_nyc_good_price <- airbnb_nyc %>% filter(price<200, rating>80)
#Analysis 5: What is the percentage of reasonably priced very good-rated rooms in five boroughs of NYC?
summary(airbnb_nyc_good_price$borough_fac)
## Bronx Brooklyn Manhattan Queens Staten Island
## 186 6517 7122 1303 77
summary(airbnb_nyc$borough_fac)
## Bronx Brooklyn Manhattan Queens Staten Island
## 342 11609 15977 2292 147
Borough <- c("Bronx", "Brooklyn", "Manhattan", "Queens", "Staten Island")
Total_hired_rooms <- c(342, 11609, 15978, 2292, 147)
Low_to_reasonably_priced_good_rated_hired_rooms <- c(186, 6517, 7122, 1303, 77)
good_rated_percentage <- data.frame(Borough, Total_hired_rooms, Low_to_reasonably_priced_good_rated_hired_rooms)
good_rated_percentage %>% mutate(Percentage=Low_to_reasonably_priced_good_rated_hired_rooms*100/Total_hired_rooms)
## Borough Total_hired_rooms
## 1 Bronx 342
## 2 Brooklyn 11609
## 3 Manhattan 15978
## 4 Queens 2292
## 5 Staten Island 147
## Low_to_reasonably_priced_good_rated_hired_rooms Percentage
## 1 186 54.38596
## 2 6517 56.13748
## 3 7122 44.57379
## 4 1303 56.84991
## 5 77 52.38095
good_rated_percentage <- good_rated_percentage %>% mutate(Percentage=Low_to_reasonably_priced_good_rated_hired_rooms*100/Total_hired_rooms)
Reference: