## library
# The script calls read_csv(), glimpse(), ggplot() and %>% (tidyverse) as well
# as corrplot(), so those packages must be attached before anything else runs.
library(tidyverse)
library(corrplot)

## file
# NOTE(review): hard-coded, machine-specific working directory; adjust it (or
# switch to a project-relative path) before running on another machine.
setwd("C:/Users/danieljiang/Desktop")
airbnb <- read_csv("AB_NYC_2019.csv")
## Parsed with column specification:
## cols(
## id = col_double(),
## name = col_character(),
## host_id = col_double(),
## host_name = col_character(),
## neighbourhood_group = col_character(),
## neighbourhood = col_character(),
## latitude = col_double(),
## longitude = col_double(),
## room_type = col_character(),
## price = col_double(),
## minimum_nights = col_double(),
## number_of_reviews = col_double(),
## last_review = col_date(format = ""),
## reviews_per_month = col_double(),
## calculated_host_listings_count = col_double(),
## availability_365 = col_double()
## )
# Turn the categorical text columns into factors, then inspect the structure
airbnb[["neighbourhood_group"]] <- as.factor(airbnb[["neighbourhood_group"]])
airbnb[["neighbourhood"]] <- as.factor(airbnb[["neighbourhood"]])
airbnb[["room_type"]] <- as.factor(airbnb[["room_type"]])
glimpse(airbnb)
## Observations: 48,895
## Variables: 16
## $ id <dbl> 2539, 2595, 3647, 3831, 5022, 5...
## $ name <chr> "Clean & quiet apt home by the ...
## $ host_id <dbl> 2787, 2845, 4632, 4869, 7192, 7...
## $ host_name <chr> "John", "Jennifer", "Elisabeth"...
## $ neighbourhood_group <fct> Brooklyn, Manhattan, Manhattan,...
## $ neighbourhood <fct> Kensington, Midtown, Harlem, Cl...
## $ latitude <dbl> 40.64749, 40.75362, 40.80902, 4...
## $ longitude <dbl> -73.97237, -73.98377, -73.94190...
## $ room_type <fct> Private room, Entire home/apt, ...
## $ price <dbl> 149, 225, 150, 89, 80, 200, 60,...
## $ minimum_nights <dbl> 1, 1, 3, 1, 10, 3, 45, 2, 2, 1,...
## $ number_of_reviews <dbl> 9, 45, 0, 270, 9, 74, 49, 430, ...
## $ last_review <date> 2018-10-19, 2019-05-21, NA, 20...
## $ reviews_per_month <dbl> 0.21, 0.38, NA, 4.64, 0.10, 0.5...
## $ calculated_host_listings_count <dbl> 6, 2, 1, 1, 1, 1, 1, 1, 1, 4, 1...
## $ availability_365 <dbl> 365, 355, 365, 194, 0, 129, 0, ...
#lubridate
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
# read_csv() already parsed last_review as a Date (col_date in the column
# specification). as.Date() is therefore a no-op here; it is called without a
# format so that, should the column ever arrive as text, ISO dates
# (yyyy-mm-dd, as in the file) are parsed correctly instead of being forced
# through a mismatched "%m/%d/%Y" pattern.
airbnb$last_review <- as.Date(airbnb$last_review)
# Year of the most recent review, as a factor (NA when there is no review)
airbnb$last_year <- as.factor(format(airbnb$last_review, "%Y"))
# Count the missing values in every column
vapply(airbnb, function(column) sum(is.na(column)), integer(1))
## id name
## 0 16
## host_id host_name
## 0 21
## neighbourhood_group neighbourhood
## 0 0
## latitude longitude
## 0 0
## room_type price
## 0 0
## minimum_nights number_of_reviews
## 0 0
## last_review reviews_per_month
## 10052 10052
## calculated_host_listings_count availability_365
## 0 0
## last_year
## 10052
library(VIM)
## Warning: package 'VIM' was built under R version 3.6.1
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Plot the share of missing values per variable and the missingness patterns.
# The axis labels are the column names of the data frame being plotted.
aggr_plot <- aggr(
  airbnb,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(airbnb),
  cex.axis = .7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies
##
## Variables sorted by number of missings:
## Variable Count
## last_review 0.2055833930
## reviews_per_month 0.2055833930
## last_year 0.2055833930
## host_name 0.0004294918
## name 0.0003272318
## id 0.0000000000
## host_id 0.0000000000
## neighbourhood_group 0.0000000000
## neighbourhood 0.0000000000
## latitude 0.0000000000
## longitude 0.0000000000
## room_type 0.0000000000
## price 0.0000000000
## minimum_nights 0.0000000000
## number_of_reviews 0.0000000000
## calculated_host_listings_count 0.0000000000
## availability_365 0.0000000000
# Descriptive statistics for every column of the data set
summary(object = airbnb)
## id name host_id
## Min. : 2539 Length:48895 Min. : 2438
## 1st Qu.: 9471945 Class :character 1st Qu.: 7822033
## Median :19677284 Mode :character Median : 30793816
## Mean :19017143 Mean : 67620011
## 3rd Qu.:29152178 3rd Qu.:107434423
## Max. :36487245 Max. :274321313
##
## host_name neighbourhood_group neighbourhood
## Length:48895 Bronx : 1091 Williamsburg : 3920
## Class :character Brooklyn :20104 Bedford-Stuyvesant: 3714
## Mode :character Manhattan :21661 Harlem : 2658
## Queens : 5666 Bushwick : 2465
## Staten Island: 373 Upper West Side : 1971
## Hell's Kitchen : 1958
## (Other) :32209
## latitude longitude room_type
## Min. :40.50 Min. :-74.24 Entire home/apt:25409
## 1st Qu.:40.69 1st Qu.:-73.98 Private room :22326
## Median :40.72 Median :-73.96 Shared room : 1160
## Mean :40.73 Mean :-73.95
## 3rd Qu.:40.76 3rd Qu.:-73.94
## Max. :40.91 Max. :-73.71
##
## price minimum_nights number_of_reviews
## Min. : 0.0 Min. : 1.00 Min. : 0.00
## 1st Qu.: 69.0 1st Qu.: 1.00 1st Qu.: 1.00
## Median : 106.0 Median : 3.00 Median : 5.00
## Mean : 152.7 Mean : 7.03 Mean : 23.27
## 3rd Qu.: 175.0 3rd Qu.: 5.00 3rd Qu.: 24.00
## Max. :10000.0 Max. :1250.00 Max. :629.00
##
## last_review reviews_per_month calculated_host_listings_count
## Min. :2011-03-28 Min. : 0.010 Min. : 1.000
## 1st Qu.:2018-07-08 1st Qu.: 0.190 1st Qu.: 1.000
## Median :2019-05-19 Median : 0.720 Median : 1.000
## Mean :2018-10-04 Mean : 1.373 Mean : 7.144
## 3rd Qu.:2019-06-23 3rd Qu.: 2.020 3rd Qu.: 2.000
## Max. :2019-07-08 Max. :58.500 Max. :327.000
## NA's :10052 NA's :10052
## availability_365 last_year
## Min. : 0.0 2019 :25209
## 1st Qu.: 0.0 2018 : 6050
## Median : 45.0 2017 : 3205
## Mean :112.8 2016 : 2707
## 3rd Qu.:227.0 2015 : 1393
## Max. :365.0 (Other): 279
## NA's :10052
# EDA: distribution of price
# price is continuous, so it is binned with geom_histogram(); geom_bar() does
# not accept a `binwidth` argument.
ggplot(airbnb, aes(price)) +
  geom_histogram(binwidth = 10, fill = "#fd5c63", alpha = 0.85) +
  theme_minimal(base_size = 13) +
  xlab("Price") +
  ylab("Number") +
  ggtitle("The Distribution of Price")
# EDA: density-scaled histogram of price on a log10 x-axis, with a kernel
# density estimate drawn on top
ggplot(data = airbnb, mapping = aes(x = price)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "#fd5c63") +
  geom_density(fill = "#fd5c63", alpha = 0.2) +
  scale_x_log10() +
  labs(
    title = "Transformed distribution of price",
    subtitle = expression("With" ~'log'[10] ~ "transformation of x-axis")
  )
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 11 rows containing non-finite values (stat_bin).
## Warning: Removed 11 rows containing non-finite values (stat_density).
# Number of listings per neighbourhood group.
# neighbourhood_group is a factor, so the counts are drawn with geom_bar()
# (which counts rows per level) rather than a histogram with stat = "count".
ggplot(airbnb) +
  geom_bar(aes(neighbourhood_group, fill = neighbourhood_group), alpha = 0.85) +
  theme_minimal(base_size = 13) +
  xlab("") +
  ylab("") +
  theme(legend.position = "none") +
  ggtitle("The Number of Property in Each Area")
# Mean price (rounded to 2 decimals) for each neighbourhood group
airbnb_nh <- summarise(
  group_by(airbnb, neighbourhood_group),
  price = round(mean(price), digits = 2)
)
airbnb_nh
## # A tibble: 5 x 2
## neighbourhood_group price
## <fct> <dbl>
## 1 Bronx 87.5
## 2 Brooklyn 124.
## 3 Manhattan 197.
## 4 Queens 99.5
## 5 Staten Island 115.
# Log10-scaled price distribution, one panel per neighbourhood group
ggplot(data = airbnb, mapping = aes(x = price)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "#fd5c63") +
  geom_density(fill = "#fd5c63", alpha = 0.2) +
  scale_x_log10() +
  facet_wrap(~neighbourhood_group) +
  labs(
    title = "Transformed distribution of price",
    subtitle = expression("With" ~'log'[10] ~ "transformation of x-axis")
  )
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 11 rows containing non-finite values (stat_bin).
## Warning: Removed 11 rows containing non-finite values (stat_density).
# Room type: mean price (rounded to 2 decimals) for each room type
airbnb_rt <- summarise(
  group_by(airbnb, room_type),
  price = round(mean(price), digits = 2)
)
airbnb_rt
## # A tibble: 3 x 2
## room_type price
## <fct> <dbl>
## 1 Entire home/apt 212.
## 2 Private room 89.8
## 3 Shared room 70.1
# Log10-scaled price distribution, one panel per room type
ggplot(data = airbnb, mapping = aes(x = price)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "#fd5c63") +
  geom_density(fill = "#fd5c63", alpha = 0.2) +
  scale_x_log10() +
  facet_wrap(~room_type) +
  labs(
    title = "Transformed distribution of price",
    subtitle = expression("With" ~'log'[10] ~ "transformation of x-axis")
  )
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 11 rows containing non-finite values (stat_bin).
## Warning: Removed 11 rows containing non-finite values (stat_density).
# Neighbourhood group x room type: share of each room type within each group.
# Counting rows of a factor is geom_bar()'s job; position = "fill" rescales
# every bar to a total of 1 so the bars show proportions.
ggplot(airbnb) +
  geom_bar(
    aes(neighbourhood_group, fill = room_type),
    alpha = 0.85,
    position = "fill"
  ) +
  theme_minimal(base_size = 13) +
  xlab("") +
  ylab("") +
  ggtitle("The Proportion of Room Type in Each Area")
# Number of reviews against price; point size also encodes price
ggplot(data = airbnb, mapping = aes(x = number_of_reviews, y = price)) +
  geom_point(aes(size = price), color = "slateblue", alpha = 0.05) +
  theme(axis.title = element_text(), axis.title.x = element_text()) +
  labs(
    x = "Number of reviews",
    y = "Price",
    title = "Relationship between number of reviews",
    subtitle = "The most expensive objects have small number of reviews (or 0)"
  )
# Year: number of listings per last_year (the year of the most recent review).
# last_year is a factor, so the counts are drawn with geom_bar() rather than a
# histogram with stat = "count".
ggplot(airbnb) +
  geom_bar(aes(last_year), fill = "#fd5c63", alpha = 0.85) +
  theme_minimal(base_size = 13) +
  xlab("") +
  ylab("") +
  ggtitle("The Number of New Property")
# Mean price (rounded to 2 decimals) per year of the most recent review;
# listings without a review form their own NA group
airbnb_yr <- summarise(
  group_by(airbnb, last_year),
  price = round(mean(price), digits = 2)
)
## Warning: Factor `last_year` contains implicit NA, consider using
## `forcats::fct_explicit_na`
airbnb_yr
## # A tibble: 10 x 2
## last_year price
## <fct> <dbl>
## 1 2011 169
## 2 2012 158.
## 3 2013 256.
## 4 2014 160.
## 5 2015 157.
## 6 2016 152.
## 7 2017 135.
## 8 2018 139.
## 9 2019 142.
## 10 <NA> 193.
# Correlation: Spearman rank correlation between all numeric columns,
# computed on the rows that have no missing value in any of them
airbnb_cor <- airbnb[, vapply(airbnb, is.numeric, logical(1))]
airbnb_cor <- airbnb_cor[complete.cases(airbnb_cor), ]
correlation_matrix <- cor(x = airbnb_cor, method = "spearman")
corrplot(corr = correlation_matrix, method = "color")