library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(moments)
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
# Load the Manhattan Airbnb listings; the first row of the CSV holds the
# column names. TRUE is spelled out because the shorthand T is an ordinary
# variable that can be reassigned.
# NOTE(review): the path is specific to one machine.
master <- read.csv("/Users/rose/Desktop/work/AB_MANHATTAN_DATA.csv", header = TRUE)
names(master)
## [1] "id" "name"
## [3] "host_id" "host_name"
## [5] "neighbourhood_group" "neighbourhood"
## [7] "latitude" "longitude"
## [9] "room_type" "price"
## [11] "minimum_nights" "number_of_reviews"
## [13] "last_review" "reviews_per_month"
## [15] "calculated_host_listings_count" "availability_365"
#DataLink:New York City Airbnb Open Data | Kaggle. Retrieved September 24, 2019, from https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data
# Name the first column "id", then keep only the listings whose price is
# strictly positive (price is log-transformed further down).
names(master)[1] <- "id"
master <- subset(master, price > 0)
summary(master)
## id
## Min. : 2595
## 1st Qu.: 9160997
## Median :19116646
## Mean :18774840
## 3rd Qu.:29541310
## Max. :36487245
##
## name
## : 9
## Harlem Gem : 7
## Cozy East Village Apartment : 6
## IN MINT CONDITION-STUDIOS EAST 44TH/UNITED NATIONS: 6
## New York Apartment : 6
## Private room in Manhattan : 6
## (Other) :21620
## host_id host_name neighbourhood_group
## Min. : 2845 Sonder (NYC): 327 Manhattan:21660
## 1st Qu.: 8525354 Blueground : 230
## Median : 30656279 Michael : 212
## Mean : 67833116 David : 202
## 3rd Qu.:106950224 John : 151
## Max. :274321313 Kara : 135
## (Other) :20403
## neighbourhood latitude longitude
## Harlem :2658 Min. :40.70 Min. :-74.02
## Upper West Side:1971 1st Qu.:40.73 1st Qu.:-73.99
## Hell's Kitchen :1958 Median :40.76 Median :-73.98
## East Village :1853 Mean :40.77 Mean :-73.97
## Upper East Side:1798 3rd Qu.:40.80 3rd Qu.:-73.95
## Midtown :1545 Max. :40.88 Max. :-73.91
## (Other) :9877
## room_type price minimum_nights
## Entire home/apt:13198 Min. : 10.0 Min. : 1.000
## Private room : 7982 1st Qu.: 95.0 1st Qu.: 1.000
## Shared room : 480 Median : 150.0 Median : 3.000
## Mean : 196.9 Mean : 8.579
## 3rd Qu.: 220.0 3rd Qu.: 6.000
## Max. :10000.0 Max. :1250.000
##
## number_of_reviews last_review reviews_per_month
## Min. : 0.00 : 5028 Min. : 0.010
## 1st Qu.: 1.00 6/23/2019: 599 1st Qu.: 0.170
## Median : 4.00 7/1/2019 : 557 Median : 0.610
## Mean : 20.99 6/30/2019: 508 Mean : 1.272
## 3rd Qu.: 19.00 6/22/2019: 296 3rd Qu.: 1.890
## Max. :607.00 6/24/2019: 290 Max. :58.500
## (Other) :14382 NA's :5028
## calculated_host_listings_count availability_365
## Min. : 1.00 Min. : 0
## 1st Qu.: 1.00 1st Qu.: 0
## Median : 1.00 Median : 36
## Mean : 12.79 Mean :112
## 3rd Qu.: 2.00 3rd Qu.:230
## Max. :327.00 Max. :365
##
# From here on price holds the natural log of the listed price.
master[["price"]] <- log(master[["price"]])
summary(master)
## id
## Min. : 2595
## 1st Qu.: 9160997
## Median :19116646
## Mean :18774840
## 3rd Qu.:29541310
## Max. :36487245
##
## name
## : 9
## Harlem Gem : 7
## Cozy East Village Apartment : 6
## IN MINT CONDITION-STUDIOS EAST 44TH/UNITED NATIONS: 6
## New York Apartment : 6
## Private room in Manhattan : 6
## (Other) :21620
## host_id host_name neighbourhood_group
## Min. : 2845 Sonder (NYC): 327 Manhattan:21660
## 1st Qu.: 8525354 Blueground : 230
## Median : 30656279 Michael : 212
## Mean : 67833116 David : 202
## 3rd Qu.:106950224 John : 151
## Max. :274321313 Kara : 135
## (Other) :20403
## neighbourhood latitude longitude
## Harlem :2658 Min. :40.70 Min. :-74.02
## Upper West Side:1971 1st Qu.:40.73 1st Qu.:-73.99
## Hell's Kitchen :1958 Median :40.76 Median :-73.98
## East Village :1853 Mean :40.77 Mean :-73.97
## Upper East Side:1798 3rd Qu.:40.80 3rd Qu.:-73.95
## Midtown :1545 Max. :40.88 Max. :-73.91
## (Other) :9877
## room_type price minimum_nights
## Entire home/apt:13198 Min. :2.303 Min. : 1.000
## Private room : 7982 1st Qu.:4.554 1st Qu.: 1.000
## Shared room : 480 Median :5.011 Median : 3.000
## Mean :4.999 Mean : 8.579
## 3rd Qu.:5.394 3rd Qu.: 6.000
## Max. :9.210 Max. :1250.000
##
## number_of_reviews last_review reviews_per_month
## Min. : 0.00 : 5028 Min. : 0.010
## 1st Qu.: 1.00 6/23/2019: 599 1st Qu.: 0.170
## Median : 4.00 7/1/2019 : 557 Median : 0.610
## Mean : 20.99 6/30/2019: 508 Mean : 1.272
## 3rd Qu.: 19.00 6/22/2019: 296 3rd Qu.: 1.890
## Max. :607.00 6/24/2019: 290 Max. :58.500
## (Other) :14382 NA's :5028
## calculated_host_listings_count availability_365
## Min. : 1.00 Min. : 0
## 1st Qu.: 1.00 1st Qu.: 0
## Median : 1.00 Median : 36
## Mean : 12.79 Mean :112
## 3rd Qu.: 2.00 3rd Qu.:230
## Max. :327.00 Max. :365
##
str(master)
## 'data.frame': 21660 obs. of 16 variables:
## $ id : int 2595 3647 5022 5099 5178 5203 5238 5295 5441 6021 ...
## $ name : Factor w/ 21263 levels ""," 2-3 bedroom UWS garden triplex ",..: 16570 19778 8100 10600 10626 6356 7223 2474 4103 21067 ...
## $ host_id : int 2845 4632 7192 7322 8967 7490 7549 7702 7989 11528 ...
## $ host_name : Factor w/ 5973 levels "","(Ari) HENRY LEE",..: 2541 1587 3153 1069 5046 3649 683 3199 2890 1120 ...
## $ neighbourhood_group : Factor w/ 1 level "Manhattan": 1 1 1 1 1 1 1 1 1 1 ...
## $ neighbourhood : Factor w/ 32 levels "Battery Park City",..: 18 11 5 20 12 30 3 30 12 30 ...
## $ latitude : num 40.8 40.8 40.8 40.7 40.8 ...
## $ longitude : num -74 -73.9 -73.9 -74 -74 ...
## $ room_type : Factor w/ 3 levels "Entire home/apt",..: 1 2 1 1 2 2 1 1 2 2 ...
## $ price : num 5.42 5.01 4.38 5.3 4.37 ...
## $ minimum_nights : int 1 3 10 3 2 2 1 5 2 2 ...
## $ number_of_reviews : int 45 0 9 74 430 118 160 53 188 113 ...
## $ last_review : Factor w/ 1566 levels "","1/1/2015",..: 928 1 304 1088 1096 1222 1166 1088 1092 1276 ...
## $ reviews_per_month : num 0.38 NA 0.1 0.59 3.47 0.99 1.33 0.43 1.5 0.91 ...
## $ calculated_host_listings_count: int 2 1 1 1 1 1 4 1 1 1 ...
## $ availability_365 : int 355 365 0 129 220 0 188 6 39 333 ...
# reviews_per_month is the only column with missing values (5028 NA's in the
# summary above). Replace those NA with 0 to resolve the missing data.
# The replacement is limited to that column so that an NA in any other column
# is never silently overwritten with 0.
master$reviews_per_month[is.na(master$reviews_per_month)] <- 0
summary(master)
## id
## Min. : 2595
## 1st Qu.: 9160997
## Median :19116646
## Mean :18774840
## 3rd Qu.:29541310
## Max. :36487245
##
## name
## : 9
## Harlem Gem : 7
## Cozy East Village Apartment : 6
## IN MINT CONDITION-STUDIOS EAST 44TH/UNITED NATIONS: 6
## New York Apartment : 6
## Private room in Manhattan : 6
## (Other) :21620
## host_id host_name neighbourhood_group
## Min. : 2845 Sonder (NYC): 327 Manhattan:21660
## 1st Qu.: 8525354 Blueground : 230
## Median : 30656279 Michael : 212
## Mean : 67833116 David : 202
## 3rd Qu.:106950224 John : 151
## Max. :274321313 Kara : 135
## (Other) :20403
## neighbourhood latitude longitude
## Harlem :2658 Min. :40.70 Min. :-74.02
## Upper West Side:1971 1st Qu.:40.73 1st Qu.:-73.99
## Hell's Kitchen :1958 Median :40.76 Median :-73.98
## East Village :1853 Mean :40.77 Mean :-73.97
## Upper East Side:1798 3rd Qu.:40.80 3rd Qu.:-73.95
## Midtown :1545 Max. :40.88 Max. :-73.91
## (Other) :9877
## room_type price minimum_nights
## Entire home/apt:13198 Min. :2.303 Min. : 1.000
## Private room : 7982 1st Qu.:4.554 1st Qu.: 1.000
## Shared room : 480 Median :5.011 Median : 3.000
## Mean :4.999 Mean : 8.579
## 3rd Qu.:5.394 3rd Qu.: 6.000
## Max. :9.210 Max. :1250.000
##
## number_of_reviews last_review reviews_per_month
## Min. : 0.00 : 5028 Min. : 0.0000
## 1st Qu.: 1.00 6/23/2019: 599 1st Qu.: 0.0200
## Median : 4.00 7/1/2019 : 557 Median : 0.2800
## Mean : 20.99 6/30/2019: 508 Mean : 0.9768
## 3rd Qu.: 19.00 6/22/2019: 296 3rd Qu.: 1.3000
## Max. :607.00 6/24/2019: 290 Max. :58.5000
## (Other) :14382
## calculated_host_listings_count availability_365
## Min. : 1.00 Min. : 0
## 1st Qu.: 1.00 1st Qu.: 0
## Median : 1.00 Median : 36
## Mean : 12.79 Mean :112
## 3rd Qu.: 2.00 3rd Qu.:230
## Max. :327.00 Max. :365
##
# Screening model: log price regressed on the five numeric predictors.
model <- lm(
  price ~ minimum_nights + number_of_reviews + reviews_per_month +
    calculated_host_listings_count + availability_365,
  data = master
)
k <- 5 # Numbers of IVs
# Leverage: hat values of the screening model and the (2k + 2) / n cutoff.
leverage <- hatvalues(model)
cutleverage <- (2 * k + 2) / nrow(master)
paste('Leverage cutoff is: ', cutleverage)
## [1] "Leverage cutoff is: 0.000554016620498615"
# 1 when a row's leverage is above the cutoff, 0 otherwise.
badleverage <- as.numeric(cutleverage < leverage)
table(badleverage)
## badleverage
## 0 1
## 19959 1701
# Influence: Cook's distance per row and the 4 / (n - k - 1) cutoff.
cooks <- cooks.distance(model)
cutcooks <- 4 / (nrow(master) - k - 1)
paste('Cooks cutoff is: ', cutcooks)
## [1] "Cooks cutoff is: 0.000184723376743327"
# 1 when a row's Cook's distance is above the cutoff, 0 otherwise.
badcooks <- as.numeric(cutcooks < cooks)
table(badcooks)
## badcooks
## 0 1
## 20883 777
# Out of the whole dataset, only these variables will be considered.
variables <- c(
  "price", "minimum_nights", "number_of_reviews", "reviews_per_month",
  "calculated_host_listings_count", "availability_365"
)
# Mahalanobis distance of every row from the column means of those variables.
mahal <- mahalanobis(
  x = master[variables],
  center = colMeans(master[variables]),
  cov = cov(master[variables])
)
# Chi-square cutoff at p = .001 with one degree of freedom per variable.
cutmahal <- qchisq(1 - .001, df = length(variables))
paste('Mahalanobis cutoff is: ', round(cutmahal, digits = 2))
## [1] "Mahalanobis cutoff is: 22.46"
# 1 when a row's Mahalanobis distance is above the cutoff, 0 otherwise.
badmahal <- as.numeric(cutmahal < mahal)
table(badmahal)
## badmahal
## 0 1
## 20807 853
#master_noout = subset(master2, mahal < cutmahal)
#summary(master_noout)
# Number of outlier criteria (leverage, Cook's, Mahalanobis) each row fails.
overall <- badleverage + badcooks + badmahal
table(overall)
## overall
## 0 1 2 3
## 19546 1162 687 265
# Keep only the rows flagged by fewer than two of the three criteria.
master_noout <- master[which(overall < 2), ]
summary(master_noout)
## id
## Min. : 2595
## 1st Qu.: 9217903
## Median :18823950
## Mean :18600622
## 3rd Qu.:28890634
## Max. :36487245
##
## name
## : 8
## Harlem Gem : 7
## IN MINT CONDITION-STUDIOS EAST 44TH/UNITED NATIONS: 6
## New York Apartment : 6
## Private room in Manhattan : 6
## West Village Apartment : 6
## (Other) :20669
## host_id host_name neighbourhood_group
## Min. : 2845 David : 196 Manhattan:20708
## 1st Qu.: 8520370 Michael : 193
## Median : 30283594 Blueground: 184
## Mean : 65249837 John : 148
## 3rd Qu.: 98844306 Kara : 133
## Max. :274321313 Mike : 133
## (Other) :19721
## neighbourhood latitude longitude
## Harlem :2587 Min. :40.70 Min. :-74.02
## Upper West Side:1916 1st Qu.:40.73 1st Qu.:-73.99
## Hell's Kitchen :1868 Median :40.76 Median :-73.98
## East Village :1797 Mean :40.77 Mean :-73.97
## Upper East Side:1766 3rd Qu.:40.80 3rd Qu.:-73.95
## Midtown :1498 Max. :40.88 Max. :-73.91
## (Other) :9276
## room_type price minimum_nights
## Entire home/apt:12541 Min. :2.303 Min. : 1.000
## Private room : 7718 1st Qu.:4.554 1st Qu.: 2.000
## Shared room : 449 Median :5.004 Median : 3.000
## Mean :4.987 Mean : 7.296
## 3rd Qu.:5.389 3rd Qu.: 5.000
## Max. :8.006 Max. :120.000
##
## number_of_reviews last_review reviews_per_month
## Min. : 0.00 : 4781 Min. :0.0000
## 1st Qu.: 1.00 6/23/2019: 548 1st Qu.:0.0200
## Median : 4.00 7/1/2019 : 521 Median :0.2700
## Mean : 18.49 6/30/2019: 476 Mean :0.8855
## 3rd Qu.: 18.00 6/22/2019: 272 3rd Qu.:1.1800
## Max. :221.00 6/24/2019: 266 Max. :7.7900
## (Other) :13844
## calculated_host_listings_count availability_365
## Min. : 1.000 Min. : 0
## 1st Qu.: 1.000 1st Qu.: 0
## Median : 1.000 Median : 30
## Mean : 7.615 Mean :107
## 3rd Qu.: 2.000 3rd Qu.:215
## Max. :232.000 Max. :365
##
# master2: the outlier-screened data restricted to the numeric variables
# listed in `variables` (no categorical columns).
master2 <- master_noout[variables]
summary(master2)
## price minimum_nights number_of_reviews reviews_per_month
## Min. :2.303 Min. : 1.000 Min. : 0.00 Min. :0.0000
## 1st Qu.:4.554 1st Qu.: 2.000 1st Qu.: 1.00 1st Qu.:0.0200
## Median :5.004 Median : 3.000 Median : 4.00 Median :0.2700
## Mean :4.987 Mean : 7.296 Mean : 18.49 Mean :0.8855
## 3rd Qu.:5.389 3rd Qu.: 5.000 3rd Qu.: 18.00 3rd Qu.:1.1800
## Max. :8.006 Max. :120.000 Max. :221.00 Max. :7.7900
## calculated_host_listings_count availability_365
## Min. : 1.000 Min. : 0
## 1st Qu.: 1.000 1st Qu.: 0
## Median : 1.000 Median : 30
## Mean : 7.615 Mean :107
## 3rd Qu.: 2.000 3rd Qu.:215
## Max. :232.000 Max. :365
The IVs are not highly correlated. Hence, the assumption of no multicollinearity (additivity) has been met.
# Refit the five-predictor model on the data left after outlier screening.
model <- lm(price ~ minimum_nights +
              number_of_reviews +
              reviews_per_month +
              calculated_host_listings_count +
              availability_365,
            data = master_noout)
#To test for Multicollinearity
# correlation = TRUE also prints the correlation matrix of the coefficient
# estimates; TRUE is spelled out because T can be reassigned.
summary(model, correlation = TRUE)
##
## Call:
## lm(formula = price ~ minimum_nights + number_of_reviews + reviews_per_month +
## calculated_host_listings_count + availability_365, data = master_noout)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.76773 -0.42415 0.00486 0.37682 3.00553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.939e+00 6.878e-03 718.089 <2e-16 ***
## minimum_nights -4.526e-03 4.725e-04 -9.580 <2e-16 ***
## number_of_reviews -1.713e-03 1.694e-04 -10.112 <2e-16 ***
## reviews_per_month -5.318e-03 4.457e-03 -1.193 0.233
## calculated_host_listings_count 2.265e-03 1.866e-04 12.138 <2e-16 ***
## availability_365 9.377e-04 3.801e-05 24.671 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6386 on 20702 degrees of freedom
## Multiple R-squared: 0.04907, Adjusted R-squared: 0.04884
## F-statistic: 213.7 on 5 and 20702 DF, p-value: < 2.2e-16
##
## Correlation of Coefficients:
## (Intercept) minimum_nights
## minimum_nights -0.38
## number_of_reviews -0.08 0.04
## reviews_per_month -0.33 0.17
## calculated_host_listings_count 0.05 -0.33
## availability_365 -0.29 -0.28
## number_of_reviews reviews_per_month
## minimum_nights
## number_of_reviews
## reviews_per_month -0.59
## calculated_host_listings_count 0.06 0.03
## availability_365 -0.11 -0.11
## calculated_host_listings_count
## minimum_nights
## number_of_reviews
## reviews_per_month
## calculated_host_listings_count
## availability_365 -0.23
# Individual pair-wise correlation is an indicator of multicollinearity. The correlation coefficients above show that the variables are not highly correlated, as no value is close to 1 in absolute value (the largest is -0.59).
# Multicollinearity is also suspected when R squared is very high (i.e. more than 0.90) while the coefficients are not significant according to their p-values. In our model R squared is 0.049, the F statistic (213.7) is significant with a p-value less than 0.05, and the individual p-values are significant for all variables except reviews_per_month. Hence there is no evidence of multicollinearity.
The linearity assumption was tested with the resettest function, which returns a p-value greater than 5%. Hence, the assumption of linearity has been met.
# Fake regression: a random chi-square outcome (one draw per row, degrees of
# freedom equal to the number of screened variables) regressed on every
# column of master2. No seed is set, so the numbers differ between runs.
random <- rchisq(n = nrow(master_noout), df = length(variables))
fake <- lm(random ~ ., data = master2)
summary(fake)
##
## Call:
## lm(formula = random ~ ., data = master2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.8711 -2.5628 -0.6461 1.8516 26.9973
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.3272519 0.1908416 33.154 <2e-16 ***
## price -0.0751174 0.0378864 -1.983 0.0474 *
## minimum_nights 0.0011945 0.0025811 0.463 0.6435
## number_of_reviews 0.0022259 0.0009257 2.405 0.0162 *
## reviews_per_month -0.0151092 0.0242979 -0.622 0.5341
## calculated_host_listings_count 0.0008084 0.0010210 0.792 0.4285
## availability_365 -0.0001376 0.0002102 -0.655 0.5128
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.481 on 20701 degrees of freedom
## Multiple R-squared: 0.0005838, Adjusted R-squared: 0.0002941
## F-statistic: 2.015 on 6 and 20701 DF, p-value: 0.06001
# Studentized residuals of the fake model on a normal Q-Q plot, with a
# reference line of intercept 0 and slope 1, followed by lm's own Q-Q
# diagnostic and the RESET test.
standardized <- rstudent(fake)
qqnorm(standardized)
abline(a = 0, b = 1)
plot(fake, which = 2)
resettest(fake)
##
## RESET test
##
## data: fake
## RESET = 1.0913, df1 = 2, df2 = 20699, p-value = 0.3358
The assumption of normality has not been met, as the histogram is positively skewed and the distribution is not centered near 0.
skewness(master2, na.rm = TRUE)
## price minimum_nights
## 0.4146988 2.7977708
## number_of_reviews reviews_per_month
## 2.8090271 1.9307349
## calculated_host_listings_count availability_365
## 6.1082194 0.8314128
kurtosis(master2, na.rm = TRUE)
## price minimum_nights
## 3.813680 14.561599
## number_of_reviews reviews_per_month
## 11.435476 6.459823
## calculated_host_listings_count availability_365
## 45.310079 2.100742
hist(standardized, breaks = 15)
The data are unevenly spread around 0 on the x-axis. Hence, the assumption of homogeneity has not been met. In addition, the data on the y-axis are not centered around 0. Hence, the assumption of homoscedasticity has not been met.
# Standardized fitted values against studentized residuals, with reference
# lines through 0 on both axes, then lm's residuals-vs-fitted diagnostic.
fitvalues <- scale(fitted(fake))
plot(fitvalues, standardized)
abline(h = 0)
abline(v = 0)
plot(fake, which = 1)
summary(master2)
## price minimum_nights number_of_reviews reviews_per_month
## Min. :2.303 Min. : 1.000 Min. : 0.00 Min. :0.0000
## 1st Qu.:4.554 1st Qu.: 2.000 1st Qu.: 1.00 1st Qu.:0.0200
## Median :5.004 Median : 3.000 Median : 4.00 Median :0.2700
## Mean :4.987 Mean : 7.296 Mean : 18.49 Mean :0.8855
## 3rd Qu.:5.389 3rd Qu.: 5.000 3rd Qu.: 18.00 3rd Qu.:1.1800
## Max. :8.006 Max. :120.000 Max. :221.00 Max. :7.7900
## calculated_host_listings_count availability_365
## Min. : 1.000 Min. : 0
## 1st Qu.: 1.000 1st Qu.: 0
## Median : 1.000 Median : 30
## Mean : 7.615 Mean :107
## 3rd Qu.: 2.000 3rd Qu.:215
## Max. :232.000 Max. :365
str(master2)
## 'data.frame': 20708 obs. of 6 variables:
## $ price : num 5.42 5.01 4.38 5.3 4.37 ...
## $ minimum_nights : int 1 3 10 3 2 1 5 2 2 90 ...
## $ number_of_reviews : int 45 0 9 74 118 160 53 188 113 27 ...
## $ reviews_per_month : num 0.38 0 0.1 0.59 0.99 1.33 0.43 1.5 0.91 0.22 ...
## $ calculated_host_listings_count: int 2 1 1 1 1 4 1 1 1 1 ...
## $ availability_365 : int 355 365 0 129 0 188 6 39 333 0 ...
# Pearson correlation table for all columns of master2, rounded to 2 decimals.
pearson_table <- cor(master2, use = "pairwise.complete.obs", method = "pearson")
round(pearson_table, digits = 2)
## price minimum_nights number_of_reviews
## price 1.00 0.04 -0.07
## minimum_nights 0.04 1.00 -0.16
## number_of_reviews -0.07 -0.16 1.00
## reviews_per_month -0.04 -0.22 0.62
## calculated_host_listings_count 0.13 0.43 -0.11
## availability_365 0.18 0.34 0.13
## reviews_per_month
## price -0.04
## minimum_nights -0.22
## number_of_reviews 0.62
## reviews_per_month 1.00
## calculated_host_listings_count -0.13
## availability_365 0.11
## calculated_host_listings_count
## price 0.13
## minimum_nights 0.43
## number_of_reviews -0.11
## reviews_per_month -0.13
## calculated_host_listings_count 1.00
## availability_365 0.33
## availability_365
## price 0.18
## minimum_nights 0.34
## number_of_reviews 0.13
## reviews_per_month 0.11
## calculated_host_listings_count 0.33
## availability_365 1.00
#Price has the highest correlation to availability_365 according to the Pearson table
cor.test(master2$price,master2$number_of_reviews, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: master2$price and master2$number_of_reviews
## t = -9.8171, df = 20706, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08160979 -0.05449556
## sample estimates:
## cor
## -0.06806524
cor.test(master2$price,master2$minimum_nights, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: master2$price and master2$minimum_nights
## t = 6.4757, df = 20706, p-value = 9.65e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03135613 0.05854149
## sample estimates:
## cor
## 0.04495714
cor.test(master2$price,master2$reviews_per_month, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: master2$price and master2$reviews_per_month
## t = -5.7287, df = 20706, p-value = 1.026e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.05337154 -0.02617423
## sample estimates:
## cor
## -0.03978025
cor.test(master2$price,master2$calculated_host_listings_count, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: master2$price and master2$calculated_host_listings_count
## t = 19.305, df = 20706, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1195629 0.1463218
## sample estimates:
## cor
## 0.1329666
cor.test(master2$price,master2$availability_365, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: master2$price and master2$availability_365
## t = 26.207, df = 20706, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1659609 0.1923269
## sample estimates:
## cor
## 0.1791761
#Create correlation plots for the dataset
#library(corrplot)
# cordata is a copy of master2 with short display names, used only by corrplot
cordata <- master2
names(cordata) <- c(
  "Price", "Min Nights", "No. of RVWs",
  "RVWs per month", "Host Listing Ct", "Avail 365"
)
corrplot(cor(cordata), method = "circle")
#library(ggplot2)
# Shared plot theme: no grid lines or panel background, black axis lines,
# white legend keys and size-15 text.
cleanup <- theme(
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank(),
  axis.line.x = element_line(color = "black"),
  axis.line.y = element_line(color = "black"),
  legend.key = element_rect(fill = "white"),
  text = element_text(size = 15)
)
# Scatterplot for each Independent with Dependent Variable with No Grouping
# Reviews per month
# Reviews per month against log price, with a fitted linear trend line.
# Columns are given to aes() as bare names so they are looked up in master2;
# ggplot2 discourages the `master2$column` form inside aes() and warns on it.
scatterplot.Monthlyreview <- ggplot(master2, aes(reviews_per_month, price))
scatterplot.Monthlyreview +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", fill = "grey", color = "red") +
  xlab("Reviews Per Month") +
  ylab("Price") +
  cleanup
## `geom_smooth()` using formula 'y ~ x'
# Minimum Nights
# Minimum nights against log price, with a fitted linear trend line.
# Columns are given to aes() as bare names so they are looked up in master2;
# ggplot2 discourages the `master2$column` form inside aes() and warns on it.
scatterplot.Nights <- ggplot(master2, aes(minimum_nights, price))
scatterplot.Nights +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", color = "red", fill = "grey") +
  xlab("Minimum Nights") +
  ylab("Price") +
  cleanup
## `geom_smooth()` using formula 'y ~ x'
# No of Reviews
# Number of reviews against log price, with a fitted linear trend line.
# Columns are given to aes() as bare names so they are looked up in master2;
# ggplot2 discourages the `master2$column` form inside aes() and warns on it.
scatterplot.Reviews <- ggplot(master2, aes(number_of_reviews, price))
scatterplot.Reviews +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", color = "red", fill = "grey") +
  xlab("No of Reviews") +
  ylab("Price") +
  cleanup
## `geom_smooth()` using formula 'y ~ x'
# Host Listing Count
# Host listing count against log price, with a fitted linear trend line.
# Columns are given to aes() as bare names so they are looked up in master2;
# ggplot2 discourages the `master2$column` form inside aes() and warns on it.
scatterplot.listingcount <- ggplot(
  master2,
  aes(calculated_host_listings_count, price)
)
scatterplot.listingcount +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", color = "red", fill = "grey") +
  xlab("Listing count") +
  ylab("Price") +
  cleanup
## `geom_smooth()` using formula 'y ~ x'
# Availability
# Availability (days per year) against log price, with a linear trend line.
# Columns are given to aes() as bare names so they are looked up in master2;
# ggplot2 discourages the `master2$column` form inside aes() and warns on it.
scatterplot.availability <- ggplot(master2, aes(availability_365, price))
scatterplot.availability +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", color = "red", fill = "black") +
  xlab("Availability") +
  ylab("Price") +
  cleanup
## `geom_smooth()` using formula 'y ~ x'
#Regression Analysis
library(haven)
#we will examine the significance of each variable by looking at the increase in adj Rsquared
# Step 1: availability_365 alone.
model1 <- lm(formula = price ~ availability_365, data = master2)
summary(model1) # model1 is significant (p < 0.05); adj R sq is 0.03
##
## Call:
## lm(formula = price ~ availability_365, data = master2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.73802 -0.43905 -0.00606 0.40191 3.02796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.891e+00 5.780e-03 846.20 <2e-16 ***
## availability_365 8.957e-04 3.418e-05 26.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6442 on 20706 degrees of freedom
## Multiple R-squared: 0.0321, Adjusted R-squared: 0.03206
## F-statistic: 686.8 on 1 and 20706 DF, p-value: < 2.2e-16
summary(model1)$adj.r.squared
## [1] 0.03205733
# Step 2: add calculated_host_listings_count.
model2 <- lm(
  formula = price ~ availability_365 + calculated_host_listings_count,
  data = master2
)
summary(model2) # the added variable is significant (p < 0.05); adj R sq is 0.04
##
## Call:
## lm(formula = price ~ availability_365 + calculated_host_listings_count,
## data = master2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.70213 -0.43520 -0.01348 0.39055 3.02669
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.891e+00 5.762e-03 848.73 <2e-16 ***
## availability_365 7.595e-04 3.605e-05 21.07 <2e-16 ***
## calculated_host_listings_count 2.009e-03 1.740e-04 11.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6421 on 20705 degrees of freedom
## Multiple R-squared: 0.0383, Adjusted R-squared: 0.0382
## F-statistic: 412.2 on 2 and 20705 DF, p-value: < 2.2e-16
summary(model2)$adj.r.squared
## [1] 0.03820301
# Step 3: add minimum_nights.
model3 <- lm(
  formula = price ~ availability_365 + calculated_host_listings_count +
    minimum_nights,
  data = master2
)
summary(model3) # the added variable is significant (p < 0.05); adj R sq is 0.04
##
## Call:
## lm(formula = price ~ availability_365 + calculated_host_listings_count +
## minimum_nights, data = master2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.71052 -0.42899 -0.00368 0.38948 3.02238
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.905e+00 6.063e-03 809.00 < 2e-16 ***
## availability_365 8.238e-04 3.706e-05 22.23 < 2e-16 ***
## calculated_host_listings_count 2.505e-03 1.865e-04 13.43 < 2e-16 ***
## minimum_nights -3.369e-03 4.609e-04 -7.31 2.76e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6413 on 20704 degrees of freedom
## Multiple R-squared: 0.04077, Adjusted R-squared: 0.04063
## F-statistic: 293.3 on 3 and 20704 DF, p-value: < 2.2e-16
summary(model3)$adj.r.squared
## [1] 0.04063288
# Step 4: add number_of_reviews.
model4 <- lm(
  formula = price ~ availability_365 + calculated_host_listings_count +
    minimum_nights + number_of_reviews,
  data = master2
)
summary(model4) # the added variable is significant (p < 0.05); adj R sq is 0.05
##
## Call:
## lm(formula = price ~ availability_365 + calculated_host_listings_count +
## minimum_nights + number_of_reviews, data = master2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.76435 -0.42347 0.00437 0.37703 3.00124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.936e+00 6.485e-03 761.234 <2e-16 ***
## availability_365 9.329e-04 3.779e-05 24.684 <2e-16 ***
## calculated_host_listings_count 2.272e-03 1.866e-04 12.179 <2e-16 ***
## minimum_nights -4.431e-03 4.657e-04 -9.515 <2e-16 ***
## number_of_reviews -1.832e-03 1.368e-04 -13.388 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6386 on 20703 degrees of freedom
## Multiple R-squared: 0.04901, Adjusted R-squared: 0.04882
## F-statistic: 266.7 on 4 and 20703 DF, p-value: < 2.2e-16
summary(model4)$adj.r.squared
## [1] 0.04882163
# Step 5: add reviews_per_month.
model5 <- lm(
  formula = price ~ availability_365 + calculated_host_listings_count +
    minimum_nights + number_of_reviews + reviews_per_month,
  data = master2
)
summary(model5) # model5 is significant overall (p < 0.05), but the added reviews_per_month is not (p = 0.233); adj R sq stays at 0.05
##
## Call:
## lm(formula = price ~ availability_365 + calculated_host_listings_count +
## minimum_nights + number_of_reviews + reviews_per_month, data = master2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.76773 -0.42415 0.00486 0.37682 3.00553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.939e+00 6.878e-03 718.089 <2e-16 ***
## availability_365 9.377e-04 3.801e-05 24.671 <2e-16 ***
## calculated_host_listings_count 2.265e-03 1.866e-04 12.138 <2e-16 ***
## minimum_nights -4.526e-03 4.725e-04 -9.580 <2e-16 ***
## number_of_reviews -1.713e-03 1.694e-04 -10.112 <2e-16 ***
## reviews_per_month -5.318e-03 4.457e-03 -1.193 0.233
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6386 on 20702 degrees of freedom
## Multiple R-squared: 0.04907, Adjusted R-squared: 0.04884
## F-statistic: 213.7 on 5 and 20702 DF, p-value: < 2.2e-16
summary(model5)$adj.r.squared
## [1] 0.0488411
#Comparing the models
anova(model1, model2, model3, model4, model5)
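#Looking at the adjusted R squared values of all the models, reviews_per_month provides the least increase (0.04882 to 0.04884), while minimum_nights raises it from 0.0382 to 0.0406. NOTE(review): the commented-out code below removes minimum_nights instead; confirm which variable was meant to be dropped from the final model.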
#master3= master2[,-11] #removing minimum_nights
#fullmodel= lm(price~ number_of_reviews + reviews_per_month + calculated_host_listings_count + availability_365, data = master3)
#summary(fullmodel)
#adj R squared is 0.04, i.e. the model explains about 4% of the variance in the data. The p value is <0.05, hence the model is significant.
#install.packages("leaps")
#library(leaps)
#data.subset = regsubsets(price~.,master2)
#data.subset.summary = summary(data.subset)
#data.subset.summary
#which.max(data.subset.summary$adjr2)
#Model Validation
# Second screen on master2: Mahalanobis distance of each row from the column
# means, flagged against the chi-square cutoff at p = .001 with one degree of
# freedom per column.
mahal1 <- mahalanobis(
  x = master2,
  center = colMeans(master2),
  cov = cov(master2)
)
cutmahal1 <- qchisq(1 - .001, df = ncol(master2))
badmahal1 <- as.numeric(cutmahal1 < mahal1)
table(badmahal1)
## badmahal1
## 0 1
## 20063 645
k <- 5 # number of IV's
# Leverage of model5 against the (2k + 2) / n cutoff.
leverage1 <- hatvalues(model5)
cutleverage1 <- (2 * k + 2) / nrow(master2)
badleverage1 <- as.numeric(cutleverage1 < leverage1)
table(badleverage1)
## badleverage1
## 0 1
## 18707 2001
# Influence: Cook's distance of model5 against the 4 / (n - k - 1) cutoff.
cooks1 <- cooks.distance(model5)
cutcooks1 <- 4 / (nrow(master2) - k - 1)
badcooks1 <- as.numeric(cutcooks1 < cooks1)
table(badcooks1)
## badcooks1
## 0 1
## 19772 936
# Number of outlier criteria (leverage, Cook's, Mahalanobis) each row fails.
totalout1 <- badleverage1 + badcooks1 + badmahal1
table(totalout1)
## totalout1
## 0 1 2 3
## 18057 1837 697 117
# Keep only the rows flagged by fewer than two of the three criteria.
noout <- subset(master2, totalout1 < 2)
#running the model again after getting rid of outliers
# The response is written as the bare column name rather than noout$price so
# that every term of the formula is resolved through `data`; this keeps the
# fitted model usable with predict(newdata = ...).
model5.2 <- lm(price ~ number_of_reviews + reviews_per_month +
                 calculated_host_listings_count + availability_365 +
                 minimum_nights,
               data = noout)
summary(model5.2)
##
## Call:
## lm(formula = noout$price ~ number_of_reviews + reviews_per_month +
## calculated_host_listings_count + availability_365 + minimum_nights,
## data = noout)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.77145 -0.42542 0.00237 0.37713 2.90844
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.947e+00 7.079e-03 698.882 < 2e-16 ***
## number_of_reviews -1.987e-03 2.032e-04 -9.778 < 2e-16 ***
## reviews_per_month -5.138e-03 5.012e-03 -1.025 0.305
## calculated_host_listings_count 2.254e-03 3.276e-04 6.882 6.08e-12 ***
## availability_365 9.608e-04 3.937e-05 24.403 < 2e-16 ***
## minimum_nights -6.065e-03 6.000e-04 -10.109 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6352 on 19888 degrees of freedom
## Multiple R-squared: 0.04128, Adjusted R-squared: 0.04104
## F-statistic: 171.3 on 5 and 19888 DF, p-value: < 2.2e-16
#Multicollinearity
# correlation = TRUE also prints the correlation matrix of the coefficient
# estimates; TRUE is spelled out because T can be reassigned.
summary(model5.2, correlation = TRUE)
##
## Call:
## lm(formula = noout$price ~ number_of_reviews + reviews_per_month +
## calculated_host_listings_count + availability_365 + minimum_nights,
## data = noout)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.77145 -0.42542 0.00237 0.37713 2.90844
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.947e+00 7.079e-03 698.882 < 2e-16 ***
## number_of_reviews -1.987e-03 2.032e-04 -9.778 < 2e-16 ***
## reviews_per_month -5.138e-03 5.012e-03 -1.025 0.305
## calculated_host_listings_count 2.254e-03 3.276e-04 6.882 6.08e-12 ***
## availability_365 9.608e-04 3.937e-05 24.403 < 2e-16 ***
## minimum_nights -6.065e-03 6.000e-04 -10.109 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6352 on 19888 degrees of freedom
## Multiple R-squared: 0.04128, Adjusted R-squared: 0.04104
## F-statistic: 171.3 on 5 and 19888 DF, p-value: < 2.2e-16
##
## Correlation of Coefficients:
## (Intercept) number_of_reviews
## number_of_reviews -0.10
## reviews_per_month -0.33 -0.60
## calculated_host_listings_count 0.09 0.06
## availability_365 -0.25 -0.10
## minimum_nights -0.42 0.03
## reviews_per_month
## number_of_reviews
## reviews_per_month
## calculated_host_listings_count 0.01
## availability_365 -0.12
## minimum_nights 0.17
## calculated_host_listings_count
## number_of_reviews
## reviews_per_month
## calculated_host_listings_count
## availability_365 -0.23
## minimum_nights -0.44
## availability_365
## number_of_reviews
## reviews_per_month
## calculated_host_listings_count
## availability_365
## minimum_nights -0.26
#Test for Assumptions
# Studentized residuals and standardized fitted values of model5.2.
standardized2 <- rstudent(model5.2)
fitted2 <- scale(fitted(model5.2))
#Test for Linearity (graphical test)
# Normal Q-Q plot with a reference line of intercept 0 and slope 1.
qqnorm(standardized2)
abline(a = 0, b = 1)
#Linearity (formal test)
resettest(model5.2)
##
## RESET test
##
## data: model5.2
## RESET = 2.7582, df1 = 2, df2 = 19886, p-value = 0.06343
#Test for Normality
hist(standardized2)
#Test for Homogeneity/Homoscedasticity
# Standardized fitted values against studentized residuals, with reference
# lines through 0 on both axes.
plot(fitted2, standardized2)
abline(h = 0)
abline(v = 0)