Consider the data set ‘airbnbdata.csv’. This data set is a simplified version of the Kaggle data set (https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data).
We are trying to predict the per-night price of Airbnb listings in NYC.
Variables are as follows:
- id: listing ID
- neighbourhood_group: neighborhood in NYC
- room_type: listing space type
- price: price in dollars
- minimum_nights: minimum number of nights required
- number_of_reviews: number of reviews

Please separate the testing and training sets:
- set a seed
- separate 15% of your data into the testing set
airbnb=read.table("https://unh.box.com/shared/static/nq5382z6ek8js3we09b9ma1qa3wpr2g0.csv", header = TRUE, sep=",", dec=".")
head(airbnb)
## id neighbourhood_group room_type price minimum_nights
## 1 2539 Brooklyn Private room 149 1
## 2 2595 Manhattan Entire home/apt 225 1
## 3 3647 Manhattan Private room 150 3
## 4 3831 Brooklyn Entire home/apt 89 1
## 5 5022 Manhattan Entire home/apt 80 10
## 6 5099 Manhattan Entire home/apt 200 3
## number_of_reviews
## 1 9
## 2 45
## 3 0
## 4 270
## 5 9
## 6 74
set.seed(123)
# keep 85% of the rows for training; the remaining 15% form the testing set
size_airbnb=floor(0.85 * nrow(airbnb))
index_airbnb=sample(seq_len(nrow(airbnb)), size = size_airbnb)
train_airbnb=airbnb[index_airbnb,]
test_airbnb=airbnb[-index_airbnb,]
c(nrow(airbnb), nrow(train_airbnb), nrow(test_airbnb))
## [1] 1800 1530 270
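As a quick sanity check (not required by the assignment), the two subsets should be disjoint and together account for every row:
# optional check: no row appears in both sets, and the sizes add up
length(intersect(rownames(train_airbnb), rownames(test_airbnb)))  # expect 0
nrow(train_airbnb) + nrow(test_airbnb) == nrow(airbnb)            # expect TRUE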
For questions 2 to 6, use the training set.
Explore the relationship between price (the dependent variable) and number_of_reviews, and between price and minimum_nights, by obtaining the scatter plots and the respective correlations. Hint: plot and cor
plot(airbnb[,"number_of_reviews"], airbnb[,"price"], col="blue")
x=cbind(Price=airbnb[,"price"], Number_of_Reviews=airbnb[,"number_of_reviews"])
cor(x)
## Price Number_of_Reviews
## Price 1.00000000 -0.09568711
## Number_of_Reviews -0.09568711 1.00000000
plot(airbnb[,"minimum_nights"], airbnb[,"price"], col="blue")
x=cbind(Price=airbnb[,"price"], Minimum_nights=airbnb[,"minimum_nights"])
cor(x)
## Price Minimum_nights
## Price 1.000000000 -0.004730418
## Minimum_nights -0.004730418 1.000000000
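As an optional sketch, cor.test can check whether these weak correlations differ significantly from zero (using the same airbnb data as above):
cor.test(airbnb[,"price"], airbnb[,"number_of_reviews"])
cor.test(airbnb[,"price"], airbnb[,"minimum_nights"])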
Explore the relationship between price (the dependent variable) and neighbourhood_group by obtaining the box plot. Hint: boxplot
library(ggplot2)
ggplot(airbnb, aes(x=neighbourhood_group, y=price)) +
geom_boxplot(outlier.colour="red", outlier.shape=8,
outlier.size=4)
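As an optional companion to the box plot (not part of the assignment), the median price per neighbourhood group can be tabulated:
# median nightly price by neighbourhood group
tapply(airbnb[,"price"], airbnb[,"neighbourhood_group"], median)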
Fit simple linear regressions between price (as the dependent variable) and each of the following variables: minimum_nights, neighbourhood_group, and room_type (recall to specify categorical variables as a factor) separately. Report your regression models.
model1=lm(price~minimum_nights, data=train_airbnb)
summary(model1)
##
## Call:
## lm(formula = price ~ minimum_nights, data = train_airbnb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -136.51 -71.47 -30.71 38.55 2843.82
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 156.7684 3.5808 43.780 <2e-16 ***
## minimum_nights -0.0845 0.1353 -0.625 0.532
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 131.3 on 1528 degrees of freedom
## Multiple R-squared: 0.0002553, Adjusted R-squared: -0.000399
## F-statistic: 0.3902 on 1 and 1528 DF, p-value: 0.5323
ggplot(train_airbnb, aes(x=minimum_nights, y=price)) +
geom_point()+
geom_smooth(method=lm)
## `geom_smooth()` using formula 'y ~ x'
model2=lm(price~as.factor(neighbourhood_group), data=train_airbnb)
summary(model2)
##
## Call:
## lm(formula = price ~ as.factor(neighbourhood_group), data = train_airbnb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -138.86 -68.76 -27.48 27.52 2825.14
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 74.35 28.97 2.567
## as.factor(neighbourhood_group)Brooklyn 73.13 29.38 2.489
## as.factor(neighbourhood_group)Manhattan 100.51 29.38 3.421
## as.factor(neighbourhood_group)Queens 24.43 32.09 0.761
## as.factor(neighbourhood_group)Staten Island 46.59 42.74 1.090
## Pr(>|t|)
## (Intercept) 0.01036 *
## as.factor(neighbourhood_group)Brooklyn 0.01291 *
## as.factor(neighbourhood_group)Manhattan 0.00064 ***
## as.factor(neighbourhood_group)Queens 0.44654
## as.factor(neighbourhood_group)Staten Island 0.27579
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 129.5 on 1525 degrees of freedom
## Multiple R-squared: 0.02821, Adjusted R-squared: 0.02566
## F-statistic: 11.07 on 4 and 1525 DF, p-value: 7.511e-09
ggplot(train_airbnb, aes(x=as.factor(neighbourhood_group), y=price)) +
geom_point()+
geom_smooth(method=lm)
## `geom_smooth()` using formula 'y ~ x'
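A possible extension (not required): the reference category in model2 is the alphabetically first neighbourhood (here the Bronx), so each coefficient is a price difference from it; relevel can be used to pick a different baseline, e.g. Manhattan, if that comparison is more natural.
# sketch: refit model2 with Manhattan as the reference category
ng=relevel(as.factor(train_airbnb$neighbourhood_group), ref="Manhattan")
model2_ref=lm(price~ng, data=train_airbnb)
summary(model2_ref)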
model3=lm(price~as.factor(room_type), data=train_airbnb)
summary(model3)
##
## Call:
## lm(formula = price ~ as.factor(room_type), data = train_airbnb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.23 -52.23 -21.55 19.45 2899.45
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 192.234 4.061 47.33 < 2e-16 ***
## as.factor(room_type)Private room -91.680 6.506 -14.09 < 2e-16 ***
## as.factor(room_type)Shared room -85.296 31.129 -2.74 0.00621 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 123.5 on 1527 degrees of freedom
## Multiple R-squared: 0.1164, Adjusted R-squared: 0.1152
## F-statistic: 100.6 on 2 and 1527 DF, p-value: < 2.2e-16
ggplot(train_airbnb, aes(x=as.factor(room_type), y=price)) +
geom_point()+
geom_smooth(method=lm)
## `geom_smooth()` using formula 'y ~ x'
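Optionally, 95% confidence intervals for the room-type effects can be reported alongside the point estimates:
confint(model3, level=0.95)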
Fit a multiple linear regression between price (as the dependent variable) and the following variables: minimum_nights, neighbourhood_group, and room_type. Report your regression model.
mlr=lm(price~.-number_of_reviews-id, data=train_airbnb)
summary(mlr)
##
## Call:
## lm(formula = price ~ . - number_of_reviews - id, data = train_airbnb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -158.34 -52.93 -20.31 15.88 2881.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 124.2764 27.5981 4.503 7.21e-06 ***
## neighbourhood_groupBrooklyn 60.3083 27.7634 2.172 0.02999 *
## neighbourhood_groupManhattan 85.1011 27.7813 3.063 0.00223 **
## neighbourhood_groupQueens 32.7913 30.3251 1.081 0.27972
## neighbourhood_groupStaten Island 60.9064 40.3666 1.509 0.13155
## room_typePrivate room -88.9131 6.5400 -13.595 < 2e-16 ***
## room_typeShared room -88.6173 30.9044 -2.867 0.00419 **
## minimum_nights -0.2593 0.1267 -2.047 0.04083 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 122.3 on 1522 degrees of freedom
## Multiple R-squared: 0.1353, Adjusted R-squared: 0.1313
## F-statistic: 34.02 on 7 and 1522 DF, p-value: < 2.2e-16
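As an illustrative sketch (the listing characteristics below are made up), the fitted multiple regression can be used to predict the price of a single hypothetical listing:
# hypothetical listing; only the predictors used in mlr are needed
new_listing=data.frame(neighbourhood_group="Manhattan",
                       room_type="Entire home/apt",
                       minimum_nights=2)
predict(mlr, newdata=new_listing)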
#Autocorrelation:
library(car)   # provides durbinWatsonTest() and vif()
durbinWatsonTest(mlr)
## lag Autocorrelation D-W Statistic p-value
## 1 0.04444142 1.910738 0.09
## Alternative hypothesis: rho != 0
#Constant Variance:
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(mlr)
##
## studentized Breusch-Pagan test
##
## data: mlr
## BP = 2.4469, df = 7, p-value = 0.931
#Near-perfect multicollinearity:
vif(mlr)
## GVIF Df GVIF^(1/(2*Df))
## neighbourhood_group 1.026291 4 1.003249
## room_type 1.032664 2 1.008068
## minimum_nights 1.009938 1 1.004957
For autocorrelation (Durbin-Watson test), the p-value (0.09) is above the usual 0.05 threshold, so we fail to reject the null hypothesis of no autocorrelation at the 5% level; there is only weak evidence that the residuals are auto-correlated.
For constant variance (Breusch-Pagan test), since the p-value is large, we fail to reject the null of homoscedasticity; hence the assumption of constant variance holds.
For multicollinearity, all GVIF values are close to 1, so there is no sign of near-perfect multicollinearity among the predictors.
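An optional visual check of the residuals (residuals vs. fitted values and a normal Q-Q plot) complements the formal tests above:
par(mfrow=c(1,2))
plot(mlr, which=1:2)   # base-R diagnostic plots for the fitted model
par(mfrow=c(1,1))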
Which model predicts price better?
Model Prediction
pred_1=predict(model1, newdata=test_airbnb)
pred_2=predict(model2, newdata=test_airbnb)
pred_3=predict(model3, newdata=test_airbnb)
pred_4=predict(mlr, newdata=test_airbnb)
prediction=data.frame(obs=seq(1,nrow(test_airbnb)), price=test_airbnb[,"price"], pred_1, pred_2, pred_3, pred_4)  # data frame so ggplot can use it
head(prediction)
## obs price pred_1 pred_2 pred_3 pred_4
## 3 1 150 156.5149 174.8575 100.5542 119.68652
## 7 2 60 152.9660 147.4765 100.5542 84.00337
## 15 3 120 149.1636 174.8575 192.2338 186.04086
## 21 4 299 156.5149 147.4765 192.2338 183.80686
## 22 5 130 156.5994 147.4765 100.5542 95.15309
## 43 6 120 156.1769 147.4765 100.5542 93.85661
# note: color= inside aes() creates legend labels; the drawn colors come from the default palette
ggplot(prediction, aes(x=obs)) +
geom_line(aes(y = price)) +
geom_line(aes(y = pred_1, color="red")) +
geom_line(aes(y = pred_2, color="blue")) +
geom_line(aes(y = pred_3, color="green")) +
geom_line(aes(y = pred_4, color="black"))
Testing Accuracy
mse1_t=mean((test_airbnb[,"price"]-pred_1)^2)
mse2_t=mean((test_airbnb[,"price"]-pred_2)^2)
mse3_t=mean((test_airbnb[,"price"]-pred_3)^2)
mse4_t=mean((test_airbnb[,"price"]-pred_4)^2)
Models=c("Model 1", "Model 2", "Model 3", "Model 4")
MSE=c(mse1_t, mse2_t, mse3_t, mse4_t)
MSE=round(MSE, 4)
df=rbind(Models, MSE)
library(knitr)
library(kableExtra)
kable(df) %>%
kable_styling("striped")
| Models | Model 1 | Model 2 | Model 3 | Model 4 |
|--------|---------|---------|---------|---------|
| MSE | 39664.9611 | 38207.7849 | 36830.6585 | 35971.7024 |
Based on the lowest test mean squared error, the fourth model (the multiple linear regression) predicts price better than the other three.
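As an optional addition, taking the square root of the MSE gives the RMSE, which is easier to interpret because it is on the same dollar scale as price:
RMSE=round(sqrt(MSE), 2)
rbind(Models, RMSE)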