Sameer Mathur
Simple Linear / Log-linear Regression Model
# Load the hotel data into the `allCities.df` data frame.
# `stringsAsFactors = TRUE` is set explicitly so that text columns such as
# HasSwimmingPool / FreeWifi are read as factors on R >= 4.0 too, where the
# default became FALSE. The wrapping paste() around the single file name was a
# no-op and has been dropped.
allCities.df <- read.csv("NonTouristData.csv", stringsAsFactors = TRUE)
# number of rows and columns
dim(allCities.df)
[1] 4360 20
# list the column names of the data frame
names(allCities.df)
[1] "CityName" "Population" "CityRank"
[4] "IsMetroCity" "IsTouristDestination" "IsWeekend"
[7] "IsNewYearEve" "Date" "HotelName"
[10] "RoomRent" "StarRating" "Airport"
[13] "HotelAddress" "HotelPincode" "HotelDescription"
[16] "FreeWifi" "FreeBreakfast" "HotelCapacity"
[19] "HasSwimmingPool" "CostOfLivingIndex"
# attaching data columns of the dataframe
# NOTE(review): attach() places the columns on the search path. Every model
# fitted in the visible code passes `data = allCities.df` explicitly, so this
# call looks unnecessary -- confirm nothing else relies on bare column names
# before removing it.
attach(allCities.df)
# descriptive statistics; keep only the first five summary columns
library(psych)
describe(allCities.df)[, c("vars", "n", "mean", "sd", "median")]
vars n mean sd median
CityName* 1 4360 7.97 5.45 8.00
Population 2 4360 4252615.64 2906992.61 3124458.00
CityRank 3 4360 7.99 9.01 4.00
IsMetroCity* 4 4360 1.10 0.29 1.00
IsTouristDestination* 5 4360 1.00 0.00 1.00
IsWeekend* 6 4360 1.62 0.48 2.00
IsNewYearEve* 7 4360 1.12 0.33 1.00
Date* 8 4360 4.50 2.29 4.50
HotelName* 9 4360 290.25 166.58 285.00
RoomRent 10 4360 4313.05 3845.28 3500.00
StarRating 11 4360 3.56 0.75 3.00
Airport 12 4360 17.95 21.18 12.00
HotelAddress* 13 4360 378.78 189.18 402.00
HotelPincode 14 4360 483180.70 162057.18 500032.00
HotelDescription* 15 4360 185.69 118.71 178.00
FreeWifi* 16 4360 1.95 0.22 2.00
FreeBreakfast* 17 4360 1.67 0.47 2.00
HotelCapacity 18 4360 70.25 70.20 41.00
HasSwimmingPool* 19 4360 1.35 0.48 1.00
CostOfLivingIndex 20 4360 26.87 2.46 27.21
Regress Room Rent on Hotel Capacity.
# formula: room rent explained by hotel capacity alone
SimOLSModel <- RoomRent ~ HotelCapacity
# ordinary least squares fit of the simple linear model
fitSimOLSModel <- lm(formula = SimOLSModel, data = allCities.df)
# coefficient table and goodness-of-fit statistics
summary(object = fitSimOLSModel)
Call:
lm(formula = SimOLSModel, data = allCities.df)
Residuals:
Min 1Q Median 3Q Max
-4511 -1608 -824 447 50335
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3161.3968 78.6231 40.21 <2e-16 ***
HotelCapacity 16.3947 0.7918 20.71 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3669 on 4358 degrees of freedom
Multiple R-squared: 0.08957, Adjusted R-squared: 0.08936
F-statistic: 428.8 on 1 and 4358 DF, p-value: < 2.2e-16
Regress the log of Room Rent on Hotel Capacity.
# formula: log of room rent explained by hotel capacity alone
SimLogOLSModel <- log(RoomRent) ~ HotelCapacity
# ordinary least squares fit of the simple log-linear model
fitSimLogOLSModel <- lm(formula = SimLogOLSModel, data = allCities.df)
# coefficient table and goodness-of-fit statistics
summary(object = fitSimLogOLSModel)
Call:
lm(formula = SimLogOLSModel, data = allCities.df)
Residuals:
Min 1Q Median 3Q Max
-1.65920 -0.33306 -0.05096 0.27714 2.47340
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.8807376 0.0113578 693.86 <2e-16 ***
HotelCapacity 0.0040417 0.0001144 35.34 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.5301 on 4358 degrees of freedom
Multiple R-squared: 0.2227, Adjusted R-squared: 0.2225
F-statistic: 1249 on 1 and 4358 DF, p-value: < 2.2e-16
# formula: log of room rent explained by log of hotel capacity
SimLogLogOLSModel <- log(RoomRent) ~ log(HotelCapacity)
# ordinary least squares fit of the simple log-log model
fitSimLogLogOLSModel <- lm(formula = SimLogLogOLSModel, data = allCities.df)
# coefficient table and goodness-of-fit statistics
summary(object = fitSimLogLogOLSModel)
Call:
lm(formula = SimLogLogOLSModel, data = allCities.df)
Residuals:
Min 1Q Median 3Q Max
-1.79446 -0.33758 -0.05757 0.25203 2.34295
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.064803 0.031949 221.12 <2e-16 ***
log(HotelCapacity) 0.290488 0.008168 35.56 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.5293 on 4358 degrees of freedom
Multiple R-squared: 0.2249, Adjusted R-squared: 0.2247
F-statistic: 1265 on 1 and 4358 DF, p-value: < 2.2e-16
Simple Linear OLS Model
# adjusted R-squared of the simple linear OLS model
summary(fitSimOLSModel)[["adj.r.squared"]]
[1] 0.0893634
# Akaike information criterion of the simple linear OLS model
stats::AIC(fitSimOLSModel)
[1] 83949.13
# Bayesian information criterion of the simple linear OLS model
stats::BIC(fitSimOLSModel)
[1] 83968.27
# -2 x log-likelihood of the simple linear OLS model
# (the value is still printed with the 'log Lik.' label of the logLik class,
# but it is -2 times the log-likelihood, not the log-likelihood itself)
-2 * logLik(fitSimOLSModel)
'log Lik.' 83943.13 (df=3)
Simple Log-Linear OLS Model
# adjusted R-squared of the simple log-linear OLS model
summary(fitSimLogOLSModel)[["adj.r.squared"]]
[1] 0.2225307
# Akaike information criterion of the simple log-linear OLS model
# NOTE(review): the response here is log(RoomRent), so this value is on a
# different scale from the AIC of the models fitted to RoomRent itself.
stats::AIC(fitSimLogOLSModel)
[1] 6842.379
# Bayesian information criterion of the simple log-linear OLS model
stats::BIC(fitSimLogOLSModel)
[1] 6861.519
# -2 x log-likelihood of the simple log-linear OLS model
# (the value is still printed with the 'log Lik.' label of the logLik class,
# but it is -2 times the log-likelihood, not the log-likelihood itself)
-2 * logLik(fitSimLogOLSModel)
'log Lik.' 6836.379 (df=3)
Regress Room Rent on the following variables:
# formula: room rent explained by hotel, booking-date and city characteristics
MulOLSModel <- RoomRent ~
  StarRating + HotelCapacity + Airport +
  HasSwimmingPool + FreeWifi + FreeBreakfast +
  IsWeekend + Population + CostOfLivingIndex
# ordinary least squares fit of the multiple linear model
fitMulOLSModel <- lm(formula = MulOLSModel, data = allCities.df)
# coefficient table and goodness-of-fit statistics
summary(object = fitMulOLSModel)
Call:
lm(formula = MulOLSModel, data = allCities.df)
Residuals:
Min 1Q Median 3Q Max
-6599 -1419 -516 666 49035
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.746e+03 7.335e+02 2.381 0.0173 *
StarRating 1.959e+03 1.007e+02 19.460 < 2e-16 ***
HotelCapacity -3.573e-01 1.096e+00 -0.326 0.7445
Airport 6.061e+00 2.416e+00 2.508 0.0122 *
HasSwimmingPoolYes 1.183e+03 1.473e+02 8.033 1.21e-15 ***
FreeWifiYes 3.228e+02 2.289e+02 1.410 0.1585
FreeBreakfastYes 1.468e+02 1.131e+02 1.298 0.1944
IsWeekendYes -6.932e+01 1.038e+02 -0.668 0.5043
Population -8.768e-05 1.984e-05 -4.418 1.02e-05 ***
CostOfLivingIndex -1.821e+02 2.315e+01 -7.868 4.52e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3318 on 4350 degrees of freedom
Multiple R-squared: 0.2568, Adjusted R-squared: 0.2553
F-statistic: 167 on 9 and 4350 DF, p-value: < 2.2e-16
Regress the log of Room Rent on the following variables:
# formula: log of room rent explained by hotel, booking-date and city
# characteristics
MulLogOLSModel <- log(RoomRent) ~
  StarRating + HotelCapacity + Airport +
  HasSwimmingPool + FreeWifi + FreeBreakfast +
  IsWeekend + Population + CostOfLivingIndex
# ordinary least squares fit of the multiple log-linear model
fitMulLogOLSModel <- lm(formula = MulLogOLSModel, data = allCities.df)
# coefficient table and goodness-of-fit statistics
summary(object = fitMulLogOLSModel)
Call:
lm(formula = MulLogOLSModel, data = allCities.df)
Residuals:
Min 1Q Median 3Q Max
-1.51924 -0.29383 -0.05229 0.22530 2.15117
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.857e+00 1.003e-01 68.388 < 2e-16 ***
StarRating 3.592e-01 1.376e-02 26.105 < 2e-16 ***
HotelCapacity 1.049e-03 1.499e-04 7.003 2.90e-12 ***
Airport 1.863e-03 3.303e-04 5.640 1.81e-08 ***
HasSwimmingPoolYes 2.199e-01 2.014e-02 10.922 < 2e-16 ***
FreeWifiYes -6.484e-03 3.129e-02 -0.207 0.836
FreeBreakfastYes 9.066e-02 1.546e-02 5.863 4.87e-09 ***
IsWeekendYes -1.703e-02 1.419e-02 -1.200 0.230
Population -3.019e-08 2.713e-09 -11.128 < 2e-16 ***
CostOfLivingIndex -2.670e-03 3.164e-03 -0.844 0.399
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4536 on 4350 degrees of freedom
Multiple R-squared: 0.4318, Adjusted R-squared: 0.4307
F-statistic: 367.4 on 9 and 4350 DF, p-value: < 2.2e-16
Multiple Linear OLS Model
# adjusted R-squared of the multiple linear OLS model
summary(fitMulOLSModel)[["adj.r.squared"]]
[1] 0.2552941
# AIC of the multiple linear OLS model
# (the previous comment wrongly referred to the simple model)
AIC(fitMulOLSModel)
[1] 83080.08
# BIC of the multiple linear OLS model
# (the previous comment wrongly referred to the simple model)
BIC(fitMulOLSModel)
[1] 83150.26
# -2 x log-likelihood of the multiple linear OLS model
# (the value is still printed with the 'log Lik.' label of the logLik class,
# but it is -2 times the log-likelihood, not the log-likelihood itself)
-2 * logLik(fitMulOLSModel)
'log Lik.' 83058.08 (df=11)
Multiple Log-Linear OLS Model
# adjusted R-squared of the multiple log-linear OLS model
summary(fitMulLogOLSModel)[["adj.r.squared"]]
[1] 0.4306695
# AIC of the multiple log-linear OLS model
# (the previous comment wrongly referred to the simple model)
# NOTE(review): the response here is log(RoomRent), so this value is on a
# different scale from the AIC of the models fitted to RoomRent itself.
AIC(fitMulLogOLSModel)
[1] 5491.866
# BIC of the multiple log-linear OLS model
# (the previous comment wrongly referred to the simple model)
BIC(fitMulLogOLSModel)
[1] 5562.048
# -2 x log-likelihood of the multiple log-linear OLS model
# (the value is still printed with the 'log Lik.' label of the logLik class,
# but it is -2 times the log-likelihood, not the log-likelihood itself)
-2 * logLik(fitMulLogOLSModel)
'log Lik.' 5469.866 (df=11)
# 95% confidence intervals of the multiple log-linear coefficients, rounded to
# four decimals. The Population coefficient is of order 1e-08, so both of its
# bounds print as 0.0000 at this precision.
round(confint(object = fitMulLogOLSModel), digits = 4)
2.5 % 97.5 %
(Intercept) 6.6608 7.0540
StarRating 0.3322 0.3862
HotelCapacity 0.0008 0.0013
Airport 0.0012 0.0025
HasSwimmingPoolYes 0.1805 0.2594
FreeWifiYes -0.0678 0.0549
FreeBreakfastYes 0.0603 0.1210
IsWeekendYes -0.0448 0.0108
Population 0.0000 0.0000
CostOfLivingIndex -0.0089 0.0035
# plot the multiple log-linear coefficients (intercept omitted) with an outer
# interval of 1.96 standard errors
library(coefplot)
coefplot(
  fitMulLogOLSModel,
  lwdOuter = 1.5,
  outerCI = 1.96,
  intercept = FALSE
)
Selecting a better regression model with stepwise selection using step().
# starting point for the search: linear model with all nine candidate predictors
fullModel <- lm(
  RoomRent ~ StarRating + HotelCapacity + Airport +
    HasSwimmingPool + FreeWifi + FreeBreakfast +
    IsWeekend + Population + CostOfLivingIndex,
  data = allCities.df
)
# stepwise selection starting from the full model, run silently (trace = 0)
stepModelSel <- step(object = fullModel, trace = 0, steps = 1000)
# summarise the selected model and print the summary
stepModelSummary <- summary(object = stepModelSel)
stepModelSummary
Call:
lm(formula = RoomRent ~ StarRating + Airport + HasSwimmingPool +
FreeWifi + Population + CostOfLivingIndex, data = allCities.df)
Residuals:
Min 1Q Median 3Q Max
-6440 -1436 -504 676 49062
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.915e+03 6.922e+02 2.766 0.00569 **
StarRating 1.928e+03 8.819e+01 21.859 < 2e-16 ***
Airport 5.779e+00 2.390e+00 2.418 0.01563 *
HasSwimmingPoolYes 1.164e+03 1.395e+02 8.345 < 2e-16 ***
FreeWifiYes 3.769e+02 2.256e+02 1.671 0.09476 .
Population -8.660e-05 1.919e-05 -4.512 6.59e-06 ***
CostOfLivingIndex -1.848e+02 2.282e+01 -8.098 7.18e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3318 on 4353 degrees of freedom
Multiple R-squared: 0.2564, Adjusted R-squared: 0.2554
F-statistic: 250.2 on 6 and 4353 DF, p-value: < 2.2e-16