library(MASS)
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
data(Boston)
View(Boston)
# We will split the data into training and testing sets
set.seed(2)
library(caTools) # sample.split function is present in this package
split <- sample.split(Boston$medv,SplitRatio = 0.7)
# We divided the data with the ratio 0.7(70:30)
split
## [1] TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE
## [12] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE
## [23] FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE
## [34] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE
## [45] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE
## [56] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [67] FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE
## [78] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [89] TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE
## [100] FALSE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE
## [111] TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
## [122] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## [144] TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
## [155] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [166] TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE TRUE TRUE
## [177] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [188] FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [199] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [210] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [221] TRUE TRUE FALSE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [232] TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE
## [243] TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE TRUE FALSE TRUE
## [254] TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [265] TRUE TRUE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE TRUE
## [276] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [287] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [298] TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE
## [309] FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE
## [320] FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [331] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [342] TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE
## [353] FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
## [364] FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
## [375] TRUE FALSE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE FALSE
## [386] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [397] TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [408] TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE
## [419] TRUE FALSE TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE
## [430] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [441] FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
## [452] TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [463] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE
## [474] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [485] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [496] TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE TRUE
trainingdata <- subset(Boston,split=="TRUE")
testingdata <- subset(Boston, split=="FALSE")
# To view the correlation of variables
plot(Boston$crim, Boston$medv, cex = 0.5, xlab = "Crime rate", ylab = "Price")

cr <- cor(Boston)
# Creating Scatterplot matrix
attach(Boston)
library(lattice)
splom(~Boston[c(1:6,14)],groups = NULL, data = Boston,axis.line.tck = 0, axis.text.alpha = 0)

splom(~Boston[c(7:14)],groups = NULL, data = Boston,axis.line.tck = 0, axis.text.alpha = 0)

# Studying rm and medv
plot(rm,medv)
abline(lm(medv~rm), col = "red") # regression fit line

# We can use corrplot to visualize
library(corrplot)
corrplot(cr, type = "lower")

corrplot(cr, method = "number")

# Finding multicolinearlity
#install.packages("caret")
library(caret)
# To exclude medv(Output)
Boston_a = subset(Boston, select = -c(medv))
numericData <- Boston_a[sapply(Boston_a, is.numeric)]
descrCor <- cor(numericData)
# Vif
#install.packages("car")
library(car)
## Loading required package: carData
model <- lm(medv~.,data = trainingdata)
vif(model)
## crim zn indus chas nox rm age dis
## 1.654454 2.300741 3.990279 1.072913 4.506525 1.820852 2.995407 3.897520
## rad tax ptratio black lstat
## 7.120702 8.577992 1.997447 1.365848 2.641099
# Now to create the model we will use all columns
model <- lm(medv~.,data = trainingdata)
#or
model <- lm(medv~ crim + zn + indus + chas + nox + rm + age + dis + rad +
tax + ptratio + black + lstat, data = trainingdata)
# For descripion of the model
summary(model)
##
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + nox + rm + age +
## dis + rad + tax + ptratio + black + lstat, data = trainingdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.4036 -2.8472 -0.5166 1.8768 24.3219
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.975065 6.171855 6.801 4.45e-11 ***
## crim -0.124709 0.036198 -3.445 0.000639 ***
## zn 0.060213 0.016620 3.623 0.000334 ***
## indus 0.007313 0.073690 0.099 0.921005
## chas 2.366739 1.074791 2.202 0.028308 *
## nox -21.354016 4.722318 -4.522 8.38e-06 ***
## rm 3.483738 0.487656 7.144 5.23e-12 ***
## age -0.006623 0.015952 -0.415 0.678241
## dis -1.859822 0.245499 -7.576 3.17e-13 ***
## rad 0.337442 0.076949 4.385 1.53e-05 ***
## tax -0.012126 0.004312 -2.812 0.005201 **
## ptratio -0.886151 0.164662 -5.382 1.35e-07 ***
## black 0.008751 0.003099 2.824 0.005009 **
## lstat -0.590932 0.060140 -9.826 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.914 on 353 degrees of freedom
## Multiple R-squared: 0.7408, Adjusted R-squared: 0.7312
## F-statistic: 77.6 on 13 and 353 DF, p-value: < 2.2e-16
# Model creation after removing tax
model <- lm(medv~ crim + zn + indus + chas + rm + rad + age + dis +
nox + ptratio + black + lstat, data = trainingdata)
summary(model)
##
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + rm + rad + age +
## dis + nox + ptratio + black + lstat, data = trainingdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.9490 -2.9538 -0.4061 1.9077 24.3331
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.509132 6.209497 6.524 2.37e-10 ***
## crim -0.124265 0.036549 -3.400 0.000751 ***
## zn 0.050394 0.016406 3.072 0.002294 **
## indus -0.084775 0.066653 -1.272 0.204248
## chas 2.725358 1.077557 2.529 0.011866 *
## rm 3.561410 0.491599 7.245 2.73e-12 ***
## rad 0.172407 0.050245 3.431 0.000672 ***
## age -0.006871 0.016106 -0.427 0.669934
## dis -1.873322 0.247834 -7.559 3.52e-13 ***
## nox -22.358206 4.754506 -4.703 3.69e-06 ***
## ptratio -0.925909 0.165647 -5.590 4.55e-08 ***
## black 0.009015 0.003127 2.883 0.004182 **
## lstat -0.591380 0.060723 -9.739 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.962 on 354 degrees of freedom
## Multiple R-squared: 0.735, Adjusted R-squared: 0.726
## F-statistic: 81.81 on 12 and 354 DF, p-value: < 2.2e-16
# Model after removing indus and age
model <- lm(medv~ crim + zn + chas + rm + rad + nox + dis +
ptratio + black + lstat, data = trainingdata)
summary(model)
##
## Call:
## lm(formula = medv ~ crim + zn + chas + rm + rad + nox + dis +
## ptratio + black + lstat, data = trainingdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.0923 -3.0470 -0.4139 1.8288 24.0622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.350201 6.199887 6.508 2.59e-10 ***
## crim -0.122429 0.036489 -3.355 0.000878 ***
## zn 0.051728 0.016282 3.177 0.001618 **
## chas 2.603143 1.073338 2.425 0.015793 *
## rm 3.637570 0.476458 7.635 2.10e-13 ***
## rad 0.166086 0.049779 3.336 0.000938 ***
## nox -24.618054 4.363543 -5.642 3.44e-08 ***
## dis -1.770642 0.230796 -7.672 1.64e-13 ***
## ptratio -0.962238 0.163382 -5.889 8.97e-09 ***
## black 0.009152 0.003120 2.934 0.003566 **
## lstat -0.608516 0.057900 -10.510 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.961 on 356 degrees of freedom
## Multiple R-squared: 0.7336, Adjusted R-squared: 0.7261
## F-statistic: 98.04 on 10 and 356 DF, p-value: < 2.2e-16
# Now we can use this model to predictthe output of test set
predic <- predict(model, testingdata)
predic
## 2 4 5 8 9 19 21
## 24.637885 28.120007 27.223937 18.177974 9.315073 16.120446 11.601768
## 23 24 26 27 31 34 35
## 14.953530 12.849702 12.669747 14.772988 10.254794 13.571315 12.778319
## 36 42 44 49 53 56 67
## 23.787292 27.658729 24.122106 6.694142 27.158729 30.356296 26.602764
## 72 74 80 87 90 94 97
## 22.112220 24.566838 23.900349 21.826867 31.370604 29.474275 25.308109
## 100 104 106 109 113 114 119
## 33.051929 21.191388 19.389263 23.645461 21.830261 21.607072 21.456191
## 123 130 131 132 133 134 135
## 18.527226 14.617165 20.900314 20.269633 21.002661 16.429059 13.792913
## 139 143 145 148 149 150 151
## 14.188082 12.415790 6.514073 6.251079 7.498174 13.084246 19.626771
## 164 167 170 171 173 174 178
## 42.730411 38.243848 27.255762 22.963529 23.000432 29.412755 29.535834
## 186 188 189 190 195 202 210
## 23.815333 35.616302 34.599820 36.516617 32.527639 30.979728 15.788309
## 223 224 226 228 230 235 236
## 31.740896 29.666855 39.755511 32.375220 31.524563 31.401067 24.913628
## 240 241 242 244 246 247 249
## 28.474929 26.900270 23.393973 27.442134 12.269774 19.571972 20.820488
## 252 255 256 268 270 271 273
## 25.223711 24.707964 22.255740 40.365087 25.027978 21.631734 28.305465
## 278 293 303 306 309 313 316
## 35.028950 32.385131 29.245806 30.153400 28.987814 23.271408 20.283444
## 320 321 330 334 340 344 346
## 20.889442 24.750349 26.191223 20.708619 20.014138 28.643092 16.682309
## 348 351 353 355 356 358 359
## 25.934847 21.094337 17.672104 14.272041 16.704564 22.408066 22.009226
## 364 371 374 376 380 382 385
## 20.088070 36.346777 5.335506 25.704552 16.725386 18.359808 2.524664
## 398 399 407 409 415 416 418
## 16.388529 5.359530 8.291356 13.966953 -5.954831 8.696874 6.338836
## 420 423 425 426 441 442 449
## 14.029159 19.486910 15.626345 9.536133 11.969379 16.791069 17.298255
## 453 459 471 473 477 487 494
## 18.451114 17.004866 20.869799 23.331980 20.950448 20.289816 21.315682
## 497 498 499 502 503 505
## 13.854404 19.443915 21.840282 23.492489 22.516375 26.331246
# To compare predicted values and actual values, we can use plots
plot(testingdata$medv, type = "l", lty = 1.8, col = "green")
lines(predic, type = "l", col = "blue")
