Intro To Linear Regression In R
Setup
Setup1
Setup2
Load Libs
library(MASS)
library(plyr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
#install.packages("caret")
library(caret)
## Loading required package: lattice
#install.packages("randomForest")
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
#install.packages("gbm")
library(gbm)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
#install.packages("corrgram")
library(corrgram)
##
## Attaching package: 'corrgram'
## The following object is masked from 'package:plyr':
##
## baseball
#install.packages("corrplot")
library(corrplot)
Load IRIS
Boston is ready made; just use it
# If .txt tab file, use this
#my_housing_data <- read.delim(file.choose())
# Or, if .csv file, use this
#my_housing_data <- read.csv(file.choose())
# Store the data in the variable my_housing_data
data("Boston")
my_housing_data <- Boston
View(my_housing_data)
# Print the first 6 rows
head(my_housing_data, 6)
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12
## lstat medv
## 1 4.98 24.0
## 2 9.14 21.6
## 3 4.03 34.7
## 4 2.94 33.4
## 5 5.33 36.2
## 6 5.21 28.7
# Print the last 6 rows
tail(my_housing_data, 6)
## crim zn indus chas nox rm age dis rad tax ptratio black
## 501 0.22438 0 9.69 0 0.585 6.027 79.7 2.4982 6 391 19.2 396.90
## 502 0.06263 0 11.93 0 0.573 6.593 69.1 2.4786 1 273 21.0 391.99
## 503 0.04527 0 11.93 0 0.573 6.120 76.7 2.2875 1 273 21.0 396.90
## 504 0.06076 0 11.93 0 0.573 6.976 91.0 2.1675 1 273 21.0 396.90
## 505 0.10959 0 11.93 0 0.573 6.794 89.3 2.3889 1 273 21.0 393.45
## 506 0.04741 0 11.93 0 0.573 6.030 80.8 2.5050 1 273 21.0 396.90
## lstat medv
## 501 14.33 16.8
## 502 9.67 22.4
## 503 9.08 20.6
## 504 5.64 23.9
## 505 6.48 22.0
## 506 7.88 11.9
Summary Iris
summary(my_housing_data, digits = 1)
## crim zn indus chas nox
## Min. :6e-03 Min. : 0 Min. : 0.5 Min. :0.00 Min. :0.4
## 1st Qu.:8e-02 1st Qu.: 0 1st Qu.: 5.2 1st Qu.:0.00 1st Qu.:0.4
## Median :3e-01 Median : 0 Median : 9.7 Median :0.00 Median :0.5
## Mean :4e+00 Mean : 11 Mean :11.1 Mean :0.07 Mean :0.6
## 3rd Qu.:4e+00 3rd Qu.: 12 3rd Qu.:18.1 3rd Qu.:0.00 3rd Qu.:0.6
## Max. :9e+01 Max. :100 Max. :27.7 Max. :1.00 Max. :0.9
## rm age dis rad tax
## Min. :4 Min. : 3 Min. : 1 Min. : 1 Min. :187
## 1st Qu.:6 1st Qu.: 45 1st Qu.: 2 1st Qu.: 4 1st Qu.:279
## Median :6 Median : 78 Median : 3 Median : 5 Median :330
## Mean :6 Mean : 69 Mean : 4 Mean :10 Mean :408
## 3rd Qu.:7 3rd Qu.: 94 3rd Qu.: 5 3rd Qu.:24 3rd Qu.:666
## Max. :9 Max. :100 Max. :12 Max. :24 Max. :711
## ptratio black lstat medv
## Min. :13 Min. : 0.3 Min. : 2 Min. : 5
## 1st Qu.:17 1st Qu.:375.4 1st Qu.: 7 1st Qu.:17
## Median :19 Median :391.4 Median :11 Median :21
## Mean :18 Mean :356.7 Mean :13 Mean :23
## 3rd Qu.:20 3rd Qu.:396.2 3rd Qu.:17 3rd Qu.:25
## Max. :22 Max. :396.9 Max. :38 Max. :50
Stat.desc() function on Iris
#install.packages("pastecs")
# Compute descriptive statistics
library(pastecs)
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:survival':
##
## aml
## The following object is masked from 'package:lattice':
##
## melanoma
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
## The following object is masked from 'package:tidyr':
##
## extract
res <- stat.desc(my_housing_data[, -5])
round(res, 2)
## crim zn indus chas rm age dis
## nbr.val 506.00 506.00 506.00 506.00 506.00 506.00 506.00
## nbr.null 0.00 372.00 0.00 471.00 0.00 0.00 0.00
## nbr.na 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## min 0.01 0.00 0.46 0.00 3.56 2.90 1.13
## max 88.98 100.00 27.74 1.00 8.78 100.00 12.13
## range 88.97 100.00 27.28 1.00 5.22 97.10 11.00
## sum 1828.44 5750.00 5635.21 35.00 3180.03 34698.90 1920.29
## median 0.26 0.00 9.69 0.00 6.21 77.50 3.21
## mean 3.61 11.36 11.14 0.07 6.28 68.57 3.80
## SE.mean 0.38 1.04 0.30 0.01 0.03 1.25 0.09
## CI.mean.0.95 0.75 2.04 0.60 0.02 0.06 2.46 0.18
## var 73.99 543.94 47.06 0.06 0.49 792.36 4.43
## std.dev 8.60 23.32 6.86 0.25 0.70 28.15 2.11
## coef.var 2.38 2.05 0.62 3.67 0.11 0.41 0.55
## rad tax ptratio black lstat medv
## nbr.val 506.00 506.00 506.00 506.00 506.00 506.00
## nbr.null 0.00 0.00 0.00 0.00 0.00 0.00
## nbr.na 0.00 0.00 0.00 0.00 0.00 0.00
## min 1.00 187.00 12.60 0.32 1.73 5.00
## max 24.00 711.00 22.00 396.90 37.97 50.00
## range 23.00 524.00 9.40 396.58 36.24 45.00
## sum 4832.00 206568.00 9338.50 180477.06 6402.45 11401.60
## median 5.00 330.00 19.05 391.44 11.36 21.20
## mean 9.55 408.24 18.46 356.67 12.65 22.53
## SE.mean 0.39 7.49 0.10 4.06 0.32 0.41
## CI.mean.0.95 0.76 14.72 0.19 7.97 0.62 0.80
## var 75.82 28404.76 4.69 8334.75 50.99 84.59
## std.dev 8.71 168.54 2.16 91.29 7.14 9.20
## coef.var 0.91 0.41 0.12 0.26 0.56 0.41
Dataset Split - method 1
#vctTrnRecs <- createDataPartition(y=my_housing_data$medv, p=0.7, list=FALSE)
#my_housing_data_TrnData <- my_housing_data[vctTrnRecs,]
#my_housing_data_TstData <- my_housing_data[-vctTrnRecs,]
Find Corelations - method 1
#functions
#detectNA <- function(inp) {
# sum(is.na(inp))
#}
#detectCor <- function(x) {
# cor(as.numeric(my_housing_data[, x]),
# as.numeric(my_housing_data$medv),
# method="spearman")
#}
## find correlations
#vcnCorsData <- abs(sapply(colnames(my_housing_data), detectCor)) #absolute value
#summary(vcnCorsData)
#Show Corelations
#vcnCorsData
#Plot Corelations
#corrgram(my_housing_data)
Dataset Split - method 2
set.seed(707)
#install.packages("caTools")
library(caTools)
split <- sample.split(my_housing_data$medv, SplitRatio = 0.7)
#split
training_data <- subset(my_housing_data, split=="TRUE")
testing_data <- subset(my_housing_data, split=="FALSE")
Find Corelations - method 2
plot(my_housing_data$crim, my_housing_data$medv, cex=0.5, xlab="Crime Rate", ylab = "medv")
cr <- cor(my_housing_data)
Plot scatter plot for Corelations - method 2
attach(my_housing_data)
library(lattice)
splom(~my_housing_data[c(1:6, 14)], groups=NULL, data=my_housing_data, axis.line.tck = 0, axis.text.alpha = 0)
splom(~my_housing_data[c(7:14)], groups=NULL, data=my_housing_data, axis.line.tck = 0, axis.text.alpha = 0)
Find Corelations - method 2
plot(rm, medv)
#regression fit line
abline(lm(medv~rm), col="red")
Visualization - method 2
library(corrplot)
corrplot(cr, type = "lower")
corrplot(cr, method = "number")
finding multicollinearity - method 2
library(caret)
#to exclude medv(output)
housing_a <- subset(my_housing_data, select = -c(medv))
numericData <- housing_a[sapply(housing_a, is.numeric)]
descrCor <- cor(numericData)
vif - method 2
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:boot':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
model <- lm(medv~., data = training_data)
vif(model)
## crim zn indus chas nox rm age dis
## 2.186063 2.426594 4.044576 1.098107 4.466141 1.999619 3.161921 4.088476
## rad tax ptratio black lstat
## 7.111726 8.137088 1.931770 1.404318 3.138782
Create model using all variables - method 2
model <- lm(medv~., data = training_data)
#or
model <- lm(medv~ crim + zn + indus + chas + nox + rm+ age + dis + rad + tax+ ptratio + black + lstat, data = training_data)
description of model - method 2
summary(model)
##
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + nox + rm + age +
## dis + rad + tax + ptratio + black + lstat, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.1846 -2.9393 -0.6079 2.1119 24.8407
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.893461 6.085615 5.241 2.76e-07 ***
## crim -0.092407 0.054133 -1.707 0.088694 .
## zn 0.047614 0.016067 2.963 0.003249 **
## indus 0.030606 0.071300 0.429 0.667998
## chas 2.832769 1.003739 2.822 0.005039 **
## nox -19.220333 4.537990 -4.235 2.91e-05 ***
## rm 4.387744 0.490338 8.948 < 2e-16 ***
## age -0.001238 0.016121 -0.077 0.938835
## dis -1.573771 0.241048 -6.529 2.31e-10 ***
## rad 0.284439 0.076137 3.736 0.000218 ***
## tax -0.011483 0.004189 -2.741 0.006429 **
## ptratio -0.873512 0.157825 -5.535 6.09e-08 ***
## black 0.010619 0.003261 3.256 0.001238 **
## lstat -0.528799 0.063089 -8.382 1.26e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.791 on 353 degrees of freedom
## Multiple R-squared: 0.7537, Adjusted R-squared: 0.7446
## F-statistic: 83.08 on 13 and 353 DF, p-value: < 2.2e-16
Re-Create model using lesser variables - method 2
model <- lm(medv~., data = training_data)
#or
model <- lm(medv~ crim + zn + indus + chas + nox + rm+ age + dis + rad + ptratio + black + lstat, data = training_data)
summary(model)
##
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + nox + rm + age +
## dis + rad + ptratio + black + lstat, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.7097 -3.1897 -0.5404 1.9789 24.7599
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.32910 6.11430 4.960 1.09e-06 ***
## crim -0.08939 0.05462 -1.637 0.10258
## zn 0.03932 0.01592 2.469 0.01402 *
## indus -0.05105 0.06537 -0.781 0.43535
## chas 3.13277 1.00690 3.111 0.00201 **
## nox -20.16860 4.56624 -4.417 1.33e-05 ***
## rm 4.48134 0.49363 9.078 < 2e-16 ***
## age -0.00211 0.01627 -0.130 0.89686
## dis -1.59456 0.24314 -6.558 1.93e-10 ***
## rad 0.12666 0.05030 2.518 0.01224 *
## ptratio -0.90836 0.15875 -5.722 2.25e-08 ***
## black 0.01084 0.00329 3.296 0.00108 **
## lstat -0.53049 0.06366 -8.333 1.76e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.834 on 354 degrees of freedom
## Multiple R-squared: 0.7484, Adjusted R-squared: 0.7399
## F-statistic: 87.77 on 12 and 354 DF, p-value: < 2.2e-16
Re-Create model using lesser variables - method 2
model <- lm(medv~., data = training_data)
#or
model <- lm(medv~ crim + zn + indus + chas + nox + rm + dis + rad + ptratio + black + lstat, data = training_data)
summary(model)
##
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + nox + rm + dis +
## rad + ptratio + black + lstat, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.6773 -3.1893 -0.5365 1.9819 24.7189
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.396435 6.083787 4.996 9.19e-07 ***
## crim -0.089652 0.054505 -1.645 0.10089
## zn 0.039592 0.015758 2.513 0.01243 *
## indus -0.051073 0.065282 -0.782 0.43453
## chas 3.124282 1.003377 3.114 0.00200 **
## nox -20.327722 4.392309 -4.628 5.18e-06 ***
## rm 4.467649 0.481540 9.278 < 2e-16 ***
## dis -1.585673 0.232963 -6.807 4.27e-11 ***
## rad 0.127339 0.049963 2.549 0.01123 *
## ptratio -0.910298 0.157828 -5.768 1.75e-08 ***
## black 0.010816 0.003279 3.299 0.00107 **
## lstat -0.533274 0.059856 -8.909 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.828 on 355 degrees of freedom
## Multiple R-squared: 0.7484, Adjusted R-squared: 0.7406
## F-statistic: 96.01 on 11 and 355 DF, p-value: < 2.2e-16
Predict on test data - method 2
predic <- predict(model, testing_data)
#predic
compare predicted and actual data - method 2
plot(testing_data$medv, type="l", lty = 1.8, col = "green")
lines(predic, type="l",col = "blue")
Predict on sample test data - method 2
#predic <- predict(model, #sample_data)
#predic