To understand Regression technique we will be working upon sample data set of Boston(existing data base in R)
#Importing packages
library(ggplot2)
library(e1071)
library(MASS)
library(ISLR)
library(corrgram)
library(corrplot)
library(car)
data(Boston)
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12
## lstat medv
## 1 4.98 24.0
## 2 9.14 21.6
## 3 4.03 34.7
## 4 2.94 33.4
## 5 5.33 36.2
## 6 5.21 28.7
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
attach(Boston)# Storing Boston data set in R Memory
reg <- lm(medv~lstat, data = Boston) # Linear equation
reg
##
## Call:
## lm(formula = medv ~ lstat, data = Boston)
##
## Coefficients:
## (Intercept) lstat
## 34.55 -0.95
summary(reg)
##
## Call:
## lm(formula = medv ~ lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <2e-16 ***
## lstat -0.95005 0.03873 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
par( mfrow = c( 2, 2 ) )
plot(reg)
par( mfrow = c( 1, 1 ) )
“Checking residuals is a way to discover new insights in your model and data!” ## Residual vs Fitted Plot
plot(reg$fitted.values,reg$residuals)
#This plots explain
# * Residuals have non-linear pattern or not.
# * Equal Spread around a horizontal line without distinct pattern ---> No non-linear relationship exists
# * If thier is a distinctive pattern in the residual plot --> non-linear relationship exists
qqnorm(rstandard(reg),ylab="Standardized Residuals",xlab="Normal Scores",main="Normal Q-Q Plot")
qqline(rstandard(reg))
#This plots explain
# * Residuals are normally distributed or not
# * It's good if residuals are lined well on the straight dashed line
hist(reg$residuals) # Right Skewed
skewness(reg$residuals) # Right Skewed
## [1] 1.448435
#Predict the price of the apartment basis age of the apartment and lstat i.e. lower status of the population
#only using age and lstat Independent variable
lm.fit<- lm(medv~lstat+age,data = Boston)
summary(lm.fit)
##
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.981 -3.978 -1.283 1.968 23.158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.22276 0.73085 45.458 < 2e-16 ***
## lstat -1.03207 0.04819 -21.416 < 2e-16 ***
## age 0.03454 0.01223 2.826 0.00491 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared: 0.5513, Adjusted R-squared: 0.5495
## F-statistic: 309 on 2 and 503 DF, p-value: < 2.2e-16
1.Their is at least one variable that can explain your dependent variable
2.P (No of independent variables ) < N (No of observations) else use forward and backward selection technique
3.F statistics is for complete model not for single variable
# Calling all independent variables together by using '.(dot)'
lm.fit1<-lm(medv~.,data = Boston)
summary(lm.fit1)
##
## Call:
## lm(formula = medv ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.595 -2.730 -0.518 1.777 26.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***
## crim -1.080e-01 3.286e-02 -3.287 0.001087 **
## zn 4.642e-02 1.373e-02 3.382 0.000778 ***
## indus 2.056e-02 6.150e-02 0.334 0.738288
## chas 2.687e+00 8.616e-01 3.118 0.001925 **
## nox -1.777e+01 3.820e+00 -4.651 4.25e-06 ***
## rm 3.810e+00 4.179e-01 9.116 < 2e-16 ***
## age 6.922e-04 1.321e-02 0.052 0.958229
## dis -1.476e+00 1.995e-01 -7.398 6.01e-13 ***
## rad 3.060e-01 6.635e-02 4.613 5.07e-06 ***
## tax -1.233e-02 3.760e-03 -3.280 0.001112 **
## ptratio -9.527e-01 1.308e-01 -7.283 1.31e-12 ***
## black 9.312e-03 2.686e-03 3.467 0.000573 ***
## lstat -5.248e-01 5.072e-02 -10.347 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338
## F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16
correlations = cor(Boston) # Correlation coefficient among all the variables(One on One)
correlations #Output is very tough to understand
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## rm age dis rad tax
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593
## ptratio black lstat medv
## crim 0.2899456 -0.38506394 0.4556215 -0.3883046
## zn -0.3916785 0.17552032 -0.4129946 0.3604453
## indus 0.3832476 -0.35697654 0.6037997 -0.4837252
## chas -0.1215152 0.04878848 -0.0539293 0.1752602
## nox 0.1889327 -0.38005064 0.5908789 -0.4273208
## rm -0.3555015 0.12806864 -0.6138083 0.6953599
## age 0.2615150 -0.27353398 0.6023385 -0.3769546
## dis -0.2324705 0.29151167 -0.4969958 0.2499287
## rad 0.4647412 -0.44441282 0.4886763 -0.3816262
## tax 0.4608530 -0.44180801 0.5439934 -0.4685359
## ptratio 1.0000000 -0.17738330 0.3740443 -0.5077867
## black -0.1773833 1.00000000 -0.3660869 0.3334608
## lstat 0.3740443 -0.36608690 1.0000000 -0.7376627
## medv -0.5077867 0.33346082 -0.7376627 1.0000000
corrgram(correlations) #Blue Positive and Red Negative, Intensity of color tells the strength of coefficient
corrplot(correlations)
corrplot(correlations,order="hclust")
corrplot(correlations,order="hclust",method="square",tl.cex = 0.6,cl.cex = 0.6)
#tl = text labels
#cl = color legend
1.Check individual correlation value of all the independent variables(Correlation matrix)
2.Check R Squared value after removing any of the two collinear variables and accordingly you can remove that value
3.Create a composite value ( new value = value 1+ Value 2); you can use any calculation technique to come up with composite value to replace both values
vif(lm.fit1) # part of library(Car)
## crim zn indus chas nox rm age dis
## 1.792192 2.298758 3.991596 1.073995 4.393720 1.933744 3.100826 3.955945
## rad tax ptratio black lstat
## 7.484496 9.008554 1.799084 1.348521 2.941491
If vif value of any variable is < 4 –> No multicollinearity
If vif value of any variable is in b/w [4 to 10]–> Needs inspection,
If vif value of any variable is > 10 —> Variable should be dropped
lm.fit2 <- lm(medv~.-rad,data = Boston)
summary(lm.fit2)
##
## Call:
## lm(formula = medv ~ . - rad, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.8842 -2.7975 -0.6162 1.7073 27.7650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.759382 4.992011 5.961 4.77e-09 ***
## crim -0.067540 0.032317 -2.090 0.037137 *
## zn 0.039720 0.013928 2.852 0.004531 **
## indus -0.058411 0.060267 -0.969 0.332925
## chas 3.114373 0.874017 3.563 0.000402 ***
## nox -15.261798 3.857929 -3.956 8.74e-05 ***
## rm 4.114610 0.421073 9.772 < 2e-16 ***
## age -0.003927 0.013440 -0.292 0.770279
## dis -1.490153 0.203490 -7.323 9.95e-13 ***
## tax 0.001334 0.002363 0.565 0.572533
## ptratio -0.838736 0.131086 -6.398 3.66e-10 ***
## black 0.008415 0.002733 3.079 0.002196 **
## lstat -0.516418 0.051715 -9.986 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.842 on 493 degrees of freedom
## Multiple R-squared: 0.7294, Adjusted R-squared: 0.7228
## F-statistic: 110.8 on 12 and 493 DF, p-value: < 2.2e-16
vif(lm.fit2)
## crim zn indus chas nox rm age dis
## 1.664471 2.273018 3.682265 1.061561 4.304929 1.885425 3.083009 3.954951
## tax ptratio black lstat
## 3.415289 1.734873 1.341459 2.937752
*P value of age has reduced but still insignificant
#influence.measures(lm.fit1) ## Outputs marked as aestrik are influential records, very long o/p of this command to tough to interpret
influencePlot(lm.fit1)
## StudRes Hat CookD
## 369 5.907411 0.06633094 0.16567369
## 381 -1.004229 0.30595949 0.03175477
infIndexPlot(lm.fit1)
infIndexPlot(lm.fit1, id.n = 3) # identifying which variable is influencing the o/p
### Regression equation after excluding influential value basis hat - value
lm.fit1<-lm(medv~crim +zn+indus+chas+rm+age+dis+tax+ptratio+black+lstat,data = Boston[-c(381,406,419),])
summary(lm.fit1)
##
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + rm + age + dis +
## tax + ptratio + black + lstat, data = Boston[-c(381, 406,
## 419), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.0830 -2.7017 -0.7248 1.5813 28.5954
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.551307 4.290936 4.323 1.86e-05 ***
## crim 0.020033 0.050271 0.398 0.690436
## zn 0.040854 0.014093 2.899 0.003912 **
## indus -0.098012 0.059708 -1.642 0.101327
## chas 2.934525 0.883141 3.323 0.000958 ***
## rm 4.306532 0.426415 10.099 < 2e-16 ***
## age -0.017186 0.013149 -1.307 0.191800
## dis -1.215059 0.198348 -6.126 1.85e-09 ***
## tax -0.003066 0.002390 -1.283 0.200204
## ptratio -0.681687 0.126035 -5.409 9.93e-08 ***
## black 0.010795 0.002829 3.816 0.000153 ***
## lstat -0.545649 0.052741 -10.346 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.898 on 491 degrees of freedom
## Multiple R-squared: 0.72, Adjusted R-squared: 0.7137
## F-statistic: 114.8 on 11 and 491 DF, p-value: < 2.2e-16
lm.fit1<-lm(medv~crim +zn+indus+chas+rm+age+dis+tax+ptratio+black+lstat,data = Boston[-c(360:380),])
summary(lm.fit1)
##
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + rm + age + dis +
## tax + ptratio + black + lstat, data = Boston[-c(360:380),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4122 -2.1836 -0.5578 1.7315 19.5401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.066643 3.630800 0.018 0.9854
## crim -0.047680 0.025503 -1.870 0.0622 .
## zn 0.021580 0.010932 1.974 0.0490 *
## indus -0.034966 0.046082 -0.759 0.4484
## chas 1.488927 0.733802 2.029 0.0430 *
## rm 7.061068 0.381094 18.528 < 2e-16 ***
## age -0.048505 0.010382 -4.672 3.89e-06 ***
## dis -0.949029 0.153565 -6.180 1.38e-09 ***
## tax -0.007597 0.001836 -4.137 4.16e-05 ***
## ptratio -0.711204 0.097843 -7.269 1.51e-12 ***
## black 0.010633 0.002193 4.848 1.69e-06 ***
## lstat -0.215610 0.046542 -4.633 4.68e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.779 on 473 degrees of freedom
## Multiple R-squared: 0.8234, Adjusted R-squared: 0.8193
## F-statistic: 200.5 on 11 and 473 DF, p-value: < 2.2e-16
anova(lm.fit1,lm.fit2)
lm.fit3 <- lm(medv~ age*lstat,data = Boston)
summary(lm.fit3) # help in creating composite value
##
## Call:
## lm(formula = medv ~ age * lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.806 -4.045 -1.333 2.085 27.552
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.0885359 1.4698355 24.553 < 2e-16 ***
## age -0.0007209 0.0198792 -0.036 0.9711
## lstat -1.3921168 0.1674555 -8.313 8.78e-16 ***
## age:lstat 0.0041560 0.0018518 2.244 0.0252 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared: 0.5557, Adjusted R-squared: 0.5531
## F-statistic: 209.3 on 3 and 502 DF, p-value: < 2.2e-16
View(Carseats)
class(Carseats)
## [1] "data.frame"
str(Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
##Carseats$ShelveLoc_Medium <- ifelse(Carseats$ShelveLoc == "Medium"",1,0)
attach(Carseats)
lm.fit <- lm(Sales~Income+Price+ShelveLoc)
summary(lm.fit)
##
## Call:
## lm(formula = Sales ~ Income + Price + ShelveLoc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3186 -1.3618 -0.0961 1.3318 5.0642
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.779706 0.559396 19.270 < 2e-16 ***
## Income 0.015351 0.003361 4.568 6.59e-06 ***
## Price -0.055708 0.003967 -14.042 < 2e-16 ***
## ShelveLocGood 4.957717 0.279336 17.748 < 2e-16 ***
## ShelveLocMedium 1.935691 0.229639 8.429 6.57e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.871 on 395 degrees of freedom
## Multiple R-squared: 0.5655, Adjusted R-squared: 0.5611
## F-statistic: 128.5 on 4 and 395 DF, p-value: < 2.2e-16
contrasts(ShelveLoc)
## Good Medium
## Bad 0 0
## Good 1 0
## Medium 0 1
#Running regression equation
lm.fit1<-lm(medv~crim +zn+indus+chas+rm+age+dis+tax+ptratio+black+lstat,data = Boston[-c(360:380),])
stepAIC(lm.fit1)
#StepAIC
#* Lower the AIC ---> Better the Model
#* Tradeoff b/w complexity and explanability of your model
#* Any pattern in residual plot --> Hetroskedaskicity Exits
#* Eliminate the patter