## Load the data set through an interactive file dialog; pick Cars.csv.
Cars <- read.csv(file = file.choose())
## Preview the first six rows.
head(x = Cars)
##   HP      MPG VOL       SP       WT
## 1 49 53.70068  89 104.1854 28.76206
## 2 55 50.01340  92 105.4613 30.46683
## 3 55 50.01340  92 105.4613 30.19360
## 4 70 45.69632  92 113.4613 30.63211
## 5 53 50.50423  92 104.4613 29.88915
## 6 70 45.69632  89 113.1854 29.59177
## Open the data frame in the data viewer (needs an interactive session).
View(Cars)
## NOTE(review): attach() places the columns (HP, MPG, ...) on the search path;
## statements further down that use bare column names depend on this call.
attach(Cars)

Exploratory Data Analysis (60% of time)

1. Measures of Central Tendency

2. Measures of Dispersion

3. Third Moment Business Decision (Skewness)

4. Fourth Moment Business Decision (Kurtosis)

5. Probability distributions of variables

6. Graphical representations

### > Histogram, Box plot, Dot plot, Stem & Leaf plot, Bar Plot

## Per-column summary statistics (min, quartiles, median, mean, max).
summary(Cars)
## Structure of the HP column. The column is referenced through the data frame
## so this line does not depend on a prior attach(Cars).
str(Cars$HP)

7. Find the correlation between the output (MPG) and the inputs (HP, VOL, SP) - scatter plot

## Open a new graphics device. dev.new() is used because windows() only exists
## on Windows builds of R and fails elsewhere.
dev.new()
## Scatterplot matrix of every pair of columns. plot(Cars) would draw the same
## figure (plot() on a data frame with more than two columns calls pairs()),
## so only pairs() is kept.
pairs(Cars)

## Correlation matrix of all columns.
cor(Cars)
##              HP        MPG         VOL         SP          WT
## HP   1.00000000 -0.7250383  0.07745947  0.9738481  0.07651307
## MPG -0.72503835  1.0000000 -0.52905658 -0.6871246 -0.52675909
## VOL  0.07745947 -0.5290566  1.00000000  0.1021700  0.99920308
## SP   0.97384807 -0.6871246  0.10217001  1.0000000  0.10243919
## WT   0.07651307 -0.5267591  0.99920308  0.1024392  1.00000000
## Scatter plot with HP on the x-axis and MPG on the y-axis. The formula
## interface reads the columns from Cars, so no attach() is needed and the axis
## labels stay "HP" and "MPG".
plot(MPG ~ HP, data = Cars)

## Pairwise correlation; the value is the same in either argument order.
cor(Cars$HP, Cars$MPG)
## [1] -0.7250383
cor(Cars$MPG, Cars$HP)
## [1] -0.7250383

8. Correlation Coefficient matrix - Strength & Direction of Correlation

## Correlation coefficient matrix; "pearson" is cor()'s default method and is
## only spelled out here for readability.
cor(Cars, method = "pearson")
##              HP        MPG         VOL         SP          WT
## HP   1.00000000 -0.7250383  0.07745947  0.9738481  0.07651307
## MPG -0.72503835  1.0000000 -0.52905658 -0.6871246 -0.52675909
## VOL  0.07745947 -0.5290566  1.00000000  0.1021700  0.99920308
## SP   0.97384807 -0.6871246  0.10217001  1.0000000  0.10243919
## WT   0.07651307 -0.5267591  0.99920308  0.1024392  1.00000000

The Linear Model of Interest

## Full model: MPG regressed on VOL, HP, SP and WT together.
m1 <- lm(formula = MPG ~ VOL + HP + SP + WT, data = Cars)
## In the recorded summary below, VOL and WT are not significant at the 5%
## level while HP and SP are.
summary(m1)
## 
## Call:
## lm(formula = MPG ~ VOL + HP + SP + WT, data = Cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6320 -2.9944 -0.3705  2.2149 15.6179 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.67734   14.90030   2.059   0.0429 *  
## VOL         -0.33605    0.56864  -0.591   0.5563    
## HP          -0.20544    0.03922  -5.239  1.4e-06 ***
## SP           0.39563    0.15826   2.500   0.0146 *  
## WT           0.40057    1.69346   0.237   0.8136    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.488 on 76 degrees of freedom
## Multiple R-squared:  0.7705, Adjusted R-squared:  0.7585 
## F-statistic:  63.8 on 4 and 76 DF,  p-value: < 2.2e-16

Prediction based on only Volume

## Simple regression of MPG on VOL alone.
mv <- lm(formula = MPG ~ VOL, data = Cars)
## On its own, VOL becomes significant (see the recorded summary below).
summary(mv)
## 
## Call:
## lm(formula = MPG ~ VOL, data = Cars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.3074  -5.2026   0.1902   5.4536  17.1632 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 55.81709    3.95696  14.106  < 2e-16 ***
## VOL         -0.21662    0.03909  -5.541 3.82e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.798 on 79 degrees of freedom
## Multiple R-squared:  0.2799, Adjusted R-squared:  0.2708 
## F-statistic: 30.71 on 1 and 79 DF,  p-value: 3.823e-07

Prediction based on only Weight

## Simple regression of MPG on WT alone.
mw <- lm(formula = MPG ~ WT, data = Cars)
## On its own, WT becomes significant (see the recorded summary below).
summary(mw)
## 
## Call:
## lm(formula = MPG ~ WT, data = Cars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.3933  -5.4377   0.2738   5.2951  16.9351 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  55.2296     3.8761  14.249  < 2e-16 ***
## WT           -0.6420     0.1165  -5.508 4.38e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.811 on 79 degrees of freedom
## Multiple R-squared:  0.2775, Adjusted R-squared:  0.2683 
## F-statistic: 30.34 on 1 and 79 DF,  p-value: 4.383e-07

Prediction based on Volume and Weight

## Regression of MPG on VOL and WT together.
mvw <- lm(formula = MPG ~ VOL + WT, data = Cars)
## With both predictors in the model, both become insignificant (see the
## recorded summary below).
summary(mvw)
## 
## Call:
## lm(formula = MPG ~ VOL + WT, data = Cars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.9939  -4.9460   0.0028   5.3905  17.6972 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  56.8847     4.5342   12.55   <2e-16 ***
## VOL          -0.6983     0.9841   -0.71    0.480    
## WT            1.4349     2.9291    0.49    0.626    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.835 on 78 degrees of freedom
## Multiple R-squared:  0.2821, Adjusted R-squared:  0.2637 
## F-statistic: 15.33 on 2 and 78 DF,  p-value: 2.434e-06

It is better to delete influential observations (identify nonsense records) rather than deleting an entire column.

## Install the car package only when it is missing, then attach it. The package
## name must be quoted with plain ASCII quotes; typographic quotes are a syntax
## error in R.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}

library(car)
## Warning: package 'car' was built under R version 3.6.1
## Loading required package: carData

Plotting influence measures

## Open a new graphics device. dev.new() is used because windows() only exists
## on Windows builds of R.
dev.new()
## Influence plot for the full model m1; the call also prints the studentized
## residual, hat value and Cook's distance of the flagged observations.
influencePlot(m1)

##      StudRes        Hat      CookD
## 1   2.421762 0.05200781 0.06047977
## 71 -2.100131 0.22253511 0.24164401
## 77  4.503603 0.25138750 1.08651940

## Regression after deleting the 77th observation, which is an influential
## observation (negative row index drops that row from the data).

m2 <- lm(formula = MPG ~ VOL + SP + HP + WT, data = Cars[-77, ])
summary(m2)
## 
## Call:
## lm(formula = MPG ~ VOL + SP + HP + WT, data = Cars[-77, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3943 -2.3555 -0.5913  1.8978 12.0184 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.82675   13.32251   2.089  0.04013 *  
## VOL         -0.18546    0.50895  -0.364  0.71659    
## SP           0.41189    0.14139   2.913  0.00471 ** 
## HP          -0.22664    0.03534  -6.413 1.14e-08 ***
## WT           0.03754    1.51458   0.025  0.98029    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.008 on 75 degrees of freedom
## Multiple R-squared:  0.8192, Adjusted R-squared:  0.8096 
## F-statistic: 84.96 on 4 and 75 DF,  p-value: < 2.2e-16

Regression after deleting the 77th, 71st & 1st observations

## Refit the four-predictor model with rows 71, 1 and 77 dropped from the data.
m3 <- lm(formula = MPG ~ VOL + SP + HP + WT, data = Cars[-c(71, 1, 77), ])
summary(m3)
## 
## Call:
## lm(formula = MPG ~ VOL + SP + HP + WT, data = Cars[-c(71, 1, 
##     77), ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7300 -2.5391 -0.3696  2.1482 10.7151 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 24.82062   13.01740   1.907  0.06049 .  
## VOL         -0.31823    0.49668  -0.641  0.52372    
## SP           0.44618    0.13881   3.214  0.00195 ** 
## HP          -0.22688    0.03413  -6.647 4.67e-09 ***
## WT           0.40617    1.48045   0.274  0.78459    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.859 on 73 degrees of freedom
## Multiple R-squared:  0.821,  Adjusted R-squared:  0.8112 
## F-statistic: 83.72 on 4 and 73 DF,  p-value: < 2.2e-16

Variance Inflation Factor (VIF) to check collinearity between variables

## Variance inflation factors for the predictors of the full model m1
## (vif() comes from the car package).
car::vif(m1)

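If VIF > 10, there is collinearity among the variables, and the coefficient (beta) estimates also become insignificant. For Cook's distance (CookD), treat a record with a value > 0.5, or the record with the highest value, as influential. To resolve the collinearity, remove one of the collinear variables: since the VIF of WT is slightly greater than that of VOL, WT is removed. Rule of thumb: if VIF > 10, remove the variable with the highest VIF (and, for records, the one with the highest Cook's distance).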

## Final model: same as the full model but without WT.
finalmodel <- lm(formula = MPG ~ VOL + SP + HP, data = Cars)
## All three remaining predictors are significant in the recorded summary below.
summary(finalmodel)
## 
## Call:
## lm(formula = MPG ~ VOL + SP + HP, data = Cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.5869 -2.8942 -0.3157  2.1291 15.6669 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.92339   14.46589   2.069   0.0419 *  
## VOL         -0.20165    0.02259  -8.928 1.65e-13 ***
## SP           0.40066    0.15586   2.571   0.0121 *  
## HP          -0.20670    0.03861  -5.353 8.64e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.46 on 77 degrees of freedom
## Multiple R-squared:  0.7704, Adjusted R-squared:  0.7614 
## F-statistic: 86.11 on 3 and 77 DF,  p-value: < 2.2e-16

Added-variable plots to check the correlation between each input variable and the output variable

## Open a new graphics device. dev.new() is used because windows() only exists
## on Windows builds of R.
dev.new()
## Added-variable plots for the full model (with WT) ...
avPlots(m1)

## ... and for the final model (without WT).
avPlots(finalmodel)

### VIF and AV plots have given us an indication to delete the "WT" variable

## Load the new observations through an interactive file dialog; pick test.csv.
test <- read.csv(file.choose())
## Predicted MPG from the final model for each row of the test data.
predict(finalmodel, newdata = test)
##        1        2        3 
## 42.36150 42.26954 42.85590
## Summary of the training data, for comparison with the predictions.
summary(Cars)
##        HP             MPG             VOL               SP        
##  Min.   : 49.0   Min.   :12.10   Min.   : 50.00   Min.   : 99.56  
##  1st Qu.: 84.0   1st Qu.:27.86   1st Qu.: 89.00   1st Qu.:113.83  
##  Median :100.0   Median :35.15   Median :101.00   Median :118.21  
##  Mean   :117.5   Mean   :34.42   Mean   : 98.77   Mean   :121.54  
##  3rd Qu.:140.0   3rd Qu.:39.53   3rd Qu.:113.00   3rd Qu.:126.40  
##  Max.   :322.0   Max.   :53.70   Max.   :160.00   Max.   :169.60  
##        WT       
##  Min.   :15.71  
##  1st Qu.:29.59  
##  Median :32.73  
##  Mean   :32.41  
##  3rd Qu.:37.39  
##  Max.   :53.00
## Open a new graphics device (dev.new() instead of the Windows-only windows())
## and plot the test data frame.
dev.new()
plot(test)

## Transformation example on the final model: the same three predictors, each
## entered on the log scale.

finalmodel1 <- lm(formula = MPG ~ log(VOL) + log(SP) + log(HP), data = Cars)
summary(finalmodel1)
## 
## Call:
## lm(formula = MPG ~ log(VOL) + log(SP) + log(HP), data = Cars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5400  -0.7858   0.1328   0.9994  10.7265 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -115.717     37.725  -3.067  0.00298 ** 
## log(VOL)     -11.270      1.192  -9.454 1.60e-14 ***
## log(SP)       78.244     10.064   7.774 2.76e-11 ***
## log(HP)      -37.144      2.625 -14.147  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.775 on 77 degrees of freedom
## Multiple R-squared:  0.9111, Adjusted R-squared:  0.9077 
## F-statistic: 263.1 on 3 and 77 DF,  p-value: < 2.2e-16

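Multiple R-squared increased to about 0.91 (from about 0.77), i.e. the log-transformed model explains roughly 91% of the variance in MPG.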

## Predicted MPG for the test rows.
## NOTE(review): this uses finalmodel (untransformed), not the log-transformed
## finalmodel1 - confirm which model is intended here.
predict(finalmodel, newdata = test)
##        1        2        3 
## 42.36150 42.26954 42.85590
## Evaluate model LINE assumptions. dev.new() replaces windows(), which only
## exists on Windows builds of R.
dev.new()
avPlots(finalmodel)

## Stepwise model selection by Akaike Information Criterion (AIC).
## stepAIC() lives in the MASS package, which is not attached anywhere in the
## visible script, so it is called through its namespace
## (run install.packages("MASS") first if the package is not installed).
## It adds/drops model terms (predictors); it does not remove observations.
MASS::stepAIC(m1)