Multiple Linear Regression

Predicts mpg for a car from hp, vol, sp, and wt

Cars dataset - independent (input) variables: x1 = hp, x2 = vol, x3 = sp, x4 = wt; dependent (output) variable: y = mpg

Exploratory Data Analysis(60% of time)

1. Measures of Central Tendency

2. Measures of Dispersion

3. Third Moment Business decision

4. Fourth Moment Business decision

5. Probability distributions of variables

6. Graphical representations

> Histogram,Box plot,Dot plot,Stem & Leaf plot,

> Bar plot

7. Find the correlation between the output (MPG) and the inputs (HP, VOL, SP) - scatter plot

8. Correlation Coefficient matrix - Strength & Direction of Correlation

# Load the cars data set from the local CSV file.
# NOTE(review): the path is machine-specific; adjust it before running elsewhere.
Cars <- read.csv(
  file = "E:\\mikhilesh\\Horizon 2020\\ApCoTe Yogesh Sky Analytics DA DS Learning\\Cars dataset.csv"
)
# View(Cars)  # uncomment to browse the data interactively

# Exploratory Data Analysis(60% of time)
# 1. Measures of Central Tendency
# 2. Measures of Dispersion
# 3. Third Moment Business decision
# 4. Fourth Moment Business decision
# 5. Probability distributions of variables
# 6. Graphical representations
#  > Histogram,Box plot,Dot plot,Stem & Leaf plot, 
#  > Bar plot

# Basic structure of the data set
dim(Cars)
## [1] 81  5
# NOTE: attach(Cars) was removed on purpose. attach() places copies of the
# columns on the search path, where they can mask (or be masked by) other
# objects. Columns are referenced explicitly via Cars$... or with(Cars, ...).
class(Cars)
## [1] "data.frame"
names(Cars)
## [1] "HP"  "MPG" "VOL" "SP"  "WT"
str(Cars)
## 'data.frame':    81 obs. of  5 variables:
##  $ HP : int  49 55 55 70 53 70 55 62 62 80 ...
##  $ MPG: num  53.7 50 50 45.7 50.5 ...
##  $ VOL: int  89 92 92 92 92 89 92 50 50 94 ...
##  $ SP : num  104 105 105 113 104 ...
##  $ WT : num  28.8 30.5 30.2 30.6 29.9 ...
str(Cars$HP)
##  int [1:81] 49 55 55 70 53 70 55 62 62 80 ...
summary(Cars)
##        HP             MPG             VOL               SP        
##  Min.   : 49.0   Min.   :12.10   Min.   : 50.00   Min.   : 99.56  
##  1st Qu.: 84.0   1st Qu.:27.86   1st Qu.: 89.00   1st Qu.:113.83  
##  Median :100.0   Median :35.15   Median :101.00   Median :118.21  
##  Mean   :117.5   Mean   :34.42   Mean   : 98.77   Mean   :121.54  
##  3rd Qu.:140.0   3rd Qu.:39.53   3rd Qu.:113.00   3rd Qu.:126.40  
##  Max.   :322.0   Max.   :53.70   Max.   :160.00   Max.   :169.60  
##        WT       
##  Min.   :15.71  
##  1st Qu.:29.59  
##  Median :32.73  
##  Mean   :32.41  
##  3rd Qu.:37.39  
##  Max.   :53.00
# 7. Find the correlation between the output (MPG) and the inputs (HP, VOL, SP) - scatter plot

# Scatter-plot matrix to check for a multicollinearity problem
plot(Cars) # pairs(Cars) gives the same plots

# The plots show multicollinearity (a linear relation) within the HP-SP pair and the VOL-WT pair

# 8. Correlation coefficient matrix - strength & direction of correlation
# To confirm, check the correlation coefficient r
cor(Cars) # values greater than 0.85 are considered strong correlation
##              HP        MPG         VOL         SP          WT
## HP   1.00000000 -0.7250383  0.07745947  0.9738481  0.07651307
## MPG -0.72503835  1.0000000 -0.52905658 -0.6871246 -0.52675909
## VOL  0.07745947 -0.5290566  1.00000000  0.1021700  0.99920308
## SP   0.97384807 -0.6871246  0.10217001  1.0000000  0.10243919
## WT   0.07651307 -0.5267591  0.99920308  0.1024392  1.00000000
# pairs with r > 0.85 are HP - SP and VOL - WT

# with() evaluates the call inside Cars, so the axis labels stay "HP" and "SP"
with(Cars, plot(HP, SP))

with(Cars, cor(HP, MPG))
## [1] -0.7250383
with(Cars, cor(MPG, HP)) # same value: the argument order does not matter
## [1] -0.7250383
# Baseline model: regress MPG on all four input variables.
# General form: model <- lm(y ~ x1 + x2 + ..., data = dataset)
m1 <- lm(formula = MPG ~ VOL + HP + SP + WT, data = Cars)
m1
## 
## Call:
## lm(formula = MPG ~ VOL + HP + SP + WT, data = Cars)
## 
## Coefficients:
## (Intercept)          VOL           HP           SP           WT  
##     30.6773      -0.3361      -0.2054       0.3956       0.4006
summary(object = m1)
## 
## Call:
## lm(formula = MPG ~ VOL + HP + SP + WT, data = Cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6320 -2.9944 -0.3705  2.2149 15.6179 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.67734   14.90030   2.059   0.0429 *  
## VOL         -0.33605    0.56864  -0.591   0.5563    
## HP          -0.20544    0.03922  -5.239  1.4e-06 ***
## SP           0.39563    0.15826   2.500   0.0146 *  
## WT           0.40057    1.69346   0.237   0.8136    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.488 on 76 degrees of freedom
## Multiple R-squared:  0.7705, Adjusted R-squared:  0.7585 
## F-statistic:  63.8 on 4 and 76 DF,  p-value: < 2.2e-16
# results show p-values > 0.05 for VOL and WT - these two coefficients are not significant (the overall model is: F-test p-value < 2.2e-16)

# Next TO check which of vol and wt are creating problems, build a separate model for vol and wt each

# Simple regression: MPG explained by VOL alone
mv <- lm(formula = MPG ~ VOL, data = Cars)
mv
## 
## Call:
## lm(formula = MPG ~ VOL, data = Cars)
## 
## Coefficients:
## (Intercept)          VOL  
##     55.8171      -0.2166
summary(object = mv) # on its own, VOL is significant (p-value < 0.05)
## 
## Call:
## lm(formula = MPG ~ VOL, data = Cars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.3074  -5.2026   0.1902   5.4536  17.1632 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 55.81709    3.95696  14.106  < 2e-16 ***
## VOL         -0.21662    0.03909  -5.541 3.82e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.798 on 79 degrees of freedom
## Multiple R-squared:  0.2799, Adjusted R-squared:  0.2708 
## F-statistic: 30.71 on 1 and 79 DF,  p-value: 3.823e-07
# Simple regression: MPG explained by WT alone
mw <- lm(formula = MPG ~ WT, data = Cars)
summary(object = mw) # on its own, WT is significant (p-value < 0.05)
## 
## Call:
## lm(formula = MPG ~ WT, data = Cars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.3933  -5.4377   0.2738   5.2951  16.9351 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  55.2296     3.8761  14.249  < 2e-16 ***
## WT           -0.6420     0.1165  -5.508 4.38e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.811 on 79 degrees of freedom
## Multiple R-squared:  0.2775, Adjusted R-squared:  0.2683 
## F-statistic: 30.34 on 1 and 79 DF,  p-value: 4.383e-07
# In the m1 summary, HP and SP show no problems.
# Next, fit a model that uses VOL and WT together.
mvw <- lm(formula = MPG ~ VOL + WT, data = Cars)
summary(object = mvw) # together, both become insignificant (p-value > 0.05 for VOL and WT)
## 
## Call:
## lm(formula = MPG ~ VOL + WT, data = Cars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.9939  -4.9460   0.0028   5.3905  17.6972 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  56.8847     4.5342   12.55   <2e-16 ***
## VOL          -0.6983     0.9841   -0.71    0.480    
## WT            1.4349     2.9291    0.49    0.626    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.835 on 78 degrees of freedom
## Multiple R-squared:  0.2821, Adjusted R-squared:  0.2637 
## F-statistic: 15.33 on 2 and 78 DF,  p-value: 2.434e-06
library(car)
## Warning: package 'car' was built under R version 3.6.2
## Loading required package: carData
# It is better to delete influential observations than to delete an entire
# column that is creating the multicollinearity problem.

## Plot the influence measures for m1
car::influencePlot(m1) # also prints StudRes, Hat and CookD for the flagged records

##      StudRes        Hat      CookD
## 1   2.421762 0.05200781 0.06047977
## 71 -2.100131 0.22253511 0.24164401
## 77  4.503603 0.25138750 1.08651940
#COOK'S DISTANCE is a measure to identify which is the most influential record.

# Refit the full model without observation 77, the most influential record
m2 <- lm(formula = MPG ~ VOL + SP + HP + WT, data = Cars[-77, ])
summary(object = m2) # even without record 77, VOL and WT are still insignificant (p-value > 0.05)
## 
## Call:
## lm(formula = MPG ~ VOL + SP + HP + WT, data = Cars[-77, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3943 -2.3555 -0.5913  1.8978 12.0184 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.82675   13.32251   2.089  0.04013 *  
## VOL         -0.18546    0.50895  -0.364  0.71659    
## SP           0.41189    0.14139   2.913  0.00471 ** 
## HP          -0.22664    0.03534  -6.413 1.14e-08 ***
## WT           0.03754    1.51458   0.025  0.98029    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.008 on 75 degrees of freedom
## Multiple R-squared:  0.8192, Adjusted R-squared:  0.8096 
## F-statistic: 84.96 on 4 and 75 DF,  p-value: < 2.2e-16
# So next we remove all the influential records and check the significance of the model

# Refit the full model without observations 71, 77 and 1
m3 <- lm(formula = MPG ~ VOL + SP + HP + WT, data = Cars[-c(71, 77, 1), ])
summary(object = m3) # even without records 77, 71 and 1, VOL and WT are still insignificant (p-value > 0.05)
## 
## Call:
## lm(formula = MPG ~ VOL + SP + HP + WT, data = Cars[-c(71, 77, 
##     1), ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7300 -2.5391 -0.3696  2.1482 10.7151 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 24.82062   13.01740   1.907  0.06049 .  
## VOL         -0.31823    0.49668  -0.641  0.52372    
## SP           0.44618    0.13881   3.214  0.00195 ** 
## HP          -0.22688    0.03413  -6.647 4.67e-09 ***
## WT           0.40617    1.48045   0.274  0.78459    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.859 on 73 degrees of freedom
## Multiple R-squared:  0.821,  Adjusted R-squared:  0.8112 
## F-statistic: 83.72 on 4 and 73 DF,  p-value: < 2.2e-16
# Variance inflation factors of m1, to check collinearity between the input variables
car::vif(m1)
##       VOL        HP        SP        WT 
## 638.80608  19.92659  20.00764 639.53382
# (Usually, if VIF > 10, you should consider removing that variable.)
# In this model m1, every VIF is > 10, so there is collinearity among all the variables.
# But we cannot remove all the variables,
# so we remove the variable with the highest VIF first - in this case, WT.

# Refit without WT, the variable with the highest VIF
finalmodel <- lm(formula = MPG ~ VOL + SP + HP, data = Cars)
finalmodel
## 
## Call:
## lm(formula = MPG ~ VOL + SP + HP, data = Cars)
## 
## Coefficients:
## (Intercept)          VOL           SP           HP  
##     29.9234      -0.2017       0.4007      -0.2067
summary(object = finalmodel) # with WT removed the model is significant and every remaining variable has p-value < 0.05
## 
## Call:
## lm(formula = MPG ~ VOL + SP + HP, data = Cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.5869 -2.8942 -0.3157  2.1291 15.6669 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.92339   14.46589   2.069   0.0419 *  
## VOL         -0.20165    0.02259  -8.928 1.65e-13 ***
## SP           0.40066    0.15586   2.571   0.0121 *  
## HP          -0.20670    0.03861  -5.353 8.64e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.46 on 77 degrees of freedom
## Multiple R-squared:  0.7704, Adjusted R-squared:  0.7614 
## F-statistic: 86.11 on 3 and 77 DF,  p-value: < 2.2e-16
# Just for practice and understanding: refit after removing VOL (second-highest VIF) instead of WT
finalmodel1 <- lm(formula = MPG ~ WT + SP + HP, data = Cars)
finalmodel1
## 
## Call:
## lm(formula = MPG ~ WT + SP + HP, data = Cars)
## 
## Coefficients:
## (Intercept)           WT           SP           HP  
##     28.7848      -0.5994       0.4078      -0.2085
summary(object = finalmodel1) # with VOL removed the model is significant, but the intercept has p-value > 0.05 (not significant)
## 
## Call:
## lm(formula = MPG ~ WT + SP + HP, data = Cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.7567 -2.7652 -0.3683  1.8589 15.7690 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 28.78481   14.49047   1.986   0.0505 .  
## WT          -0.59941    0.06739  -8.895 1.91e-13 ***
## SP           0.40775    0.15626   2.609   0.0109 *  
## HP          -0.20850    0.03871  -5.386 7.56e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.469 on 77 degrees of freedom
## Multiple R-squared:  0.7695, Adjusted R-squared:  0.7605 
## F-statistic: 85.68 on 3 and 77 DF,  p-value: < 2.2e-16
# SO removing wt is better model than removing vol
# An alternative to the VIF method is the added-variable plot
## Added-variable plots to check the relation between each input variable and the output variable
car::avPlots(m1)

# In model m1, the plots show correlation for hp-mpg and sp-mpg. Looking closely, there is a slight negative correlation for vol-mpg, but the wt-mpg line is almost flat - showing no relation

car::avPlots(finalmodel)

# In finalmodel, the plots show that all the variables have correlation after removing wt

# VIF and the added-variable plots both indicate that the "wt" variable should be deleted
library(MASS)
# Stepwise model selection by AIC (Akaike Information Criterion), starting from m1;
# the trace printed below only considers dropping terms
stepAIC(object = m1)
## Start:  AIC=248.06
## MPG ~ VOL + HP + SP + WT
## 
##        Df Sum of Sq    RSS    AIC
## - WT    1      1.13 1531.8 246.12
## - VOL   1      7.03 1537.7 246.43
## <none>              1530.7 248.06
## - SP    1    125.87 1656.5 252.46
## - HP    1    552.74 2083.4 271.03
## 
## Step:  AIC=246.12
## MPG ~ VOL + HP + SP
## 
##        Df Sum of Sq    RSS    AIC
## <none>              1531.8 246.12
## - SP    1    131.46 1663.3 250.79
## - HP    1    570.08 2101.9 269.75
## - VOL   1   1585.81 3117.6 301.68
## 
## Call:
## lm(formula = MPG ~ VOL + HP + SP, data = Cars)
## 
## Coefficients:
## (Intercept)          VOL           HP           SP  
##     29.9234      -0.2017      -0.2067       0.4007
# stepAIC starts from the full model m1 and, at each step, drops the term whose removal lowers the AIC the most; it stops when no removal lowers the AIC any further
# Notice that stepAIC automatically removed the wt variable and gave the final model lm(formula = MPG ~ VOL + HP + SP, data = Cars)
# The lower the AIC, the better the model
# Predicted MPG for every row of Cars, using the final model
pv <- predict(object = finalmodel, newdata = Cars)
pv
##        1        2        3        4        5        6        7        8 
## 43.59077 42.25679 42.25679 42.36150 42.26954 42.85590 42.25679 48.13221 
##        9       10       11       12       13       14       15       16 
## 48.13221 40.76616 41.43447 47.94095 39.86565 41.43447 41.67943 41.43447 
##       17       18       19       20       21       22       23       24 
## 41.27332 47.94095 41.27332 38.01722 38.66367 37.46001 38.11462 39.42854 
##       25       26       27       28       29       30       31       32 
## 40.09224 46.73898 35.69724 38.66367 38.09467 35.87282 35.04067 37.18309 
##       33       34       35       36       37       38       39       40 
## 37.32689 34.69006 37.40255 37.63925 39.28727 38.33839 38.33839 35.96910 
##       41       42       43       44       45       46       47       48 
## 34.13369 35.28731 37.34958 38.25375 35.95927 36.20872 34.23109 35.56226 
##       49       50       51       52       53       54       55       56 
## 36.95791 33.17920 33.17920 33.17920 29.38875 27.38159 28.31041 28.69214 
##       57       58       59       60       61       62       63       64 
## 35.78519 33.17920 35.43009 32.36991 29.73729 28.87233 25.07082 26.38923 
##       65       66       67       68       69       70       71       72 
## 25.85377 36.45744 25.91011 23.76768 24.42689 20.12047 27.91145 22.66872 
##       73       74       75       76       77       78       79       80 
## 23.16313 18.68892 23.79778 20.97037 21.23314 17.86773 26.21686 12.23755 
##       81 
## 15.59296
# Turn the numeric vector pv into a one-column data frame (column name "pv")
pv <- as.data.frame(pv)

# Append the predicted MPG values to the original data set as an extra column
final <- cbind(Cars, pv)
final
##     HP      MPG VOL        SP       WT       pv
## 1   49 53.70068  89 104.18535 28.76206 43.59077
## 2   55 50.01340  92 105.46126 30.46683 42.25679
## 3   55 50.01340  92 105.46126 30.19360 42.25679
## 4   70 45.69632  92 113.46126 30.63211 42.36150
## 5   53 50.50423  92 104.46126 29.88915 42.26954
## 6   70 45.69632  89 113.18535 29.59177 42.85590
## 7   55 50.01340  92 105.46126 30.30848 42.25679
## 8   62 46.71655  50 102.59851 15.84776 48.13221
## 9   62 46.71655  50 102.59851 16.35948 48.13221
## 10  80 42.29908  94 115.64520 30.92015 40.76616
## 11  73 44.65283  89 111.18535 29.36334 41.43447
## 12  92 39.35409  50 117.59851 15.75353 47.94095
## 13  92 39.35409  99 122.10506 32.81359 39.86565
## 14  73 44.65283  89 111.18535 29.37844 41.43447
## 15  66 45.73489  89 108.18535 29.34728 41.67943
## 16  73 44.65283  89 111.18535 29.60453 41.43447
## 17  78 42.78991  91 114.36929 29.53578 41.27332
## 18  92 39.35409  50 117.59851 16.19412 47.94095
## 19  78 42.78991  91 114.36929 29.92939 41.27332
## 20  90 38.90183 103 118.47294 33.51697 38.01722
## 21  92 38.41100  99 119.10506 32.32465 38.66367
## 22  74 42.82848 107 110.84082 34.90821 37.46001
## 23  95 38.31061 101 120.28900 32.67583 38.11462
## 24  81 40.47472  96 113.82914 31.83712 39.42854
## 25  95 38.31061  89 119.18535 28.78173 40.09224
## 26  92 38.41100  50 114.59851 16.04317 46.73898
## 27  92 38.41100 117 120.76052 38.06282 35.69724
## 28  92 38.41100  99 119.10506 32.83507 38.66367
## 29  52 43.46943 104  99.56491 34.48321 38.09467
## 30 103 35.40419 107 121.84082 35.54936 35.87282
## 31  84 39.43124 114 113.48461 37.04235 35.04067
## 32  84 39.43124 101 112.28900 33.23436 37.18309
## 33 102 36.28546  97 119.92111 31.38004 37.32689
## 34 102 36.28546 113 121.39264 37.57329 34.69006
## 35  81 39.53163 101 111.28900 32.70164 37.40255
## 36  90 37.95874  98 115.01309 31.91122 37.63925
## 37  90 37.95874  88 114.09338 28.75400 39.28727
## 38 102 34.07067  86 116.90944 27.87992 38.33839
## 39 102 34.07067  86 116.90944 28.63050 38.33839
## 40 130 31.01413  92 128.46126 30.11543 35.96910
## 41  95 35.15273 113 116.39264 37.39252 34.13369
## 42  95 35.15273 106 115.74885 35.02718 35.28731
## 43 102 34.07067  92 117.46126 30.52743 37.34958
## 44  95 35.15273  88 114.09338 28.34398 38.25375
## 45  93 35.64356 102 114.38097 33.07863 35.95927
## 46 100 34.56150  99 117.10506 32.62192 36.20872
## 47 100 34.56150 111 118.20870 36.49862 34.23109
## 48  98 35.05233 103 116.47294 33.91006 35.56226
## 49 130 31.01413  86 127.90944 28.07060 36.95791
## 50 115 29.62994 101 118.28900 33.45847 33.17920
## 51 115 29.62994 101 118.28900 33.21395 33.17920
## 52 115 29.62994 101 118.28900 33.43671 33.17920
## 53 115 29.62994 124 120.40431 40.39816 29.38875
## 54 180 24.48737 113 143.39264 37.62069 27.38159
## 55 160 26.85228 113 135.39264 37.25439 28.31041
## 56 130 27.85625 124 126.40431 40.58907 28.69214
## 57  96 31.11358  92 110.46126 30.14754 35.78519
## 58 115 29.62994 101 118.28900 32.73452 33.17920
## 59 100 30.13192  94 112.64520 30.61528 35.43009
## 60 100 28.86023 115 115.57658 37.66287 32.36991
## 61 145 27.35427 111 130.20870 36.88815 29.73729
## 62 120 24.60913 116 117.66855 37.86041 28.87233
## 63 140 23.51592 131 126.04810 43.39099 25.07082
## 64 140 23.51592 123 125.31234 40.72283 26.38923
## 65 150 23.60516 121 128.12840 40.15948 25.85377
## 66 165 40.05000  50 126.59851 15.71286 36.45744
## 67 165 23.10317 114 132.48461 37.97996 25.91011
## 68 165 23.10317 127 133.68022 41.57397 23.76768
## 69 165 23.10317 123 133.31234 40.47204 24.42689
## 70 245 21.27371 112 158.30067 37.14173 20.12047
## 71 280 19.67851  50 164.59851 15.82306 27.91145
## 72 162 23.20357 135 133.41598 44.01314 22.66872
## 73 162 23.20357 132 133.14007 43.35312 23.16313
## 74 140 19.08634 160 124.71524 52.99775 18.68892
## 75 140 19.08634 129 121.86416 42.61870 23.79778
## 76 175 18.76284 129 132.86416 42.77822 20.97037
## 77 322 36.90000  50 169.59851 16.13295 21.23314
## 78 238 19.19789 115 150.57658 37.92311 17.86773
## 79 263 34.00000  50 151.59851 15.76963 26.21686
## 80 295 19.83373 119 167.94446 39.42310 12.23755
## 81 236 12.10126 107 139.84082 34.94861 15.59296
# Per-column summary statistics of the training data
summary(object = Cars)
##        HP             MPG             VOL               SP        
##  Min.   : 49.0   Min.   :12.10   Min.   : 50.00   Min.   : 99.56  
##  1st Qu.: 84.0   1st Qu.:27.86   1st Qu.: 89.00   1st Qu.:113.83  
##  Median :100.0   Median :35.15   Median :101.00   Median :118.21  
##  Mean   :117.5   Mean   :34.42   Mean   : 98.77   Mean   :121.54  
##  3rd Qu.:140.0   3rd Qu.:39.53   3rd Qu.:113.00   3rd Qu.:126.40  
##  Max.   :322.0   Max.   :53.70   Max.   :160.00   Max.   :169.60  
##        WT       
##  Min.   :15.71  
##  1st Qu.:29.59  
##  Median :32.73  
##  Mean   :32.41  
##  3rd Qu.:37.39  
##  Max.   :53.00
# Load the test cars from the local CSV file.
# NOTE(review): the path is machine-specific; adjust it before running elsewhere.
Testcar <- read.csv(
  file = "E:\\mikhilesh\\Horizon 2020\\ApCoTe Yogesh Sky Analytics DA DS Learning\\cars testset.csv"
)
Testcar
##   HP VOL       SP
## 1 70  92 113.4613
## 2 53  92 104.4613
## 3 70  89 113.1854
# Predict MPG for the test cars with the final model
pv1 <- predict(object = finalmodel, newdata = Testcar)
pv1
##        1        2        3 
## 42.36150 42.26954 42.85590
# Turn the numeric vector pv1 into a one-column data frame (column name "pv1")
pv1 <- as.data.frame(pv1)

# Append the predicted MPG values to the test data set as an extra column
finaltest <- cbind(Testcar, pv1)
finaltest
##   HP VOL       SP      pv1
## 1 70  92 113.4613 42.36150
## 2 53  92 104.4613 42.26954
## 3 70  89 113.1854 42.85590
# Per-column summary statistics of the test data
summary(object = Testcar)
##        HP             VOL             SP       
##  Min.   :53.00   Min.   :89.0   Min.   :104.5  
##  1st Qu.:61.50   1st Qu.:90.5   1st Qu.:108.8  
##  Median :70.00   Median :92.0   Median :113.2  
##  Mean   :64.33   Mean   :91.0   Mean   :110.4  
##  3rd Qu.:70.00   3rd Qu.:92.0   3rd Qu.:113.3  
##  Max.   :70.00   Max.   :92.0   Max.   :113.5