Multiple Linear Regression

50_Startups

Assignment 2

# Read the 50 startups data from a local CSV file and list its columns.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
dataset <- read.csv(
  file = "C:\\Users\\RISHI RAHUL\\Desktop\\DS\\3 MLR\\50_Startups.csv"
)
names(dataset)
## [1] "R.D.Spend"       "Administration"  "Marketing.Spend" "State"          
## [5] "Profit"
# Drop the 4th column ("State"); the remaining columns are kept as a data frame.
dataset <- dataset[, -4, drop = FALSE]
# Put the columns on the search path: the lm() calls in this script that pass
# no `data` argument find Profit, R.D.Spend, etc. through this attach().
attach(dataset)

summary(dataset)
##    R.D.Spend      Administration   Marketing.Spend      Profit      
##  Min.   :     0   Min.   : 51283   Min.   :     0   Min.   : 14681  
##  1st Qu.: 39936   1st Qu.:103731   1st Qu.:129300   1st Qu.: 90139  
##  Median : 73051   Median :122700   Median :212716   Median :107978  
##  Mean   : 73722   Mean   :121345   Mean   :211025   Mean   :112013  
##  3rd Qu.:101603   3rd Qu.:144842   3rd Qu.:299469   3rd Qu.:139766  
##  Max.   :165349   Max.   :182646   Max.   :471784   Max.   :192262
# Plot every column of the data set against the others.
plot(dataset)

# Correlation coefficient matrix: strength and direction of each pairwise
# linear relationship.
cor(dataset, method = "pearson")
##                 R.D.Spend Administration Marketing.Spend    Profit
## R.D.Spend       1.0000000     0.24195525      0.72424813 0.9729005
## Administration  0.2419552     1.00000000     -0.03215388 0.2007166
## Marketing.Spend 0.7242481    -0.03215388      1.00000000 0.7477657
## Profit          0.9729005     0.20071657      0.74776572 1.0000000
# Partial correlation matrix: the "pure" correlation between two variables
# once the remaining variables are accounted for.
# install.packages("corpcor")
library(corpcor)
cor2pcor(m = cor(dataset, method = "pearson"))
##            [,1]        [,2]        [,3]        [,4]
## [1,] 1.00000000  0.20852619  0.03890336  0.93477127
## [2,] 0.20852619  1.00000000 -0.28192506 -0.07725021
## [3,] 0.03890336 -0.28192506  1.00000000  0.23707116
## [4,] 0.93477127 -0.07725021  0.23707116  1.00000000
# Full model: regress Profit on all three remaining columns.
# No `data` argument is given, so the variables are looked up on the search
# path (they are available there via the earlier attach(dataset)).
model <- lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend)
summary(object = model) # R-Squared value : 0.95
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33534  -4795     63   6606  17275 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.012e+04  6.572e+03   7.626 1.06e-09 ***
## R.D.Spend        8.057e-01  4.515e-02  17.846  < 2e-16 ***
## Administration  -2.682e-02  5.103e-02  -0.526    0.602    
## Marketing.Spend  2.723e-02  1.645e-02   1.655    0.105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared:  0.9507, Adjusted R-squared:  0.9475 
## F-statistic:   296 on 3 and 46 DF,  p-value: < 2.2e-16
# Look for influential observations in the full model.
# car is loaded here for the diagnostic plots and vif() used further down.
library(car)
## Warning: package 'car' was built under R version 3.5.1
## Loading required package: carData
# Table of per-observation influence statistics; flagged rows are marked
# with "*" in the `inf` column.
stats::influence.measures(model = model)
## Influence measures of
##   lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend) :
## 
##       dfb.1_  dfb.R.D.  dfb.Admn  dfb.Mr.S    dffit cov.r   cook.d    hat
## 1   0.004431 -0.002040 -0.001828 -4.86e-03 -0.01122 1.248 3.22e-05 0.1249
## 2  -0.062584  0.018541  0.041065  4.46e-02  0.11333 1.236 3.28e-03 0.1235
## 3   0.081857  0.182130 -0.165823  2.39e-02  0.36009 1.119 3.24e-02 0.1103
## 4  -0.005134  0.128991 -0.056777  4.47e-02  0.29500 1.074 2.17e-02 0.0748
## 5  -0.103057 -0.136473  0.150239  1.97e-02 -0.23928 1.178 1.45e-02 0.1103
## 6  -0.069716 -0.097046  0.111396 -1.46e-02 -0.21541 1.129 1.17e-02 0.0782
## 7  -0.010014 -0.091486  0.004568  8.48e-02 -0.10692 1.312 2.92e-03 0.1720
## 8   0.054198 -0.035776 -0.043622 -1.64e-02 -0.11841 1.138 3.57e-03 0.0591
## 9  -0.006926  0.001890  0.006332  2.84e-03  0.01258 1.156 4.04e-05 0.0556
## 10 -0.046927 -0.083594  0.065087  2.18e-02 -0.13850 1.125 4.87e-03 0.0563
## 11  0.124367  0.164282 -0.123463 -1.09e-01  0.24686 1.009 1.51e-02 0.0420
## 12  0.181220  0.143325 -0.193694 -8.15e-02  0.25563 1.074 1.64e-02 0.0647
## 13 -0.010018  0.046846  0.023103  7.05e-03  0.21705 0.948 1.16e-02 0.0242
## 14 -0.039982  0.002448  0.050526  2.34e-02  0.12610 1.069 4.01e-03 0.0278
## 15  0.256593 -0.181700 -0.268033  5.91e-02 -0.51010 0.842 6.13e-02 0.0638
## 16 -0.062230 -0.242883  0.078834  1.10e-01 -0.37839 0.848 3.40e-02 0.0401
## 17 -0.010617 -0.057633  0.025599  8.95e-02  0.18305 1.007 8.34e-03 0.0265
## 18  0.065688  0.024044 -0.071189 -4.93e-02 -0.11408 1.114 3.31e-03 0.0443
## 19 -0.011150  0.004966  0.014323 -3.72e-02 -0.09238 1.100 2.17e-03 0.0308
## 20  0.043989  0.261698  0.035221 -3.47e-01  0.40250 1.252 4.07e-02 0.1822
## 21  0.000102 -0.018343  0.000474  2.74e-02  0.04081 1.133 4.25e-04 0.0397
## 22  0.140627  0.117170 -0.155388 -1.39e-01 -0.20837 1.148 1.10e-02 0.0866
## 23  0.024966  0.061810 -0.028686 -8.15e-02 -0.10712 1.123 2.92e-03 0.0476
## 24 -0.003676  0.017598  0.003252 -2.33e-02 -0.03287 1.151 2.76e-04 0.0529
## 25 -0.109661 -0.087336  0.092409  9.71e-02 -0.14097 1.139 5.05e-03 0.0650
## 26 -0.010284  0.008784  0.044543 -3.78e-02  0.10875 1.101 3.00e-03 0.0356
## 27  0.013462 -0.034455 -0.044048  5.70e-02 -0.11604 1.115 3.42e-03 0.0449
## 28  0.132540  0.239831 -0.132426 -3.05e-01 -0.34604 1.090 2.98e-02 0.0944
## 29 -0.043457 -0.008583  0.060294 -6.42e-03  0.07179 1.250 1.32e-03 0.1292
## 30  0.005453 -0.002942 -0.011993  9.09e-03 -0.02221 1.162 1.26e-04 0.0607
## 31  0.007462  0.006378 -0.004159 -9.90e-03  0.01280 1.154 4.19e-05 0.0542
## 32  0.001163 -0.000866 -0.003027  2.78e-03 -0.00602 1.169 9.27e-06 0.0661
## 33 -0.019167 -0.028418  0.005176  4.45e-02 -0.05268 1.186 7.09e-04 0.0822
## 34 -0.011897  0.006926  0.008144 -5.33e-03 -0.02239 1.126 1.28e-04 0.0317
## 35 -0.173631 -0.193610  0.231928  1.50e-01  0.28895 1.133 2.10e-02 0.0987
## 36  0.123050 -0.034659 -0.103225  2.06e-02  0.16264 1.114 6.70e-03 0.0570
## 37 -0.090124 -0.402279  0.203553  2.91e-01  0.48212 0.896 5.55e-02 0.0690
## 38  0.014721  0.000948 -0.014072 -1.47e-03  0.01601 1.283 6.55e-05 0.1489
## 39  0.369985 -0.154973 -0.312156  8.05e-02  0.47799 1.053 5.62e-02 0.1132
## 40 -0.064018  0.013047  0.051627  3.92e-05 -0.07778 1.155 1.54e-03 0.0624
## 41  0.006168 -0.062763  0.018554  3.59e-02  0.08518 1.133 1.85e-03 0.0487
## 42  0.065951 -0.028739 -0.048818  8.06e-03  0.08804 1.157 1.98e-03 0.0655
## 43  0.014072 -0.010602 -0.007914  2.93e-03  0.02308 1.153 1.36e-04 0.0539
## 44  0.069222 -0.068538  0.051283 -9.75e-02  0.28724 1.056 2.06e-02 0.0664
## 45 -0.004343 -0.005425  0.012671 -5.46e-03  0.02195 1.211 1.23e-04 0.0984
## 46  0.183015 -0.169881  0.078009 -2.10e-01  0.62093 0.844 9.03e-02 0.0862
## 47  0.096322  0.434681 -0.145428 -3.90e-01 -0.46543 1.354 5.45e-02 0.2406
## 48 -0.014007  0.049791 -0.046708  3.73e-02 -0.14485 1.187 5.34e-03 0.0962
## 49 -0.861899 -0.141175  0.693163  4.29e-01 -0.89031 1.093 1.91e-01 0.2180
## 50 -0.409479  0.599022 -0.107675  1.44e-01 -1.27816 0.268 2.88e-01 0.0748
##    inf
## 1     
## 2     
## 3     
## 4     
## 5     
## 6     
## 7    *
## 8     
## 9     
## 10    
## 11    
## 12    
## 13    
## 14    
## 15    
## 16    
## 17    
## 18    
## 19    
## 20    
## 21    
## 22    
## 23    
## 24    
## 25    
## 26    
## 27    
## 28    
## 29    
## 30    
## 31    
## 32    
## 33    
## 34    
## 35    
## 36    
## 37    
## 38   *
## 39    
## 40    
## 41    
## 42    
## 43    
## 44    
## 45    
## 46    
## 47   *
## 48    
## 49   *
## 50   *
# Index plots of the influence diagnostics for the full model.
car::influenceIndexPlot(model)

# Influence plot; it also returns the table of noteworthy observations
# (Studentized residual, hat value, Cook's distance) printed below.
car::influencePlot(model)

##       StudRes        Hat      CookD
## 46  2.0220730 0.08617007 0.09032342
## 47 -0.8268684 0.24060165 0.05453034
## 49 -1.6861241 0.21801940 0.19052744
## 50 -4.4961657 0.07477116 0.28808229
# Variance inflation factors, used to check the predictors for collinearity.
car::vif(model)
##       R.D.Spend  Administration Marketing.Spend 
##        2.468903        1.175091        2.326773
# Refit the full model without the observations flagged as influential
# (rows 46, 47, 49 and 50) to check how much the fit depends on them.
#
# The trailing comma in the subset is essential: `dataset[-c(46, 47, 49, 50)]`
# indexes *columns* of a data frame, and because the data frame has only four
# columns nothing was removed -- the refit silently reproduced the full model
# (same coefficients, still 46 residual degrees of freedom).
# `dataset[-c(46, 47, 49, 50), ]` drops the rows as intended.
test_model <- lm(
  formula = Profit ~ Administration + R.D.Spend + Marketing.Spend,
  data = dataset[-c(46, 47, 49, 50), ]
)
summary(test_model)
# NOTE: the output previously recorded here came from the unfiltered data and
# has been removed; re-run this chunk to regenerate it. With four rows dropped
# the summary should report 42 residual degrees of freedom (assuming no
# missing values in the remaining rows).
# Stepwise model selection by AIC, starting from the full model, to decide
# which predictors the final model should keep.
library(MASS)

MASS::stepAIC(object = model)
## Start:  AIC=916.88
## Profit ~ R.D.Spend + Administration + Marketing.Spend
## 
##                   Df  Sum of Sq        RSS     AIC
## - Administration   1 2.3539e+07 3.9444e+09  915.18
## <none>                          3.9209e+09  916.88
## - Marketing.Spend  1 2.3349e+08 4.1543e+09  917.77
## - R.D.Spend        1 2.7147e+10 3.1068e+10 1018.37
## 
## Step:  AIC=915.18
## Profit ~ R.D.Spend + Marketing.Spend
## 
##                   Df  Sum of Sq        RSS     AIC
## <none>                          3.9444e+09  915.18
## - Marketing.Spend  1 3.1165e+08 4.2560e+09  916.98
## - R.D.Spend        1 3.1149e+10 3.5094e+10 1022.46
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend)
## 
## Coefficients:
##     (Intercept)        R.D.Spend  Marketing.Spend  
##       4.698e+04        7.966e-01        2.991e-02
# Added-variable plots for the predictors of the full model.
car::avPlots(model)

# Final model: keeps the two predictors retained by stepAIC(). Note that
# Marketing.Spend is not significant at the 5 % level in this fit.

Final_Model <- lm(formula = Profit ~ R.D.Spend + Marketing.Spend)
summary(object = Final_Model) # R-squared value : 0.9505
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33645  -4632   -414   6484  17097 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.698e+04  2.690e+03  17.464   <2e-16 ***
## R.D.Spend       7.966e-01  4.135e-02  19.266   <2e-16 ***
## Marketing.Spend 2.991e-02  1.552e-02   1.927     0.06 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9161 on 47 degrees of freedom
## Multiple R-squared:  0.9505, Adjusted R-squared:  0.9483 
## F-statistic: 450.8 on 2 and 47 DF,  p-value: < 2.2e-16
# Evaluate the model's LINE assumptions (presumably Linearity, Independence,
# Normality and Equal variance of the residuals -- confirm the intended acronym).
# Draws the standard diagnostic plots for the fitted lm object.
plot(Final_Model)

# Histogram of the final model's residuals as a visual check of normality.
hist(residuals(Final_Model)) # close to normal distribution

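The final model (Profit ~ R.D.Spend + Marketing.Spend) explains about 95.05 % of the variance in Profit (multiple R-squared = 0.9505). This is an in-sample goodness-of-fit measure, not a prediction accuracy.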