# Assignment 2 - Multiple linear regression on the 50 Startups dataset
# Load the 50 Startups data. The path is machine-specific; adjust it when
# running the script elsewhere.
dataset <- read.csv("C:\\Users\\RISHI RAHUL\\Desktop\\DS\\3 MLR\\50_Startups.csv")
colnames(dataset)
## [1] "R.D.Spend" "Administration" "Marketing.Spend" "State"
## [5] "Profit"
# Drop the State column (it is not used in the regression below). Selecting by
# name instead of by position means a change in the CSV column order cannot
# silently remove the wrong variable.
dataset <- dataset[setdiff(names(dataset), "State")]
# NOTE: attach() is kept so that code referring to the columns by bare name
# keeps working; prefer passing `data = dataset` explicitly in new code.
attach(dataset)
summary(dataset)
## R.D.Spend Administration Marketing.Spend Profit
## Min. : 0 Min. : 51283 Min. : 0 Min. : 14681
## 1st Qu.: 39936 1st Qu.:103731 1st Qu.:129300 1st Qu.: 90139
## Median : 73051 Median :122700 Median :212716 Median :107978
## Mean : 73722 Mean :121345 Mean :211025 Mean :112013
## 3rd Qu.:101603 3rd Qu.:144842 3rd Qu.:299469 3rd Qu.:139766
## Max. :165349 Max. :182646 Max. :471784 Max. :192262
# Scatterplot matrix of every remaining column against every other.
plot(dataset)

# Pairwise correlation matrix: strength and direction of the linear
# relationship between each pair of variables.
cor_matrix <- cor(dataset)
cor_matrix
## R.D.Spend Administration Marketing.Spend Profit
## R.D.Spend 1.0000000 0.24195525 0.72424813 0.9729005
## Administration 0.2419552 1.00000000 -0.03215388 0.2007166
## Marketing.Spend 0.7242481 -0.03215388 1.00000000 0.7477657
## Profit 0.9729005 0.20071657 0.74776572 1.0000000
# Partial correlation matrix: correlation between each pair of variables
# after accounting for the remaining ones.
# install.packages("corpcor")
library(corpcor)
cor2pcor(cor_matrix)
## [,1] [,2] [,3] [,4]
## [1,] 1.00000000 0.20852619 0.03890336 0.93477127
## [2,] 0.20852619 1.00000000 -0.28192506 -0.07725021
## [3,] 0.03890336 -0.28192506 1.00000000 0.23707116
## [4,] 0.93477127 -0.07725021 0.23707116 1.00000000
# The linear model of interest with all the predictor columns.
# `data = dataset` makes the fit self-contained: the variables are looked up
# in the data frame itself rather than through the columns exposed by
# attach(), matching how test_model is fitted further down.
model <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend,
            data = dataset)
summary(model) # R-Squared value : 0.95
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33534 -4795 63 6606 17275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.012e+04 6.572e+03 7.626 1.06e-09 ***
## R.D.Spend 8.057e-01 4.515e-02 17.846 < 2e-16 ***
## Administration -2.682e-02 5.103e-02 -0.526 0.602
## Marketing.Spend 2.723e-02 1.645e-02 1.655 0.105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared: 0.9507, Adjusted R-squared: 0.9475
## F-statistic: 296 on 3 and 46 DF, p-value: < 2.2e-16
# Checking for influential data members
# car is loaded here for the diagnostic helpers called later in the script
# (influenceIndexPlot(), influencePlot(), vif(), avPlots()).
library(car)
## Warning: package 'car' was built under R version 3.5.1
## Loading required package: carData
# Per-observation influence diagnostics for the full model. The recorded
# output lists a dfb.* column per coefficient plus dffit, cov.r, cook.d and
# hat; its `inf` column marks observations 7, 38, 47, 49 and 50 with `*`.
influence.measures(model)
## Influence measures of
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend) :
##
## dfb.1_ dfb.R.D. dfb.Admn dfb.Mr.S dffit cov.r cook.d hat
## 1 0.004431 -0.002040 -0.001828 -4.86e-03 -0.01122 1.248 3.22e-05 0.1249
## 2 -0.062584 0.018541 0.041065 4.46e-02 0.11333 1.236 3.28e-03 0.1235
## 3 0.081857 0.182130 -0.165823 2.39e-02 0.36009 1.119 3.24e-02 0.1103
## 4 -0.005134 0.128991 -0.056777 4.47e-02 0.29500 1.074 2.17e-02 0.0748
## 5 -0.103057 -0.136473 0.150239 1.97e-02 -0.23928 1.178 1.45e-02 0.1103
## 6 -0.069716 -0.097046 0.111396 -1.46e-02 -0.21541 1.129 1.17e-02 0.0782
## 7 -0.010014 -0.091486 0.004568 8.48e-02 -0.10692 1.312 2.92e-03 0.1720
## 8 0.054198 -0.035776 -0.043622 -1.64e-02 -0.11841 1.138 3.57e-03 0.0591
## 9 -0.006926 0.001890 0.006332 2.84e-03 0.01258 1.156 4.04e-05 0.0556
## 10 -0.046927 -0.083594 0.065087 2.18e-02 -0.13850 1.125 4.87e-03 0.0563
## 11 0.124367 0.164282 -0.123463 -1.09e-01 0.24686 1.009 1.51e-02 0.0420
## 12 0.181220 0.143325 -0.193694 -8.15e-02 0.25563 1.074 1.64e-02 0.0647
## 13 -0.010018 0.046846 0.023103 7.05e-03 0.21705 0.948 1.16e-02 0.0242
## 14 -0.039982 0.002448 0.050526 2.34e-02 0.12610 1.069 4.01e-03 0.0278
## 15 0.256593 -0.181700 -0.268033 5.91e-02 -0.51010 0.842 6.13e-02 0.0638
## 16 -0.062230 -0.242883 0.078834 1.10e-01 -0.37839 0.848 3.40e-02 0.0401
## 17 -0.010617 -0.057633 0.025599 8.95e-02 0.18305 1.007 8.34e-03 0.0265
## 18 0.065688 0.024044 -0.071189 -4.93e-02 -0.11408 1.114 3.31e-03 0.0443
## 19 -0.011150 0.004966 0.014323 -3.72e-02 -0.09238 1.100 2.17e-03 0.0308
## 20 0.043989 0.261698 0.035221 -3.47e-01 0.40250 1.252 4.07e-02 0.1822
## 21 0.000102 -0.018343 0.000474 2.74e-02 0.04081 1.133 4.25e-04 0.0397
## 22 0.140627 0.117170 -0.155388 -1.39e-01 -0.20837 1.148 1.10e-02 0.0866
## 23 0.024966 0.061810 -0.028686 -8.15e-02 -0.10712 1.123 2.92e-03 0.0476
## 24 -0.003676 0.017598 0.003252 -2.33e-02 -0.03287 1.151 2.76e-04 0.0529
## 25 -0.109661 -0.087336 0.092409 9.71e-02 -0.14097 1.139 5.05e-03 0.0650
## 26 -0.010284 0.008784 0.044543 -3.78e-02 0.10875 1.101 3.00e-03 0.0356
## 27 0.013462 -0.034455 -0.044048 5.70e-02 -0.11604 1.115 3.42e-03 0.0449
## 28 0.132540 0.239831 -0.132426 -3.05e-01 -0.34604 1.090 2.98e-02 0.0944
## 29 -0.043457 -0.008583 0.060294 -6.42e-03 0.07179 1.250 1.32e-03 0.1292
## 30 0.005453 -0.002942 -0.011993 9.09e-03 -0.02221 1.162 1.26e-04 0.0607
## 31 0.007462 0.006378 -0.004159 -9.90e-03 0.01280 1.154 4.19e-05 0.0542
## 32 0.001163 -0.000866 -0.003027 2.78e-03 -0.00602 1.169 9.27e-06 0.0661
## 33 -0.019167 -0.028418 0.005176 4.45e-02 -0.05268 1.186 7.09e-04 0.0822
## 34 -0.011897 0.006926 0.008144 -5.33e-03 -0.02239 1.126 1.28e-04 0.0317
## 35 -0.173631 -0.193610 0.231928 1.50e-01 0.28895 1.133 2.10e-02 0.0987
## 36 0.123050 -0.034659 -0.103225 2.06e-02 0.16264 1.114 6.70e-03 0.0570
## 37 -0.090124 -0.402279 0.203553 2.91e-01 0.48212 0.896 5.55e-02 0.0690
## 38 0.014721 0.000948 -0.014072 -1.47e-03 0.01601 1.283 6.55e-05 0.1489
## 39 0.369985 -0.154973 -0.312156 8.05e-02 0.47799 1.053 5.62e-02 0.1132
## 40 -0.064018 0.013047 0.051627 3.92e-05 -0.07778 1.155 1.54e-03 0.0624
## 41 0.006168 -0.062763 0.018554 3.59e-02 0.08518 1.133 1.85e-03 0.0487
## 42 0.065951 -0.028739 -0.048818 8.06e-03 0.08804 1.157 1.98e-03 0.0655
## 43 0.014072 -0.010602 -0.007914 2.93e-03 0.02308 1.153 1.36e-04 0.0539
## 44 0.069222 -0.068538 0.051283 -9.75e-02 0.28724 1.056 2.06e-02 0.0664
## 45 -0.004343 -0.005425 0.012671 -5.46e-03 0.02195 1.211 1.23e-04 0.0984
## 46 0.183015 -0.169881 0.078009 -2.10e-01 0.62093 0.844 9.03e-02 0.0862
## 47 0.096322 0.434681 -0.145428 -3.90e-01 -0.46543 1.354 5.45e-02 0.2406
## 48 -0.014007 0.049791 -0.046708 3.73e-02 -0.14485 1.187 5.34e-03 0.0962
## 49 -0.861899 -0.141175 0.693163 4.29e-01 -0.89031 1.093 1.91e-01 0.2180
## 50 -0.409479 0.599022 -0.107675 1.44e-01 -1.27816 0.268 2.88e-01 0.0748
## inf
## 1
## 2
## 3
## 4
## 5
## 6
## 7 *
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23
## 24
## 25
## 26
## 27
## 28
## 29
## 30
## 31
## 32
## 33
## 34
## 35
## 36
## 37
## 38 *
## 39
## 40
## 41
## 42
## 43
## 44
## 45
## 46
## 47 *
## 48
## 49 *
## 50 *
# Index plots of the influence diagnostics, one point per observation.
influenceIndexPlot(model)

# Influence plot for the full model. The table it prints (recorded below)
# gives the studentized residual, hat value and Cook's distance of the
# observations it singles out: 46, 47, 49 and 50.
influencePlot(model)

## StudRes Hat CookD
## 46 2.0220730 0.08617007 0.09032342
## 47 -0.8268684 0.24060165 0.05453034
## 49 -1.6861241 0.21801940 0.19052744
## 50 -4.4961657 0.07477116 0.28808229
# Variance inflation factors, to check the predictors for multicollinearity.
vif(model)
## R.D.Spend Administration Marketing.Spend
## 2.468903 1.175091 2.326773
# Preparing a test model: refit the full model without the observations
# singled out by influencePlot() (rows 46, 47, 49 and 50) to see how much
# they drive the fit.
# The trailing comma is essential. Without it, `dataset[-c(46, 47, 49, 50)]`
# indexes COLUMNS; the data frame has only four, so nothing is removed and the
# "test" model is identical to `model`.
test_model <- lm(Profit ~ Administration + R.D.Spend + Marketing.Spend,
                 data = dataset[-c(46, 47, 49, 50), ])
# NOTE: the output recorded below was produced before this row-index fix (it
# matches the full model exactly, 46 residual DF). Re-run to refresh it; with
# four rows removed the fit should report 42 residual degrees of freedom.
summary(test_model)
##
## Call:
## lm(formula = Profit ~ Administration + R.D.Spend + Marketing.Spend,
## data = dataset[-c(46, 47, 49, 50)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -33534 -4795 63 6606 17275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.012e+04 6.572e+03 7.626 1.06e-09 ***
## Administration -2.682e-02 5.103e-02 -0.526 0.602
## R.D.Spend 8.057e-01 4.515e-02 17.846 < 2e-16 ***
## Marketing.Spend 2.723e-02 1.645e-02 1.655 0.105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared: 0.9507, Adjusted R-squared: 0.9475
## F-statistic: 296 on 3 and 46 DF, p-value: < 2.2e-16
# Analysing the final model combination
library(MASS)
# Stepwise selection by AIC, starting from the full model. In the recorded
# trace, dropping Administration lowers the AIC from 916.88 to 915.18 and no
# further removal improves it, leaving Profit ~ R.D.Spend + Marketing.Spend.
stepAIC(model)
## Start: AIC=916.88
## Profit ~ R.D.Spend + Administration + Marketing.Spend
##
## Df Sum of Sq RSS AIC
## - Administration 1 2.3539e+07 3.9444e+09 915.18
## <none> 3.9209e+09 916.88
## - Marketing.Spend 1 2.3349e+08 4.1543e+09 917.77
## - R.D.Spend 1 2.7147e+10 3.1068e+10 1018.37
##
## Step: AIC=915.18
## Profit ~ R.D.Spend + Marketing.Spend
##
## Df Sum of Sq RSS AIC
## <none> 3.9444e+09 915.18
## - Marketing.Spend 1 3.1165e+08 4.2560e+09 916.98
## - R.D.Spend 1 3.1149e+10 3.5094e+10 1022.46
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend)
##
## Coefficients:
## (Intercept) R.D.Spend Marketing.Spend
## 4.698e+04 7.966e-01 2.991e-02
# Added-variable plots for the full model, one panel per predictor.
avPlots(model)

# Final model as selected by stepAIC(). Note that Marketing.Spend is only
# marginally significant in the recorded summary (p = 0.06).
# `data = dataset` keeps the fit independent of attach().
Final_Model <- lm(Profit ~ R.D.Spend + Marketing.Spend, data = dataset)
summary(Final_Model) # R-squared value : 0.9505
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33645 -4632 -414 6484 17097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.698e+04 2.690e+03 17.464 <2e-16 ***
## R.D.Spend 7.966e-01 4.135e-02 19.266 <2e-16 ***
## Marketing.Spend 2.991e-02 1.552e-02 1.927 0.06 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9161 on 47 degrees of freedom
## Multiple R-squared: 0.9505, Adjusted R-squared: 0.9483
## F-statistic: 450.8 on 2 and 47 DF, p-value: < 2.2e-16
# Evaluate model LINE assumptions
# (Linearity, Independence, Normality of residuals, Equal variance).
# plot() on the fitted lm object draws its standard residual diagnostic plots.
plot(Final_Model)




# Histogram of the final model's residuals as a visual normality check.
hist(residuals(Final_Model)) # close to normal distribution
