Multiple Linear Regression Class Assignment (Karthick Sothivelr)
# Read the data to analyze (CSV file).
# stringsAsFactors = TRUE keeps State as a factor on R >= 4.0.0, where the
# read.csv() default changed to FALSE. The factor-based steps further down
# (levels(State), pairs(startup)) need State to be a factor.
startup <- read.csv(
  "C:\\Users\\samy_\\Desktop\\Data Science Class Materials\\50_Startups.csv",
  stringsAsFactors = TRUE
)
# NOTE(review): attach() is generally discouraged, but it is kept here because
# later statements refer to the columns (Profit, State, ...) by bare name.
attach(startup)
# Understanding the Data
# View() opens the spreadsheet-style data viewer, which is only meaningful in
# an interactive session; guard it so the script also runs non-interactively
# (e.g. via Rscript).
if (interactive()) {
  View(startup)
}
dim(startup) # Check the dimension of the data
## [1] 50 5
colnames(startup) # Check the column names
## [1] "R.D.Spend" "Administration" "Marketing.Spend" "State"
## [5] "Profit"
class(startup) # Check the type of data
## [1] "data.frame"
str(startup) # Check the structure of data
## 'data.frame': 50 obs. of 5 variables:
## $ R.D.Spend : num 165349 162598 153442 144372 142107 ...
## $ Administration : num 136898 151378 101146 118672 91392 ...
## $ Marketing.Spend: num 471784 443899 407935 383200 366168 ...
## $ State : Factor w/ 3 levels "California","Florida",..: 3 1 2 3 2 3 1 2 3 1 ...
## $ Profit : num 192262 191792 191050 182902 166188 ...
### State is the only categorical (nominal) column: a factor with 3 levels
levels(startup$State) # Distinct levels of the categorical variable
## [1] "California" "Florida" "New York"
summary(startup$State) # Record count for each level
## California Florida New York
## 17 16 17
# Statistical Summary of the Data
summary(startup)
## R.D.Spend Administration Marketing.Spend State
## Min. : 0 Min. : 51283 Min. : 0 California:17
## 1st Qu.: 39936 1st Qu.:103731 1st Qu.:129300 Florida :16
## Median : 73051 Median :122700 Median :212716 New York :17
## Mean : 73722 Mean :121345 Mean :211025
## 3rd Qu.:101603 3rd Qu.:144842 3rd Qu.:299469
## Max. :165349 Max. :182646 Max. :471784
## Profit
## Min. : 14681
## 1st Qu.: 90139
## Median :107978
## Mean :112013
## 3rd Qu.:139766
## Max. :192262
library(psych)
describe(startup)
## vars n mean sd median trimmed mad
## R.D.Spend 1 50 73721.62 45902.26 73051.08 72883.87 47053.05
## Administration 2 50 121344.64 28017.80 122699.79 123127.46 31868.81
## Marketing.Spend 3 50 211025.10 122290.31 212716.24 211200.78 128222.13
## State* 4 50 2.00 0.83 2.00 2.00 1.48
## Profit 5 50 112012.64 40306.18 107978.19 111727.66 39346.94
## min max range skew kurtosis se
## R.D.Spend 0.00 165349.2 165349.2 0.15 -0.89 6491.56
## Administration 51283.14 182645.6 131362.4 -0.46 -0.04 3962.32
## Marketing.Spend 0.00 471784.1 471784.1 -0.04 -0.81 17294.46
## State* 1.00 3.0 2.0 0.00 -1.59 0.12
## Profit 14681.40 192261.8 177580.4 0.02 -0.29 5700.15
~~~~
Multiple Regression Analysis (Y = Profit, X = R.D.Spend, Administration, Marketing.Spend, State)
~~~~
# Scatterplot Matrices
pairs(startup) # From the plots, it can be seen that there is moderate correlation between R.D.Spend and Marketing.Spend

# Correlation
# Correlation matrix of the numeric columns only. Columns are selected by type
# rather than by position (previously a hard-coded `-4`), so the categorical
# State column is excluded even if the column order of the CSV changes.
cor(startup[, vapply(startup, is.numeric, logical(1)), drop = FALSE])
## R.D.Spend Administration Marketing.Spend Profit
## R.D.Spend 1.0000000 0.24195525 0.72424813 0.9729005
## Administration 0.2419552 1.00000000 -0.03215388 0.2007166
## Marketing.Spend 0.7242481 -0.03215388 1.00000000 0.7477657
## Profit 0.9729005 0.20071657 0.74776572 1.0000000
# Correlation Coefficient between R.D.Spend and Marketing.Spend is 0.724 (Moderate +ve Correlation)
# Partial correlation matrix
library(corpcor)
# Numeric columns are selected by type instead of the hard-coded `-4` index.
# The printed result is unlabelled; rows/columns presumably follow the order of
# the numeric columns (R.D.Spend, Administration, Marketing.Spend, Profit).
cor2pcor(cor(startup[, vapply(startup, is.numeric, logical(1)), drop = FALSE]))
## [,1] [,2] [,3] [,4]
## [1,] 1.00000000 0.20852619 0.03890336 0.93477127
## [2,] 0.20852619 1.00000000 -0.28192506 -0.07725021
## [3,] 0.03890336 -0.28192506 1.00000000 0.23707116
## [4,] 0.93477127 -0.07725021 0.23707116 1.00000000
# Multiple regression using all the variables
m1 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State)
summary(m1) # Three predictors are not significant at the 5% level: Administration, Marketing.Spend, and State (both dummy terms)
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend +
## State)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33504 -4736 90 6672 17338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.013e+04 6.885e+03 7.281 4.44e-09 ***
## R.D.Spend 8.060e-01 4.641e-02 17.369 < 2e-16 ***
## Administration -2.700e-02 5.223e-02 -0.517 0.608
## Marketing.Spend 2.698e-02 1.714e-02 1.574 0.123
## StateFlorida 1.988e+02 3.371e+03 0.059 0.953
## StateNew York -4.189e+01 3.256e+03 -0.013 0.990
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9439 on 44 degrees of freedom
## Multiple R-squared: 0.9508, Adjusted R-squared: 0.9452
## F-statistic: 169.9 on 5 and 44 DF, p-value: < 2.2e-16
Analyze Administration, Marketing.Spend, and State separately (and in combination, without R.D.Spend)
## Administration
m_admin <- lm(Profit ~ Administration)
summary(m_admin) # The parameter for Administration is still insignificant (p = 0.162)
##
## Call:
## lm(formula = Profit ~ Administration)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96072 -23426 -3564 25438 84870
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.697e+04 2.532e+04 3.040 0.00382 **
## Administration 2.887e-01 2.034e-01 1.419 0.16222
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39900 on 48 degrees of freedom
## Multiple R-squared: 0.04029, Adjusted R-squared: 0.02029
## F-statistic: 2.015 on 1 and 48 DF, p-value: 0.1622
## Marketing.Spend
m_market <- lm(Profit ~ Marketing.Spend)
summary(m_market) # The parameter for Marketing.Spend becomes significant
##
## Call:
## lm(formula = Profit ~ Marketing.Spend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83739 -18802 4925 15879 64642
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.000e+04 7.685e+03 7.808 4.29e-10 ***
## Marketing.Spend 2.465e-01 3.159e-02 7.803 4.38e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27040 on 48 degrees of freedom
## Multiple R-squared: 0.5592, Adjusted R-squared: 0.55
## F-statistic: 60.88 on 1 and 48 DF, p-value: 4.381e-10
## State
m_state <- lm(Profit ~ State)
summary(m_state) # The parameter for State is still insignificant
##
## Call:
## lm(formula = Profit ~ State)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89224 -22673 -6835 26283 87887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 103905 9862 10.536 5.77e-14 ***
## StateFlorida 14869 14163 1.050 0.299
## StateNew York 9851 13946 0.706 0.483
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40660 on 47 degrees of freedom
## Multiple R-squared: 0.02388, Adjusted R-squared: -0.01766
## F-statistic: 0.5748 on 2 and 47 DF, p-value: 0.5667
## Administration + Marketing.Spend
m_am <- lm(Profit ~ Administration + Marketing.Spend)
summary(m_am) # Parameters for Administration and Marketing.Spend both become significant
##
## Call:
## lm(formula = Profit ~ Administration + Marketing.Spend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82155 -12168 2836 13650 56472
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.022e+04 1.770e+04 1.143 0.2589
## Administration 3.237e-01 1.312e-01 2.468 0.0173 *
## Marketing.Spend 2.488e-01 3.005e-02 8.281 9.73e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25710 on 47 degrees of freedom
## Multiple R-squared: 0.6097, Adjusted R-squared: 0.5931
## F-statistic: 36.71 on 2 and 47 DF, p-value: 2.496e-10
## Administration + State
m_as <- lm(Profit ~ Administration + State)
summary(m_as) # Parameters for Administration and State are insignificant
##
## Call:
## lm(formula = Profit ~ Administration + State)
##
## Residuals:
## Min 1Q Median 3Q Max
## -88150 -24212 -6167 25357 79147
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.945e+04 2.665e+04 2.607 0.0123 *
## Administration 2.853e-01 2.053e-01 1.390 0.1713
## StateFlorida 1.458e+04 1.403e+04 1.039 0.3041
## StateNew York 9.624e+03 1.381e+04 0.697 0.4894
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40260 on 46 degrees of freedom
## Multiple R-squared: 0.0632, Adjusted R-squared: 0.002109
## F-statistic: 1.035 on 3 and 46 DF, p-value: 0.3862
## Marketing.Spend + State
m_ms <- lm(Profit ~ Marketing.Spend + State)
summary(m_ms) # Only parameter for Marketing.Spend is significant
##
## Call:
## lm(formula = Profit ~ Marketing.Spend + State)
##
## Residuals:
## Min 1Q Median 3Q Max
## -81629 -17491 2486 16278 65811
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.864e+04 8.984e+03 6.527 4.67e-08 ***
## Marketing.Spend 2.480e-01 3.295e-02 7.525 1.49e-09 ***
## StateFlorida -1.195e+03 9.819e+03 -0.122 0.904
## StateNew York 4.197e+03 9.468e+03 0.443 0.660
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27520 on 46 degrees of freedom
## Multiple R-squared: 0.5625, Adjusted R-squared: 0.534
## F-statistic: 19.71 on 3 and 46 DF, p-value: 2.317e-08
## Administration + Marketing.Spend + State
m_ams <- lm(Profit ~ Administration + Marketing.Spend + State)
summary(m_ams) # Parameters for Administration and Marketing.Spend are significant; the State terms are not
##
## Call:
## lm(formula = Profit ~ Administration + Marketing.Spend + State)
##
## Residuals:
## Min 1Q Median 3Q Max
## -79838 -12006 1497 12027 57394
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.903e+04 1.843e+04 1.033 0.3072
## Administration 3.239e-01 1.335e-01 2.426 0.0193 *
## Marketing.Spend 2.507e-01 3.135e-02 7.997 3.48e-10 ***
## StateFlorida -1.704e+03 9.338e+03 -0.182 0.8561
## StateNew York 3.876e+03 9.003e+03 0.431 0.6689
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26160 on 45 degrees of freedom
## Multiple R-squared: 0.6131, Adjusted R-squared: 0.5787
## F-statistic: 17.83 on 4 and 45 DF, p-value: 7.778e-09
## There is no collinearity problem between Administration and Marketing.Spend
## The input variable State is not significant: it explains almost none of the variation in Profit
# Deletion Diagnostics for Identifying Influential Observations
influence.measures(m1) # There are 3 influential observations
## Influence measures of
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + State) :
##
## dfb.1_ dfb.R.D. dfb.Admn dfb.Mr.S dfb.SttF dfb.StNY dffit
## 1 0.00263 -0.000680 -0.00108 -0.002943 0.001137 -0.002293 -0.00671
## 2 -0.04961 0.020841 0.04627 0.058280 -0.082437 -0.070955 0.15256
## 3 0.08011 0.197811 -0.17477 -0.013702 0.153248 -0.028648 0.40914
## 4 -0.03595 0.110000 -0.04970 0.073783 -0.047849 0.163879 0.37650
## 5 -0.10434 -0.153851 0.16342 0.047737 -0.115870 0.017478 -0.28836
## 6 -0.04629 -0.081405 0.10566 -0.033055 0.027220 -0.115816 -0.26364
## 7 -0.02130 -0.095187 0.00468 0.082074 0.037236 0.050562 -0.12099
## 8 0.05670 -0.045130 -0.04118 0.001538 -0.079214 0.007763 -0.15671
## 9 -0.01048 0.000992 0.00845 0.005091 -0.002030 0.010515 0.02068
## 10 -0.07333 -0.087676 0.06482 0.009105 0.113458 0.114051 -0.19312
## 11 0.11698 0.186811 -0.13740 -0.159583 0.227283 -0.009802 0.36595
## 12 0.22674 0.152695 -0.19750 -0.063320 -0.180498 -0.191722 0.34058
## 13 -0.01749 0.069420 0.01009 -0.048411 0.246939 -0.005732 0.36117
## 14 -0.00754 0.005568 0.05424 0.040526 -0.144106 -0.139683 0.20968
## 15 0.26742 -0.221204 -0.25724 0.142195 -0.359607 0.022591 -0.67871
## 16 -0.00258 -0.208289 0.06663 0.071114 0.021586 -0.300602 -0.52394
## 17 0.03723 -0.055403 0.02986 0.116725 -0.217604 -0.202959 0.30836
## 18 0.08149 0.035255 -0.07538 -0.061166 0.013546 -0.089993 -0.15985
## 19 -0.00866 -0.003027 0.01978 -0.017913 -0.089991 0.002396 -0.14249
## 20 0.00949 0.252210 0.03934 -0.342025 0.056267 0.163205 0.44287
## 21 0.00920 -0.018937 0.00127 0.034093 -0.043026 -0.038229 0.06425
## 22 0.16246 0.134008 -0.16322 -0.156986 0.024662 -0.123303 -0.26328
## 23 0.02844 0.057093 -0.02567 -0.065599 -0.080087 -0.001184 -0.14418
## 24 -0.00300 0.017345 0.00492 -0.019954 -0.025200 -0.000520 -0.04652
## 25 -0.08800 -0.077455 0.08926 0.087987 -0.013268 -0.093171 -0.17298
## 26 0.01269 0.012139 0.04637 -0.028528 -0.091010 -0.099577 0.15683
## 27 0.01876 -0.047078 -0.04093 0.086363 -0.120147 -0.002104 -0.18499
## 28 0.16992 0.271462 -0.14611 -0.339679 0.054174 -0.198665 -0.44228
## 29 -0.04142 -0.004892 0.05451 -0.014043 0.037581 0.001794 0.07940
## 30 0.00788 -0.001210 -0.01152 0.007045 -0.001871 -0.014675 -0.02593
## 31 0.00452 0.004911 -0.00318 -0.008237 0.008083 0.000222 0.01251
## 32 0.00132 -0.000298 -0.00210 0.001610 -0.000431 -0.002779 -0.00500
## 33 -0.02577 -0.029851 0.00520 0.042128 0.023004 0.031218 -0.06200
## 34 -0.01226 0.005720 0.01076 -0.000214 -0.026781 -0.001052 -0.03977
## 35 -0.13840 -0.196836 0.24231 0.173390 -0.173841 -0.153489 0.35397
## 36 0.09973 -0.049889 -0.10222 0.034443 0.001307 0.126044 0.22205
## 37 -0.10755 -0.379353 0.18952 0.218405 0.334034 0.033735 0.61014
## 38 0.01840 0.001322 -0.01609 -0.000850 -0.007621 -0.008050 0.02054
## 39 0.32020 -0.189819 -0.31345 0.109261 0.010955 0.273511 0.58408
## 40 -0.07434 0.011576 0.05092 -0.005414 0.049682 0.051171 -0.09721
## 41 0.02107 -0.063137 0.02014 0.044268 -0.066547 -0.063116 0.11560
## 42 0.05957 -0.022659 -0.05101 -0.006525 0.066841 0.005162 0.11358
## 43 0.01852 -0.010791 -0.00820 0.004902 -0.016530 -0.016757 0.03122
## 44 0.02429 -0.090858 0.05891 -0.085052 0.047181 0.220181 0.37544
## 45 -0.00188 -0.005257 0.01339 -0.004732 -0.009633 -0.011309 0.02616
## 46 0.09538 -0.212843 0.09139 -0.189969 0.108210 0.428337 0.77900
## 47 0.10683 0.434369 -0.14265 -0.364064 -0.144471 -0.025892 -0.50219
## 48 -0.02930 0.046774 -0.04694 0.031389 0.056945 0.068742 -0.16247
## 49 -0.78383 -0.112734 0.70160 0.418630 -0.124090 -0.373999 -0.98871
## 50 -0.56603 0.578956 -0.11423 0.080954 0.626360 0.703325 -1.50721
## cov.r cook.d hat inf
## 1 1.384 7.68e-06 0.1705
## 2 1.395 3.96e-03 0.1884
## 3 1.165 2.79e-02 0.1424
## 4 1.113 2.36e-02 0.1140
## 5 1.259 1.40e-02 0.1471
## 6 1.207 1.17e-02 0.1163
## 7 1.445 2.49e-03 0.2119 *
## 8 1.226 4.17e-03 0.0942
## 9 1.269 7.29e-05 0.0958
## 10 1.227 6.31e-03 0.1052
## 11 1.046 2.21e-02 0.0897
## 12 1.127 1.94e-02 0.1080
## 13 0.954 2.13e-02 0.0657
## 14 1.144 7.40e-03 0.0721
## 15 0.761 7.20e-02 0.1055
## 16 0.790 4.34e-02 0.0754
## 17 1.040 1.58e-02 0.0708
## 18 1.211 4.33e-03 0.0868
## 19 1.183 3.44e-03 0.0671
## 20 1.316 3.29e-02 0.2118
## 21 1.249 7.03e-04 0.0866
## 22 1.248 1.17e-02 0.1350
## 23 1.206 3.53e-03 0.0795
## 24 1.251 3.69e-04 0.0854
## 25 1.225 5.07e-03 0.0980
## 26 1.181 4.16e-03 0.0709
## 27 1.221 5.79e-03 0.0993
## 28 1.151 3.25e-02 0.1471
## 29 1.395 1.07e-03 0.1802
## 30 1.271 1.15e-04 0.0973
## 31 1.300 2.67e-05 0.1169
## 32 1.279 4.26e-06 0.1026
## 33 1.289 6.55e-04 0.1131
## 34 1.238 2.70e-04 0.0750
## 35 1.197 2.10e-02 0.1389
## 36 1.196 8.32e-03 0.0985
## 37 0.850 5.92e-02 0.1079
## 38 1.408 7.19e-05 0.1851
## 39 1.060 5.58e-02 0.1570
## 40 1.257 1.61e-03 0.0975
## 41 1.228 2.27e-03 0.0842
## 42 1.278 2.20e-03 0.1140
## 43 1.256 1.66e-04 0.0870
## 44 1.093 2.34e-02 0.1069
## 45 1.312 1.17e-04 0.1259
## 46 0.758 9.44e-02 0.1277
## 47 1.419 4.23e-02 0.2654 *
## 48 1.272 4.48e-03 0.1211
## 49 1.051 1.56e-01 0.2559
## 50 0.128 2.64e-01 0.1015 *
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
influenceIndexPlot(m1) # Influential Observations (50, 49, 46)

influencePlot(m1) # Influential observations (50, 49, 46, 47)

## StudRes Hat CookD
## 46 2.0357210 0.1277290 0.09439478
## 47 -0.8354542 0.2654200 0.04232333
## 49 -1.6860294 0.2558868 0.15637613
## 50 -4.4845939 0.1014896 0.26395944
# Regression Analysis after removing the 50th Observation
m2 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State, data = startup[-50,])
summary(m2) # Only parameter for R.D.Spend is significant
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend +
## State, data = startup[-50, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -16374 -4572 -1225 5304 15402
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.338e+04 5.794e+03 9.212 9.79e-12 ***
## R.D.Spend 7.836e-01 3.907e-02 20.056 < 2e-16 ***
## Administration -2.202e-02 4.363e-02 -0.505 0.6163
## Marketing.Spend 2.582e-02 1.432e-02 1.804 0.0783 .
## StateFlorida -1.564e+03 2.842e+03 -0.550 0.5849
## StateNew York -1.954e+03 2.752e+03 -0.710 0.4815
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7881 on 43 degrees of freedom
## Multiple R-squared: 0.9618, Adjusted R-squared: 0.9574
## F-statistic: 216.6 on 5 and 43 DF, p-value: < 2.2e-16
# Regression Analysis after removing 50th and 49th Observations
m3 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State, data = startup[-c(50,49),])
summary(m3) # Only parameter for R.D.Spend is significant
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend +
## State, data = startup[-c(50, 49), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -16207 -4834 -1653 5728 14153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.939e+04 6.094e+03 9.746 2.4e-12 ***
## R.D.Spend 7.888e-01 3.728e-02 21.160 < 2e-16 ***
## Administration -6.214e-02 4.499e-02 -1.381 0.174
## Marketing.Spend 1.791e-02 1.405e-02 1.275 0.209
## StateFlorida -1.150e+03 2.713e+03 -0.424 0.674
## StateNew York -6.658e+02 2.679e+03 -0.249 0.805
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7506 on 42 degrees of freedom
## Multiple R-squared: 0.9628, Adjusted R-squared: 0.9584
## F-statistic: 217.6 on 5 and 42 DF, p-value: < 2.2e-16
# Regression Analysis after removing 50th, 49th, and 46th observations
m4 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State, data = startup[-c(50, 49, 46),])
summary(m4) # Only parameter for R.D.Spend is significant
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend +
## State, data = startup[-c(50, 49, 46), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15636.4 -4569.8 -975.6 5112.2 14684.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.812e+04 5.935e+03 9.793 2.7e-12 ***
## R.D.Spend 7.961e-01 3.628e-02 21.942 < 2e-16 ***
## Administration -6.116e-02 4.356e-02 -1.404 0.168
## Marketing.Spend 2.129e-02 1.371e-02 1.553 0.128
## StateFlorida -1.429e+03 2.630e+03 -0.543 0.590
## StateNew York -1.812e+03 2.659e+03 -0.682 0.499
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7267 on 41 degrees of freedom
## Multiple R-squared: 0.9645, Adjusted R-squared: 0.9602
## F-statistic: 223 on 5 and 41 DF, p-value: < 2.2e-16
# Regression Analysis after removing 50th, 49th, 47th, and 46th observations
m5 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State, data = startup[-c(50, 49, 46, 47),])
summary(m5) # Only parameter for R.D.Spend is significant
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend +
## State, data = startup[-c(50, 49, 46, 47), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15580.6 -5247.7 -627.4 5388.7 13120.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.726e+04 5.944e+03 9.634 5.6e-12 ***
## R.D.Spend 7.730e-01 4.077e-02 18.962 < 2e-16 ***
## Administration -5.204e-02 4.395e-02 -1.184 0.2435
## Marketing.Spend 2.831e-02 1.481e-02 1.911 0.0631 .
## StateFlorida -9.124e+02 2.649e+03 -0.344 0.7323
## StateNew York -1.745e+03 2.644e+03 -0.660 0.5131
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7225 on 40 degrees of freedom
## Multiple R-squared: 0.963, Adjusted R-squared: 0.9584
## F-statistic: 208.2 on 5 and 40 DF, p-value: < 2.2e-16
# Variance Inflation Factor to check the collinearity between variables
vif(m1) ## Rule of thumb: a VIF above 10 signals problematic collinearity for that term; every value reported below is well under 10
## GVIF Df GVIF^(1/(2*Df))
## R.D.Spend 2.495511 1 1.579719
## Administration 1.177766 1 1.085249
## Marketing.Spend 2.416797 1 1.554605
## State 1.062673 2 1.015313
# Added Variable Plot to check the correlation between the input variables and output variable
avPlots(m1) # State has no correlation with the output variable

### The analyses performed above indicate that the variable 'State' can be dropped from the model
~~~~
Final Model
~~~~
# Model without State
final_model1 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend)
summary(final_model1) # Parameters for Administration and Marketing.Spend are still insignificant
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33534 -4795 63 6606 17275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.012e+04 6.572e+03 7.626 1.06e-09 ***
## R.D.Spend 8.057e-01 4.515e-02 17.846 < 2e-16 ***
## Administration -2.682e-02 5.103e-02 -0.526 0.602
## Marketing.Spend 2.723e-02 1.645e-02 1.655 0.105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared: 0.9507, Adjusted R-squared: 0.9475
## F-statistic: 296 on 3 and 46 DF, p-value: < 2.2e-16
# Model without State and Administration
final_model2 <- lm(Profit ~ R.D.Spend + Marketing.Spend)
summary(final_model2) # Marketing.Spend is only marginally significant (p = 0.06): significant at the 10% level but not at the 5% level
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33645 -4632 -414 6484 17097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.698e+04 2.690e+03 17.464 <2e-16 ***
## R.D.Spend 7.966e-01 4.135e-02 19.266 <2e-16 ***
## Marketing.Spend 2.991e-02 1.552e-02 1.927 0.06 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9161 on 47 degrees of freedom
## Multiple R-squared: 0.9505, Adjusted R-squared: 0.9483
## F-statistic: 450.8 on 2 and 47 DF, p-value: < 2.2e-16
# Model without State, Administration and Marketing.Spend
final_model3 <- lm(Profit ~ R.D.Spend)
summary(final_model3)
##
## Call:
## lm(formula = Profit ~ R.D.Spend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34351 -4626 -375 6249 17188
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.903e+04 2.538e+03 19.32 <2e-16 ***
## R.D.Spend 8.543e-01 2.931e-02 29.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9416 on 48 degrees of freedom
## Multiple R-squared: 0.9465, Adjusted R-squared: 0.9454
## F-statistic: 849.8 on 1 and 48 DF, p-value: < 2.2e-16
# Display the fit info (Evaluate Model LINE Assumptions)
par(mfrow = c(2,2))
plot(final_model3)

# Residuals of the Final Model
residuals(final_model3)
## 1 2 3 4 5
## 1972.53611 3853.34881 10933.73292 10532.98679 -4246.03345
## 6 7 8 9 10
## -4703.07684 -7911.21501 -4592.86725 200.43619 -4636.86286
## 11 12 13 14 15
## 10025.58603 9223.31414 12365.62919 6686.14589 -18896.72408
## 16 17 18 19 20
## -16952.39093 11314.10417 -4527.32413 -3146.51482 -83.64313
## 21 22 23 24 25
## 4298.11626 -4687.32693 -1893.56325 2008.63322 -6298.89207
## 26 27 28 29 30
## 3128.93710 -7652.16276 -5625.48960 -2177.76271 -4074.45459
## 31 32 33 34 35
## -2056.65845 -3777.62103 -5774.70108 338.01823 8018.50987
## 36 37 38 39 40
## 8137.23063 17188.08803 3267.66286 14914.19668 -967.34151
## 41 42 43 44 45
## 4642.43486 4937.24999 2269.34836 7479.66952 -2778.82105
## 46 47 48 49 50
## 15038.69300 -665.93527 -6473.16914 -13822.55778 -34351.49914
# QQ plot of studentized residuals helps in identifying outliers
par(mfrow = c(1,1))
qqPlot(final_model3)

## [1] 15 50
# Confidence Interval for parameters
confint(final_model3, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 4.393012e+04 5.413568e+04
## R.D.Spend 7.953685e-01 9.132142e-01
# Model predictions: fitted Profit for each observation used to fit the model
pred <- predict(final_model3)
pred_df <- data.frame(pred = pred)
print(pred_df)
## pred
## 1 190289.29
## 2 187938.71
## 3 180116.66
## 4 172369.00
## 5 170433.97
## 6 161694.20
## 7 164033.73
## 8 160345.47
## 9 152011.33
## 10 154396.82
## 11 136096.36
## 12 135036.09
## 13 129219.89
## 14 127621.20
## 15 151499.37
## 16 146869.43
## 17 115678.83
## 18 129897.69
## 19 127413.41
## 20 122860.50
## 21 114175.91
## 22 116000.35
## 23 112245.81
## 24 106725.36
## 25 114850.93
## 26 104275.40
## 27 113385.70
## 28 110633.80
## 29 105460.14
## 30 105079.09
## 31 101994.25
## 32 101261.18
## 33 103202.54
## 34 96440.90
## 35 88694.29
## 36 88342.28
## 37 73520.10
## 38 86681.48
## 39 66314.86
## 40 81973.10
## 41 73597.48
## 42 72861.58
## 43 69229.14
## 44 62279.31
## 45 67979.15
## 46 49887.39
## 47 50156.69
## 48 49032.90
## 49 49495.97
## 50 49032.90
# Plot of Profit vs R.D.Spend
plot(R.D.Spend, Profit, main = "Profit vs R.D.Spend" ,las=1)
lines(R.D.Spend, pred)

# Predict Profit for new R.D.Spend values with the final single-predictor model.
# NOTE: 200000 and 250000.25 exceed the largest observed R.D.Spend (165349),
# so those two estimates are extrapolations beyond the fitted range.
new_RD <- data.frame(R.D.Spend = c(200000, 250000.25, 145000.2, 98000))
pred_new <- predict(final_model3, newdata = new_RD)
# Put the inputs and their estimated profits side by side
pred_results <- cbind(new_RD, Estimated_Profit = pred_new)
print(pred_results)
## R.D.Spend Estimated_Profit
## 1 200000.0 219891.2
## 2 250000.2 262606.0
## 3 145000.2 172905.3
## 4 98000.0 132753.5