Multiple Linear Regression Class Assignment (Karthick Sothivelr)

# Read the data to analyze (CSV file).
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, whereas the rest of this analysis
# treats State as a factor (factor levels, scatterplot matrix).
startup <- read.csv(
  "C:\\Users\\samy_\\Desktop\\Data Science Class Materials\\50_Startups.csv",
  stringsAsFactors = TRUE
)
# NOTE(review): attach() is kept for any later statement that refers to a
# column by its bare name; prefer `data = startup` / `startup$col` in new code.
attach(startup)
# Understanding the Data
# Open the spreadsheet viewer only in interactive sessions; View() is of no
# use (and may fail) when the script is run with Rscript or knitted.
if (interactive()) {
  View(startup)
}
dim(startup) # Check the dimension of the data
## [1] 50  5
colnames(startup) # List the column names (four inputs plus the output, Profit)
## [1] "R.D.Spend"       "Administration"  "Marketing.Spend" "State"
## [5] "Profit"
class(startup) # Confirm the object is a data.frame
## [1] "data.frame"
str(startup) # Show each column's type and its first few values
## 'data.frame':    50 obs. of  5 variables:
##  $ R.D.Spend      : num  165349 162598 153442 144372 142107 ...
##  $ Administration : num  136898 151378 101146 118672 91392 ...
##  $ Marketing.Spend: num  471784 443899 407935 383200 366168 ...
##  $ State          : Factor w/ 3 levels "California","Florida",..: 3 1 2 3 2 3 1 2 3 1 ...
##  $ Profit         : num  192262 191792 191050 182902 166188 ...
### The data contains one nominal categorical variable, State (3 levels)
# Take the column from the data frame rather than relying on an attach()ed
# name, and coerce with as.factor() so the calls below also work when State
# was read as a character column (as.factor() is a no-op on a factor).
state <- as.factor(startup$State)
levels(state) # Check the levels of the categorical variable
## [1] "California" "Florida"    "New York"
summary(state) # Number of records for each level
## California    Florida   New York 
##         17         16         17
# Statistical Summary of the Data
summary(startup)
##    R.D.Spend      Administration   Marketing.Spend         State   
##  Min.   :     0   Min.   : 51283   Min.   :     0   California:17  
##  1st Qu.: 39936   1st Qu.:103731   1st Qu.:129300   Florida   :16  
##  Median : 73051   Median :122700   Median :212716   New York  :17  
##  Mean   : 73722   Mean   :121345   Mean   :211025                  
##  3rd Qu.:101603   3rd Qu.:144842   3rd Qu.:299469                  
##  Max.   :165349   Max.   :182646   Max.   :471784                  
##      Profit      
##  Min.   : 14681  
##  1st Qu.: 90139  
##  Median :107978  
##  Mean   :112013  
##  3rd Qu.:139766  
##  Max.   :192262
library(psych)
# describe() reports, per column: n, mean, sd, median, trimmed mean, mad,
# min, max, range, skew, kurtosis and standard error.
# NOTE(review): the "State*" row ranges from 1 to 3, i.e. it appears to be
# computed on numeric codes of the categorical column, so its mean/sd are not
# meaningful - confirm against the psych::describe documentation.
describe(startup)
##                 vars  n      mean        sd    median   trimmed       mad
## R.D.Spend          1 50  73721.62  45902.26  73051.08  72883.87  47053.05
## Administration     2 50 121344.64  28017.80 122699.79 123127.46  31868.81
## Marketing.Spend    3 50 211025.10 122290.31 212716.24 211200.78 128222.13
## State*             4 50      2.00      0.83      2.00      2.00      1.48
## Profit             5 50 112012.64  40306.18 107978.19 111727.66  39346.94
##                      min      max    range  skew kurtosis       se
## R.D.Spend           0.00 165349.2 165349.2  0.15    -0.89  6491.56
## Administration  51283.14 182645.6 131362.4 -0.46    -0.04  3962.32
## Marketing.Spend     0.00 471784.1 471784.1 -0.04    -0.81 17294.46
## State*              1.00      3.0      2.0  0.00    -1.59     0.12
## Profit          14681.40 192261.8 177580.4  0.02    -0.29  5700.15

~~~~

Multiple Regression Analysis (Y = Profit, X = R.D.Spend, Administration, Marketing.Spend, State)

~~~~

# Scatterplot Matrices
# The plots suggest a moderate positive correlation between R.D.Spend and
# Marketing.Spend.
pairs(startup)

# Correlation
# Correlation is only defined for numeric variables, so select the numeric
# columns by type instead of dropping State by its position (column 4).
# drop = FALSE keeps a data frame even if only one numeric column exists.
numeric_cols <- vapply(startup, is.numeric, logical(1))
cor(startup[, numeric_cols, drop = FALSE])
##                 R.D.Spend Administration Marketing.Spend    Profit
## R.D.Spend       1.0000000     0.24195525      0.72424813 0.9729005
## Administration  0.2419552     1.00000000     -0.03215388 0.2007166
## Marketing.Spend 0.7242481    -0.03215388      1.00000000 0.7477657
## Profit          0.9729005     0.20071657      0.74776572 1.0000000
# Correlation Coefficient between R.D.Spend and Marketing.Spend is 0.724 (Moderate +ve Correlation)
# Partial correlation matrix
# (cor2pcor() returns an unnamed matrix; rows/columns follow the same variable
# order as the correlation matrix it is given)
library(corpcor)
cor2pcor(cor(startup[, numeric_cols, drop = FALSE]))
##            [,1]        [,2]        [,3]        [,4]
## [1,] 1.00000000  0.20852619  0.03890336  0.93477127
## [2,] 0.20852619  1.00000000 -0.28192506 -0.07725021
## [3,] 0.03890336 -0.28192506  1.00000000  0.23707116
## [4,] 0.93477127 -0.07725021  0.23707116  1.00000000
# Multiple Regression using all the variables
m1 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State)
# Only R.D.Spend is significant at the 5% level; Administration (p = 0.608),
# Marketing.Spend (p = 0.123) and both State dummies (p > 0.95) are not.
summary(m1)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + 
##     State)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33504  -4736     90   6672  17338 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.013e+04  6.885e+03   7.281 4.44e-09 ***
## R.D.Spend        8.060e-01  4.641e-02  17.369  < 2e-16 ***
## Administration  -2.700e-02  5.223e-02  -0.517    0.608    
## Marketing.Spend  2.698e-02  1.714e-02   1.574    0.123    
## StateFlorida     1.988e+02  3.371e+03   0.059    0.953    
## StateNew York   -4.189e+01  3.256e+03  -0.013    0.990    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9439 on 44 degrees of freedom
## Multiple R-squared:  0.9508, Adjusted R-squared:  0.9452 
## F-statistic: 169.9 on 5 and 44 DF,  p-value: < 2.2e-16

Analyze Administration, Marketing.Spend and State without R.D.Spend, first one at a time and then in combination

## Administration
m_admin <- lm(Profit ~ Administration)
summary(m_admin) # Administration on its own is still insignificant (p = 0.162)
## 
## Call:
## lm(formula = Profit ~ Administration)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96072 -23426  -3564  25438  84870 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    7.697e+04  2.532e+04   3.040  0.00382 **
## Administration 2.887e-01  2.034e-01   1.419  0.16222   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39900 on 48 degrees of freedom
## Multiple R-squared:  0.04029,    Adjusted R-squared:  0.02029 
## F-statistic: 2.015 on 1 and 48 DF,  p-value: 0.1622
## Marketing.Spend
m_market <- lm(Profit ~ Marketing.Spend)
summary(m_market) # Marketing.Spend on its own becomes highly significant (p = 4.38e-10)
## 
## Call:
## lm(formula = Profit ~ Marketing.Spend)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -83739 -18802   4925  15879  64642 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.000e+04  7.685e+03   7.808 4.29e-10 ***
## Marketing.Spend 2.465e-01  3.159e-02   7.803 4.38e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27040 on 48 degrees of freedom
## Multiple R-squared:  0.5592, Adjusted R-squared:   0.55 
## F-statistic: 60.88 on 1 and 48 DF,  p-value: 4.381e-10
## State
m_state <- lm(Profit ~ State)
summary(m_state)  # Neither State dummy is significant (overall F-test p = 0.567)
## 
## Call:
## lm(formula = Profit ~ State)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -89224 -22673  -6835  26283  87887 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     103905       9862  10.536 5.77e-14 ***
## StateFlorida     14869      14163   1.050    0.299    
## StateNew York     9851      13946   0.706    0.483    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40660 on 47 degrees of freedom
## Multiple R-squared:  0.02388,    Adjusted R-squared:  -0.01766 
## F-statistic: 0.5748 on 2 and 47 DF,  p-value: 0.5667
## Administration + Marketing.Spend
m_am <- lm(Profit ~ Administration + Marketing.Spend)
# With R.D.Spend left out, both Administration (p = 0.017) and
# Marketing.Spend (p = 9.73e-11) become significant.
summary(m_am)
## 
## Call:
## lm(formula = Profit ~ Administration + Marketing.Spend)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -82155 -12168   2836  13650  56472 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2.022e+04  1.770e+04   1.143   0.2589    
## Administration  3.237e-01  1.312e-01   2.468   0.0173 *  
## Marketing.Spend 2.488e-01  3.005e-02   8.281 9.73e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25710 on 47 degrees of freedom
## Multiple R-squared:  0.6097, Adjusted R-squared:  0.5931 
## F-statistic: 36.71 on 2 and 47 DF,  p-value: 2.496e-10
## Administration + State
m_as <- lm(Profit ~ Administration + State)
summary(m_as) # Parameters for Administration and State are both insignificant
## 
## Call:
## lm(formula = Profit ~ Administration + State)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -88150 -24212  -6167  25357  79147 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    6.945e+04  2.665e+04   2.607   0.0123 *
## Administration 2.853e-01  2.053e-01   1.390   0.1713  
## StateFlorida   1.458e+04  1.403e+04   1.039   0.3041  
## StateNew York  9.624e+03  1.381e+04   0.697   0.4894  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40260 on 46 degrees of freedom
## Multiple R-squared:  0.0632, Adjusted R-squared:  0.002109 
## F-statistic: 1.035 on 3 and 46 DF,  p-value: 0.3862
## Marketing.Spend + State
m_ms <- lm(Profit ~ Marketing.Spend + State)
summary(m_ms)  # Only the parameter for Marketing.Spend is significant
## 
## Call:
## lm(formula = Profit ~ Marketing.Spend + State)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -81629 -17491   2486  16278  65811 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.864e+04  8.984e+03   6.527 4.67e-08 ***
## Marketing.Spend  2.480e-01  3.295e-02   7.525 1.49e-09 ***
## StateFlorida    -1.195e+03  9.819e+03  -0.122    0.904    
## StateNew York    4.197e+03  9.468e+03   0.443    0.660    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27520 on 46 degrees of freedom
## Multiple R-squared:  0.5625, Adjusted R-squared:  0.534 
## F-statistic: 19.71 on 3 and 46 DF,  p-value: 2.317e-08
## Administration + Marketing.Spend + State
m_ams <- lm(Profit ~ Administration + Marketing.Spend + State)
# Administration (p = 0.019) and Marketing.Spend are significant; the State
# dummies are not.
summary(m_ams)
## 
## Call:
## lm(formula = Profit ~ Administration + Marketing.Spend + State)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -79838 -12006   1497  12027  57394 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.903e+04  1.843e+04   1.033   0.3072    
## Administration   3.239e-01  1.335e-01   2.426   0.0193 *  
## Marketing.Spend  2.507e-01  3.135e-02   7.997 3.48e-10 ***
## StateFlorida    -1.704e+03  9.338e+03  -0.182   0.8561    
## StateNew York    3.876e+03  9.003e+03   0.431   0.6689    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26160 on 45 degrees of freedom
## Multiple R-squared:  0.6131, Adjusted R-squared:  0.5787 
## F-statistic: 17.83 on 4 and 45 DF,  p-value: 7.778e-09
## Administration and Marketing.Spend show no collinearity problem with each
## other (both stay significant when they are fitted together).
## The input variable State is not significant in any of the models above, so
## it explains essentially none of the variation in Profit.
# Deletion Diagnostics for Identifying Influential Observations
# influence.measures() marks 3 observations with "*" in the inf column:
# 7, 47 and 50.
influence.measures(m1)
## Influence measures of
##   lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend +      State) :
## 
##      dfb.1_  dfb.R.D. dfb.Admn  dfb.Mr.S  dfb.SttF  dfb.StNY    dffit
## 1   0.00263 -0.000680 -0.00108 -0.002943  0.001137 -0.002293 -0.00671
## 2  -0.04961  0.020841  0.04627  0.058280 -0.082437 -0.070955  0.15256
## 3   0.08011  0.197811 -0.17477 -0.013702  0.153248 -0.028648  0.40914
## 4  -0.03595  0.110000 -0.04970  0.073783 -0.047849  0.163879  0.37650
## 5  -0.10434 -0.153851  0.16342  0.047737 -0.115870  0.017478 -0.28836
## 6  -0.04629 -0.081405  0.10566 -0.033055  0.027220 -0.115816 -0.26364
## 7  -0.02130 -0.095187  0.00468  0.082074  0.037236  0.050562 -0.12099
## 8   0.05670 -0.045130 -0.04118  0.001538 -0.079214  0.007763 -0.15671
## 9  -0.01048  0.000992  0.00845  0.005091 -0.002030  0.010515  0.02068
## 10 -0.07333 -0.087676  0.06482  0.009105  0.113458  0.114051 -0.19312
## 11  0.11698  0.186811 -0.13740 -0.159583  0.227283 -0.009802  0.36595
## 12  0.22674  0.152695 -0.19750 -0.063320 -0.180498 -0.191722  0.34058
## 13 -0.01749  0.069420  0.01009 -0.048411  0.246939 -0.005732  0.36117
## 14 -0.00754  0.005568  0.05424  0.040526 -0.144106 -0.139683  0.20968
## 15  0.26742 -0.221204 -0.25724  0.142195 -0.359607  0.022591 -0.67871
## 16 -0.00258 -0.208289  0.06663  0.071114  0.021586 -0.300602 -0.52394
## 17  0.03723 -0.055403  0.02986  0.116725 -0.217604 -0.202959  0.30836
## 18  0.08149  0.035255 -0.07538 -0.061166  0.013546 -0.089993 -0.15985
## 19 -0.00866 -0.003027  0.01978 -0.017913 -0.089991  0.002396 -0.14249
## 20  0.00949  0.252210  0.03934 -0.342025  0.056267  0.163205  0.44287
## 21  0.00920 -0.018937  0.00127  0.034093 -0.043026 -0.038229  0.06425
## 22  0.16246  0.134008 -0.16322 -0.156986  0.024662 -0.123303 -0.26328
## 23  0.02844  0.057093 -0.02567 -0.065599 -0.080087 -0.001184 -0.14418
## 24 -0.00300  0.017345  0.00492 -0.019954 -0.025200 -0.000520 -0.04652
## 25 -0.08800 -0.077455  0.08926  0.087987 -0.013268 -0.093171 -0.17298
## 26  0.01269  0.012139  0.04637 -0.028528 -0.091010 -0.099577  0.15683
## 27  0.01876 -0.047078 -0.04093  0.086363 -0.120147 -0.002104 -0.18499
## 28  0.16992  0.271462 -0.14611 -0.339679  0.054174 -0.198665 -0.44228
## 29 -0.04142 -0.004892  0.05451 -0.014043  0.037581  0.001794  0.07940
## 30  0.00788 -0.001210 -0.01152  0.007045 -0.001871 -0.014675 -0.02593
## 31  0.00452  0.004911 -0.00318 -0.008237  0.008083  0.000222  0.01251
## 32  0.00132 -0.000298 -0.00210  0.001610 -0.000431 -0.002779 -0.00500
## 33 -0.02577 -0.029851  0.00520  0.042128  0.023004  0.031218 -0.06200
## 34 -0.01226  0.005720  0.01076 -0.000214 -0.026781 -0.001052 -0.03977
## 35 -0.13840 -0.196836  0.24231  0.173390 -0.173841 -0.153489  0.35397
## 36  0.09973 -0.049889 -0.10222  0.034443  0.001307  0.126044  0.22205
## 37 -0.10755 -0.379353  0.18952  0.218405  0.334034  0.033735  0.61014
## 38  0.01840  0.001322 -0.01609 -0.000850 -0.007621 -0.008050  0.02054
## 39  0.32020 -0.189819 -0.31345  0.109261  0.010955  0.273511  0.58408
## 40 -0.07434  0.011576  0.05092 -0.005414  0.049682  0.051171 -0.09721
## 41  0.02107 -0.063137  0.02014  0.044268 -0.066547 -0.063116  0.11560
## 42  0.05957 -0.022659 -0.05101 -0.006525  0.066841  0.005162  0.11358
## 43  0.01852 -0.010791 -0.00820  0.004902 -0.016530 -0.016757  0.03122
## 44  0.02429 -0.090858  0.05891 -0.085052  0.047181  0.220181  0.37544
## 45 -0.00188 -0.005257  0.01339 -0.004732 -0.009633 -0.011309  0.02616
## 46  0.09538 -0.212843  0.09139 -0.189969  0.108210  0.428337  0.77900
## 47  0.10683  0.434369 -0.14265 -0.364064 -0.144471 -0.025892 -0.50219
## 48 -0.02930  0.046774 -0.04694  0.031389  0.056945  0.068742 -0.16247
## 49 -0.78383 -0.112734  0.70160  0.418630 -0.124090 -0.373999 -0.98871
## 50 -0.56603  0.578956 -0.11423  0.080954  0.626360  0.703325 -1.50721
##    cov.r   cook.d    hat inf
## 1  1.384 7.68e-06 0.1705    
## 2  1.395 3.96e-03 0.1884    
## 3  1.165 2.79e-02 0.1424    
## 4  1.113 2.36e-02 0.1140    
## 5  1.259 1.40e-02 0.1471    
## 6  1.207 1.17e-02 0.1163    
## 7  1.445 2.49e-03 0.2119   *
## 8  1.226 4.17e-03 0.0942    
## 9  1.269 7.29e-05 0.0958    
## 10 1.227 6.31e-03 0.1052    
## 11 1.046 2.21e-02 0.0897    
## 12 1.127 1.94e-02 0.1080    
## 13 0.954 2.13e-02 0.0657    
## 14 1.144 7.40e-03 0.0721    
## 15 0.761 7.20e-02 0.1055    
## 16 0.790 4.34e-02 0.0754    
## 17 1.040 1.58e-02 0.0708    
## 18 1.211 4.33e-03 0.0868    
## 19 1.183 3.44e-03 0.0671    
## 20 1.316 3.29e-02 0.2118    
## 21 1.249 7.03e-04 0.0866    
## 22 1.248 1.17e-02 0.1350    
## 23 1.206 3.53e-03 0.0795    
## 24 1.251 3.69e-04 0.0854    
## 25 1.225 5.07e-03 0.0980    
## 26 1.181 4.16e-03 0.0709    
## 27 1.221 5.79e-03 0.0993    
## 28 1.151 3.25e-02 0.1471    
## 29 1.395 1.07e-03 0.1802    
## 30 1.271 1.15e-04 0.0973    
## 31 1.300 2.67e-05 0.1169    
## 32 1.279 4.26e-06 0.1026    
## 33 1.289 6.55e-04 0.1131    
## 34 1.238 2.70e-04 0.0750    
## 35 1.197 2.10e-02 0.1389    
## 36 1.196 8.32e-03 0.0985    
## 37 0.850 5.92e-02 0.1079    
## 38 1.408 7.19e-05 0.1851    
## 39 1.060 5.58e-02 0.1570    
## 40 1.257 1.61e-03 0.0975    
## 41 1.228 2.27e-03 0.0842    
## 42 1.278 2.20e-03 0.1140    
## 43 1.256 1.66e-04 0.0870    
## 44 1.093 2.34e-02 0.1069    
## 45 1.312 1.17e-04 0.1259    
## 46 0.758 9.44e-02 0.1277    
## 47 1.419 4.23e-02 0.2654   *
## 48 1.272 4.48e-03 0.1211    
## 49 1.051 1.56e-01 0.2559    
## 50 0.128 2.64e-01 0.1015   *
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Index plots of the influence diagnostics for m1; observations 50, 49 and 46
# stand out.
influenceIndexPlot(m1)

# influencePlot() labels observations 46, 47, 49 and 50 and returns their
# studentized residual, hat value and Cook's distance.
influencePlot(m1)

##       StudRes       Hat      CookD
## 46  2.0357210 0.1277290 0.09439478
## 47 -0.8354542 0.2654200 0.04232333
## 49 -1.6860294 0.2558868 0.15637613
## 50 -4.4845939 0.1014896 0.26395944
# Regression Analysis after removing the 50th Observation
m2 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State, data = startup[-50,])
# Only the parameter for R.D.Spend is significant at the 5% level
# (Marketing.Spend: p = 0.078); adjusted R-squared rises to 0.9574.
summary(m2)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + 
##     State, data = startup[-50, ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -16374  -4572  -1225   5304  15402 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.338e+04  5.794e+03   9.212 9.79e-12 ***
## R.D.Spend        7.836e-01  3.907e-02  20.056  < 2e-16 ***
## Administration  -2.202e-02  4.363e-02  -0.505   0.6163    
## Marketing.Spend  2.582e-02  1.432e-02   1.804   0.0783 .  
## StateFlorida    -1.564e+03  2.842e+03  -0.550   0.5849    
## StateNew York   -1.954e+03  2.752e+03  -0.710   0.4815    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7881 on 43 degrees of freedom
## Multiple R-squared:  0.9618, Adjusted R-squared:  0.9574 
## F-statistic: 216.6 on 5 and 43 DF,  p-value: < 2.2e-16
# Regression Analysis after removing the 50th and 49th Observations
m3 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State, data = startup[-c(50,49),])
# Only the parameter for R.D.Spend is significant; adjusted R-squared = 0.9584.
summary(m3)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + 
##     State, data = startup[-c(50, 49), ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -16207  -4834  -1653   5728  14153 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.939e+04  6.094e+03   9.746  2.4e-12 ***
## R.D.Spend        7.888e-01  3.728e-02  21.160  < 2e-16 ***
## Administration  -6.214e-02  4.499e-02  -1.381    0.174    
## Marketing.Spend  1.791e-02  1.405e-02   1.275    0.209    
## StateFlorida    -1.150e+03  2.713e+03  -0.424    0.674    
## StateNew York   -6.658e+02  2.679e+03  -0.249    0.805    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7506 on 42 degrees of freedom
## Multiple R-squared:  0.9628, Adjusted R-squared:  0.9584 
## F-statistic: 217.6 on 5 and 42 DF,  p-value: < 2.2e-16
# Regression Analysis after removing the 50th, 49th, and 46th observations
m4 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State, data = startup[-c(50, 49, 46),])
# Only the parameter for R.D.Spend is significant; adjusted R-squared = 0.9602.
summary(m4)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + 
##     State, data = startup[-c(50, 49, 46), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15636.4  -4569.8   -975.6   5112.2  14684.7 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.812e+04  5.935e+03   9.793  2.7e-12 ***
## R.D.Spend        7.961e-01  3.628e-02  21.942  < 2e-16 ***
## Administration  -6.116e-02  4.356e-02  -1.404    0.168    
## Marketing.Spend  2.129e-02  1.371e-02   1.553    0.128    
## StateFlorida    -1.429e+03  2.630e+03  -0.543    0.590    
## StateNew York   -1.812e+03  2.659e+03  -0.682    0.499    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7267 on 41 degrees of freedom
## Multiple R-squared:  0.9645, Adjusted R-squared:  0.9602 
## F-statistic:   223 on 5 and 41 DF,  p-value: < 2.2e-16
# Regression Analysis after removing the 50th, 49th, 47th, and 46th observations
m5 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State, data = startup[-c(50, 49, 46, 47),])
# Only the parameter for R.D.Spend is significant at the 5% level
# (Marketing.Spend: p = 0.063); adjusted R-squared = 0.9584.
summary(m5)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + 
##     State, data = startup[-c(50, 49, 46, 47), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15580.6  -5247.7   -627.4   5388.7  13120.1 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.726e+04  5.944e+03   9.634  5.6e-12 ***
## R.D.Spend        7.730e-01  4.077e-02  18.962  < 2e-16 ***
## Administration  -5.204e-02  4.395e-02  -1.184   0.2435    
## Marketing.Spend  2.831e-02  1.481e-02   1.911   0.0631 .  
## StateFlorida    -9.124e+02  2.649e+03  -0.344   0.7323    
## StateNew York   -1.745e+03  2.644e+03  -0.660   0.5131    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7225 on 40 degrees of freedom
## Multiple R-squared:  0.963,  Adjusted R-squared:  0.9584 
## F-statistic: 208.2 on 5 and 40 DF,  p-value: < 2.2e-16
# Variance Inflation Factor to check the collinearity between variables
# Rule of thumb: a VIF above 10 signals problematic collinearity. Every GVIF
# reported for m1 is below 2.5, so collinearity is not a concern here.
vif(m1)
##                     GVIF Df GVIF^(1/(2*Df))
## R.D.Spend       2.495511  1        1.579719
## Administration  1.177766  1        1.085249
## Marketing.Spend 2.416797  1        1.554605
## State           1.062673  2        1.015313
# Added-variable plots: the relationship between each input variable and the
# output variable after adjusting for the other inputs in the model.
avPlots(m1) # State shows no relationship with the output variable

### The analyses above indicate that the variable 'State' should be dropped

~~~~

Final Model

~~~~

# Model without State
final_model1 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend)
# Parameters for Administration (p = 0.602) and Marketing.Spend (p = 0.105)
# are still insignificant.
summary(final_model1)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33534  -4795     63   6606  17275 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.012e+04  6.572e+03   7.626 1.06e-09 ***
## R.D.Spend        8.057e-01  4.515e-02  17.846  < 2e-16 ***
## Administration  -2.682e-02  5.103e-02  -0.526    0.602    
## Marketing.Spend  2.723e-02  1.645e-02   1.655    0.105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared:  0.9507, Adjusted R-squared:  0.9475 
## F-statistic:   296 on 3 and 46 DF,  p-value: < 2.2e-16
# Model without State and Administration
final_model2 <- lm(Profit ~ R.D.Spend + Marketing.Spend)
# Marketing.Spend is marginally insignificant at the 5% level (p = 0.06).
summary(final_model2)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33645  -4632   -414   6484  17097 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.698e+04  2.690e+03  17.464   <2e-16 ***
## R.D.Spend       7.966e-01  4.135e-02  19.266   <2e-16 ***
## Marketing.Spend 2.991e-02  1.552e-02   1.927     0.06 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9161 on 47 degrees of freedom
## Multiple R-squared:  0.9505, Adjusted R-squared:  0.9483 
## F-statistic: 450.8 on 2 and 47 DF,  p-value: < 2.2e-16
# Model without State, Administration and Marketing.Spend
final_model3 <- lm(Profit ~ R.D.Spend)
# R.D.Spend alone gives an adjusted R-squared of 0.9454, against 0.9483 with
# Marketing.Spend included, so very little explanatory power is lost.
summary(final_model3)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -34351  -4626   -375   6249  17188 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.903e+04  2.538e+03   19.32   <2e-16 ***
## R.D.Spend   8.543e-01  2.931e-02   29.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9416 on 48 degrees of freedom
## Multiple R-squared:  0.9465, Adjusted R-squared:  0.9454 
## F-statistic: 849.8 on 1 and 48 DF,  p-value: < 2.2e-16
# Display the fit info (evaluate the model's LINE assumptions: Linearity,
# Independence, Normality, Equal variance)
# Use a 2 x 2 layout so the diagnostic plots drawn by plot() share one page.
par(mfrow = c(2,2))
plot(final_model3)

# Residuals of the Final Model (observed Profit minus fitted Profit, per row)
residuals(final_model3)
##            1            2            3            4            5 
##   1972.53611   3853.34881  10933.73292  10532.98679  -4246.03345 
##            6            7            8            9           10 
##  -4703.07684  -7911.21501  -4592.86725    200.43619  -4636.86286 
##           11           12           13           14           15 
##  10025.58603   9223.31414  12365.62919   6686.14589 -18896.72408 
##           16           17           18           19           20 
## -16952.39093  11314.10417  -4527.32413  -3146.51482    -83.64313 
##           21           22           23           24           25 
##   4298.11626  -4687.32693  -1893.56325   2008.63322  -6298.89207 
##           26           27           28           29           30 
##   3128.93710  -7652.16276  -5625.48960  -2177.76271  -4074.45459 
##           31           32           33           34           35 
##  -2056.65845  -3777.62103  -5774.70108    338.01823   8018.50987 
##           36           37           38           39           40 
##   8137.23063  17188.08803   3267.66286  14914.19668   -967.34151 
##           41           42           43           44           45 
##   4642.43486   4937.24999   2269.34836   7479.66952  -2778.82105 
##           46           47           48           49           50 
##  15038.69300   -665.93527  -6473.16914 -13822.55778 -34351.49914
# QQ plot of studentized residuals helps in identifying outliers
# Reset the plotting layout to a single panel first.
par(mfrow = c(1,1))
qqPlot(final_model3) # Prints the indices of the most extreme points (15 and 50)

## [1] 15 50
# Confidence Interval for parameters
# The 95% interval for the R.D.Spend slope (about 0.795 to 0.913) excludes 0.
confint(final_model3, level = 0.95)
##                    2.5 %       97.5 %
## (Intercept) 4.393012e+04 5.413568e+04
## R.D.Spend   7.953685e-01 9.132142e-01
# Model Predictions
# In-sample predicted Profit for every row the final model was fitted on,
# wrapped in a one-column data frame for display.
pred <- predict(object = final_model3)
pred_df <- data.frame(pred = pred)
print(pred_df)
##         pred
## 1  190289.29
## 2  187938.71
## 3  180116.66
## 4  172369.00
## 5  170433.97
## 6  161694.20
## 7  164033.73
## 8  160345.47
## 9  152011.33
## 10 154396.82
## 11 136096.36
## 12 135036.09
## 13 129219.89
## 14 127621.20
## 15 151499.37
## 16 146869.43
## 17 115678.83
## 18 129897.69
## 19 127413.41
## 20 122860.50
## 21 114175.91
## 22 116000.35
## 23 112245.81
## 24 106725.36
## 25 114850.93
## 26 104275.40
## 27 113385.70
## 28 110633.80
## 29 105460.14
## 30 105079.09
## 31 101994.25
## 32 101261.18
## 33 103202.54
## 34  96440.90
## 35  88694.29
## 36  88342.28
## 37  73520.10
## 38  86681.48
## 39  66314.86
## 40  81973.10
## 41  73597.48
## 42  72861.58
## 43  69229.14
## 44  62279.31
## 45  67979.15
## 46  49887.39
## 47  50156.69
## 48  49032.90
## 49  49495.97
## 50  49032.90
# Plot of Profit vs R.D.Spend
# Columns are taken from the data frame explicitly (no reliance on attach());
# xlab/ylab keep the axis labels the bare column names used to produce.
plot(startup$R.D.Spend, startup$Profit,
     main = "Profit vs R.D.Spend", xlab = "R.D.Spend", ylab = "Profit",
     las = 1)
# The rows are not sorted by R.D.Spend, so order the points before joining
# them; otherwise lines() doubles back on itself while drawing the fitted line.
spend_order <- order(startup$R.D.Spend)
lines(startup$R.D.Spend[spend_order], pred[spend_order])

# Predicting new values
# NOTE(review): 200000 and 250000.25 lie above the largest R.D.Spend in the
# data (165349), so those two estimates are extrapolations.
new_RD <- data.frame(R.D.Spend = c(200000, 250000.25, 145000.2, 98000))
pred_new <- predict(final_model3, newdata = new_RD)

# Put the inputs and their estimated profits side by side.
pred_results <- data.frame(new_RD, Estimated_Profit = pred_new)
print(pred_results)
##   R.D.Spend Estimated_Profit
## 1  200000.0         219891.2
## 2  250000.2         262606.0
## 3  145000.2         172905.3
## 4   98000.0         132753.5