# Multiple Linear Regression

# Predict Profit of Startups

library(e1071)

# Location of the 50_Startups data set. The historical absolute path stays the
# default; set the STARTUPS_CSV environment variable to read the file from
# another location without editing this script.
startups_csv <- Sys.getenv(
  "STARTUPS_CSV",
  unset = "C:\\data science\\ds\\assignmentsbasicstatsandlinearregression\\50_Startups.csv"
)
Data <- read.csv(startups_csv)

# View() opens a data viewer window, which is only meaningful (and only
# reliably available) in an interactive session.
if (interactive()) {
  View(Data)
}

# The fourth column is dropped by position and the remaining four columns are
# renamed, so the file must have exactly five columns. Without this check a
# different layout would be silently mislabelled by colnames<-.
stopifnot(ncol(Data) == 5L)
Data <- Data[, -4]
colnames(Data) <- c("RDSpend", "Administration", "Marketing", "Profit")

# NOTE(review): attach() is kept only because later statements in this script
# refer to the columns by bare name (e.g. sd(RDSpend)). Prefer `data = Data` /
# `Data$col` in new code and remove this call once nothing depends on it.
attach(Data)
# ---- Descriptive statistics: the four "moment" business decisions ----

# First moment (location): quartiles, median and mean of every column.
summary(Data)
##     RDSpend       Administration     Marketing          Profit      
##  Min.   :     0   Min.   : 51283   Min.   :     0   Min.   : 14681  
##  1st Qu.: 39936   1st Qu.:103731   1st Qu.:129300   1st Qu.: 90139  
##  Median : 73051   Median :122700   Median :212716   Median :107978  
##  Mean   : 73722   Mean   :121345   Mean   :211025   Mean   :112013  
##  3rd Qu.:101603   3rd Qu.:144842   3rd Qu.:299469   3rd Qu.:139766  
##  Max.   :165349   Max.   :182646   Max.   :471784   Max.   :192262

# Second moment (spread): standard deviation of each column ...
sd(Data$RDSpend)
## [1] 45902.26
sd(Data$Administration)
## [1] 28017.8
sd(Data$Marketing)
## [1] 122290.3
sd(Data$Profit)
## [1] 40306.18
# ... followed by the variance of each column.
var(Data$RDSpend)
## [1] 2107017150
var(Data$Administration)
## [1] 784997271
var(Data$Marketing)
## [1] 14954920097
var(Data$Profit)
## [1] 1624588173

# Third moment (asymmetry): skewness() from e1071. Every value below is
# between -0.5 and 0.5.
skewness(Data$RDSpend)
## [1] 0.1542932
skewness(Data$Administration)
## [1] -0.4600745
skewness(Data$Marketing)
## [1] -0.04372111
skewness(Data$Profit)
## [1] 0.02191219

# Fourth moment (tail weight): kurtosis() from e1071, which reports excess
# kurtosis (0 for a normal distribution). Every value below is negative.
kurtosis(Data$RDSpend)
## [1] -0.891987
kurtosis(Data$Administration)
## [1] -0.03664891
kurtosis(Data$Marketing)
## [1] -0.814161
kurtosis(Data$Profit)
## [1] -0.2871546
# Scatter plot of Profit against each predictor in turn. with() resolves the
# column names inside Data while keeping the plain column names as axis labels.
with(Data, plot(RDSpend, Profit))

with(Data, plot(Administration, Profit))

with(Data, plot(Marketing, Profit))

# Scatter-plot matrix of every pair of columns (inputs and output together).
pairs(Data)

# Pairwise correlation matrix: strength and direction of each linear
# association. RDSpend has the strongest correlation with Profit (0.97).
cor(Data)
##                  RDSpend Administration   Marketing    Profit
## RDSpend        1.0000000     0.24195525  0.72424813 0.9729005
## Administration 0.2419552     1.00000000 -0.03215388 0.2007166
## Marketing      0.7242481    -0.03215388  1.00000000 0.7477657
## Profit         0.9729005     0.20071657  0.74776572 1.0000000

# Partial correlations: the association between each pair of variables after
# accounting for the remaining ones. cor2pcor() returns the matrix without
# dimnames; rows/columns are in the same order as in cor(Data) above.
library(corpcor)
cor2pcor(cor(Data))
##            [,1]        [,2]        [,3]        [,4]
## [1,] 1.00000000  0.20852619  0.03890336  0.93477127
## [2,] 0.20852619  1.00000000 -0.28192506 -0.07725021
## [3,] 0.03890336 -0.28192506  1.00000000  0.23707116
## [4,] 0.93477127 -0.07725021  0.23707116  1.00000000
# The linear model of interest: Profit regressed on all three predictors
# (RDSpend, Administration, Marketing), fitted on every row of Data.
model <- lm(Profit~ RDSpend + Administration + Marketing, data = Data)

# Coefficient table and fit statistics. In the output below only RDSpend is
# significant at the 5% level; Administration (p = 0.602) and Marketing
# (p = 0.105) are not, while the model as a whole has R-squared 0.9507.
summary(model)
## 
## Call:
## lm(formula = Profit ~ RDSpend + Administration + Marketing, data = Data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33534  -4795     63   6606  17275 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     5.012e+04  6.572e+03   7.626 1.06e-09 ***
## RDSpend         8.057e-01  4.515e-02  17.846  < 2e-16 ***
## Administration -2.682e-02  5.103e-02  -0.526    0.602    
## Marketing       2.723e-02  1.645e-02   1.655    0.105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared:  0.9507, Adjusted R-squared:  0.9475 
## F-statistic:   296 on 3 and 46 DF,  p-value: < 2.2e-16
# Prediction based on Administration only: simple regression of Profit on
# Administration. `data = Data` is passed explicitly (as for `model`) so the
# fit does not depend on the columns having been attach()ed to the search path.
# In the output below Administration on its own is not significant (p = 0.162)
# and explains about 4% of the variance in Profit.
model1.startups <- lm(Profit ~ Administration, data = Data)
summary(model1.startups)
## 
## Call:
## lm(formula = Profit ~ Administration, data = Data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96072 -23426  -3564  25438  84870 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    7.697e+04  2.532e+04   3.040  0.00382 **
## Administration 2.887e-01  2.034e-01   1.419  0.16222   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39900 on 48 degrees of freedom
## Multiple R-squared:  0.04029,    Adjusted R-squared:  0.02029 
## F-statistic: 2.015 on 1 and 48 DF,  p-value: 0.1622
# Prediction based on Marketing only: simple regression of Profit on Marketing.
# `data = Data` is passed explicitly (as for `model`) so the fit does not
# depend on the columns having been attach()ed to the search path.
# In the output below Marketing on its own is highly significant and explains
# about 56% of the variance in Profit.
model2.startups <- lm(Profit ~ Marketing, data = Data)
summary(model2.startups)
## 
## Call:
## lm(formula = Profit ~ Marketing, data = Data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -83739 -18802   4925  15879  64642 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.000e+04  7.685e+03   7.808 4.29e-10 ***
## Marketing   2.465e-01  3.159e-02   7.803 4.38e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27040 on 48 degrees of freedom
## Multiple R-squared:  0.5592, Adjusted R-squared:   0.55 
## F-statistic: 60.88 on 1 and 48 DF,  p-value: 4.381e-10

# Influence diagnostics for the Marketing-only model (model2.startups).
# Rows marked `*` in the `inf` column are the ones influence.measures() flags
# as influential: 1, 7, 20, 46 and 47 in the output below.
# NOTE(review): these diagnostics describe the Marketing-only fit; to screen
# records for the three-predictor model, run them on `model` instead.
influence.measures(model2.startups)
## Influence measures of
##   lm(formula = Profit ~ Marketing, data = Data) :
## 
##       dfb.1_  dfb.Mrkt    dffit cov.r   cook.d    hat inf
## 1  -0.128325  0.201650  0.22232 1.156 0.025032 0.1128   *
## 2  -0.150928  0.247947  0.27945 1.115 0.039248 0.0940    
## 3  -0.157806  0.281062  0.32993 1.062 0.053995 0.0729    
## 4  -0.116810  0.225714  0.27593 1.056 0.037923 0.0605    
## 5  -0.053680  0.112042  0.14212 1.084 0.010235 0.0528    
## 6  -0.024360  0.051759  0.06620 1.096 0.002234 0.0515    
## 7   0.402810 -0.253257  0.44675 0.827 0.089411 0.0295   *
## 8  -0.026728  0.080123  0.11751 1.067 0.006998 0.0374    
## 9  -0.018478  0.068819  0.10769 1.064 0.005880 0.0338    
## 10 -0.013741  0.060739  0.09906 1.064 0.004980 0.0320    
## 11  0.058328  0.023766  0.16041 1.011 0.012804 0.0204    
## 12  0.026657  0.038721  0.12711 1.035 0.008126 0.0220    
## 13  0.023378  0.034147  0.11184 1.042 0.006313 0.0221    
## 14  0.012719  0.021955  0.06750 1.058 0.002317 0.0224    
## 15  0.008540  0.018689  0.05313 1.062 0.001438 0.0228    
## 16  0.003834  0.011995  0.03103 1.066 0.000491 0.0235    
## 17  0.001127  0.004294  0.01065 1.068 0.000058 0.0239    
## 18  0.000342 -0.013446 -0.02643 1.071 0.000356 0.0270    
## 19  0.004654 -0.031161 -0.05471 1.070 0.001525 0.0296    
## 20  0.758056 -0.657538  0.75806 0.874 0.257594 0.0808   *
## 21  0.010567 -0.058699 -0.10010 1.061 0.005082 0.0305    
## 22  0.016753 -0.088955 -0.15050 1.044 0.011392 0.0307    
## 23  0.021540 -0.100330 -0.16549 1.040 0.013739 0.0316    
## 24  0.024807 -0.110363 -0.18026 1.034 0.016252 0.0320    
## 25  0.074319 -0.043146  0.08578 1.060 0.003736 0.0268    
## 26  0.072986 -0.043138  0.08349 1.061 0.003540 0.0273    
## 27  0.071075 -0.043075  0.08028 1.063 0.003276 0.0281    
## 28  0.122288 -0.275649 -0.36206 0.982 0.063398 0.0476    
## 29  0.088294 -0.058241  0.09568 1.065 0.004647 0.0318    
## 30  0.097500 -0.067368  0.10345 1.067 0.005430 0.0347    
## 31  0.128333 -0.093684  0.13314 1.066 0.008969 0.0396    
## 32  0.117378 -0.086439  0.12138 1.071 0.007468 0.0406    
## 33  0.242791 -0.196970  0.24433 1.061 0.029858 0.0571    
## 34 -0.040322 -0.002549 -0.08551 1.048 0.003706 0.0200    
## 35 -0.040327  0.000152 -0.08078 1.050 0.003309 0.0200    
## 36 -0.040327  0.003416 -0.07516 1.052 0.002868 0.0200    
## 37 -0.056945  0.008189 -0.10050 1.043 0.005104 0.0201    
## 38 -0.059089  0.011425 -0.09949 1.043 0.005003 0.0203    
## 39 -0.088888  0.027726 -0.13322 1.029 0.008905 0.0209    
## 40 -0.089113  0.035090 -0.12303 1.036 0.007618 0.0218    
## 41 -0.100289  0.041049 -0.13632 1.030 0.009325 0.0220    
## 42 -0.100883  0.046675 -0.13004 1.036 0.008505 0.0230    
## 43 -0.127085  0.069702 -0.15095 1.032 0.011423 0.0254    
## 44  0.009662 -0.007980  0.00969 1.112 0.000048 0.0620    
## 45 -0.017877  0.014933 -0.01791 1.116 0.000164 0.0655    
## 46  0.050001 -0.043273  0.05000 1.132 0.001276 0.0797   *
## 47  0.059785 -0.356615 -0.61535 0.678 0.153522 0.0301   *
## 48 -0.198306  0.172010 -0.19831 1.113 0.019892 0.0808    
## 49 -0.277845  0.241003 -0.27784 1.093 0.038697 0.0808    
## 50 -0.549841  0.446798 -0.55317 0.904 0.141194 0.0575
library(car)
## Loading required package: carData
# Index plots of the influence measures for the Marketing-only model.
influenceIndexPlot(model2.startups)

# Plot of studentized residuals against hat values; the rows it labels are
# printed below with their studentized residual, hat value and Cook's distance.
influencePlot(model2.startups)

##       StudRes        Hat      CookD
## 1   0.6235364 0.11278942 0.02503230
## 2   0.8675434 0.09400477 0.03924825
## 7   2.5637064 0.02947101 0.08941114
## 20  2.5573415 0.08076983 0.25759372
## 47 -3.4922130 0.03011390 0.15352176
# Delete influential records: refit the full three-predictor model with rows
# 20 and 47 of Data removed.
# NOTE(review): rows 20 and 47 are the records with the largest Cook's distance
# in the influencePlot() output for the Marketing-only model (model2.startups),
# not for this three-predictor model -- confirm these are the rows intended.
# NOTE(review): the object name is spelled `starups` (sic); it is left as-is so
# that any other reference to it keeps working.
# In the output below Marketing becomes significant at the 5% level
# (p = 0.0418), Administration stays non-significant, and R-squared is 0.95.
model3.starups <- lm(Profit ~ RDSpend + Administration + Marketing, data = Data[-c(20, 47),])
summary(model3.starups)
## 
## Call:
## lm(formula = Profit ~ RDSpend + Administration + Marketing, data = Data[-c(20, 
##     47), ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33882  -4086    493   5921  17553 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.900e+04  6.655e+03   7.363 3.37e-09 ***
## RDSpend         7.676e-01  5.434e-02  14.126  < 2e-16 ***
## Administration -1.995e-02  5.198e-02  -0.384   0.7030    
## Marketing       4.193e-02  2.000e-02   2.097   0.0418 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9262 on 44 degrees of freedom
## Multiple R-squared:   0.95,  Adjusted R-squared:  0.9466 
## F-statistic: 278.8 on 3 and 44 DF,  p-value: < 2.2e-16
# Variance inflation factors for the full model (`model`), used to check
# collinearity between the predictors. All three values below are under 2.5.
vif(model)
##        RDSpend Administration      Marketing 
##       2.468903       1.175091       2.326773
# Added-variable plots for the full model: one panel per predictor, showing its
# relationship with Profit after adjusting for the other predictors.
avPlots(model)

# The added-variable plots were read as an indication to drop the
# "Administration" variable from the model.

# Build the reduced model with R&D spend and Marketing only (Administration
# dropped). `data = Data` is passed explicitly (as for `model`) so the fit does
# not depend on the columns having been attach()ed to the search path.
# In the output below RDSpend is highly significant, Marketing is borderline
# (p = 0.06), and adjusted R-squared is 0.9483.
model4.startups <- lm(Profit ~ RDSpend + Marketing, data = Data)
summary(model4.startups)
## 
## Call:
## lm(formula = Profit ~ RDSpend + Marketing, data = Data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33645  -4632   -414   6484  17097 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.698e+04  2.690e+03  17.464   <2e-16 ***
## RDSpend     7.966e-01  4.135e-02  19.266   <2e-16 ***
## Marketing   2.991e-02  1.552e-02   1.927     0.06 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9161 on 47 degrees of freedom
## Multiple R-squared:  0.9505, Adjusted R-squared:  0.9483 
## F-statistic: 450.8 on 2 and 47 DF,  p-value: < 2.2e-16

# Index plots of the influence measures for the reduced model.
influenceIndexPlot(model4.startups)

# Plot of studentized residuals against hat values; the rows it labels are
# printed below. Row 50 has by far the largest Cook's distance (0.389).
influencePlot(model4.startups)

##       StudRes        Hat      CookD
## 15 -2.0302568 0.04617342 0.06236965
## 20  0.8367666 0.18077516 0.05183252
## 46  2.0131474 0.08481001 0.11755350
## 47 -0.9136765 0.21711116 0.07744170
## 50 -4.5461636 0.07424053 0.38948862
# Final model: Profit on RDSpend and Marketing, refitted with rows 46, 47 and
# 50 of Data removed. Those three rows are among the ones labelled in the
# influencePlot(model4.startups) output.
# In the output below both predictors are significant at the 1% level and
# adjusted R-squared is 0.9604.
finalmodel.startups <- lm(Profit ~ RDSpend + Marketing, data = Data[-c(46, 47, 50),])
summary(finalmodel.startups)
## 
## Call:
## lm(formula = Profit ~ RDSpend + Marketing, data = Data[-c(46, 
##     47, 50), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16820.9  -4143.3    -25.6   4904.0  12593.7 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.874e+04  2.373e+03  20.542  < 2e-16 ***
## RDSpend     7.582e-01  3.762e-02  20.152  < 2e-16 ***
## Marketing   3.799e-02  1.368e-02   2.777  0.00803 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7374 on 44 degrees of freedom
## Multiple R-squared:  0.9621, Adjusted R-squared:  0.9604 
## F-statistic: 558.2 on 2 and 44 DF,  p-value: < 2.2e-16
# Evaluate the LINE assumptions (Linearity, Independence, Normality, Equal
# variance) for the final model.
# plot() on an lm object draws the standard diagnostic panels: residuals vs
# fitted, normal Q-Q, scale-location, and residuals vs leverage (with Cook's
# distance contours).
plot(finalmodel.startups)

# Q-Q plot of the studentized residuals (car::qqPlot) for the same final model
# that the diagnostics above refer to. This previously inspected `model`, the
# first three-predictor fit on all rows, so it was not checking the model
# actually being evaluated.
qqPlot(finalmodel.startups)

# NOTE(review): qqPlot() prints the labels of the most extreme observations.
# The earlier run on `model` printed `46 50`; both rows are excluded from
# finalmodel.startups, so re-run this call to record the current output.