# Predict Profit of Startups
# Load the startup data set and prepare it for the analysis below.
library(e1071) # provides skewness() and kurtosis() used further down
Data <- read.csv("C:\\data science\\ds\\assignmentsbasicstatsandlinearregression\\50_Startups.csv")
# View() opens a GUI data viewer; only call it in an interactive session so
# the script can also be run in batch mode (Rscript, knitr, ...).
if (interactive()) {
  View(Data)
}
# Drop the 4th column; the four remaining columns are renamed right after.
# NOTE(review): presumably the dropped column is non-numeric (cor(Data) is
# called on the result later) -- confirm against the CSV.
Data <- Data[, -4]
colnames(Data) <- c("RDSpend", "Administration", "Marketing", "Profit")
# attach() puts the columns on the search path so they can be referenced by
# bare name (e.g. RDSpend). Kept for backward compatibility with any statement
# that refers to the columns that way.
attach(Data)
# First Moment Business Decision
# summary() prints min, quartiles, median, mean and max for every column.
summary(Data)
## RDSpend Administration Marketing Profit
## Min. : 0 Min. : 51283 Min. : 0 Min. : 14681
## 1st Qu.: 39936 1st Qu.:103731 1st Qu.:129300 1st Qu.: 90139
## Median : 73051 Median :122700 Median :212716 Median :107978
## Mean : 73722 Mean :121345 Mean :211025 Mean :112013
## 3rd Qu.:101603 3rd Qu.:144842 3rd Qu.:299469 3rd Qu.:139766
## Max. :165349 Max. :182646 Max. :471784 Max. :192262
# Second Moment Business Decision: spread of each column.
# Standard deviations
sd(Data$RDSpend)
## [1] 45902.26
sd(Data$Administration)
## [1] 28017.8
sd(Data$Marketing)
## [1] 122290.3
sd(Data$Profit)
## [1] 40306.18
# Variances
var(Data$RDSpend)
## [1] 2107017150
var(Data$Administration)
## [1] 784997271
var(Data$Marketing)
## [1] 14954920097
var(Data$Profit)
## [1] 1624588173
# Third Moment Business Decision: skewness of each column.
skewness(Data$RDSpend)
## [1] 0.1542932
skewness(Data$Administration)
## [1] -0.4600745
skewness(Data$Marketing)
## [1] -0.04372111
skewness(Data$Profit)
## [1] 0.02191219
# Fourth Moment Business Decision: kurtosis of each column.
kurtosis(Data$RDSpend)
## [1] -0.891987
kurtosis(Data$Administration)
## [1] -0.03664891
kurtosis(Data$Marketing)
## [1] -0.814161
kurtosis(Data$Profit)
## [1] -0.2871546
# Scatter plots of Profit against each predictor. with() resolves the column
# names inside Data while keeping the bare names as axis labels.
with(Data, plot(RDSpend, Profit))

with(Data, plot(Administration, Profit))

with(Data, plot(Marketing, Profit))

# Scatter-plot matrix of every pair of columns (inputs and output).
pairs(Data)

# Correlation coefficient matrix - strength and direction of each pairwise
# correlation.
cor(Data)
## RDSpend Administration Marketing Profit
## RDSpend 1.0000000 0.24195525 0.72424813 0.9729005
## Administration 0.2419552 1.00000000 -0.03215388 0.2007166
## Marketing 0.7242481 -0.03215388 1.00000000 0.7477657
## Profit 0.9729005 0.20071657 0.74776572 1.0000000
# Partial ("pure") correlations between the variables, computed from the
# correlation matrix with cor2pcor() from the corpcor package.
library(corpcor)
cor2pcor(cor(Data))
## [,1] [,2] [,3] [,4]
## [1,] 1.00000000 0.20852619 0.03890336 0.93477127
## [2,] 0.20852619 1.00000000 -0.28192506 -0.07725021
## [3,] 0.03890336 -0.28192506 1.00000000 0.23707116
## [4,] 0.93477127 -0.07725021 0.23707116 1.00000000
# Full linear model: Profit regressed on all three spend variables.
model <- lm(
  formula = Profit ~ RDSpend + Administration + Marketing,
  data = Data
)
summary(model)
##
## Call:
## lm(formula = Profit ~ RDSpend + Administration + Marketing, data = Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33534 -4795 63 6606 17275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.012e+04 6.572e+03 7.626 1.06e-09 ***
## RDSpend 8.057e-01 4.515e-02 17.846 < 2e-16 ***
## Administration -2.682e-02 5.103e-02 -0.526 0.602
## Marketing 2.723e-02 1.645e-02 1.655 0.105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared: 0.9507, Adjusted R-squared: 0.9475
## F-statistic: 296 on 3 and 46 DF, p-value: < 2.2e-16
# Prediction based on Administration only.
# `data = Data` makes the fit independent of attach() and matches the lm()
# calls in this script that already pass the data frame explicitly.
model1.startups <- lm(Profit ~ Administration, data = Data)
summary(model1.startups)
##
## Call:
## lm(formula = Profit ~ Administration, data = Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96072 -23426 -3564 25438 84870
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.697e+04 2.532e+04 3.040 0.00382 **
## Administration 2.887e-01 2.034e-01 1.419 0.16222
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39900 on 48 degrees of freedom
## Multiple R-squared: 0.04029, Adjusted R-squared: 0.02029
## F-statistic: 2.015 on 1 and 48 DF, p-value: 0.1622
# Prediction based on Marketing only.
# `data = Data` makes the fit independent of attach() and matches the lm()
# calls in this script that already pass the data frame explicitly.
model2.startups <- lm(Profit ~ Marketing, data = Data)
summary(model2.startups)
##
## Call:
## lm(formula = Profit ~ Marketing, data = Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83739 -18802 4925 15879 64642
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.000e+04 7.685e+03 7.808 4.29e-10 ***
## Marketing 2.465e-01 3.159e-02 7.803 4.38e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27040 on 48 degrees of freedom
## Multiple R-squared: 0.5592, Adjusted R-squared: 0.55
## F-statistic: 60.88 on 1 and 48 DF, p-value: 4.381e-10
# Administration is not significant, so look for influential records.
# The diagnostics below are computed on the Marketing-only model.
influence.measures(model2.startups)
## Influence measures of
## lm(formula = Profit ~ Marketing, data = Data) :
##
## dfb.1_ dfb.Mrkt dffit cov.r cook.d hat inf
## 1 -0.128325 0.201650 0.22232 1.156 0.025032 0.1128 *
## 2 -0.150928 0.247947 0.27945 1.115 0.039248 0.0940
## 3 -0.157806 0.281062 0.32993 1.062 0.053995 0.0729
## 4 -0.116810 0.225714 0.27593 1.056 0.037923 0.0605
## 5 -0.053680 0.112042 0.14212 1.084 0.010235 0.0528
## 6 -0.024360 0.051759 0.06620 1.096 0.002234 0.0515
## 7 0.402810 -0.253257 0.44675 0.827 0.089411 0.0295 *
## 8 -0.026728 0.080123 0.11751 1.067 0.006998 0.0374
## 9 -0.018478 0.068819 0.10769 1.064 0.005880 0.0338
## 10 -0.013741 0.060739 0.09906 1.064 0.004980 0.0320
## 11 0.058328 0.023766 0.16041 1.011 0.012804 0.0204
## 12 0.026657 0.038721 0.12711 1.035 0.008126 0.0220
## 13 0.023378 0.034147 0.11184 1.042 0.006313 0.0221
## 14 0.012719 0.021955 0.06750 1.058 0.002317 0.0224
## 15 0.008540 0.018689 0.05313 1.062 0.001438 0.0228
## 16 0.003834 0.011995 0.03103 1.066 0.000491 0.0235
## 17 0.001127 0.004294 0.01065 1.068 0.000058 0.0239
## 18 0.000342 -0.013446 -0.02643 1.071 0.000356 0.0270
## 19 0.004654 -0.031161 -0.05471 1.070 0.001525 0.0296
## 20 0.758056 -0.657538 0.75806 0.874 0.257594 0.0808 *
## 21 0.010567 -0.058699 -0.10010 1.061 0.005082 0.0305
## 22 0.016753 -0.088955 -0.15050 1.044 0.011392 0.0307
## 23 0.021540 -0.100330 -0.16549 1.040 0.013739 0.0316
## 24 0.024807 -0.110363 -0.18026 1.034 0.016252 0.0320
## 25 0.074319 -0.043146 0.08578 1.060 0.003736 0.0268
## 26 0.072986 -0.043138 0.08349 1.061 0.003540 0.0273
## 27 0.071075 -0.043075 0.08028 1.063 0.003276 0.0281
## 28 0.122288 -0.275649 -0.36206 0.982 0.063398 0.0476
## 29 0.088294 -0.058241 0.09568 1.065 0.004647 0.0318
## 30 0.097500 -0.067368 0.10345 1.067 0.005430 0.0347
## 31 0.128333 -0.093684 0.13314 1.066 0.008969 0.0396
## 32 0.117378 -0.086439 0.12138 1.071 0.007468 0.0406
## 33 0.242791 -0.196970 0.24433 1.061 0.029858 0.0571
## 34 -0.040322 -0.002549 -0.08551 1.048 0.003706 0.0200
## 35 -0.040327 0.000152 -0.08078 1.050 0.003309 0.0200
## 36 -0.040327 0.003416 -0.07516 1.052 0.002868 0.0200
## 37 -0.056945 0.008189 -0.10050 1.043 0.005104 0.0201
## 38 -0.059089 0.011425 -0.09949 1.043 0.005003 0.0203
## 39 -0.088888 0.027726 -0.13322 1.029 0.008905 0.0209
## 40 -0.089113 0.035090 -0.12303 1.036 0.007618 0.0218
## 41 -0.100289 0.041049 -0.13632 1.030 0.009325 0.0220
## 42 -0.100883 0.046675 -0.13004 1.036 0.008505 0.0230
## 43 -0.127085 0.069702 -0.15095 1.032 0.011423 0.0254
## 44 0.009662 -0.007980 0.00969 1.112 0.000048 0.0620
## 45 -0.017877 0.014933 -0.01791 1.116 0.000164 0.0655
## 46 0.050001 -0.043273 0.05000 1.132 0.001276 0.0797 *
## 47 0.059785 -0.356615 -0.61535 0.678 0.153522 0.0301 *
## 48 -0.198306 0.172010 -0.19831 1.113 0.019892 0.0808
## 49 -0.277845 0.241003 -0.27784 1.093 0.038697 0.0808
## 50 -0.549841 0.446798 -0.55317 0.904 0.141194 0.0575
library(car)
## Loading required package: carData
# Plot the influence measures of the Marketing-only model.
influenceIndexPlot(model2.startups) # index plots for influence measures

influencePlot(model2.startups)

## StudRes Hat CookD
## 1 0.6235364 0.11278942 0.02503230
## 2 0.8675434 0.09400477 0.03924825
## 7 2.5637064 0.02947101 0.08941114
## 20 2.5573415 0.08076983 0.25759372
## 47 -3.4922130 0.03011390 0.15352176
# Refit the full model without rows 20 and 47, which stand out in the
# influence output of the Marketing-only model.
model3.starups <- lm(
  formula = Profit ~ RDSpend + Administration + Marketing,
  data = Data[-c(20, 47), ]
)
summary(model3.starups)
##
## Call:
## lm(formula = Profit ~ RDSpend + Administration + Marketing, data = Data[-c(20,
## 47), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -33882 -4086 493 5921 17553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.900e+04 6.655e+03 7.363 3.37e-09 ***
## RDSpend 7.676e-01 5.434e-02 14.126 < 2e-16 ***
## Administration -1.995e-02 5.198e-02 -0.384 0.7030
## Marketing 4.193e-02 2.000e-02 2.097 0.0418 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9262 on 44 degrees of freedom
## Multiple R-squared: 0.95, Adjusted R-squared: 0.9466
## F-statistic: 278.8 on 3 and 44 DF, p-value: < 2.2e-16
# Variance inflation factors to check collinearity between the predictors.
vif(model)
## RDSpend Administration Marketing
## 2.468903 1.175091 2.326773
# Added-variable plots to check the relationship between each predictor and
# the output variable.
avPlots(model)

# The AV plots indicate that the "Administration" variable can be dropped.
# Build the model with R&D spend and Marketing only.
# `data = Data` makes the fit independent of attach() and matches the lm()
# calls in this script that already pass the data frame explicitly.
model4.startups <- lm(Profit ~ RDSpend + Marketing, data = Data)
summary(model4.startups)
##
## Call:
## lm(formula = Profit ~ RDSpend + Marketing, data = Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33645 -4632 -414 6484 17097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.698e+04 2.690e+03 17.464 <2e-16 ***
## RDSpend 7.966e-01 4.135e-02 19.266 <2e-16 ***
## Marketing 2.991e-02 1.552e-02 1.927 0.06 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9161 on 47 degrees of freedom
## Multiple R-squared: 0.9505, Adjusted R-squared: 0.9483
## F-statistic: 450.8 on 2 and 47 DF, p-value: < 2.2e-16
# Plot the influence measures of the two-predictor model.
influenceIndexPlot(model4.startups) # index plots for influence measures

influencePlot(model4.startups)

## StudRes Hat CookD
## 15 -2.0302568 0.04617342 0.06236965
## 20 0.8367666 0.18077516 0.05183252
## 46 2.0131474 0.08481001 0.11755350
## 47 -0.9136765 0.21711116 0.07744170
## 50 -4.5461636 0.07424053 0.38948862
# Final model: R&D spend and Marketing, refit without rows 46, 47 and 50
# (three of the rows listed by influencePlot() for model4.startups).
finalmodel.startups <- lm(
  formula = Profit ~ RDSpend + Marketing,
  data = Data[-c(46, 47, 50), ]
)
summary(finalmodel.startups)
##
## Call:
## lm(formula = Profit ~ RDSpend + Marketing, data = Data[-c(46,
## 47, 50), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -16820.9 -4143.3 -25.6 4904.0 12593.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.874e+04 2.373e+03 20.542 < 2e-16 ***
## RDSpend 7.582e-01 3.762e-02 20.152 < 2e-16 ***
## Marketing 3.799e-02 1.368e-02 2.777 0.00803 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7374 on 44 degrees of freedom
## Multiple R-squared: 0.9621, Adjusted R-squared: 0.9604
## F-statistic: 558.2 on 2 and 44 DF, p-value: < 2.2e-16
# Evaluate model LINE assumptions
# plot() on the fitted lm object draws its diagnostic plots.
plot(finalmodel.startups)




# Residual plots, Q-Q plot, standardized residuals vs fitted, Cook's distance
# NOTE(review): this Q-Q plot is drawn for `model` (the first full model), not
# for `finalmodel.startups` -- confirm which model was intended.
qqPlot(model)

## [1] 46 50