# PREDICTION OF PROFIT FOR 50 STARTUPS
# Load the startup data from a CSV file chosen interactively.
# NOTE(review): the variable name `str` is the same as base R's str()
# function; function calls such as str(...) still resolve to the function.
str <- read.csv(file = file.choose())
# Number of rows and columns.
dim(str)
## [1] 50 5
# Per-column summary statistics.
summary(object = str)
## R.D.Spend Administration Marketing.Spend State
## Min. : 0 Min. : 51283 Min. : 0 California:17
## 1st Qu.: 39936 1st Qu.:103731 1st Qu.:129300 Florida :16
## Median : 73051 Median :122700 Median :212716 New York :17
## Mean : 73722 Mean :121345 Mean :211025
## 3rd Qu.:101603 3rd Qu.:144842 3rd Qu.:299469
## Max. :165349 Max. :182646 Max. :471784
## Profit
## Min. : 14681
## 1st Qu.: 90139
## Median :107978
## Mean :112013
## 3rd Qu.:139766
## Max. :192262
# Compact structure listing; the function is namespace-qualified because the
# data frame itself is stored in a variable called `str`.
utils::str(str)
## 'data.frame': 50 obs. of 5 variables:
## $ R.D.Spend : num 165349 162598 153442 144372 142107 ...
## $ Administration : num 136898 151378 101146 118672 91392 ...
## $ Marketing.Spend: num 471784 443899 407935 383200 366168 ...
## $ State : Factor w/ 3 levels "California","Florida",..: 3 1 2 3 2 3 1 2 3 1 ...
## $ Profit : num 192262 191792 191050 182902 166188 ...
# Column names of the data frame.
names(str)
## [1] "R.D.Spend" "Administration" "Marketing.Spend" "State"
## [5] "Profit"
# Exploratory plots and correlations on the numeric columns.
# attach() is intentionally not used: an attached copy sits on the search
# path and does not follow later changes to `str`. Columns are reached
# through with() instead.

# Exclude the State column by name rather than by position (it was column 4).
str <- str[, names(str) != "State", drop = FALSE]

# Open a new graphics device portably; windows() exists only on Windows.
dev.new()
# Scatterplot matrix of all remaining columns.
plot(str)

# Profit against R&D spend.
with(str, plot(R.D.Spend, Profit))

# Marketing spend against R&D spend.
with(str, plot(R.D.Spend, Marketing.Spend))

# Pairwise correlation matrix of the remaining columns.
cor(str)
## R.D.Spend Administration Marketing.Spend Profit
## R.D.Spend 1.0000000 0.24195525 0.72424813 0.9729005
## Administration 0.2419552 1.00000000 -0.03215388 0.2007166
## Marketing.Spend 0.7242481 -0.03215388 1.00000000 0.7477657
## Profit 0.9729005 0.20071657 0.74776572 1.0000000
# Linear regression of Profit on all three spend variables, used to check
# which predictors are significant.
m1 <- lm(
  Profit ~ R.D.Spend + Administration + Marketing.Spend,
  data = str
)
summary(m1)
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
## data = str)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33534 -4795 63 6606 17275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.012e+04 6.572e+03 7.626 1.06e-09 ***
## R.D.Spend 8.057e-01 4.515e-02 17.846 < 2e-16 ***
## Administration -2.682e-02 5.103e-02 -0.526 0.602
## Marketing.Spend 2.723e-02 1.645e-02 1.655 0.105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared: 0.9507, Adjusted R-squared: 0.9475
## F-statistic: 296 on 3 and 46 DF, p-value: < 2.2e-16
# Model of log(Profit) on Administration and Marketing.Spend only
# (R.D.Spend is left out).
m2 <- lm(
  log(Profit) ~ Administration + Marketing.Spend,
  data = str
)
summary(m2)
##
## Call:
## lm(formula = log(Profit) ~ Administration + Marketing.Spend,
## data = str)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.50743 -0.10590 0.03404 0.15522 0.59146
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.054e+01 2.304e-01 45.742 < 2e-16 ***
## Administration 3.842e-06 1.707e-06 2.250 0.0292 *
## Marketing.Spend 2.555e-06 3.911e-07 6.534 4.17e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3346 on 47 degrees of freedom
## Multiple R-squared: 0.4992, Adjusted R-squared: 0.4779
## F-statistic: 23.43 on 2 and 47 DF, p-value: 8.736e-08
#model w.r.t Administration
#m3=lm(Profit~Administration, data=str)
#summary(m3)
library(car)
## Loading required package: carData
# Check for influential observations in the full model m1.
# NOTE(review): the original comment mentioned vif, but no vif() call is made
# here.
# dev.new() is used instead of windows(), which exists only on Windows.
dev.new()
influencePlot(m1)

## StudRes Hat CookD
## 46 2.0220730 0.08617007 0.09032342
## 47 -0.8268684 0.24060165 0.05453034
## 49 -1.6861241 0.21801940 0.19052744
## 50 -4.4961657 0.07477116 0.28808229
# Final model: Profit on R.D.Spend and Marketing.Spend, fitted without
# rows 46, 47, 49 and 50 (the observations listed by influencePlot(m1)).
finalmodel <- lm(
  Profit ~ R.D.Spend + Marketing.Spend,
  data = str[-c(46, 47, 49, 50), ]
)
summary(finalmodel)
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = str[-c(46,
## 47, 49, 50), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -16713.8 -4219.8 -529.1 4383.3 12081.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.029e+04 2.425e+03 20.743 <2e-16 ***
## R.D.Spend 7.507e-01 3.660e-02 20.511 <2e-16 ***
## Marketing.Spend 3.501e-02 1.332e-02 2.627 0.0119 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7137 on 43 degrees of freedom
## Multiple R-squared: 0.9612, Adjusted R-squared: 0.9594
## F-statistic: 532.5 on 2 and 43 DF, p-value: < 2.2e-16
# Added-variable plots for the final model.
# dev.new() is used instead of windows(), which exists only on Windows.
dev.new()
avPlots(finalmodel)

# Predicted profit for the rows the final model was fitted on.
# predict() on an lm object takes `newdata`; the earlier `data = str` was not
# a recognised argument and was silently ignored, so the call only returned
# the fitted values. The retained rows are now passed explicitly.
kept_rows <- str[-c(46, 47, 49, 50), ]
pv <- predict(finalmodel, newdata = kept_rows)
pv
## 1 2 3 4 5 6 7 8
## 190939.26 187897.51 179764.84 172090.62 169794.00 161998.02 155822.68 159448.15
## 9 10 11 12 13 14 15 16
## 151695.11 153559.26 134823.34 134612.13 129504.38 128198.41 149316.43 145432.04
## 17 18 19 20 21 22 23 24
## 118112.75 131245.89 129494.94 115170.33 117993.36 119634.17 116460.20 111659.75
## 25 26 27 28 29 30 31 32
## 113052.62 103667.76 111536.62 116789.12 104015.27 103295.02 100023.83 99277.66
## 33 34 35 36 37 38 39 40
## 99508.80 99466.98 92525.20 92031.04 78852.12 90274.45 71965.16 85365.73
## 41 42 43 44 45 48
## 77928.38 76990.28 73221.70 63177.52 67934.33 50293.15
# Bind each prediction next to the observation it belongs to.
final <- cbind(kept_rows, pv)