# Prepare a prediction model for profit of 50_startups data
Startups <- read.csv(file.choose())
View(Startups)
class(Startups)
## [1] "data.frame"
# To Transform the data from Character to Numeric
library(plyr)
## Warning: package 'plyr' was built under R version 3.5.1
Startups$State <- revalue(Startups$State,
c("New York"="0", "California"="1", "Florida"="2"))
attach(Startups)
Startups <- cbind(RD_Spend=R.D.Spend,Administration,Marketing_Spend=Marketing.Spend,State,Profit)
Startups <- as.data.frame(Startups)
attach(Startups) # Basically to avoid reference of Data Set name(Startups) in this report.
## The following objects are masked from Startups (pos = 3):
##
## Administration, Profit, State
#Corolla_Pred <- cbind(Price,Age_08_04,KM,HP,cc,Doors,Gears,Quarterly_Tax,Weight)
#Corolla_Pred1 <- as.data.frame(Corolla_Pred)
#class(Corolla_Pred1)
# Exploratory data analysis:
# 1. Measures of central tendency
# 2. Measures of dispersion
# 3. Third moment business decision
# 4. Fourth moment business decision
# 5. Probability distributions of variables
# 6. Graphical representations (Histogram, Box plot, Dot plot, Stem & Leaf plot, Bar plot, etc.)
summary(Startups)
## RD_Spend Administration Marketing_Spend State
## Min. : 0 Min. : 51283 Min. : 0 Min. :1
## 1st Qu.: 39936 1st Qu.:103731 1st Qu.:129300 1st Qu.:1
## Median : 73051 Median :122700 Median :212716 Median :2
## Mean : 73722 Mean :121345 Mean :211025 Mean :2
## 3rd Qu.:101603 3rd Qu.:144842 3rd Qu.:299469 3rd Qu.:3
## Max. :165349 Max. :182646 Max. :471784 Max. :3
## Profit
## Min. : 14681
## 1st Qu.: 90139
## Median :107978
## Mean :112013
## 3rd Qu.:139766
## Max. :192262
plot(R.D.Spend, Profit)

plot(Administration, Profit)

plot(Marketing.Spend, Profit)

plot(State, Profit)

windows()
# 7. Find the correlation between Output (Profit) & inputs (R.D Spend, Administration, Marketing, State) - SCATTER DIAGRAM
pairs(Startups)

# 8. Correlation coefficient - Strength & Direction of correlation
cor(Startups)
## RD_Spend Administration Marketing_Spend State
## RD_Spend 1.0000000 0.24195525 0.72424813 0.10468511
## Administration 0.2419552 1.00000000 -0.03215388 0.01184720
## Marketing_Spend 0.7242481 -0.03215388 1.00000000 0.07766961
## State 0.1046851 0.01184720 0.07766961 1.00000000
## Profit 0.9729005 0.20071657 0.74776572 0.10179631
## Profit
## RD_Spend 0.9729005
## Administration 0.2007166
## Marketing_Spend 0.7477657
## State 0.1017963
## Profit 1.0000000
# The Linear Model of interest
Model.Startups <- lm(Profit~RD_Spend+Administration+Marketing_Spend+State)
summary(Model.Startups)
##
## Call:
## lm(formula = Profit ~ RD_Spend + Administration + Marketing_Spend +
## State)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33553 -4779 63 6595 17301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.016e+04 7.322e+03 6.851 1.69e-08 ***
## RD_Spend 8.058e-01 4.576e-02 17.609 < 2e-16 ***
## Administration -2.683e-02 5.160e-02 -0.520 0.606
## Marketing_Spend 2.723e-02 1.663e-02 1.637 0.109
## State -2.232e+01 1.610e+03 -0.014 0.989
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9334 on 45 degrees of freedom
## Multiple R-squared: 0.9507, Adjusted R-squared: 0.9464
## F-statistic: 217.2 on 4 and 45 DF, p-value: < 2.2e-16
Model.Startups1 <- lm(Profit~RD_Spend+log(Administration))
summary(Model.Startups1)
##
## Call:
## lm(formula = Profit ~ RD_Spend + log(Administration))
##
## Residuals:
## Min 1Q Median 3Q Max
## -33851 -4928 -180 6385 17863
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.021e+05 6.087e+04 1.677 0.100
## RD_Spend 8.614e-01 3.049e-02 28.250 <2e-16 ***
## log(Administration) -4.589e+03 5.260e+03 -0.872 0.387
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9440 on 47 degrees of freedom
## Multiple R-squared: 0.9474, Adjusted R-squared: 0.9451
## F-statistic: 423.2 on 2 and 47 DF, p-value: < 2.2e-16
# Model.Startups2 <- lm(Profit~RD_Spend+Administration+Marketing_Spend+log
# summary(Model.Startups2)
### Scatter plot matrix with Correlations inserted in graph
panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r = (cor(x, y))
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex <- 0.4/strwidth(txt)
text(0.5, 0.5, txt, cex = cex)
}
pairs(Startups, upper.panel=panel.cor,main="Scatter Plot Matrix with Correlation Coefficients")

### Partial Correlation matrix - Pure correlation between the variables
# install.packages("corpcor")
library(corpcor)
cor2pcor(cor(Startups))
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1.00000000 0.20880590 0.038920914 0.026971505 0.934484951
## [2,] 0.20880590 1.00000000 -0.281913894 -0.013933444 -0.077271343
## [3,] 0.03892091 -0.28191389 1.000000000 -0.001176456 0.237068057
## [4,] 0.02697151 -0.01393344 -0.001176456 1.000000000 -0.002066896
## [5,] 0.93448495 -0.07727134 0.237068057 -0.002066896 1.000000000
# install.packages("mvinfluence")
library(mvinfluence)
## Warning: package 'mvinfluence' was built under R version 3.5.1
## Loading required package: car
## Warning: package 'car' was built under R version 3.5.1
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.5.1
## Loading required package: heplots
## Warning: package 'heplots' was built under R version 3.5.1
##
## Attaching package: 'mvinfluence'
## The following object is masked from 'package:corpcor':
##
## mpower
library(car)
# It is better to delete a single observation rather than entire variable to get rid of collinearity problem
# Deletion Diagnostics for identifying influential variable
influence.measures(Model.Startups)
## Influence measures of
## lm(formula = Profit ~ RD_Spend + Administration + Marketing_Spend + State) :
##
## dfb.1_ dfb.RD_S dfb.Admn dfb.Mr_S dfb.Stat dffit cov.r
## 1 0.005573 -0.001614 -1.78e-03 -0.00458 -0.004266 -0.01138 1.309
## 2 -0.030474 0.023633 4.12e-02 0.04566 -0.066200 0.13381 1.324
## 3 0.086304 0.181884 -1.64e-01 0.02355 -0.030499 0.35755 1.122
## 4 -0.072998 0.119221 -5.54e-02 0.04566 0.162555 0.34081 1.097
## 5 -0.100209 -0.136061 1.49e-01 0.01950 0.018111 -0.23760 1.196
## 6 -0.014497 -0.089593 1.11e-01 -0.01501 -0.117810 -0.24762 1.170
## 7 -0.031273 -0.099257 5.50e-03 0.08894 0.051794 -0.12341 1.405
## 8 0.045172 -0.035906 -4.31e-02 -0.01623 0.008350 -0.11748 1.160
## 9 -0.010434 0.001367 6.84e-03 0.00303 0.008975 0.01608 1.217
## 10 -0.092215 -0.094245 6.88e-02 0.02277 0.114783 -0.18347 1.188
## 11 0.116853 0.162921 -1.22e-01 -0.10745 -0.012494 0.24442 1.003
## 12 0.247775 0.159631 -2.01e-01 -0.08378 -0.189193 0.32287 1.115
## 13 -0.005299 0.046825 2.27e-02 0.00696 -0.008788 0.21480 0.933
## 14 0.020469 0.012107 4.96e-02 0.02364 -0.136758 0.18763 1.117
## 15 0.219338 -0.181121 -2.65e-01 0.05847 0.026192 -0.50523 0.795
## 16 0.070447 -0.224544 7.58e-02 0.11071 -0.304261 -0.49002 0.822
## 17 0.073736 -0.044565 2.33e-02 0.09081 -0.198951 0.27267 1.033
## 18 0.099087 0.030794 -7.35e-02 -0.05013 -0.091981 -0.14780 1.168
## 19 -0.011430 0.004661 1.42e-02 -0.03678 0.003387 -0.09144 1.121
## 20 -0.028144 0.256904 3.85e-02 -0.35645 0.164789 0.44558 1.304
## 21 0.015148 -0.015929 -1.77e-05 0.02760 -0.035848 0.05457 1.198
## 22 0.182630 0.127895 -1.60e-01 -0.14191 -0.125242 -0.24650 1.203
## 23 0.022466 0.060978 -2.84e-02 -0.08057 -0.000145 -0.10593 1.145
## 24 -0.003225 0.017368 3.21e-03 -0.02305 -0.000174 -0.03250 1.179
## 25 -0.061116 -0.081784 9.25e-02 0.09845 -0.095141 -0.17189 1.195
## 26 0.031802 0.015801 4.38e-02 -0.03848 -0.098271 0.14774 1.153
## 27 0.012297 -0.033948 -4.36e-02 0.05634 -0.000515 -0.11475 1.135
## 28 0.206794 0.258315 -1.38e-01 -0.31127 -0.199993 -0.40597 1.123
## 29 -0.039607 -0.008569 5.97e-02 -0.00635 0.001391 0.07104 1.280
## 30 0.011586 -0.001798 -1.21e-02 0.00901 -0.015883 -0.02718 1.231
## 31 0.006628 0.006285 -4.11e-03 -0.00980 0.000176 0.01267 1.183
## 32 0.002556 -0.000516 -2.80e-03 0.00252 -0.003805 -0.00666 1.241
## 33 -0.031476 -0.031588 5.80e-03 0.04613 0.032092 -0.06325 1.254
## 34 -0.010415 0.006871 8.04e-03 -0.00527 -0.000609 -0.02214 1.154
## 35 -0.097131 -0.185557 2.33e-01 0.15161 -0.149674 0.32934 1.173
## 36 0.062079 -0.044366 -1.04e-01 0.02142 0.125511 0.20905 1.168
## 37 -0.093069 -0.399005 2.02e-01 0.28778 0.028946 0.47782 0.858
## 38 0.015553 0.001371 -1.35e-02 -0.00142 -0.006644 0.01668 1.359
## 39 0.231713 -0.178833 -3.18e-01 0.08353 0.273642 0.56424 1.071
## 40 -0.081516 0.009679 5.37e-02 0.00013 0.052197 -0.09534 1.215
## 41 0.031393 -0.058872 1.79e-02 0.03617 -0.061301 0.10559 1.188
## 42 0.057389 -0.028676 -4.82e-02 0.00799 0.004377 0.08723 1.183
## 43 0.019182 -0.009398 -8.06e-03 0.00288 -0.015514 0.02765 1.213
## 44 -0.028180 -0.086212 5.60e-02 -0.10035 0.221626 0.37035 1.092
## 45 0.000536 -0.004571 1.23e-02 -0.00537 -0.010484 0.02395 1.273
## 46 -0.009334 -0.206043 8.68e-02 -0.21717 0.432166 0.77523 0.810
## 47 0.096246 0.430539 -1.44e-01 -0.38598 -0.023283 -0.46097 1.368
## 48 -0.042206 0.045654 -4.66e-02 0.03804 0.069689 -0.16308 1.240
## 49 -0.652061 -0.119392 7.14e-01 0.44462 -0.379188 -0.99824 1.090
## 50 -0.674607 0.559442 -1.00e-01 0.14833 0.704506 -1.48097 0.183
## cook.d hat inf
## 1 2.65e-05 0.1453
## 2 3.65e-03 0.1636
## 3 2.56e-02 0.1111
## 4 2.32e-02 0.0968
## 5 1.14e-02 0.1110
## 6 1.24e-02 0.1010
## 7 3.11e-03 0.2087 *
## 8 2.81e-03 0.0594
## 9 5.29e-05 0.0808
## 10 6.83e-03 0.0925
## 11 1.19e-02 0.0421
## 12 2.09e-02 0.0985
## 13 9.06e-03 0.0242
## 14 7.11e-03 0.0593
## 15 4.81e-02 0.0640
## 16 4.56e-02 0.0653
## 17 1.48e-02 0.0567
## 18 4.44e-03 0.0723
## 19 1.70e-03 0.0309
## 20 3.99e-02 0.2110
## 21 6.09e-04 0.0699
## 22 1.23e-02 0.1167
## 23 2.28e-03 0.0476
## 24 2.16e-04 0.0529
## 25 6.00e-03 0.0937
## 26 4.43e-03 0.0639
## 27 2.68e-03 0.0449
## 28 3.28e-02 0.1247
## 29 1.03e-03 0.1293
## 30 1.51e-04 0.0921
## 31 3.28e-05 0.0542
## 32 9.08e-06 0.0981
## 33 8.18e-04 0.1107
## 34 1.00e-04 0.0318
## 35 2.18e-02 0.1244
## 36 8.85e-03 0.0891
## 37 4.37e-02 0.0693
## 38 5.69e-05 0.1770 *
## 39 6.25e-02 0.1480
## 40 1.86e-03 0.0891
## 41 2.27e-03 0.0734
## 42 1.55e-03 0.0657
## 43 1.56e-04 0.0786
## 44 2.73e-02 0.1034
## 45 1.17e-04 0.1217
## 46 1.12e-01 0.1250
## 47 4.28e-02 0.2412 *
## 48 5.42e-03 0.1177
## 49 1.91e-01 0.2548
## 50 3.06e-01 0.0966 *
influenceIndexPlot(Model.Startups, id.n=3) # Index Plots of the influence measures
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter

influencePlot(Model.Startups, id.n=3) # A user friendly representation of the above
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter

## StudRes Hat CookD
## 46 2.0508431 0.12502380 0.11220265
## 47 -0.8175683 0.24121705 0.04281347
## 49 -1.7072322 0.25478210 0.19116349
## 50 -4.5278887 0.09664062 0.30602820
# infIndexPlot.mlm()
## Regression after deleting the 49th and 50th observation, which is influential observation
# Logarthimic Transformation
Model.Startups_Log<-lm(Profit~RD_Spend+log(Administration)+Marketing_Spend+log(State),data=Startups[-c(49,50),])
summary(Model.Startups_Log) #Adjusted R2 Value = 0.9591
##
## Call:
## lm(formula = Profit ~ RD_Spend + log(Administration) + Marketing_Spend +
## log(State), data = Startups[-c(49, 50), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15952 -5122 -1868 5748 13769
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.259e+05 5.682e+04 2.215 0.0321 *
## RD_Spend 7.881e-01 3.674e-02 21.452 <2e-16 ***
## log(Administration) -6.357e+03 4.865e+03 -1.307 0.1983
## Marketing_Spend 1.819e-02 1.360e-02 1.338 0.1880
## log(State) -6.988e+02 2.399e+03 -0.291 0.7722
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7448 on 43 degrees of freedom
## Multiple R-squared: 0.9625, Adjusted R-squared: 0.9591
## F-statistic: 276.2 on 4 and 43 DF, p-value: < 2.2e-16
confint(Model.Startups_Log,level=0.95)
## 2.5 % 97.5 %
## (Intercept) 1.129255e+04 2.404707e+05
## RD_Spend 7.139937e-01 8.621648e-01
## log(Administration) -1.616879e+04 3.455246e+03
## Marketing_Spend -9.231786e-03 4.562130e-02
## log(State) -5.536268e+03 4.138733e+03
predict(Model.Startups_Log,interval="predict")
## Warning in predict.lm(Model.Startups_Log, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 188824.76 172728.64 204920.88
## 2 186277.54 169881.77 202673.31
## 3 180486.18 164669.44 196302.93
## 4 171589.82 155883.61 187296.03
## 5 171438.65 155605.14 187272.15
## 6 162472.34 146739.04 178205.63
## 7 158650.41 142115.60 175185.22
## 8 158405.22 142933.74 173876.70
## 9 150072.78 134507.87 165637.70
## 10 154914.25 139197.89 170630.61
## 11 136057.26 120732.95 151381.56
## 12 137122.64 121364.05 152881.24
## 13 129194.70 113985.60 144403.80
## 14 127860.10 112315.70 143404.50
## 15 148555.19 133087.47 164022.90
## 16 145649.45 130171.52 161127.39
## 17 117743.78 102225.20 133262.35
## 18 129302.29 113806.00 144798.59
## 19 129041.14 113792.02 144290.26
## 20 117309.84 100741.17 133878.51
## 21 117399.30 101786.80 133011.79
## 22 116424.30 100656.98 132191.62
## 23 114739.94 99346.07 130133.81
## 24 110622.97 95205.25 126040.69
## 25 115249.28 99485.69 131012.87
## 26 104049.15 88489.37 119608.94
## 27 111692.42 96325.94 127058.91
## 28 113619.02 97729.66 129508.37
## 29 102586.58 86865.28 118307.89
## 30 102875.93 87225.15 118526.72
## 31 101803.14 86331.89 117274.39
## 32 99023.41 83320.49 114726.33
## 33 101876.80 85998.34 117755.25
## 34 99659.66 84380.22 114939.10
## 35 90224.01 74242.48 106205.55
## 36 92960.51 77199.73 108721.28
## 37 76938.83 61359.58 92518.07
## 38 95257.22 78055.81 112458.62
## 39 73888.65 57403.74 90373.56
## 40 87453.65 71675.36 103231.95
## 41 77419.78 61760.70 93078.85
## 42 78241.35 62610.99 93871.70
## 43 74267.32 58592.72 89941.92
## 44 63256.72 47409.22 79104.22
## 45 67912.02 51948.94 83875.11
## 46 51376.57 35340.75 67412.38
## 47 57721.47 40924.30 74518.63
## 48 50768.83 34762.86 66774.80
Model.Startups_Fin1<-lm(Profit~RD_Spend+Administration+Marketing_Spend+State,data=Startups[-c(49,50),])
summary(Model.Startups_Fin1) # Adjusted R2 Value is 0.9567
##
## Call:
## lm(formula = Profit ~ RD_Spend + Administration + Marketing_Spend +
## State, data = Startups[-c(49, 50), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15944 -4787 -1904 6066 13646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.960e+04 6.314e+03 9.438 4.84e-12 ***
## RD_Spend 7.898e-01 3.677e-02 21.480 < 2e-16 ***
## Administration -6.257e-02 4.451e-02 -1.406 0.167
## Marketing_Spend 1.705e-02 1.369e-02 1.245 0.220
## State -3.267e+02 1.326e+03 -0.246 0.806
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7429 on 43 degrees of freedom
## Multiple R-squared: 0.9627, Adjusted R-squared: 0.9593
## F-statistic: 277.7 on 4 and 43 DF, p-value: < 2.2e-16
# Exponential Transformation :
Model.Startups_exp<-lm(log(Profit)~RD_Spend+Administration+Marketing_Spend+State,data=Startups[-c(49,50),])
summary(Model.Startups_exp) #Adjusted R2 Value is 0.9182
##
## Call:
## lm(formula = log(Profit) ~ RD_Spend + Administration + Marketing_Spend +
## State, data = Startups[-c(49, 50), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36095 -0.03876 0.00946 0.05837 0.17095
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.111e+01 8.242e-02 134.844 <2e-16 ***
## RD_Spend 7.431e-06 4.800e-07 15.482 <2e-16 ***
## Administration -7.457e-07 5.810e-07 -1.284 0.206
## Marketing_Spend -5.233e-09 1.787e-07 -0.029 0.977
## State 6.658e-03 1.730e-02 0.385 0.702
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09697 on 43 degrees of freedom
## Multiple R-squared: 0.9252, Adjusted R-squared: 0.9182
## F-statistic: 132.9 on 4 and 43 DF, p-value: < 2.2e-16
Model.Startups_exp1<-lm(log(Profit)~RD_Spend+Marketing_Spend,data=Startups[-c(49,50),])
summary(Model.Startups_exp1)
##
## Call:
## lm(formula = log(Profit) ~ RD_Spend + Marketing_Spend, data = Startups[-c(49,
## 50), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.37520 -0.03278 0.00667 0.05383 0.15858
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.103e+01 3.089e-02 357.165 <2e-16 ***
## RD_Spend 7.207e-06 4.419e-07 16.308 <2e-16 ***
## Marketing_Spend 8.151e-08 1.653e-07 0.493 0.624
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09669 on 45 degrees of freedom
## Multiple R-squared: 0.9221, Adjusted R-squared: 0.9187
## F-statistic: 266.5 on 2 and 45 DF, p-value: < 2.2e-16
# Quad Model
Model.Startups_Quad <- lm(Profit~RD_Spend+I(RD_Spend^2)+Administration+I(Administration^2)
+Marketing_Spend+I(Marketing_Spend^2)+State+I(State^2),data=Startups[-c(49,50),])
summary(Model.Startups_Quad) #Adjusted R2 value is 0.9567
##
## Call:
## lm(formula = Profit ~ RD_Spend + I(RD_Spend^2) + Administration +
## I(Administration^2) + Marketing_Spend + I(Marketing_Spend^2) +
## State + I(State^2), data = Startups[-c(49, 50), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15181 -4247 -1080 4490 13994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.142e+04 1.845e+04 3.329 0.00191 **
## RD_Spend 7.385e-01 1.203e-01 6.137 3.34e-07 ***
## I(RD_Spend^2) 2.987e-07 8.110e-07 0.368 0.71465
## Administration -6.758e-03 2.896e-01 -0.023 0.98150
## I(Administration^2) -2.573e-07 1.207e-06 -0.213 0.83228
## Marketing_Spend -3.638e-03 4.592e-02 -0.079 0.93727
## I(Marketing_Spend^2) 4.791e-08 1.150e-07 0.417 0.67928
## State -1.839e+03 9.910e+03 -0.186 0.85376
## I(State^2) 3.407e+02 2.465e+03 0.138 0.89078
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7659 on 39 degrees of freedom
## Multiple R-squared: 0.9641, Adjusted R-squared: 0.9567
## F-statistic: 130.8 on 8 and 39 DF, p-value: < 2.2e-16
confint(Model.Startups_Quad,level=0.95)
## 2.5 % 97.5 %
## (Intercept) 2.409786e+04 9.874414e+04
## RD_Spend 4.950840e-01 9.819009e-01
## I(RD_Spend^2) -1.341640e-06 1.938955e-06
## Administration -5.925914e-01 5.790761e-01
## I(Administration^2) -2.697887e-06 2.183367e-06
## Marketing_Spend -9.652462e-02 8.924949e-02
## I(Marketing_Spend^2) -1.847371e-07 2.805648e-07
## State -2.188322e+04 1.820574e+04
## I(State^2) -4.645264e+03 5.326694e+03
predict(Model.Startups_Quad,interval="predict")
## Warning in predict.lm(Model.Startups_Quad, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 192447.86 174455.25 210440.46
## 2 188804.45 170862.94 206745.97
## 3 182627.55 165522.80 199732.30
## 4 173031.09 156492.48 189569.70
## 5 172408.76 155615.81 189201.70
## 6 163306.63 146883.26 179730.00
## 7 158495.61 140270.54 176720.68
## 8 157816.95 141439.50 174194.40
## 9 149154.75 132673.44 165636.07
## 10 155122.18 138708.95 171535.41
## 11 135259.03 118916.22 151601.84
## 12 136587.50 120147.22 153027.78
## 13 128106.61 111970.31 144242.90
## 14 126887.15 110594.51 143179.78
## 15 146837.16 130025.83 163648.49
## 16 145097.80 128449.19 161746.40
## 17 117113.75 100747.81 133479.68
## 18 127953.66 111579.02 144328.30
## 19 128345.92 112096.27 144595.57
## 20 117921.85 99983.99 135859.72
## 21 117054.94 100427.38 133682.50
## 22 114788.46 97980.35 131596.57
## 23 113982.79 97483.87 130481.71
## 24 110090.88 93473.27 126708.50
## 25 114869.22 98282.50 131455.94
## 26 103383.13 87185.32 119580.94
## 27 110485.62 94196.54 126774.69
## 28 113396.89 96028.08 130765.70
## 29 99610.67 80790.74 118430.60
## 30 101807.21 85453.17 118161.24
## 31 101881.31 85202.19 118560.44
## 32 98257.57 81910.25 114604.89
## 33 102715.99 85708.40 119723.58
## 34 99005.79 82833.46 115178.11
## 35 88751.27 71877.65 105624.88
## 36 92425.24 75930.98 108919.51
## 37 76714.71 60175.55 93253.86
## 38 93168.54 73929.13 112407.95
## 39 73438.96 55704.24 91173.69
## 40 87340.69 70985.54 103695.84
## 41 77790.43 61311.04 94269.81
## 42 78216.75 61699.09 94734.42
## 43 75029.40 58637.46 91421.34
## 44 65389.95 48636.44 82143.46
## 45 69172.01 52328.07 86015.95
## 46 54899.04 37402.77 72395.31
## 47 58993.89 40893.14 77094.65
## 48 54289.54 36802.47 71776.61
Model.Startups_Quad1 <- lm(Profit~RD_Spend+I(RD_Spend^2)+Marketing_Spend+I(Marketing_Spend^2)
,data=Startups[-c(49,50),])
summary(Model.Startups_Quad1) #Adjusted R2 value is 0.9567
##
## Call:
## lm(formula = Profit ~ RD_Spend + I(RD_Spend^2) + Marketing_Spend +
## I(Marketing_Spend^2), data = Startups[-c(49, 50), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -16990.4 -4031.9 -749.6 5261.9 13097.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.417e+04 3.628e+03 14.928 < 2e-16 ***
## RD_Spend 6.834e-01 1.104e-01 6.192 1.92e-07 ***
## I(RD_Spend^2) 5.628e-07 7.613e-07 0.739 0.464
## Marketing_Spend 2.002e-02 3.934e-02 0.509 0.613
## I(Marketing_Spend^2) 3.527e-09 1.022e-07 0.035 0.973
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7500 on 43 degrees of freedom
## Multiple R-squared: 0.962, Adjusted R-squared: 0.9585
## F-statistic: 272.2 on 4 and 43 DF, p-value: < 2.2e-16
# Poly Modal
Model.Startups_Poly <- lm(Profit~RD_Spend+I(RD_Spend^2)+I(RD_Spend^3)+
Administration+I(Administration^2)+I(Administration^3)+
Marketing_Spend+I(Marketing_Spend^2)+I(Marketing_Spend^3)+
State+I(State^2)+I(State^3),data=Startups[-c(49,50),])
summary(Model.Startups_Poly) #Adjusted R Square Value is 0.9569
##
## Call:
## lm(formula = Profit ~ RD_Spend + I(RD_Spend^2) + I(RD_Spend^3) +
## Administration + I(Administration^2) + I(Administration^3) +
## Marketing_Spend + I(Marketing_Spend^2) + I(Marketing_Spend^3) +
## State + I(State^2) + I(State^3), data = Startups[-c(49, 50),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -14811 -4320 -1841 4371 14941
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.338e+04 4.270e+04 0.782 0.439461
## RD_Spend 1.321e+00 3.667e-01 3.603 0.000942 ***
## I(RD_Spend^2) -1.007e-05 6.258e-06 -1.610 0.116160
## I(RD_Spend^3) 4.834e-11 2.915e-11 1.658 0.105936
## Administration 5.050e-01 1.199e+00 0.421 0.676221
## I(Administration^2) -3.802e-06 1.059e-05 -0.359 0.721689
## I(Administration^3) 7.862e-12 3.022e-11 0.260 0.796195
## Marketing_Spend -9.678e-02 1.097e-01 -0.883 0.383326
## I(Marketing_Spend^2) 7.294e-07 6.585e-07 1.108 0.275351
## I(Marketing_Spend^3) -1.304e-12 1.129e-12 -1.155 0.255534
## State -2.194e+03 1.036e+04 -0.212 0.833431
## I(State^2) 6.219e+02 2.574e+03 0.242 0.810420
## I(State^3) NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7641 on 36 degrees of freedom
## Multiple R-squared: 0.967, Adjusted R-squared: 0.9569
## F-statistic: 95.87 on 11 and 36 DF, p-value: < 2.2e-16
Model.Startups_Poly1 <- lm(Profit~RD_Spend+I(RD_Spend^2)+I(RD_Spend^3)+
Marketing_Spend+I(Marketing_Spend^2)+I(Marketing_Spend^3)
,data=Startups[-c(49,50),])
summary(Model.Startups_Poly1) #Adjusted R Square Value is 0.9601
##
## Call:
## lm(formula = Profit ~ RD_Spend + I(RD_Spend^2) + I(RD_Spend^3) +
## Marketing_Spend + I(Marketing_Spend^2) + I(Marketing_Spend^3),
## data = Startups[-c(49, 50), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15952 -4066 -1781 3793 14807
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.069e+04 4.254e+03 11.915 6.75e-15 ***
## RD_Spend 1.270e+00 3.308e-01 3.839 0.000419 ***
## I(RD_Spend^2) -9.570e-06 5.490e-06 -1.743 0.088806 .
## I(RD_Spend^3) 4.654e-11 2.521e-11 1.846 0.072084 .
## Marketing_Spend -9.277e-02 1.044e-01 -0.889 0.379383
## I(Marketing_Spend^2) 7.500e-07 6.165e-07 1.217 0.230722
## I(Marketing_Spend^3) -1.343e-12 1.032e-12 -1.302 0.200249
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7349 on 41 degrees of freedom
## Multiple R-squared: 0.9652, Adjusted R-squared: 0.9601
## F-statistic: 189.7 on 6 and 41 DF, p-value: < 2.2e-16
### Variance Inflation Factors is a formal way to check for collinearity
vif(Model.Startups_Log) # VIF is > 10 => collinearity
## RD_Spend log(Administration) Marketing_Spend
## 2.238234 1.186503 2.193121
## log(State)
## 1.024393
avPlots(Model.Startups_Log, id.n=2, id.cex=0.7) # Added Variable Plots
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.window(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.cex" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in box(...): "id.cex" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.cex" is not a
## graphical parameter
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.window(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.cex" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in box(...): "id.cex" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.cex" is not a
## graphical parameter
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.window(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.cex" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in box(...): "id.cex" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.cex" is not a
## graphical parameter
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.window(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.cex" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in box(...): "id.cex" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.cex" is not a
## graphical parameter

# Final Model
FinalModel<-lm(Profit~RD_Spend+log(Administration)+Marketing_Spend+
log(State),data=Startups[-c(49,50),])
summary(FinalModel) #Adjusted R2 Value = 0.9591
##
## Call:
## lm(formula = Profit ~ RD_Spend + log(Administration) + Marketing_Spend +
## log(State), data = Startups[-c(49, 50), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15952 -5122 -1868 5748 13769
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.259e+05 5.682e+04 2.215 0.0321 *
## RD_Spend 7.881e-01 3.674e-02 21.452 <2e-16 ***
## log(Administration) -6.357e+03 4.865e+03 -1.307 0.1983
## Marketing_Spend 1.819e-02 1.360e-02 1.338 0.1880
## log(State) -6.988e+02 2.399e+03 -0.291 0.7722
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7448 on 43 degrees of freedom
## Multiple R-squared: 0.9625, Adjusted R-squared: 0.9591
## F-statistic: 276.2 on 4 and 43 DF, p-value: < 2.2e-16
Profit_Predict <- predict(FinalModel,interval="predict")
## Warning in predict.lm(FinalModel, interval = "predict"): predictions on current data refer to _future_ responses
Final <- cbind(Startups$RD_Spend,Startups$Administration,Startups$Marketing_Spend,
Startups$State,Startups$Profit,Profit_Predict)
## Warning in cbind(Startups$RD_Spend, Startups$Administration,
## Startups$Marketing_Spend, : number of rows of result is not a multiple of
## vector length (arg 1)
View(Final)
# Evaluate model LINE assumptions
plot(FinalModel)# Residual Plots, QQ-Plos, Std. Residuals vs Fitted, Cook's distance




qqPlot(FinalModel, id.n=5) # QQ plots of studentized residuals, helps identify outliers

## [1] 15 16
library("MASS")
stepAIC(FinalModel) # backward
## Start: AIC=860.63
## Profit ~ RD_Spend + log(Administration) + Marketing_Spend + log(State)
##
## Df Sum of Sq RSS AIC
## - log(State) 1 4.7075e+06 2.3901e+09 858.72
## - log(Administration) 1 9.4695e+07 2.4801e+09 860.50
## - Marketing_Spend 1 9.9293e+07 2.4847e+09 860.59
## <none> 2.3854e+09 860.63
## - RD_Spend 1 2.5529e+10 2.7915e+10 976.70
##
## Step: AIC=858.72
## Profit ~ RD_Spend + log(Administration) + Marketing_Spend
##
## Df Sum of Sq RSS AIC
## - Marketing_Spend 1 9.6666e+07 2.4868e+09 858.63
## - log(Administration) 1 9.8854e+07 2.4889e+09 858.67
## <none> 2.3901e+09 858.72
## - RD_Spend 1 2.5526e+10 2.7916e+10 974.70
##
## Step: AIC=858.63
## Profit ~ RD_Spend + log(Administration)
##
## Df Sum of Sq RSS AIC
## <none> 2.4868e+09 858.63
## - log(Administration) 1 2.0174e+08 2.6885e+09 860.37
## - RD_Spend 1 6.0063e+10 6.2549e+10 1011.43
##
## Call:
## lm(formula = Profit ~ RD_Spend + log(Administration), data = Startups[-c(49,
## 50), ])
##
## Coefficients:
## (Intercept) RD_Spend log(Administration)
## 1.539e+05 8.233e-01 -8.680e+03
# Lower the AIC value better is the model. AIC is used only if you build multiple models.