# Prepare a prediction model for profit of 50_startups data

# Load the 50_Startups data; the CSV file is chosen interactively.
Startups <- read.csv(file.choose())
View(Startups)
class(Startups)
## [1] "data.frame"
# Fail early if the chosen file does not have the expected columns.
stopifnot(all(
  c("R.D.Spend", "Administration", "Marketing.Spend", "State", "Profit") %in%
    names(Startups)
))

# The raw frame is attached only because code later in the script may still
# refer to the original column names (R.D.Spend, Marketing.Spend) through the
# search path.
attach(Startups)

# To transform State from character to numeric
library(plyr)
## Warning: package 'plyr' was built under R version 3.5.1
# Explicit numeric codes for State. The previous cbind() of numeric vectors
# with State either fell back to the factor's integer codes (R < 4.0, silently
# discarding the "0"/"1"/"2" recode) or produced an all-character matrix
# (R >= 4.0). Codes start at 1, matching the State range 1..3 shown in the
# recorded summary() output and keeping log(State) finite in later models.
state_codes <- c("California" = "1", "Florida" = "2", "New York" = "3")

# Build the modelling frame directly as a data.frame so every column keeps
# its numeric type (no intermediate matrix coercion).
Startups <- data.frame(
  RD_Spend = Startups$R.D.Spend,
  Administration = Startups$Administration,
  Marketing_Spend = Startups$Marketing.Spend,
  State = as.numeric(revalue(as.character(Startups$State), state_codes)),
  Profit = Startups$Profit
)

attach(Startups) # Basically to avoid reference of Data Set name(Startups) in this report.
## The following objects are masked from Startups (pos = 3):
## 
##     Administration, Profit, State
#Corolla_Pred <- cbind(Price,Age_08_04,KM,HP,cc,Doors,Gears,Quarterly_Tax,Weight)
#Corolla_Pred1 <- as.data.frame(Corolla_Pred)
#class(Corolla_Pred1)

# Exploratory data analysis (outline of the intended steps):
# 1. Measures of central tendency
# 2. Measures of dispersion
# 3. Third moment business decision (skewness)
# 4. Fourth moment business decision (kurtosis)
# 5. Probability distributions of variables
# 6. Graphical representations (Histogram, Box plot, Dot plot, Stem & Leaf plot, Bar plot, etc.)

# Minimum, quartiles, median, mean and maximum for every column of Startups.
summary(Startups)
##     RD_Spend      Administration   Marketing_Spend      State  
##  Min.   :     0   Min.   : 51283   Min.   :     0   Min.   :1  
##  1st Qu.: 39936   1st Qu.:103731   1st Qu.:129300   1st Qu.:1  
##  Median : 73051   Median :122700   Median :212716   Median :2  
##  Mean   : 73722   Mean   :121345   Mean   :211025   Mean   :2  
##  3rd Qu.:101603   3rd Qu.:144842   3rd Qu.:299469   3rd Qu.:3  
##  Max.   :165349   Max.   :182646   Max.   :471784   Max.   :3  
##      Profit      
##  Min.   : 14681  
##  1st Qu.: 90139  
##  Median :107978  
##  Mean   :112013  
##  3rd Qu.:139766  
##  Max.   :192262
# Scatter plot of Profit against each predictor. The formula interface with
# data = Startups reads the columns of the prepared frame directly instead of
# depending on names left on the search path by attach().
plot(Profit ~ RD_Spend, data = Startups)

plot(Profit ~ Administration, data = Startups)

plot(Profit ~ Marketing_Spend, data = Startups)

plot(Profit ~ State, data = Startups)

# Open a fresh graphics device; dev.new() is the platform-independent
# counterpart of windows().
dev.new()
# 7. Find the correlation between Output (Profit) & inputs (R.D Spend, Administration, Marketing, State) - SCATTER DIAGRAM
pairs(Startups)

# 8. Correlation coefficient - Strength & Direction of correlation
cor(Startups)
##                  RD_Spend Administration Marketing_Spend      State
## RD_Spend        1.0000000     0.24195525      0.72424813 0.10468511
## Administration  0.2419552     1.00000000     -0.03215388 0.01184720
## Marketing_Spend 0.7242481    -0.03215388      1.00000000 0.07766961
## State           0.1046851     0.01184720      0.07766961 1.00000000
## Profit          0.9729005     0.20071657      0.74776572 0.10179631
##                    Profit
## RD_Spend        0.9729005
## Administration  0.2007166
## Marketing_Spend 0.7477657
## State           0.1017963
## Profit          1.0000000
# The linear model of interest: Profit on all four predictors.
# data = Startups is passed explicitly so the fit does not depend on which
# objects attach() has placed on the search path (consistent with the later
# models, which all pass data =).
Model.Startups <- lm(
  Profit ~ RD_Spend + Administration + Marketing_Spend + State,
  data = Startups
)
summary(Model.Startups)
## 
## Call:
## lm(formula = Profit ~ RD_Spend + Administration + Marketing_Spend + 
##     State)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33553  -4779     63   6595  17301 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.016e+04  7.322e+03   6.851 1.69e-08 ***
## RD_Spend         8.058e-01  4.576e-02  17.609  < 2e-16 ***
## Administration  -2.683e-02  5.160e-02  -0.520    0.606    
## Marketing_Spend  2.723e-02  1.663e-02   1.637    0.109    
## State           -2.232e+01  1.610e+03  -0.014    0.989    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9334 on 45 degrees of freedom
## Multiple R-squared:  0.9507, Adjusted R-squared:  0.9464 
## F-statistic: 217.2 on 4 and 45 DF,  p-value: < 2.2e-16
# Reduced model: Profit on RD_Spend and log(Administration).
# data = Startups is passed explicitly rather than relying on attach().
Model.Startups1 <- lm(Profit ~ RD_Spend + log(Administration), data = Startups)
summary(Model.Startups1)
## 
## Call:
## lm(formula = Profit ~ RD_Spend + log(Administration))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33851  -4928   -180   6385  17863 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.021e+05  6.087e+04   1.677    0.100    
## RD_Spend             8.614e-01  3.049e-02  28.250   <2e-16 ***
## log(Administration) -4.589e+03  5.260e+03  -0.872    0.387    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9440 on 47 degrees of freedom
## Multiple R-squared:  0.9474, Adjusted R-squared:  0.9451 
## F-statistic: 423.2 on 2 and 47 DF,  p-value: < 2.2e-16
# Model.Startups2 <- lm(Profit~RD_Spend+Administration+Marketing_Spend+log
# summary(Model.Startups2)


### Scatter plot matrix with Correlations inserted in graph
# Panel function for pairs(): writes the correlation of x and y in the
# centre of the panel.
#
# Arguments:
#   x, y     numeric vectors supplied by pairs() for the current panel.
#   digits   number of significant digits used to format the coefficient.
#   prefix   text pasted in front of the formatted coefficient.
#   cex.cor  optional character expansion for the label; when missing, the
#            size is chosen from the label's width (0.4 / strwidth).
#   ...      further arguments forwarded by pairs(); accepted and ignored.
#
# Called for its side effect (drawing text on the current device).
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
  # Switch the panel to unit coordinates so (0.5, 0.5) is its centre.
  # par() returns the previous setting as a named list, which is what
  # on.exit() needs to restore it; passing the saved vector back unnamed
  # (as before) is not treated by par() as a setting.
  old_par <- par(usr = c(0, 1, 0, 1))
  on.exit(par(old_par), add = TRUE)

  r <- cor(x, y)
  # Formatting alongside 0.123456789 forces a consistent number of decimals.
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)

  # Honour a caller-supplied cex.cor; previously `cex` was left undefined in
  # that case and text() failed.
  cex <- if (missing(cex.cor)) 0.4 / strwidth(txt) else cex.cor
  text(0.5, 0.5, txt, cex = cex)
}
# Scatter plot matrix; the upper triangle shows the correlation coefficients.
pairs(
  Startups,
  upper.panel = panel.cor,
  main = "Scatter Plot Matrix with Correlation Coefficients"
)

### Partial correlation matrix, derived from the ordinary correlation matrix
# install.packages("corpcor")
library(corpcor)
cor2pcor(
  cor(Startups)
)
##            [,1]        [,2]         [,3]         [,4]         [,5]
## [1,] 1.00000000  0.20880590  0.038920914  0.026971505  0.934484951
## [2,] 0.20880590  1.00000000 -0.281913894 -0.013933444 -0.077271343
## [3,] 0.03892091 -0.28191389  1.000000000 -0.001176456  0.237068057
## [4,] 0.02697151 -0.01393344 -0.001176456  1.000000000 -0.002066896
## [5,] 0.93448495 -0.07727134  0.237068057 -0.002066896  1.000000000
# install.packages("mvinfluence")
library(mvinfluence)
## Warning: package 'mvinfluence' was built under R version 3.5.1
## Loading required package: car
## Warning: package 'car' was built under R version 3.5.1
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.5.1
## Loading required package: heplots
## Warning: package 'heplots' was built under R version 3.5.1
## 
## Attaching package: 'mvinfluence'
## The following object is masked from 'package:corpcor':
## 
##     mpower
library(car)

# When addressing collinearity, prefer dropping a single influential
# observation over dropping an entire variable.
# Deletion diagnostics for identifying influential observations: one row per
# observation with DFBETAS per coefficient, DFFITS, covariance ratio, Cook's
# distance and hat value; rows flagged as influential carry '*' in `inf`.
influence.measures(Model.Startups)
## Influence measures of
##   lm(formula = Profit ~ RD_Spend + Administration + Marketing_Spend +      State) :
## 
##       dfb.1_  dfb.RD_S  dfb.Admn dfb.Mr_S  dfb.Stat    dffit cov.r
## 1   0.005573 -0.001614 -1.78e-03 -0.00458 -0.004266 -0.01138 1.309
## 2  -0.030474  0.023633  4.12e-02  0.04566 -0.066200  0.13381 1.324
## 3   0.086304  0.181884 -1.64e-01  0.02355 -0.030499  0.35755 1.122
## 4  -0.072998  0.119221 -5.54e-02  0.04566  0.162555  0.34081 1.097
## 5  -0.100209 -0.136061  1.49e-01  0.01950  0.018111 -0.23760 1.196
## 6  -0.014497 -0.089593  1.11e-01 -0.01501 -0.117810 -0.24762 1.170
## 7  -0.031273 -0.099257  5.50e-03  0.08894  0.051794 -0.12341 1.405
## 8   0.045172 -0.035906 -4.31e-02 -0.01623  0.008350 -0.11748 1.160
## 9  -0.010434  0.001367  6.84e-03  0.00303  0.008975  0.01608 1.217
## 10 -0.092215 -0.094245  6.88e-02  0.02277  0.114783 -0.18347 1.188
## 11  0.116853  0.162921 -1.22e-01 -0.10745 -0.012494  0.24442 1.003
## 12  0.247775  0.159631 -2.01e-01 -0.08378 -0.189193  0.32287 1.115
## 13 -0.005299  0.046825  2.27e-02  0.00696 -0.008788  0.21480 0.933
## 14  0.020469  0.012107  4.96e-02  0.02364 -0.136758  0.18763 1.117
## 15  0.219338 -0.181121 -2.65e-01  0.05847  0.026192 -0.50523 0.795
## 16  0.070447 -0.224544  7.58e-02  0.11071 -0.304261 -0.49002 0.822
## 17  0.073736 -0.044565  2.33e-02  0.09081 -0.198951  0.27267 1.033
## 18  0.099087  0.030794 -7.35e-02 -0.05013 -0.091981 -0.14780 1.168
## 19 -0.011430  0.004661  1.42e-02 -0.03678  0.003387 -0.09144 1.121
## 20 -0.028144  0.256904  3.85e-02 -0.35645  0.164789  0.44558 1.304
## 21  0.015148 -0.015929 -1.77e-05  0.02760 -0.035848  0.05457 1.198
## 22  0.182630  0.127895 -1.60e-01 -0.14191 -0.125242 -0.24650 1.203
## 23  0.022466  0.060978 -2.84e-02 -0.08057 -0.000145 -0.10593 1.145
## 24 -0.003225  0.017368  3.21e-03 -0.02305 -0.000174 -0.03250 1.179
## 25 -0.061116 -0.081784  9.25e-02  0.09845 -0.095141 -0.17189 1.195
## 26  0.031802  0.015801  4.38e-02 -0.03848 -0.098271  0.14774 1.153
## 27  0.012297 -0.033948 -4.36e-02  0.05634 -0.000515 -0.11475 1.135
## 28  0.206794  0.258315 -1.38e-01 -0.31127 -0.199993 -0.40597 1.123
## 29 -0.039607 -0.008569  5.97e-02 -0.00635  0.001391  0.07104 1.280
## 30  0.011586 -0.001798 -1.21e-02  0.00901 -0.015883 -0.02718 1.231
## 31  0.006628  0.006285 -4.11e-03 -0.00980  0.000176  0.01267 1.183
## 32  0.002556 -0.000516 -2.80e-03  0.00252 -0.003805 -0.00666 1.241
## 33 -0.031476 -0.031588  5.80e-03  0.04613  0.032092 -0.06325 1.254
## 34 -0.010415  0.006871  8.04e-03 -0.00527 -0.000609 -0.02214 1.154
## 35 -0.097131 -0.185557  2.33e-01  0.15161 -0.149674  0.32934 1.173
## 36  0.062079 -0.044366 -1.04e-01  0.02142  0.125511  0.20905 1.168
## 37 -0.093069 -0.399005  2.02e-01  0.28778  0.028946  0.47782 0.858
## 38  0.015553  0.001371 -1.35e-02 -0.00142 -0.006644  0.01668 1.359
## 39  0.231713 -0.178833 -3.18e-01  0.08353  0.273642  0.56424 1.071
## 40 -0.081516  0.009679  5.37e-02  0.00013  0.052197 -0.09534 1.215
## 41  0.031393 -0.058872  1.79e-02  0.03617 -0.061301  0.10559 1.188
## 42  0.057389 -0.028676 -4.82e-02  0.00799  0.004377  0.08723 1.183
## 43  0.019182 -0.009398 -8.06e-03  0.00288 -0.015514  0.02765 1.213
## 44 -0.028180 -0.086212  5.60e-02 -0.10035  0.221626  0.37035 1.092
## 45  0.000536 -0.004571  1.23e-02 -0.00537 -0.010484  0.02395 1.273
## 46 -0.009334 -0.206043  8.68e-02 -0.21717  0.432166  0.77523 0.810
## 47  0.096246  0.430539 -1.44e-01 -0.38598 -0.023283 -0.46097 1.368
## 48 -0.042206  0.045654 -4.66e-02  0.03804  0.069689 -0.16308 1.240
## 49 -0.652061 -0.119392  7.14e-01  0.44462 -0.379188 -0.99824 1.090
## 50 -0.674607  0.559442 -1.00e-01  0.14833  0.704506 -1.48097 0.183
##      cook.d    hat inf
## 1  2.65e-05 0.1453    
## 2  3.65e-03 0.1636    
## 3  2.56e-02 0.1111    
## 4  2.32e-02 0.0968    
## 5  1.14e-02 0.1110    
## 6  1.24e-02 0.1010    
## 7  3.11e-03 0.2087   *
## 8  2.81e-03 0.0594    
## 9  5.29e-05 0.0808    
## 10 6.83e-03 0.0925    
## 11 1.19e-02 0.0421    
## 12 2.09e-02 0.0985    
## 13 9.06e-03 0.0242    
## 14 7.11e-03 0.0593    
## 15 4.81e-02 0.0640    
## 16 4.56e-02 0.0653    
## 17 1.48e-02 0.0567    
## 18 4.44e-03 0.0723    
## 19 1.70e-03 0.0309    
## 20 3.99e-02 0.2110    
## 21 6.09e-04 0.0699    
## 22 1.23e-02 0.1167    
## 23 2.28e-03 0.0476    
## 24 2.16e-04 0.0529    
## 25 6.00e-03 0.0937    
## 26 4.43e-03 0.0639    
## 27 2.68e-03 0.0449    
## 28 3.28e-02 0.1247    
## 29 1.03e-03 0.1293    
## 30 1.51e-04 0.0921    
## 31 3.28e-05 0.0542    
## 32 9.08e-06 0.0981    
## 33 8.18e-04 0.1107    
## 34 1.00e-04 0.0318    
## 35 2.18e-02 0.1244    
## 36 8.85e-03 0.0891    
## 37 4.37e-02 0.0693    
## 38 5.69e-05 0.1770   *
## 39 6.25e-02 0.1480    
## 40 1.86e-03 0.0891    
## 41 2.27e-03 0.0734    
## 42 1.55e-03 0.0657    
## 43 1.56e-04 0.0786    
## 44 2.73e-02 0.1034    
## 45 1.17e-04 0.1217    
## 46 1.12e-01 0.1250    
## 47 4.28e-02 0.2412   *
## 48 5.42e-03 0.1177    
## 49 1.91e-01 0.2548    
## 50 3.06e-01 0.0966   *
# Index plots of the influence measures, labelling the 3 most extreme points.
# Point labelling is requested through id = list(n = ...); the old id.n
# argument is not recognised by the installed car version and was passed on
# to the base graphics calls, producing the "not a graphical parameter"
# warnings.
influenceIndexPlot(Model.Startups, id = list(n = 3))
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter

# A user friendly representation of the above (studentized residuals, hat
# values and Cook's distance). Labels are requested with id = list(n = 3);
# the unsupported id.n argument only generated graphical-parameter warnings.
influencePlot(Model.Startups, id = list(n = 3))
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter

##       StudRes        Hat      CookD
## 46  2.0508431 0.12502380 0.11220265
## 47 -0.8175683 0.24121705 0.04281347
## 49 -1.7072322 0.25478210 0.19116349
## 50 -4.5278887 0.09664062 0.30602820
# infIndexPlot.mlm()

## Regression after deleting observations 49 and 50, which are influential

# Logarithmic transformation of Administration and State
Model.Startups_Log <- lm(
  Profit ~ RD_Spend + log(Administration) + Marketing_Spend + log(State),
  data = Startups[-c(49, 50), ]
)

summary(Model.Startups_Log) # Adjusted R2 value = 0.9591
## 
## Call:
## lm(formula = Profit ~ RD_Spend + log(Administration) + Marketing_Spend + 
##     log(State), data = Startups[-c(49, 50), ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15952  -5122  -1868   5748  13769 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.259e+05  5.682e+04   2.215   0.0321 *  
## RD_Spend             7.881e-01  3.674e-02  21.452   <2e-16 ***
## log(Administration) -6.357e+03  4.865e+03  -1.307   0.1983    
## Marketing_Spend      1.819e-02  1.360e-02   1.338   0.1880    
## log(State)          -6.988e+02  2.399e+03  -0.291   0.7722    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7448 on 43 degrees of freedom
## Multiple R-squared:  0.9625, Adjusted R-squared:  0.9591 
## F-statistic: 276.2 on 4 and 43 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the coefficients of the log model.
confint(Model.Startups_Log, level = 0.95)
##                             2.5 %       97.5 %
## (Intercept)          1.129255e+04 2.404707e+05
## RD_Spend             7.139937e-01 8.621648e-01
## log(Administration) -1.616879e+04 3.455246e+03
## Marketing_Spend     -9.231786e-03 4.562130e-02
## log(State)          -5.536268e+03 4.138733e+03
# Fitted values with prediction intervals for the rows used in the fit.
# The interval type is spelled out in full instead of relying on partial
# matching of "predict" to "prediction".
predict(Model.Startups_Log, interval = "prediction")
## Warning in predict.lm(Model.Startups_Log, interval = "predict"): predictions on current data refer to _future_ responses
##          fit       lwr       upr
## 1  188824.76 172728.64 204920.88
## 2  186277.54 169881.77 202673.31
## 3  180486.18 164669.44 196302.93
## 4  171589.82 155883.61 187296.03
## 5  171438.65 155605.14 187272.15
## 6  162472.34 146739.04 178205.63
## 7  158650.41 142115.60 175185.22
## 8  158405.22 142933.74 173876.70
## 9  150072.78 134507.87 165637.70
## 10 154914.25 139197.89 170630.61
## 11 136057.26 120732.95 151381.56
## 12 137122.64 121364.05 152881.24
## 13 129194.70 113985.60 144403.80
## 14 127860.10 112315.70 143404.50
## 15 148555.19 133087.47 164022.90
## 16 145649.45 130171.52 161127.39
## 17 117743.78 102225.20 133262.35
## 18 129302.29 113806.00 144798.59
## 19 129041.14 113792.02 144290.26
## 20 117309.84 100741.17 133878.51
## 21 117399.30 101786.80 133011.79
## 22 116424.30 100656.98 132191.62
## 23 114739.94  99346.07 130133.81
## 24 110622.97  95205.25 126040.69
## 25 115249.28  99485.69 131012.87
## 26 104049.15  88489.37 119608.94
## 27 111692.42  96325.94 127058.91
## 28 113619.02  97729.66 129508.37
## 29 102586.58  86865.28 118307.89
## 30 102875.93  87225.15 118526.72
## 31 101803.14  86331.89 117274.39
## 32  99023.41  83320.49 114726.33
## 33 101876.80  85998.34 117755.25
## 34  99659.66  84380.22 114939.10
## 35  90224.01  74242.48 106205.55
## 36  92960.51  77199.73 108721.28
## 37  76938.83  61359.58  92518.07
## 38  95257.22  78055.81 112458.62
## 39  73888.65  57403.74  90373.56
## 40  87453.65  71675.36 103231.95
## 41  77419.78  61760.70  93078.85
## 42  78241.35  62610.99  93871.70
## 43  74267.32  58592.72  89941.92
## 44  63256.72  47409.22  79104.22
## 45  67912.02  51948.94  83875.11
## 46  51376.57  35340.75  67412.38
## 47  57721.47  40924.30  74518.63
## 48  50768.83  34762.86  66774.80
# Untransformed linear model refitted without observations 49 and 50
Model.Startups_Fin1 <- lm(
  Profit ~ RD_Spend + Administration + Marketing_Spend + State,
  data = Startups[-c(49, 50), ]
)
summary(Model.Startups_Fin1) # Adjusted R2 value is 0.9593
## 
## Call:
## lm(formula = Profit ~ RD_Spend + Administration + Marketing_Spend + 
##     State, data = Startups[-c(49, 50), ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15944  -4787  -1904   6066  13646 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.960e+04  6.314e+03   9.438 4.84e-12 ***
## RD_Spend         7.898e-01  3.677e-02  21.480  < 2e-16 ***
## Administration  -6.257e-02  4.451e-02  -1.406    0.167    
## Marketing_Spend  1.705e-02  1.369e-02   1.245    0.220    
## State           -3.267e+02  1.326e+03  -0.246    0.806    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7429 on 43 degrees of freedom
## Multiple R-squared:  0.9627, Adjusted R-squared:  0.9593 
## F-statistic: 277.7 on 4 and 43 DF,  p-value: < 2.2e-16
# Exponential model: log(Profit) regressed on the untransformed predictors
Model.Startups_exp <- lm(
  log(Profit) ~ RD_Spend + Administration + Marketing_Spend + State,
  data = Startups[-c(49, 50), ]
)
summary(Model.Startups_exp) # Adjusted R2 value is 0.9182
## 
## Call:
## lm(formula = log(Profit) ~ RD_Spend + Administration + Marketing_Spend + 
##     State, data = Startups[-c(49, 50), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36095 -0.03876  0.00946  0.05837  0.17095 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.111e+01  8.242e-02 134.844   <2e-16 ***
## RD_Spend         7.431e-06  4.800e-07  15.482   <2e-16 ***
## Administration  -7.457e-07  5.810e-07  -1.284    0.206    
## Marketing_Spend -5.233e-09  1.787e-07  -0.029    0.977    
## State            6.658e-03  1.730e-02   0.385    0.702    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09697 on 43 degrees of freedom
## Multiple R-squared:  0.9252, Adjusted R-squared:  0.9182 
## F-statistic: 132.9 on 4 and 43 DF,  p-value: < 2.2e-16
# Exponential model restricted to RD_Spend and Marketing_Spend
Model.Startups_exp1 <- lm(
  log(Profit) ~ RD_Spend + Marketing_Spend,
  data = Startups[-c(49, 50), ]
)
summary(Model.Startups_exp1)
## 
## Call:
## lm(formula = log(Profit) ~ RD_Spend + Marketing_Spend, data = Startups[-c(49, 
##     50), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.37520 -0.03278  0.00667  0.05383  0.15858 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.103e+01  3.089e-02 357.165   <2e-16 ***
## RD_Spend        7.207e-06  4.419e-07  16.308   <2e-16 ***
## Marketing_Spend 8.151e-08  1.653e-07   0.493    0.624    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09669 on 45 degrees of freedom
## Multiple R-squared:  0.9221, Adjusted R-squared:  0.9187 
## F-statistic: 266.5 on 2 and 45 DF,  p-value: < 2.2e-16
# Quadratic model: each predictor enters with a linear and a squared term
Model.Startups_Quad <- lm(
  Profit ~ RD_Spend + I(RD_Spend^2) +
    Administration + I(Administration^2) +
    Marketing_Spend + I(Marketing_Spend^2) +
    State + I(State^2),
  data = Startups[-c(49, 50), ]
)
summary(Model.Startups_Quad) # Adjusted R2 value is 0.9567
## 
## Call:
## lm(formula = Profit ~ RD_Spend + I(RD_Spend^2) + Administration + 
##     I(Administration^2) + Marketing_Spend + I(Marketing_Spend^2) + 
##     State + I(State^2), data = Startups[-c(49, 50), ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15181  -4247  -1080   4490  13994 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.142e+04  1.845e+04   3.329  0.00191 ** 
## RD_Spend              7.385e-01  1.203e-01   6.137 3.34e-07 ***
## I(RD_Spend^2)         2.987e-07  8.110e-07   0.368  0.71465    
## Administration       -6.758e-03  2.896e-01  -0.023  0.98150    
## I(Administration^2)  -2.573e-07  1.207e-06  -0.213  0.83228    
## Marketing_Spend      -3.638e-03  4.592e-02  -0.079  0.93727    
## I(Marketing_Spend^2)  4.791e-08  1.150e-07   0.417  0.67928    
## State                -1.839e+03  9.910e+03  -0.186  0.85376    
## I(State^2)            3.407e+02  2.465e+03   0.138  0.89078    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7659 on 39 degrees of freedom
## Multiple R-squared:  0.9641, Adjusted R-squared:  0.9567 
## F-statistic: 130.8 on 8 and 39 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the coefficients of the quadratic model.
confint(Model.Startups_Quad, level = 0.95)
##                              2.5 %       97.5 %
## (Intercept)           2.409786e+04 9.874414e+04
## RD_Spend              4.950840e-01 9.819009e-01
## I(RD_Spend^2)        -1.341640e-06 1.938955e-06
## Administration       -5.925914e-01 5.790761e-01
## I(Administration^2)  -2.697887e-06 2.183367e-06
## Marketing_Spend      -9.652462e-02 8.924949e-02
## I(Marketing_Spend^2) -1.847371e-07 2.805648e-07
## State                -2.188322e+04 1.820574e+04
## I(State^2)           -4.645264e+03 5.326694e+03
# Fitted values with prediction intervals for the rows used in the fit.
# The interval type is spelled out in full instead of relying on partial
# matching of "predict" to "prediction".
predict(Model.Startups_Quad, interval = "prediction")
## Warning in predict.lm(Model.Startups_Quad, interval = "predict"): predictions on current data refer to _future_ responses
##          fit       lwr       upr
## 1  192447.86 174455.25 210440.46
## 2  188804.45 170862.94 206745.97
## 3  182627.55 165522.80 199732.30
## 4  173031.09 156492.48 189569.70
## 5  172408.76 155615.81 189201.70
## 6  163306.63 146883.26 179730.00
## 7  158495.61 140270.54 176720.68
## 8  157816.95 141439.50 174194.40
## 9  149154.75 132673.44 165636.07
## 10 155122.18 138708.95 171535.41
## 11 135259.03 118916.22 151601.84
## 12 136587.50 120147.22 153027.78
## 13 128106.61 111970.31 144242.90
## 14 126887.15 110594.51 143179.78
## 15 146837.16 130025.83 163648.49
## 16 145097.80 128449.19 161746.40
## 17 117113.75 100747.81 133479.68
## 18 127953.66 111579.02 144328.30
## 19 128345.92 112096.27 144595.57
## 20 117921.85  99983.99 135859.72
## 21 117054.94 100427.38 133682.50
## 22 114788.46  97980.35 131596.57
## 23 113982.79  97483.87 130481.71
## 24 110090.88  93473.27 126708.50
## 25 114869.22  98282.50 131455.94
## 26 103383.13  87185.32 119580.94
## 27 110485.62  94196.54 126774.69
## 28 113396.89  96028.08 130765.70
## 29  99610.67  80790.74 118430.60
## 30 101807.21  85453.17 118161.24
## 31 101881.31  85202.19 118560.44
## 32  98257.57  81910.25 114604.89
## 33 102715.99  85708.40 119723.58
## 34  99005.79  82833.46 115178.11
## 35  88751.27  71877.65 105624.88
## 36  92425.24  75930.98 108919.51
## 37  76714.71  60175.55  93253.86
## 38  93168.54  73929.13 112407.95
## 39  73438.96  55704.24  91173.69
## 40  87340.69  70985.54 103695.84
## 41  77790.43  61311.04  94269.81
## 42  78216.75  61699.09  94734.42
## 43  75029.40  58637.46  91421.34
## 44  65389.95  48636.44  82143.46
## 45  69172.01  52328.07  86015.95
## 46  54899.04  37402.77  72395.31
## 47  58993.89  40893.14  77094.65
## 48  54289.54  36802.47  71776.61
# Quadratic model restricted to RD_Spend and Marketing_Spend
Model.Startups_Quad1 <- lm(
  Profit ~ RD_Spend + I(RD_Spend^2) + Marketing_Spend + I(Marketing_Spend^2),
  data = Startups[-c(49, 50), ]
)
summary(Model.Startups_Quad1) # Adjusted R2 value is 0.9585
## 
## Call:
## lm(formula = Profit ~ RD_Spend + I(RD_Spend^2) + Marketing_Spend + 
##     I(Marketing_Spend^2), data = Startups[-c(49, 50), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16990.4  -4031.9   -749.6   5261.9  13097.6 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          5.417e+04  3.628e+03  14.928  < 2e-16 ***
## RD_Spend             6.834e-01  1.104e-01   6.192 1.92e-07 ***
## I(RD_Spend^2)        5.628e-07  7.613e-07   0.739    0.464    
## Marketing_Spend      2.002e-02  3.934e-02   0.509    0.613    
## I(Marketing_Spend^2) 3.527e-09  1.022e-07   0.035    0.973    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7500 on 43 degrees of freedom
## Multiple R-squared:  0.962,  Adjusted R-squared:  0.9585 
## F-statistic: 272.2 on 4 and 43 DF,  p-value: < 2.2e-16
# Polynomial model: cubic in RD_Spend, Administration and Marketing_Spend.
# State only takes three distinct values, so I(State^3) is an exact linear
# combination of the intercept, State and I(State^2); lm() reported that
# coefficient as NA ("1 not defined because of singularities"). The redundant
# term is left out, which does not change the fitted values.
Model.Startups_Poly <- lm(
  Profit ~ RD_Spend + I(RD_Spend^2) + I(RD_Spend^3) +
    Administration + I(Administration^2) + I(Administration^3) +
    Marketing_Spend + I(Marketing_Spend^2) + I(Marketing_Spend^3) +
    State + I(State^2),
  data = Startups[-c(49, 50), ]
)
summary(Model.Startups_Poly) # Adjusted R Square Value is 0.9569
## 
## Call:
## lm(formula = Profit ~ RD_Spend + I(RD_Spend^2) + I(RD_Spend^3) + 
##     Administration + I(Administration^2) + I(Administration^3) + 
##     Marketing_Spend + I(Marketing_Spend^2) + I(Marketing_Spend^3) + 
##     State + I(State^2) + I(State^3), data = Startups[-c(49, 50), 
##     ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -14811  -4320  -1841   4371  14941 
## 
## Coefficients: (1 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.338e+04  4.270e+04   0.782 0.439461    
## RD_Spend              1.321e+00  3.667e-01   3.603 0.000942 ***
## I(RD_Spend^2)        -1.007e-05  6.258e-06  -1.610 0.116160    
## I(RD_Spend^3)         4.834e-11  2.915e-11   1.658 0.105936    
## Administration        5.050e-01  1.199e+00   0.421 0.676221    
## I(Administration^2)  -3.802e-06  1.059e-05  -0.359 0.721689    
## I(Administration^3)   7.862e-12  3.022e-11   0.260 0.796195    
## Marketing_Spend      -9.678e-02  1.097e-01  -0.883 0.383326    
## I(Marketing_Spend^2)  7.294e-07  6.585e-07   1.108 0.275351    
## I(Marketing_Spend^3) -1.304e-12  1.129e-12  -1.155 0.255534    
## State                -2.194e+03  1.036e+04  -0.212 0.833431    
## I(State^2)            6.219e+02  2.574e+03   0.242 0.810420    
## I(State^3)                   NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7641 on 36 degrees of freedom
## Multiple R-squared:  0.967,  Adjusted R-squared:  0.9569 
## F-statistic: 95.87 on 11 and 36 DF,  p-value: < 2.2e-16
# Cubic polynomial model restricted to RD_Spend and Marketing_Spend
Model.Startups_Poly1 <- lm(
  Profit ~ RD_Spend + I(RD_Spend^2) + I(RD_Spend^3) +
    Marketing_Spend + I(Marketing_Spend^2) + I(Marketing_Spend^3),
  data = Startups[-c(49, 50), ]
)
summary(Model.Startups_Poly1) # Adjusted R Square Value is 0.9601
## 
## Call:
## lm(formula = Profit ~ RD_Spend + I(RD_Spend^2) + I(RD_Spend^3) + 
##     Marketing_Spend + I(Marketing_Spend^2) + I(Marketing_Spend^3), 
##     data = Startups[-c(49, 50), ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15952  -4066  -1781   3793  14807 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.069e+04  4.254e+03  11.915 6.75e-15 ***
## RD_Spend              1.270e+00  3.308e-01   3.839 0.000419 ***
## I(RD_Spend^2)        -9.570e-06  5.490e-06  -1.743 0.088806 .  
## I(RD_Spend^3)         4.654e-11  2.521e-11   1.846 0.072084 .  
## Marketing_Spend      -9.277e-02  1.044e-01  -0.889 0.379383    
## I(Marketing_Spend^2)  7.500e-07  6.165e-07   1.217 0.230722    
## I(Marketing_Spend^3) -1.343e-12  1.032e-12  -1.302 0.200249    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7349 on 41 degrees of freedom
## Multiple R-squared:  0.9652, Adjusted R-squared:  0.9601 
## F-statistic: 189.7 on 6 and 41 DF,  p-value: < 2.2e-16
### Variance Inflation Factors are a formal way to check for collinearity
# Rule of thumb used here: a VIF above 10 would indicate collinearity.
# All values reported for this model are below 3.
vif(Model.Startups_Log)
##            RD_Spend log(Administration)     Marketing_Spend 
##            2.238234            1.186503            2.193121 
##          log(State) 
##            1.024393
# Added-variable plots, labelling the 2 most extreme points per panel at
# 0.7 label size. Labelling options go in the id list; the old id.n / id.cex
# arguments are not recognised by the installed car version and only
# triggered "not a graphical parameter" warnings.
avPlots(Model.Startups_Log, id = list(n = 2, cex = 0.7))
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.window(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.cex" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in box(...): "id.cex" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.cex" is not a
## graphical parameter
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.window(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.cex" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in box(...): "id.cex" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.cex" is not a
## graphical parameter
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.window(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.cex" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in box(...): "id.cex" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.cex" is not a
## graphical parameter
## Warning in plot.window(...): "id.n" is not a graphical parameter
## Warning in plot.window(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.n" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.cex" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.n" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.cex" is
## not a graphical parameter
## Warning in box(...): "id.n" is not a graphical parameter
## Warning in box(...): "id.cex" is not a graphical parameter
## Warning in title(...): "id.n" is not a graphical parameter
## Warning in title(...): "id.cex" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.n" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.cex" is not a
## graphical parameter

# Final model: Profit regressed on R&D spend, log(Administration), marketing
# spend and log(State), fitted with observations 49 and 50 left out.
# NOTE(review): State holds recoded category labels (see the revalue() step
# near the top of the script), so log(State) treats those codes as a numeric
# scale - confirm this is intended.
FinalModel <- lm(
  Profit ~ RD_Spend + log(Administration) + Marketing_Spend + log(State),
  data = Startups[-c(49, 50), ]
)

summary(FinalModel) # Adjusted R-squared = 0.9591
## 
## Call:
## lm(formula = Profit ~ RD_Spend + log(Administration) + Marketing_Spend + 
##     log(State), data = Startups[-c(49, 50), ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15952  -5122  -1868   5748  13769 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.259e+05  5.682e+04   2.215   0.0321 *  
## RD_Spend             7.881e-01  3.674e-02  21.452   <2e-16 ***
## log(Administration) -6.357e+03  4.865e+03  -1.307   0.1983    
## Marketing_Spend      1.819e-02  1.360e-02   1.338   0.1880    
## log(State)          -6.988e+02  2.399e+03  -0.291   0.7722    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7448 on 43 degrees of freedom
## Multiple R-squared:  0.9625, Adjusted R-squared:  0.9591 
## F-statistic: 276.2 on 4 and 43 DF,  p-value: < 2.2e-16
# Prediction intervals for the observations the model was fitted on
# (matrix with columns fit, lwr, upr; one row per fitted observation).
Profit_Predict <- predict(FinalModel, interval = "predict")
## Warning in predict.lm(FinalModel, interval = "predict"): predictions on current data refer to _future_ responses

# FinalModel was fitted without rows 49 and 50, so Profit_Predict has two rows
# fewer than Startups. Binding full-length Startups columns to it made cbind()
# truncate them with a "number of rows of result is not a multiple of vector
# length" warning; the rows only lined up because the excluded observations
# happen to be the last two. Bind against the same row subset instead so the
# alignment is explicit and warning-free. Using bare column names inside
# with() also gives the resulting matrix descriptive column names.
Final <- with(
  Startups[-c(49, 50), ],
  cbind(RD_Spend, Administration, Marketing_Spend, State, Profit,
        Profit_Predict)
)
View(Final)


# Evaluate model LINE assumptions
# Base lm diagnostics: residuals vs fitted, normal Q-Q, scale-location and
# residuals vs leverage (with Cook's distance contours).
plot(FinalModel)

# QQ plot of studentized residuals; helps identify outliers.
# The number of points to label must be passed as `id = list(n = ...)`; the
# legacy `id.n = 5` spelling was silently ignored, which is why an earlier run
# labelled only the two default points and printed `[1] 15 16`.
qqPlot(FinalModel, id = list(n = 5))
# Stepwise model selection by AIC. No scope is given, so the search can only
# drop terms from FinalModel; the backward direction is spelled out explicitly.
library(MASS)
stepAIC(FinalModel, direction = "backward")
## Start:  AIC=860.63
## Profit ~ RD_Spend + log(Administration) + Marketing_Spend + log(State)
## 
##                       Df  Sum of Sq        RSS    AIC
## - log(State)           1 4.7075e+06 2.3901e+09 858.72
## - log(Administration)  1 9.4695e+07 2.4801e+09 860.50
## - Marketing_Spend      1 9.9293e+07 2.4847e+09 860.59
## <none>                              2.3854e+09 860.63
## - RD_Spend             1 2.5529e+10 2.7915e+10 976.70
## 
## Step:  AIC=858.72
## Profit ~ RD_Spend + log(Administration) + Marketing_Spend
## 
##                       Df  Sum of Sq        RSS    AIC
## - Marketing_Spend      1 9.6666e+07 2.4868e+09 858.63
## - log(Administration)  1 9.8854e+07 2.4889e+09 858.67
## <none>                              2.3901e+09 858.72
## - RD_Spend             1 2.5526e+10 2.7916e+10 974.70
## 
## Step:  AIC=858.63
## Profit ~ RD_Spend + log(Administration)
## 
##                       Df  Sum of Sq        RSS     AIC
## <none>                              2.4868e+09  858.63
## - log(Administration)  1 2.0174e+08 2.6885e+09  860.37
## - RD_Spend             1 6.0063e+10 6.2549e+10 1011.43
## 
## Call:
## lm(formula = Profit ~ RD_Spend + log(Administration), data = Startups[-c(49, 
##     50), ])
## 
## Coefficients:
##         (Intercept)             RD_Spend  log(Administration)  
##           1.539e+05            8.233e-01           -8.680e+03
# A lower AIC indicates a better model. AIC is only meaningful for comparing
# several candidate models, not for judging a single model in isolation.