#PREDICTION OF PROFIT FOR 50 STARTUPS
# Load the startup data set from a CSV file picked interactively.
# stringsAsFactors = TRUE keeps State a factor on R >= 4.0 (where the default
# became FALSE), so the summary()/str() output recorded below stays reproducible.
# NOTE: the data frame is named `str`; calls such as str(str) still resolve to
# the structure-display function because R skips non-function objects when
# looking up a function name.
str <- read.csv(file.choose(), stringsAsFactors = TRUE)
# Number of rows and columns.
dim(str)
## [1] 50  5
# Per-column summary statistics (level counts for the State factor).
summary(str)
##    R.D.Spend      Administration   Marketing.Spend         State   
##  Min.   :     0   Min.   : 51283   Min.   :     0   California:17  
##  1st Qu.: 39936   1st Qu.:103731   1st Qu.:129300   Florida   :16  
##  Median : 73051   Median :122700   Median :212716   New York  :17  
##  Mean   : 73722   Mean   :121345   Mean   :211025                  
##  3rd Qu.:101603   3rd Qu.:144842   3rd Qu.:299469                  
##  Max.   :165349   Max.   :182646   Max.   :471784                  
##      Profit      
##  Min.   : 14681  
##  1st Qu.: 90139  
##  Median :107978  
##  Mean   :112013  
##  3rd Qu.:139766  
##  Max.   :192262
# Column types and the first few values of each column.
str(str)
## 'data.frame':    50 obs. of  5 variables:
##  $ R.D.Spend      : num  165349 162598 153442 144372 142107 ...
##  $ Administration : num  136898 151378 101146 118672 91392 ...
##  $ Marketing.Spend: num  471784 443899 407935 383200 366168 ...
##  $ State          : Factor w/ 3 levels "California","Florida",..: 3 1 2 3 2 3 1 2 3 1 ...
##  $ Profit         : num  192262 191792 191050 182902 166188 ...
colnames(str)
## [1] "R.D.Spend"       "Administration"  "Marketing.Spend" "State"          
## [5] "Profit"
# Exploratory analysis of the numeric variables.
# The data frame is no longer attach()ed: every call below names its data
# explicitly, which avoids a stale copy of the columns on the search path.

# Exclude the State column (4th column) so only numeric columns remain.
str <- str[, -4]

# Open a fresh graphics device; dev.new() is the platform-independent
# replacement for windows(), which only exists on Windows builds of R.
dev.new()
# Scatterplot matrix of every remaining pair of variables.
plot(str)

# Profit (y) against R&D spend (x); the formula interface keeps the axis
# labels equal to the plain column names.
plot(Profit ~ R.D.Spend, data = str)

# Marketing spend (y) against R&D spend (x).
plot(Marketing.Spend ~ R.D.Spend, data = str)

# Pairwise correlation matrix of the numeric variables.
cor(str)
##                 R.D.Spend Administration Marketing.Spend    Profit
## R.D.Spend       1.0000000     0.24195525      0.72424813 0.9729005
## Administration  0.2419552     1.00000000     -0.03215388 0.2007166
## Marketing.Spend 0.7242481    -0.03215388      1.00000000 0.7477657
## Profit          0.9729005     0.20071657      0.74776572 1.0000000
# Baseline linear model: regress Profit on all three spend variables and
# inspect the coefficient table to see which predictors are significant.
m1 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend, data = str)
summary(m1)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend, 
##     data = str)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33534  -4795     63   6606  17275 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.012e+04  6.572e+03   7.626 1.06e-09 ***
## R.D.Spend        8.057e-01  4.515e-02  17.846  < 2e-16 ***
## Administration  -2.682e-02  5.103e-02  -0.526    0.602    
## Marketing.Spend  2.723e-02  1.645e-02   1.655    0.105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared:  0.9507, Adjusted R-squared:  0.9475 
## F-statistic:   296 on 3 and 46 DF,  p-value: < 2.2e-16
# Second model: Administration and Marketing.Spend only (R.D.Spend left out).
# Note that the response here is log(Profit), not Profit as in m1.
m2 <- lm(log(Profit) ~ Administration + Marketing.Spend, data = str)
summary(m2)
## 
## Call:
## lm(formula = log(Profit) ~ Administration + Marketing.Spend, 
##     data = str)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.50743 -0.10590  0.03404  0.15522  0.59146 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.054e+01  2.304e-01  45.742  < 2e-16 ***
## Administration  3.842e-06  1.707e-06   2.250   0.0292 *  
## Marketing.Spend 2.555e-06  3.911e-07   6.534 4.17e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3346 on 47 degrees of freedom
## Multiple R-squared:  0.4992, Adjusted R-squared:  0.4779 
## F-statistic: 23.43 on 2 and 47 DF,  p-value: 8.736e-08
# Disabled experiment: model with Administration as the only predictor.
# m3 <- lm(Profit ~ Administration, data = str)
# summary(m3)
library(car)
## Loading required package: carData
# Check for influential observations in m1 (no vif() call is made here).
# dev.new() opens a fresh graphics device on any platform; windows() is only
# available on Windows builds of R.
dev.new()
# car::influencePlot() draws the influence plot and prints the flagged rows
# with their studentized residual, hat value and Cook's distance.
influencePlot(m1)

##       StudRes        Hat      CookD
## 46  2.0220730 0.08617007 0.09032342
## 47 -0.8268684 0.24060165 0.05453034
## 49 -1.6861241 0.21801940 0.19052744
## 50 -4.4961657 0.07477116 0.28808229
# Final model: rows 46, 47, 49 and 50 (flagged by the influence plot) are
# excluded, and Administration is dropped from the predictors.
finalmodel <- lm(Profit ~ R.D.Spend + Marketing.Spend,
                 data = str[-c(46, 47, 49, 50), ])
summary(finalmodel)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = str[-c(46, 
##     47, 49, 50), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16713.8  -4219.8   -529.1   4383.3  12081.1 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     5.029e+04  2.425e+03  20.743   <2e-16 ***
## R.D.Spend       7.507e-01  3.660e-02  20.511   <2e-16 ***
## Marketing.Spend 3.501e-02  1.332e-02   2.627   0.0119 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7137 on 43 degrees of freedom
## Multiple R-squared:  0.9612, Adjusted R-squared:  0.9594 
## F-statistic: 532.5 on 2 and 43 DF,  p-value: < 2.2e-16
# Open a fresh graphics device portably (windows() exists only on Windows).
dev.new()
# Added-variable plots for each predictor of the final model.
avPlots(finalmodel)

# Predicted profit for the rows the final model was fitted on.
# predict.lm() has no `data` argument: the original `data=str` was silently
# swallowed by `...`, so the call returned the fitted values by accident.
# Passing the retained rows through `newdata` makes that intent explicit and
# yields the same 46 predictions, named by the original row numbers.
model_rows <- str[-c(46, 47, 49, 50), ]
pv <- predict(finalmodel, newdata = model_rows)
pv
##         1         2         3         4         5         6         7         8 
## 190939.26 187897.51 179764.84 172090.62 169794.00 161998.02 155822.68 159448.15 
##         9        10        11        12        13        14        15        16 
## 151695.11 153559.26 134823.34 134612.13 129504.38 128198.41 149316.43 145432.04 
##        17        18        19        20        21        22        23        24 
## 118112.75 131245.89 129494.94 115170.33 117993.36 119634.17 116460.20 111659.75 
##        25        26        27        28        29        30        31        32 
## 113052.62 103667.76 111536.62 116789.12 104015.27 103295.02 100023.83  99277.66 
##        33        34        35        36        37        38        39        40 
##  99508.80  99466.98  92525.20  92031.04  78852.12  90274.45  71965.16  85365.73 
##        41        42        43        44        45        48 
##  77928.38  76990.28  73221.70  63177.52  67934.33  50293.15
# Attach the predictions as an extra column next to the observed values.
final <- cbind(model_rows, pv)