# Load the 50-startups data set; the columns used throughout this script
# are RDS, AD, MS and Profit (see the summary() output below).
startups <- read.csv("E:\\Data science\\50_Startups.csv")
View(startups)
dim(startups)
## [1] 50  4
# NOTE: attach(startups) was removed on purpose. It copies the columns onto
# the search path, where they can silently mask or be masked by other
# objects. Columns are accessed through `startups` explicitly instead.

# Exploratory Data Analysis (roughly 60% of the time is spent here)
# 1. Measures of Central Tendency
# 2. Measures of Dispersion
# 3. Third Moment Business decision (skewness)
# 4. Fourth Moment Business decision (kurtosis)
# 5. Probability distributions of variables
# 6. Graphical representations
#  > Histogram, Box plot, Dot plot, Stem & Leaf plot,
#     Bar plot

summary(startups)
##       RDS               AD               MS             Profit      
##  Min.   :     0   Min.   : 51283   Min.   :     0   Min.   : 14681  
##  1st Qu.: 39936   1st Qu.:103731   1st Qu.:129300   1st Qu.: 90139  
##  Median : 73051   Median :122700   Median :212716   Median :107978  
##  Mean   : 73722   Mean   :121345   Mean   :211025   Mean   :112013  
##  3rd Qu.:101603   3rd Qu.:144842   3rd Qu.:299469   3rd Qu.:139766  
##  Max.   :165349   Max.   :182646   Max.   :471784   Max.   :192262
# with() evaluates each plot call inside the data frame, so the column is
# still referred to as `Profit` and the default plot labels are unchanged.
with(startups, hist(Profit))

with(startups, boxplot(Profit))

with(startups, barplot(Profit))

# 7. Correlation between the output (Profit) and the predictors
#    (RDS, AD, MS): scatter plots first, then the correlation matrix.
plot(startups)
pairs(startups)

# Open a fresh graphics device for the plots that follow. dev.new() is the
# platform-independent call; windows() is only available in Windows builds
# of R and fails elsewhere.
dev.new()

cor(startups)
##              RDS          AD          MS    Profit
## RDS    1.0000000  0.24195525  0.72424813 0.9729005
## AD     0.2419552  1.00000000 -0.03215388 0.2007166
## MS     0.7242481 -0.03215388  1.00000000 0.7477657
## Profit 0.9729005  0.20071657  0.74776572 1.0000000
# Full linear model: Profit explained by all three predictors
m1 <- lm(formula = Profit ~ RDS + AD + MS, data = startups)
summary(m1)
## 
## Call:
## lm(formula = Profit ~ RDS + AD + MS, data = startups)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33534  -4795     63   6606  17275 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.012e+04  6.572e+03   7.626 1.06e-09 ***
## RDS          8.057e-01  4.515e-02  17.846  < 2e-16 ***
## AD          -2.682e-02  5.103e-02  -0.526    0.602    
## MS           2.723e-02  1.645e-02   1.655    0.105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared:  0.9507, Adjusted R-squared:  0.9475 
## F-statistic:   296 on 3 and 46 DF,  p-value: < 2.2e-16
# Simple regression of Profit on Administration (AD) alone
m2 <- lm(formula = Profit ~ AD, data = startups)
summary(m2) # AD on its own is not a significant predictor
## 
## Call:
## lm(formula = Profit ~ AD, data = startups)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96072 -23426  -3564  25438  84870 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 7.697e+04  2.532e+04   3.040  0.00382 **
## AD          2.887e-01  2.034e-01   1.419  0.16222   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39900 on 48 degrees of freedom
## Multiple R-squared:  0.04029,    Adjusted R-squared:  0.02029 
## F-statistic: 2.015 on 1 and 48 DF,  p-value: 0.1622
# Simple regression of Profit on Marketing Spend (MS) alone
m3 <- lm(formula = Profit ~ MS, data = startups)
summary(m3) # MS on its own is a significant predictor
## 
## Call:
## lm(formula = Profit ~ MS, data = startups)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -83739 -18802   4925  15879  64642 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.000e+04  7.685e+03   7.808 4.29e-10 ***
## MS          2.465e-01  3.159e-02   7.803 4.38e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27040 on 48 degrees of freedom
## Multiple R-squared:  0.5592, Adjusted R-squared:   0.55 
## F-statistic: 60.88 on 1 and 48 DF,  p-value: 4.381e-10
# Regression of Profit on AD and MS together (RDS left out)
m4 <- lm(formula = Profit ~ AD + MS, data = startups)
summary(m4) # both AD and MS are significant once RDS is excluded
## 
## Call:
## lm(formula = Profit ~ AD + MS, data = startups)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -82155 -12168   2836  13650  56472 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.022e+04  1.770e+04   1.143   0.2589    
## AD          3.237e-01  1.312e-01   2.468   0.0173 *  
## MS          2.488e-01  3.005e-02   8.281 9.73e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25710 on 47 degrees of freedom
## Multiple R-squared:  0.6097, Adjusted R-squared:  0.5931 
## F-statistic: 36.71 on 2 and 47 DF,  p-value: 2.496e-10
# It is better to remove influential observations than to drop an entire
# column (predictor) from the model.
library(car)
## Loading required package: carData
# Plot the influence measures (studentized residual, hat value, Cook's
# distance) of the full model m1
influencePlot(m1)

##       StudRes        Hat      CookD
## 46  2.0220730 0.08617007 0.09032342
## 47 -0.8268684 0.24060165 0.05453034
## 49 -1.6861241 0.21801940 0.19052744
## 50 -4.4961657 0.07477116 0.28808229
# Refit the full model without observation 50, the most influential point
m5 <- lm(formula = Profit ~ AD + RDS + MS, data = startups[-50, ])
summary(m5)
## 
## Call:
## lm(formula = Profit ~ AD + RDS + MS, data = startups[-50, ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16692.8  -4862.3   -993.9   6135.9  14468.8 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.238e+04  5.543e+03   9.451 2.96e-12 ***
## AD          -2.220e-02  4.287e-02  -0.518   0.6071    
## RDS          7.830e-01  3.825e-02  20.470  < 2e-16 ***
## MS           2.523e-02  1.382e-02   1.825   0.0746 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7754 on 45 degrees of freedom
## Multiple R-squared:  0.9613, Adjusted R-squared:  0.9587 
## F-statistic: 372.8 on 3 and 45 DF,  p-value: < 2.2e-16
# Refit the full model without observations 46, 47, 49 and 50, the four
# points reported as influential by influencePlot()
m6 <- lm(formula = Profit ~ AD + RDS + MS, data = startups[-c(46, 49, 47, 50), ])
summary(m6)
## 
## Call:
## lm(formula = Profit ~ AD + RDS + MS, data = startups[-c(46, 49, 
##     47, 50), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15958.4  -5119.9   -919.9   6340.9  12986.9 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.690e+04  5.788e+03   9.830 1.87e-12 ***
## AD          -5.398e-02  4.303e-02  -1.254   0.2166    
## RDS          7.714e-01  3.993e-02  19.320  < 2e-16 ***
## MS           2.771e-02  1.446e-02   1.916   0.0622 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7089 on 42 degrees of freedom
## Multiple R-squared:  0.9626, Adjusted R-squared:  0.9599 
## F-statistic: 360.3 on 3 and 42 DF,  p-value: < 2.2e-16
# Variance inflation factors (VIF) of the full model m1, used to check for
# collinearity between the predictors.
vif(m1)
##      RDS       AD       MS 
## 2.468903 1.175091 2.326773
# Rule of thumb: a VIF above 10 signals problematic collinearity for that
# predictor. All three values here are below 3, so collinearity is not a
# concern.

# Final model: Profit explained by RDS and MS only (AD left out)
finalmodel <- lm(formula = Profit ~ RDS + MS, data = startups)
summary(finalmodel)
## 
## Call:
## lm(formula = Profit ~ RDS + MS, data = startups)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33645  -4632   -414   6484  17097 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.698e+04  2.690e+03  17.464   <2e-16 ***
## RDS         7.966e-01  4.135e-02  19.266   <2e-16 ***
## MS          2.991e-02  1.552e-02   1.927     0.06 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9161 on 47 degrees of freedom
## Multiple R-squared:  0.9505, Adjusted R-squared:  0.9483 
## F-statistic: 450.8 on 2 and 47 DF,  p-value: < 2.2e-16
# Predicted Profit for every row of `startups` under the final model.
# predict.lm() takes the data to predict on through `newdata`; it has no
# `data` argument, so the former `data=startups` was silently absorbed by
# `...` and the call only worked because it fell back to the fitted values.
pv <- predict(finalmodel, newdata = startups)
pv
##         1         2         3         4         5         6         7 
## 192800.46 189774.66 181405.38 173441.31 171127.62 162879.31 158028.13 
##         8         9        10        11        12        13        14 
## 160455.74 152317.80 154343.81 135011.91 134638.87 129218.40 127812.21 
##        15        16        17        18        19        20        21 
## 150192.49 146032.72 117025.89 130829.44 128882.20 115816.42 116650.89 
##        22        23        24        25        26        27        28 
## 118384.17 114990.38 109886.19 112552.19 102612.91 110990.79 114978.61 
##        29        30        31        32        33        34        35 
## 103125.01 102440.42  99085.22  98314.55  98864.66  97600.73  90262.64 
##        36        37        38        39        40        41        42 
##  89776.49  75824.23  87974.01  68631.32  82924.82  75049.06  74113.89 
##        43        44        45        46        47        48        49 
##  70234.25  60390.23  65489.73  47829.57  56909.80  46975.86  47407.65 
##        50 
##  48326.89
# Hold the predictions in a one-column data frame (column name: pv)
pv1 <- as.data.frame(pv)
pv1
##           pv
## 1  192800.46
## 2  189774.66
## 3  181405.38
## 4  173441.31
## 5  171127.62
## 6  162879.31
## 7  158028.13
## 8  160455.74
## 9  152317.80
## 10 154343.81
## 11 135011.91
## 12 134638.87
## 13 129218.40
## 14 127812.21
## 15 150192.49
## 16 146032.72
## 17 117025.89
## 18 130829.44
## 19 128882.20
## 20 115816.42
## 21 116650.89
## 22 118384.17
## 23 114990.38
## 24 109886.19
## 25 112552.19
## 26 102612.91
## 27 110990.79
## 28 114978.61
## 29 103125.01
## 30 102440.42
## 31  99085.22
## 32  98314.55
## 33  98864.66
## 34  97600.73
## 35  90262.64
## 36  89776.49
## 37  75824.23
## 38  87974.01
## 39  68631.32
## 40  82924.82
## 41  75049.06
## 42  74113.89
## 43  70234.25
## 44  60390.23
## 45  65489.73
## 46  47829.57
## 47  56909.80
## 48  46975.86
## 49  47407.65
## 50  48326.89