# Load the 50-startups data set. Per the summary output below it has four
# numeric columns: RDS, AD, MS and Profit.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
startups <- read.csv("E:\\Data science\\50_Startups.csv")
# View() opens a spreadsheet-style viewer, so it is only useful (and only
# safe to call) in an interactive session.
if (interactive()) {
  View(startups)
}
dim(startups)
## [1] 50 4
# attach() is deliberately not used: it copies the columns onto the search
# path, where they can be masked by (or mask) other objects. Columns are
# referenced explicitly via startups$... instead.

# Exploratory Data Analysis (roughly 60% of the time)
# 1. Measures of central tendency
# 2. Measures of dispersion
# 3. Third moment business decision (skewness)
# 4. Fourth moment business decision (kurtosis)
# 5. Probability distributions of variables
# 6. Graphical representations
#    > Histogram, box plot, dot plot, stem & leaf plot,
#      bar plot
summary(startups)
## RDS AD MS Profit
## Min. : 0 Min. : 51283 Min. : 0 Min. : 14681
## 1st Qu.: 39936 1st Qu.:103731 1st Qu.:129300 1st Qu.: 90139
## Median : 73051 Median :122700 Median :212716 Median :107978
## Mean : 73722 Mean :121345 Mean :211025 Mean :112013
## 3rd Qu.:101603 3rd Qu.:144842 3rd Qu.:299469 3rd Qu.:139766
## Max. :165349 Max. :182646 Max. :471784 Max. :192262
# Title and axis label are set explicitly so the histogram keeps the same
# annotations it had when the bare column name `Profit` was plotted.
hist(startups$Profit, main = "Histogram of Profit", xlab = "Profit")

boxplot(startups$Profit)

barplot(startups$Profit)

# 7. Correlation between the output (Profit) and the predictors (RDS, AD, MS)
# Scatter-plot matrix of every pair of columns. plot() on an all-numeric data
# frame with more than two columns draws the same pairs() matrix, so a single
# call is enough.
pairs(startups)

# Open a fresh graphics device (presumably so that later plots do not
# overwrite the scatter-plot matrix). dev.new() is the portable equivalent of
# windows(), which is only available on Windows builds of R.
dev.new()
# Pairwise Pearson correlation matrix of all columns.
cor(startups)
## RDS AD MS Profit
## RDS 1.0000000 0.24195525 0.72424813 0.9729005
## AD 0.2419552 1.00000000 -0.03215388 0.2007166
## MS 0.7242481 -0.03215388 1.00000000 0.7477657
## Profit 0.9729005 0.20071657 0.74776572 1.0000000
# Full linear model: Profit regressed on all three spend variables.
m1 <- lm(
  formula = Profit ~ RDS + AD + MS,
  data = startups
)
summary(m1)
##
## Call:
## lm(formula = Profit ~ RDS + AD + MS, data = startups)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33534 -4795 63 6606 17275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.012e+04 6.572e+03 7.626 1.06e-09 ***
## RDS 8.057e-01 4.515e-02 17.846 < 2e-16 ***
## AD -2.682e-02 5.103e-02 -0.526 0.602
## MS 2.723e-02 1.645e-02 1.655 0.105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared: 0.9507, Adjusted R-squared: 0.9475
## F-statistic: 296 on 3 and 46 DF, p-value: < 2.2e-16
# Simple regression: Profit on administration spend (AD) only.
m2 <- lm(
  formula = Profit ~ AD,
  data = startups
)
# On its own, AD is not a significant predictor (see the p-value below).
summary(m2)
##
## Call:
## lm(formula = Profit ~ AD, data = startups)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96072 -23426 -3564 25438 84870
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.697e+04 2.532e+04 3.040 0.00382 **
## AD 2.887e-01 2.034e-01 1.419 0.16222
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39900 on 48 degrees of freedom
## Multiple R-squared: 0.04029, Adjusted R-squared: 0.02029
## F-statistic: 2.015 on 1 and 48 DF, p-value: 0.1622
# Simple regression: Profit on marketing spend (MS) only.
m3 <- lm(
  formula = Profit ~ MS,
  data = startups
)
# On its own, MS is a significant predictor (see the p-value below).
summary(m3)
##
## Call:
## lm(formula = Profit ~ MS, data = startups)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83739 -18802 4925 15879 64642
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.000e+04 7.685e+03 7.808 4.29e-10 ***
## MS 2.465e-01 3.159e-02 7.803 4.38e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27040 on 48 degrees of freedom
## Multiple R-squared: 0.5592, Adjusted R-squared: 0.55
## F-statistic: 60.88 on 1 and 48 DF, p-value: 4.381e-10
# Two-predictor regression: Profit on AD and MS together (RDS left out).
m4 <- lm(
  formula = Profit ~ AD + MS,
  data = startups
)
# Without RDS in the model, both AD and MS come out significant.
summary(m4)
##
## Call:
## lm(formula = Profit ~ AD + MS, data = startups)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82155 -12168 2836 13650 56472
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.022e+04 1.770e+04 1.143 0.2589
## AD 3.237e-01 1.312e-01 2.468 0.0173 *
## MS 2.488e-01 3.005e-02 8.281 9.73e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25710 on 47 degrees of freedom
## Multiple R-squared: 0.6097, Adjusted R-squared: 0.5931
## F-statistic: 36.71 on 2 and 47 DF, p-value: 2.496e-10
# It is Better to delete influential observations rather than deleting entire column which is
library(car)
## Loading required package: carData
# Influence diagnostics for the full model m1: draws the plot and prints the
# studentized residual, hat value and Cook's distance of the flagged rows.
car::influencePlot(m1)

## StudRes Hat CookD
## 46 2.0220730 0.08617007 0.09032342
## 47 -0.8268684 0.24060165 0.05453034
## 49 -1.6861241 0.21801940 0.19052744
## 50 -4.4961657 0.07477116 0.28808229
# Refit the full model without row 50, the most influential observation
# flagged by the influence plot.
m5 <- lm(
  formula = Profit ~ AD + RDS + MS,
  data = startups[-50, ]
)
summary(m5)
##
## Call:
## lm(formula = Profit ~ AD + RDS + MS, data = startups[-50, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -16692.8 -4862.3 -993.9 6135.9 14468.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.238e+04 5.543e+03 9.451 2.96e-12 ***
## AD -2.220e-02 4.287e-02 -0.518 0.6071
## RDS 7.830e-01 3.825e-02 20.470 < 2e-16 ***
## MS 2.523e-02 1.382e-02 1.825 0.0746 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7754 on 45 degrees of freedom
## Multiple R-squared: 0.9613, Adjusted R-squared: 0.9587
## F-statistic: 372.8 on 3 and 45 DF, p-value: < 2.2e-16
# Refit the full model without rows 46, 47, 49 and 50, i.e. all four
# observations flagged as influential by the influence plot.
m6 <- lm(
  formula = Profit ~ AD + RDS + MS,
  data = startups[-c(46, 49, 47, 50), ]
)
summary(m6)
##
## Call:
## lm(formula = Profit ~ AD + RDS + MS, data = startups[-c(46, 49,
## 47, 50), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15958.4 -5119.9 -919.9 6340.9 12986.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.690e+04 5.788e+03 9.830 1.87e-12 ***
## AD -5.398e-02 4.303e-02 -1.254 0.2166
## RDS 7.714e-01 3.993e-02 19.320 < 2e-16 ***
## MS 2.771e-02 1.446e-02 1.916 0.0622 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7089 on 42 degrees of freedom
## Multiple R-squared: 0.9626, Adjusted R-squared: 0.9599
## F-statistic: 360.3 on 3 and 42 DF, p-value: < 2.2e-16
# Variance inflation factors of the predictors in m1, used to check for
# collinearity between the explanatory variables.
car::vif(m1)
## RDS AD MS
## 2.468903 1.175091 2.326773
## vif>10 then there exists collinearity among all the variables
# Final model: Profit regressed on RDS and MS only (AD is left out).
finalmodel <- lm(
  formula = Profit ~ RDS + MS,
  data = startups
)
summary(finalmodel)
##
## Call:
## lm(formula = Profit ~ RDS + MS, data = startups)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33645 -4632 -414 6484 17097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.698e+04 2.690e+03 17.464 <2e-16 ***
## RDS 7.966e-01 4.135e-02 19.266 <2e-16 ***
## MS 2.991e-02 1.552e-02 1.927 0.06 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9161 on 47 degrees of freedom
## Multiple R-squared: 0.9505, Adjusted R-squared: 0.9483
## F-statistic: 450.8 on 2 and 47 DF, p-value: < 2.2e-16
# Predicted Profit from the final model for every row of `startups`.
# predict.lm() takes the prediction data through `newdata`. It has no `data`
# argument, so a `data = ...` value is silently swallowed by `...` and the
# call just returns the fitted values of the training data.
pv <- predict(finalmodel, newdata = startups)
pv
## 1 2 3 4 5 6 7
## 192800.46 189774.66 181405.38 173441.31 171127.62 162879.31 158028.13
## 8 9 10 11 12 13 14
## 160455.74 152317.80 154343.81 135011.91 134638.87 129218.40 127812.21
## 15 16 17 18 19 20 21
## 150192.49 146032.72 117025.89 130829.44 128882.20 115816.42 116650.89
## 22 23 24 25 26 27 28
## 118384.17 114990.38 109886.19 112552.19 102612.91 110990.79 114978.61
## 29 30 31 32 33 34 35
## 103125.01 102440.42 99085.22 98314.55 98864.66 97600.73 90262.64
## 36 37 38 39 40 41 42
## 89776.49 75824.23 87974.01 68631.32 82924.82 75049.06 74113.89
## 43 44 45 46 47 48 49
## 70234.25 60390.23 65489.73 47829.57 56909.80 46975.86 47407.65
## 50
## 48326.89
# The same predictions as a one-column data frame (column `pv`), printed in
# full.
pv1 <- as.data.frame(pv)
pv1
## pv
## 1 192800.46
## 2 189774.66
## 3 181405.38
## 4 173441.31
## 5 171127.62
## 6 162879.31
## 7 158028.13
## 8 160455.74
## 9 152317.80
## 10 154343.81
## 11 135011.91
## 12 134638.87
## 13 129218.40
## 14 127812.21
## 15 150192.49
## 16 146032.72
## 17 117025.89
## 18 130829.44
## 19 128882.20
## 20 115816.42
## 21 116650.89
## 22 118384.17
## 23 114990.38
## 24 109886.19
## 25 112552.19
## 26 102612.91
## 27 110990.79
## 28 114978.61
## 29 103125.01
## 30 102440.42
## 31 99085.22
## 32 98314.55
## 33 98864.66
## 34 97600.73
## 35 90262.64
## 36 89776.49
## 37 75824.23
## 38 87974.01
## 39 68631.32
## 40 82924.82
## 41 75049.06
## 42 74113.89
## 43 70234.25
## 44 60390.23
## 45 65489.73
## 46 47829.57
## 47 56909.80
## 48 46975.86
## 49 47407.65
## 50 48326.89