# Startups problem: multiple linear regression
# Load the 50-startups data set.
# stringsAsFactors = TRUE keeps State a factor on R >= 4.0 as well (where the
# read.csv default became FALSE), so the str()/summary() output recorded below
# still reproduces.
# NOTE(review): the absolute, machine-specific path only resolves on the
# original author's PC -- consider a project-relative path.
startups <- read.csv("C:\\Users\\prakruthi\\Desktop\\datascience assignments\\multi linear regression\\50_Startups.csv", stringsAsFactors = TRUE)
# attach(startups) intentionally dropped: every model in the visible script
# passes data = explicitly, and an attached copy would not reflect the later
# recoding of startups$State.
# Confirm the object returned by read.csv() is a data frame.
class(startups)
## [1] "data.frame"
# Column types: four numeric columns plus the categorical State column.
str(startups)
## 'data.frame': 50 obs. of 5 variables:
## $ R.D.Spend : num 165349 162598 153442 144372 142107 ...
## $ Administration : num 136898 151378 101146 118672 91392 ...
## $ Marketing.Spend: num 471784 443899 407935 383200 366168 ...
## $ State : Factor w/ 3 levels "California","Florida",..: 3 1 2 3 2 3 1 2 3 1 ...
## $ Profit : num 192262 191792 191050 182902 166188 ...
# Five-number summaries for the numeric columns and level counts for State.
summary(startups)
## R.D.Spend Administration Marketing.Spend State
## Min. : 0 Min. : 51283 Min. : 0 California:17
## 1st Qu.: 39936 1st Qu.:103731 1st Qu.:129300 Florida :16
## Median : 73051 Median :122700 Median :212716 New York :17
## Mean : 73722 Mean :121345 Mean :211025
## 3rd Qu.:101603 3rd Qu.:144842 3rd Qu.:299469
## Max. :165349 Max. :182646 Max. :471784
## Profit
## Min. : 14681
## 1st Qu.: 90139
## Median :107978
## Mean :112013
## 3rd Qu.:139766
## Max. :192262
# Open the raw data in the data viewer (interactive use).
View(startups)
# State is categorical; recode it as the integer level codes (1, 2, 3) so that
# every column is numeric.
# NOTE(review): integer codes impose an ordering and equal spacing on a nominal
# variable, and the lm() fits below then estimate a single slope for State.
# Keeping State as a factor (dummy variables) is usually what is wanted --
# confirm before interpreting the State coefficient.
startups$State <- as.numeric(factor(startups$State))
View(startups)
# Scatterplot matrix of all columns.
plot(startups)

# List the column names available for the model formula.
colnames(startups)
## [1] "R.D.Spend" "Administration" "Marketing.Spend" "State"
## [5] "Profit"
# m1: baseline model -- Profit regressed on all four predictors, all 50 rows.
m1 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State,data =startups)
summary(m1)
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend +
## State, data = startups)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33553 -4779 63 6595 17301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.016e+04 7.322e+03 6.851 1.69e-08 ***
## R.D.Spend 8.058e-01 4.576e-02 17.609 < 2e-16 ***
## Administration -2.683e-02 5.160e-02 -0.520 0.606
## Marketing.Spend 2.723e-02 1.663e-02 1.637 0.109
## State -2.232e+01 1.610e+03 -0.014 0.989
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9334 on 45 degrees of freedom
## Multiple R-squared: 0.9507, Adjusted R-squared: 0.9464
## F-statistic: 217.2 on 4 and 45 DF, p-value: < 2.2e-16
# Correlation matrix of all columns (State enters as its numeric level code).
cor(startups)
## R.D.Spend Administration Marketing.Spend State
## R.D.Spend 1.0000000 0.24195525 0.72424813 0.10468511
## Administration 0.2419552 1.00000000 -0.03215388 0.01184720
## Marketing.Spend 0.7242481 -0.03215388 1.00000000 0.07766961
## State 0.1046851 0.01184720 0.07766961 1.00000000
## Profit 0.9729005 0.20071657 0.74776572 0.10179631
## Profit
## R.D.Spend 0.9729005
## Administration 0.2007166
## Marketing.Spend 0.7477657
## State 0.1017963
## Profit 1.0000000
# Find influential observations.
# Loading mvinfluence also attaches car (see the loading messages below);
# influencePlot() is presumably car's -- TODO confirm.
library(mvinfluence)
## Loading required package: car
## Loading required package: carData
## Loading required package: heplots
# Draws the influence plot for m1 and prints the flagged rows with their
# studentized residual, hat value and Cook's distance (table below).
influencePlot(m1)

## StudRes Hat CookD
## 46 2.0508431 0.12502380 0.11220265
## 47 -0.8175683 0.24121705 0.04281347
## 49 -1.7072322 0.25478210 0.19116349
## 50 -4.5278887 0.09664062 0.30602820
# m2: refit without observation 50, which has the largest studentized residual
# and Cook's distance in the influence table above.
# Fix: the original used startups[-50] (no comma), which indexes COLUMNS of a
# data frame; there is no column 50, so nothing was dropped and m2 came out
# identical to m1. The trailing comma makes it a row index.
m2 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
         data = startups[-50, ])
summary(m2)
## NOTE: the output previously recorded here was a copy of summary(m1)
## (45 residual degrees of freedom, i.e. all 50 rows were still in the fit).
## Re-run to regenerate it; with row 50 removed the fit uses 49 rows and
## therefore has 44 residual degrees of freedom.
# m3: same formula as m1, fitted without rows 50 and 49.
m3 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State,data =startups[-c(50,49),])
summary(m3)
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend +
## State, data = startups[-c(50, 49), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15944 -4787 -1904 6066 13646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.960e+04 6.314e+03 9.438 4.84e-12 ***
## R.D.Spend 7.898e-01 3.677e-02 21.480 < 2e-16 ***
## Administration -6.257e-02 4.451e-02 -1.406 0.167
## Marketing.Spend 1.705e-02 1.369e-02 1.245 0.220
## State -3.267e+02 1.326e+03 -0.246 0.806
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7429 on 43 degrees of freedom
## Multiple R-squared: 0.9627, Adjusted R-squared: 0.9593
## F-statistic: 277.7 on 4 and 43 DF, p-value: < 2.2e-16
# m4: same formula as m1, fitted without the four rows flagged by the
# influence plot (50, 49, 47 and 46).
m4 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State,data =startups[-c(50,49,47,46),])
summary(m4)
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend +
## State, data = startups[-c(50, 49, 47, 46), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15606.7 -5244.0 -656.3 5402.7 13092.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.812e+04 6.110e+03 9.513 6.22e-12 ***
## R.D.Spend 7.730e-01 4.027e-02 19.198 < 2e-16 ***
## Administration -5.203e-02 4.342e-02 -1.198 0.2376
## Marketing.Spend 2.829e-02 1.458e-02 1.940 0.0593 .
## State -8.725e+02 1.306e+03 -0.668 0.5079
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7137 on 41 degrees of freedom
## Multiple R-squared: 0.963, Adjusted R-squared: 0.9594
## F-statistic: 266.8 on 4 and 41 DF, p-value: < 2.2e-16
# Variance inflation factors for the predictors of m1 (the all-rows model).
vif(m1)
## R.D.Spend Administration Marketing.Spend State
## 2.481178 1.175315 2.326780 1.011281
# Added-variable plots for m1.
avPlots(m1)

# m5: drop State and Administration and see how the two-predictor model does
# on the same rows as m4 (observations 50, 49, 47 and 46 excluded).
# Fix: the original subset startups[-c(50,49,47,46)] lacked the trailing comma,
# so it indexed (non-existent) columns and silently kept all 50 rows.
m5 <- lm(Profit ~ R.D.Spend + Marketing.Spend,
         data = startups[-c(50, 49, 47, 46), ])
summary(m5)
## NOTE: the output previously recorded here showed 47 residual degrees of
## freedom, i.e. it was fitted on all 50 rows. Re-run to regenerate it; with
## the four rows removed the fit uses 46 rows and therefore has 43 residual
## degrees of freedom.
# Final model: same formula and row subset as m4 (all four predictors, rows
# 50, 49, 47 and 46 excluded).
# NOTE(review): Administration and State are not significant in the summary
# below -- confirm that keeping them in the final model is intended.
final<- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State,data =startups[-c(50,49,47,46),])
summary(final)
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend +
## State, data = startups[-c(50, 49, 47, 46), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15606.7 -5244.0 -656.3 5402.7 13092.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.812e+04 6.110e+03 9.513 6.22e-12 ***
## R.D.Spend 7.730e-01 4.027e-02 19.198 < 2e-16 ***
## Administration -5.203e-02 4.342e-02 -1.198 0.2376
## Marketing.Spend 2.829e-02 1.458e-02 1.940 0.0593 .
## State -8.725e+02 1.306e+03 -0.668 0.5079
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7137 on 41 degrees of freedom
## Multiple R-squared: 0.963, Adjusted R-squared: 0.9594
## F-statistic: 266.8 on 4 and 41 DF, p-value: < 2.2e-16
# In-sample predictions for the rows the final model was fitted on.
# Fix: predict() has no `data` argument -- the original `data = startups` was
# silently swallowed by `...`, so the call returned the 46 fitted values rather
# than predictions for all 50 rows. Passing the fitting rows via `newdata`
# states that intent explicitly and keeps the row count aligned with the
# data.frame() call below.
fit_rows <- startups[-c(50, 49, 47, 46), ]
prediction <- predict(final, newdata = fit_rows)
# Actual values side by side with the model's prediction for each row.
pre <- data.frame(fit_rows, prediction)
summary(pre)
## R.D.Spend Administration Marketing.Spend State
## Min. : 0 Min. : 51283 Min. : 0 Min. :1.000
## 1st Qu.: 48693 1st Qu.:103731 1st Qu.:138616 1st Qu.:1.000
## Median : 75791 Median :124919 Median :221898 Median :2.000
## Mean : 80070 Mean :123012 Mean :221893 Mean :1.978
## 3rd Qu.:111371 3rd Qu.:145417 3rd Qu.:302424 3rd Qu.:3.000
## Max. :165349 Max. :182646 Max. :471784 Max. :3.000
## Profit prediction
## Min. : 42560 Min. : 50204
## 1st Qu.: 96729 1st Qu.: 94997
## Median :109543 Median :114455
## Mean :118171 Mean :118171
## 3rd Qu.:143591 3rd Qu.:143137
## Max. :192262 Max. :189550
# Open the actual-vs-predicted table in the data viewer (interactive use).
View(pre)