The objective of this model is to predict the profit of a startup. The dataset records each startup's spending on R&D, administration, and marketing, as well as its location (state).
The data is obtained from this link https://www.kaggle.com/amineoumous/50-startups-data
## R.D.Spend Administration Marketing.Spend State Profit
## 1 165349.2 136897.80 471784.1 New York 192261.8
## 2 162597.7 151377.59 443898.5 California 191792.1
## 3 153441.5 101145.55 407934.5 Florida 191050.4
## 4 144372.4 118671.85 383199.6 New York 182902.0
## 5 142107.3 91391.77 366168.4 Florida 166187.9
## 6 131876.9 99814.71 362861.4 New York 156991.1
## R.D.Spend Administration Marketing.Spend State
## Min. : 0 Min. : 51283 Min. : 0 California:17
## 1st Qu.: 39936 1st Qu.:103731 1st Qu.:129300 Florida :16
## Median : 73051 Median :122700 Median :212716 New York :17
## Mean : 73722 Mean :121345 Mean :211025
## 3rd Qu.:101603 3rd Qu.:144842 3rd Qu.:299469
## Max. :165349 Max. :182646 Max. :471784
## Profit
## Min. : 14681
## 1st Qu.: 90139
## Median :107978
## Mean :112013
## 3rd Qu.:139766
## Max. :192262
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Warning in ggcorr(db, label = T): data in column(s) 'State' are not numeric and
## were ignored
Insight
# Baseline: regress Profit on every other column of `db`.
model_full <- lm(Profit ~ ., data = db)
# Stepwise AIC search in both directions, starting from the full model;
# trace = 1 prints every step of the search.
model_step <- step(object = model_full, direction = "both", trace = 1)
## Start: AIC=920.87
## Profit ~ R.D.Spend + Administration + Marketing.Spend + State
##
## Df Sum of Sq RSS AIC
## - State 2 5.1666e+05 3.9209e+09 916.88
## - Administration 1 2.3816e+07 3.9442e+09 919.17
## <none> 3.9203e+09 920.87
## - Marketing.Spend 1 2.2071e+08 4.1410e+09 921.61
## - R.D.Spend 1 2.6878e+10 3.0799e+10 1021.94
##
## Step: AIC=916.88
## Profit ~ R.D.Spend + Administration + Marketing.Spend
##
## Df Sum of Sq RSS AIC
## - Administration 1 2.3539e+07 3.9444e+09 915.18
## <none> 3.9209e+09 916.88
## - Marketing.Spend 1 2.3349e+08 4.1543e+09 917.77
## + State 2 5.1666e+05 3.9203e+09 920.87
## - R.D.Spend 1 2.7147e+10 3.1068e+10 1018.37
##
## Step: AIC=915.18
## Profit ~ R.D.Spend + Marketing.Spend
##
## Df Sum of Sq RSS AIC
## <none> 3.9444e+09 915.18
## + Administration 1 2.3539e+07 3.9209e+09 916.88
## - Marketing.Spend 1 3.1165e+08 4.2560e+09 916.98
## + State 2 2.3905e+05 3.9442e+09 919.17
## - R.D.Spend 1 3.1149e+10 3.5094e+10 1022.46
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = db)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33645 -4632 -414 6484 17097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.698e+04 2.690e+03 17.464 <2e-16 ***
## R.D.Spend 7.966e-01 4.135e-02 19.266 <2e-16 ***
## Marketing.Spend 2.991e-02 1.552e-02 1.927 0.06 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9161 on 47 degrees of freedom
## Multiple R-squared: 0.9505, Adjusted R-squared: 0.9483
## F-statistic: 450.8 on 2 and 47 DF, p-value: < 2.2e-16
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = db)
Insight
##
## Shapiro-Wilk normality test
##
## data: model_step$residuals
## W = 0.93717, p-value = 0.01042
Conclusion
p-value = 0.01 (< 0.05), so we reject H0: the residuals are not normally distributed, and the target variable needs to be transformed.
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## studentized Breusch-Pagan test
##
## data: model_step
## BP = 2.8431, df = 2, p-value = 0.2413
Conclusion
p-value = 0.24 (> 0.05), so we fail to reject H0: the error variance is constant (homoscedasticity). This assumption is met, so no transformation of the target variable is needed on its account.
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
# Refit on `db_improve_2`, again using every other column as a predictor.
model_full_improve_2 <- lm(Profit ~ ., data = db_improve_2)
# Silent (trace = 0) bidirectional stepwise selection from that full model.
model_step_improve_2 <- step(
  object = model_full_improve_2,
  direction = "both",
  trace = 0
)
# Auto-print the full (pre-selection) model.
model_full_improve_2
##
## Call:
## lm(formula = Profit ~ ., data = db_improve_2)
##
## Coefficients:
## (Intercept) R.D.Spend Administration Marketing.Spend
## 1.081e+01 8.128e-06 2.835e-07 2.734e-07
## StateFlorida StateNew York
## 6.618e-02 4.643e-02
##
## Shapiro-Wilk normality test
##
## data: model_full_improve_2$residuals
## W = 0.65193, p-value = 1.249e-09
##
## studentized Breusch-Pagan test
##
## data: model_full_improve_2
## BP = 4.917, df = 5, p-value = 0.4261
## GVIF Df GVIF^(1/(2*Df))
## R.D.Spend 2.495511 1 1.579719
## Administration 1.177766 1 1.085249
## Marketing.Spend 2.416797 1 1.554605
## State 1.062673 2 1.015313
# Full linear model on `db_improve_2a`: Profit against every other column.
model_full_improve_2a <- lm(formula = Profit ~ ., data = db_improve_2a)
# Stepwise selection must start from the `2a` fit. The original passed
# `model_full_improve_2`, so the result silently duplicated the earlier
# selection instead of reflecting `db_improve_2a`.
model_step_improve_2a <- step(
  model_full_improve_2a,
  direction = "both",
  trace = 0
)
# Auto-print the full (pre-selection) model.
model_full_improve_2a
##
## Call:
## lm(formula = Profit ~ ., data = db_improve_2a)
##
## Coefficients:
## (Intercept) R.D.Spend Administration Marketing.Spend
## 2.255e+02 1.252e-03 -3.794e-07 3.988e-05
## StateFlorida StateNew York
## 4.443e+00 2.633e+00
##
## Shapiro-Wilk normality test
##
## data: model_full_improve_2a$residuals
## W = 0.77813, p-value = 2.848e-07
##
## studentized Breusch-Pagan test
##
## data: model_full_improve_2a
## BP = 5.1036, df = 5, p-value = 0.4034
## GVIF Df GVIF^(1/(2*Df))
## R.D.Spend 2.495511 1 1.579719
## Administration 1.177766 1 1.085249
## Marketing.Spend 2.416797 1 1.554605
## State 1.062673 2 1.015313
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## [1] 1.2
# Box-Cox transform of the response. `lambda` is defined earlier in the
# analysis (not shown in this section). The power form divides by lambda,
# so use the log transform (the Box-Cox limit) when lambda is numerically
# zero instead of producing 0 / 0 = NaN.
db_improve_bc <- db %>%
  mutate(
    Profit = if (abs(lambda) < sqrt(.Machine$double.eps)) {
      log(Profit)
    } else {
      (Profit^lambda - 1) / lambda
    }
  )
# Full model on the transformed response, then silent stepwise selection
# using step()'s default direction and scope.
model_full_improve_bc <- lm(Profit ~ ., data = db_improve_bc)
model_step_improve_bc <- step(model_full_improve_bc, trace = 0)
# Auto-print the selected model.
model_step_improve_bc
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = db_improve_bc)
##
## Coefficients:
## (Intercept) R.D.Spend Marketing.Spend
## 3.082e+05 8.066e+00 3.192e-01
##
## Shapiro-Wilk normality test
##
## data: model_full_improve_bc$residuals
## W = 0.97641, p-value = 0.4122
##
## studentized Breusch-Pagan test
##
## data: model_full_improve_bc
## BP = 1.5761, df = 5, p-value = 0.9041
## GVIF Df GVIF^(1/(2*Df))
## R.D.Spend 2.495511 1 1.579719
## Administration 1.177766 1 1.085249
## Marketing.Spend 2.416797 1 1.554605
## State 1.062673 2 1.015313