Objective

The objective of this model is to predict the profit of a startup. The dataset considers several types of spending (R&D, Administration, and Marketing) and the location (State) of each startup.

Data Source

The data is obtained from this link https://www.kaggle.com/amineoumous/50-startups-data

Data Preprocessing

Data Input

# Load the startup data. Text columns (State) are read as factors so they can
# enter the regression as categorical predictors.
db <- read.csv("50_Startups.csv",
               stringsAsFactors = TRUE)
# Preview the first rows of the data.
head(db)
##   R.D.Spend Administration Marketing.Spend      State   Profit
## 1  165349.2      136897.80        471784.1   New York 192261.8
## 2  162597.7      151377.59        443898.5 California 191792.1
## 3  153441.5      101145.55        407934.5    Florida 191050.4
## 4  144372.4      118671.85        383199.6   New York 182902.0
## 5  142107.3       91391.77        366168.4    Florida 166187.9
## 6  131876.9       99814.71        362861.4   New York 156991.1

Check missing value

# Count the missing values (NA) in each column.
colSums(is.na(db))
##       R.D.Spend  Administration Marketing.Spend           State          Profit 
##               0               0               0               0               0

Exploratory Data Analysis

# Per-column summary: quartiles and mean for the numeric columns, level counts
# for the State factor.
summary(db)
##    R.D.Spend      Administration   Marketing.Spend         State   
##  Min.   :     0   Min.   : 51283   Min.   :     0   California:17  
##  1st Qu.: 39936   1st Qu.:103731   1st Qu.:129300   Florida   :16  
##  Median : 73051   Median :122700   Median :212716   New York  :17  
##  Mean   : 73722   Mean   :121345   Mean   :211025                  
##  3rd Qu.:101603   3rd Qu.:144842   3rd Qu.:299469                  
##  Max.   :165349   Max.   :182646   Max.   :471784                  
##      Profit      
##  Min.   : 14681  
##  1st Qu.: 90139  
##  Median :107978  
##  Mean   :112013  
##  3rd Qu.:139766  
##  Max.   :192262
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Correlation plot of the numeric columns with the coefficients printed on it;
# the non-numeric State column is ignored (hence the warning).
ggcorr(db, label = T)
## Warning in ggcorr(db, label = T): data in column(s) 'State' are not numeric and
## were ignored

# Distribution of Profit within each State, drawn as box plots.
ggplot(data = db, mapping = aes(State, Profit)) +
  geom_boxplot()

Insight

  1. According to the correlation plot, Profit is highly correlated with R&D Spend and Marketing Spend.
  2. According to the boxplot, startups located in New York have the lowest range of profit and startups located in California have the highest range.

Linear Regression Model

# Baseline model: regress Profit on every other column of db.
model_full <- lm(Profit ~ ., data = db)
# Stepwise AIC search that may both drop and re-add terms, starting from the
# baseline model; trace = 1 prints every step of the search.
model_step <- step(object = model_full, direction = "both", trace = 1)
## Start:  AIC=920.87
## Profit ~ R.D.Spend + Administration + Marketing.Spend + State
## 
##                   Df  Sum of Sq        RSS     AIC
## - State            2 5.1666e+05 3.9209e+09  916.88
## - Administration   1 2.3816e+07 3.9442e+09  919.17
## <none>                          3.9203e+09  920.87
## - Marketing.Spend  1 2.2071e+08 4.1410e+09  921.61
## - R.D.Spend        1 2.6878e+10 3.0799e+10 1021.94
## 
## Step:  AIC=916.88
## Profit ~ R.D.Spend + Administration + Marketing.Spend
## 
##                   Df  Sum of Sq        RSS     AIC
## - Administration   1 2.3539e+07 3.9444e+09  915.18
## <none>                          3.9209e+09  916.88
## - Marketing.Spend  1 2.3349e+08 4.1543e+09  917.77
## + State            2 5.1666e+05 3.9203e+09  920.87
## - R.D.Spend        1 2.7147e+10 3.1068e+10 1018.37
## 
## Step:  AIC=915.18
## Profit ~ R.D.Spend + Marketing.Spend
## 
##                   Df  Sum of Sq        RSS     AIC
## <none>                          3.9444e+09  915.18
## + Administration   1 2.3539e+07 3.9209e+09  916.88
## - Marketing.Spend  1 3.1165e+08 4.2560e+09  916.98
## + State            2 2.3905e+05 3.9442e+09  919.17
## - R.D.Spend        1 3.1149e+10 3.5094e+10 1022.46
# Residual summary, coefficient table, R-squared and F-statistic of the model
# selected by step().
summary(model_step)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = db)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33645  -4632   -414   6484  17097 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.698e+04  2.690e+03  17.464   <2e-16 ***
## R.D.Spend       7.966e-01  4.135e-02  19.266   <2e-16 ***
## Marketing.Spend 2.991e-02  1.552e-02   1.927     0.06 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9161 on 47 degrees of freedom
## Multiple R-squared:  0.9505, Adjusted R-squared:  0.9483 
## F-statistic: 450.8 on 2 and 47 DF,  p-value: < 2.2e-16
# Show the call (formula and data) of the model selected by step().
model_step$call
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = db)

Insight

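  1. R&D Spend contributes significantly to profit (p < 0.001); Marketing Spend is only marginally significant (p = 0.06)
  2. The linear model uses R&D Spend and Marketing Spend to predict Profit
  3. The model explains 94.83% of the variance in Profit (adjusted R-squared = 0.9483)
  4. Holding Marketing Spend constant, an increase of 1 dollar in R&D Spend is associated with an increase in profit of about 0.797 USD
  5. Holding R&D Spend constant, an increase of 1 dollar in Marketing Spend is associated with an increase in profit of about 0.030 USD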

Assumption Check

Shapiro-Wilk Test for Normality

# Shapiro-Wilk test on the residuals of the stepwise model
# (H0: the residuals are normally distributed).
shapiro.test(model_step$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model_step$residuals
## W = 0.93717, p-value = 0.01042
# Histogram of the stepwise model's residuals as a visual check of their shape.
hist(model_step$residuals)

Conclusion

p-value = 0.01 < 0.05, so we reject H0: the residuals are not normally distributed, and the target variable needs to be transformed.

Constant Variance of Error (Homoscedasticity)

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Studentized Breusch-Pagan test on the stepwise model
# (H0: the error variance is constant).
bptest(model_step)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_step
## BP = 2.8431, df = 2, p-value = 0.2413

Conclusion

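p-value = 0.24 > 0.05, so we fail to reject H0: there is no evidence of heteroscedasticity, i.e. the error variance can be treated as constant (homoscedasticity). This assumption is met, so no transformation of the target variable is required for it.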

Multicollinearity

library(car)
## Loading required package: carData
# Variance inflation factors of the predictors in the full model.
# NOTE(review): this checks model_full, not the stepwise model used for the
# other assumption checks -- confirm that is intended.
vif(model_full)
##                     GVIF Df GVIF^(1/(2*Df))
## R.D.Spend       2.495511  1        1.579719
## Administration  1.177766  1        1.085249
## Marketing.Spend 2.416797  1        1.554605
## State           1.062673  2        1.015313

Conclusion

VIF < 10 for every predictor, so there is no multicollinearity

Model Improvement

Using Log

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()
# Copy of the data with Profit replaced by its natural logarithm.
db_improve_2 <- mutate(db, Profit = log(Profit))
# Full model on the log-transformed target.
model_full_improve_2 <- lm(Profit ~ ., data = db_improve_2)
# Silent stepwise selection (trace = 0) from that full model.
# NOTE(review): the diagnostics that follow use the full model, not this
# stepwise result -- confirm that is intended.
model_step_improve_2 <- step(
  object = model_full_improve_2,
  direction = "both",
  trace = 0
)
# Print the call and coefficients of the full log-scale model.
model_full_improve_2
## 
## Call:
## lm(formula = Profit ~ ., data = db_improve_2)
## 
## Coefficients:
##     (Intercept)        R.D.Spend   Administration  Marketing.Spend  
##       1.081e+01        8.128e-06        2.835e-07        2.734e-07  
##    StateFlorida    StateNew York  
##       6.618e-02        4.643e-02
# Shapiro-Wilk normality test on the residuals of the full log-scale model.
shapiro.test(model_full_improve_2$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model_full_improve_2$residuals
## W = 0.65193, p-value = 1.249e-09
# Studentized Breusch-Pagan test on the full log-scale model.
bptest(model_full_improve_2)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_full_improve_2
## BP = 4.917, df = 5, p-value = 0.4261
# Variance inflation factors of the predictors in the full log-scale model.
vif(model_full_improve_2)
##                     GVIF Df GVIF^(1/(2*Df))
## R.D.Spend       2.495511  1        1.579719
## Administration  1.177766  1        1.085249
## Marketing.Spend 2.416797  1        1.554605
## State           1.062673  2        1.015313

Using SQRT

library(tidyverse)

# Copy of the data with Profit replaced by its square root.
db_improve_2a <- db %>%
  mutate(Profit = sqrt(Profit))
# Full model on the square-root-transformed target.
model_full_improve_2a <- lm(formula = Profit ~ ., data = db_improve_2a)
# Silent stepwise selection (trace = 0). It must start from the square-root
# model fitted on the line above, not from the log-scale model.
model_step_improve_2a <- step(model_full_improve_2a, direction = "both", trace = 0)
# Print the call and coefficients of the full square-root-scale model.
model_full_improve_2a
## 
## Call:
## lm(formula = Profit ~ ., data = db_improve_2a)
## 
## Coefficients:
##     (Intercept)        R.D.Spend   Administration  Marketing.Spend  
##       2.255e+02        1.252e-03       -3.794e-07        3.988e-05  
##    StateFlorida    StateNew York  
##       4.443e+00        2.633e+00
# Shapiro-Wilk normality test on the residuals of the full square-root model.
shapiro.test(model_full_improve_2a$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model_full_improve_2a$residuals
## W = 0.77813, p-value = 2.848e-07
# Studentized Breusch-Pagan test on the full square-root model.
bptest(model_full_improve_2a)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_full_improve_2a
## BP = 5.1036, df = 5, p-value = 0.4034
# Variance inflation factors of the predictors in the full square-root model.
vif(model_full_improve_2a)
##                     GVIF Df GVIF^(1/(2*Df))
## R.D.Spend       2.495511  1        1.579719
## Administration  1.177766  1        1.085249
## Marketing.Spend 2.416797  1        1.554605
## State           1.062673  2        1.015313

Using BoxCox

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Evaluate the Box-Cox transformation of the stepwise model over a grid of
# lambda values from -10 to 10 in steps of 0.1; the result's x and y
# components are used afterwards to choose lambda.
box_try <- boxcox(
  object = model_step,
  lambda = seq(from = -10, to = 10, by = 0.1)
)

# Keep the grid value x at which the boxcox() result y is largest.
lambda <- with(box_try, x[which.max(y)])
lambda
## [1] 1.2
# Box-Cox transform of Profit with the selected lambda. The transform is
# log(y) when lambda is 0; the power form would divide by zero there, and 0 is
# one of the values on the lambda search grid.
db_improve_bc <- db %>%
  mutate(
    Profit = if (abs(lambda) < 1e-8) {
      log(Profit)
    } else {
      (Profit^lambda - 1) / lambda
    }
  )

# Full model on the Box-Cox-transformed target, then silent stepwise selection.
model_full_improve_bc <- lm(Profit ~ ., data = db_improve_bc)
model_step_improve_bc <- step(model_full_improve_bc, trace = 0)
# Print the call and coefficients of the selected model.
model_step_improve_bc
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = db_improve_bc)
## 
## Coefficients:
##     (Intercept)        R.D.Spend  Marketing.Spend  
##       3.082e+05        8.066e+00        3.192e-01
# Shapiro-Wilk normality test on the residuals of the full Box-Cox model.
# NOTE(review): the model printed above is the stepwise one
# (model_step_improve_bc) -- confirm which model this check is meant for.
shapiro.test(model_full_improve_bc$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model_full_improve_bc$residuals
## W = 0.97641, p-value = 0.4122
# Studentized Breusch-Pagan test on the full Box-Cox model.
bptest(model_full_improve_bc)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_full_improve_bc
## BP = 1.5761, df = 5, p-value = 0.9041
# Variance inflation factors of the predictors in the full Box-Cox model.
vif(model_full_improve_bc)
##                     GVIF Df GVIF^(1/(2*Df))
## R.D.Spend       2.495511  1        1.579719
## Administration  1.177766  1        1.085249
## Marketing.Spend 2.416797  1        1.554605
## State           1.062673  2        1.015313

Conclusion

  1. The normality assumption is met after transforming the target variable with Box-Cox (Shapiro-Wilk p-value = 0.41 > 0.05).