# Load the startup data from the working directory.
dataset <- read.csv(file = "50_Startups.csv")

# Recode the State column as a factor: the three listed state names become
# the levels, in this order, labelled 1, 2 and 3.
dataset[["State"]] <- factor(
  x = dataset[["State"]],
  levels = c("New York", "California", "Florida"),
  labels = c(1, 2, 3)
)
library(caTools)
## Warning: package 'caTools' was built under R version 3.4.2
# Fix the RNG seed so the random partition below is reproducible.
set.seed(123)

# Logical mask over the rows of `dataset`, built from the Profit column with a
# split ratio of 0.8: TRUE marks a training row, FALSE a test row.
split <- sample.split(dataset$Profit, SplitRatio = 0.8)

# Partition the rows according to the mask.
training_set <- subset(dataset, subset = split)
test_set <- subset(dataset, subset = !split)
# Fit the multiple linear regression with all four predictors on the training
# split only. Fitting on the whole `dataset` would let the test rows leak into
# the model, so predictions made for `test_set` would no longer be an
# out-of-sample check.
# NOTE(review): any recorded console output that follows this call (model
# summary, test-set predictions) was produced by the earlier fit on the whole
# `dataset` and must be regenerated after this change.
regressor <- lm(
  formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
  data = training_set
)
summary(regressor)
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend +
## State, data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33504 -4736 90 6672 17338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.008e+04 6.953e+03 7.204 5.76e-09 ***
## R.D.Spend 8.060e-01 4.641e-02 17.369 < 2e-16 ***
## Administration -2.700e-02 5.223e-02 -0.517 0.608
## Marketing.Spend 2.698e-02 1.714e-02 1.574 0.123
## State2 4.189e+01 3.256e+03 0.013 0.990
## State3 2.407e+02 3.339e+03 0.072 0.943
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9439 on 44 degrees of freedom
## Multiple R-squared: 0.9508, Adjusted R-squared: 0.9452
## F-statistic: 169.9 on 5 and 44 DF, p-value: < 2.2e-16
# Predict Profit for the held-out rows with the full model, then display the
# predictions (named by the row labels of test_set).
y_pred <- predict(object = regressor, newdata = test_set)
print(y_pred)
## 4 5 8 11 16 20 21
## 173584.98 172277.13 160155.64 135664.64 146143.64 115594.19 116570.73
## 24 31 32
## 110123.80 99629.01 97617.30
# Backward elimination, step 1: refit on the training split without State and
# inspect the coefficient table.
regressor1 <- lm(
  formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
  data = training_set
)
summary(regressor1)
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
## data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33117 -4858 -36 6020 17957
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.970e+04 7.120e+03 6.980 3.48e-08 ***
## R.D.Spend 7.983e-01 5.356e-02 14.905 < 2e-16 ***
## Administration -2.895e-02 5.603e-02 -0.517 0.609
## Marketing.Spend 3.283e-02 1.987e-02 1.652 0.107
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9629 on 36 degrees of freedom
## Multiple R-squared: 0.9499, Adjusted R-squared: 0.9457
## F-statistic: 227.6 on 3 and 36 DF, p-value: < 2.2e-16
# Backward elimination, step 2: additionally drop Administration, leaving
# R.D.Spend and Marketing.Spend as the only predictors of Profit.
regressor2 <- lm(
  formula = Profit ~ R.D.Spend + Marketing.Spend,
  data = training_set
)
summary(regressor2)
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33294 -4763 -354 6351 17693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.638e+04 3.019e+03 15.364 <2e-16 ***
## R.D.Spend 7.879e-01 4.916e-02 16.026 <2e-16 ***
## Marketing.Spend 3.538e-02 1.905e-02 1.857 0.0713 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9533 on 37 degrees of freedom
## Multiple R-squared: 0.9495, Adjusted R-squared: 0.9468
## F-statistic: 348.1 on 2 and 37 DF, p-value: < 2.2e-16
We keep R.D.Spend and Marketing.Spend in the final model, although the p-value of Marketing.Spend (0.0713) is slightly above the 0.05 significance level.