Importing the dataset
# Load the startups CSV from its fixed local path into a data frame.
# The interactive viewer call is kept disabled.
dataset <- read.csv(file = "C:/RClass/50_Startups.csv")
# View(dataset)
Encoding categorical data
# Recode the State column as a factor: the three listed state names become
# the levels, labelled 1, 2 and 3 in that order.
dataset$State <- factor(
  x = dataset$State,
  levels = c("New York", "California", "Florida"),
  labels = c(1, 2, 3)
)
Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
# Seed the RNG so the split is reproducible, then flag 80% of the rows
# (split on Profit) for training and send the remaining rows to the test set.
set.seed(2023)
split <- sample.split(dataset$Profit, SplitRatio = 0.8)
# which() keeps only the rows whose flag is TRUE (resp. FALSE), skipping NA,
# which is what subset() does with its logical condition.
training_set <- dataset[which(split), , drop = FALSE]
test_set <- dataset[which(!split), , drop = FALSE]
Feature Scaling
# training_set[,2:3] = scale(training_set[, 2:3])
# test_set[, 2:3] = scale(test_set[,2:3])
Fitting Multiple Linear Regression to the Training set
# Regress Profit on every other column of the training set, then print the
# coefficient table and fit statistics.
regressor <- lm(
  formula = Profit ~ .,
  data = training_set
)
summary(regressor)
##
## Call:
## lm(formula = Profit ~ ., data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15608.9 -4633.5 -631.4 4026.7 14762.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.990e+04 5.764e+03 8.658 4.08e-10 ***
## R.D.Spend 7.883e-01 3.967e-02 19.872 < 2e-16 ***
## Administration -4.592e-03 4.476e-02 -0.103 0.919
## Marketing.Spend 2.255e-02 1.416e-02 1.592 0.121
## State2 2.630e+03 2.866e+03 0.918 0.365
## State3 -3.588e+02 2.907e+03 -0.123 0.903
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7491 on 34 degrees of freedom
## Multiple R-squared: 0.9675, Adjusted R-squared: 0.9627
## F-statistic: 202.2 on 5 and 34 DF, p-value: < 2.2e-16
Multiple Linear Regression -> Simple Linear Regression
# Reduce the training-set model to a single predictor, R.D.Spend, and print
# its summary.
regressor <- lm(
  formula = Profit ~ R.D.Spend,
  data = training_set
)
summary(regressor)
##
## Call:
## lm(formula = Profit ~ R.D.Spend, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16875 -4507 -845 5109 12720
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.171e+04 2.359e+03 21.93 <2e-16 ***
## R.D.Spend 8.302e-01 2.615e-02 31.75 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7487 on 38 degrees of freedom
## Multiple R-squared: 0.9637, Adjusted R-squared: 0.9627
## F-statistic: 1008 on 1 and 38 DF, p-value: < 2.2e-16
Predicting the Test set results
# Apply the current model to the held-out rows and print the predicted
# Profit values.
y_pred <- predict(object = regressor, newdata = test_set)
y_pred
## 11 14 15 18 24 37 40 45
## 136322.93 128086.73 151291.66 130299.03 107780.03 75510.98 83725.65 70126.25
## 48 50
## 51714.18 51714.18
Building the optimal model using Backward Elimination
# Manual backward elimination, step 1: start from the model with every
# predictor, fitted on the complete dataset rather than the training split.
regressor <- lm(
  formula = Profit ~ .,
  data = dataset
)
summary(regressor)
##
## Call:
## lm(formula = Profit ~ ., data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33504 -4736 90 6672 17338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.008e+04 6.953e+03 7.204 5.76e-09 ***
## R.D.Spend 8.060e-01 4.641e-02 17.369 < 2e-16 ***
## Administration -2.700e-02 5.223e-02 -0.517 0.608
## Marketing.Spend 2.698e-02 1.714e-02 1.574 0.123
## State2 4.189e+01 3.256e+03 0.013 0.990
## State3 2.407e+02 3.339e+03 0.072 0.943
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9439 on 44 degrees of freedom
## Multiple R-squared: 0.9508, Adjusted R-squared: 0.9452
## F-statistic: 169.9 on 5 and 44 DF, p-value: < 2.2e-16
# Manual backward elimination, step 2: refit on the complete dataset with
# State left out of the formula.
regressor <- lm(
  formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
  data = dataset
)
summary(regressor)
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
## data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33534 -4795 63 6606 17275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.012e+04 6.572e+03 7.626 1.06e-09 ***
## R.D.Spend 8.057e-01 4.515e-02 17.846 < 2e-16 ***
## Administration -2.682e-02 5.103e-02 -0.526 0.602
## Marketing.Spend 2.723e-02 1.645e-02 1.655 0.105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared: 0.9507, Adjusted R-squared: 0.9475
## F-statistic: 296 on 3 and 46 DF, p-value: < 2.2e-16
# Manual backward elimination, step 3: refit on the complete dataset with
# Administration also left out.
regressor <- lm(
  formula = Profit ~ R.D.Spend + Marketing.Spend,
  data = dataset
)
summary(regressor)
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33645 -4632 -414 6484 17097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.698e+04 2.690e+03 17.464 <2e-16 ***
## R.D.Spend 7.966e-01 4.135e-02 19.266 <2e-16 ***
## Marketing.Spend 2.991e-02 1.552e-02 1.927 0.06 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9161 on 47 degrees of freedom
## Multiple R-squared: 0.9505, Adjusted R-squared: 0.9483
## F-statistic: 450.8 on 2 and 47 DF, p-value: < 2.2e-16
# Manual backward elimination, step 4: refit on the complete dataset with
# R.D.Spend as the only remaining predictor.
regressor <- lm(
  formula = Profit ~ R.D.Spend,
  data = dataset
)
summary(regressor)
##
## Call:
## lm(formula = Profit ~ R.D.Spend, data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34351 -4626 -375 6249 17188
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.903e+04 2.538e+03 19.32 <2e-16 ***
## R.D.Spend 8.543e-01 2.931e-02 29.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9416 on 48 degrees of freedom
## Multiple R-squared: 0.9465, Adjusted R-squared: 0.9454
## F-statistic: 849.8 on 1 and 48 DF, p-value: < 2.2e-16
Automatic implementation of Backward Elimination
# Backward elimination for a linear model of Profit on all other columns of x.
#
# Repeatedly fits `Profit ~ .`; while the least significant term has a p-value
# above `sl`, the column behind that term is dropped and the model is refitted.
# Terms are tested with drop1(test = "F"), so a factor column is judged (and
# removed) as a whole; for a single numeric column this p-value is the same as
# the t-test p-value shown by summary().
#
# Args:
#   x:  data frame holding a `Profit` column plus the candidate predictors.
#   sl: significance level; a term is removed while its p-value exceeds it.
#
# Returns:
#   summary() of the final fitted model. If every predictor is eliminated,
#   this is the summary of the intercept-only model `Profit ~ 1`.
backwardElimination <- function(x, sl) {
  stopifnot(
    is.data.frame(x),
    "Profit" %in% names(x),
    is.numeric(sl),
    length(sl) == 1L
  )
  repeat {
    predictors <- setdiff(names(x), "Profit")
    if (length(predictors) == 0L) {
      # Nothing left to test: fall back to the intercept-only model instead of
      # handing lm() a data frame with no predictors.
      regressor <- lm(formula = Profit ~ 1, data = x)
      break
    }
    regressor <- lm(formula = Profit ~ ., data = x)
    # One row per term, in column order; the first row is the "<none>" baseline.
    p_values <- drop1(regressor, test = "F")[["Pr(>F)"]][-1L]
    # which.max() skips NA/NaN and returns a single index even on ties.
    worst <- which.max(p_values)
    if (length(worst) == 0L || p_values[worst] <= sl) {
      break
    }
    # Remove by name and keep the data frame shape even with one column left.
    x <- x[, names(x) != predictors[worst], drop = FALSE]
  }
  summary(regressor)
}
# Significance level passed to the elimination routine.
SL <- 0.05
# Keep the first five columns of dataset, in their existing order.
dataset <- dataset[, 1:5]
# NOTE(review): the automatic procedure is run on training_set, whereas the
# manual steps were fitted on the complete dataset - confirm this is intended.
backwardElimination(training_set, SL)
##
## Call:
## lm(formula = Profit ~ ., data = x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16875 -4507 -845 5109 12720
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.171e+04 2.359e+03 21.93 <2e-16 ***
## R.D.Spend 8.302e-01 2.615e-02 31.75 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7487 on 38 degrees of freedom
## Multiple R-squared: 0.9637, Adjusted R-squared: 0.9627
## F-statistic: 1008 on 1 and 38 DF, p-value: < 2.2e-16