Importing the dataset

# Read the CSV at the given absolute path into the `dataset` data frame.
dataset <- read.csv(file = "C:/RClass/50_Startups.csv")
# View(dataset)

Encoding categorical data

# Recode State as a factor whose three levels are labelled "1", "2", "3"
# in the order New York, California, Florida.
dataset$State <- factor(
  x = dataset$State,
  levels = c("New York", "California", "Florida"),
  labels = 1:3
)

Splitting the dataset into the Training set and Test set

# install.packages('caTools')
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
# Fix the RNG seed so the split below is reproducible.
set.seed(2023)
# Logical mask over the rows of `dataset`: TRUE marks rows for training (80%).
split <- sample.split(Y = dataset$Profit, SplitRatio = 0.8)
training_set <- subset(dataset, subset = split)
test_set <- subset(dataset, subset = !split)

Feature Scaling

# training_set[,2:3] = scale(training_set[, 2:3])
# test_set[, 2:3] = scale(test_set[,2:3])

Fitting Multiple Linear Regression to the Training set

# Regress Profit on every other column of the training set.
regressor <- lm(
  formula = Profit ~ .,
  data = training_set
)
summary(regressor)
## 
## Call:
## lm(formula = Profit ~ ., data = training_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15608.9  -4633.5   -631.4   4026.7  14762.3 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.990e+04  5.764e+03   8.658 4.08e-10 ***
## R.D.Spend        7.883e-01  3.967e-02  19.872  < 2e-16 ***
## Administration  -4.592e-03  4.476e-02  -0.103    0.919    
## Marketing.Spend  2.255e-02  1.416e-02   1.592    0.121    
## State2           2.630e+03  2.866e+03   0.918    0.365    
## State3          -3.588e+02  2.907e+03  -0.123    0.903    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7491 on 34 degrees of freedom
## Multiple R-squared:  0.9675, Adjusted R-squared:  0.9627 
## F-statistic: 202.2 on 5 and 34 DF,  p-value: < 2.2e-16

Multiple Linear Regression -> Simple Linear Regression

# Refit on the training set with R.D.Spend as the only predictor.
regressor <- lm(
  formula = Profit ~ R.D.Spend,
  data = training_set
)
summary(regressor)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend, data = training_set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -16875  -4507   -845   5109  12720 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.171e+04  2.359e+03   21.93   <2e-16 ***
## R.D.Spend   8.302e-01  2.615e-02   31.75   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7487 on 38 degrees of freedom
## Multiple R-squared:  0.9637, Adjusted R-squared:  0.9627 
## F-statistic:  1008 on 1 and 38 DF,  p-value: < 2.2e-16

Predicting the Test set results

# Predict Profit for the held-out rows with the current `regressor`.
y_pred <- predict(object = regressor, newdata = test_set)
print(y_pred)
##        11        14        15        18        24        37        40        45 
## 136322.93 128086.73 151291.66 130299.03 107780.03  75510.98  83725.65  70126.25 
##        48        50 
##  51714.18  51714.18

Building the optimal model using Backward Elimination

# Manual backward elimination, first fit: all columns of the full dataset
# as predictors of Profit.
regressor <- lm(
  formula = Profit ~ .,
  data = dataset
)
summary(regressor)
## 
## Call:
## lm(formula = Profit ~ ., data = dataset)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33504  -4736     90   6672  17338 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.008e+04  6.953e+03   7.204 5.76e-09 ***
## R.D.Spend        8.060e-01  4.641e-02  17.369  < 2e-16 ***
## Administration  -2.700e-02  5.223e-02  -0.517    0.608    
## Marketing.Spend  2.698e-02  1.714e-02   1.574    0.123    
## State2           4.189e+01  3.256e+03   0.013    0.990    
## State3           2.407e+02  3.339e+03   0.072    0.943    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9439 on 44 degrees of freedom
## Multiple R-squared:  0.9508, Adjusted R-squared:  0.9452 
## F-statistic: 169.9 on 5 and 44 DF,  p-value: < 2.2e-16
# Fit on the full dataset using the three numeric predictors only
# (State is left out of the formula).
regressor <- lm(
  formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
  data = dataset
)
summary(regressor)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend, 
##     data = dataset)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33534  -4795     63   6606  17275 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.012e+04  6.572e+03   7.626 1.06e-09 ***
## R.D.Spend        8.057e-01  4.515e-02  17.846  < 2e-16 ***
## Administration  -2.682e-02  5.103e-02  -0.526    0.602    
## Marketing.Spend  2.723e-02  1.645e-02   1.655    0.105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared:  0.9507, Adjusted R-squared:  0.9475 
## F-statistic:   296 on 3 and 46 DF,  p-value: < 2.2e-16
# Fit on the full dataset with R.D.Spend and Marketing.Spend only
# (Administration is left out of the formula).
regressor <- lm(
  formula = Profit ~ R.D.Spend + Marketing.Spend,
  data = dataset
)
summary(regressor)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = dataset)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33645  -4632   -414   6484  17097 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.698e+04  2.690e+03  17.464   <2e-16 ***
## R.D.Spend       7.966e-01  4.135e-02  19.266   <2e-16 ***
## Marketing.Spend 2.991e-02  1.552e-02   1.927     0.06 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9161 on 47 degrees of freedom
## Multiple R-squared:  0.9505, Adjusted R-squared:  0.9483 
## F-statistic: 450.8 on 2 and 47 DF,  p-value: < 2.2e-16
# Fit on the full dataset with R.D.Spend as the single predictor.
regressor <- lm(
  formula = Profit ~ R.D.Spend,
  data = dataset
)
summary(regressor)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend, data = dataset)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -34351  -4626   -375   6249  17188 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.903e+04  2.538e+03   19.32   <2e-16 ***
## R.D.Spend   8.543e-01  2.931e-02   29.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9416 on 48 degrees of freedom
## Multiple R-squared:  0.9465, Adjusted R-squared:  0.9454 
## F-statistic: 849.8 on 1 and 48 DF,  p-value: < 2.2e-16

Automatic implementation of Backward Elimination

#' Backward elimination for a linear model of `Profit`
#'
#' Repeatedly fits `Profit ~ .` on `x`, looks up the non-intercept
#' coefficient with the largest `Pr(>|t|)` and, while that p-value is above
#' `sl`, removes the data column that coefficient belongs to. A factor column
#' contributes one coefficient per dummy level; if any of them is the worst
#' coefficient, the whole factor column is removed.
#'
#' @param x A data frame with a `Profit` column plus candidate predictors.
#' @param sl Significance level; elimination stops once the largest p-value
#'   is less than or equal to it.
#' @return The `summary()` of the last fitted `lm`. If every predictor is
#'   eliminated this is the summary of the intercept-only model `Profit ~ 1`.
backwardElimination <- function(x, sl) {
  stopifnot(is.data.frame(x), "Profit" %in% names(x))

  repeat {
    # `Profit ~ .` needs at least one other column; fall back to the
    # intercept-only model once all predictors are gone.
    if (ncol(x) < 2L) {
      regressor <- lm(formula = Profit ~ 1, data = x)
      break
    }

    regressor <- lm(formula = Profit ~ ., data = x)
    coefs <- coef(summary(regressor))
    p_values <- setNames(coefs[, "Pr(>|t|)"], rownames(coefs))

    # Map each coefficient name to the index of the model term that produced
    # it (0 is the intercept). Coefficient positions do not line up with data
    # columns when factors expand to several dummies, so match by name.
    design <- model.matrix(regressor)
    term_index <- setNames(attr(design, "assign"), colnames(design))
    term_labels <- attr(terms(regressor), "term.labels")

    p_values <- p_values[term_index[names(p_values)] > 0]
    worst <- which.max(p_values) # ignores NA; integer(0) if nothing usable
    if (length(worst) == 0L || p_values[[worst]] <= sl) {
      break
    }

    # Term labels of non-syntactic column names are back-quoted; strip the
    # quotes to recover the actual column name.
    drop_col <- gsub(
      "`", "", term_labels[term_index[[names(worst)]]],
      fixed = TRUE
    )
    if (!drop_col %in% names(x)) {
      stop("cannot map term '", drop_col, "' to a column of `x`",
           call. = FALSE)
    }
    # drop = FALSE keeps `x` a data frame even when one column remains.
    x <- x[, setdiff(names(x), drop_col), drop = FALSE]
  }

  summary(regressor)
}
  
  # Significance threshold handed to backwardElimination().
  SL <- 0.05
  # Keep columns 1 through 5 of `dataset`.
  dataset <- dataset[, 1:5]
  # Note: the elimination is run on `training_set`, not on `dataset`.
  backwardElimination(x = training_set, sl = SL)
## 
## Call:
## lm(formula = Profit ~ ., data = x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -16875  -4507   -845   5109  12720 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.171e+04  2.359e+03   21.93   <2e-16 ***
## R.D.Spend   8.302e-01  2.615e-02   31.75   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7487 on 38 degrees of freedom
## Multiple R-squared:  0.9637, Adjusted R-squared:  0.9627 
## F-statistic:  1008 on 1 and 38 DF,  p-value: < 2.2e-16