Objective:

The primary objective of this analysis was to develop a linear regression model to predict the fuel efficiency (miles per gallon, MPG) of cars using the mtcars dataset. We aimed to identify key predictors and evaluate the model’s performance in accurately estimating MPG.

Achievements:

Data Exploration and Variable Selection:

The mtcars dataset (32 observations of 11 numeric variables) was inspected with str() and summary(), and vehicle weight (wt) was chosen as the initial predictor on the assumption that heavier cars achieve lower mileage.

Model Development and Evaluation (Weight Only):

A simple linear regression of MPG on weight explained about 73% of the variance in the training set (adjusted R-squared 0.72) and reached an R-squared of roughly 0.78 on the test set, with an RMSE of about 3.30 MPG and an MAE of about 2.53 MPG.

Model Improvement with Backward Elimination:

AIC-based backward elimination with step(), starting from a model containing all ten candidate predictors, retained only weight (wt) and number of cylinders (cyl).

Enhanced Model Performance:

The two-predictor model raised the training adjusted R-squared to 0.82 and the test R-squared to about 0.80, with a test RMSE of about 3.20 MPG.

Visualizations:

Scatter plots with regression lines were generated to visualize the relationship between the predictors (‘weight’ and ‘number of cylinders’) and the target variable (‘MPG’) for both the training and testing sets.

Conclusion:

The analysis successfully developed and evaluated linear regression models for predicting car fuel efficiency. The final model, incorporating ‘weight’ and ‘number of cylinders’, demonstrated strong predictive capabilities, as evidenced by high R-squared values and low error metrics. These results indicate that vehicle weight and the number of cylinders are significant factors in determining fuel efficiency.

A more detailed explanation of the methodology, including code and visualizations, can be found below.

library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caTools)
library(ggplot2)
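
The masking messages above are expected when plotly and dplyr are attached. If a quieter report is preferred, the same packages could be loaded with suppressPackageStartupMessages(); a minimal sketch:

# Optional: attach the same packages without the startup/masking messages
suppressPackageStartupMessages({
  library(plotly)
  library(dplyr)
  library(caTools)
  library(ggplot2)
})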

1. Data Exploration

mydata <- mtcars
str(mydata)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
summary(mydata)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

2. Variable Selection

Vehicle weight (wt) is selected as the independent variable, based on the assumption that increased vehicle weight is associated with decreased fuel efficiency (mileage).
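
This assumption can be checked quickly against the data; the following sketch (not part of the original output) lists the correlation of each variable with mpg:

# Correlation of every variable with mpg; wt and cyl show the strongest
# negative associations (roughly -0.87 and -0.85, respectively)
sort(cor(mydata)[, "mpg"])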

3. Data Split

set.seed(123)
split = sample.split(mydata$mpg, SplitRatio = 0.8)
training_set = subset(mydata, split == TRUE)
testing_set = subset(mydata, split == FALSE)
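
sample.split() from caTools selects roughly 80% of the rows for training. As a sanity check (an added sketch, not in the original output), the resulting set sizes can be printed:

# With set.seed(123) this yields 25 training and 7 testing rows,
# consistent with the 23 residual degrees of freedom reported below
nrow(training_set)
nrow(testing_set)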

4. Linear Regression Model

regressor = lm(formula = mpg ~ wt, data = training_set)
summary(regressor)
## 
## Call:
## lm(formula = mpg ~ wt, data = training_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5704 -2.1247  0.1817  1.7318  7.1721 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.2449     2.0934   17.31 1.09e-14 ***
## wt           -5.0077     0.6379   -7.85 5.93e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.99 on 23 degrees of freedom
## Multiple R-squared:  0.7282, Adjusted R-squared:  0.7164 
## F-statistic: 61.62 on 1 and 23 DF,  p-value: 5.926e-08
y_pred <- predict(regressor, newdata = testing_set)
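
The fitted equation can be read directly from the coefficients above: predicted MPG is roughly 36.24 minus 5.01 times weight (in 1000 lbs). A small sketch showing the same prediction done by hand for one hypothetical car:

# Manual prediction from the estimated coefficients, e.g. for a 3,000 lb car
coefs <- coef(regressor)
coefs["(Intercept)"] + coefs["wt"] * 3.0   # roughly 36.24 - 5.01 * 3 = 21.2 mpg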

5. Visualization

Visualizing the training set results

ggplot() +
  geom_point(aes(x = training_set$wt, y = training_set$mpg),
             color = 'blue') +
  geom_line(aes(x = training_set$wt,
                y = predict(regressor, newdata = training_set)),
            color = 'red') +
  ggtitle('Weight vs Mileage (Training Set)') +
  xlab('Weight') +
  ylab('Mileage') +
  theme(plot.title = element_text(hjust = 0.5))

Visualizing the testing set results

ggplot() +
  geom_point(aes(x = testing_set$wt, y = testing_set$mpg),
             color = 'blue') +
  geom_line(aes(x = testing_set$wt,
                y = y_pred),
            color = 'red') +
  ggtitle('Weight vs Mileage (Testing Set)') +
  xlab('Weight') +
  ylab('Mileage') +
  theme(plot.title = element_text(hjust = 0.5))
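
Although plotly is attached at the top of the document, the plots above are static ggplot objects. If interactivity is desired, any of them could be wrapped with ggplotly(); a minimal sketch for the training-set plot:

# Interactive version of the training-set plot (hover shows wt and mpg)
p <- ggplot(training_set, aes(x = wt, y = mpg)) +
  geom_point(color = 'blue') +
  geom_line(aes(y = predict(regressor, newdata = training_set)), color = 'red') +
  ggtitle('Weight vs Mileage (Training Set)') +
  xlab('Weight') + ylab('Mileage')
ggplotly(p)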

6. Model Evaluation

Predict on test data

predicted <- predict(regressor, newdata = testing_set)

Calculate R-squared

tss <- sum((testing_set$mpg - mean(testing_set$mpg))^2)
rss <- sum((testing_set$mpg - predicted)^2)
rsq_test <- 1 - (rss / tss)

Calculate Mean Squared Error

mse <- mean((testing_set$mpg - predicted)^2)

Calculate Root Mean Squared Error

rmse <- sqrt(mean((testing_set$mpg - predicted)^2))

Calculate Mean Absolute Error

mae <- mean(abs(testing_set$mpg - predicted))

Print the metrics

cat("Mean Squared Error (Test Set):", mse, "\n")
## Mean Squared Error (Test Set): 10.87982
cat("Root Mean Squared Error (Test Set):", rmse, "\n")
## Root Mean Squared Error (Test Set): 3.298458
cat("Mean Absolute Error (Test Set):", mae, "\n")
## Mean Absolute Error (Test Set): 2.529128
cat("R-squared (Test Set):", rsq_test, "\n")
## R-squared (Test Set): 0.7829351

Interpretation:

The weight-only model explains about 78% of the variance in test-set MPG (R-squared = 0.78). On average, its predictions are off by roughly 2.5 MPG (MAE), with an RMSE of about 3.3 MPG, confirming that weight alone is a strong predictor of fuel efficiency.
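
Since the same four metrics are recomputed later for the improved model, they could be wrapped in a small helper; the function below (evaluate_on_test is our own name, not part of the original code) is a sketch of that refactoring:

# Compute test-set R-squared, MSE, RMSE and MAE for any fitted lm
evaluate_on_test <- function(model, data, response = "mpg") {
  pred   <- predict(model, newdata = data)
  actual <- data[[response]]
  rss    <- sum((actual - pred)^2)
  tss    <- sum((actual - mean(actual))^2)
  c(r_squared = 1 - rss / tss,
    mse  = mean((actual - pred)^2),
    rmse = sqrt(mean((actual - pred)^2)),
    mae  = mean(abs(actual - pred)))
}
evaluate_on_test(regressor, testing_set)   # should match the values printed above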

7. Using Backward Elimination

Backward elimination starts from a model containing all available predictors and repeatedly removes the variable whose removal yields the lowest AIC, stopping when no removal improves on the current model.

# Fit a model with all variables
initial_model <- lm(mpg ~ ., data = training_set)


# Perform backward elimination
final_model_backward <- step(initial_model, direction = "backward")
## Start:  AIC=56.82
## mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb
## 
##        Df Sum of Sq    RSS    AIC
## - disp  1    0.0150 100.69 54.828
## - qsec  1    0.4319 101.10 54.932
## - gear  1    0.6408 101.31 54.983
## - wt    1    1.5158 102.19 55.198
## - cyl   1    1.6793 102.35 55.238
## - vs    1    2.6000 103.27 55.462
## - am    1    2.7790 103.45 55.505
## - carb  1    3.3372 104.01 55.640
## - drat  1    3.8712 104.54 55.768
## - hp    1    5.9231 106.59 56.254
## <none>              100.67 56.825
## 
## Step:  AIC=54.83
## mpg ~ cyl + hp + drat + wt + qsec + vs + am + gear + carb
## 
##        Df Sum of Sq    RSS    AIC
## - qsec  1    0.5666 101.25 52.969
## - gear  1    0.6381 101.33 52.986
## - cyl   1    1.6781 102.36 53.242
## - vs    1    2.6070 103.29 53.467
## - am    1    2.7678 103.45 53.506
## - wt    1    2.9468 103.63 53.550
## - drat  1    4.1180 104.80 53.830
## - carb  1    6.0651 106.75 54.291
## - hp    1    7.3001 107.99 54.578
## <none>              100.69 54.828
## 
## Step:  AIC=52.97
## mpg ~ cyl + hp + drat + wt + vs + am + gear + carb
## 
##        Df Sum of Sq    RSS    AIC
## - gear  1    0.8785 102.13 51.185
## - cyl   1    1.1684 102.42 51.256
## - vs    1    2.0842 103.34 51.478
## - drat  1    3.8254 105.08 51.896
## - am    1    4.3621 105.61 52.023
## - carb  1    5.5899 106.84 52.312
## - wt    1    6.0233 107.28 52.413
## - hp    1    6.7441 108.00 52.581
## <none>              101.25 52.969
## 
## Step:  AIC=51.18
## mpg ~ cyl + hp + drat + wt + vs + am + carb
## 
##        Df Sum of Sq    RSS    AIC
## - vs    1    2.0121 104.14 49.672
## - cyl   1    2.6545 104.79 49.826
## - drat  1    4.3724 106.50 50.233
## - carb  1    4.7153 106.85 50.313
## - hp    1    5.8663 108.00 50.581
## - am    1    6.9005 109.03 50.819
## - wt    1    7.4107 109.54 50.936
## <none>              102.13 51.185
## 
## Step:  AIC=49.67
## mpg ~ cyl + hp + drat + wt + am + carb
## 
##        Df Sum of Sq    RSS    AIC
## - drat  1    4.3834 108.53 48.703
## - am    1    5.1705 109.31 48.884
## - hp    1    5.3006 109.44 48.913
## - carb  1    7.2183 111.36 49.348
## - wt    1    7.8327 111.98 49.485
## <none>              104.14 49.672
## - cyl   1    8.9229 113.07 49.728
## 
## Step:  AIC=48.7
## mpg ~ cyl + hp + wt + am + carb
## 
##        Df Sum of Sq    RSS    AIC
## - hp    1    4.5062 113.03 47.720
## - carb  1    4.8390 113.37 47.794
## <none>              108.53 48.703
## - wt    1    9.8095 118.34 48.866
## - am    1   10.4027 118.93 48.991
## - cyl   1   18.4275 126.95 50.624
## 
## Step:  AIC=47.72
## mpg ~ cyl + wt + am + carb
## 
##        Df Sum of Sq    RSS    AIC
## - am    1     8.665 121.70 47.567
## <none>              113.03 47.720
## - carb  1     9.728 122.76 47.784
## - wt    1    12.022 125.06 48.247
## - cyl   1    60.164 173.20 56.389
## 
## Step:  AIC=47.57
## mpg ~ cyl + wt + carb
## 
##        Df Sum of Sq    RSS    AIC
## - carb  1     3.297 125.00 46.235
## <none>              121.70 47.567
## - wt    1    55.082 176.78 54.901
## - cyl   1    62.395 184.09 55.914
## 
## Step:  AIC=46.24
## mpg ~ cyl + wt
## 
##        Df Sum of Sq    RSS    AIC
## <none>              125.00 46.235
## - wt    1    55.495 180.49 53.420
## - cyl   1    80.593 205.59 56.675
summary(final_model_backward)
## 
## Call:
## lm(formula = mpg ~ cyl + wt, data = training_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2135 -1.4294 -0.5492  1.4851  6.0051 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  38.7236     1.7941  21.584 2.69e-16 ***
## cyl          -1.6678     0.4428  -3.766  0.00106 ** 
## wt           -2.5716     0.8228  -3.125  0.00492 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.384 on 22 degrees of freedom
## Multiple R-squared:  0.8348, Adjusted R-squared:  0.8197 
## F-statistic: 55.57 on 2 and 22 DF,  p-value: 2.509e-09
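
The selected formula and its AIC can also be pulled straight from the step() result, which is convenient before refitting; a brief sketch:

# Formula retained by backward elimination and its AIC
formula(final_model_backward)   # mpg ~ cyl + wt
AIC(final_model_backward)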
# Refit the selected model explicitly
# (cyl is already numeric in mtcars, so this conversion is a no-op safeguard)
training_set$cyl <- as.numeric(as.character(training_set$cyl))
testing_set$cyl <- as.numeric(as.character(testing_set$cyl))


regressor = lm(formula = mpg ~ cyl + wt, data = training_set)
summary(regressor)
## 
## Call:
## lm(formula = mpg ~ cyl + wt, data = training_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2135 -1.4294 -0.5492  1.4851  6.0051 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  38.7236     1.7941  21.584 2.69e-16 ***
## cyl          -1.6678     0.4428  -3.766  0.00106 ** 
## wt           -2.5716     0.8228  -3.125  0.00492 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.384 on 22 degrees of freedom
## Multiple R-squared:  0.8348, Adjusted R-squared:  0.8197 
## F-statistic: 55.57 on 2 and 22 DF,  p-value: 2.509e-09
y_pred <- predict(regressor, newdata = testing_set)

8. Visualization

The fitted values now depend on both weight and number of cylinders, so when they are plotted against a single predictor the red 'line' no longer traces a simple straight-line relationship.

Visualizing the training set results for ‘Number of Cylinders’ vs ‘Mileage’

ggplot() +
  geom_point(aes(x = training_set$cyl, y = training_set$mpg),
             color = 'blue') +
  geom_line(aes(x = training_set$cyl,
                y = predict(regressor, newdata = training_set)),
            color = 'red') +
  ggtitle('Number of Cylinders vs Mileage (Training Set)') +
  xlab('Number of Cylinders') +
  ylab('Mileage') +
  theme(plot.title = element_text(hjust = 0.5))

Visualizing the testing set results for ‘Number of Cylinders’ vs ‘Mileage’

ggplot() +
  geom_point(aes(x = testing_set$cyl, y = testing_set$mpg),
             color = 'blue') +
  geom_line(aes(x = testing_set$cyl,
                y = y_pred),
            color = 'red') +
  ggtitle('Number of Cylinders vs Mileage (Testing Set)') +
  xlab('Number of Cylinders') +
  ylab('Mileage') +
  theme(plot.title = element_text(hjust = 0.5))

Visualizing the training set results for ‘Weight’ vs ‘Mileage’

ggplot() +
  geom_point(aes(x = training_set$wt, y = training_set$mpg),
             color = 'blue') +
  geom_line(aes(x = training_set$wt,
                y = predict(regressor, newdata = training_set)),
            color = 'red') +
  ggtitle('Weight vs Mileage (Training Set)') +
  xlab('Weight') +
  ylab('Mileage') +
  theme(plot.title = element_text(hjust = 0.5))

Visualizing the testing set results for ‘Weight’ vs ‘Mileage’

ggplot() +
  geom_point(aes(x = testing_set$wt, y = testing_set$mpg),
             color = 'blue') +
  geom_line(aes(x = testing_set$wt,
                y = y_pred),
            color = 'red') +
  ggtitle('Weight vs Mileage (Testing Set)') +
  xlab('Weight') +
  ylab('Mileage') +
  theme(plot.title = element_text(hjust = 0.5))
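
Because the refitted model uses two predictors, a predicted-versus-actual plot summarizes its test-set fit in a single panel; the sketch below (not in the original report) plots y_pred against the observed MPG:

# Predicted vs actual MPG on the test set; points near the dashed
# 45-degree line indicate accurate predictions
ggplot(data.frame(actual = testing_set$mpg, predicted = y_pred),
       aes(x = actual, y = predicted)) +
  geom_point(color = 'blue') +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', color = 'red') +
  ggtitle('Predicted vs Actual Mileage (Testing Set)') +
  xlab('Actual MPG') + ylab('Predicted MPG') +
  theme(plot.title = element_text(hjust = 0.5))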

9. Model Evaluation

Predict on test data

predicted <- predict(regressor, newdata = testing_set)

Calculate R-squared

tss <- sum((testing_set$mpg - mean(testing_set$mpg))^2)
rss <- sum((testing_set$mpg - predicted)^2)
rsq_test <- 1 - (rss / tss)

Calculate Mean Squared Error

mse <- mean((testing_set$mpg - predicted)^2)

Calculate Root Mean Squared Error

rmse <- sqrt(mean((testing_set$mpg - predicted)^2))

Calculate Mean Absolute Error

mae <- mean(abs(testing_set$mpg - predicted))

Print the metrics

cat("Mean Squared Error (Test Set):", mse, "\n")
## Mean Squared Error (Test Set): 10.24904
cat("Root Mean Squared Error (Test Set):", rmse, "\n")
## Root Mean Squared Error (Test Set): 3.201412
cat("Mean Absolute Error (Test Set):", mae, "\n")
## Mean Absolute Error (Test Set): 2.677928
cat("R-squared (Test Set):", rsq_test, "\n")
## R-squared (Test Set): 0.79552

Interpretation:

Adding the number of cylinders improves the fit: the test-set R-squared rises from about 0.78 to 0.80 and the RMSE falls from 3.30 to 3.20 MPG, although the MAE increases slightly (from 2.53 to 2.68 MPG). Both coefficients are negative and statistically significant, indicating that heavier vehicles and vehicles with more cylinders tend to have lower fuel efficiency.
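
As a final check (not part of the original analysis), the weight-only model and the weight-plus-cylinders model are nested, so they can be compared formally with an F-test:

# Nested model comparison on the training set; a small p-value indicates
# that adding cyl significantly improves on the weight-only model
model_wt     <- lm(mpg ~ wt,       data = training_set)
model_wt_cyl <- lm(mpg ~ cyl + wt, data = training_set)
anova(model_wt, model_wt_cyl)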