1. Introduction The mtcars dataset is a classic dataset from the 1974 Motor Trend US magazine. It comprises fuel consumption and 10 aspects of automobile design and performance for 32 automobiles (1973–74 models).
# Bring the mtcars data set into the workspace and preview its first six rows.
data(list = "mtcars")
head(mtcars, n = 6L)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
  2. Data Summary and Structure
# Print the compact structure of mtcars: its dimensions, then each column's
# type and leading values.
str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

We observe variables like mpg (miles per gallon), wt (weight), hp (horsepower), and cyl (number of cylinders), which will be key in our regression modeling.

# Per-column summary of mtcars (minimum, quartiles, median, mean, maximum).
summary(mtcars)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000
  3. Descriptive Statistics
# One-row table of headline statistics: mean mpg, median hp,
# standard deviation of wt, and the largest qsec value.
summarise(
  mtcars,
  mean_mpg = mean(mpg),
  median_hp = median(hp),
  sd_wt = sd(wt),
  max_qsec = max(qsec)
)
##   mean_mpg median_hp     sd_wt max_qsec
## 1 20.09062       123 0.9784574     22.9
  4. Exploratory Data Visualization
# Histogram: distribution of mpg in bins two units wide.
ggplot(data = mtcars, mapping = aes(x = mpg)) +
  geom_histogram(
    binwidth = 2,
    fill = "skyblue",
    color = "black"
  ) +
  labs(title = "Distribution of MPG", x = "Miles Per Gallon")

# Boxplot: mpg split by cylinder count (cyl is numeric, so it is converted
# to a factor to get one box per distinct value).
ggplot(data = mtcars, mapping = aes(x = factor(cyl), y = mpg)) +
  geom_boxplot(fill = "orange") +
  labs(title = "MPG by Number of Cylinders", x = "Cylinders")

# Scatter plot: mpg against weight, overlaid with a fitted straight line
# and its confidence band.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  labs(title = "MPG vs Weight")

  5. Correlation Matrix
# Pairwise correlations between all mtcars columns, rounded to two decimals.
cor_matrix <- round(cor(mtcars), digits = 2)

# Plot only the upper triangle, drawing each correlation as a circle and
# shrinking the text labels to 80% size.
corrplot(
  cor_matrix,
  method = "circle",
  type = "upper",
  tl.cex = 0.8
)

mpg is negatively correlated with wt and hp.

  6. Simple Linear Regression
# Regress mpg on weight alone, then print the coefficient table and fit
# statistics.
model_simple <- lm(formula = mpg ~ wt, data = mtcars)
summary(object = model_simple)
## 
## Call:
## lm(formula = mpg ~ wt, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5432 -2.3647 -0.1252  1.4096  6.8727 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.2851     1.8776  19.858  < 2e-16 ***
## wt           -5.3445     0.5591  -9.559 1.29e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared:  0.7528, Adjusted R-squared:  0.7446 
## F-statistic: 91.38 on 1 and 30 DF,  p-value: 1.294e-10

Diagnostics

# Draw the lm diagnostic plots for model_simple in a 2 x 2 grid.
# par() returns the settings it replaced, so keep them and restore them once
# the plots are drawn; otherwise the 2 x 2 layout stays active on the device
# and every later plot in the script is squeezed into a quarter panel.
old_par <- par(mfrow = c(2, 2))
plot(model_simple)
par(old_par)

  7. Multiple Linear Regression
# Regress mpg on weight, horsepower, quarter-mile time and rear-axle ratio,
# then print the coefficient table and fit statistics.
model_multi <- lm(
  formula = mpg ~ wt + hp + qsec + drat,
  data = mtcars
)
summary(object = model_multi)
## 
## Call:
## lm(formula = mpg ~ wt + hp + qsec + drat, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5775 -1.6626 -0.3417  1.1317  5.4422 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 19.25970   10.31545   1.867 0.072785 .  
## wt          -3.70773    0.88227  -4.202 0.000259 ***
## hp          -0.01784    0.01476  -1.209 0.237319    
## qsec         0.52754    0.43285   1.219 0.233470    
## drat         1.65710    1.21697   1.362 0.184561    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.539 on 27 degrees of freedom
## Multiple R-squared:  0.8454, Adjusted R-squared:  0.8225 
## F-statistic: 36.91 on 4 and 27 DF,  p-value: 1.408e-10

Multicollinearity

# Variance inflation factor for each predictor in model_multi, used here as
# the multicollinearity check.
# NOTE(review): vif() is not base R; presumably car::vif — confirm the package
# is attached earlier in the document.
vif(model_multi)
##       wt       hp     qsec     drat 
## 3.582683 4.921958 2.876115 2.035473

Assumption Checks

# Normality: Q-Q plot of the model residuals against a normal distribution,
# with a red reference line.
qqnorm(y = residuals(model_multi))
qqline(y = residuals(model_multi), col = "red")

# Homoscedasticity: third built-in lm diagnostic plot (scale-location).
plot(x = model_multi, which = 3)

# Independence: Durbin-Watson test for autocorrelation in the residuals.
# NOTE(review): the test depends on row order, and mtcars rows are individual
# car models rather than a time series — confirm this check is meaningful here.
dwtest(formula = model_multi)
## 
##  Durbin-Watson test
## 
## data:  model_multi
## DW = 1.7876, p-value = 0.1952
## alternative hypothesis: true autocorrelation is greater than 0
  8. Model Evaluation
# In-sample evaluation: predict() is called without new data, so both error
# metrics are computed on the same observations the model was fitted to.
actual <- mtcars[["mpg"]]
predicted <- predict(object = model_multi)

# Root mean squared error
rmse <- sqrt(mean((actual - predicted)^2))
# Mean absolute error
mae <- mean(abs(actual - predicted))

rmse
## [1] 2.332538
mae
## [1] 1.851651
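  9. Conclusion Weight is the strongest predictor of fuel efficiency. Horsepower is negatively correlated with mpg, but it is not statistically significant once weight, qsec, and drat are included in the model (p ≈ 0.24).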

The multiple regression model explains a large proportion of the variance in mpg (R² ≈ 0.85, adjusted R² ≈ 0.82), compared with R² ≈ 0.75 for the weight-only model.

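Model diagnostics suggest the assumptions are reasonably well met: all variance inflation factors are below 5, and the Durbin-Watson test shows no significant autocorrelation in the residuals (DW = 1.79, p ≈ 0.20).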

Recommendations: Because hp, qsec, and drat are not individually significant in the multiple regression, consider variable selection or regularization in future models.