1. Introduction

This report explores the variables influencing the price of cars using data from Kaggle. I conducted some exploratory data analysis in order to create a multiple linear regression model and to evaluate its performance.

2. Loading the data

# Packages used for reading the raw data
library(readr)
library(readxl)

# Read the car-price file from the Desktop; it is parsed as CSV even though
# the file name ends in .xls
CarPrice_Assignment_csv <- read_csv(file = "~/Desktop/CarPrice_Assignment.csv.xls")

# Column-wise overview: dimensions, column types and the first few values
glimpse(CarPrice_Assignment_csv)
## Rows: 205
## Columns: 26
## $ car_ID           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ symboling        <dbl> 3, 3, 1, 2, 2, 2, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 0, 0,…
## $ CarName          <chr> "alfa-romero giulia", "alfa-romero stelvio", "alfa-ro…
## $ fueltype         <chr> "gas", "gas", "gas", "gas", "gas", "gas", "gas", "gas…
## $ aspiration       <chr> "std", "std", "std", "std", "std", "std", "std", "std…
## $ doornumber       <chr> "two", "two", "two", "four", "four", "two", "four", "…
## $ carbody          <chr> "convertible", "convertible", "hatchback", "sedan", "…
## $ drivewheel       <chr> "rwd", "rwd", "rwd", "fwd", "4wd", "fwd", "fwd", "fwd…
## $ enginelocation   <chr> "front", "front", "front", "front", "front", "front",…
## $ wheelbase        <dbl> 88.6, 88.6, 94.5, 99.8, 99.4, 99.8, 105.8, 105.8, 105…
## $ carlength        <dbl> 168.8, 168.8, 171.2, 176.6, 176.6, 177.3, 192.7, 192.…
## $ carwidth         <dbl> 64.1, 64.1, 65.5, 66.2, 66.4, 66.3, 71.4, 71.4, 71.4,…
## $ carheight        <dbl> 48.8, 48.8, 52.4, 54.3, 54.3, 53.1, 55.7, 55.7, 55.9,…
## $ curbweight       <dbl> 2548, 2548, 2823, 2337, 2824, 2507, 2844, 2954, 3086,…
## $ enginetype       <chr> "dohc", "dohc", "ohcv", "ohc", "ohc", "ohc", "ohc", "…
## $ cylindernumber   <chr> "four", "four", "six", "four", "five", "five", "five"…
## $ enginesize       <dbl> 130, 130, 152, 109, 136, 136, 136, 136, 131, 131, 108…
## $ fuelsystem       <chr> "mpfi", "mpfi", "mpfi", "mpfi", "mpfi", "mpfi", "mpfi…
## $ boreratio        <dbl> 3.47, 3.47, 2.68, 3.19, 3.19, 3.19, 3.19, 3.19, 3.13,…
## $ stroke           <dbl> 2.68, 2.68, 3.47, 3.40, 3.40, 3.40, 3.40, 3.40, 3.40,…
## $ compressionratio <dbl> 9.00, 9.00, 9.00, 10.00, 8.00, 8.50, 8.50, 8.50, 8.30…
## $ horsepower       <dbl> 111, 111, 154, 102, 115, 110, 110, 110, 140, 160, 101…
## $ peakrpm          <dbl> 5000, 5000, 5000, 5500, 5500, 5500, 5500, 5500, 5500,…
## $ citympg          <dbl> 21, 21, 19, 24, 18, 19, 19, 19, 17, 16, 23, 23, 21, 2…
## $ highwaympg       <dbl> 27, 27, 26, 30, 22, 25, 25, 25, 20, 22, 29, 29, 28, 2…
## $ price            <dbl> 13495.00, 16500.00, 16500.00, 13950.00, 17450.00, 152…
# Print a label, then the first rows of the raw import
cat("\nFirst few rows of the dataset:\n")
## 
## First few rows of the dataset:
print(head(CarPrice_Assignment_csv))
## # A tibble: 6 × 26
##   car_ID symboling CarName     fueltype aspiration doornumber carbody drivewheel
##    <dbl>     <dbl> <chr>       <chr>    <chr>      <chr>      <chr>   <chr>     
## 1      1         3 alfa-romer… gas      std        two        conver… rwd       
## 2      2         3 alfa-romer… gas      std        two        conver… rwd       
## 3      3         1 alfa-romer… gas      std        two        hatchb… rwd       
## 4      4         2 audi 100 ls gas      std        four       sedan   fwd       
## 5      5         2 audi 100ls  gas      std        four       sedan   4wd       
## 6      6         2 audi fox    gas      std        two        sedan   fwd       
## # ℹ 18 more variables: enginelocation <chr>, wheelbase <dbl>, carlength <dbl>,
## #   carwidth <dbl>, carheight <dbl>, curbweight <dbl>, enginetype <chr>,
## #   cylindernumber <chr>, enginesize <dbl>, fuelsystem <chr>, boreratio <dbl>,
## #   stroke <dbl>, compressionratio <dbl>, horsepower <dbl>, peakrpm <dbl>,
## #   citympg <dbl>, highwaympg <dbl>, price <dbl>
# Working copy with standardised column names (e.g. car_ID -> car_id,
# CarName -> car_name); the raw import itself is left untouched
Car_Price_Data1 <- clean_names(CarPrice_Assignment_csv)

# Preview the renamed columns
head(Car_Price_Data1)
## # A tibble: 6 × 26
##   car_id symboling car_name    fueltype aspiration doornumber carbody drivewheel
##    <dbl>     <dbl> <chr>       <chr>    <chr>      <chr>      <chr>   <chr>     
## 1      1         3 alfa-romer… gas      std        two        conver… rwd       
## 2      2         3 alfa-romer… gas      std        two        conver… rwd       
## 3      3         1 alfa-romer… gas      std        two        hatchb… rwd       
## 4      4         2 audi 100 ls gas      std        four       sedan   fwd       
## 5      5         2 audi 100ls  gas      std        four       sedan   4wd       
## 6      6         2 audi fox    gas      std        two        sedan   fwd       
## # ℹ 18 more variables: enginelocation <chr>, wheelbase <dbl>, carlength <dbl>,
## #   carwidth <dbl>, carheight <dbl>, curbweight <dbl>, enginetype <chr>,
## #   cylindernumber <chr>, enginesize <dbl>, fuelsystem <chr>, boreratio <dbl>,
## #   stroke <dbl>, compressionratio <dbl>, horsepower <dbl>, peakrpm <dbl>,
## #   citympg <dbl>, highwaympg <dbl>, price <dbl>
## 3. Exploratory Data Analysis
### 3.1. Summary of statistics
# Per-column summaries: min/quartiles/mean/max for numeric columns,
# length/class/mode for character columns
summary(Car_Price_Data1)
##      car_id      symboling         car_name           fueltype        
##  Min.   :  1   Min.   :-2.0000   Length:205         Length:205        
##  1st Qu.: 52   1st Qu.: 0.0000   Class :character   Class :character  
##  Median :103   Median : 1.0000   Mode  :character   Mode  :character  
##  Mean   :103   Mean   : 0.8341                                        
##  3rd Qu.:154   3rd Qu.: 2.0000                                        
##  Max.   :205   Max.   : 3.0000                                        
##   aspiration         doornumber          carbody           drivewheel       
##  Length:205         Length:205         Length:205         Length:205        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  enginelocation       wheelbase        carlength        carwidth    
##  Length:205         Min.   : 86.60   Min.   :141.1   Min.   :60.30  
##  Class :character   1st Qu.: 94.50   1st Qu.:166.3   1st Qu.:64.10  
##  Mode  :character   Median : 97.00   Median :173.2   Median :65.50  
##                     Mean   : 98.76   Mean   :174.0   Mean   :65.91  
##                     3rd Qu.:102.40   3rd Qu.:183.1   3rd Qu.:66.90  
##                     Max.   :120.90   Max.   :208.1   Max.   :72.30  
##    carheight       curbweight    enginetype        cylindernumber    
##  Min.   :47.80   Min.   :1488   Length:205         Length:205        
##  1st Qu.:52.00   1st Qu.:2145   Class :character   Class :character  
##  Median :54.10   Median :2414   Mode  :character   Mode  :character  
##  Mean   :53.72   Mean   :2556                                        
##  3rd Qu.:55.50   3rd Qu.:2935                                        
##  Max.   :59.80   Max.   :4066                                        
##    enginesize     fuelsystem          boreratio        stroke     
##  Min.   : 61.0   Length:205         Min.   :2.54   Min.   :2.070  
##  1st Qu.: 97.0   Class :character   1st Qu.:3.15   1st Qu.:3.110  
##  Median :120.0   Mode  :character   Median :3.31   Median :3.290  
##  Mean   :126.9                      Mean   :3.33   Mean   :3.255  
##  3rd Qu.:141.0                      3rd Qu.:3.58   3rd Qu.:3.410  
##  Max.   :326.0                      Max.   :3.94   Max.   :4.170  
##  compressionratio   horsepower       peakrpm        citympg     
##  Min.   : 7.00    Min.   : 48.0   Min.   :4150   Min.   :13.00  
##  1st Qu.: 8.60    1st Qu.: 70.0   1st Qu.:4800   1st Qu.:19.00  
##  Median : 9.00    Median : 95.0   Median :5200   Median :24.00  
##  Mean   :10.14    Mean   :104.1   Mean   :5125   Mean   :25.22  
##  3rd Qu.: 9.40    3rd Qu.:116.0   3rd Qu.:5500   3rd Qu.:30.00  
##  Max.   :23.00    Max.   :288.0   Max.   :6600   Max.   :49.00  
##    highwaympg        price      
##  Min.   :16.00   Min.   : 5118  
##  1st Qu.:25.00   1st Qu.: 7788  
##  Median :30.00   Median :10295  
##  Mean   :30.75   Mean   :13277  
##  3rd Qu.:34.00   3rd Qu.:16503  
##  Max.   :54.00   Max.   :45400
# Clustered heatmap of the standardised numeric columns
library(pheatmap)

# Keep the numeric columns only, then centre and scale each column
numeric_cols <- Car_Price_Data1[, vapply(Car_Price_Data1, is.numeric, logical(1))]
scaled_data <- scale(numeric_cols)

# Row labels are hidden; blue-white-red palette with 50 steps
pheatmap(
  scaled_data,
  color = colorRampPalette(c("blue", "white", "red"))(50),
  clustering_method = "complete",
  show_rownames = FALSE,
  main = "Car Features Heatmap"
)

### 3.2. Distribution of the data

# One histogram per numeric column: reshape to long form (variable, value)
# and facet by variable, letting every panel use its own axis ranges
numeric_cols %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>%
  ggplot(mapping = aes(x = value)) +
  geom_histogram(fill = "steelblue", colour = "black", bins = 10) +
  labs(title = "Histogram of all numeric values") +
  facet_wrap(~ variable, scales = "free")

# Convert the listed categorical columns from character to factor in one pass.
# NOTE(review): enginetype, cylindernumber and fuelsystem are not in this list
# and therefore stay character after this step.
Car_Price_Data1 <- Car_Price_Data1 %>%
  mutate(across(
    c(fueltype, aspiration, doornumber, enginelocation, drivewheel, carbody),
    as.factor
  ))

3.3. Creating a Correlation Matrix

# Numeric predictors only; price (the response) is excluded
numeric_vars <- select(Car_Price_Data1, where(is.numeric), -price)

# Pairwise correlations (cor() default method), computed on rows without NAs
corr_matrix <- cor(numeric_vars, use = "complete.obs")
print(round(corr_matrix, digits = 2))
##                  car_id symboling wheelbase carlength carwidth carheight
## car_id             1.00     -0.15      0.13      0.17     0.05      0.26
## symboling         -0.15      1.00     -0.53     -0.36    -0.23     -0.54
## wheelbase          0.13     -0.53      1.00      0.87     0.80      0.59
## carlength          0.17     -0.36      0.87      1.00     0.84      0.49
## carwidth           0.05     -0.23      0.80      0.84     1.00      0.28
## carheight          0.26     -0.54      0.59      0.49     0.28      1.00
## curbweight         0.07     -0.23      0.78      0.88     0.87      0.30
## enginesize        -0.03     -0.11      0.57      0.68     0.74      0.07
## boreratio          0.26     -0.13      0.49      0.61     0.56      0.17
## stroke            -0.16     -0.01      0.16      0.13     0.18     -0.06
## compressionratio   0.15     -0.18      0.25      0.16     0.18      0.26
## horsepower        -0.02      0.07      0.35      0.55     0.64     -0.11
## peakrpm           -0.20      0.27     -0.36     -0.29    -0.22     -0.32
## citympg            0.02     -0.04     -0.47     -0.67    -0.64     -0.05
## highwaympg         0.01      0.03     -0.54     -0.70    -0.68     -0.11
##                  curbweight enginesize boreratio stroke compressionratio
## car_id                 0.07      -0.03      0.26  -0.16             0.15
## symboling             -0.23      -0.11     -0.13  -0.01            -0.18
## wheelbase              0.78       0.57      0.49   0.16             0.25
## carlength              0.88       0.68      0.61   0.13             0.16
## carwidth               0.87       0.74      0.56   0.18             0.18
## carheight              0.30       0.07      0.17  -0.06             0.26
## curbweight             1.00       0.85      0.65   0.17             0.15
## enginesize             0.85       1.00      0.58   0.20             0.03
## boreratio              0.65       0.58      1.00  -0.06             0.01
## stroke                 0.17       0.20     -0.06   1.00             0.19
## compressionratio       0.15       0.03      0.01   0.19             1.00
## horsepower             0.75       0.81      0.57   0.08            -0.20
## peakrpm               -0.27      -0.24     -0.25  -0.07            -0.44
## citympg               -0.76      -0.65     -0.58  -0.04             0.32
## highwaympg            -0.80      -0.68     -0.59  -0.04             0.27
##                  horsepower peakrpm citympg highwaympg
## car_id                -0.02   -0.20    0.02       0.01
## symboling              0.07    0.27   -0.04       0.03
## wheelbase              0.35   -0.36   -0.47      -0.54
## carlength              0.55   -0.29   -0.67      -0.70
## carwidth               0.64   -0.22   -0.64      -0.68
## carheight             -0.11   -0.32   -0.05      -0.11
## curbweight             0.75   -0.27   -0.76      -0.80
## enginesize             0.81   -0.24   -0.65      -0.68
## boreratio              0.57   -0.25   -0.58      -0.59
## stroke                 0.08   -0.07   -0.04      -0.04
## compressionratio      -0.20   -0.44    0.32       0.27
## horsepower             1.00    0.13   -0.80      -0.77
## peakrpm                0.13    1.00   -0.11      -0.05
## citympg               -0.80   -0.11    1.00       0.97
## highwaympg            -0.77   -0.05    0.97       1.00
# Colour-coded upper triangle of the predictor correlation matrix.
# The title names the predictors (price is not part of corr_matrix).
corrplot(
  corr_matrix,
  method = "color",
  type = "upper",
  tl.cex = 0.7,
  tl.col = "black",
  title = "Correlation Matrix of Numeric Predictors",
  mar = c(0, 0, 2, 0)
)

3.4. Detecting Multicollinearity

# Locate predictor pairs with |correlation| > 0.8. Only the upper triangle is
# scanned, so each pair is found once and the diagonal is ignored.
high_cor <- which(abs(corr_matrix) > 0.8 & upper.tri(corr_matrix), arr.ind = TRUE)

if (nrow(high_cor) == 0) {
  cat("No highly correlated pairs found.\n")
} else {
  # Column 1 of high_cor holds the row index, column 2 the column index
  for (pair in seq_len(nrow(high_cor))) {
    cat(
      "High Correlation Between",
      rownames(corr_matrix)[high_cor[pair, 1]], "and",
      colnames(corr_matrix)[high_cor[pair, 2]], ":",
      corr_matrix[high_cor[pair, 1], high_cor[pair, 2]], "\n"
    )
  }
}
## High Correlation Between wheelbase and carlength : 0.8745875 
## High Correlation Between carlength and carwidth : 0.8411183 
## High Correlation Between carlength and curbweight : 0.8777285 
## High Correlation Between carwidth and curbweight : 0.8670325 
## High Correlation Between curbweight and enginesize : 0.8505941 
## High Correlation Between enginesize and horsepower : 0.8097687 
## High Correlation Between horsepower and citympg : -0.8014562 
## High Correlation Between citympg and highwaympg : 0.971337
# Names of variables involved in at least one highly correlated pair
vars_involved <- unique(c(
  rownames(corr_matrix)[high_cor[, 1]],
  colnames(corr_matrix)[high_cor[, 2]]
))

# Absolute correlation of each of those predictors with price, named by
# predictor. vapply() always returns a numeric vector, including when
# vars_involved is empty (sapply() would return a list there and break sort()).
cor_with_price <- vapply(
  vars_involved,
  function(v) {
    abs(cor(Car_Price_Data1[[v]], Car_Price_Data1$price, use = "complete.obs"))
  },
  numeric(1)
)

# Strongest association with price first
sort(cor_with_price, decreasing = TRUE)
## enginesize curbweight horsepower   carwidth highwaympg    citympg  carlength 
##  0.8741448  0.8353049  0.8081388  0.7593253  0.6975991  0.6857513  0.6829200 
##  wheelbase 
##  0.5778156
# Highlighting the variables that need to be dropped:
# for every highly correlated pair, drop the member that is less correlated
# with price (on a tie the second member of the pair is dropped).
# NOTE(review): pairs are judged independently, so a variable can be "kept" in
# one pair and dropped in another (see curbweight and horsepower below).
vars_to_drop <- character(nrow(high_cor))  # one slot per pair, filled below
for (i in seq_len(nrow(high_cor))) {       # seq_len() does nothing when there are no pairs
  v1 <- rownames(corr_matrix)[high_cor[i, 1]]
  v2 <- colnames(corr_matrix)[high_cor[i, 2]]

  if (cor_with_price[v1] < cor_with_price[v2]) {
    dropped <- v1
    kept <- v2
  } else {
    dropped <- v2
    kept <- v1
  }

  vars_to_drop[i] <- dropped
  cat("Drop", dropped, "(cor with price =", round(cor_with_price[dropped], 3),
      "), keep", kept, "(cor =", round(cor_with_price[kept], 3), ")\n")
}
## Drop wheelbase (cor with price = 0.578 ), keep carlength (cor = 0.683 )
## Drop carlength (cor with price = 0.683 ), keep carwidth (cor = 0.759 )
## Drop carlength (cor with price = 0.683 ), keep curbweight (cor = 0.835 )
## Drop carwidth (cor with price = 0.759 ), keep curbweight (cor = 0.835 )
## Drop curbweight (cor with price = 0.835 ), keep enginesize (cor = 0.874 )
## Drop horsepower (cor with price = 0.808 ), keep enginesize (cor = 0.874 )
## Drop citympg (cor with price = 0.686 ), keep horsepower (cor = 0.808 )
## Drop citympg (cor with price = 0.686 ), keep highwaympg (cor = 0.698 )
# A variable can lose in several pairs; keep each name once
vars_to_drop <- unique(vars_to_drop)
cat("\nVariables to drop:\n")
## 
## Variables to drop:
print(vars_to_drop)
## [1] "wheelbase"  "carlength"  "carwidth"   "curbweight" "horsepower"
## [6] "citympg"
# Drop the two identifier columns plus every predictor collected in
# vars_to_drop. Using the computed vector keeps this step in sync with the
# correlation screen; it currently holds wheelbase, carlength, carwidth,
# curbweight, horsepower and citympg.
Car_Price_Data1 <- Car_Price_Data1 %>%
  select(-car_id, -car_name, -all_of(vars_to_drop))

head(Car_Price_Data1)
## # A tibble: 6 × 18
##   symboling fueltype aspiration doornumber carbody     drivewheel enginelocation
##       <dbl> <fct>    <fct>      <fct>      <fct>       <fct>      <fct>         
## 1         3 gas      std        two        convertible rwd        front         
## 2         3 gas      std        two        convertible rwd        front         
## 3         1 gas      std        two        hatchback   rwd        front         
## 4         2 gas      std        four       sedan       fwd        front         
## 5         2 gas      std        four       sedan       4wd        front         
## 6         2 gas      std        two        sedan       fwd        front         
## # ℹ 11 more variables: carheight <dbl>, enginetype <chr>, cylindernumber <chr>,
## #   enginesize <dbl>, fuelsystem <chr>, boreratio <dbl>, stroke <dbl>,
## #   compressionratio <dbl>, peakrpm <dbl>, highwaympg <dbl>, price <dbl>
# Compact structure listing: type and leading values of every remaining column
str(Car_Price_Data1)
## tibble [205 × 18] (S3: tbl_df/tbl/data.frame)
##  $ symboling       : num [1:205] 3 3 1 2 2 2 1 1 1 0 ...
##  $ fueltype        : Factor w/ 2 levels "diesel","gas": 2 2 2 2 2 2 2 2 2 2 ...
##  $ aspiration      : Factor w/ 2 levels "std","turbo": 1 1 1 1 1 1 1 1 2 2 ...
##  $ doornumber      : Factor w/ 2 levels "four","two": 2 2 2 1 1 2 1 1 1 2 ...
##  $ carbody         : Factor w/ 5 levels "convertible",..: 1 1 3 4 4 4 4 5 4 3 ...
##  $ drivewheel      : Factor w/ 3 levels "4wd","fwd","rwd": 3 3 3 2 1 2 2 2 2 1 ...
##  $ enginelocation  : Factor w/ 2 levels "front","rear": 1 1 1 1 1 1 1 1 1 1 ...
##  $ carheight       : num [1:205] 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
##  $ enginetype      : chr [1:205] "dohc" "dohc" "ohcv" "ohc" ...
##  $ cylindernumber  : chr [1:205] "four" "four" "six" "four" ...
##  $ enginesize      : num [1:205] 130 130 152 109 136 136 136 136 131 131 ...
##  $ fuelsystem      : chr [1:205] "mpfi" "mpfi" "mpfi" "mpfi" ...
##  $ boreratio       : num [1:205] 3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
##  $ stroke          : num [1:205] 2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
##  $ compressionratio: num [1:205] 9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
##  $ peakrpm         : num [1:205] 5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
##  $ highwaympg      : num [1:205] 27 27 26 30 22 25 25 25 20 22 ...
##  $ price           : num [1:205] 13495 16500 16500 13950 17450 ...
# The same data in glimpse() form: one line per column with its type
glimpse(Car_Price_Data1)
## Rows: 205
## Columns: 18
## $ symboling        <dbl> 3, 3, 1, 2, 2, 2, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 0, 0,…
## $ fueltype         <fct> gas, gas, gas, gas, gas, gas, gas, gas, gas, gas, gas…
## $ aspiration       <fct> std, std, std, std, std, std, std, std, turbo, turbo,…
## $ doornumber       <fct> two, two, two, four, four, two, four, four, four, two…
## $ carbody          <fct> convertible, convertible, hatchback, sedan, sedan, se…
## $ drivewheel       <fct> rwd, rwd, rwd, fwd, 4wd, fwd, fwd, fwd, fwd, 4wd, rwd…
## $ enginelocation   <fct> front, front, front, front, front, front, front, fron…
## $ carheight        <dbl> 48.8, 48.8, 52.4, 54.3, 54.3, 53.1, 55.7, 55.7, 55.9,…
## $ enginetype       <chr> "dohc", "dohc", "ohcv", "ohc", "ohc", "ohc", "ohc", "…
## $ cylindernumber   <chr> "four", "four", "six", "four", "five", "five", "five"…
## $ enginesize       <dbl> 130, 130, 152, 109, 136, 136, 136, 136, 131, 131, 108…
## $ fuelsystem       <chr> "mpfi", "mpfi", "mpfi", "mpfi", "mpfi", "mpfi", "mpfi…
## $ boreratio        <dbl> 3.47, 3.47, 2.68, 3.19, 3.19, 3.19, 3.19, 3.19, 3.13,…
## $ stroke           <dbl> 2.68, 2.68, 3.47, 3.40, 3.40, 3.40, 3.40, 3.40, 3.40,…
## $ compressionratio <dbl> 9.00, 9.00, 9.00, 10.00, 8.00, 8.50, 8.50, 8.50, 8.30…
## $ peakrpm          <dbl> 5000, 5000, 5000, 5500, 5500, 5500, 5500, 5500, 5500,…
## $ highwaympg       <dbl> 27, 27, 26, 30, 22, 25, 25, 25, 20, 22, 29, 29, 28, 2…
## $ price            <dbl> 13495.00, 16500.00, 16500.00, 13950.00, 17450.00, 152…

3.5. Sorting out rare categorical values

Some of the categorical variables have levels with very few observations, which can create unstable estimates. Rare levels (those appearing fewer than five times) will be combined into a "rare" category using `fct_lump_min()`.

library(forcats)
library(dplyr)

# Columns that are already factors at this point. cylindernumber is still
# character here, so it is not in this list and is handled by name below.
factor_cols <- names(Car_Price_Data1)[vapply(Car_Price_Data1, is.factor, logical(1))]

# Combine rare levels (min frequency = 5) into "rare":
# cylindernumber first, then every column listed in factor_cols
Car_Price_Data1 <- Car_Price_Data1 %>%
  mutate(
    cylindernumber = fct_lump_min(cylindernumber, min = 5, other_level = "rare"),
    across(
      all_of(factor_cols),
      function(col) fct_lump_min(col, min = 5, other_level = "rare")
    )
  )

# Quick summary of changes
# NOTE: the count below is length(factor_cols); cylindernumber was lumped
# separately and is not part of factor_cols.
cat("Factor columns processed:", length(factor_cols), "\n")
## Factor columns processed: 6
cat("Example - cylinder levels:\n")
## Example - cylinder levels:
# Level frequencies of cylindernumber after lumping
print(table(Car_Price_Data1$cylindernumber))
## 
## eight  five  four   six  rare 
##     5    11   159    24     6

4. Building the model

4.1. Split the data

Now the data will be split into a training set and a testing set.

# Reproducible 70/30 split: sample the training row indices, everything else
# becomes the test set
set.seed(123)
n <- nrow(Car_Price_Data1)
# 0.7 * n need not be a whole number; floor() spells out the truncation
train_idx <- sample(seq_len(n), size = floor(0.7 * n))
train <- Car_Price_Data1[train_idx, ]
test <- Car_Price_Data1[-train_idx, ]

4.2. Initial Linear Model

# Baseline model: price regressed on every other column of the training set
initial_model <- lm(formula = price ~ ., data = train)
summary(initial_model)
## 
## Call:
## lm(formula = price ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5285.5  -989.3   -52.8   714.6  9643.6 
## 
## Coefficients: (2 not defined because of singularities)
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.796e+03  1.716e+04   0.396 0.692864    
## symboling          -3.338e+02  2.701e+02  -1.236 0.219041    
## fueltypegas        -5.958e+03  7.902e+03  -0.754 0.452469    
## aspirationturbo     3.442e+03  7.845e+02   4.387 2.65e-05 ***
## doornumbertwo      -2.298e+02  6.685e+02  -0.344 0.731743    
## carbodyhardtop     -3.626e+03  1.779e+03  -2.038 0.043982 *  
## carbodyhatchback   -3.550e+03  1.489e+03  -2.384 0.018862 *  
## carbodysedan       -3.280e+03  1.606e+03  -2.042 0.043560 *  
## carbodywagon       -3.698e+03  1.777e+03  -2.081 0.039765 *  
## drivewheelfwd       2.285e+02  1.229e+03   0.186 0.852881    
## drivewheelrwd       1.077e+03  1.644e+03   0.655 0.513966    
## enginelocationrare  4.877e+03  2.691e+03   1.813 0.072627 .  
## carheight           2.240e+02  1.554e+02   1.442 0.152186    
## enginetypedohcv    -3.950e+03  4.284e+03  -0.922 0.358493    
## enginetypel         1.469e+03  1.671e+03   0.879 0.381153    
## enginetypeohc       2.602e+03  1.164e+03   2.236 0.027383 *  
## enginetypeohcf      8.460e+02  2.023e+03   0.418 0.676535    
## enginetypeohcv     -6.704e+03  1.756e+03  -3.817 0.000224 ***
## enginetyperotor     8.026e+03  7.661e+03   1.048 0.297113    
## cylindernumberfive -9.722e+03  3.643e+03  -2.669 0.008764 ** 
## cylindernumberfour -1.342e+04  4.077e+03  -3.291 0.001342 ** 
## cylindernumbersix  -7.575e+03  2.663e+03  -2.845 0.005299 ** 
## cylindernumberrare -1.142e+04  4.071e+03  -2.804 0.005965 ** 
## enginesize          1.538e+02  2.337e+01   6.583 1.63e-09 ***
## fuelsystem2bbl      7.510e+02  1.013e+03   0.741 0.460095    
## fuelsystem4bbl             NA         NA      NA       NA    
## fuelsystemidi              NA         NA      NA       NA    
## fuelsystemmfi      -1.653e+03  2.810e+03  -0.588 0.557529    
## fuelsystemmpfi      7.925e+02  1.081e+03   0.733 0.464877    
## fuelsystemspdi     -2.976e+02  1.818e+03  -0.164 0.870303    
## boreratio          -1.274e+03  2.163e+03  -0.589 0.556975    
## stroke             -3.950e+03  1.099e+03  -3.594 0.000490 ***
## compressionratio   -2.527e+02  5.753e+02  -0.439 0.661267    
## peakrpm             2.552e+00  6.718e-01   3.798 0.000239 ***
## highwaympg         -2.244e+01  7.560e+01  -0.297 0.767153    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2246 on 110 degrees of freedom
## Multiple R-squared:  0.9406, Adjusted R-squared:  0.9233 
## F-statistic: 54.43 on 32 and 110 DF,  p-value: < 2.2e-16
# checking for aliased coefficients: each row of the "Complete" matrix is a
# coefficient that could not be estimated, written in terms of the other terms
alias_info <- alias(initial_model)
print(alias_info[["Complete"]])
##                (Intercept) symboling fueltypegas aspirationturbo doornumbertwo
## fuelsystem4bbl  0           0         0           0               0           
## fuelsystemidi   1           0        -1           0               0           
##                carbodyhardtop carbodyhatchback carbodysedan carbodywagon
## fuelsystem4bbl  0              0                0            0          
## fuelsystemidi   0              0                0            0          
##                drivewheelfwd drivewheelrwd enginelocationrare carheight
## fuelsystem4bbl  0             0             0                  0       
## fuelsystemidi   0             0             0                  0       
##                enginetypedohcv enginetypel enginetypeohc enginetypeohcf
## fuelsystem4bbl  0               0           0             0            
## fuelsystemidi   0               0           0             0            
##                enginetypeohcv enginetyperotor cylindernumberfive
## fuelsystem4bbl  0              1               0                
## fuelsystemidi   0              0               0                
##                cylindernumberfour cylindernumbersix cylindernumberrare
## fuelsystem4bbl  0                  0                 0                
## fuelsystemidi   0                  0                 0                
##                enginesize fuelsystem2bbl fuelsystemmfi fuelsystemmpfi
## fuelsystem4bbl  0          0              0             0            
## fuelsystemidi   0          0              0             0            
##                fuelsystemspdi boreratio stroke compressionratio peakrpm
## fuelsystem4bbl  0              0         0      0                0     
## fuelsystemidi   0              0         0      0                0     
##                highwaympg
## fuelsystem4bbl  0        
## fuelsystemidi   0

4.3. Stepwise selection

# Stepwise search in both directions starting from the full model;
# trace = 0 suppresses the per-step log
model_step <- step(
  object = lm(formula = price ~ ., data = train),
  direction = "both",
  trace = 0
)
summary(model_step)
## 
## Call:
## lm(formula = price ~ symboling + aspiration + carbody + enginelocation + 
##     carheight + enginetype + cylindernumber + enginesize + stroke + 
##     compressionratio + peakrpm, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5376.3 -1074.5  -100.4   787.5  9597.2 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -2.587e+03  9.151e+03  -0.283 0.777871    
## symboling          -4.219e+02  2.048e+02  -2.060 0.041600 *  
## aspirationturbo     3.652e+03  5.890e+02   6.200 8.26e-09 ***
## carbodyhardtop     -3.818e+03  1.678e+03  -2.275 0.024679 *  
## carbodyhatchback   -3.931e+03  1.394e+03  -2.821 0.005610 ** 
## carbodysedan       -3.525e+03  1.452e+03  -2.428 0.016684 *  
## carbodywagon       -3.969e+03  1.595e+03  -2.488 0.014225 *  
## enginelocationrare  4.956e+03  2.520e+03   1.967 0.051548 .  
## carheight           2.213e+02  1.231e+02   1.799 0.074610 .  
## enginetypedohcv    -5.510e+03  3.259e+03  -1.690 0.093561 .  
## enginetypel         1.834e+03  1.402e+03   1.308 0.193208    
## enginetypeohc       2.278e+03  1.084e+03   2.101 0.037692 *  
## enginetypeohcf     -4.586e+02  1.492e+03  -0.307 0.759129    
## enginetypeohcv     -7.218e+03  1.595e+03  -4.525 1.43e-05 ***
## enginetyperotor     9.122e+03  4.925e+03   1.852 0.066439 .  
## cylindernumberfive -9.731e+03  2.757e+03  -3.530 0.000591 ***
## cylindernumberfour -1.404e+04  2.888e+03  -4.863 3.54e-06 ***
## cylindernumbersix  -7.520e+03  2.064e+03  -3.643 0.000399 ***
## cylindernumberrare -1.312e+04  3.027e+03  -4.335 3.05e-05 ***
## enginesize          1.563e+02  1.235e+01  12.657  < 2e-16 ***
## stroke             -4.663e+03  8.731e+02  -5.340 4.46e-07 ***
## compressionratio    1.285e+02  5.920e+01   2.171 0.031893 *  
## peakrpm             2.463e+00  5.274e-01   4.670 7.91e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2179 on 120 degrees of freedom
## Multiple R-squared:  0.939,  Adjusted R-squared:  0.9278 
## F-statistic: 83.99 on 22 and 120 DF,  p-value: < 2.2e-16

4.4. Multicollinearity check (VIF)

# Variance inflation factors for the terms retained by the stepwise model
vif_values <- vif(model_step)
print(vif_values)
##                       GVIF Df GVIF^(1/(2*Df))
## symboling         1.846560  1        1.358882
## aspiration        1.460088  1        1.208341
## carbody           4.313462  4        1.200475
## enginelocation    2.638824  1        1.624446
## carheight         2.450903  1        1.565536
## enginetype       78.345865  6        1.438251
## cylindernumber   56.391677  4        1.655396
## enginesize        8.300746  1        2.881101
## stroke            2.132191  1        1.460202
## compressionratio  1.769727  1        1.330311
## peakrpm           2.047784  1        1.431008
# Remove any factors with only one level among the training rows: such a
# column is constant in `train`. The same columns are removed from `test` so
# both sets keep identical columns.
factor_names <- names(train)[vapply(train, is.factor, logical(1))]
single_level <- factor_names[
  vapply(factor_names, function(col) length(unique(train[[col]])) < 2, logical(1))
]
for (col in single_level) {
  cat("Removed", col, "-only one level\n")
}
train <- train %>% select(-all_of(single_level))
test <- test %>% select(-all_of(single_level))

# Fit the model again
model_temp <- lm(price ~ ., data = train)

# Stepwise again, starting from the model just fitted instead of fitting the
# identical formula a second time; trace = 1 prints every step
model_step2 <- step(model_temp, direction = "both", trace = 1)
## Start:  AIC=2235.48
## price ~ symboling + fueltype + aspiration + doornumber + carbody + 
##     drivewheel + enginelocation + carheight + enginetype + cylindernumber + 
##     enginesize + fuelsystem + boreratio + stroke + compressionratio + 
##     peakrpm + highwaympg
## 
## 
## Step:  AIC=2235.48
## price ~ symboling + aspiration + doornumber + carbody + drivewheel + 
##     enginelocation + carheight + enginetype + cylindernumber + 
##     enginesize + fuelsystem + boreratio + stroke + compressionratio + 
##     peakrpm + highwaympg
## 
##                    Df Sum of Sq       RSS    AIC
## - fuelsystem        5   9079873 563829230 2227.8
## - drivewheel        2   3969260 558718617 2232.5
## - highwaympg        1    444355 555193712 2233.6
## - doornumber        1    595681 555345038 2233.6
## - compressionratio  1    973497 555722854 2233.7
## - boreratio         1   1750415 556499772 2233.9
## - carbody           4  31476863 586226220 2235.4
## - symboling         1   7706126 562455483 2235.4
## <none>                          554749357 2235.5
## - carheight         1  10484384 565233740 2236.2
## - enginelocation    1  16568705 571318062 2237.7
## - stroke            1  65124193 619873549 2249.3
## - peakrpm           1  72764509 627513865 2251.1
## - aspiration        1  97059903 651809260 2256.5
## - cylindernumber    4 132146339 686895696 2258.0
## - enginetype        5 176178631 730927987 2264.9
## - enginesize        1 218564558 773313915 2281.0
## 
## Step:  AIC=2227.8
## price ~ symboling + aspiration + doornumber + carbody + drivewheel + 
##     enginelocation + carheight + enginetype + cylindernumber + 
##     enginesize + boreratio + stroke + compressionratio + peakrpm + 
##     highwaympg
## 
##                    Df Sum of Sq       RSS    AIC
## - drivewheel        2   4672711 568501941 2225.0
## - doornumber        1    568560 564397790 2225.9
## - highwaympg        1    941321 564770552 2226.0
## - boreratio         1   1305794 565135024 2226.1
## <none>                          563829230 2227.8
## - symboling         1   9766685 573595915 2228.3
## - carbody           4  36476882 600306112 2228.8
## - compressionratio  1  13091616 576920846 2229.1
## - carheight         1  13881206 577710436 2229.3
## + fueltype          1    889197 562940034 2229.6
## - enginelocation    1  17612808 581442039 2230.2
## + fuelsystem        5   9079873 554749357 2235.5
## - peakrpm           1  74561578 638390808 2243.6
## - stroke            1  83688385 647517616 2245.6
## - aspiration        1 139124708 702953939 2257.3
## - enginetype        6 205813222 769642453 2260.3
## - cylindernumber    4 201230349 765059579 2263.4
## - enginesize        1 230303099 794132330 2274.8
## 
## Step:  AIC=2224.98
## price ~ symboling + aspiration + doornumber + carbody + enginelocation + 
##     carheight + enginetype + cylindernumber + enginesize + boreratio + 
##     stroke + compressionratio + peakrpm + highwaympg
## 
##                    Df Sum of Sq       RSS    AIC
## - boreratio         1      4427 568506368 2223.0
## - doornumber        1     74576 568576517 2223.0
## - highwaympg        1    904484 569406425 2223.2
## <none>                          568501941 2225.0
## - carheight         1  12020772 580522713 2226.0
## - carbody           4  39239801 607741743 2226.5
## - symboling         1  15421677 583923618 2226.8
## + fueltype          1    221301 568280640 2226.9
## - enginelocation    1  18053164 586555106 2227.4
## - compressionratio  1  19188779 587690721 2227.7
## + drivewheel        2   4672711 563829230 2227.8
## + fuelsystem        5   9783324 558718617 2232.5
## - peakrpm           1  88404094 656906035 2243.7
## - stroke            1 127240335 695742276 2251.9
## - aspiration        1 139344945 707846886 2254.3
## - enginetype        6 249447080 817949022 2265.0
## - cylindernumber    4 244573761 813075702 2268.2
## - enginesize        1 235652916 804154857 2272.6
## 
## Step:  AIC=2222.98
## price ~ symboling + aspiration + doornumber + carbody + enginelocation + 
##     carheight + enginetype + cylindernumber + enginesize + stroke + 
##     compressionratio + peakrpm + highwaympg
## 
##                    Df Sum of Sq        RSS    AIC
## - doornumber        1     78958  568585326 2221.0
## - highwaympg        1    905001  569411369 2221.2
## <none>                           568506368 2223.0
## - carheight         1  12028502  580534870 2224.0
## - carbody           4  39630610  608136978 2224.6
## + fueltype          1    225056  568281313 2224.9
## + boreratio         1      4427  568501941 2225.0
## - symboling         1  16769748  585276116 2225.1
## - enginelocation    1  18067464  586573832 2225.5
## - compressionratio  1  19568336  588074704 2225.8
## + drivewheel        2   3371344  565135024 2226.1
## + fuelsystem        5   9619130  558887238 2230.5
## - peakrpm           1  89775945  658282313 2241.9
## - stroke            1 128326762  696833131 2250.1
## - aspiration        1 140688921  709195289 2252.6
## - enginetype        6 249910048  818416416 2263.1
## - cylindernumber    4 297601167  866107535 2275.2
## - enginesize        1 443801378 1012307746 2303.5
## 
## Step:  AIC=2221
## price ~ symboling + aspiration + carbody + enginelocation + carheight + 
##     enginetype + cylindernumber + enginesize + stroke + compressionratio + 
##     peakrpm + highwaympg
## 
##                    Df Sum of Sq        RSS    AIC
## - highwaympg        1    921311  569506636 2219.2
## <none>                           568585326 2221.0
## - carheight         1  11994201  580579527 2222.0
## - carbody           4  40371654  608956980 2222.8
## + fueltype          1    198898  568386428 2222.9
## + doornumber        1     78958  568506368 2223.0
## + boreratio         1      8809  568576517 2223.0
## - enginelocation    1  18854567  587439893 2223.7
## - compressionratio  1  19489471  588074797 2223.8
## - symboling         1  19916965  588502291 2223.9
## + drivewheel        2   2933258  565652068 2224.3
## + fuelsystem        5   9589196  558996130 2228.6
## - peakrpm           1  90814686  659400012 2240.2
## - stroke            1 128255752  696841077 2248.1
## - aspiration        1 140634478  709219804 2250.6
## - enginetype        6 250169522  818754848 2261.1
## - cylindernumber    4 299125163  867710489 2273.4
## - enginesize        1 443893695 1012479020 2301.5
## 
## Step:  AIC=2219.23
## price ~ symboling + aspiration + carbody + enginelocation + carheight + 
##     enginetype + cylindernumber + enginesize + stroke + compressionratio + 
##     peakrpm
## 
##                    Df Sum of Sq        RSS    AIC
## <none>                           569506636 2219.2
## + highwaympg        1    921311  568585326 2221.0
## - carheight         1  15351271  584857908 2221.0
## + fueltype          1    366936  569139701 2221.1
## + doornumber        1     95267  569411369 2221.2
## + boreratio         1      1698  569504939 2221.2
## - carbody           4  41794069  611300706 2221.4
## - enginelocation    1  18353113  587859750 2221.8
## - symboling         1  20131841  589638477 2222.2
## + drivewheel        2   3118652  566387985 2222.4
## - compressionratio  1  22370435  591877071 2222.7
## + fuelsystem        5  10110312  559396324 2226.7
## - peakrpm           1 103523078  673029715 2241.1
## - stroke            1 135349977  704856614 2247.7
## - aspiration        1 182449358  751955994 2257.0
## - enginetype        6 256588487  826095123 2260.4
## - cylindernumber    4 304557161  874063797 2272.5
## - enginesize        1 760280536 1329787172 2338.5
# Compare the size of the full model with the stepwise-selected model.
# coef() returns one entry per estimated coefficient (intercept included, and
# NA entries for aliased terms in an lm fit), so these are coefficient counts,
# not counts of predictor columns.
# The full model is fitted once here and its count reused below.
n_coef_full <- length(coef(lm(price ~ ., data = train)))
n_coef_step <- length(coef(model_step2))
cat("Original variable:", n_coef_full, "\n")
## Original variable: 35
cat("Stepwise variables:", n_coef_step, "\n")
## Stepwise variables: 23
# The gap is not purely an aliasing effect: the stepwise trace above removes
# terms such as fuelsystem, drivewheel, boreratio, doornumber and highwaympg
# on AIC grounds, so the label describes the difference neutrally.
# NOTE(review): assumes model_step2 is the result of that stepwise search.
cat("Coefficients dropped vs. full model:", n_coef_full - n_coef_step, "\n")
## Coefficients dropped vs. full model: 12

4.5. Prediction

# Predict prices for the rows of `test` using the stepwise-selected model
pred_step <- predict(model_step2, newdata = test)

# Prediction errors (actual minus predicted); reused for the RMSE below
Residuals <- test$price - pred_step

# Root mean squared error, and the squared correlation between actual and
# predicted prices reported as R²
rmse_step <- sqrt(mean(Residuals^2))
r2_step <- cor(test$price, pred_step)^2

cat("RMSE:", rmse_step, "\nR²:", r2_step, "\n")
## RMSE: 2920.561 
## R²: 0.8599762

# Scatter plot of actual prices (x) against predicted prices (y).
# The red reference line has intercept 0 and slope 1, so points lying on it
# are cases where the prediction equals the actual price.
plot(
  x = test$price,
  y = pred_step,
  xlab = "Actual Price",
  ylab = "Predicted Price"
)
abline(a = 0, b = 1, col = "red")