This report explores the variables influencing the price of cars using data from Kaggle. I conducted some exploratory data analysis in order to create a multiple linear regression model and to evaluate its performance.
# Packages used throughout the analysis. Each one is attached explicitly so
# the script does not depend on anything having been loaded beforehand.
library(readr)     # read_csv()
library(readxl)
library(car)       # vif()
library(dplyr)     # %>%, glimpse(), select(), mutate(), across()
library(tidyr)     # pivot_longer()
library(ggplot2)   # ggplot(), geom_histogram(), facet_wrap(), labs()
library(janitor)   # clean_names()
library(corrplot)  # corrplot()

# file path
# The file name ends in ".xls", but it is parsed as delimited text by
# read_csv() here.
CarPrice_Assignment_csv <- read_csv("~/Desktop/CarPrice_Assignment.csv.xls")
# Display data structure and preview: glimpse() prints the row and column
# counts, then one line per column with its type and first values.
glimpse(CarPrice_Assignment_csv)
## Rows: 205
## Columns: 26
## $ car_ID <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ symboling <dbl> 3, 3, 1, 2, 2, 2, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 0, 0,…
## $ CarName <chr> "alfa-romero giulia", "alfa-romero stelvio", "alfa-ro…
## $ fueltype <chr> "gas", "gas", "gas", "gas", "gas", "gas", "gas", "gas…
## $ aspiration <chr> "std", "std", "std", "std", "std", "std", "std", "std…
## $ doornumber <chr> "two", "two", "two", "four", "four", "two", "four", "…
## $ carbody <chr> "convertible", "convertible", "hatchback", "sedan", "…
## $ drivewheel <chr> "rwd", "rwd", "rwd", "fwd", "4wd", "fwd", "fwd", "fwd…
## $ enginelocation <chr> "front", "front", "front", "front", "front", "front",…
## $ wheelbase <dbl> 88.6, 88.6, 94.5, 99.8, 99.4, 99.8, 105.8, 105.8, 105…
## $ carlength <dbl> 168.8, 168.8, 171.2, 176.6, 176.6, 177.3, 192.7, 192.…
## $ carwidth <dbl> 64.1, 64.1, 65.5, 66.2, 66.4, 66.3, 71.4, 71.4, 71.4,…
## $ carheight <dbl> 48.8, 48.8, 52.4, 54.3, 54.3, 53.1, 55.7, 55.7, 55.9,…
## $ curbweight <dbl> 2548, 2548, 2823, 2337, 2824, 2507, 2844, 2954, 3086,…
## $ enginetype <chr> "dohc", "dohc", "ohcv", "ohc", "ohc", "ohc", "ohc", "…
## $ cylindernumber <chr> "four", "four", "six", "four", "five", "five", "five"…
## $ enginesize <dbl> 130, 130, 152, 109, 136, 136, 136, 136, 131, 131, 108…
## $ fuelsystem <chr> "mpfi", "mpfi", "mpfi", "mpfi", "mpfi", "mpfi", "mpfi…
## $ boreratio <dbl> 3.47, 3.47, 2.68, 3.19, 3.19, 3.19, 3.19, 3.19, 3.13,…
## $ stroke <dbl> 2.68, 2.68, 3.47, 3.40, 3.40, 3.40, 3.40, 3.40, 3.40,…
## $ compressionratio <dbl> 9.00, 9.00, 9.00, 10.00, 8.00, 8.50, 8.50, 8.50, 8.30…
## $ horsepower <dbl> 111, 111, 154, 102, 115, 110, 110, 110, 140, 160, 101…
## $ peakrpm <dbl> 5000, 5000, 5000, 5500, 5500, 5500, 5500, 5500, 5500,…
## $ citympg <dbl> 21, 21, 19, 24, 18, 19, 19, 19, 17, 16, 23, 23, 21, 2…
## $ highwaympg <dbl> 27, 27, 26, 30, 22, 25, 25, 25, 20, 22, 29, 29, 28, 2…
## $ price <dbl> 13495.00, 16500.00, 16500.00, 13950.00, 17450.00, 152…
# Print a labelled preview of the first rows of the raw import
# (head() returns six rows here).
cat("\nFirst few rows of the dataset:\n")
##
## First few rows of the dataset:
print(head(CarPrice_Assignment_csv))
## # A tibble: 6 × 26
## car_ID symboling CarName fueltype aspiration doornumber carbody drivewheel
## <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 3 alfa-romer… gas std two conver… rwd
## 2 2 3 alfa-romer… gas std two conver… rwd
## 3 3 1 alfa-romer… gas std two hatchb… rwd
## 4 4 2 audi 100 ls gas std four sedan fwd
## 5 5 2 audi 100ls gas std four sedan 4wd
## 6 6 2 audi fox gas std two sedan fwd
## # ℹ 18 more variables: enginelocation <chr>, wheelbase <dbl>, carlength <dbl>,
## # carwidth <dbl>, carheight <dbl>, curbweight <dbl>, enginetype <chr>,
## # cylindernumber <chr>, enginesize <dbl>, fuelsystem <chr>, boreratio <dbl>,
## # stroke <dbl>, compressionratio <dbl>, horsepower <dbl>, peakrpm <dbl>,
## # citympg <dbl>, highwaympg <dbl>, price <dbl>
# clean columns
# Build the working copy from the raw import with standardised column names
# (for example car_ID becomes car_id and CarName becomes car_name); the raw
# object itself is left as read.
Car_Price_Data1 <- clean_names(CarPrice_Assignment_csv)
head(Car_Price_Data1)
## # A tibble: 6 × 26
## car_id symboling car_name fueltype aspiration doornumber carbody drivewheel
## <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 3 alfa-romer… gas std two conver… rwd
## 2 2 3 alfa-romer… gas std two conver… rwd
## 3 3 1 alfa-romer… gas std two hatchb… rwd
## 4 4 2 audi 100 ls gas std four sedan fwd
## 5 5 2 audi 100ls gas std four sedan 4wd
## 6 6 2 audi fox gas std two sedan fwd
## # ℹ 18 more variables: enginelocation <chr>, wheelbase <dbl>, carlength <dbl>,
## # carwidth <dbl>, carheight <dbl>, curbweight <dbl>, enginetype <chr>,
## # cylindernumber <chr>, enginesize <dbl>, fuelsystem <chr>, boreratio <dbl>,
## # stroke <dbl>, compressionratio <dbl>, horsepower <dbl>, peakrpm <dbl>,
## # citympg <dbl>, highwaympg <dbl>, price <dbl>
## 3. Exploratory Data Analysis
### 3.1. Summary of statistics
# Per-column summary: min/quartiles/mean/max for numeric columns and
# length/class/mode for character columns.
summary(Car_Price_Data1)
## car_id symboling car_name fueltype
## Min. : 1 Min. :-2.0000 Length:205 Length:205
## 1st Qu.: 52 1st Qu.: 0.0000 Class :character Class :character
## Median :103 Median : 1.0000 Mode :character Mode :character
## Mean :103 Mean : 0.8341
## 3rd Qu.:154 3rd Qu.: 2.0000
## Max. :205 Max. : 3.0000
## aspiration doornumber carbody drivewheel
## Length:205 Length:205 Length:205 Length:205
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## enginelocation wheelbase carlength carwidth
## Length:205 Min. : 86.60 Min. :141.1 Min. :60.30
## Class :character 1st Qu.: 94.50 1st Qu.:166.3 1st Qu.:64.10
## Mode :character Median : 97.00 Median :173.2 Median :65.50
## Mean : 98.76 Mean :174.0 Mean :65.91
## 3rd Qu.:102.40 3rd Qu.:183.1 3rd Qu.:66.90
## Max. :120.90 Max. :208.1 Max. :72.30
## carheight curbweight enginetype cylindernumber
## Min. :47.80 Min. :1488 Length:205 Length:205
## 1st Qu.:52.00 1st Qu.:2145 Class :character Class :character
## Median :54.10 Median :2414 Mode :character Mode :character
## Mean :53.72 Mean :2556
## 3rd Qu.:55.50 3rd Qu.:2935
## Max. :59.80 Max. :4066
## enginesize fuelsystem boreratio stroke
## Min. : 61.0 Length:205 Min. :2.54 Min. :2.070
## 1st Qu.: 97.0 Class :character 1st Qu.:3.15 1st Qu.:3.110
## Median :120.0 Mode :character Median :3.31 Median :3.290
## Mean :126.9 Mean :3.33 Mean :3.255
## 3rd Qu.:141.0 3rd Qu.:3.58 3rd Qu.:3.410
## Max. :326.0 Max. :3.94 Max. :4.170
## compressionratio horsepower peakrpm citympg
## Min. : 7.00 Min. : 48.0 Min. :4150 Min. :13.00
## 1st Qu.: 8.60 1st Qu.: 70.0 1st Qu.:4800 1st Qu.:19.00
## Median : 9.00 Median : 95.0 Median :5200 Median :24.00
## Mean :10.14 Mean :104.1 Mean :5125 Mean :25.22
## 3rd Qu.: 9.40 3rd Qu.:116.0 3rd Qu.:5500 3rd Qu.:30.00
## Max. :23.00 Max. :288.0 Max. :6600 Max. :49.00
## highwaympg price
## Min. :16.00 Min. : 5118
## 1st Qu.:25.00 1st Qu.: 7788
## Median :30.00 Median :10295
## Mean :30.75 Mean :13277
## 3rd Qu.:34.00 3rd Qu.:16503
## Max. :54.00 Max. :45400
# Create a heatmap
library(pheatmap)

# Keep only the numeric columns and standardise each of them with scale(),
# so that variables measured on very different scales share one colour range.
numeric_cols <- Car_Price_Data1[, vapply(Car_Price_Data1, is.numeric, logical(1))]
scaled_data <- scale(numeric_cols)

# Rows are cars (row labels hidden), columns are features; blue-white-red
# palette with 50 steps and complete-linkage clustering.
pheatmap(
  scaled_data,
  color = colorRampPalette(c("blue", "white", "red"))(50),
  clustering_method = "complete",
  show_rownames = FALSE,
  main = "Car Features Heatmap"
)
### 3.2. Distribution of the data
# One histogram per numeric column: reshape to long format (variable, value),
# then facet by variable with independent axes for each panel.
ggplot(
  pivot_longer(
    numeric_cols,
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ),
  aes(x = value)
) +
  geom_histogram(bins = 10, fill = "steelblue", colour = "black") +
  facet_wrap(~ variable, scales = "free") +
  labs(title = "Histogram of all numeric values")
# Convert the appropriate columns to factors
# NOTE(review): enginetype, cylindernumber and fuelsystem are categorical too
# but are left as character here - confirm whether that is intended.
Car_Price_Data1 <- Car_Price_Data1 %>%
  mutate(across(
    c(fueltype, aspiration, doornumber, enginelocation, drivewheel, carbody),
    as.factor
  ))

# Pairwise correlations between the numeric predictors (price, the response,
# is excluded), printed to two decimals.
numeric_vars <- select(Car_Price_Data1, where(is.numeric), -price)
corr_matrix <- cor(numeric_vars, use = "complete.obs")
print(round(corr_matrix, 2))
## car_id symboling wheelbase carlength carwidth carheight
## car_id 1.00 -0.15 0.13 0.17 0.05 0.26
## symboling -0.15 1.00 -0.53 -0.36 -0.23 -0.54
## wheelbase 0.13 -0.53 1.00 0.87 0.80 0.59
## carlength 0.17 -0.36 0.87 1.00 0.84 0.49
## carwidth 0.05 -0.23 0.80 0.84 1.00 0.28
## carheight 0.26 -0.54 0.59 0.49 0.28 1.00
## curbweight 0.07 -0.23 0.78 0.88 0.87 0.30
## enginesize -0.03 -0.11 0.57 0.68 0.74 0.07
## boreratio 0.26 -0.13 0.49 0.61 0.56 0.17
## stroke -0.16 -0.01 0.16 0.13 0.18 -0.06
## compressionratio 0.15 -0.18 0.25 0.16 0.18 0.26
## horsepower -0.02 0.07 0.35 0.55 0.64 -0.11
## peakrpm -0.20 0.27 -0.36 -0.29 -0.22 -0.32
## citympg 0.02 -0.04 -0.47 -0.67 -0.64 -0.05
## highwaympg 0.01 0.03 -0.54 -0.70 -0.68 -0.11
## curbweight enginesize boreratio stroke compressionratio
## car_id 0.07 -0.03 0.26 -0.16 0.15
## symboling -0.23 -0.11 -0.13 -0.01 -0.18
## wheelbase 0.78 0.57 0.49 0.16 0.25
## carlength 0.88 0.68 0.61 0.13 0.16
## carwidth 0.87 0.74 0.56 0.18 0.18
## carheight 0.30 0.07 0.17 -0.06 0.26
## curbweight 1.00 0.85 0.65 0.17 0.15
## enginesize 0.85 1.00 0.58 0.20 0.03
## boreratio 0.65 0.58 1.00 -0.06 0.01
## stroke 0.17 0.20 -0.06 1.00 0.19
## compressionratio 0.15 0.03 0.01 0.19 1.00
## horsepower 0.75 0.81 0.57 0.08 -0.20
## peakrpm -0.27 -0.24 -0.25 -0.07 -0.44
## citympg -0.76 -0.65 -0.58 -0.04 0.32
## highwaympg -0.80 -0.68 -0.59 -0.04 0.27
## horsepower peakrpm citympg highwaympg
## car_id -0.02 -0.20 0.02 0.01
## symboling 0.07 0.27 -0.04 0.03
## wheelbase 0.35 -0.36 -0.47 -0.54
## carlength 0.55 -0.29 -0.67 -0.70
## carwidth 0.64 -0.22 -0.64 -0.68
## carheight -0.11 -0.32 -0.05 -0.11
## curbweight 0.75 -0.27 -0.76 -0.80
## enginesize 0.81 -0.24 -0.65 -0.68
## boreratio 0.57 -0.25 -0.58 -0.59
## stroke 0.08 -0.07 -0.04 -0.04
## compressionratio -0.20 -0.44 0.32 0.27
## horsepower 1.00 0.13 -0.80 -0.77
## peakrpm 0.13 1.00 -0.11 -0.05
## citympg -0.80 -0.11 1.00 0.97
## highwaympg -0.77 -0.05 0.97 1.00
# Plot the upper triangle of the predictor correlation matrix.
# The title says "Predictors" because corr_matrix is built from the numeric
# predictors only (price is excluded).
corrplot(corr_matrix, method = "color", type = "upper",
         tl.cex = 0.7, tl.col = "black",
         title = "Correlation Matrix of Numeric Predictors",
         mar = c(0, 0, 2, 0))

# (row, col) index pairs in the upper triangle with |r| > 0.8; restricting to
# the upper triangle reports every pair once and skips the diagonal.
high_cor <- which(abs(corr_matrix) > 0.8 & upper.tri(corr_matrix), arr.ind = TRUE)

if (nrow(high_cor) > 0) {
  # seq_len() rather than 1:n so the loop is safe for any row count.
  for (i in seq_len(nrow(high_cor))) {
    cat("High Correlation Between",
        rownames(corr_matrix)[high_cor[i, 1]], "and",
        colnames(corr_matrix)[high_cor[i, 2]], ":",
        corr_matrix[high_cor[i, 1], high_cor[i, 2]], "\n")
  }
} else {
  cat("No highly correlated pairs found.\n")
}
## High Correlation Between wheelbase and carlength : 0.8745875
## High Correlation Between carlength and carwidth : 0.8411183
## High Correlation Between carlength and curbweight : 0.8777285
## High Correlation Between carwidth and curbweight : 0.8670325
## High Correlation Between curbweight and enginesize : 0.8505941
## High Correlation Between enginesize and horsepower : 0.8097687
## High Correlation Between horsepower and citympg : -0.8014562
## High Correlation Between citympg and highwaympg : 0.971337
# Names of variables involved
# Every variable that appears in at least one highly correlated pair.
vars_involved <- unique(c(
  rownames(corr_matrix)[high_cor[, 1]],
  colnames(corr_matrix)[high_cor[, 2]]
))

# Absolute correlation of each of those variables with price; the result is a
# numeric vector named by variable.
cor_with_price <- vapply(
  vars_involved,
  function(v) {
    abs(cor(Car_Price_Data1[[v]], Car_Price_Data1$price, use = "complete.obs"))
  },
  numeric(1)
)
sort(cor_with_price, decreasing = TRUE)
## enginesize curbweight horsepower carwidth highwaympg citympg carlength
## 0.8741448 0.8353049 0.8081388 0.7593253 0.6975991 0.6857513 0.6829200
## wheelbase
## 0.5778156
# Highlighting the variables that need to be dropped
# For every highly correlated pair, drop the member whose absolute correlation
# with price is lower (on a tie the second member of the pair is dropped).
# The result vector is preallocated, one slot per pair, and seq_len() keeps the
# loop from running when there are no pairs at all.
vars_to_drop <- character(nrow(high_cor))
for (i in seq_len(nrow(high_cor))) {
  v1 <- rownames(corr_matrix)[high_cor[i, 1]]
  v2 <- colnames(corr_matrix)[high_cor[i, 2]]
  if (cor_with_price[v1] < cor_with_price[v2]) {
    dropped <- v1
    kept <- v2
  } else {
    dropped <- v2
    kept <- v1
  }
  vars_to_drop[i] <- dropped
  cat("Drop", dropped, "(cor with price =", round(cor_with_price[dropped], 3),
      "), keep", kept, "(cor =", round(cor_with_price[kept], 3), ")\n")
}
## Drop wheelbase (cor with price = 0.578 ), keep carlength (cor = 0.683 )
## Drop carlength (cor with price = 0.683 ), keep carwidth (cor = 0.759 )
## Drop carlength (cor with price = 0.683 ), keep curbweight (cor = 0.835 )
## Drop carwidth (cor with price = 0.759 ), keep curbweight (cor = 0.835 )
## Drop curbweight (cor with price = 0.835 ), keep enginesize (cor = 0.874 )
## Drop horsepower (cor with price = 0.808 ), keep enginesize (cor = 0.874 )
## Drop citympg (cor with price = 0.686 ), keep horsepower (cor = 0.808 )
## Drop citympg (cor with price = 0.686 ), keep highwaympg (cor = 0.698 )
# De-duplicate: the same variable can be the weaker member of several pairs.
vars_to_drop <- unique(vars_to_drop)
cat("\nVariables to drop:\n")
##
## Variables to drop:
print(vars_to_drop)
## [1] "wheelbase" "carlength" "carwidth" "curbweight" "horsepower"
## [6] "citympg"
# Remove the identifier columns (car_id, car_name) and the collinear
# predictors selected above. Using vars_to_drop instead of a hand-typed list
# keeps this step in sync with the correlation screening.
Car_Price_Data1 <- Car_Price_Data1 %>%
  select(-car_id, -car_name, -all_of(vars_to_drop))
head(Car_Price_Data1)
## # A tibble: 6 × 18
## symboling fueltype aspiration doornumber carbody drivewheel enginelocation
## <dbl> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 3 gas std two convertible rwd front
## 2 3 gas std two convertible rwd front
## 3 1 gas std two hatchback rwd front
## 4 2 gas std four sedan fwd front
## 5 2 gas std four sedan 4wd front
## 6 2 gas std two sedan fwd front
## # ℹ 11 more variables: carheight <dbl>, enginetype <chr>, cylindernumber <chr>,
## # enginesize <dbl>, fuelsystem <chr>, boreratio <dbl>, stroke <dbl>,
## # compressionratio <dbl>, peakrpm <dbl>, highwaympg <dbl>, price <dbl>
# Check column types after the drop: shows which columns are factors and
# which are still character.
str(Car_Price_Data1)
## tibble [205 × 18] (S3: tbl_df/tbl/data.frame)
## $ symboling : num [1:205] 3 3 1 2 2 2 1 1 1 0 ...
## $ fueltype : Factor w/ 2 levels "diesel","gas": 2 2 2 2 2 2 2 2 2 2 ...
## $ aspiration : Factor w/ 2 levels "std","turbo": 1 1 1 1 1 1 1 1 2 2 ...
## $ doornumber : Factor w/ 2 levels "four","two": 2 2 2 1 1 2 1 1 1 2 ...
## $ carbody : Factor w/ 5 levels "convertible",..: 1 1 3 4 4 4 4 5 4 3 ...
## $ drivewheel : Factor w/ 3 levels "4wd","fwd","rwd": 3 3 3 2 1 2 2 2 2 1 ...
## $ enginelocation : Factor w/ 2 levels "front","rear": 1 1 1 1 1 1 1 1 1 1 ...
## $ carheight : num [1:205] 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
## $ enginetype : chr [1:205] "dohc" "dohc" "ohcv" "ohc" ...
## $ cylindernumber : chr [1:205] "four" "four" "six" "four" ...
## $ enginesize : num [1:205] 130 130 152 109 136 136 136 136 131 131 ...
## $ fuelsystem : chr [1:205] "mpfi" "mpfi" "mpfi" "mpfi" ...
## $ boreratio : num [1:205] 3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
## $ stroke : num [1:205] 2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
## $ compressionratio: num [1:205] 9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
## $ peakrpm : num [1:205] 5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
## $ highwaympg : num [1:205] 27 27 26 30 22 25 25 25 20 22 ...
## $ price : num [1:205] 13495 16500 16500 13950 17450 ...
# Same check in glimpse() form: one line per column with its type and
# first values.
glimpse(Car_Price_Data1)
## Rows: 205
## Columns: 18
## $ symboling <dbl> 3, 3, 1, 2, 2, 2, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 0, 0,…
## $ fueltype <fct> gas, gas, gas, gas, gas, gas, gas, gas, gas, gas, gas…
## $ aspiration <fct> std, std, std, std, std, std, std, std, turbo, turbo,…
## $ doornumber <fct> two, two, two, four, four, two, four, four, four, two…
## $ carbody <fct> convertible, convertible, hatchback, sedan, sedan, se…
## $ drivewheel <fct> rwd, rwd, rwd, fwd, 4wd, fwd, fwd, fwd, fwd, 4wd, rwd…
## $ enginelocation <fct> front, front, front, front, front, front, front, fron…
## $ carheight <dbl> 48.8, 48.8, 52.4, 54.3, 54.3, 53.1, 55.7, 55.7, 55.9,…
## $ enginetype <chr> "dohc", "dohc", "ohcv", "ohc", "ohc", "ohc", "ohc", "…
## $ cylindernumber <chr> "four", "four", "six", "four", "five", "five", "five"…
## $ enginesize <dbl> 130, 130, 152, 109, 136, 136, 136, 136, 131, 131, 108…
## $ fuelsystem <chr> "mpfi", "mpfi", "mpfi", "mpfi", "mpfi", "mpfi", "mpfi…
## $ boreratio <dbl> 3.47, 3.47, 2.68, 3.19, 3.19, 3.19, 3.19, 3.19, 3.13,…
## $ stroke <dbl> 2.68, 2.68, 3.47, 3.40, 3.40, 3.40, 3.40, 3.40, 3.40,…
## $ compressionratio <dbl> 9.00, 9.00, 9.00, 10.00, 8.00, 8.50, 8.50, 8.50, 8.30…
## $ peakrpm <dbl> 5000, 5000, 5000, 5500, 5500, 5500, 5500, 5500, 5500,…
## $ highwaympg <dbl> 27, 27, 26, 30, 22, 25, 25, 25, 20, 22, 29, 29, 28, 2…
## $ price <dbl> 13495.00, 16500.00, 16500.00, 13950.00, 17450.00, 152…
Some of the categorical variables have levels with very few observations, which can create unstable estimates. Rare levels (those appearing fewer than five times) will be combined into a "rare" category using `fct_lump_min()`.
library(forcats)
library(dplyr)

# Get all factor columns
factor_cols <- names(Car_Price_Data1)[vapply(Car_Price_Data1, is.factor, logical(1))]

# Prepare data (combine rare levels)
# Levels seen fewer than 5 times are merged into a single "rare" level.
lump_rare <- function(x) fct_lump_min(x, min = 5, other_level = "rare")

# cylindernumber is still a character column at this point, so it is not in
# factor_cols and is lumped explicitly; the factor columns follow.
# NOTE(review): enginetype and fuelsystem are also character columns here and
# are not lumped - confirm whether that is intended.
Car_Price_Data1 <- Car_Price_Data1 %>%
  mutate(cylindernumber = lump_rare(cylindernumber)) %>%
  mutate(across(all_of(factor_cols), lump_rare))

# Quick summary of changes
cat("Factor columns processed:", length(factor_cols), "\n")
## Factor columns processed: 6
# Show the level counts of cylindernumber after lumping as a spot check.
cat("Example - cylinder levels:\n")
## Example - cylinder levels:
print(table(Car_Price_Data1$cylindernumber))
##
## eight five four six rare
## 5 11 159 24 6
Now the data will be split into a training set and a testing set.
# Reproducible 70/30 split: draw 70% of the row indices (rounded down) for
# training and use the remaining rows for testing.
set.seed(123)
n <- nrow(Car_Price_Data1)
train_idx <- sample(seq_len(n), size = floor(0.7 * n))
train <- Car_Price_Data1[train_idx, ]
test <- Car_Price_Data1[-train_idx, ]

# Baseline model: price regressed on every remaining column.
initial_model <- lm(price ~ ., data = train)
summary(initial_model)
##
## Call:
## lm(formula = price ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5285.5 -989.3 -52.8 714.6 9643.6
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.796e+03 1.716e+04 0.396 0.692864
## symboling -3.338e+02 2.701e+02 -1.236 0.219041
## fueltypegas -5.958e+03 7.902e+03 -0.754 0.452469
## aspirationturbo 3.442e+03 7.845e+02 4.387 2.65e-05 ***
## doornumbertwo -2.298e+02 6.685e+02 -0.344 0.731743
## carbodyhardtop -3.626e+03 1.779e+03 -2.038 0.043982 *
## carbodyhatchback -3.550e+03 1.489e+03 -2.384 0.018862 *
## carbodysedan -3.280e+03 1.606e+03 -2.042 0.043560 *
## carbodywagon -3.698e+03 1.777e+03 -2.081 0.039765 *
## drivewheelfwd 2.285e+02 1.229e+03 0.186 0.852881
## drivewheelrwd 1.077e+03 1.644e+03 0.655 0.513966
## enginelocationrare 4.877e+03 2.691e+03 1.813 0.072627 .
## carheight 2.240e+02 1.554e+02 1.442 0.152186
## enginetypedohcv -3.950e+03 4.284e+03 -0.922 0.358493
## enginetypel 1.469e+03 1.671e+03 0.879 0.381153
## enginetypeohc 2.602e+03 1.164e+03 2.236 0.027383 *
## enginetypeohcf 8.460e+02 2.023e+03 0.418 0.676535
## enginetypeohcv -6.704e+03 1.756e+03 -3.817 0.000224 ***
## enginetyperotor 8.026e+03 7.661e+03 1.048 0.297113
## cylindernumberfive -9.722e+03 3.643e+03 -2.669 0.008764 **
## cylindernumberfour -1.342e+04 4.077e+03 -3.291 0.001342 **
## cylindernumbersix -7.575e+03 2.663e+03 -2.845 0.005299 **
## cylindernumberrare -1.142e+04 4.071e+03 -2.804 0.005965 **
## enginesize 1.538e+02 2.337e+01 6.583 1.63e-09 ***
## fuelsystem2bbl 7.510e+02 1.013e+03 0.741 0.460095
## fuelsystem4bbl NA NA NA NA
## fuelsystemidi NA NA NA NA
## fuelsystemmfi -1.653e+03 2.810e+03 -0.588 0.557529
## fuelsystemmpfi 7.925e+02 1.081e+03 0.733 0.464877
## fuelsystemspdi -2.976e+02 1.818e+03 -0.164 0.870303
## boreratio -1.274e+03 2.163e+03 -0.589 0.556975
## stroke -3.950e+03 1.099e+03 -3.594 0.000490 ***
## compressionratio -2.527e+02 5.753e+02 -0.439 0.661267
## peakrpm 2.552e+00 6.718e-01 3.798 0.000239 ***
## highwaympg -2.244e+01 7.560e+01 -0.297 0.767153
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2246 on 110 degrees of freedom
## Multiple R-squared: 0.9406, Adjusted R-squared: 0.9233
## F-statistic: 54.43 on 32 and 110 DF, p-value: < 2.2e-16
# checking for aliased coefficients
# alias() reports which coefficients are exact linear combinations of others;
# the Complete component holds one row per aliased coefficient.
alias_info <- alias(initial_model)
print(alias_info$Complete)
## (Intercept) symboling fueltypegas aspirationturbo doornumbertwo
## fuelsystem4bbl 0 0 0 0 0
## fuelsystemidi 1 0 -1 0 0
## carbodyhardtop carbodyhatchback carbodysedan carbodywagon
## fuelsystem4bbl 0 0 0 0
## fuelsystemidi 0 0 0 0
## drivewheelfwd drivewheelrwd enginelocationrare carheight
## fuelsystem4bbl 0 0 0 0
## fuelsystemidi 0 0 0 0
## enginetypedohcv enginetypel enginetypeohc enginetypeohcf
## fuelsystem4bbl 0 0 0 0
## fuelsystemidi 0 0 0 0
## enginetypeohcv enginetyperotor cylindernumberfive
## fuelsystem4bbl 0 1 0
## fuelsystemidi 0 0 0
## cylindernumberfour cylindernumbersix cylindernumberrare
## fuelsystem4bbl 0 0 0
## fuelsystemidi 0 0 0
## enginesize fuelsystem2bbl fuelsystemmfi fuelsystemmpfi
## fuelsystem4bbl 0 0 0 0
## fuelsystemidi 0 0 0 0
## fuelsystemspdi boreratio stroke compressionratio peakrpm
## fuelsystem4bbl 0 0 0 0 0
## fuelsystemidi 0 0 0 0 0
## highwaympg
## fuelsystem4bbl 0
## fuelsystemidi 0
# Stepwise selection in both directions, starting from the full model;
# trace = 0 suppresses the per-step log.
model_step <- step(
  lm(price ~ ., data = train),
  direction = "both",
  trace = 0
)
summary(model_step)
##
## Call:
## lm(formula = price ~ symboling + aspiration + carbody + enginelocation +
## carheight + enginetype + cylindernumber + enginesize + stroke +
## compressionratio + peakrpm, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5376.3 -1074.5 -100.4 787.5 9597.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.587e+03 9.151e+03 -0.283 0.777871
## symboling -4.219e+02 2.048e+02 -2.060 0.041600 *
## aspirationturbo 3.652e+03 5.890e+02 6.200 8.26e-09 ***
## carbodyhardtop -3.818e+03 1.678e+03 -2.275 0.024679 *
## carbodyhatchback -3.931e+03 1.394e+03 -2.821 0.005610 **
## carbodysedan -3.525e+03 1.452e+03 -2.428 0.016684 *
## carbodywagon -3.969e+03 1.595e+03 -2.488 0.014225 *
## enginelocationrare 4.956e+03 2.520e+03 1.967 0.051548 .
## carheight 2.213e+02 1.231e+02 1.799 0.074610 .
## enginetypedohcv -5.510e+03 3.259e+03 -1.690 0.093561 .
## enginetypel 1.834e+03 1.402e+03 1.308 0.193208
## enginetypeohc 2.278e+03 1.084e+03 2.101 0.037692 *
## enginetypeohcf -4.586e+02 1.492e+03 -0.307 0.759129
## enginetypeohcv -7.218e+03 1.595e+03 -4.525 1.43e-05 ***
## enginetyperotor 9.122e+03 4.925e+03 1.852 0.066439 .
## cylindernumberfive -9.731e+03 2.757e+03 -3.530 0.000591 ***
## cylindernumberfour -1.404e+04 2.888e+03 -4.863 3.54e-06 ***
## cylindernumbersix -7.520e+03 2.064e+03 -3.643 0.000399 ***
## cylindernumberrare -1.312e+04 3.027e+03 -4.335 3.05e-05 ***
## enginesize 1.563e+02 1.235e+01 12.657 < 2e-16 ***
## stroke -4.663e+03 8.731e+02 -5.340 4.46e-07 ***
## compressionratio 1.285e+02 5.920e+01 2.171 0.031893 *
## peakrpm 2.463e+00 5.274e-01 4.670 7.91e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2179 on 120 degrees of freedom
## Multiple R-squared: 0.939, Adjusted R-squared: 0.9278
## F-statistic: 83.99 on 22 and 120 DF, p-value: < 2.2e-16
# Variance inflation factors for the stepwise model. The printed table has
# GVIF, Df and GVIF^(1/(2*Df)) columns, one row per model term.
vif_values <- vif(model_step)
print(vif_values)
## GVIF Df GVIF^(1/(2*Df))
## symboling 1.846560 1 1.358882
## aspiration 1.460088 1 1.208341
## carbody 4.313462 4 1.200475
## enginelocation 2.638824 1 1.624446
## carheight 2.450903 1 1.565536
## enginetype 78.345865 6 1.438251
## cylindernumber 56.391677 4 1.655396
## enginesize 8.300746 1 2.881101
## stroke 2.132191 1 1.460202
## compressionratio 1.769727 1 1.330311
## peakrpm 2.047784 1 1.431008
# Remove any factors with only one level
# A factor with a single observed value in the training rows is dropped from
# both train and test so the two sets keep the same columns.
factor_names <- names(train)[vapply(train, is.factor, logical(1))]
for (col in factor_names) {
  if (length(unique(train[[col]])) < 2) {
    train <- train %>% select(-all_of(col))
    test <- test %>% select(-all_of(col))
    cat("Removed", col, "-only one level\n")
  }
}

# Fit the model again
model_temp <- lm(price ~ ., data = train)

# Stepwise again
# Start from the model fitted just above instead of fitting an identical one a
# second time; trace = 1 prints the selection path.
model_step2 <- step(model_temp, direction = "both", trace = 1)
## Start: AIC=2235.48
## price ~ symboling + fueltype + aspiration + doornumber + carbody +
## drivewheel + enginelocation + carheight + enginetype + cylindernumber +
## enginesize + fuelsystem + boreratio + stroke + compressionratio +
## peakrpm + highwaympg
##
##
## Step: AIC=2235.48
## price ~ symboling + aspiration + doornumber + carbody + drivewheel +
## enginelocation + carheight + enginetype + cylindernumber +
## enginesize + fuelsystem + boreratio + stroke + compressionratio +
## peakrpm + highwaympg
##
## Df Sum of Sq RSS AIC
## - fuelsystem 5 9079873 563829230 2227.8
## - drivewheel 2 3969260 558718617 2232.5
## - highwaympg 1 444355 555193712 2233.6
## - doornumber 1 595681 555345038 2233.6
## - compressionratio 1 973497 555722854 2233.7
## - boreratio 1 1750415 556499772 2233.9
## - carbody 4 31476863 586226220 2235.4
## - symboling 1 7706126 562455483 2235.4
## <none> 554749357 2235.5
## - carheight 1 10484384 565233740 2236.2
## - enginelocation 1 16568705 571318062 2237.7
## - stroke 1 65124193 619873549 2249.3
## - peakrpm 1 72764509 627513865 2251.1
## - aspiration 1 97059903 651809260 2256.5
## - cylindernumber 4 132146339 686895696 2258.0
## - enginetype 5 176178631 730927987 2264.9
## - enginesize 1 218564558 773313915 2281.0
##
## Step: AIC=2227.8
## price ~ symboling + aspiration + doornumber + carbody + drivewheel +
## enginelocation + carheight + enginetype + cylindernumber +
## enginesize + boreratio + stroke + compressionratio + peakrpm +
## highwaympg
##
## Df Sum of Sq RSS AIC
## - drivewheel 2 4672711 568501941 2225.0
## - doornumber 1 568560 564397790 2225.9
## - highwaympg 1 941321 564770552 2226.0
## - boreratio 1 1305794 565135024 2226.1
## <none> 563829230 2227.8
## - symboling 1 9766685 573595915 2228.3
## - carbody 4 36476882 600306112 2228.8
## - compressionratio 1 13091616 576920846 2229.1
## - carheight 1 13881206 577710436 2229.3
## + fueltype 1 889197 562940034 2229.6
## - enginelocation 1 17612808 581442039 2230.2
## + fuelsystem 5 9079873 554749357 2235.5
## - peakrpm 1 74561578 638390808 2243.6
## - stroke 1 83688385 647517616 2245.6
## - aspiration 1 139124708 702953939 2257.3
## - enginetype 6 205813222 769642453 2260.3
## - cylindernumber 4 201230349 765059579 2263.4
## - enginesize 1 230303099 794132330 2274.8
##
## Step: AIC=2224.98
## price ~ symboling + aspiration + doornumber + carbody + enginelocation +
## carheight + enginetype + cylindernumber + enginesize + boreratio +
## stroke + compressionratio + peakrpm + highwaympg
##
## Df Sum of Sq RSS AIC
## - boreratio 1 4427 568506368 2223.0
## - doornumber 1 74576 568576517 2223.0
## - highwaympg 1 904484 569406425 2223.2
## <none> 568501941 2225.0
## - carheight 1 12020772 580522713 2226.0
## - carbody 4 39239801 607741743 2226.5
## - symboling 1 15421677 583923618 2226.8
## + fueltype 1 221301 568280640 2226.9
## - enginelocation 1 18053164 586555106 2227.4
## - compressionratio 1 19188779 587690721 2227.7
## + drivewheel 2 4672711 563829230 2227.8
## + fuelsystem 5 9783324 558718617 2232.5
## - peakrpm 1 88404094 656906035 2243.7
## - stroke 1 127240335 695742276 2251.9
## - aspiration 1 139344945 707846886 2254.3
## - enginetype 6 249447080 817949022 2265.0
## - cylindernumber 4 244573761 813075702 2268.2
## - enginesize 1 235652916 804154857 2272.6
##
## Step: AIC=2222.98
## price ~ symboling + aspiration + doornumber + carbody + enginelocation +
## carheight + enginetype + cylindernumber + enginesize + stroke +
## compressionratio + peakrpm + highwaympg
##
## Df Sum of Sq RSS AIC
## - doornumber 1 78958 568585326 2221.0
## - highwaympg 1 905001 569411369 2221.2
## <none> 568506368 2223.0
## - carheight 1 12028502 580534870 2224.0
## - carbody 4 39630610 608136978 2224.6
## + fueltype 1 225056 568281313 2224.9
## + boreratio 1 4427 568501941 2225.0
## - symboling 1 16769748 585276116 2225.1
## - enginelocation 1 18067464 586573832 2225.5
## - compressionratio 1 19568336 588074704 2225.8
## + drivewheel 2 3371344 565135024 2226.1
## + fuelsystem 5 9619130 558887238 2230.5
## - peakrpm 1 89775945 658282313 2241.9
## - stroke 1 128326762 696833131 2250.1
## - aspiration 1 140688921 709195289 2252.6
## - enginetype 6 249910048 818416416 2263.1
## - cylindernumber 4 297601167 866107535 2275.2
## - enginesize 1 443801378 1012307746 2303.5
##
## Step: AIC=2221
## price ~ symboling + aspiration + carbody + enginelocation + carheight +
## enginetype + cylindernumber + enginesize + stroke + compressionratio +
## peakrpm + highwaympg
##
## Df Sum of Sq RSS AIC
## - highwaympg 1 921311 569506636 2219.2
## <none> 568585326 2221.0
## - carheight 1 11994201 580579527 2222.0
## - carbody 4 40371654 608956980 2222.8
## + fueltype 1 198898 568386428 2222.9
## + doornumber 1 78958 568506368 2223.0
## + boreratio 1 8809 568576517 2223.0
## - enginelocation 1 18854567 587439893 2223.7
## - compressionratio 1 19489471 588074797 2223.8
## - symboling 1 19916965 588502291 2223.9
## + drivewheel 2 2933258 565652068 2224.3
## + fuelsystem 5 9589196 558996130 2228.6
## - peakrpm 1 90814686 659400012 2240.2
## - stroke 1 128255752 696841077 2248.1
## - aspiration 1 140634478 709219804 2250.6
## - enginetype 6 250169522 818754848 2261.1
## - cylindernumber 4 299125163 867710489 2273.4
## - enginesize 1 443893695 1012479020 2301.5
##
## Step: AIC=2219.23
## price ~ symboling + aspiration + carbody + enginelocation + carheight +
## enginetype + cylindernumber + enginesize + stroke + compressionratio +
## peakrpm
##
## Df Sum of Sq RSS AIC
## <none> 569506636 2219.2
## + highwaympg 1 921311 568585326 2221.0
## - carheight 1 15351271 584857908 2221.0
## + fueltype 1 366936 569139701 2221.1
## + doornumber 1 95267 569411369 2221.2
## + boreratio 1 1698 569504939 2221.2
## - carbody 4 41794069 611300706 2221.4
## - enginelocation 1 18353113 587859750 2221.8
## - symboling 1 20131841 589638477 2222.2
## + drivewheel 2 3118652 566387985 2222.4
## - compressionratio 1 22370435 591877071 2222.7
## + fuelsystem 5 10110312 559396324 2226.7
## - peakrpm 1 103523078 673029715 2241.1
## - stroke 1 135349977 704856614 2247.7
## - aspiration 1 182449358 751955994 2257.0
## - enginetype 6 256588487 826095123 2260.4
## - cylindernumber 4 304557161 874063797 2272.5
## - enginesize 1 760280536 1329787172 2338.5
# Compare Original vs. Stepwise
# Both numbers are coefficient counts (intercept and one entry per dummy
# level), not variable counts. The full-model count also includes coefficients
# reported as NA because of singularities. The difference is therefore what
# stepwise selection removed plus those aliased coefficients. The full model
# is fitted once and reused.
n_coef_full <- length(coef(lm(price ~ ., data = train)))
n_coef_step <- length(coef(model_step2))
cat("Coefficients in full model:", n_coef_full, "\n")
## Coefficients in full model: 35
cat("Coefficients in stepwise model:", n_coef_step, "\n")
## Coefficients in stepwise model: 23
cat("Coefficients removed by stepwise selection or aliasing:",
    n_coef_full - n_coef_step, "\n")
## Coefficients removed by stepwise selection or aliasing: 12
# Evaluate the stepwise model on the held-out rows.
pred_step <- predict(model_step2, newdata = test)

# RMSE is in price units; R² here is the squared correlation between actual
# and predicted prices.
rmse_step <- sqrt(mean((test$price - pred_step)^2))
r2_step <- cor(test$price, pred_step)^2
cat("RMSE:", rmse_step, "\nR²:", r2_step, "\n")
## RMSE: 2920.561
## R²: 0.8599762
# Test-set residuals (actual minus predicted).
Residuals <- test$price - pred_step

# Plotting the graph
# Actual against predicted price; the red line is y = x, so points on it are
# predicted exactly.
plot(
  x = test$price,
  y = pred_step,
  xlab = "Actual Price",
  ylab = "Predicted Price"
)
abline(a = 0, b = 1, col = "red")