# ---------------------------------------------------------
# 1. Load the Data
# In this section, we load the original automobile dataset from Excel and briefly inspect its structure. The dataset contains information such as price, age, mileage, fuel type, color, engine size, and multiple binary equipment features.
# ---------------------------------------------------------
# 1.1 Read the Excel file (original case data)
# The workbook location can be overridden with the AUTOMOBILE_XLSX environment
# variable so the script also runs on other machines; when the variable is
# unset, the original path is used.
data_path <- Sys.getenv(
  "AUTOMOBILE_XLSX",
  unset = "C:/Users/divya/OneDrive/Projects/Capstone/W28593-XLS-ENG.xlsx"
)
# Fail early with a message that names the path actually tried.
if (!file.exists(data_path)) {
  stop("Input workbook not found: ", data_path, call. = FALSE)
}
data <- read_excel(data_path)
# 1.2 Quick structure check
glimpse(data)
## Rows: 1,367
## Columns: 28
## $ Price <dbl> 21000, 20000, 19650, 21550, 22550, 22050, 22800, 18000, 16800,…
## $ Age <dbl> 26, 23, 26, 32, 33, 29, 31, 25, 25, 31, 31, 30, 29, 29, 30, 26…
## $ KM <dbl> 31463, 43612, 32191, 23002, 34133, 18741, 34002, 21718, 25565,…
## $ Fuel <chr> "Petrol", "Petrol", "Petrol", "Petrol", "Petrol", "Petrol", "P…
## $ HP <dbl> 195, 195, 195, 195, 195, 195, 195, 113, 113, 113, 113, 113, 11…
## $ MC <dbl> 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ Color <chr> "Silver", "Red", "Red", "Black", "Grey", "Grey", "Grey", "Blue…
## $ Auto <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ CC <dbl> 1800, 1800, 1800, 1800, 1800, 1800, 1800, 1600, 1600, 1600, 16…
## $ Drs <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
## $ Cyl <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
## $ Grs <dbl> 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ Wght <dbl> 1189, 1189, 1189, 1189, 1189, 1189, 1189, 1109, 1069, 1109, 11…
## $ G_P <dbl> 10, 4, 4, 4, 4, 4, 4, 20, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ Mfr_G <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ ABS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Abag_1 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Abag_2 <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,…
## $ AC <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,…
## $ Comp <dbl> 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,…
## $ CD <dbl> 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,…
## $ Clock <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,…
## $ Pwin <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,…
## $ PStr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Radio <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ SpM <dbl> 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,…
## $ M_Rim <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,…
## $ Tow_Bar <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,…
# 1.3 Per-column summary of the raw data: quartiles and mean for numeric
# columns, length and class for character columns
summary(data)
## Price Age KM Fuel
## Min. : 4400 Min. : 5.00 Min. : 3 Length:1367
## 1st Qu.: 8500 1st Qu.:47.00 1st Qu.: 43002 Class :character
## Median : 9945 Median :62.00 Median : 64002 Mode :character
## Mean :10682 Mean :57.66 Mean : 68606
## 3rd Qu.:11800 3rd Qu.:71.00 3rd Qu.: 87181
## Max. :32550 Max. :81.00 Max. :232942
## HP MC Color Auto
## Min. : 72.0 Min. :0.0000 Length:1367 Min. :0.00000
## 1st Qu.: 89.0 1st Qu.:0.0000 Class :character 1st Qu.:0.00000
## Median :113.0 Median :1.0000 Mode :character Median :0.00000
## Mean :104.5 Mean :0.6715 Mean :0.05779
## 3rd Qu.:113.0 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :195.0 Max. :1.0000 Max. :1.00000
## CC Drs Cyl Grs Wght
## Min. : 1300 Min. :2.000 Min. :3 Min. :3.000 Min. :1004
## 1st Qu.: 1400 1st Qu.:3.000 1st Qu.:3 1st Qu.:5.000 1st Qu.:1044
## Median : 1600 Median :4.000 Median :3 Median :5.000 Median :1069
## Mean : 1575 Mean :4.037 Mean :3 Mean :5.028 Mean :1075
## 3rd Qu.: 1600 3rd Qu.:5.000 3rd Qu.:3 3rd Qu.:5.000 3rd Qu.:1089
## Max. :16000 Max. :5.000 Max. :3 Max. :6.000 Max. :1619
## G_P Mfr_G ABS Abag_1
## Min. : 4.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 4.000 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:1.0000
## Median : 4.000 Median :1.0000 Median :1.0000 Median :1.0000
## Mean : 5.251 Mean :0.8969 Mean :0.8127 Mean :0.9715
## 3rd Qu.: 4.000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :20.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Abag_2 AC Comp CD
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.7242 Mean :0.0556 Mean :0.2816 Mean :0.2114
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Clock Pwin PStr Radio
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.5743 Mean :0.5552 Mean :0.9773 Mean :0.1456
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## SpM M_Rim Tow_Bar
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.3043 Mean :0.1997 Mean :0.2802
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
# ---------------------------------------------------------
# 2. Basic Cleaning & Renaming
# We first make column names syntactically safe, then check for missing values and drop any rows that contain them.
# ---------------------------------------------------------
# 2.1 Work on a copy of the raw data whose column names are valid R
# identifiers (make.names() rewrites spaces and special characters)
df <- setNames(data, make.names(names(data)))
# Print the resulting names once as a sanity check
names(df)
## [1] "Price" "Age" "KM" "Fuel" "HP" "MC" "Color"
## [8] "Auto" "CC" "Drs" "Cyl" "Grs" "Wght" "G_P"
## [15] "Mfr_G" "ABS" "Abag_1" "Abag_2" "AC" "Comp" "CD"
## [22] "Clock" "Pwin" "PStr" "Radio" "SpM" "M_Rim" "Tow_Bar"
# 2.2 Count the missing values in each column
vapply(df, function(column) sum(is.na(column)), numeric(1))
## Price Age KM Fuel HP MC Color Auto CC Drs
## 0 0 0 0 0 0 0 0 0 0
## Cyl Grs Wght G_P Mfr_G ABS Abag_1 Abag_2 AC Comp
## 0 0 0 0 0 0 0 0 0 0
## CD Clock Pwin PStr Radio SpM M_Rim Tow_Bar
## 0 0 0 0 0 0 0 0
# Remove rows with any missing values / NA
# (none in this dataset, but kept for robustness)
# na.omit() keeps only the rows in which every column is non-missing.
df <- na.omit(df)
# ---------------------------------------------------------
# 3. Exploratory Data Analysis (EDA)
# We examine correlations among numerical variables to understand how price relates to age, mileage, horsepower, engine size, and weight.
# The scatter plot below shows how price varies with horsepower, with a fitted linear trend.
# ---------------------------------------------------------
# 3.1 Summary statistics
# Same per-column summary as for the raw data, now on the cleaned copy `df`.
summary(df)
## Price Age KM Fuel
## Min. : 4400 Min. : 5.00 Min. : 3 Length:1367
## 1st Qu.: 8500 1st Qu.:47.00 1st Qu.: 43002 Class :character
## Median : 9945 Median :62.00 Median : 64002 Mode :character
## Mean :10682 Mean :57.66 Mean : 68606
## 3rd Qu.:11800 3rd Qu.:71.00 3rd Qu.: 87181
## Max. :32550 Max. :81.00 Max. :232942
## HP MC Color Auto
## Min. : 72.0 Min. :0.0000 Length:1367 Min. :0.00000
## 1st Qu.: 89.0 1st Qu.:0.0000 Class :character 1st Qu.:0.00000
## Median :113.0 Median :1.0000 Mode :character Median :0.00000
## Mean :104.5 Mean :0.6715 Mean :0.05779
## 3rd Qu.:113.0 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :195.0 Max. :1.0000 Max. :1.00000
## CC Drs Cyl Grs Wght
## Min. : 1300 Min. :2.000 Min. :3 Min. :3.000 Min. :1004
## 1st Qu.: 1400 1st Qu.:3.000 1st Qu.:3 1st Qu.:5.000 1st Qu.:1044
## Median : 1600 Median :4.000 Median :3 Median :5.000 Median :1069
## Mean : 1575 Mean :4.037 Mean :3 Mean :5.028 Mean :1075
## 3rd Qu.: 1600 3rd Qu.:5.000 3rd Qu.:3 3rd Qu.:5.000 3rd Qu.:1089
## Max. :16000 Max. :5.000 Max. :3 Max. :6.000 Max. :1619
## G_P Mfr_G ABS Abag_1
## Min. : 4.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 4.000 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:1.0000
## Median : 4.000 Median :1.0000 Median :1.0000 Median :1.0000
## Mean : 5.251 Mean :0.8969 Mean :0.8127 Mean :0.9715
## 3rd Qu.: 4.000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :20.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Abag_2 AC Comp CD
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.7242 Mean :0.0556 Mean :0.2816 Mean :0.2114
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Clock Pwin PStr Radio
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.5743 Mean :0.5552 Mean :0.9773 Mean :0.1456
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## SpM M_Rim Tow_Bar
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.3043 Mean :0.1997 Mean :0.2802
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
# 3.2 Correlation plot for numeric variables
# Select the numeric columns by name.
numeric_cols <- names(df)[vapply(df, is.numeric, logical(1))]
numeric_vars <- df[numeric_cols]
# Drop constant columns before correlating: their standard deviation is zero,
# so cor() would warn and fill their row and column with NA. In this dataset
# Cyl is 3 for every car.
varying_cols <- numeric_cols[vapply(
  numeric_vars,
  function(v) isTRUE(sd(v, na.rm = TRUE) > 0),
  logical(1)
)]
numeric_vars <- numeric_vars[varying_cols]
corr_matrix <- cor(numeric_vars)
ggcorrplot(
  corr_matrix,
  type = "lower",
  lab = FALSE, # set TRUE if you want numeric values
  title = "Correlation Plot for numeric variables",
  colors = c("darkred", "white", "darkblue"),
  outline.color = "gray80",
  tl.col = "black",
  tl.srt = 45,
  tl.cex = 12 / max(1, ncol(corr_matrix) / 14) # auto-scale text a bit
)

# 3.3 Example scatterplot: Price vs HP (Horsepower)
# Semi-transparent points with a red linear-model trend line and no
# confidence band.
ggplot(data = df, mapping = aes(x = HP, y = Price)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  ggtitle("ScatterPlot of Price vs Horsepower") +
  xlab("Horsepower (HP)") +
  ylab("Price (CA$)")

# ---------------------------------------------------------
# 4. Transform / Encode Variables
# We treat key variables as categorical where appropriate. We convert categorical variables into dummy/indicator variables using `caret::dummyVars()` and create a fully numeric dataset for modeling.
# ---------------------------------------------------------
# 4.1 Treat some variables as factors (categorical)
# Inspect the level counts of Fuel and Color before converting:
table(df[["Fuel"]])
##
## Diesel Petrol
## 142 1225
table(df[["Color"]])
##
## Black Blue Green Grey Red Silver White Yellow
## 179 266 215 290 267 120 27 3
# Convert both character columns to factors
for (cat_col in c("Fuel", "Color")) {
  df[[cat_col]] <- as.factor(df[[cat_col]])
}
# 4.2 Ensure binary indicator columns are numeric 0/1
binary_cols <- c(
  "MC", "Auto", "Mfr_G", "ABS", "Abag_1", "Abag_2",
  "AC", "Comp", "CD", "Clock", "Pwin", "PStr",
  "Radio", "SpM", "M_Rim", "Tow_Bar"
)
# Only columns actually present in df are touched; absent names are skipped.
for (flag_col in intersect(binary_cols, names(df))) {
  df[[flag_col]] <- as.numeric(df[[flag_col]])
}
# 4.3 Create dummy variables for categorical features (Fuel, Color)
# Numeric columns pass through unchanged. Every factor level gets its own
# indicator column, so the indicators of one factor always sum to 1.
dummy_obj <- dummyVars(" ~ .", data = df)
encoded_matrix <- predict(dummy_obj, newdata = df)
automobile <- data.frame(encoded_matrix)
# Check structure
glimpse(automobile)
## Rows: 1,367
## Columns: 36
## $ Price <dbl> 21000, 20000, 19650, 21550, 22550, 22050, 22800, 18000, 1…
## $ Age <dbl> 26, 23, 26, 32, 33, 29, 31, 25, 25, 31, 31, 30, 29, 29, 3…
## $ KM <dbl> 31463, 43612, 32191, 23002, 34133, 18741, 34002, 21718, 2…
## $ Fuel.Diesel <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Fuel.Petrol <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ HP <dbl> 195, 195, 195, 195, 195, 195, 195, 113, 113, 113, 113, 11…
## $ MC <dbl> 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, …
## $ Color.Black <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, …
## $ Color.Blue <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, …
## $ Color.Green <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Color.Grey <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, …
## $ Color.Red <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Color.Silver <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Color.White <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Color.Yellow <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Auto <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ CC <dbl> 1800, 1800, 1800, 1800, 1800, 1800, 1800, 1600, 1600, 160…
## $ Drs <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ Cyl <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ Grs <dbl> 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ Wght <dbl> 1189, 1189, 1189, 1189, 1189, 1189, 1189, 1109, 1069, 110…
## $ G_P <dbl> 10, 4, 4, 4, 4, 4, 4, 20, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ Mfr_G <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ABS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Abag_1 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Abag_2 <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ AC <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Comp <dbl> 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ CD <dbl> 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, …
## $ Clock <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Pwin <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ PStr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Radio <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ SpM <dbl> 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ M_Rim <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, …
## $ Tow_Bar <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, …
# Save the encoded dataset; the relative path resolves against the current
# working directory, which is printed right after.
write_xlsx(x = automobile, path = "cleaned_dataset.xlsx")
getwd()
## [1] "C:/Users/divya/OneDrive/Projects/Capstone/Final"
# ---------------------------------------------------------
# 5. Train / Test Split
# We split the encoded dataset into a training set (70%) and a validation set (30%) to evaluate model performance out-of-sample.
# ---------------------------------------------------------
# Fixed seed so the partition is reproducible
set.seed(123)
# Price is the target; 70% of the rows go to training, the rest to validation
index <- createDataPartition(y = automobile[["Price"]], p = 0.7, list = FALSE)
valid <- automobile[-index, ]
train <- automobile[index, ]
# Report the number of rows in each set
nrow(train)
nrow(valid)
## [1] 959
## [1] 408
# ---------------------------------------------------------
# 6. Model 1 – Linear Regression
# We fit a multiple linear regression model using all predictors and evaluate performance using RMSE, MAE, and R² on the validation set.
# ---------------------------------------------------------
# Fit ordinary least squares with Price as the response and every other
# column of `train` as a predictor.
# In the recorded summary, three coefficients are NA ("not defined because of
# singularities"): Fuel.Petrol, Color.Yellow and Cyl. The Fuel and Color
# indicator sets each sum to 1 and Cyl is constant, so all three are exact
# linear combinations of the intercept and the other columns.
lm_model <- lm(Price ~ ., data = train)
summary(lm_model)
##
## Call:
## lm(formula = Price ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6337.0 -717.2 27.1 699.1 6417.9
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.363e+03 2.201e+03 1.073 0.28346
## Age -1.187e+02 4.401e+00 -26.968 < 2e-16 ***
## KM -1.733e-02 1.488e-03 -11.648 < 2e-16 ***
## Fuel.Diesel 5.337e+02 2.610e+02 2.045 0.04110 *
## Fuel.Petrol NA NA NA NA
## HP 2.625e+01 4.117e+00 6.376 2.86e-10 ***
## MC -6.867e+01 9.213e+01 -0.745 0.45629
## Color.Black -5.314e+02 1.211e+03 -0.439 0.66094
## Color.Blue -6.076e+02 1.210e+03 -0.502 0.61576
## Color.Green -8.718e+02 1.211e+03 -0.720 0.47176
## Color.Grey -4.414e+02 1.210e+03 -0.365 0.71534
## Color.Red -7.264e+02 1.212e+03 -0.599 0.54914
## Color.Silver -4.325e+02 1.215e+03 -0.356 0.72198
## Color.White -1.550e+03 1.244e+03 -1.246 0.21309
## Color.Yellow NA NA NA NA
## Auto 4.281e+02 1.801e+02 2.378 0.01763 *
## CC -6.840e-02 8.461e-02 -0.808 0.41906
## Drs 7.378e+01 4.710e+01 1.567 0.11757
## Cyl NA NA NA NA
## Grs 1.844e+02 2.565e+02 0.719 0.47228
## Wght 1.092e+01 1.348e+00 8.096 1.78e-15 ***
## G_P 5.665e+01 1.311e+01 4.320 1.73e-05 ***
## Mfr_G 7.669e+02 1.457e+02 5.265 1.74e-07 ***
## ABS -4.656e+02 1.462e+02 -3.185 0.00149 **
## Abag_1 1.439e+02 3.064e+02 0.470 0.63875
## Abag_2 2.105e+02 1.418e+02 1.484 0.13819
## AC 2.499e+03 2.082e+02 12.004 < 2e-16 ***
## Comp -2.528e+02 1.372e+02 -1.843 0.06566 .
## CD 2.516e+02 1.161e+02 2.167 0.03048 *
## Clock 3.140e+01 1.615e+02 0.194 0.84588
## Pwin 3.559e+02 1.613e+02 2.207 0.02755 *
## PStr 1.050e+02 3.364e+02 0.312 0.75502
## Radio -9.620e+01 1.212e+02 -0.794 0.42736
## SpM 1.735e+02 9.673e+01 1.793 0.07323 .
## M_Rim 1.109e+02 1.120e+02 0.990 0.32232
## Tow_Bar -9.873e+01 9.293e+01 -1.062 0.28833
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1195 on 926 degrees of freedom
## Multiple R-squared: 0.8873, Adjusted R-squared: 0.8834
## F-statistic: 227.8 on 32 and 926 DF, p-value: < 2.2e-16
# Predictions on validation set
pred_lm <- predict(lm_model, newdata = valid)
# Evaluate: compare predictions with the held-out prices.
# NOTE(review): RMSE(), MAE() and R2() are not defined in this file —
# presumably caret's helpers (caret's R2 defaults to the squared correlation);
# confirm against the library() calls.
RMSE_LM <- RMSE(pred_lm, valid$Price)
MAE_LM <- MAE(pred_lm, valid$Price)
R2_LM <- R2(pred_lm, valid$Price)
# Print the three metrics, one per line
cat("Linear Regression:\n")
## Linear Regression:
cat(" RMSE:", RMSE_LM, "\n")
## RMSE: 1167.974
cat(" MAE :", MAE_LM, "\n")
## MAE : 908.2309
cat(" R2 :", R2_LM, "\n\n")
## R2 : 0.9001726
# Plot the linear regression: actual vs predicted prices on the validation set
# The two vectors are gathered into a data frame so aes() maps columns by name
# instead of reaching into outside objects with `$`.
lm_fit_df <- data.frame(
  actual = valid$Price,
  predicted = pred_lm,
  row.names = NULL
)
plot_lm <- ggplot(lm_fit_df, aes(x = actual, y = predicted)) +
  geom_point(color = "purple", alpha = 0.6, size = 2) +
  # Dashed identity line: a point on it was predicted exactly
  geom_abline(
    intercept = 0, slope = 1,
    color = "red", linetype = "dashed", size = 1
  ) +
  labs(
    title = "Linear Regression for Actual vs Predicted Prices",
    x = "Actual Price",
    y = "Predicted Price"
  ) +
  theme_minimal(base_size = 12)
print(plot_lm)

# ---------------------------------------------------------
# 7. Model 2 – Decision Tree
# We build a regression tree using `rpart`, visualize the tree, and compute the same evaluation metrics.
# ---------------------------------------------------------
# Fit a regression tree (method = "anova") for Price on all other columns of
# `train`, using rpart's default control settings.
tree_model <- rpart(Price ~ ., data = train, method = "anova")
# Plot the decision tree
# Node-label callback passed to rpart.plot() as `node.fun`.
# Returns one label per row of x$frame: the node's `yval` rounded to a whole
# number with thousands separators, then on a second line the node's `n` as a
# percentage of the first row's `n` (the root in an rpart frame).
# `labs`, `digits` and `varlen` belong to the callback signature but are unused.
my_node_lab <- function(x, labs, digits, varlen) {
  # trim = TRUE stops format() from left-padding shorter values to a common
  # width, which would put leading spaces into some labels.
  node_value <- format(round(x$frame$yval, 0), big.mark = ",", trim = TRUE)
  node_share <- round(100 * x$frame$n / x$frame$n[1], 1)
  paste0(node_value, "\n", node_share, "%")
}
# Strongly prefer fixed over scientific notation when numbers are printed
# (this setting stays in effect for the rest of the session)
options(scipen = 999)
# Draw the fitted tree with the custom node labels defined above
rpart.plot(
  tree_model,
  node.fun = my_node_lab,
  nn = TRUE,
  fallen.leaves = TRUE,
  box.palette = "Blues",
  shadow.col = "gray",
  main = "Decision Tree for Automobile Price"
)

# Predictions of the regression tree on the validation set
pred_tree <- predict(tree_model, newdata = valid)
# Evaluate with the same three metrics used for the linear model, so the
# numbers are directly comparable.
RMSE_TREE <- RMSE(pred_tree, valid$Price)
MAE_TREE <- MAE(pred_tree, valid$Price)
R2_TREE <- R2(pred_tree, valid$Price)
# Print the three metrics, one per line
cat("Decision Tree:\n")
## Decision Tree:
cat(" RMSE:", RMSE_TREE, "\n")
## RMSE: 1409.776
cat(" MAE :", MAE_TREE, "\n")
## MAE : 1024.642
cat(" R2 :", R2_TREE, "\n\n")
## R2 : 0.8540996
# ---------------------------------------------------------
# 8. Model 3 – Neural Network
# We construct a feed-forward neural network using `neuralnet`, visualize the architecture, and evaluate its predictive accuracy.
# ---------------------------------------------------------
# 8.1 Build formula (Price as target, all others as predictors)
features_nn <- setdiff(names(train), "Price")
f_nn <- as.formula(paste("Price ~", paste(features_nn, collapse = " + ")))
# 8.2 Scale the data, then train the Neural Network
# Trained on the raw columns (Price in the thousands, KM up to ~230,000) the
# network did not learn: the recorded run had RMSE 3686.845 and R2 = NA.
# Every column is therefore min-max scaled with ranges taken from the
# training set only, so no validation information leaks into training.
# NOTE: the `##` output recorded after this section and in the model
# comparison table predates this change; re-run the script to refresh it.
train_min <- vapply(train, min, numeric(1))
train_span <- vapply(train, max, numeric(1)) - train_min
# Constant columns (Cyl is always 3) have zero range; divide by 1 instead so
# they scale to 0 rather than NaN.
train_span[train_span == 0] <- 1
# Apply the training-set scaling to any data frame with the same columns.
scale_for_nn <- function(d) {
  scaled <- scale(d[, names(train_min)], center = train_min, scale = train_span)
  as.data.frame(scaled)
}
train_nn <- scale_for_nn(train)
valid_nn <- scale_for_nn(valid)
# The starting weights are random; fix the seed so the fit is reproducible.
set.seed(123)
# You can experiment with the hidden layout, e.g. hidden = c(3, 2).
# If neuralnet() warns that the algorithm did not converge, raise `stepmax`.
nn_model <- neuralnet(f_nn,
  data = train_nn,
  hidden = c(3, 2),
  linear.output = TRUE
)
# Plot the network
par(mar = c(1, 1, 3, 1)) # smaller margins so the plot uses space better
plot(
  nn_model,
  rep = "best", # best repetition
  information = FALSE, # hide info box
  col.in = "orange", # input nodes
  col.hidden = "blue", # hidden layer nodes
  col.out = "green", # output node
  cex = 0.6, # slightly smaller labels
  main = "Neural Network Architecture for Automobile Price"
)

# 8.3 Predictions
# The network outputs scaled prices; map them back to the original price
# units before computing any metric.
nn_pred_raw <- compute(nn_model, valid_nn[, features_nn])$net.result
pred_nn <- as.vector(nn_pred_raw) * train_span[["Price"]] + train_min[["Price"]]
# Evaluate with the same three metrics used for the other models.
# NOTE(review): in the recorded run R2 printed NA. A correlation-based R2 is
# undefined when the predictions have zero variance — presumably the network
# returned a constant; confirm by inspecting pred_nn.
RMSE_NN <- RMSE(pred_nn, valid$Price)
MAE_NN <- MAE(pred_nn, valid$Price)
R2_NN <- R2(pred_nn, valid$Price)
# Print the three metrics, one per line
cat("Neural Network:\n")
## Neural Network:
cat(" RMSE:", RMSE_NN, "\n")
## RMSE: 3686.845
cat(" MAE :", MAE_NN, "\n")
## MAE : 2563.663
cat(" R2 :", R2_NN, "\n\n")
## R2 : NA
# ---------------------------------------------------------
# 9. Compare Models
# Finally, we compare the three models using RMSE, MAE, and R² on the validation set.
# ---------------------------------------------------------
# Collect the validation metrics of all three models, one row per model
model_labels <- c("Linear Regression", "Decision Tree", "Neural Network")
results <- data.frame(
  Model = model_labels,
  RMSE = c(RMSE_LM, RMSE_TREE, RMSE_NN),
  MAE = c(MAE_LM, MAE_TREE, MAE_NN),
  R2 = c(R2_LM, R2_TREE, R2_NN)
)
print(results)
## Model RMSE MAE R2
## 1 Linear Regression 1167.974 908.2309 0.9001726
## 2 Decision Tree 1409.776 1024.6423 0.8540996
## 3 Neural Network 3686.845 2563.6625 NA