# ---------------------------------------------------------
# 1. Load the Data

# In this section, we load the original automobile dataset from Excel and briefly inspect its structure. The dataset contains information such as price, age, mileage, fuel type, color, engine size, and multiple binary equipment features.
# ---------------------------------------------------------

# 1.1 Read the Excel file (original case data)
# The workbook location is held in one variable so it is easy to repoint.
# NOTE(review): read_excel() and glimpse() are called without any visible
# library() call in this file -- confirm the packages that provide them
# (presumably readxl and dplyr) are attached in a setup chunk.
data_path <- "C:/Users/divya/OneDrive/Projects/Capstone/W28593-XLS-ENG.xlsx"
data <- read_excel(path = data_path)

# 1.2 Quick structure check: one line per column with its type and first values
glimpse(data)
## Rows: 1,367
## Columns: 28
## $ Price   <dbl> 21000, 20000, 19650, 21550, 22550, 22050, 22800, 18000, 16800,…
## $ Age     <dbl> 26, 23, 26, 32, 33, 29, 31, 25, 25, 31, 31, 30, 29, 29, 30, 26…
## $ KM      <dbl> 31463, 43612, 32191, 23002, 34133, 18741, 34002, 21718, 25565,…
## $ Fuel    <chr> "Petrol", "Petrol", "Petrol", "Petrol", "Petrol", "Petrol", "P…
## $ HP      <dbl> 195, 195, 195, 195, 195, 195, 195, 113, 113, 113, 113, 113, 11…
## $ MC      <dbl> 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ Color   <chr> "Silver", "Red", "Red", "Black", "Grey", "Grey", "Grey", "Blue…
## $ Auto    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ CC      <dbl> 1800, 1800, 1800, 1800, 1800, 1800, 1800, 1600, 1600, 1600, 16…
## $ Drs     <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
## $ Cyl     <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
## $ Grs     <dbl> 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ Wght    <dbl> 1189, 1189, 1189, 1189, 1189, 1189, 1189, 1109, 1069, 1109, 11…
## $ G_P     <dbl> 10, 4, 4, 4, 4, 4, 4, 20, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ Mfr_G   <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ ABS     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Abag_1  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Abag_2  <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,…
## $ AC      <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,…
## $ Comp    <dbl> 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,…
## $ CD      <dbl> 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,…
## $ Clock   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,…
## $ Pwin    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,…
## $ PStr    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Radio   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ SpM     <dbl> 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,…
## $ M_Rim   <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,…
## $ Tow_Bar <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,…
# 1.3 Per-column summary statistics of the raw data (recorded output follows)
summary(data)
##      Price            Age              KM             Fuel          
##  Min.   : 4400   Min.   : 5.00   Min.   :     3   Length:1367       
##  1st Qu.: 8500   1st Qu.:47.00   1st Qu.: 43002   Class :character  
##  Median : 9945   Median :62.00   Median : 64002   Mode  :character  
##  Mean   :10682   Mean   :57.66   Mean   : 68606                     
##  3rd Qu.:11800   3rd Qu.:71.00   3rd Qu.: 87181                     
##  Max.   :32550   Max.   :81.00   Max.   :232942                     
##        HP              MC            Color                Auto        
##  Min.   : 72.0   Min.   :0.0000   Length:1367        Min.   :0.00000  
##  1st Qu.: 89.0   1st Qu.:0.0000   Class :character   1st Qu.:0.00000  
##  Median :113.0   Median :1.0000   Mode  :character   Median :0.00000  
##  Mean   :104.5   Mean   :0.6715                      Mean   :0.05779  
##  3rd Qu.:113.0   3rd Qu.:1.0000                      3rd Qu.:0.00000  
##  Max.   :195.0   Max.   :1.0000                      Max.   :1.00000  
##        CC             Drs             Cyl         Grs             Wght     
##  Min.   : 1300   Min.   :2.000   Min.   :3   Min.   :3.000   Min.   :1004  
##  1st Qu.: 1400   1st Qu.:3.000   1st Qu.:3   1st Qu.:5.000   1st Qu.:1044  
##  Median : 1600   Median :4.000   Median :3   Median :5.000   Median :1069  
##  Mean   : 1575   Mean   :4.037   Mean   :3   Mean   :5.028   Mean   :1075  
##  3rd Qu.: 1600   3rd Qu.:5.000   3rd Qu.:3   3rd Qu.:5.000   3rd Qu.:1089  
##  Max.   :16000   Max.   :5.000   Max.   :3   Max.   :6.000   Max.   :1619  
##       G_P             Mfr_G             ABS             Abag_1      
##  Min.   : 4.000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 4.000   1st Qu.:1.0000   1st Qu.:1.0000   1st Qu.:1.0000  
##  Median : 4.000   Median :1.0000   Median :1.0000   Median :1.0000  
##  Mean   : 5.251   Mean   :0.8969   Mean   :0.8127   Mean   :0.9715  
##  3rd Qu.: 4.000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :20.000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      Abag_2             AC              Comp              CD        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.7242   Mean   :0.0556   Mean   :0.2816   Mean   :0.2114  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      Clock             Pwin             PStr            Radio       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :1.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.5743   Mean   :0.5552   Mean   :0.9773   Mean   :0.1456  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       SpM             M_Rim           Tow_Bar      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.3043   Mean   :0.1997   Mean   :0.2802  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000
# ---------------------------------------------------------
# 2. Basic Cleaning & Renaming

# We first make the column names syntactically safe, then check for (and drop) rows with missing values.
# ---------------------------------------------------------

# 2.1 Make safe column names (no spaces, no special characters)
# Work on a copy so the raw `data` object stays untouched.
df <- data
# unique = TRUE: make.names() can map two different headers to the same
# syntactic name (e.g. "A B" and "A.B" both become "A.B"); this appends a
# numeric suffix to later duplicates so every column stays addressable.
names(df) <- make.names(names(df), unique = TRUE)

# Check names once
names(df)
##  [1] "Price"   "Age"     "KM"      "Fuel"    "HP"      "MC"      "Color"  
##  [8] "Auto"    "CC"      "Drs"     "Cyl"     "Grs"     "Wght"    "G_P"    
## [15] "Mfr_G"   "ABS"     "Abag_1"  "Abag_2"  "AC"      "Comp"    "CD"     
## [22] "Clock"   "Pwin"    "PStr"    "Radio"   "SpM"     "M_Rim"   "Tow_Bar"
# 2.2 Count the missing values in each column
na_per_column <- colSums(is.na(df))
na_per_column
##   Price     Age      KM    Fuel      HP      MC   Color    Auto      CC     Drs 
##       0       0       0       0       0       0       0       0       0       0 
##     Cyl     Grs    Wght     G_P   Mfr_G     ABS  Abag_1  Abag_2      AC    Comp 
##       0       0       0       0       0       0       0       0       0       0 
##      CD   Clock    Pwin    PStr   Radio     SpM   M_Rim Tow_Bar 
##       0       0       0       0       0       0       0       0
# Drop every row that contains at least one NA
# (the recorded counts are all zero here, so this is only a safeguard)
df <- na.omit(df)
# ---------------------------------------------------------
# 3. Exploratory Data Analysis (EDA)

# We examine correlations among numerical variables to understand how price relates to age, mileage, horsepower, engine size, and weight.

# The scatter plot below shows how price varies with horsepower, with a fitted linear trend.
# ---------------------------------------------------------

# 3.1 Summary statistics
# The recorded output below matches the summary of the raw `data` in section 1,
# consistent with the missing-value check having found nothing to drop.
summary(df)
##      Price            Age              KM             Fuel          
##  Min.   : 4400   Min.   : 5.00   Min.   :     3   Length:1367       
##  1st Qu.: 8500   1st Qu.:47.00   1st Qu.: 43002   Class :character  
##  Median : 9945   Median :62.00   Median : 64002   Mode  :character  
##  Mean   :10682   Mean   :57.66   Mean   : 68606                     
##  3rd Qu.:11800   3rd Qu.:71.00   3rd Qu.: 87181                     
##  Max.   :32550   Max.   :81.00   Max.   :232942                     
##        HP              MC            Color                Auto        
##  Min.   : 72.0   Min.   :0.0000   Length:1367        Min.   :0.00000  
##  1st Qu.: 89.0   1st Qu.:0.0000   Class :character   1st Qu.:0.00000  
##  Median :113.0   Median :1.0000   Mode  :character   Median :0.00000  
##  Mean   :104.5   Mean   :0.6715                      Mean   :0.05779  
##  3rd Qu.:113.0   3rd Qu.:1.0000                      3rd Qu.:0.00000  
##  Max.   :195.0   Max.   :1.0000                      Max.   :1.00000  
##        CC             Drs             Cyl         Grs             Wght     
##  Min.   : 1300   Min.   :2.000   Min.   :3   Min.   :3.000   Min.   :1004  
##  1st Qu.: 1400   1st Qu.:3.000   1st Qu.:3   1st Qu.:5.000   1st Qu.:1044  
##  Median : 1600   Median :4.000   Median :3   Median :5.000   Median :1069  
##  Mean   : 1575   Mean   :4.037   Mean   :3   Mean   :5.028   Mean   :1075  
##  3rd Qu.: 1600   3rd Qu.:5.000   3rd Qu.:3   3rd Qu.:5.000   3rd Qu.:1089  
##  Max.   :16000   Max.   :5.000   Max.   :3   Max.   :6.000   Max.   :1619  
##       G_P             Mfr_G             ABS             Abag_1      
##  Min.   : 4.000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 4.000   1st Qu.:1.0000   1st Qu.:1.0000   1st Qu.:1.0000  
##  Median : 4.000   Median :1.0000   Median :1.0000   Median :1.0000  
##  Mean   : 5.251   Mean   :0.8969   Mean   :0.8127   Mean   :0.9715  
##  3rd Qu.: 4.000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :20.000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      Abag_2             AC              Comp              CD        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.7242   Mean   :0.0556   Mean   :0.2816   Mean   :0.2114  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      Clock             Pwin             PStr            Radio       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :1.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.5743   Mean   :0.5552   Mean   :0.9773   Mean   :0.1456  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       SpM             M_Rim           Tow_Bar      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.3043   Mean   :0.1997   Mean   :0.2802  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000
# 3.2 Correlation plot for numeric variables
# Keep only the numeric columns, then drop any column with zero variance.
# The recorded summary shows Cyl equal to 3 in every row; cor() returns NA
# (with a warning) for a constant column, which would leave an empty row and
# column in the plot.
is_num <- vapply(df, is.numeric, logical(1))
numeric_vars <- df[is_num]
has_spread <- vapply(
  numeric_vars,
  function(v) isTRUE(stats::sd(v) > 0),
  logical(1)
)
numeric_vars <- numeric_vars[has_spread]
corr_matrix <- cor(numeric_vars)

ggcorrplot(
  corr_matrix,
  type      = "lower",
  lab       = FALSE,   # set TRUE if you want numeric values
  title = "Correlation Plot for numeric variables",
  colors    = c("darkred", "white", "darkblue"),
  outline.color = "gray80",
  tl.col    = "black",
  tl.srt    = 45,
  tl.cex    = 12 / max(1, ncol(corr_matrix) / 14)  # shrink labels as columns grow
)

# 3.3 Example scatterplot: Price vs HP (Horsepower)
# Semi-transparent points with a straight-line (lm) fit drawn in red and no
# confidence band.
ggplot(data = df, mapping = aes(x = HP, y = Price)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("ScatterPlot of Price vs Horsepower") +
  xlab("Horsepower (HP)") +
  ylab("Price (CA$)")

# ---------------------------------------------------------
# 4. Transform / Encode Variables

# We treat key variables as categorical where appropriate. We convert categorical variables into dummy/indicator variables using `caret::dummyVars()` and create a fully numeric dataset for modeling.
# ---------------------------------------------------------

# 4.1 Treat some variables as factors (categorical)
# Inspect the level counts of Fuel and Color before converting them:
table(df[["Fuel"]])
## 
## Diesel Petrol 
##    142   1225
table(df[["Color"]])
## 
##  Black   Blue  Green   Grey    Red Silver  White Yellow 
##    179    266    215    290    267    120     27      3
categorical_cols <- c("Fuel", "Color")
df[categorical_cols] <- lapply(df[categorical_cols], as.factor)

# 4.2 Ensure the binary indicator columns are stored as numeric
binary_cols <- c("MC", "Auto", "Mfr_G", "ABS", "Abag_1", "Abag_2",
                 "AC", "Comp", "CD", "Clock", "Pwin", "PStr",
                 "Radio", "SpM", "M_Rim", "Tow_Bar")

# Only convert the indicator columns that are actually present in df
for (col in intersect(binary_cols, names(df))) {
  df[[col]] <- as.numeric(df[[col]])
}

# 4.3 Create dummy variables for categorical features (Fuel, Color)
# Every column of df goes through the encoder; the factor columns are expanded
# into one indicator column per level and the result is stored as a data frame.
# NOTE(review): dummyVars() is called without fullRank, so each factor gets a
# column for every level (the recorded output shows both Fuel.Diesel and
# Fuel.Petrol). The linear model summary further down reports coefficients
# "not defined because of singularities"; consider fullRank = TRUE if that
# redundancy is unwanted.
dummy_obj <- dummyVars(formula = " ~ .", data = df)
dummy_matrix <- predict(dummy_obj, newdata = df)
automobile <- data.frame(dummy_matrix)

# Check structure
glimpse(automobile)
## Rows: 1,367
## Columns: 36
## $ Price        <dbl> 21000, 20000, 19650, 21550, 22550, 22050, 22800, 18000, 1…
## $ Age          <dbl> 26, 23, 26, 32, 33, 29, 31, 25, 25, 31, 31, 30, 29, 29, 3…
## $ KM           <dbl> 31463, 43612, 32191, 23002, 34133, 18741, 34002, 21718, 2…
## $ Fuel.Diesel  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Fuel.Petrol  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ HP           <dbl> 195, 195, 195, 195, 195, 195, 195, 113, 113, 113, 113, 11…
## $ MC           <dbl> 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, …
## $ Color.Black  <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, …
## $ Color.Blue   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, …
## $ Color.Green  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Color.Grey   <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, …
## $ Color.Red    <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Color.Silver <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Color.White  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Color.Yellow <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Auto         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ CC           <dbl> 1800, 1800, 1800, 1800, 1800, 1800, 1800, 1600, 1600, 160…
## $ Drs          <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ Cyl          <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ Grs          <dbl> 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ Wght         <dbl> 1189, 1189, 1189, 1189, 1189, 1189, 1189, 1109, 1069, 110…
## $ G_P          <dbl> 10, 4, 4, 4, 4, 4, 4, 20, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ Mfr_G        <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ABS          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Abag_1       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Abag_2       <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ AC           <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Comp         <dbl> 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ CD           <dbl> 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, …
## $ Clock        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Pwin         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ PStr         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Radio        <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ SpM          <dbl> 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ M_Rim        <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, …
## $ Tow_Bar      <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, …
# Save cleaned dataset
# The path is relative, so the file is written to the working directory that
# getwd() reports on the next line.
cleaned_path <- "cleaned_dataset.xlsx"
write_xlsx(automobile, path = cleaned_path)
getwd()
## [1] "C:/Users/divya/OneDrive/Projects/Capstone/Final"
# ---------------------------------------------------------
# 5. Train / Test Split

# We split the encoded dataset into a training set (70%) and a validation set (30%) to evaluate model performance out-of-sample.
# ---------------------------------------------------------

# Fix the RNG seed so the same partition is drawn on every run
set.seed(123)

# Price is the target; draw the training row indices (p = 0.7) as a matrix
index <- createDataPartition(y = automobile$Price, p = 0.7, list = FALSE)

# Rows in `index` form the training set, all remaining rows the validation set
train <- automobile[index, ]
valid <- automobile[-index, ]

# Row counts of the two sets
nrow(train)
nrow(valid)
## [1] 959
## [1] 408
# ---------------------------------------------------------
# 6. Model 1 – Linear Regression

# We fit a multiple linear regression model using all predictors and evaluate performance using RMSE, MAE, and R² on the validation set.
# ---------------------------------------------------------

# Regress Price on every other column of the training set
lm_model <- lm(formula = Price ~ ., data = train)

# Coefficient table and fit statistics (recorded output follows)
summary(lm_model)
## 
## Call:
## lm(formula = Price ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6337.0  -717.2    27.1   699.1  6417.9 
## 
## Coefficients: (3 not defined because of singularities)
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.363e+03  2.201e+03   1.073  0.28346    
## Age          -1.187e+02  4.401e+00 -26.968  < 2e-16 ***
## KM           -1.733e-02  1.488e-03 -11.648  < 2e-16 ***
## Fuel.Diesel   5.337e+02  2.610e+02   2.045  0.04110 *  
## Fuel.Petrol          NA         NA      NA       NA    
## HP            2.625e+01  4.117e+00   6.376 2.86e-10 ***
## MC           -6.867e+01  9.213e+01  -0.745  0.45629    
## Color.Black  -5.314e+02  1.211e+03  -0.439  0.66094    
## Color.Blue   -6.076e+02  1.210e+03  -0.502  0.61576    
## Color.Green  -8.718e+02  1.211e+03  -0.720  0.47176    
## Color.Grey   -4.414e+02  1.210e+03  -0.365  0.71534    
## Color.Red    -7.264e+02  1.212e+03  -0.599  0.54914    
## Color.Silver -4.325e+02  1.215e+03  -0.356  0.72198    
## Color.White  -1.550e+03  1.244e+03  -1.246  0.21309    
## Color.Yellow         NA         NA      NA       NA    
## Auto          4.281e+02  1.801e+02   2.378  0.01763 *  
## CC           -6.840e-02  8.461e-02  -0.808  0.41906    
## Drs           7.378e+01  4.710e+01   1.567  0.11757    
## Cyl                  NA         NA      NA       NA    
## Grs           1.844e+02  2.565e+02   0.719  0.47228    
## Wght          1.092e+01  1.348e+00   8.096 1.78e-15 ***
## G_P           5.665e+01  1.311e+01   4.320 1.73e-05 ***
## Mfr_G         7.669e+02  1.457e+02   5.265 1.74e-07 ***
## ABS          -4.656e+02  1.462e+02  -3.185  0.00149 ** 
## Abag_1        1.439e+02  3.064e+02   0.470  0.63875    
## Abag_2        2.105e+02  1.418e+02   1.484  0.13819    
## AC            2.499e+03  2.082e+02  12.004  < 2e-16 ***
## Comp         -2.528e+02  1.372e+02  -1.843  0.06566 .  
## CD            2.516e+02  1.161e+02   2.167  0.03048 *  
## Clock         3.140e+01  1.615e+02   0.194  0.84588    
## Pwin          3.559e+02  1.613e+02   2.207  0.02755 *  
## PStr          1.050e+02  3.364e+02   0.312  0.75502    
## Radio        -9.620e+01  1.212e+02  -0.794  0.42736    
## SpM           1.735e+02  9.673e+01   1.793  0.07323 .  
## M_Rim         1.109e+02  1.120e+02   0.990  0.32232    
## Tow_Bar      -9.873e+01  9.293e+01  -1.062  0.28833    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1195 on 926 degrees of freedom
## Multiple R-squared:  0.8873, Adjusted R-squared:  0.8834 
## F-statistic: 227.8 on 32 and 926 DF,  p-value: < 2.2e-16
# Predict Price for the held-out validation rows
pred_lm <- predict(lm_model, newdata = valid)

# Validation metrics: root mean squared error, mean absolute error and R2,
# each comparing the predictions with the observed validation prices
RMSE_LM <- RMSE(pred = pred_lm, obs = valid[["Price"]])
MAE_LM  <- MAE(pred = pred_lm, obs = valid[["Price"]])
R2_LM   <- R2(pred = pred_lm, obs = valid[["Price"]])

cat("Linear Regression:\n")
## Linear Regression:
cat("  RMSE:", RMSE_LM, "\n")
##   RMSE: 1167.974
cat("  MAE :", MAE_LM,  "\n")
##   MAE : 908.2309
cat("  R2  :", R2_LM,   "\n\n")
##   R2  : 0.9001726
# Plot actual vs predicted validation prices for the linear regression.
# The dashed red line is y = x: points on it are predicted exactly.
# NOTE(review): newer ggplot2 releases set line width with `linewidth` rather
# than `size`; `size` is kept as in the original -- confirm against the
# installed version.
lm_fit_df <- data.frame(actual = valid$Price, predicted = pred_lm)

plot_lm <- ggplot(lm_fit_df, aes(x = actual, y = predicted)) +
  geom_point(color = "purple", alpha = 0.6, size = 2) +
  geom_abline(intercept = 0, slope = 1,
              color = "red", linetype = "dashed", size = 1) +
  labs(
    title = "Linear Regression for Actual vs Predicted Prices",
    x = "Actual Price",
    y = "Predicted Price"
  ) +
  theme_minimal(base_size = 12)

print(plot_lm)

# ---------------------------------------------------------
# 7. Model 2 – Decision Tree

# We build a regression tree using `rpart`, visualize the tree, and compute the same evaluation metrics.
# ---------------------------------------------------------

# Regression tree (method = "anova") for Price on all other training columns
tree_model <- rpart(formula = Price ~ ., data = train, method = "anova")

# Plot the decision tree
# Node label callback passed to rpart.plot() as `node.fun`.
# Builds one two-line label per row of the tree's frame: the node's fitted
# value (rounded, with thousands separators) over the node's share of the
# observations in the first frame row (the root in an rpart fit).
#   x                    : the fitted tree being plotted; only x$frame is read
#   labs, digits, varlen : part of the callback signature, unused here
# Returns a character vector with one "value\npercent%" label per node.
my_node_lab <- function(x, labs, digits, varlen) {
  node_frame <- x$frame
  # trim = TRUE: format() otherwise left-pads every value to the width of the
  # widest one, which puts stray spaces in front of the shorter labels.
  value_lab <- format(round(node_frame$yval, 0), big.mark = ",", trim = TRUE)
  share_lab <- round(100 * node_frame$n / node_frame$n[1], 1)
  paste0(value_lab, "\n", share_lab, "%")
}

# Strongly prefer fixed over scientific notation for printed numbers
options(scipen = 999)

# Draw the tree with the custom node labels, leaves aligned at the bottom,
# blue-shaded boxes, grey shadows and node numbers shown
rpart.plot(
  tree_model,
  main          = "Decision Tree for Automobile Price",
  node.fun      = my_node_lab,
  nn            = TRUE,
  fallen.leaves = TRUE,
  box.palette   = "Blues",
  shadow.col    = "gray"
)

# Predict Price for the held-out validation rows
pred_tree <- predict(tree_model, newdata = valid)

# Validation metrics: root mean squared error, mean absolute error and R2,
# each comparing the predictions with the observed validation prices
RMSE_TREE <- RMSE(pred = pred_tree, obs = valid[["Price"]])
MAE_TREE  <- MAE(pred = pred_tree, obs = valid[["Price"]])
R2_TREE   <- R2(pred = pred_tree, obs = valid[["Price"]])

cat("Decision Tree:\n")
## Decision Tree:
cat("  RMSE:", RMSE_TREE, "\n")
##   RMSE: 1409.776
cat("  MAE :", MAE_TREE,  "\n")
##   MAE : 1024.642
cat("  R2  :", R2_TREE,   "\n\n")
##   R2  : 0.8540996
# ---------------------------------------------------------
# 8. Model 3 – Neural Network

# We construct a feed-forward neural network using `neuralnet`, visualize the architecture, and evaluate its predictive accuracy.
# ---------------------------------------------------------

# 8.1 Build formula (Price as target, all others as predictors)
features_nn <- setdiff(names(train), "Price")
f_nn <- as.formula(paste("Price ~", paste(features_nn, collapse = " + ")))

# Min-max scale the inputs and the target to [0, 1] before training.
# neuralnet() does not rescale its data, and the raw columns differ by orders
# of magnitude (0/1 indicators next to KM in the tens of thousands and Price in
# the thousands). NOTE(review): the NA R2 recorded for the unscaled run is
# consistent with the network collapsing to a constant prediction -- confirm.
# All ranges come from the training set only, so nothing about the validation
# rows leaks into the fit.
feat_min  <- vapply(train[features_nn], min, numeric(1))
feat_span <- vapply(train[features_nn], max, numeric(1)) - feat_min
feat_span[feat_span == 0] <- 1  # constant columns (e.g. Cyl) scale to 0, not NaN
price_min  <- min(train$Price)
price_span <- max(train$Price) - price_min

# Apply the training-set min/range to the feature columns of any data frame
scale_nn_inputs <- function(d) {
  as.data.frame(scale(d[features_nn], center = feat_min, scale = feat_span))
}

train_nn <- scale_nn_inputs(train)
train_nn$Price <- (train$Price - price_min) / price_span
valid_nn <- scale_nn_inputs(valid)

# 8.2 Train Neural Network (you can play with hidden = c(3,2))
set.seed(123)  # starting weights are random; fix them so the fit is repeatable
nn_model <- neuralnet(f_nn,
                      data = train_nn,
                      hidden = c(3, 2),
                      linear.output = TRUE,
                      stepmax = 1e6)  # upper bound on steps only; guards against
                                      # stopping before convergence

# Plot the network
old_par <- par(mar = c(1, 1, 3, 1))   # smaller margins so the plot uses space better

plot(
  nn_model,
  rep          = "best",      # best repetition
  information  = FALSE,       # hide info box
  col.in       = "orange",    # input nodes
  col.hidden   = "blue",      # hidden layer nodes
  col.out      = "green",     # output node
  cex          = 0.6,         # slightly smaller labels
  main         = "Neural Network Architecture for Automobile Price"
)

par(old_par)                  # restore the previous margins for later plots

# 8.3 Predictions
# compute() is namespaced because other attached packages (e.g. dplyr) export
# a function of the same name. The network outputs scaled prices, so they are
# mapped back to the original price scale before evaluation.
nn_pred_raw <- neuralnet::compute(nn_model, valid_nn)$net.result
pred_nn <- as.vector(nn_pred_raw) * price_span + price_min

# Validation metrics: root mean squared error, mean absolute error and R2,
# each comparing the predictions with the observed validation prices.
# NOTE(review): an R2 of NA, as recorded below, is what a squared-correlation
# R2 yields when the predictions have zero variance -- confirm that this is how
# R2() computes it here.
RMSE_NN <- RMSE(pred = pred_nn, obs = valid[["Price"]])
MAE_NN  <- MAE(pred = pred_nn, obs = valid[["Price"]])
R2_NN   <- R2(pred = pred_nn, obs = valid[["Price"]])

cat("Neural Network:\n")
## Neural Network:
cat("  RMSE:", RMSE_NN, "\n")
##   RMSE: 3686.845
cat("  MAE :", MAE_NN,  "\n")
##   MAE : 2563.663
cat("  R2  :", R2_NN,   "\n\n")
##   R2  : NA
# ---------------------------------------------------------
# 9. Compare Models

# Finally, we compare the three models using RMSE, MAE, and R² on the validation set.
# ---------------------------------------------------------

# One row per model, one column per validation metric
model_labels <- c("Linear Regression", "Decision Tree", "Neural Network")

results <- data.frame(
  Model = model_labels,
  RMSE  = c(RMSE_LM, RMSE_TREE, RMSE_NN),
  MAE   = c(MAE_LM, MAE_TREE, MAE_NN),
  R2    = c(R2_LM, R2_TREE, R2_NN)
)

print(results)
##               Model     RMSE       MAE        R2
## 1 Linear Regression 1167.974  908.2309 0.9001726
## 2     Decision Tree 1409.776 1024.6423 0.8540996
## 3    Neural Network 3686.845 2563.6625        NA