Introduction

Overview: This analysis predicts car prices with a linear regression model. The data is the Cars93 dataset, read below from a CSV file hosted on GitHub (the dataset is also available on Kaggle).

Load and Preprocess Data

# Read the Cars93 CSV straight from its GitHub raw URL
url <- "https://raw.githubusercontent.com/selva86/datasets/master/Cars93.csv"
car_data <- read_csv(url)

# Keep only the rows that have no missing value in any column
car_data <- na.omit(car_data)

# Columns holding categories rather than measurements; stored as factors
factor_columns <- c(
  "Manufacturer",
  "Model",
  "Type",
  "DriveTrain",
  "Cylinders",
  "Man.trans.avail"
)
car_data[factor_columns] <- lapply(car_data[factor_columns], as.factor)

Data Preparation

Split Data

# Partition on Price with p = 0.8; the fixed seed keeps the split reproducible
set.seed(123)
trainIndex <- createDataPartition(
  y = car_data$Price,
  p = 0.8,
  list = FALSE,
  times = 1
)

# Rows picked by the partition form the training set, all others the test set
car_train <- car_data[trainIndex, ]
car_test <- car_data[-trainIndex, ]

Model Fitting

Linear Regression

# Name of the column used as the single predictor; change it to refit the model
predictor <- "Horsepower" # Default value

# Build "Price ~ <predictor>" from the string and fit it on the training rows.
# The formula expression is kept inline so the printed Call shows it verbatim.
model_dynamic <- lm(
  formula = as.formula(paste("Price ~", predictor)),
  data = car_train
)

# Print coefficients, residual summary and fit statistics
summary(model_dynamic)

Call:
lm(formula = as.formula(paste("Price ~", predictor)), data = car_train)

Residuals:
     Min       1Q   Median       3Q      Max 
-17.4289  -2.8128  -0.5482   1.9688  31.2305 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -2.16652    2.26722  -0.956    0.343    
Horsepower   0.15132    0.01492  10.143 6.02e-15 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.425 on 64 degrees of freedom
Multiple R-squared:  0.6165,    Adjusted R-squared:  0.6105 
F-statistic: 102.9 on 1 and 64 DF,  p-value: 6.015e-15

Model Evaluation

Predictions and Performance

# Predict prices for the held-out test rows with the fitted model
predictions_dynamic <- predict(model_dynamic, newdata = car_test)

# Score the predictions against the observed test prices (RMSE, R-squared, MAE)
postResample(pred = predictions_dynamic, obs = car_test$Price)
     RMSE  Rsquared       MAE 
5.0976057 0.6552374 3.2900270 

Visualization

Actual vs Predicted

# Pair each observed test price with the model's prediction for that row
results_dynamic <- data.frame(
  Actual = car_test$Price,
  Predicted = predictions_dynamic
)

# Scatter of predicted against actual prices; the dashed red line is y = x,
# so points on it are predicted exactly
ggplot(data = results_dynamic, mapping = aes(x = Actual, y = Predicted)) +
  geom_point(color = "blue") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(
    title = "Actual vs. Predicted Car Prices",
    x = "Actual Prices",
    y = "Predicted Prices"
  ) +
  theme_minimal()

Linear Regression

Visualization