Introduction
Overview: This analysis predicts car prices using a linear regression model. The dataset is Cars93; the copy used here is read from a CSV file hosted on GitHub (see the URL in the loading code below).
Load and Preprocess Data
# Read the Cars93 CSV from its hosted location
url <- "https://raw.githubusercontent.com/selva86/datasets/master/Cars93.csv"
car_data <- read_csv(file = url)
# Keep complete cases only: na.omit() drops every row with an NA in any column
car_data <- na.omit(car_data)
# Categorical columns, re-encoded as factors in a single pass
factor_cols <- c(
  "Manufacturer", "Model", "Type",
  "DriveTrain", "Cylinders", "Man.trans.avail"
)
car_data[factor_cols] <- lapply(car_data[factor_cols], as.factor)
Data Preparation
Split Data
# Partition the rows 80/20 into training and test sets.
# The seed is fixed so the same partition is drawn on every run.
set.seed(123)
trainIndex <- createDataPartition(
  y = car_data$Price,
  times = 1,
  p = 0.8,
  list = FALSE
)
# Rows selected by the partition train the model; the remainder are held out
car_train <- car_data[trainIndex, ]
car_test <- car_data[-trainIndex, ]
Model Fitting
Linear Regression
# Name of the column used as the single predictor (default: "Horsepower")
predictor <- "Horsepower"
# Build the formula "Price ~ <predictor>" from the string and fit on the
# training set. The formula expression is kept inline in the lm() call because
# lm() records its call and summary() prints it verbatim.
model_dynamic <- lm(
  formula = as.formula(paste("Price ~", predictor)),
  data = car_train
)
# Print coefficients, residual summary, and fit statistics
summary(model_dynamic)
Call:
lm(formula = as.formula(paste("Price ~", predictor)), data = car_train)
Residuals:
Min 1Q Median 3Q Max
-17.4289 -2.8128 -0.5482 1.9688 31.2305
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.16652 2.26722 -0.956 0.343
Horsepower 0.15132 0.01492 10.143 6.02e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6.425 on 64 degrees of freedom
Multiple R-squared: 0.6165, Adjusted R-squared: 0.6105
F-statistic: 102.9 on 1 and 64 DF, p-value: 6.015e-15
Model Evaluation
Predictions and Performance
# Predict prices for the held-out rows with the fitted model
predictions_dynamic <- predict(model_dynamic, newdata = car_test)
# Compare predictions with observed prices (prints RMSE, R-squared, and MAE)
postResample(pred = predictions_dynamic, obs = car_test$Price)
RMSE Rsquared MAE
5.0976057 0.6552374 3.2900270
Visualization
Actual vs Predicted
# Pair each observed test-set price with the model's prediction for it
results_dynamic <- data.frame(
  Actual = car_test$Price,
  Predicted = predictions_dynamic
)
# Scatter of predicted against actual prices; the dashed red line is y = x,
# so points on it are exact predictions
ggplot(data = results_dynamic, mapping = aes(x = Actual, y = Predicted)) +
  geom_point(color = "blue") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(
    title = "Actual vs. Predicted Car Prices",
    x = "Actual Prices",
    y = "Predicted Prices"
  ) +
  theme_minimal()