FINAL PROJECT STA 631 01
Portfolio
I am Venkanna Dora Arigela, currently pursuing a Master’s in Data Science and Analytics. With a solid foundation in computer science and engineering, coupled with my experience as a software engineer, I have developed a keen interest in data science. My master’s program has allowed me to work on a significant project involving car price prediction using regression analysis. This project included data pre-processing, exploratory data analysis, model selection, and evaluation, enhancing my proficiency in R programming and statistical modeling.
Looking ahead, I aim to become a data analyst, using my skills to provide actionable insights and support data-driven decision-making. I am passionate about utilizing statistical techniques and machine learning algorithms to solve business challenges. My goal is to contribute to an organization by transforming data into strategic assets. I am committed to continuous learning and professional development, eager to stay updated with the latest trends in data science, and ready to tackle new challenges in this dynamic field.
Predicting Used Car Prices Using Regression Analysis
Objective:
The objective of this project is to develop a predictive model that accurately forecasts the selling prices of used cars based on various features such as year of manufacture, kilometers driven, fuel type, seller type, transmission, ownership, mileage, engine capacity, and maximum power. By employing regression analysis techniques, this project aims to provide valuable insights into the factors that influence car prices and to create a reliable tool for predicting the market value of used cars.
Loading libraries
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## corrplot 0.92 loaded
## Warning: package 'caTools' was built under R version 4.3.3
Dataset loading
car_data <- read.csv('D:/STA 631/Car-price.csv')
# Printing the first five rows of the dataset
head(car_data,5)## name year selling_price km_driven fuel seller_type
## 1 Maruti Swift Dzire VDI 2014 450000 145500 Diesel Individual
## 2 Skoda Rapid 1.5 TDI Ambition 2014 370000 120000 Diesel Individual
## 3 Honda City 2017-2020 EXi 2006 158000 140000 Petrol Individual
## 4 Hyundai i20 Sportz Diesel 2010 225000 127000 Diesel Individual
## 5 Maruti Swift VXI BSIII 2007 130000 120000 Petrol Individual
## transmission owner mileage engine max_power seats
## 1 Manual First Owner 23.40 1248 74.00 5
## 2 Manual Second Owner 21.14 1498 103.52 5
## 3 Manual Third Owner 17.70 1497 78.00 5
## 4 Manual First Owner 23.00 1396 90.00 5
## 5 Manual First Owner 16.10 1298 88.20 5
Data types
## 'data.frame': 8128 obs. of 12 variables:
## $ name : chr "Maruti Swift Dzire VDI" "Skoda Rapid 1.5 TDI Ambition" "Honda City 2017-2020 EXi" "Hyundai i20 Sportz Diesel" ...
## $ year : int 2014 2014 2006 2010 2007 2017 2007 2001 2011 2013 ...
## $ selling_price: int 450000 370000 158000 225000 130000 440000 96000 45000 350000 200000 ...
## $ km_driven : int 145500 120000 140000 127000 120000 45000 175000 5000 90000 169000 ...
## $ fuel : chr "Diesel" "Diesel" "Petrol" "Diesel" ...
## $ seller_type : chr "Individual" "Individual" "Individual" "Individual" ...
## $ transmission : chr "Manual" "Manual" "Manual" "Manual" ...
## $ owner : chr "First Owner" "Second Owner" "Third Owner" "First Owner" ...
## $ mileage : num 23.4 21.1 17.7 23 16.1 ...
## $ engine : int 1248 1498 1497 1396 1298 1197 1061 796 1364 1399 ...
## $ max_power : num 74 103.5 78 90 88.2 ...
## $ seats : int 5 5 5 5 5 5 5 4 5 5 ...
Data summary
## name year selling_price km_driven
## Length:8128 Min. :1983 Min. : 29999 Min. : 1
## Class :character 1st Qu.:2011 1st Qu.: 254999 1st Qu.: 35000
## Mode :character Median :2015 Median : 450000 Median : 60000
## Mean :2014 Mean : 638272 Mean : 69820
## 3rd Qu.:2017 3rd Qu.: 675000 3rd Qu.: 98000
## Max. :2020 Max. :10000000 Max. :2360457
##
## fuel seller_type transmission owner
## Length:8128 Length:8128 Length:8128 Length:8128
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## mileage engine max_power seats
## Min. : 0.00 Min. : 624 Min. : 0.00 Min. : 2.000
## 1st Qu.:16.78 1st Qu.:1197 1st Qu.: 68.05 1st Qu.: 5.000
## Median :19.30 Median :1248 Median : 82.00 Median : 5.000
## Mean :19.42 Mean :1459 Mean : 91.52 Mean : 5.417
## 3rd Qu.:22.32 3rd Qu.:1582 3rd Qu.:102.00 3rd Qu.: 5.000
## Max. :42.00 Max. :3604 Max. :400.00 Max. :14.000
## NA's :221 NA's :221 NA's :216 NA's :221
Data Preprocessing
Exploratory Data Analysis (EDA)
# Plot distribution of selling price
selling_price_plot <- ggplot(car_data, aes(x = selling_price)) +
geom_histogram(binwidth = 300000, fill = "blue", color = "black") +
ggtitle("Distribution of Selling Price") +
theme_minimal()
# Plot distribution of km driven
km_driven_plot <- ggplot(car_data, aes(x = km_driven)) +
geom_histogram(binwidth = 50000, fill = "green", color = "black") +
ggtitle("Distribution of Kilometers Driven") +
theme_minimal()
# Plot distribution of fuel type
fuel_type_plot <- ggplot(car_data, aes(x = fuel)) +
geom_bar(fill = "orange", color = "black") +
ggtitle("Distribution of Fuel Type") +
theme_minimal()
# Plot distribution of seller type
seller_type_plot <- ggplot(car_data, aes(x = seller_type)) +
geom_bar(fill = "purple", color = "black") +
ggtitle("Distribution of Seller Type") +
theme_minimal()
grid.arrange(selling_price_plot, km_driven_plot, fuel_type_plot, seller_type_plot, nrow = 2, ncol = 2)
## Categorical column plots
# Bar chart for fuel type distribution
fuel_dist <- ggplot(car_data, aes(x = fuel)) +
geom_bar(fill = "#1f77b4", color = "black") +
ggtitle("Distribution of Fuel Type") +
xlab("Fuel Type") +
ylab("Count") +
theme_linedraw()
# Bar chart for seller type distribution
seller_type_dist <- ggplot(car_data, aes(x = seller_type)) +
geom_bar(fill = "#ff7f0e", color = "black") +
ggtitle("Distribution of Seller Type") +
xlab("Seller Type") +
ylab("Count") +
theme_linedraw()
# Box plot for selling price by fuel type
price_by_fuel <- ggplot(car_data, aes(x = fuel, y = selling_price, fill = fuel)) +
geom_boxplot() +
ggtitle("Selling Price by Fuel Type") +
xlab("Fuel Type") +
ylab("Selling Price") +
theme_linedraw() +
theme(legend.position = "none")
# Box plot for selling price by seller type
price_by_seller <- ggplot(car_data, aes(x = seller_type, y = selling_price, fill = seller_type)) +
geom_boxplot() +
ggtitle("Selling Price by Seller Type") +
xlab("Seller Type") +
ylab("Selling Price") +
theme_linedraw() +
theme(legend.position = "none")
# Arrange all plots in a grid
grid.arrange(fuel_dist, seller_type_dist, price_by_fuel, price_by_seller, nrow = 2, ncol = 2)Bivariate Analysis
# Scatter plot for selling price vs km driven
km_driven_vs_price <- ggplot(car_data, aes(x = km_driven, y = selling_price)) +
geom_point(color = "#1f77b4", alpha = 0.5) +
ggtitle("Selling Price vs Kilometers Driven") +
theme_minimal()
# Scatter plot for selling price vs engine size
engine_vs_price <- ggplot(car_data, aes(x = engine, y = selling_price)) +
geom_point(color = "#2ca02c", alpha = 0.5) +
ggtitle("Selling Price vs Engine Size") +
theme_minimal()
# Scatter plot for selling price vs mileage
mileage_vs_price <- ggplot(car_data, aes(x = mileage, y = selling_price)) +
geom_point(color = "#d62728", alpha = 0.5) +
ggtitle("Selling Price vs Mileage") +
theme_minimal()
# Scatter plot for selling price vs max power
max_power_vs_price <- ggplot(car_data, aes(x = max_power, y = selling_price)) +
geom_point(color = "#9467bd", alpha = 0.5) +
ggtitle("Selling Price vs Max Power") +
theme_minimal()
grid.arrange(km_driven_vs_price, engine_vs_price, mileage_vs_price, max_power_vs_price, nrow = 2, ncol = 2)## Warning: Removed 221 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 221 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 216 rows containing missing values or values outside the scale range
## (`geom_point()`).
Correlation Heatmap
numeric_vars <- car_data %>%
select(selling_price, km_driven, mileage, engine, max_power)
# Calculate correlation matrix
cor_matrix <- cor(numeric_vars, use = "complete.obs")
# Print correlation matrix
print(cor_matrix)## selling_price km_driven mileage engine max_power
## selling_price 1.0000000 -0.22215848 -0.1262799 0.4556818 0.74967378
## km_driven -0.2221585 1.00000000 -0.1729803 0.2060307 -0.03815852
## mileage -0.1262799 -0.17298035 1.0000000 -0.5764079 -0.37462089
## engine 0.4556818 0.20603073 -0.5764079 1.0000000 0.70397453
## max_power 0.7496738 -0.03815852 -0.3746209 0.7039745 1.00000000
corrplot(cor_matrix, method = "color", addCoef.col = "black", number.cex = 0.7, tl.col = "red", tl.srt = 45)Multivariate scatter plot
# Multivariate Scatter Plot: Selling Price vs Kilometers Driven by Fuel Type
multivariate_plot <- ggplot(car_data, aes(x = km_driven, y = selling_price, color = fuel)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_x_continuous(limits = c(0, 500000), breaks = seq(0, 500000, 100000)) +
scale_y_continuous(limits = c(0, 5000000), breaks = seq(0, 5000000, 1000000)) +
ggtitle("Multivariate Scatter Plot: Selling Price vs Kilometers Driven by Fuel Type") +
xlab("Kilometers Driven") +
ylab("Selling Price") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
guides(color = guide_legend(title = "Fuel Type"))
print(multivariate_plot)## Warning: Removed 120 rows containing missing values or values outside the scale range
## (`geom_point()`).
Data cleaning
# Handle missing values (simple approach: remove rows with any NA values)
car_data <- na.omit(car_data)
sapply(car_data, function(x) sum(is.na(x)))## name year selling_price km_driven fuel
## 0 0 0 0 0
## seller_type transmission owner mileage engine
## 0 0 0 0 0
## max_power seats
## 0 0
Data Splitting for Model Training
# Split the data into training and testing sets (80% train, 30% test)
set.seed(123)
split <- sample.split(car_data$selling_price, SplitRatio = 0.8)
train_data <- subset(car_data, split == TRUE)
test_data <- subset(car_data, split == FALSE)# Ensure that levels of factors in test data match those in train data
test_data$fuel <- factor(test_data$fuel, levels = levels(train_data$fuel))
test_data$seller_type <- factor(test_data$seller_type, levels = levels(train_data$seller_type))
test_data$transmission <- factor(test_data$transmission, levels = levels(train_data$transmission))
test_data$owner <- factor(test_data$owner, levels = levels(train_data$owner))# Simple linear regression using MLE
lm_model <- lm(selling_price ~ km_driven + fuel + seller_type + transmission + owner + mileage + engine + max_power + seats,
data = train_data)
# Summarize the model to see MLE estimates of parameters
summary(lm_model)##
## Call:
## lm(formula = selling_price ~ km_driven + fuel + seller_type +
## transmission + owner + mileage + engine + max_power + seats,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2628968 -200594 15220 167028 3972348
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.723e+05 1.120e+05 -3.324 0.000892 ***
## km_driven -1.489e+00 1.121e-01 -13.286 < 2e-16 ***
## fuelDiesel -7.674e+04 6.971e+04 -1.101 0.270998
## fuelLPG 1.382e+05 1.080e+05 1.280 0.200705
## fuelPetrol -1.076e+05 7.014e+04 -1.534 0.125003
## seller_typeIndividual -2.413e+05 1.845e+04 -13.082 < 2e-16 ***
## seller_typeTrustmark Dealer -3.036e+05 3.803e+04 -7.982 1.69e-15 ***
## transmissionManual -4.839e+05 2.198e+04 -22.014 < 2e-16 ***
## ownerFourth & Above Owner -1.444e+05 4.327e+04 -3.337 0.000852 ***
## ownerSecond Owner -1.106e+05 1.444e+04 -7.662 2.11e-14 ***
## ownerTest Drive Car 2.719e+06 2.335e+05 11.643 < 2e-16 ***
## ownerThird Owner -1.282e+05 2.486e+04 -5.157 2.58e-07 ***
## mileage 2.705e+04 2.182e+03 12.395 < 2e-16 ***
## engine 1.034e+02 2.682e+01 3.856 0.000117 ***
## max_power 1.373e+04 2.831e+02 48.503 < 2e-16 ***
## seats -9.397e+03 8.703e+03 -1.080 0.280294
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 465400 on 6399 degrees of freedom
## Multiple R-squared: 0.6789, Adjusted R-squared: 0.6781
## F-statistic: 901.8 on 15 and 6399 DF, p-value: < 2.2e-16
Selection of model
# Load necessary library
library(caret)
# Define training control for cross-validation
cv_control <- trainControl(method = "cv", number = 7) # 5-fold cross-validation
# Train the base model using all predictors
lm_base <- train(selling_price ~ km_driven + fuel + seller_type + transmission + owner + mileage + engine + max_power + seats,
data = train_data, method = "lm", trControl = cv_control)
# Train a reduced model using selected predictors based on domain knowledge
lm_reduced <- train(selling_price ~ km_driven + fuel + seller_type + mileage + max_power,
data = train_data, method = "lm", trControl = cv_control)
# Compare models using resampling
model_results <- resamples(list(base_model = lm_base, reduced_model = lm_reduced))
# Summary of the results
print(summary(model_results))##
## Call:
## summary.resamples(object = model_results)
##
## Models: base_model, reduced_model
## Number of resamples: 7
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## base_model 267804.4 275884.8 279576.9 283191.2 291623.1 299941.5 0
## reduced_model 287277.2 289397.0 303286.9 299011.3 306236.6 311247.5 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## base_model 441163.6 452036.5 464856.6 469919.8 491283.4 496778.3 0
## reduced_model 460356.2 479614.9 484365.3 494553.0 513590.9 530738.0 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## base_model 0.6516481 0.6580008 0.6732746 0.6718508 0.6781786 0.7056738 0
## reduced_model 0.5969086 0.6028698 0.6257644 0.6354998 0.6498693 0.7203473 0
Model fitting
# Fit the linear regression model (full model)
lm_base <- lm(selling_price ~ km_driven + fuel + seller_type + transmission + owner + mileage + engine + max_power + seats, data = train_data)
# Fit the reduced linear regression model
lm_reduced <- lm(selling_price ~ km_driven + fuel + seller_type + mileage + max_power, data = train_data)
# Summary of the models
summary(lm_base)##
## Call:
## lm(formula = selling_price ~ km_driven + fuel + seller_type +
## transmission + owner + mileage + engine + max_power + seats,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2628968 -200594 15220 167028 3972348
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.723e+05 1.120e+05 -3.324 0.000892 ***
## km_driven -1.489e+00 1.121e-01 -13.286 < 2e-16 ***
## fuelDiesel -7.674e+04 6.971e+04 -1.101 0.270998
## fuelLPG 1.382e+05 1.080e+05 1.280 0.200705
## fuelPetrol -1.076e+05 7.014e+04 -1.534 0.125003
## seller_typeIndividual -2.413e+05 1.845e+04 -13.082 < 2e-16 ***
## seller_typeTrustmark Dealer -3.036e+05 3.803e+04 -7.982 1.69e-15 ***
## transmissionManual -4.839e+05 2.198e+04 -22.014 < 2e-16 ***
## ownerFourth & Above Owner -1.444e+05 4.327e+04 -3.337 0.000852 ***
## ownerSecond Owner -1.106e+05 1.444e+04 -7.662 2.11e-14 ***
## ownerTest Drive Car 2.719e+06 2.335e+05 11.643 < 2e-16 ***
## ownerThird Owner -1.282e+05 2.486e+04 -5.157 2.58e-07 ***
## mileage 2.705e+04 2.182e+03 12.395 < 2e-16 ***
## engine 1.034e+02 2.682e+01 3.856 0.000117 ***
## max_power 1.373e+04 2.831e+02 48.503 < 2e-16 ***
## seats -9.397e+03 8.703e+03 -1.080 0.280294
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 465400 on 6399 degrees of freedom
## Multiple R-squared: 0.6789, Adjusted R-squared: 0.6781
## F-statistic: 901.8 on 15 and 6399 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = selling_price ~ km_driven + fuel + seller_type +
## mileage + max_power, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2446599 -216407 4643 161424 4536258
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.710e+05 9.269e+04 -9.397 < 2e-16 ***
## km_driven -1.917e+00 1.148e-01 -16.699 < 2e-16 ***
## fuelDiesel -1.194e+05 7.348e+04 -1.625 0.1042
## fuelLPG 1.183e+05 1.135e+05 1.042 0.2974
## fuelPetrol -1.225e+05 7.359e+04 -1.664 0.0961 .
## seller_typeIndividual -3.587e+05 1.880e+04 -19.084 < 2e-16 ***
## seller_typeTrustmark Dealer -2.717e+05 3.964e+04 -6.855 7.81e-12 ***
## mileage 2.621e+04 1.740e+03 15.057 < 2e-16 ***
## max_power 1.715e+04 2.115e+02 81.090 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 491200 on 6406 degrees of freedom
## Multiple R-squared: 0.6419, Adjusted R-squared: 0.6414
## F-statistic: 1435 on 8 and 6406 DF, p-value: < 2.2e-16
Residual analysis of Base and reduced models
# Residual analysis for the base model
resid_hist_base <- ggplot(data.frame(residuals = lm_base_residuals), aes(x = residuals)) +
geom_histogram(binwidth = 50000, fill = "black", alpha = 0.7) +
labs(title = "Histogram of Residuals (Base Model)", x = "Residuals", y = "Count")
resid_fitted_base <- ggplot(data.frame(fitted = lm_base_predictions, residuals = lm_base_residuals), aes(x = fitted, y = residuals)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Residuals vs. Fitted Values (Base Model)", x = "Fitted Values", y = "Residuals")
# Residual analysis for the reduced model
resid_hist_reduced <- ggplot(data.frame(residuals = lm_reduced_residuals), aes(x = residuals)) +
geom_histogram(binwidth = 50000, fill = "black", alpha = 0.7) +
labs(title = "Histogram of Residuals (Reduced Model)", x = "Residuals", y = "Count")
resid_fitted_reduced <- ggplot(data.frame(fitted = lm_reduced_predictions, residuals = lm_reduced_residuals), aes(x = fitted, y = residuals)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Residuals vs. Fitted Values (Reduced Model)", x = "Fitted Values", y = "Residuals")
grid.arrange(resid_hist_base, resid_fitted_base, resid_hist_reduced, resid_fitted_reduced, ncol = 2)## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
RMSE
base_rmse <- sqrt(mean((test_data$selling_price - lm_base_predictions)^2))
reduced_rmse <- sqrt(mean((test_data$selling_price - lm_reduced_predictions)^2))
print(paste("Base Model RMSE:", round(base_rmse, 2)))## [1] "Base Model RMSE: 468768.43"
## [1] "Reduced Model RMSE: 484363.78"
MAPE
base_mape <- mean(abs((test_data$selling_price - lm_base_predictions) / test_data$selling_price)) * 100
reduced_mape <- mean(abs((test_data$selling_price - lm_reduced_predictions) / test_data$selling_price)) * 100
print(paste("Base Model MAPE:", round(base_mape, 2)))## [1] "Base Model MAPE: 71.96"
## [1] "Reduced Model MAPE: 77.19"