FINAL PROJECT STA 631 01

Portfolio

I am Venkanna Dora Arigela, currently pursuing a Master’s in Data Science and Analytics. With a solid foundation in computer science and engineering, coupled with my experience as a software engineer, I have developed a keen interest in data science. My master’s program has allowed me to work on a significant project involving car price prediction using regression analysis. This project included data pre-processing, exploratory data analysis, model selection, and evaluation, enhancing my proficiency in R programming and statistical modeling.

Looking ahead, I aim to become a data analyst, using my skills to provide actionable insights and support data-driven decision-making. I am passionate about utilizing statistical techniques and machine learning algorithms to solve business challenges. My goal is to contribute to an organization by transforming data into strategic assets. I am committed to continuous learning and professional development, eager to stay updated with the latest trends in data science, and ready to tackle new challenges in this dynamic field.

Predicting Used Car Prices Using Regression Analysis

Objective:

The objective of this project is to develop a predictive model that accurately forecasts the selling prices of used cars based on various features such as year of manufacture, kilometers driven, fuel type, seller type, transmission, ownership, mileage, engine capacity, and maximum power. By employing regression analysis techniques, this project aims to provide valuable insights into the factors that influence car prices and to create a reliable tool for predicting the market value of used cars.

Loading libraries

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(corrplot)
## corrplot 0.92 loaded
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.3

Dataset loading

car_data <- read.csv('D:/STA 631/Car-price.csv')

# Printing the first five rows of the dataset
head(car_data,5)
##                           name year selling_price km_driven   fuel seller_type
## 1       Maruti Swift Dzire VDI 2014        450000    145500 Diesel  Individual
## 2 Skoda Rapid 1.5 TDI Ambition 2014        370000    120000 Diesel  Individual
## 3     Honda City 2017-2020 EXi 2006        158000    140000 Petrol  Individual
## 4    Hyundai i20 Sportz Diesel 2010        225000    127000 Diesel  Individual
## 5       Maruti Swift VXI BSIII 2007        130000    120000 Petrol  Individual
##   transmission        owner mileage engine max_power seats
## 1       Manual  First Owner   23.40   1248     74.00     5
## 2       Manual Second Owner   21.14   1498    103.52     5
## 3       Manual  Third Owner   17.70   1497     78.00     5
## 4       Manual  First Owner   23.00   1396     90.00     5
## 5       Manual  First Owner   16.10   1298     88.20     5

Data types

str(car_data)
## 'data.frame':    8128 obs. of  12 variables:
##  $ name         : chr  "Maruti Swift Dzire VDI" "Skoda Rapid 1.5 TDI Ambition" "Honda City 2017-2020 EXi" "Hyundai i20 Sportz Diesel" ...
##  $ year         : int  2014 2014 2006 2010 2007 2017 2007 2001 2011 2013 ...
##  $ selling_price: int  450000 370000 158000 225000 130000 440000 96000 45000 350000 200000 ...
##  $ km_driven    : int  145500 120000 140000 127000 120000 45000 175000 5000 90000 169000 ...
##  $ fuel         : chr  "Diesel" "Diesel" "Petrol" "Diesel" ...
##  $ seller_type  : chr  "Individual" "Individual" "Individual" "Individual" ...
##  $ transmission : chr  "Manual" "Manual" "Manual" "Manual" ...
##  $ owner        : chr  "First Owner" "Second Owner" "Third Owner" "First Owner" ...
##  $ mileage      : num  23.4 21.1 17.7 23 16.1 ...
##  $ engine       : int  1248 1498 1497 1396 1298 1197 1061 796 1364 1399 ...
##  $ max_power    : num  74 103.5 78 90 88.2 ...
##  $ seats        : int  5 5 5 5 5 5 5 4 5 5 ...

Data summary

summary(car_data)
##      name                year      selling_price        km_driven      
##  Length:8128        Min.   :1983   Min.   :   29999   Min.   :      1  
##  Class :character   1st Qu.:2011   1st Qu.:  254999   1st Qu.:  35000  
##  Mode  :character   Median :2015   Median :  450000   Median :  60000  
##                     Mean   :2014   Mean   :  638272   Mean   :  69820  
##                     3rd Qu.:2017   3rd Qu.:  675000   3rd Qu.:  98000  
##                     Max.   :2020   Max.   :10000000   Max.   :2360457  
##                                                                        
##      fuel           seller_type        transmission          owner          
##  Length:8128        Length:8128        Length:8128        Length:8128       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     mileage          engine       max_power          seats       
##  Min.   : 0.00   Min.   : 624   Min.   :  0.00   Min.   : 2.000  
##  1st Qu.:16.78   1st Qu.:1197   1st Qu.: 68.05   1st Qu.: 5.000  
##  Median :19.30   Median :1248   Median : 82.00   Median : 5.000  
##  Mean   :19.42   Mean   :1459   Mean   : 91.52   Mean   : 5.417  
##  3rd Qu.:22.32   3rd Qu.:1582   3rd Qu.:102.00   3rd Qu.: 5.000  
##  Max.   :42.00   Max.   :3604   Max.   :400.00   Max.   :14.000  
##  NA's   :221     NA's   :221    NA's   :216      NA's   :221

Data Preprocessing

Exploratory Data Analysis (EDA)

# Plot distribution of selling price
selling_price_plot <- ggplot(car_data, aes(x = selling_price)) + 
  geom_histogram(binwidth = 300000, fill = "blue", color = "black") +
  ggtitle("Distribution of Selling Price") +
  theme_minimal()

# Plot distribution of km driven
km_driven_plot <- ggplot(car_data, aes(x = km_driven)) + 
  geom_histogram(binwidth = 50000, fill = "green", color = "black") +
  ggtitle("Distribution of Kilometers Driven") +
  theme_minimal()

# Plot distribution of fuel type
fuel_type_plot <- ggplot(car_data, aes(x = fuel)) + 
  geom_bar(fill = "orange", color = "black") +
  ggtitle("Distribution of Fuel Type") +
  theme_minimal()

# Plot distribution of seller type
seller_type_plot <- ggplot(car_data, aes(x = seller_type)) + 
  geom_bar(fill = "purple", color = "black") +
  ggtitle("Distribution of Seller Type") +
  theme_minimal()

grid.arrange(selling_price_plot, km_driven_plot, fuel_type_plot, seller_type_plot, nrow = 2, ncol = 2)

## Categorical column plots

# Bar chart for fuel type distribution
fuel_dist <- ggplot(car_data, aes(x = fuel)) + 
  geom_bar(fill = "#1f77b4", color = "black") +
  ggtitle("Distribution of Fuel Type") +
  xlab("Fuel Type") +
  ylab("Count") +
  theme_linedraw()

# Bar chart for seller type distribution
seller_type_dist <- ggplot(car_data, aes(x = seller_type)) + 
  geom_bar(fill = "#ff7f0e", color = "black") +
  ggtitle("Distribution of Seller Type") +
  xlab("Seller Type") +
  ylab("Count") +
  theme_linedraw()

# Box plot for selling price by fuel type
price_by_fuel <- ggplot(car_data, aes(x = fuel, y = selling_price, fill = fuel)) + 
  geom_boxplot() +
  ggtitle("Selling Price by Fuel Type") +
  xlab("Fuel Type") +
  ylab("Selling Price") +
  theme_linedraw() +
  theme(legend.position = "none")

# Box plot for selling price by seller type
price_by_seller <- ggplot(car_data, aes(x = seller_type, y = selling_price, fill = seller_type)) + 
  geom_boxplot() +
  ggtitle("Selling Price by Seller Type") +
  xlab("Seller Type") +
  ylab("Selling Price") +
  theme_linedraw() +
  theme(legend.position = "none")

# Arrange all plots in a grid
grid.arrange(fuel_dist, seller_type_dist, price_by_fuel, price_by_seller, nrow = 2, ncol = 2)

Bivariate Analysis

# Scatter plot for selling price vs km driven
km_driven_vs_price <- ggplot(car_data, aes(x = km_driven, y = selling_price)) + 
  geom_point(color = "#1f77b4", alpha = 0.5) +  
  ggtitle("Selling Price vs Kilometers Driven") +
  theme_minimal()

# Scatter plot for selling price vs engine size
engine_vs_price <- ggplot(car_data, aes(x = engine, y = selling_price)) + 
  geom_point(color = "#2ca02c", alpha = 0.5) +  
  ggtitle("Selling Price vs Engine Size") +
  theme_minimal()

# Scatter plot for selling price vs mileage
mileage_vs_price <- ggplot(car_data, aes(x = mileage, y = selling_price)) + 
  geom_point(color = "#d62728", alpha = 0.5) +  
  ggtitle("Selling Price vs Mileage") +
  theme_minimal()

# Scatter plot for selling price vs max power
max_power_vs_price <- ggplot(car_data, aes(x = max_power, y = selling_price)) + 
  geom_point(color = "#9467bd", alpha = 0.5) +  
  ggtitle("Selling Price vs Max Power") +
  theme_minimal()

grid.arrange(km_driven_vs_price, engine_vs_price, mileage_vs_price, max_power_vs_price, nrow = 2, ncol = 2)
## Warning: Removed 221 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 221 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 216 rows containing missing values or values outside the scale range
## (`geom_point()`).

Correlation Heatmap

numeric_vars <- car_data %>%
  select(selling_price, km_driven, mileage, engine, max_power)

# Calculate correlation matrix
cor_matrix <- cor(numeric_vars, use = "complete.obs")

# Print correlation matrix
print(cor_matrix)
##               selling_price   km_driven    mileage     engine   max_power
## selling_price     1.0000000 -0.22215848 -0.1262799  0.4556818  0.74967378
## km_driven        -0.2221585  1.00000000 -0.1729803  0.2060307 -0.03815852
## mileage          -0.1262799 -0.17298035  1.0000000 -0.5764079 -0.37462089
## engine            0.4556818  0.20603073 -0.5764079  1.0000000  0.70397453
## max_power         0.7496738 -0.03815852 -0.3746209  0.7039745  1.00000000
corrplot(cor_matrix, method = "color", addCoef.col = "black", number.cex = 0.7, tl.col = "red", tl.srt = 45)

Multivariate scatter plot

# Multivariate Scatter Plot: Selling Price vs Kilometers Driven by Fuel Type
multivariate_plot <- ggplot(car_data, aes(x = km_driven, y = selling_price, color = fuel)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_x_continuous(limits = c(0, 500000), breaks = seq(0, 500000, 100000)) +
  scale_y_continuous(limits = c(0, 5000000), breaks = seq(0, 5000000, 1000000)) +
  ggtitle("Multivariate Scatter Plot: Selling Price vs Kilometers Driven by Fuel Type") +
  xlab("Kilometers Driven") +
  ylab("Selling Price") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(color = guide_legend(title = "Fuel Type"))


print(multivariate_plot)
## Warning: Removed 120 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Convert categorical variables to factors
car_data$fuel <- as.factor(car_data$fuel)
car_data$seller_type <- as.factor(car_data$seller_type)
car_data$transmission <- as.factor(car_data$transmission)
car_data$owner <- as.factor(car_data$owner)

Missing values

# Check for missing values
sum(is.na(car_data))
## [1] 879

Data cleaning

# Handle missing values (simple approach: remove rows with any NA values)
car_data <- na.omit(car_data)

sapply(car_data, function(x) sum(is.na(x)))
##          name          year selling_price     km_driven          fuel 
##             0             0             0             0             0 
##   seller_type  transmission         owner       mileage        engine 
##             0             0             0             0             0 
##     max_power         seats 
##             0             0

Data Splitting for Model Training

# Split the data into training and testing sets (80% train, 30% test)
set.seed(123)
split <- sample.split(car_data$selling_price, SplitRatio = 0.8)
train_data <- subset(car_data, split == TRUE)
test_data <- subset(car_data, split == FALSE)
# Ensure that levels of factors in test data match those in train data
test_data$fuel <- factor(test_data$fuel, levels = levels(train_data$fuel))
test_data$seller_type <- factor(test_data$seller_type, levels = levels(train_data$seller_type))
test_data$transmission <- factor(test_data$transmission, levels = levels(train_data$transmission))
test_data$owner <- factor(test_data$owner, levels = levels(train_data$owner))
# Simple linear regression using MLE
lm_model <- lm(selling_price ~ km_driven + fuel + seller_type + transmission + owner + mileage + engine + max_power + seats, 
               data = train_data)

# Summarize the model to see MLE estimates of parameters
summary(lm_model)
## 
## Call:
## lm(formula = selling_price ~ km_driven + fuel + seller_type + 
##     transmission + owner + mileage + engine + max_power + seats, 
##     data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2628968  -200594    15220   167028  3972348 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -3.723e+05  1.120e+05  -3.324 0.000892 ***
## km_driven                   -1.489e+00  1.121e-01 -13.286  < 2e-16 ***
## fuelDiesel                  -7.674e+04  6.971e+04  -1.101 0.270998    
## fuelLPG                      1.382e+05  1.080e+05   1.280 0.200705    
## fuelPetrol                  -1.076e+05  7.014e+04  -1.534 0.125003    
## seller_typeIndividual       -2.413e+05  1.845e+04 -13.082  < 2e-16 ***
## seller_typeTrustmark Dealer -3.036e+05  3.803e+04  -7.982 1.69e-15 ***
## transmissionManual          -4.839e+05  2.198e+04 -22.014  < 2e-16 ***
## ownerFourth & Above Owner   -1.444e+05  4.327e+04  -3.337 0.000852 ***
## ownerSecond Owner           -1.106e+05  1.444e+04  -7.662 2.11e-14 ***
## ownerTest Drive Car          2.719e+06  2.335e+05  11.643  < 2e-16 ***
## ownerThird Owner            -1.282e+05  2.486e+04  -5.157 2.58e-07 ***
## mileage                      2.705e+04  2.182e+03  12.395  < 2e-16 ***
## engine                       1.034e+02  2.682e+01   3.856 0.000117 ***
## max_power                    1.373e+04  2.831e+02  48.503  < 2e-16 ***
## seats                       -9.397e+03  8.703e+03  -1.080 0.280294    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 465400 on 6399 degrees of freedom
## Multiple R-squared:  0.6789, Adjusted R-squared:  0.6781 
## F-statistic: 901.8 on 15 and 6399 DF,  p-value: < 2.2e-16

Selection of model

# Load necessary library
library(caret)

# Define training control for cross-validation
cv_control <- trainControl(method = "cv", number = 7)  # 5-fold cross-validation

# Train the base model using all predictors
lm_base <- train(selling_price ~ km_driven + fuel + seller_type + transmission + owner + mileage + engine + max_power + seats,
                 data = train_data, method = "lm", trControl = cv_control)

# Train a reduced model using selected predictors based on domain knowledge
lm_reduced <- train(selling_price ~ km_driven + fuel + seller_type + mileage + max_power,
                    data = train_data, method = "lm", trControl = cv_control)

# Compare models using resampling
model_results <- resamples(list(base_model = lm_base, reduced_model = lm_reduced))

# Summary of the results
print(summary(model_results))
## 
## Call:
## summary.resamples(object = model_results)
## 
## Models: base_model, reduced_model 
## Number of resamples: 7 
## 
## MAE 
##                   Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## base_model    267804.4 275884.8 279576.9 283191.2 291623.1 299941.5    0
## reduced_model 287277.2 289397.0 303286.9 299011.3 306236.6 311247.5    0
## 
## RMSE 
##                   Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## base_model    441163.6 452036.5 464856.6 469919.8 491283.4 496778.3    0
## reduced_model 460356.2 479614.9 484365.3 494553.0 513590.9 530738.0    0
## 
## Rsquared 
##                    Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## base_model    0.6516481 0.6580008 0.6732746 0.6718508 0.6781786 0.7056738    0
## reduced_model 0.5969086 0.6028698 0.6257644 0.6354998 0.6498693 0.7203473    0

Model fitting

# Fit the linear regression model (full model)
lm_base <- lm(selling_price ~ km_driven + fuel + seller_type + transmission + owner + mileage + engine + max_power + seats, data = train_data)

# Fit the reduced linear regression model
lm_reduced <- lm(selling_price ~ km_driven + fuel + seller_type + mileage + max_power, data = train_data)

# Summary of the models
summary(lm_base)
## 
## Call:
## lm(formula = selling_price ~ km_driven + fuel + seller_type + 
##     transmission + owner + mileage + engine + max_power + seats, 
##     data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2628968  -200594    15220   167028  3972348 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -3.723e+05  1.120e+05  -3.324 0.000892 ***
## km_driven                   -1.489e+00  1.121e-01 -13.286  < 2e-16 ***
## fuelDiesel                  -7.674e+04  6.971e+04  -1.101 0.270998    
## fuelLPG                      1.382e+05  1.080e+05   1.280 0.200705    
## fuelPetrol                  -1.076e+05  7.014e+04  -1.534 0.125003    
## seller_typeIndividual       -2.413e+05  1.845e+04 -13.082  < 2e-16 ***
## seller_typeTrustmark Dealer -3.036e+05  3.803e+04  -7.982 1.69e-15 ***
## transmissionManual          -4.839e+05  2.198e+04 -22.014  < 2e-16 ***
## ownerFourth & Above Owner   -1.444e+05  4.327e+04  -3.337 0.000852 ***
## ownerSecond Owner           -1.106e+05  1.444e+04  -7.662 2.11e-14 ***
## ownerTest Drive Car          2.719e+06  2.335e+05  11.643  < 2e-16 ***
## ownerThird Owner            -1.282e+05  2.486e+04  -5.157 2.58e-07 ***
## mileage                      2.705e+04  2.182e+03  12.395  < 2e-16 ***
## engine                       1.034e+02  2.682e+01   3.856 0.000117 ***
## max_power                    1.373e+04  2.831e+02  48.503  < 2e-16 ***
## seats                       -9.397e+03  8.703e+03  -1.080 0.280294    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 465400 on 6399 degrees of freedom
## Multiple R-squared:  0.6789, Adjusted R-squared:  0.6781 
## F-statistic: 901.8 on 15 and 6399 DF,  p-value: < 2.2e-16
summary(lm_reduced)
## 
## Call:
## lm(formula = selling_price ~ km_driven + fuel + seller_type + 
##     mileage + max_power, data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2446599  -216407     4643   161424  4536258 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -8.710e+05  9.269e+04  -9.397  < 2e-16 ***
## km_driven                   -1.917e+00  1.148e-01 -16.699  < 2e-16 ***
## fuelDiesel                  -1.194e+05  7.348e+04  -1.625   0.1042    
## fuelLPG                      1.183e+05  1.135e+05   1.042   0.2974    
## fuelPetrol                  -1.225e+05  7.359e+04  -1.664   0.0961 .  
## seller_typeIndividual       -3.587e+05  1.880e+04 -19.084  < 2e-16 ***
## seller_typeTrustmark Dealer -2.717e+05  3.964e+04  -6.855 7.81e-12 ***
## mileage                      2.621e+04  1.740e+03  15.057  < 2e-16 ***
## max_power                    1.715e+04  2.115e+02  81.090  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 491200 on 6406 degrees of freedom
## Multiple R-squared:  0.6419, Adjusted R-squared:  0.6414 
## F-statistic:  1435 on 8 and 6406 DF,  p-value: < 2.2e-16
# Make predictions on the test data
lm_base_predictions <- predict(lm_base, newdata = test_data)
lm_reduced_predictions <- predict(lm_reduced, newdata = test_data)
# Calculate residuals
lm_base_residuals <- test_data$selling_price - lm_base_predictions
lm_reduced_residuals <- test_data$selling_price - lm_reduced_predictions

Residual analysis of Base and reduced models

# Residual analysis for the base model
resid_hist_base <- ggplot(data.frame(residuals = lm_base_residuals), aes(x = residuals)) + 
  geom_histogram(binwidth = 50000, fill = "black", alpha = 0.7) +
  labs(title = "Histogram of Residuals (Base Model)", x = "Residuals", y = "Count")

resid_fitted_base <- ggplot(data.frame(fitted = lm_base_predictions, residuals = lm_base_residuals), aes(x = fitted, y = residuals)) + 
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Residuals vs. Fitted Values (Base Model)", x = "Fitted Values", y = "Residuals")

# Residual analysis for the reduced model
resid_hist_reduced <- ggplot(data.frame(residuals = lm_reduced_residuals), aes(x = residuals)) + 
  geom_histogram(binwidth = 50000, fill = "black", alpha = 0.7) +
  labs(title = "Histogram of Residuals (Reduced Model)", x = "Residuals", y = "Count")

resid_fitted_reduced <- ggplot(data.frame(fitted = lm_reduced_predictions, residuals = lm_reduced_residuals), aes(x = fitted, y = residuals)) + 
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Residuals vs. Fitted Values (Reduced Model)", x = "Fitted Values", y = "Residuals")

grid.arrange(resid_hist_base, resid_fitted_base, resid_hist_reduced, resid_fitted_reduced, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

RMSE

base_rmse <- sqrt(mean((test_data$selling_price - lm_base_predictions)^2))
reduced_rmse <- sqrt(mean((test_data$selling_price - lm_reduced_predictions)^2))

print(paste("Base Model RMSE:", round(base_rmse, 2)))
## [1] "Base Model RMSE: 468768.43"
print(paste("Reduced Model RMSE:", round(reduced_rmse, 2)))
## [1] "Reduced Model RMSE: 484363.78"

MAPE

base_mape <- mean(abs((test_data$selling_price - lm_base_predictions) / test_data$selling_price)) * 100
reduced_mape <- mean(abs((test_data$selling_price - lm_reduced_predictions) / test_data$selling_price)) * 100


print(paste("Base Model MAPE:", round(base_mape, 2)))
## [1] "Base Model MAPE: 71.96"
print(paste("Reduced Model MAPE:", round(reduced_mape, 2)))
## [1] "Reduced Model MAPE: 77.19"