rm(list=ls())
car.data <- read.csv("C:/Users/aarav/Downloads/archive/car data.csv")
CAR.DETAILS.FROM.CAR.DEKHO <- read.csv("C:/Users/aarav/Downloads/archive/CAR DETAILS FROM CAR DEKHO.csv")
Car.details.v3 <- read.csv("C:/Users/aarav/Downloads/archive/Car details v3.csv")
car.details.v4 <- read.csv("C:/Users/aarav/Downloads/archive/car details v4.csv")Summer Project
Import Data
Clean Data
Check for and Replace NA Values
library(ggplot2)
df <- Car.details.v3
library(visdat)
vis_miss(df)for(i in colnames(df)){
df[,i][is.na(df[,i])] <- median(df[,i],
na.rm=TRUE)
}Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
sum(is.numeric(is.na(df)))[1] 0
Remove Characters from Important Variables
df[c("mileage", "max_power")] <- as.numeric(sapply(df[c("mileage", "max_power")], function(x) gsub("[^0-9]", "", x)))Remove Unnecessary Variables
library(tidyverse)── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ lubridate 1.9.3 ✔ tibble 3.2.1
✔ purrr 1.0.2 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df <- dplyr::select(.data = df, -name, -engine, - torque)Single Variable Plots
df_hist <- dplyr::select(.data = df, -selling_price, - km_driven)
library(reshape2)
Attaching package: 'reshape2'
The following object is masked from 'package:tidyr':
smiths
df_melted <- melt(df_hist)Using fuel, seller_type, transmission, owner as id variables
ggplot(data = df_melted,
aes(x = value)
) +
geom_histogram() +
facet_wrap(facets = . ~ variable,
scales = "free_x") `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 437 rows containing non-finite outside the scale range
(`stat_bin()`).
Correlation Tests
Creating a Numeric-Only Dataframe and Create a Correlation Matrix
df_corr <- dplyr::select(.data = df, -fuel, -seller_type, -transmission, -owner, -seats)
for(i in colnames(df_corr)){
df_corr[,i][is.na(df_corr[,i])] <- median(df_corr[,i],
na.rm=TRUE)
}
sum(is.numeric(is.na(df)))[1] 0
df_corr_matrix <- cor(df_corr)Visualize Correlation Matrix
library(corrplot)corrplot 0.92 loaded
corrplot(df_corr_matrix)Plotting Strong Correlations
ggplot(data = df_corr, mapping = aes(x = year,
y = km_driven)) +
geom_point(col="red", alpha = 0.1) +
labs(title = "Distribution of Km Driven by Year",
xlab = "Year",
ylab = "Km Driven") +
geom_smooth(method ="lm", col = 'black', linewidth = 2) +
ylim(0, 1000000)`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).
ggplot(data = df_corr, mapping = aes(x = year,
y = selling_price)) +
geom_point(col='blue', alpha = 0.1) +
labs(title = "Distribution of Selling Price by Year",
xlab = "Year",
ylab = "Selling Price") +
geom_smooth(method ="lm", col = 'black', linewidth = 2) +
ylim(0,5000000)`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 117 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 117 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 47 rows containing missing values or values outside the scale range
(`geom_smooth()`).
Descriptive Statistics
library(stargazer)
Please cite as:
Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(df, type = "text", title = "Cars Summary Stats")
Cars Summary Stats
=============================================================
Statistic N Mean St. Dev. Min Max
-------------------------------------------------------------
year 8,128 2,013.804 4.044 1,983 2,020
selling_price 8,128 638,271.800 806,253.400 29,999 10,000,000
km_driven 8,128 69,819.510 56,550.560 1 2,360,457
mileage 7,907 947.596 925.326 0 3,344
max_power 7,912 2,764.028 5,160.728 0 108,495
seats 8,128 5.405 0.949 2 14
-------------------------------------------------------------
Bivariate Regression Model
Bivariate Regression: Selling Price vs. Year Manufactured
?geom_smoothstarting httpd help server ... done
bi_var_model <- lm(formula = selling_price ~ year, data = df)
summary(bi_var_model)
Call:
lm(formula = selling_price ~ year, data = df)
Residuals:
Min 1Q Median 3Q Max
-899766 -314346 -161900 28101 9097891
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -165606347 4053843 -40.85 <2e-16 ***
year 82553 2013 41.01 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 733900 on 8126 degrees of freedom
Multiple R-squared: 0.1715, Adjusted R-squared: 0.1714
F-statistic: 1682 on 1 and 8126 DF, p-value: < 2.2e-16
Plotting Bivariate Model Residuals
plot(bi_var_model)Plotting Bivariate Regression
ggplot(data = df, mapping =aes(x = year,
y = selling_price)) +
geom_point(col = 'black', alpha = 0.1)+
geom_smooth(method = "lm", linewidth = 2) +
ylim(0,5000000)`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 117 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 117 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 47 rows containing missing values or values outside the scale range
(`geom_smooth()`).
?geom_smoothMultivariate Regression Model
Kitchen Sink Model
library(dplyr)
df_sample <- sample_n(tbl = df, size = 1000)
s_multi_var_model <- lm(formula = selling_price ~., data = df_sample)
stargazer(s_multi_var_model, type = 'text')
=======================================================
Dependent variable:
---------------------------
selling_price
-------------------------------------------------------
year 44,006.560***
(6,491.705)
km_driven -1.166**
(0.459)
fuelDiesel 296,595.700
(226,924.700)
fuelLPG 69,555.700
(334,118.700)
fuelPetrol -41,196.210
(227,194.200)
seller_typeIndividual -372,529.100***
(57,260.220)
seller_typeTrustmark Dealer -556,910.000***
(126,241.800)
transmissionManual -1,231,620.000***
(58,322.750)
ownerFourth Above Owner
(136,688.100)
ownerSecond Owner -18,363.470
(46,139.660)
ownerTest Drive Car 3,763,961.000***
(556,357.600)
ownerThird Owner 67,920.020
(78,612.490)
mileage 35.039*
(20.474)
max_power -0.843
(3.565)
seats 5,687.288
(20,070.800)
Constant -86,701,676.000***
(13,092,836.000)
-------------------------------------------------------
Observations 978
R2 0.550
Adjusted R2 0.543
Residual Std. Error 550,288.000 (df = 962)
F Statistic 78.247*** (df = 15; 962)
=======================================================
Note: *p<0.1; **p<0.05; ***p<0.01
Plotting Full Model Residuals
plot(s_multi_var_model)Warning: not plotting observations with leverage one:
710
Plotting Full Model Fit
df_sample<- na.omit(df_sample)
df_sample$price_prediction <- s_multi_var_model$fitted.values
ggplot(data = df_sample, mapping =aes(x = price_prediction,
y = selling_price)) +
geom_point()+
geom_smooth()`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Comparing Full and Bivariate Models
stargazer(bi_var_model, s_multi_var_model, type = 'text')
================================================================================
Dependent variable:
----------------------------------------------------
selling_price
(1) (2)
--------------------------------------------------------------------------------
year 82,552.530*** 44,006.560***
(2,013.024) (6,491.705)
km_driven -1.166**
(0.459)
fuelDiesel 296,595.700
(226,924.700)
fuelLPG 69,555.700
(334,118.700)
fuelPetrol -41,196.210
(227,194.200)
seller_typeIndividual -372,529.100***
(57,260.220)
seller_typeTrustmark Dealer -556,910.000***
(126,241.800)
transmissionManual -1,231,620.000***
(58,322.750)
ownerFourth Above Owner
(136,688.100)
ownerSecond Owner -18,363.470
(46,139.660)
ownerTest Drive Car 3,763,961.000***
(556,357.600)
ownerThird Owner 67,920.020
(78,612.490)
mileage 35.039*
(20.474)
max_power -0.843
(3.565)
seats 5,687.288
(20,070.800)
Constant -165,606,347.000*** -86,701,676.000***
(4,053,843.000) (13,092,836.000)
--------------------------------------------------------------------------------
Observations 8,128 978
R2 0.171 0.550
Adjusted R2 0.171 0.543
Residual Std. Error 733,925.300 (df = 8126) 550,288.000 (df = 962)
F Statistic 1,681.756*** (df = 1; 8126) 78.247*** (df = 15; 962)
================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
Refining Model
library(MASS)
Attaching package: 'MASS'
The following object is masked from 'package:dplyr':
select
stepAIC(object = s_multi_var_model, direction = 'backward')Start: AIC=25870.66
selling_price ~ year + km_driven + fuel + seller_type + transmission +
owner + mileage + max_power + seats
Df Sum of Sq RSS AIC
- max_power 1 1.6939e+10 2.9133e+14 25869
- seats 1 2.4314e+10 2.9133e+14 25869
<none> 2.9131e+14 25871
- mileage 1 8.8695e+11 2.9220e+14 25872
- km_driven 1 1.9544e+12 2.9326e+14 25875
- owner 4 1.4256e+13 3.0557e+14 25909
- seller_type 2 1.4421e+13 3.0573e+14 25914
- year 1 1.3915e+13 3.0523e+14 25914
- fuel 3 2.0371e+13 3.1168e+14 25931
- transmission 1 1.3504e+14 4.2635e+14 26241
Step: AIC=25868.72
selling_price ~ year + km_driven + fuel + seller_type + transmission +
owner + mileage + seats
Df Sum of Sq RSS AIC
- seats 1 2.7784e+10 2.9135e+14 25867
<none> 2.9133e+14 25869
- mileage 1 8.7505e+11 2.9220e+14 25870
- km_driven 1 1.9657e+12 2.9329e+14 25873
- owner 4 1.4278e+13 3.0560e+14 25908
- seller_type 2 1.4405e+13 3.0573e+14 25912
- year 1 1.4203e+13 3.0553e+14 25913
- fuel 3 2.0427e+13 3.1175e+14 25929
- transmission 1 1.3521e+14 4.2654e+14 26240
Step: AIC=25866.81
selling_price ~ year + km_driven + fuel + seller_type + transmission +
owner + mileage
Df Sum of Sq RSS AIC
<none> 2.9135e+14 25867
- mileage 1 8.7021e+11 2.9222e+14 25868
- km_driven 1 1.9568e+12 2.9331e+14 25871
- owner 4 1.4285e+13 3.0564e+14 25906
- seller_type 2 1.4397e+13 3.0575e+14 25910
- year 1 1.4553e+13 3.0591e+14 25913
- fuel 3 2.2389e+13 3.1374e+14 25933
- transmission 1 1.3550e+14 4.2686e+14 26238
Call:
lm(formula = selling_price ~ year + km_driven + fuel + seller_type +
transmission + owner + mileage, data = df_sample)
Coefficients:
(Intercept) year
-8.658e+07 4.396e+04
km_driven fuelDiesel
-1.139e+00 3.008e+05
fuelLPG fuelPetrol
6.953e+04 -4.055e+04
seller_typeIndividual seller_typeTrustmark Dealer
-3.707e+05 -5.553e+05
transmissionManual ownerFourth & Above Owner
-1.233e+06 2.462e+04
ownerSecond Owner ownerTest Drive Car
-1.781e+04 3.753e+06
ownerThird Owner mileage
6.887e+04 3.463e+01
final_model <- lm(formula = selling_price ~ owner + seller_type + as.factor(year) + seats + transmission + fuel, data = df_sample)Plotting Refined Model Residuals
plot(final_model)Warning: not plotting observations with leverage one:
637, 649, 710, 742
Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
Plotting Refined Model
df_sample$price_prediction <- final_model$fitted.values
summaryfunction (object, ...)
UseMethod("summary")
<bytecode: 0x0000020443db2ef0>
<environment: namespace:base>
ggplot(data = df_sample, mapping =aes(x = price_prediction,
y = selling_price)) +
geom_point()+
geom_smooth()`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Comparing Refined Model, Full Model, and Bivariate Model
stargazer(s_multi_var_model, final_model, bi_var_model, type = 'text')
=========================================================================================================
Dependent variable:
-----------------------------------------------------------------------------
selling_price
(1) (2) (3)
---------------------------------------------------------------------------------------------------------
year 44,006.560*** 82,552.530***
(6,491.705) (2,013.024)
km_driven -1.166**
(0.459)
fuelDiesel 296,595.700 252,788.500
(226,924.700) (219,959.900)
fuelLPG 69,555.700 32,723.790
(334,118.700) (325,144.400)
fuelPetrol -41,196.210 -95,878.360
(227,194.200) (219,786.400)
seller_typeIndividual -372,529.100*** -386,144.800***
(57,260.220) (55,498.880)
seller_typeTrustmark Dealer -556,910.000*** -622,594.600***
(126,241.800) (124,548.500)
as.factor(year)1998 -54,034.750
(614,468.800)
as.factor(year)1999 -67,264.460
(490,706.400)
as.factor(year)2000 23,427.510
(629,284.400)
as.factor(year)2001 -66,852.070
(615,118.200)
as.factor(year)2002 -24,572.490
(629,284.400)
as.factor(year)2003 -160,087.000
(406,443.300)
as.factor(year)2004 -50,904.250
(486,066.200)
as.factor(year)2005 -264,154.700
(341,042.000)
as.factor(year)2006 -85,117.580
(339,214.700)
as.factor(year)2007 -222,987.100
(325,924.200)
as.factor(year)2008 -68,366.040
(325,363.600)
as.factor(year)2009 -86,915.210
(322,927.400)
as.factor(year)2010 -112,889.700
(317,820.000)
as.factor(year)2011 -110,899.700
(314,083.300)
as.factor(year)2012 -68,992.440
(315,025.500)
as.factor(year)2013 2,781.216
(314,682.000)
as.factor(year)2014 87,656.820
(314,819.800)
as.factor(year)2015 155,592.600
(315,066.400)
as.factor(year)2016 202,072.000
(314,511.500)
as.factor(year)2017 242,434.600
(313,875.300)
as.factor(year)2018 292,068.700
(315,671.300)
as.factor(year)2019 937,492.700***
(317,235.600)
as.factor(year)2020 377,415.900
(369,732.200)
transmissionManual -1,231,620.000*** -1,116,310.000***
(58,322.750) (58,043.060)
ownerFourth Above Owner 24,614.720 -27,279.580
(136,688.100) (141,131.600)
ownerSecond Owner -18,363.470 12,676.200
(46,139.660) (45,701.770)
ownerTest Drive Car 3,763,961.000*** 3,371,201.000***
(556,357.600) (536,923.600)
ownerThird Owner 67,920.020 48,597.860
(78,612.490) (77,423.480)
mileage 35.039*
(20.474)
max_power -0.843
(3.565)
seats 5,687.288 493.519
(20,070.800) (19,011.820)
Constant -86,701,676.000*** 1,707,717.000*** -165,606,347.000***
(13,092,836.000) (398,049.500) (4,053,843.000)
---------------------------------------------------------------------------------------------------------
Observations 978 978 8,128
R2 0.550 0.589 0.171
Adjusted R2 0.543 0.574 0.171
Residual Std. Error 550,288.000 (df = 962) 531,236.200 (df = 943) 733,925.300 (df = 8126)
F Statistic 78.247*** (df = 15; 962) 39.666*** (df = 34; 943) 1,681.756*** (df = 1; 8126)
=========================================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
Insights
Car Price in Relation to Year Manufactured
On average, for every year newer a car is, it cost 43,000 additional dollars.
For this reason, buying old/used cars is much more money efficient than buying new cars
Car Price in Relation to Fuel Type
Compared to cars which use CNG fuel, diesel cars sell for $296,000 higher on average, and petrol vehicles sell for $16,000 higher.
This is likely due to diesel vehicles being large semi-trucks and buses, while petrol and CNG vehicles are cars
Car Price in Relation to Transmission Type, Mileage, Max_Power, and Seats
Compared to cars with automatic transmission, cars with manual transmission sell for $1,256,237 lower
For every additional mile of mileage a car has, its price increases by $14
For every additional unit of horsepower a car has, its price increases by $9
For every additional seat a car has, its price increases by $37,000
All of these car options can together vastly increase, or decrease the price of your car, so being conscious of these aspects when buying a car can increase car affordability depending on your circumstances
Car Price in Relation to Owner
Compared to the average price the first owner of a car payed, the second owner of a car payed $15,000 less, and the third owner payed $74,000 more
Compared to first owners of cars, second owners of cars are likely buying cars for cheaper, because they are owning lower end cars, where the value of the vehicle is assessed based only its condition and its original value.
Third car owners are likely buying cars for a higher price, because third car owners are likely those owning more expensive cars, where the value of the vehicle is assessed by its, condition, its original value, and its exclusivity.
Alternatively, third car owners may be a skewed metric, because of one or two third car owners who bought cars with a price to a very high extreme