Summer Project

Import Data

rm(list=ls())

car.data <- read.csv("C:/Users/aarav/Downloads/archive/car data.csv")
CAR.DETAILS.FROM.CAR.DEKHO <- read.csv("C:/Users/aarav/Downloads/archive/CAR DETAILS FROM CAR DEKHO.csv")
Car.details.v3 <- read.csv("C:/Users/aarav/Downloads/archive/Car details v3.csv")
car.details.v4 <- read.csv("C:/Users/aarav/Downloads/archive/car details v4.csv")

Clean Data

Check for and Replace NA Values

library(ggplot2)

df <- Car.details.v3

library(visdat)
vis_miss(df)

for(i in colnames(df)){
  df[,i][is.na(df[,i])] <- median(df[,i], 
  na.rm=TRUE)
}
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
argument is not numeric or logical: returning NA
sum(is.numeric(is.na(df)))
[1] 0

Remove Characters from Important Variables

df[c("mileage", "max_power")] <- as.numeric(sapply(df[c("mileage", "max_power")], function(x) gsub("[^0-9]", "", x)))

Remove Unnecessary Variables

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df <- dplyr::select(.data = df, -name, -engine, - torque)

Single Variable Plots

df_hist <- dplyr::select(.data = df, -selling_price, - km_driven)

library(reshape2)

Attaching package: 'reshape2'
The following object is masked from 'package:tidyr':

    smiths
df_melted <- melt(df_hist)
Using fuel, seller_type, transmission, owner as id variables
ggplot(data = df_melted, 
       aes(x = value)
       ) + 
  geom_histogram() + 
  facet_wrap(facets = . ~ variable,
             scales = "free_x") 
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 437 rows containing non-finite outside the scale range
(`stat_bin()`).

Correlation Tests

Creating a Numeric-Only Dataframe and Create a Correlation Matrix

df_corr <- dplyr::select(.data = df, -fuel, -seller_type, -transmission, -owner, -seats)

for(i in colnames(df_corr)){
  df_corr[,i][is.na(df_corr[,i])] <- median(df_corr[,i], 
  na.rm=TRUE)
}

sum(is.numeric(is.na(df)))
[1] 0
df_corr_matrix <- cor(df_corr)

Visualize Correlation Matrix

library(corrplot)
corrplot 0.92 loaded
corrplot(df_corr_matrix)

Plotting Strong Correlations

ggplot(data = df_corr, mapping = aes(x = year,
                                     y = km_driven)) +
  geom_point(col="red", alpha = 0.1) +
  labs(title = "Distribution of Km Driven by Year",
       xlab = "Year",
       ylab = "Km Driven") +
  geom_smooth(method ="lm", col = 'black', linewidth = 2) +
  ylim(0, 1000000)
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(data = df_corr, mapping = aes(x = year,
                                     y = selling_price)) +
  geom_point(col='blue', alpha = 0.1) +
  labs(title = "Distribution of Selling Price by Year",
       xlab = "Year",
       ylab = "Selling Price") +
  geom_smooth(method ="lm", col = 'black', linewidth = 2) +
  ylim(0,5000000)
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 117 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 117 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 47 rows containing missing values or values outside the scale range
(`geom_smooth()`).

Descriptive Statistics

library(stargazer)

Please cite as: 
 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
stargazer(df, type = "text", title = "Cars Summary Stats")

Cars Summary Stats
=============================================================
Statistic       N      Mean      St. Dev.    Min      Max    
-------------------------------------------------------------
year          8,128  2,013.804     4.044    1,983    2,020   
selling_price 8,128 638,271.800 806,253.400 29,999 10,000,000
km_driven     8,128 69,819.510  56,550.560    1    2,360,457 
mileage       7,907   947.596     925.326     0      3,344   
max_power     7,912  2,764.028   5,160.728    0     108,495  
seats         8,128    5.405       0.949      2        14    
-------------------------------------------------------------

Bivariate Regression Model

Bivariate Regression: Selling Price vs. Year Manufactured

?geom_smooth
starting httpd help server ... done
bi_var_model <- lm(formula = selling_price ~ year, data = df)
summary(bi_var_model)

Call:
lm(formula = selling_price ~ year, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-899766 -314346 -161900   28101 9097891 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -165606347    4053843  -40.85   <2e-16 ***
year             82553       2013   41.01   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 733900 on 8126 degrees of freedom
Multiple R-squared:  0.1715,    Adjusted R-squared:  0.1714 
F-statistic:  1682 on 1 and 8126 DF,  p-value: < 2.2e-16

Plotting Bivariate Model Residuals

plot(bi_var_model)

Plotting Bivariate Regression

ggplot(data = df, mapping =aes(x = year, 
                               y = selling_price)) +
  geom_point(col = 'black', alpha = 0.1)+
  geom_smooth(method = "lm", linewidth = 2) +
  ylim(0,5000000)
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 117 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 117 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 47 rows containing missing values or values outside the scale range
(`geom_smooth()`).

?geom_smooth

Multivariate Regression Model

Kitchen Sink Model

library(dplyr)

df_sample <- sample_n(tbl = df, size = 1000)

s_multi_var_model <- lm(formula = selling_price ~., data = df_sample)

stargazer(s_multi_var_model, type = 'text')

=======================================================
                                Dependent variable:    
                            ---------------------------
                                   selling_price       
-------------------------------------------------------
year                               44,006.560***       
                                    (6,491.705)        
                                                       
km_driven                            -1.166**          
                                      (0.459)          
                                                       
fuelDiesel                          296,595.700        
                                   (226,924.700)       
                                                       
fuelLPG                             69,555.700         
                                   (334,118.700)       
                                                       
fuelPetrol                          -41,196.210        
                                   (227,194.200)       
                                                       
seller_typeIndividual             -372,529.100***      
                                   (57,260.220)        
                                                       
seller_typeTrustmark Dealer       -556,910.000***      
                                   (126,241.800)       
                                                       
transmissionManual               -1,231,620.000***     
                                   (58,322.750)        
                                                       
ownerFourth                         Above Owner        
                                   (136,688.100)       
                                                       
ownerSecond Owner                   -18,363.470        
                                   (46,139.660)        
                                                       
ownerTest Drive Car              3,763,961.000***      
                                   (556,357.600)       
                                                       
ownerThird Owner                    67,920.020         
                                   (78,612.490)        
                                                       
mileage                               35.039*          
                                     (20.474)          
                                                       
max_power                             -0.843           
                                      (3.565)          
                                                       
seats                                5,687.288         
                                   (20,070.800)        
                                                       
Constant                        -86,701,676.000***     
                                 (13,092,836.000)      
                                                       
-------------------------------------------------------
Observations                            978            
R2                                     0.550           
Adjusted R2                            0.543           
Residual Std. Error           550,288.000 (df = 962)   
F Statistic                  78.247*** (df = 15; 962)  
=======================================================
Note:                       *p<0.1; **p<0.05; ***p<0.01

Plotting Full Model Residuals

plot(s_multi_var_model)
Warning: not plotting observations with leverage one:
  710

Plotting Full Model Fit

df_sample<- na.omit(df_sample)

df_sample$price_prediction <- s_multi_var_model$fitted.values

ggplot(data = df_sample, mapping =aes(x = price_prediction, 
                               y = selling_price)) +
  geom_point()+
  geom_smooth()
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Comparing Full and Bivariate Models

stargazer(bi_var_model, s_multi_var_model, type = 'text')

================================================================================
                                            Dependent variable:                 
                            ----------------------------------------------------
                                               selling_price                    
                                        (1)                       (2)           
--------------------------------------------------------------------------------
year                               82,552.530***             44,006.560***      
                                    (2,013.024)               (6,491.705)       
                                                                                
km_driven                                                       -1.166**        
                                                                (0.459)         
                                                                                
fuelDiesel                                                    296,595.700       
                                                             (226,924.700)      
                                                                                
fuelLPG                                                        69,555.700       
                                                             (334,118.700)      
                                                                                
fuelPetrol                                                    -41,196.210       
                                                             (227,194.200)      
                                                                                
seller_typeIndividual                                       -372,529.100***     
                                                              (57,260.220)      
                                                                                
seller_typeTrustmark Dealer                                 -556,910.000***     
                                                             (126,241.800)      
                                                                                
transmissionManual                                         -1,231,620.000***    
                                                              (58,322.750)      
                                                                                
ownerFourth                         Above Owner                                 
                                                             (136,688.100)      
                                                                                
ownerSecond Owner                                             -18,363.470       
                                                              (46,139.660)      
                                                                                
ownerTest Drive Car                                         3,763,961.000***    
                                                             (556,357.600)      
                                                                                
ownerThird Owner                                               67,920.020       
                                                              (78,612.490)      
                                                                                
mileage                                                         35.039*         
                                                                (20.474)        
                                                                                
max_power                                                        -0.843         
                                                                (3.565)         
                                                                                
seats                                                          5,687.288        
                                                              (20,070.800)      
                                                                                
Constant                        -165,606,347.000***        -86,701,676.000***   
                                  (4,053,843.000)           (13,092,836.000)    
                                                                                
--------------------------------------------------------------------------------
Observations                           8,128                      978           
R2                                     0.171                     0.550          
Adjusted R2                            0.171                     0.543          
Residual Std. Error           733,925.300 (df = 8126)    550,288.000 (df = 962) 
F Statistic                 1,681.756*** (df = 1; 8126) 78.247*** (df = 15; 962)
================================================================================
Note:                                                *p<0.1; **p<0.05; ***p<0.01

Refining Model

library(MASS)

Attaching package: 'MASS'
The following object is masked from 'package:dplyr':

    select
stepAIC(object = s_multi_var_model, direction = 'backward')
Start:  AIC=25870.66
selling_price ~ year + km_driven + fuel + seller_type + transmission + 
    owner + mileage + max_power + seats

               Df  Sum of Sq        RSS   AIC
- max_power     1 1.6939e+10 2.9133e+14 25869
- seats         1 2.4314e+10 2.9133e+14 25869
<none>                       2.9131e+14 25871
- mileage       1 8.8695e+11 2.9220e+14 25872
- km_driven     1 1.9544e+12 2.9326e+14 25875
- owner         4 1.4256e+13 3.0557e+14 25909
- seller_type   2 1.4421e+13 3.0573e+14 25914
- year          1 1.3915e+13 3.0523e+14 25914
- fuel          3 2.0371e+13 3.1168e+14 25931
- transmission  1 1.3504e+14 4.2635e+14 26241

Step:  AIC=25868.72
selling_price ~ year + km_driven + fuel + seller_type + transmission + 
    owner + mileage + seats

               Df  Sum of Sq        RSS   AIC
- seats         1 2.7784e+10 2.9135e+14 25867
<none>                       2.9133e+14 25869
- mileage       1 8.7505e+11 2.9220e+14 25870
- km_driven     1 1.9657e+12 2.9329e+14 25873
- owner         4 1.4278e+13 3.0560e+14 25908
- seller_type   2 1.4405e+13 3.0573e+14 25912
- year          1 1.4203e+13 3.0553e+14 25913
- fuel          3 2.0427e+13 3.1175e+14 25929
- transmission  1 1.3521e+14 4.2654e+14 26240

Step:  AIC=25866.81
selling_price ~ year + km_driven + fuel + seller_type + transmission + 
    owner + mileage

               Df  Sum of Sq        RSS   AIC
<none>                       2.9135e+14 25867
- mileage       1 8.7021e+11 2.9222e+14 25868
- km_driven     1 1.9568e+12 2.9331e+14 25871
- owner         4 1.4285e+13 3.0564e+14 25906
- seller_type   2 1.4397e+13 3.0575e+14 25910
- year          1 1.4553e+13 3.0591e+14 25913
- fuel          3 2.2389e+13 3.1374e+14 25933
- transmission  1 1.3550e+14 4.2686e+14 26238

Call:
lm(formula = selling_price ~ year + km_driven + fuel + seller_type + 
    transmission + owner + mileage, data = df_sample)

Coefficients:
                (Intercept)                         year  
                 -8.658e+07                    4.396e+04  
                  km_driven                   fuelDiesel  
                 -1.139e+00                    3.008e+05  
                    fuelLPG                   fuelPetrol  
                  6.953e+04                   -4.055e+04  
      seller_typeIndividual  seller_typeTrustmark Dealer  
                 -3.707e+05                   -5.553e+05  
         transmissionManual    ownerFourth & Above Owner  
                 -1.233e+06                    2.462e+04  
          ownerSecond Owner          ownerTest Drive Car  
                 -1.781e+04                    3.753e+06  
           ownerThird Owner                      mileage  
                  6.887e+04                    3.463e+01  
final_model <- lm(formula = selling_price ~ owner +  seller_type + as.factor(year) + seats + transmission + fuel, data = df_sample)

Plotting Refined Model Residuals

plot(final_model)
Warning: not plotting observations with leverage one:
  637, 649, 710, 742

Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

Plotting Refined Model

df_sample$price_prediction <- final_model$fitted.values

summary
function (object, ...) 
UseMethod("summary")
<bytecode: 0x0000020443db2ef0>
<environment: namespace:base>
ggplot(data = df_sample, mapping =aes(x = price_prediction, 
                               y = selling_price)) +
  geom_point()+
  geom_smooth()
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Comparing Refined Model, Full Model, and Bivariate Model

stargazer(s_multi_var_model, final_model, bi_var_model, type = 'text')

=========================================================================================================
                                                         Dependent variable:                             
                            -----------------------------------------------------------------------------
                                                            selling_price                                
                                      (1)                      (2)                        (3)            
---------------------------------------------------------------------------------------------------------
year                             44,006.560***                                       82,552.530***       
                                  (6,491.705)                                         (2,013.024)        
                                                                                                         
km_driven                           -1.166**                                                             
                                    (0.459)                                                              
                                                                                                         
fuelDiesel                        296,595.700              252,788.500                                   
                                 (226,924.700)            (219,959.900)                                  
                                                                                                         
fuelLPG                            69,555.700               32,723.790                                   
                                 (334,118.700)            (325,144.400)                                  
                                                                                                         
fuelPetrol                        -41,196.210              -95,878.360                                   
                                 (227,194.200)            (219,786.400)                                  
                                                                                                         
seller_typeIndividual           -372,529.100***          -386,144.800***                                 
                                  (57,260.220)             (55,498.880)                                  
                                                                                                         
seller_typeTrustmark Dealer     -556,910.000***          -622,594.600***                                 
                                 (126,241.800)            (124,548.500)                                  
                                                                                                         
as.factor(year)1998                                        -54,034.750                                   
                                                          (614,468.800)                                  
                                                                                                         
as.factor(year)1999                                        -67,264.460                                   
                                                          (490,706.400)                                  
                                                                                                         
as.factor(year)2000                                         23,427.510                                   
                                                          (629,284.400)                                  
                                                                                                         
as.factor(year)2001                                        -66,852.070                                   
                                                          (615,118.200)                                  
                                                                                                         
as.factor(year)2002                                        -24,572.490                                   
                                                          (629,284.400)                                  
                                                                                                         
as.factor(year)2003                                        -160,087.000                                  
                                                          (406,443.300)                                  
                                                                                                         
as.factor(year)2004                                        -50,904.250                                   
                                                          (486,066.200)                                  
                                                                                                         
as.factor(year)2005                                        -264,154.700                                  
                                                          (341,042.000)                                  
                                                                                                         
as.factor(year)2006                                        -85,117.580                                   
                                                          (339,214.700)                                  
                                                                                                         
as.factor(year)2007                                        -222,987.100                                  
                                                          (325,924.200)                                  
                                                                                                         
as.factor(year)2008                                        -68,366.040                                   
                                                          (325,363.600)                                  
                                                                                                         
as.factor(year)2009                                        -86,915.210                                   
                                                          (322,927.400)                                  
                                                                                                         
as.factor(year)2010                                        -112,889.700                                  
                                                          (317,820.000)                                  
                                                                                                         
as.factor(year)2011                                        -110,899.700                                  
                                                          (314,083.300)                                  
                                                                                                         
as.factor(year)2012                                        -68,992.440                                   
                                                          (315,025.500)                                  
                                                                                                         
as.factor(year)2013                                         2,781.216                                    
                                                          (314,682.000)                                  
                                                                                                         
as.factor(year)2014                                         87,656.820                                   
                                                          (314,819.800)                                  
                                                                                                         
as.factor(year)2015                                        155,592.600                                   
                                                          (315,066.400)                                  
                                                                                                         
as.factor(year)2016                                        202,072.000                                   
                                                          (314,511.500)                                  
                                                                                                         
as.factor(year)2017                                        242,434.600                                   
                                                          (313,875.300)                                  
                                                                                                         
as.factor(year)2018                                        292,068.700                                   
                                                          (315,671.300)                                  
                                                                                                         
as.factor(year)2019                                       937,492.700***                                 
                                                          (317,235.600)                                  
                                                                                                         
as.factor(year)2020                                        377,415.900                                   
                                                          (369,732.200)                                  
                                                                                                         
transmissionManual             -1,231,620.000***        -1,116,310.000***                                
                                  (58,322.750)             (58,043.060)                                  
                                                                                                         
ownerFourth                       Above Owner               24,614.720                -27,279.580        
                                 (136,688.100)            (141,131.600)                                  
                                                                                                         
ownerSecond Owner                 -18,363.470               12,676.200                                   
                                  (46,139.660)             (45,701.770)                                  
                                                                                                         
ownerTest Drive Car             3,763,961.000***         3,371,201.000***                                
                                 (556,357.600)            (536,923.600)                                  
                                                                                                         
ownerThird Owner                   67,920.020               48,597.860                                   
                                  (78,612.490)             (77,423.480)                                  
                                                                                                         
mileage                             35.039*                                                              
                                    (20.474)                                                             
                                                                                                         
max_power                            -0.843                                                              
                                    (3.565)                                                              
                                                                                                         
seats                              5,687.288                 493.519                                     
                                  (20,070.800)             (19,011.820)                                  
                                                                                                         
Constant                       -86,701,676.000***        1,707,717.000***         -165,606,347.000***    
                                (13,092,836.000)          (398,049.500)             (4,053,843.000)      
                                                                                                         
---------------------------------------------------------------------------------------------------------
Observations                          978                      978                       8,128           
R2                                   0.550                    0.589                      0.171           
Adjusted R2                          0.543                    0.574                      0.171           
Residual Std. Error          550,288.000 (df = 962)   531,236.200 (df = 943)    733,925.300 (df = 8126)  
F Statistic                 78.247*** (df = 15; 962) 39.666*** (df = 34; 943) 1,681.756*** (df = 1; 8126)
=========================================================================================================
Note:                                                                         *p<0.1; **p<0.05; ***p<0.01

Insights

Car Price in Relation to Year Manufactured

  • On average, for every year newer a car is, it cost 43,000 additional dollars.

  • For this reason, buying old/used cars is much more money efficient than buying new cars

Car Price in Relation to Fuel Type

  • Compared to cars which use CNG fuel, diesel cars sell for $296,000 higher on average, and petrol vehicles sell for $16,000 higher.

  • This is likely due to diesel vehicles being large semi-trucks and buses, while petrol and CNG vehicles are cars

Car Price in Relation to Transmission Type, Mileage, Max_Power, and Seats

  • Compared to cars with automatic transmission, cars with manual transmission sell for $1,256,237 lower

  • For every additional mile of mileage a car has, its price increases by $14

  • For every additional unit of horsepower a car has, its price increases by $9

  • For every additional seat a car has, its price increases by $37,000

  • All of these car options can together vastly increase, or decrease the price of your car, so being conscious of these aspects when buying a car can increase car affordability depending on your circumstances

Car Price in Relation to Owner

  • Compared to the average price the first owner of a car payed, the second owner of a car payed $15,000 less, and the third owner payed $74,000 more

  • Compared to first owners of cars, second owners of cars are likely buying cars for cheaper, because they are owning lower end cars, where the value of the vehicle is assessed based only its condition and its original value.

  • Third car owners are likely buying cars for a higher price, because third car owners are likely those owning more expensive cars, where the value of the vehicle is assessed by its, condition, its original value, and its exclusivity.

  • Alternatively, third car owners may be a skewed metric, because of one or two third car owners who bought cars with a price to a very high extreme