IIMK ADSM Batch 2020-21

Capstone Project Team - Abdul Rehman, Siju Joseph, Vikesh Kumar, Venkata Ramana Kaza, Venu Gopal Chittayil

Data Setup

# Point the session at the project folder so the relative data path resolves.
# NOTE(review): absolute, user-specific path - confirm it exists on the
# machine that runs this script.
setwd("C:/Users/kaza_/OneDrive/IIMK Project")

# Read the training set; read.csv() already defaults to a header row and a
# comma separator, so only the file needs to be named.
trainData <- read.csv(file = "Data/train.csv")
colnames(trainData)
##  [1] "id"                  "car_name"            "yr_mfr"             
##  [4] "fuel_type"           "kms_run"             "sale_price"         
##  [7] "city"                "times_viewed"        "body_type"          
## [10] "transmission"        "variant"             "assured_buy"        
## [13] "registered_city"     "registered_state"    "is_hot"             
## [16] "rto"                 "source"              "make"               
## [19] "model"               "car_availability"    "total_owners"       
## [22] "broker_quote"        "original_price"      "car_rating"         
## [25] "ad_created_on"       "fitness_certificate" "emi_starts_from"    
## [28] "booking_down_pymnt"  "reserved"            "warranty_avail"
# Inspect the column types of the data exactly as it was read from disk
str(trainData)
## 'data.frame':    6399 obs. of  30 variables:
##  $ id                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ car_name           : chr  "maruti swift" "maruti alto 800" "hyundai grand i10" "maruti swift" ...
##  $ yr_mfr             : int  2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
##  $ fuel_type          : chr  "petrol" "petrol" "petrol" "diesel" ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ city               : chr  "noida" "noida" "noida" "noida" ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ body_type          : chr  "hatchback" "hatchback" "hatchback" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ variant            : chr  "lxi opt" "lxi" "sports 1.2 vtvt" "vdi" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ registered_city    : chr  "delhi" "noida" "agra" "delhi" ...
##  $ registered_state   : chr  "delhi" "uttar pradesh" "uttar pradesh" "delhi" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ rto                : chr  "dl6c" "up16" "up80" "dl1c" ...
##  $ source             : chr  "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
##  $ make               : chr  "maruti" "maruti" "hyundai" "maruti" ...
##  $ model              : chr  "swift" "alto 800" "grand i10" "swift" ...
##  $ car_availability   : chr  "in_stock" "in_stock" "in_stock" "in_stock" ...
##  $ total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ original_price     : num  404177 354313 NA 374326 367216 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ ad_created_on      : chr  "2021-04-04T07:09:18.583" "2021-03-22T14:07:32.833" "2021-03-20T05:36:31.311" "2021-01-21T12:59:19.299" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ emi_starts_from    : int  8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
##  $ booking_down_pymnt : int  57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
##  $ reserved           : chr  "False" "False" "False" "False" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
# Remove the ID column by name, so the step does not depend on column order
trainData$id <- NULL

# Convert ad_created_on to Date (it is read as character, e.g.
# "2021-04-04T07:09:18.583"). The explicit format keeps the leading date part
# and turns blank or malformed values into NA; without it as.Date() stops
# with an error when the first non-missing value cannot be parsed.
trainData$ad_created_on <- as.Date(trainData$ad_created_on, format = "%Y-%m-%d")

# car_age: year the ad was created minus the year of manufacture
trainData$car_age <-
  as.numeric(format(trainData$ad_created_on, "%Y")) - trainData$yr_mfr

# diff_sale_orig_price: original price minus sale price
# (NA wherever original_price is missing)
trainData$diff_sale_orig_price <-
  trainData$original_price - trainData$sale_price

# Convert the categorical columns to factors. The names are listed once so
# the selection and the assignment target cannot drift apart.
factor_cols <- c(
  "car_name", "fuel_type", "city", "body_type", "transmission", "variant",
  "assured_buy", "registered_city", "registered_state", "is_hot", "rto",
  "source", "make", "model", "car_availability", "car_rating",
  "fitness_certificate", "reserved", "warranty_avail"
)
trainData[factor_cols] <- lapply(trainData[factor_cols], factor)

# Confirm the resulting column types
str(trainData)
## 'data.frame':    6399 obs. of  31 variables:
##  $ car_name            : Factor w/ 182 levels "audi a3","audi a4",..: 104 87 49 104 49 88 101 52 46 135 ...
##  $ yr_mfr              : int  2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
##  $ fuel_type           : Factor w/ 5 levels "diesel","electric",..: 3 3 3 1 3 3 1 3 1 3 ...
##  $ kms_run             : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ sale_price          : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ city                : Factor w/ 13 levels "ahmedabad","bengaluru",..: 12 12 12 12 12 12 12 12 12 12 ...
##  $ times_viewed        : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ body_type           : Factor w/ 6 levels "","hatchback",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ transmission        : Factor w/ 3 levels "","automatic",..: 3 3 3 3 3 1 3 3 3 3 ...
##  $ variant             : Factor w/ 873 levels "1.0 climber opt amt",..: 518 507 616 693 532 726 693 301 533 560 ...
##  $ assured_buy         : Factor w/ 2 levels "False","True": 2 2 2 2 1 2 2 2 2 2 ...
##  $ registered_city     : Factor w/ 231 levels "","-do","agra",..: 64 152 3 64 150 64 64 152 152 152 ...
##  $ registered_state    : Factor w/ 17 levels "","andhra pradesh",..: 5 15 15 5 5 5 5 15 15 15 ...
##  $ is_hot              : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
##  $ rto                 : Factor w/ 255 levels "ap02","ap04",..: 37 228 243 32 29 33 29 228 228 228 ...
##  $ source              : Factor w/ 4 levels "","customer_to_customer",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ make                : Factor w/ 26 levels "audi","bmw","chevrolet",..: 14 14 8 14 8 14 14 8 8 20 ...
##  $ model               : Factor w/ 182 levels "3 series","5 series",..: 149 12 72 149 72 13 131 79 51 94 ...
##  $ car_availability    : Factor w/ 5 levels "","in_stock",..: 2 2 2 2 2 2 2 2 2 3 ...
##  $ total_owners        : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ broker_quote        : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ original_price      : num  404177 354313 NA 374326 367216 ...
##  $ car_rating          : Factor w/ 5 levels "","fair","good",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ ad_created_on       : Date, format: "2021-04-04" "2021-03-22" ...
##  $ fitness_certificate : Factor w/ 3 levels "","False","True": 3 3 3 3 3 3 3 3 3 3 ...
##  $ emi_starts_from     : int  8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
##  $ booking_down_pymnt  : int  57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
##  $ reserved            : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
##  $ warranty_avail      : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
##  $ car_age             : num  6 5 4 8 6 3 8 9 7 3 ...
##  $ diff_sale_orig_price: num  17778 88814 NA 66327 5717 ...
# testData <- read.csv("Data/test.csv", header = TRUE, sep = ",")
# names(testData)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5

Cars by Make

# Horizontal bar chart: number of listings per make.
# Only rows with no blank value in the seven descriptive fields are counted.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = make)) +
  geom_bar(fill = "#0c4c8a") +
  # Write each bar's row count next to it
  geom_text(
    stat = "count",
    mapping = aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = 0,
    check_overlap = TRUE,
    size = 3
  ) +
  coord_flip() +
  labs(title = "Cars by Make") +
  theme_minimal()

Findings

  • Top 3 makes are Maruti, Hyundai and Honda
  • Nearly 80% of the market is with these makes

Cars by Body Type

# Horizontal bar chart: number of listings per body type.
# Only rows with no blank value in the seven descriptive fields are counted.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = body_type)) +
  geom_bar(fill = "#0c4c8a") +
  # Write each bar's row count next to it
  geom_text(
    stat = "count",
    mapping = aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = 0,
    check_overlap = TRUE,
    size = 3
  ) +
  coord_flip() +
  labs(title = "Cars by Body Type") +
  theme_minimal()

Findings

  • Top 3 body types are Hatchback, Sedan, SUV
  • 50% of the market is filled with Hatchbacks

Cars by Make and Body Type

# Listings per make, drawn as one panel per body type.
# Only rows with no blank value in the seven descriptive fields are counted.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = make)) +
  geom_bar(position = "dodge", fill = "#0c4c8a") +
  # Write each bar's row count next to it
  geom_text(
    stat = "count",
    mapping = aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = 0,
    check_overlap = TRUE,
    size = 3
  ) +
  facet_wrap(vars(body_type)) +
  coord_flip() +
  labs(title = "Cars by Make and Body Type") +
  theme_minimal()

Findings

  • Hatchback - Maruti and Hyundai have the major share.
  • Sedan - Maruti and Honda have the major share.
  • SUV - Maruti has the major share followed by Mahindra, Ford and Renault.
  • 50% of Hatchbacks are from Maruti
  • 90% of Sedans are from Maruti
  • 40% of SUVs are from Maruti

Cars by Fuel Type

# Horizontal bar chart: number of listings per fuel type.
# Only rows with no blank value in the seven descriptive fields are counted.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = fuel_type)) +
  geom_bar(fill = "#0c4c8a") +
  # Write each bar's row count next to it
  geom_text(
    stat = "count",
    mapping = aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = 0,
    check_overlap = TRUE,
    size = 3
  ) +
  coord_flip() +
  labs(title = "Cars by Fuel Type") +
  theme_minimal()

Findings

  • Top 3 fuel types are Petrol, Diesel and Petrol & CNG.
  • More than 50% of the cars are petrol variant

Cars by Make and Fuel Type

# Listings per make, drawn as one panel per fuel type.
# Only rows with no blank value in the seven descriptive fields are counted.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = make)) +
  geom_bar(position = "dodge", fill = "#0c4c8a") +
  # Write each bar's row count next to it
  geom_text(
    stat = "count",
    mapping = aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = 0,
    check_overlap = TRUE,
    size = 3
  ) +
  facet_wrap(vars(fuel_type)) +
  coord_flip() +
  labs(title = "Cars by Make and Fuel Type") +
  theme_minimal()

Findings

  • Petrol - Maruti, Hyundai and Honda have the major share.
  • Diesel - Maruti, Toyota and Mahindra have the major share.
  • Petrol & CNG - Maruti is leading.
  • Maruti has close to 40% share in both Petrol and Diesel variants.

Cars by Transmission

# Horizontal bar chart: number of listings per transmission type.
# Only rows with no blank value in the seven descriptive fields are counted.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = transmission)) +
  geom_bar(fill = "#0c4c8a") +
  # Write each bar's row count next to it
  geom_text(
    stat = "count",
    mapping = aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = 0,
    check_overlap = TRUE,
    size = 3
  ) +
  coord_flip() +
  labs(title = "Cars by Transmission") +
  theme_minimal()

Findings

  • Manual and Automatic are the only two transmissions available
  • Manual has the major share with up to 90%

Cars by Make and Transmission

# Listings per make, drawn as one panel per transmission type.
# Only rows with no blank value in the seven descriptive fields are counted.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = make)) +
  geom_bar(position = "dodge", fill = "#0c4c8a") +
  # Write each bar's row count next to it
  geom_text(
    stat = "count",
    mapping = aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = 0,
    check_overlap = TRUE,
    size = 3
  ) +
  facet_wrap(vars(transmission)) +
  coord_flip() +
  labs(title = "Cars by Make and Transmission") +
  theme_minimal()

Findings

  • In Automatic transmission Maruti has the major share up to 30%

Models among Top 3 Makes

# Draw one dodged bar per model of a single make, labelled with its row count.
#
# cars        data frame with the columns used below (trainData in this report)
# target_make value(s) of `make` to keep, e.g. "maruti"
# plot_title  title shown above the chart
#
# Returns the ggplot object; calling the function at top level prints it.
# The same pipeline was previously pasted once per make, differing only in
# the make and the title.
plot_models_for_make <- function(cars, target_make, plot_title) {
  cars %>%
    filter(
      # keep only rows with no blank value in the seven descriptive fields
      !(body_type %in% ""),
      !(transmission %in% ""),
      !(registered_state %in% ""),
      !(source %in% ""),
      !(car_availability %in% ""),
      !(car_rating %in% ""),
      !(fitness_certificate %in% ""),
      make %in% target_make
    ) %>%
    ggplot(mapping = aes(x = make, fill = model)) +
    geom_bar(position = "dodge") +
    scale_fill_hue() +
    labs(title = plot_title) +
    theme_minimal() +
    geom_text(
      stat = "count",
      mapping = aes(label = after_stat(count)),
      position = position_dodge(0.9),
      vjust = 0,
      check_overlap = TRUE,
      size = 3
    ) +
    # many models per make: put the legend below the plot
    theme(legend.position = "bottom")
}

plot_models_for_make(trainData, "maruti", "Models in Maruti")

plot_models_for_make(trainData, "hyundai", "Models in Hyundai")

plot_models_for_make(trainData, "honda", "Models in Honda")

Findings

  • Top 3 models among the top 3 makes are:
    • Maruti - Swift, Swift Dzire, Wagon R 1.0
    • Hyundai - i10, Grand i10, i20
    • Honda - City, Amaze, Brio

Cars by Name

# Horizontal bar chart: number of listings per car name, limited to the
# makes maruti, hyundai and honda and to rows with no blank value in the
# seven descriptive fields.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    make %in% c("maruti", "hyundai", "honda"),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = car_name)) +
  geom_bar(fill = "#0c4c8a") +
  # Write each bar's row count next to it
  geom_text(
    stat = "count",
    mapping = aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = 0,
    check_overlap = TRUE,
    size = 3
  ) +
  coord_flip() +
  labs(
    title = "Cars by Name",
    subtitle = "Among top 3 makes - Maruti, Hyundai, Honda"
  ) +
  theme_minimal()

Findings

  • Top 5 cars are: Maruti Swift, Hyundai i10, Maruti Swift Dzire, Maruti Wagon R 1.0, Hyundai Grand i10.
  • These 5 models take 25% share in the market.

Cars by Year of Manufacturing

# Rows with no blank value in the seven descriptive fields. Computed once and
# shared by both plots of this section instead of repeating the filter chain.
complete_cars <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )

# Histogram of manufacturing year, with the row count of each year printed
# at the top of its bar
ggplot(complete_cars, mapping = aes(x = yr_mfr)) +
  geom_histogram(bins = 50L, fill = "#0c4c8a") +
  labs(title = "Cars by Year of Manufacturing") +
  theme_minimal() +
  geom_text(
    stat = "count",
    mapping = aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = 0,
    check_overlap = TRUE,
    size = 3
  )

# One box per make showing the spread of manufacturing years
ggplot(complete_cars, mapping = aes(x = "", y = yr_mfr, fill = make)) +
  geom_boxplot() +
  scale_fill_hue() +
  labs(title = "Cars by Make and Year of Manufacturing") +
  theme_minimal()

Findings

  • Most of the cars are in the manufacturing years of 2010 to 2018.
  • 40% of the cars are from 2012 to 2015 years.
  • The oldest car is of Opel make and the youngest is of Tata make.
  • Most of the old cars are of Maruti make.
  • The mean manufacturing year of Ford, Honda, Hyundai, Nissan, Skoda, Ssangyong and Toyota is same (around 2013)

Cars by Kilometers Run

# Rows with no blank value in the seven descriptive fields. Computed once and
# shared by every plot of this section instead of repeating the filter chain.
complete_cars <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )

# 50-bin histogram of kms_run for the rows in `cars`.
# `subtitle` is optional; NULL draws the plot without a subtitle.
kms_histogram <- function(cars, subtitle = NULL) {
  ggplot(cars, mapping = aes(x = kms_run)) +
    geom_histogram(bins = 50L, fill = "#0c4c8a") +
    labs(title = "Cars by Kilometers Run", subtitle = subtitle) +
    theme_minimal()
}

# Full range
kms_histogram(complete_cars)

# Low band: 0 to 2,50,000 km, both ends included
complete_cars %>%
  filter(kms_run >= 0L & kms_run <= 250000L) %>%
  kms_histogram(subtitle = "0 to 2,50,000 KM")

# High band: strictly above 2,50,000 km, matching the subtitle, so a car at
# exactly 2,50,000 km is drawn in the low band only and not in both plots
complete_cars %>%
  filter(kms_run > 250000L & kms_run <= 1000000L) %>%
  kms_histogram(subtitle = "> 2,50,000 KM")

# One box per make showing the spread of kilometres run
ggplot(complete_cars, mapping = aes(x = "", y = kms_run, fill = make)) +
  geom_boxplot() +
  scale_fill_hue() +
  labs(title = "Kilometers Run by Make") +
  theme_minimal()

Findings

  • Most of the cars are in the range of 35,000 to 55,000 KM.
  • Lowest KM is of Audi make.
  • Highest KM is of Maruti make and followed by Toyota.

Cars by Sale Price

# Rows with no blank value in the seven descriptive fields. Computed once and
# shared by every plot of this section instead of repeating the filter chain.
complete_cars <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )

# 50-bin histogram of sale_price for the rows in `cars`.
# `subtitle` is optional; NULL draws the plot without a subtitle.
sale_price_histogram <- function(cars, subtitle = NULL) {
  ggplot(cars, mapping = aes(x = sale_price)) +
    geom_histogram(bins = 50L, fill = "#0c4c8a") +
    labs(title = "Sale Price Range", subtitle = subtitle) +
    theme_minimal()
}

# Box plot of sale_price with one box per make found in `cars`
sale_price_boxplot <- function(cars, plot_title) {
  ggplot(cars, mapping = aes(x = "", y = sale_price, fill = make)) +
    geom_boxplot() +
    scale_fill_hue() +
    labs(title = plot_title) +
    theme_minimal()
}

# Full range
sale_price_histogram(complete_cars)

# The three price bands below do not overlap: every band after the first
# excludes its lower bound, so a car priced at exactly 5,00,000 or 10,00,000
# appears in one histogram only (it was previously drawn in two).
complete_cars %>%
  filter(sale_price >= 0L & sale_price <= 500000L) %>%
  sale_price_histogram(subtitle = "< 5,00,000/-")

complete_cars %>%
  filter(sale_price > 500000L & sale_price <= 1000000L) %>%
  sale_price_histogram(subtitle = "> 5,00,000/- and < 10,00,000/-")

complete_cars %>%
  filter(sale_price > 1000000L & sale_price <= 3600000L) %>%
  sale_price_histogram(subtitle = "> 10,00,000/-")

# Spread of sale price for every make
sale_price_boxplot(complete_cars, "Sale Price Range by Make")

# Spread of sale price for maruti, hyundai and honda only
complete_cars %>%
  filter(make %in% c("maruti", "hyundai", "honda")) %>%
  sale_price_boxplot("Sale Price Range among top 3 makes")

Findings

  • Majority share (~ 40%) is in the sale price range of 2,50,000/- to 4,00,000/-.
  • The costliest one is of Audi make. The next costliest one is in Toyota.
  • Among the top 3 makes:
    • The average sale price is in the range of 2,75,000/- to 3,00,000/-.
    • The costliest car is in Hyundai.
    • The cheapest car is in Maruti.

Sale Price of Cars by Year of Manufacturing and Body Type

# Scatter of sale price against manufacturing year, one panel per body type.
# Only rows with no blank value in the seven descriptive fields are drawn.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = yr_mfr, y = sale_price)) +
  geom_point(size = 1L, colour = "#0c4c8a") +
  facet_wrap(vars(body_type)) +
  labs(
    title = "Sale Price of All Cars by Year of Manufacturing",
    subtitle = "Grouped by Body Type"
  ) +
  theme_minimal()

# Same scatter, limited to the makes maruti, hyundai and honda and to the
# body types hatchback, sedan and suv; rows with a blank value in any of the
# remaining descriptive fields are left out.
trainData %>%
  filter(
    body_type %in% c("hatchback", "sedan", "suv"),
    make %in% c("maruti", "hyundai", "honda"),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = yr_mfr, y = sale_price)) +
  geom_point(size = 1L, colour = "#0c4c8a") +
  facet_wrap(vars(body_type)) +
  labs(
    title = "Sale Price of Maruti, Hyundai and Honda by Year of Manufacturing",
    subtitle = "Grouped by Top 3 Body Types"
  ) +
  theme_minimal()

Findings

  • Older the car, lower the sale price

Cars by Times Viewed

# Rows with no blank value in the seven descriptive fields. Computed once and
# shared by every plot of this section instead of repeating the filter chain.
complete_cars <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )

# 50-bin histogram of times_viewed for the rows in `cars`.
# `subtitle` is optional; NULL draws the plot without a subtitle.
views_histogram <- function(cars, subtitle = NULL) {
  ggplot(cars, mapping = aes(x = times_viewed)) +
    geom_histogram(bins = 50L, fill = "#0c4c8a") +
    labs(title = "Range of Views", subtitle = subtitle) +
    theme_minimal()
}

# Box plot of times_viewed with one box per make found in `cars`
views_boxplot <- function(cars, plot_title) {
  ggplot(cars, mapping = aes(x = "", y = times_viewed, fill = make)) +
    geom_boxplot() +
    scale_fill_hue() +
    labs(title = plot_title) +
    theme_minimal()
}

# Full range
views_histogram(complete_cars)

# The three view bands below do not overlap: every band after the first
# excludes its lower bound, so a car viewed exactly 5000 or 10000 times
# appears in one histogram only (it was previously drawn in two).
complete_cars %>%
  filter(times_viewed >= 0L & times_viewed <= 5000L) %>%
  views_histogram(subtitle = "< 5000")

complete_cars %>%
  filter(times_viewed > 5000L & times_viewed <= 10000L) %>%
  views_histogram(subtitle = "> 5000 and < 10000")

complete_cars %>%
  filter(times_viewed > 10000L & times_viewed <= 46500L) %>%
  views_histogram(subtitle = "> 10000")

# Spread of view counts for every make
views_boxplot(complete_cars, "Range of Views by Make")

# Spread of view counts for maruti, hyundai and honda only
complete_cars %>%
  filter(make %in% c("maruti", "hyundai", "honda")) %>%
  views_boxplot("Range of Views among top 3 Makes")

Findings

  • Most of the cars have views in the range of 300 to 800.
  • There are 2 cars which were viewed more than 40,000 times which are of Hyundai and Mercedes Benz makes
  • Top 3 makes are viewed below 2000 times on an average

Cars by City

# Rows with no blank value in the seven descriptive fields. Computed once and
# shared by the four plots of this section instead of repeating the filter
# chain for each of them.
complete_cars <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )

# Number of listings per city
ggplot(complete_cars, mapping = aes(x = city)) +
  geom_bar(position = "dodge", fill = "#0c4c8a") +
  labs(title = "Cars by City") +
  coord_flip() +
  theme_minimal()

# Listings per city, one bar per make
ggplot(complete_cars, mapping = aes(x = city, fill = make)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by City and Make") +
  coord_flip() +
  theme_minimal()

# Listings per city, one bar per body type
ggplot(complete_cars, mapping = aes(x = city, fill = body_type)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by City and Body Type") +
  coord_flip() +
  theme_minimal()

# Listings per city, one bar per fuel type
ggplot(complete_cars, mapping = aes(x = city, fill = fuel_type)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by City and Fuel Type") +
  coord_flip() +
  theme_minimal()

Findings

  • Top 5 cities - Mumbai, Bengaluru, New Delhi, Pune and Chennai.

  • Approx. 50% of cars are in the top 3 cities.

  • City / Make

    • Most of Maruti are in Mumbai, Bengaluru, New Delhi.
    • Most of Hyundai are in Bengaluru, Mumbai, New Delhi.
    • Most of Honda are in Mumbai, New Delhi, Pune.
  • City / Body Type

    • Top 3 cities for Hatchback - Bengaluru, Mumbai, New Delhi.
    • Top 3 cities for Sedan - Mumbai, New Delhi, Bengaluru.
    • Top 3 cities for SUV - Mumbai, New Delhi, Bengaluru.
  • City / Fuel Type

    • Most Petrol cars are from Bengaluru, Mumbai, Pune.
    • Most Diesel cars are from New Delhi, Mumbai, Bengaluru.

Cars by Total Owners

# Histogram of the number of owners per car.
# Only rows with no blank value in the seven descriptive fields are counted.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = total_owners)) +
  geom_histogram(bins = 30L, position = "dodge", fill = "#0c4c8a") +
  theme_minimal() +
  labs(title = "Cars by Total Owners")

Findings

  • Most of the cars are first owner, approx. 75%

Cars by Hotness

trainData %>%
 filter(!(body_type %in% "")) %>%
 filter(!(transmission %in% "")) %>%
 
    filter(!(registered_state %in% "")) %>%
 filter(!(source %in% "")) %>%
 filter(!(car_availability %in% 
    "")) %>%
 filter(!(car_rating %in% "")) %>%
 filter(!(fitness_certificate %in% 
    "")) %>%
 ggplot() +
 aes(x = city, fill = is_hot) +
 geom_bar(position = "dodge") +
 scale_fill_hue() +
 labs(title = "Cars by City and Hotness") +
 coord_flip() +
 theme_minimal()

trainData %>%
 filter(!(body_type %in% "")) %>%
 filter(!(transmission %in% "")) %>%
 
    filter(!(registered_state %in% "")) %>%
 filter(!(source %in% "")) %>%
 filter(!(car_availability %in% 
    "")) %>%
 filter(!(car_rating %in% "")) %>%
 filter(!(fitness_certificate %in% 
    "")) %>%
 ggplot() +
 aes(x = make, fill = is_hot) +
 geom_bar(position = "dodge") +
 scale_fill_hue() +
 labs(title = "Cars by Make and Hotness") +
 coord_flip() +
 theme_minimal()

# Number of cars per body type, with side-by-side bars for each is_hot value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = body_type, fill = is_hot)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Body Type and Hotness") +
  coord_flip() + # horizontal bars: body types on the vertical axis
  theme_minimal()

# Histogram of yr_mfr, with dodged bars for each is_hot value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = yr_mfr, fill = is_hot)) +
  geom_histogram(
    bins = 30L,
    position = "dodge"
  ) +
  scale_fill_hue() +
  labs(title = "Cars by Year of Manufacturing and Hotness") +
  theme_minimal()

# Histogram of total_owners, with dodged bars for each is_hot value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = total_owners, fill = is_hot)) +
  geom_histogram(
    bins = 30L,
    position = "dodge"
  ) +
  scale_fill_hue() +
  labs(title = "Cars by Total Owners and Hotness") +
  theme_minimal()

Findings

  • Most of the hot cars are in Mumbai.
  • Most of the not so hot cars are in New Delhi.
  • Maruti has the most hot and not so hot cars.
  • Hatchback body type has the most hot and not so hot cars.
  • Most of the hot cars are in 2012 to 2015 manufacturing years.
  • 1 owner cars are mostly hot.

Cars by Assured Buy

# Number of cars per city, with side-by-side bars for each assured_buy value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = city, fill = assured_buy)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by City and Assured Buy") +
  coord_flip() + # horizontal bars: city names on the vertical axis
  theme_minimal()

# Number of cars per make, with side-by-side bars for each assured_buy value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = make, fill = assured_buy)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by Make and Assured Buy") +
  coord_flip() + # horizontal bars: makes on the vertical axis
  theme_minimal()

# Number of cars per body type, with side-by-side bars for each assured_buy value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = body_type, fill = assured_buy)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by Body Type and Assured Buy") +
  coord_flip() + # horizontal bars: body types on the vertical axis
  theme_minimal()

Findings

  • Most of the assured buy cars are in Mumbai.
  • Most of the not assured buy cars are in Bengaluru.
  • Maruti has most of the assured buy and not assured buy cars.
  • Hatchback body type has the most assured buy and not assured buy cars.

Cars by Warranty Availability

# Number of cars per city, with side-by-side bars for each warranty_avail value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = city, fill = warranty_avail)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by City and Warranty Availability") +
  coord_flip() + # horizontal bars: city names on the vertical axis
  theme_minimal()

# Number of cars per make, with side-by-side bars for each warranty_avail value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = make, fill = warranty_avail)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by Make and Warranty Availability") +
  coord_flip() + # horizontal bars: makes on the vertical axis
  theme_minimal()

# Number of cars per body type, with side-by-side bars for each warranty_avail value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = body_type, fill = warranty_avail)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by Body Type and Warranty Availability") +
  coord_flip() + # horizontal bars: body types on the vertical axis
  theme_minimal()

Findings

  • Most of the in warranty cars are in New Delhi.
  • Most of the not in warranty cars are in Mumbai.
  • Maruti has most of the in warranty and not in warranty cars.
  • Hatchback body type has most of the in warranty and not in warranty cars.

Cars by Rating

# Number of cars per car_rating value, drawn as single-colour horizontal bars.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = car_rating)) +
  geom_bar(fill = "#0c4c8a") +
  labs(title = "Cars by Rating") +
  coord_flip() + # horizontal bars: ratings on the vertical axis
  theme_minimal()

# Number of cars per city, with side-by-side bars for each car_rating value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = city, fill = car_rating)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by City and Rating") +
  coord_flip() + # horizontal bars: city names on the vertical axis
  theme_minimal()

# Number of cars per make, with side-by-side bars for each car_rating value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = make, fill = car_rating)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by Make and Rating") +
  coord_flip() + # horizontal bars: makes on the vertical axis
  theme_minimal()

# Number of cars per body type, with side-by-side bars for each car_rating value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = body_type, fill = car_rating)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by Body Type and Rating") +
  coord_flip() + # horizontal bars: body types on the vertical axis
  theme_minimal()

Findings

  • Most cars are in “great” category.
  • Most of the great cars are in Mumbai, most of the good cars are also in Mumbai, most of the fair cars are in New Delhi, and most of the overpriced cars are in Pune.
  • Maruti has most cars of all categories except overpriced.
  • Hatchback has the most cars of all categories.

Cars by Registered State

# Number of cars per registered_state, drawn as single-colour horizontal bars.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = registered_state)) +
  geom_bar(fill = "#0c4c8a") +
  labs(title = "Cars by Registered State") +
  coord_flip() + # horizontal bars: states on the vertical axis
  theme_minimal()

# Number of cars per registered_state, with side-by-side bars for each make.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = registered_state, fill = make)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by Registered State and Make") +
  coord_flip() + # horizontal bars: states on the vertical axis
  theme_minimal()

# Number of cars per registered_state, with side-by-side bars for each body_type.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = registered_state, fill = body_type)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by Registered State and Body Type") +
  coord_flip() + # horizontal bars: states on the vertical axis
  theme_minimal()

Findings

  • Most of the cars are registered in Maharashtra, Karnataka and Delhi (matching the top 3 cities).
  • Almost all makes are available in these 3 states.
  • Most of the top 3 makes (Maruti, Hyundai and Honda) are registered in Maharashtra (differs from the top 3 cities).
  • Most of the top 3 body types (Hatchback, Sedan, SUV) are registered in Maharashtra (differs from the top 3 cities).

Cars by hotness and times viewed vs manufacturing year

# Scatter plot of yr_mfr (y) against times_viewed (x), one panel per is_hot value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = times_viewed, y = yr_mfr)) +
  geom_point(size = 1L, colour = "#0c4c8a") +
  labs(
    title = "Year of Manufacturing vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  facet_wrap(vars(is_hot))

Findings

  • Cars marked hot and manufactured
    • Before 2010 have ~ 5000 views
    • Between 2010 and 2019 have > 5000 to ~ 10000 views
    • Between 2011 and 2018 have 8000 to 10000 views
    • After 2020 have < 5000 views
  • Cars marked not hot < 2000 views

Cars by hotness and times viewed vs sale price

# Scatter plot of sale_price (y) against times_viewed (x), one panel per is_hot value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = times_viewed, y = sale_price)) +
  geom_point(size = 1L, colour = "#0c4c8a") +
  labs(
    title = "Sale Price vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  facet_wrap(vars(is_hot))

Findings

  • Cars marked hot
    • Mostly have a sale price < 5,00,000/- and views < 5000
    • As the price increases, views reduce
  • Cars marked not hot mostly have a sale price < 5,00,000/- and views < 1000

Cars by hotness and times viewed vs KMs run

# Scatter plot of kms_run (y) against times_viewed (x), one panel per is_hot value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = times_viewed, y = kms_run)) +
  geom_point(size = 1L, colour = "#0c4c8a") +
  labs(
    title = "KMs Run vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  facet_wrap(vars(is_hot))

Findings

  • Cars marked hot that have run up to 1,00,000 KMs have up to 5000 views. As the KMs run increase, views decrease.
  • Cars marked not hot that have run up to 1,00,000 KMs have < 250 views

Cars by hotness and times viewed vs owners

# Scatter plot of total_owners (y) against times_viewed (x), one panel per is_hot value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = times_viewed, y = total_owners)) +
  geom_point(size = 1L, colour = "#0c4c8a") +
  labs(
    title = "Owners vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  facet_wrap(vars(is_hot))

Findings

  • Cars marked hot with 1 owner have up to 10,000 views. As the number of owners increases, the views reduce by about 2500
  • Cars marked not hot
    • With 1 owner have up to 500 views.
    • With 2 owners have 100 to 250 views.
    • With 3 and 4 owners have up to 500 views.

Cars by hotness and times viewed vs body type

# Box plot of times_viewed for each body_type, one panel per is_hot value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = body_type, y = times_viewed)) +
  geom_boxplot(fill = "#0c4c8a") +
  labs(
    title = "Body Type vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  facet_wrap(vars(is_hot))

Findings

  • Cars marked hot
    • Hatchback cars have 20,000 views.
    • Sedan and SUV cars have up to 10,000 views.
    • Mean views for these 3 categories is 1000.
  • Cars with < 1000 views are not hot irrespective of body type.

Cars by hotness and times viewed vs city

# Box plot of times_viewed for each city, one panel per is_hot value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = city, y = times_viewed)) +
  geom_boxplot(fill = "#0c4c8a") +
  labs(
    title = "City vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  coord_flip() + # horizontal boxes: city names on the vertical axis
  facet_wrap(vars(is_hot))

Findings

  • Cars marked hot
    • Have mean views < 2500 in most of the cities
    • From Bengaluru, Hyderabad, Mumbai, New Delhi and Pune have up to 10,000 views
  • Cars marked not hot
    • In Kolkata have mean views of > 1000
    • In the rest of the cities have between 250 and 500 views

Cars by hotness and times viewed vs registered state

# Box plot of times_viewed for each registered_state, one panel per is_hot value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = registered_state, y = times_viewed)) +
  geom_boxplot(fill = "#0c4c8a") +
  labs(
    title = "Registered State vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  coord_flip() + # horizontal boxes: states on the vertical axis
  facet_wrap(vars(is_hot))

Findings

  • Cars marked hot
    • Mean views from all states are < 2500
    • Most are from Delhi, Karnataka and Maharashtra
  • Cars marked not hot
    • West Bengal has mean views of up to 1000
    • The rest of the states are between 250 and 500 views

Cars by hotness and times viewed vs make

# Box plot of times_viewed for each make, one panel per is_hot value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = make, y = times_viewed)) +
  geom_boxplot(fill = "#0c4c8a") +
  labs(
    title = "Make vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  coord_flip() + # horizontal boxes: makes on the vertical axis
  facet_wrap(vars(is_hot))

Findings

  • Mean views of all makes for hot cars is < 5000
  • Mean views for all makes for not hot cars is < 1000

Cars by hotness and times viewed vs rating

# Box plot of times_viewed for each car_rating value, one panel per is_hot value.
# Rows with an empty string in any of the seven columns below are removed first.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = car_rating, y = times_viewed)) +
  geom_boxplot(fill = "#0c4c8a") +
  labs(
    title = "Rating vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  facet_wrap(vars(is_hot))

Findings

  • Cars marked hot and with fair rating have mean views more than those with good / great rating.
  • Mean views of cars not marked hot is < 1000 irrespective of their rating.

Summary of Findings and hence the Hypothesis that can be verified

  • 80% of the cars are from Maruti, Hyundai and Honda
  • 50% of the market is filled with Hatchbacks
  • 50% of Hatchbacks are from Maruti
  • 90% of Sedans are from Maruti
  • 40% of SUVs are from Maruti
  • More than 50% of the cars are petrol variant
  • Maruti has close to 40% share in both Petrol and Diesel variants.
  • Manual transmission has the major share with up to 90%
  • In Automatic transmission, Maruti has the major share up to 30%
  • Maruti Swift, Hyundai i10, Maruti Swift Dzire, Maruti Wagon R 1.0, Hyundai Grand i10 take 25% share in the market
  • 40% of the cars are from 2012 to 2015 manufacturing years.
  • 50% of the cars are in top 3 cities - Mumbai, Bengaluru, New Delhi.
  • 75% of the cars are first owner.
  • The higher the number of views, the more likely the car is marked hot.
  • The lower the KMs run, the more views a car gets.
  • Cars with a sale price < 5,00,000 get more views.