# Point the session at the project folder so the relative path below resolves.
# NOTE(review): this absolute path is machine-specific - confirm before running elsewhere.
setwd("C:/Users/kaza_/OneDrive/IIMK Project")
# Read the training set; read.csv already assumes a header row and a comma separator
trainData <- read.csv(file = "Data/train.csv")
# List the column names
colnames(trainData)
## [1] "id" "car_name" "yr_mfr"
## [4] "fuel_type" "kms_run" "sale_price"
## [7] "city" "times_viewed" "body_type"
## [10] "transmission" "variant" "assured_buy"
## [13] "registered_city" "registered_state" "is_hot"
## [16] "rto" "source" "make"
## [19] "model" "car_availability" "total_owners"
## [22] "broker_quote" "original_price" "car_rating"
## [25] "ad_created_on" "fitness_certificate" "emi_starts_from"
## [28] "booking_down_pymnt" "reserved" "warranty_avail"
# Show each column's type together with its first few values
str(object = trainData)
## 'data.frame': 6399 obs. of 30 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ car_name : chr "maruti swift" "maruti alto 800" "hyundai grand i10" "maruti swift" ...
## $ yr_mfr : int 2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
## $ fuel_type : chr "petrol" "petrol" "petrol" "diesel" ...
## $ kms_run : int 8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
## $ sale_price : int 386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
## $ city : chr "noida" "noida" "noida" "noida" ...
## $ times_viewed : int 18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
## $ body_type : chr "hatchback" "hatchback" "hatchback" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ variant : chr "lxi opt" "lxi" "sports 1.2 vtvt" "vdi" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ registered_city : chr "delhi" "noida" "agra" "delhi" ...
## $ registered_state : chr "delhi" "uttar pradesh" "uttar pradesh" "delhi" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ rto : chr "dl6c" "up16" "up80" "dl1c" ...
## $ source : chr "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
## $ make : chr "maruti" "maruti" "hyundai" "maruti" ...
## $ model : chr "swift" "alto 800" "grand i10" "swift" ...
## $ car_availability : chr "in_stock" "in_stock" "in_stock" "in_stock" ...
## $ total_owners : int 2 1 1 1 1 1 1 3 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
## $ original_price : num 404177 354313 NA 374326 367216 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ ad_created_on : chr "2021-04-04T07:09:18.583" "2021-03-22T14:07:32.833" "2021-03-20T05:36:31.311" "2021-01-21T12:59:19.299" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ emi_starts_from : int 8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
## $ booking_down_pymnt : int 57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
## $ reserved : chr "False" "False" "False" "False" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
# Remove the ID column by name. Dropping it by position (column 1) would
# discard a different column if this step were ever run a second time.
trainData$id <- NULL
# Convert ad_created_on to Date, since read.csv imported it as character
trainData$ad_created_on <- as.Date(trainData$ad_created_on)
# car_age: year the ad was created minus the year of manufacture
adYear <- as.numeric(format(trainData$ad_created_on, format = "%Y"))
trainData$car_age <- adYear - trainData$yr_mfr
# diff_sale_orig_price: original price minus sale price
# (NA wherever original_price is missing)
trainData$diff_sale_orig_price <- trainData$original_price - trainData$sale_price
# Columns to treat as categorical, listed once so both sides of the
# assignment below always refer to the same set
factorCols <- c(
  "car_name", "fuel_type", "city", "body_type", "transmission", "variant",
  "assured_buy", "registered_city", "registered_state", "is_hot", "rto",
  "source", "make", "model", "car_availability", "car_rating",
  "fitness_certificate", "reserved", "warranty_avail"
)
trainData[factorCols] <- lapply(trainData[factorCols], factor)
# Re-inspect the structure after the conversions
str(trainData)
## 'data.frame': 6399 obs. of 31 variables:
## $ car_name : Factor w/ 182 levels "audi a3","audi a4",..: 104 87 49 104 49 88 101 52 46 135 ...
## $ yr_mfr : int 2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
## $ fuel_type : Factor w/ 5 levels "diesel","electric",..: 3 3 3 1 3 3 1 3 1 3 ...
## $ kms_run : int 8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
## $ sale_price : int 386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
## $ city : Factor w/ 13 levels "ahmedabad","bengaluru",..: 12 12 12 12 12 12 12 12 12 12 ...
## $ times_viewed : int 18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
## $ body_type : Factor w/ 6 levels "","hatchback",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ transmission : Factor w/ 3 levels "","automatic",..: 3 3 3 3 3 1 3 3 3 3 ...
## $ variant : Factor w/ 873 levels "1.0 climber opt amt",..: 518 507 616 693 532 726 693 301 533 560 ...
## $ assured_buy : Factor w/ 2 levels "False","True": 2 2 2 2 1 2 2 2 2 2 ...
## $ registered_city : Factor w/ 231 levels "","-do","agra",..: 64 152 3 64 150 64 64 152 152 152 ...
## $ registered_state : Factor w/ 17 levels "","andhra pradesh",..: 5 15 15 5 5 5 5 15 15 15 ...
## $ is_hot : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
## $ rto : Factor w/ 255 levels "ap02","ap04",..: 37 228 243 32 29 33 29 228 228 228 ...
## $ source : Factor w/ 4 levels "","customer_to_customer",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ make : Factor w/ 26 levels "audi","bmw","chevrolet",..: 14 14 8 14 8 14 14 8 8 20 ...
## $ model : Factor w/ 182 levels "3 series","5 series",..: 149 12 72 149 72 13 131 79 51 94 ...
## $ car_availability : Factor w/ 5 levels "","in_stock",..: 2 2 2 2 2 2 2 2 2 3 ...
## $ total_owners : int 2 1 1 1 1 1 1 3 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
## $ original_price : num 404177 354313 NA 374326 367216 ...
## $ car_rating : Factor w/ 5 levels "","fair","good",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ ad_created_on : Date, format: "2021-04-04" "2021-03-22" ...
## $ fitness_certificate : Factor w/ 3 levels "","False","True": 3 3 3 3 3 3 3 3 3 3 ...
## $ emi_starts_from : int 8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
## $ booking_down_pymnt : int 57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
## $ reserved : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
## $ warranty_avail : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
## $ car_age : num 6 5 4 8 6 3 8 9 7 3 ...
## $ diff_sale_orig_price: num 17778 88814 NA 66327 5717 ...
# testData <- read.csv("Data/test.csv", header = TRUE, sep = ",")
# names(testData)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
# Rows used by every chart in this section: records with a blank ("") value in
# any of the seven categorical columns below are left out.
completeRows <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )
# Builds the text layer that prints each bar's count.
countLabels <- function() {
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = 0,
    check_overlap = TRUE,
    size = 3
  )
}
# Horizontal count bars for one categorical column.
countBars <- function(mapping, plotTitle) {
  ggplot(completeRows, mapping) +
    geom_bar(fill = "#0c4c8a") +
    countLabels() +
    coord_flip() +
    labs(title = plotTitle) +
    theme_minimal()
}
# Horizontal count bars per make, one panel per level of the facet column.
makeBarsFacetted <- function(facetSpec, plotTitle) {
  ggplot(completeRows, aes(x = make)) +
    geom_bar(position = "dodge", fill = "#0c4c8a") +
    countLabels() +
    coord_flip() +
    facet_wrap(facetSpec) +
    labs(title = plotTitle) +
    theme_minimal()
}
# Number of cars per make
countBars(aes(x = make), "Cars by Make")
# Number of cars per body type
countBars(aes(x = body_type), "Cars by Body Type")
# Cars per make, split into one panel per body type
makeBarsFacetted(vars(body_type), "Cars by Make and Body Type")
# Number of cars per fuel type
countBars(aes(x = fuel_type), "Cars by Fuel Type")
# Cars per make, split into one panel per fuel type
makeBarsFacetted(vars(fuel_type), "Cars by Make and Fuel Type")
# Number of cars per transmission type
countBars(aes(x = transmission), "Cars by Transmission")
# Cars per make, split into one panel per transmission type
makeBarsFacetted(vars(transmission), "Cars by Make and Transmission")
# Rows used by every chart in this section: records with a blank ("") value in
# any of the seven categorical columns below are left out.
completeRows <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )
# Builds the text layer that prints each bar's count.
countLabels <- function() {
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = 0,
    check_overlap = TRUE,
    size = 3
  )
}
# Dodged count bars of the models sold under a single make.
modelsForMake <- function(makeName, plotTitle) {
  ggplot(
    filter(completeRows, make %in% makeName),
    aes(x = make, fill = model)
  ) +
    geom_bar(position = "dodge") +
    countLabels() +
    scale_fill_hue() +
    labs(title = plotTitle) +
    theme_minimal() +
    # must follow theme_minimal(), which would otherwise reset the legend position
    theme(legend.position = "bottom")
}
modelsForMake("maruti", "Models in Maruti")
modelsForMake("hyundai", "Models in Hyundai")
modelsForMake("honda", "Models in Honda")
# Count of each car name, restricted to the three makes plotted above
completeRows %>%
  filter(make %in% c("maruti", "hyundai", "honda")) %>%
  ggplot(aes(x = car_name)) +
  geom_bar(fill = "#0c4c8a") +
  countLabels() +
  coord_flip() +
  labs(
    title = "Cars by Name",
    subtitle = "Among top 3 makes - Maruti, Hyundai, Honda"
  ) +
  theme_minimal()
# Rows used by both charts in this section: records with a blank ("") value in
# any of the seven categorical columns below are left out.
completeRows <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )
# Histogram of manufacturing year, with a count label per distinct year
ggplot(completeRows, aes(x = yr_mfr)) +
  geom_histogram(bins = 50L, fill = "#0c4c8a") +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = 0,
    check_overlap = TRUE,
    size = 3
  ) +
  labs(title = "Cars by Year of Manufacturing") +
  theme_minimal()
# Box plot of manufacturing year, one box per make
ggplot(completeRows, aes(x = "", y = yr_mfr, fill = make)) +
  geom_boxplot() +
  scale_fill_hue() +
  labs(title = "Cars by Make and Year of Manufacturing") +
  theme_minimal()
# Rows used by every chart in this section: records with a blank ("") value in
# any of the seven categorical columns below are left out.
completeRows <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )
# 50-bin histogram of kms_run; the subtitle is omitted unless one is supplied.
kmsHistogram <- function(data, subtitleText = waiver()) {
  ggplot(data, aes(x = kms_run)) +
    geom_histogram(bins = 50L, fill = "#0c4c8a") +
    labs(title = "Cars by Kilometers Run", subtitle = subtitleText) +
    theme_minimal()
}
# Full range
kmsHistogram(completeRows)
# Lower range: 0 to 250000 km, both ends included
completeRows %>%
  filter(kms_run >= 0L & kms_run <= 250000L) %>%
  kmsHistogram("0 to 2,50,000 KM")
# Upper range: strictly above 250000 km, matching the "> 2,50,000 KM" subtitle.
# The lower bound is exclusive so a car at exactly 250000 km is drawn only in
# the previous histogram instead of in both.
completeRows %>%
  filter(kms_run > 250000L & kms_run <= 1000000L) %>%
  kmsHistogram("> 2,50,000 KM")
# Box plot of kilometres run, one box per make
ggplot(completeRows, aes(x = "", y = kms_run, fill = make)) +
  geom_boxplot() +
  scale_fill_hue() +
  labs(title = "Kilometers Run by Make") +
  theme_minimal()
# Rows used by every chart in this section: records with a blank ("") value in
# any of the seven categorical columns below are left out.
completeRows <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )
# 50-bin histogram of sale_price; the subtitle is omitted unless one is supplied.
salePriceHistogram <- function(data, subtitleText = waiver()) {
  ggplot(data, aes(x = sale_price)) +
    geom_histogram(bins = 50L, fill = "#0c4c8a") +
    labs(title = "Sale Price Range", subtitle = subtitleText) +
    theme_minimal()
}
# Box plot of sale_price, one box per make.
salePriceBoxes <- function(data, plotTitle) {
  ggplot(data, aes(x = "", y = sale_price, fill = make)) +
    geom_boxplot() +
    scale_fill_hue() +
    labs(title = plotTitle) +
    theme_minimal()
}
# Full range
salePriceHistogram(completeRows)
# The three range histograms below use non-overlapping intervals: each upper
# range excludes its lower bound, so a car priced exactly at 500000 or 1000000
# is drawn in one histogram only (previously it appeared in two).
completeRows %>%
  filter(sale_price >= 0L & sale_price <= 500000L) %>%
  salePriceHistogram("< 5,00,000/-")
completeRows %>%
  filter(sale_price > 500000L & sale_price <= 1000000L) %>%
  salePriceHistogram("> 5,00,000/- and < 10,00,000/-")
completeRows %>%
  filter(sale_price > 1000000L & sale_price <= 3600000L) %>%
  salePriceHistogram("> 10,00,000/-")
# Sale price spread for every make
salePriceBoxes(completeRows, "Sale Price Range by Make")
# Sale price spread for Maruti, Hyundai and Honda only
completeRows %>%
  filter(make %in% c("maruti", "hyundai", "honda")) %>%
  salePriceBoxes("Sale Price Range among top 3 makes")
# Rows used by both charts in this section: records with a blank ("") value in
# any of the seven categorical columns below are left out.
completeRows <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )
# Scatter of sale price against manufacturing year, one panel per body type.
priceByYearScatter <- function(data, plotTitle, plotSubtitle) {
  ggplot(data, aes(x = yr_mfr, y = sale_price)) +
    geom_point(size = 1L, colour = "#0c4c8a") +
    facet_wrap(vars(body_type)) +
    labs(title = plotTitle, subtitle = plotSubtitle) +
    theme_minimal()
}
# All makes and all non-blank body types
priceByYearScatter(
  completeRows,
  "Sale Price of All Cars by Year of Manufacturing",
  "Grouped by Body Type"
)
# Maruti, Hyundai and Honda only, limited to hatchback, sedan and suv
completeRows %>%
  filter(
    body_type %in% c("hatchback", "sedan", "suv"),
    make %in% c("maruti", "hyundai", "honda")
  ) %>%
  priceByYearScatter(
    "Sale Price of Maruti, Hyundai and Honda by Year of Manufacturing",
    "Grouped by Top 3 Body Types"
  )
# Rows used by every chart in this section: records with a blank ("") value in
# any of the seven categorical columns below are left out.
completeRows <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )
# 50-bin histogram of times_viewed; the subtitle is omitted unless one is supplied.
viewsHistogram <- function(data, subtitleText = waiver()) {
  ggplot(data, aes(x = times_viewed)) +
    geom_histogram(bins = 50L, fill = "#0c4c8a") +
    labs(title = "Range of Views", subtitle = subtitleText) +
    theme_minimal()
}
# Box plot of times_viewed, one box per make.
viewsBoxes <- function(data, plotTitle) {
  ggplot(data, aes(x = "", y = times_viewed, fill = make)) +
    geom_boxplot() +
    scale_fill_hue() +
    labs(title = plotTitle) +
    theme_minimal()
}
# Full range
viewsHistogram(completeRows)
# The three range histograms below use non-overlapping intervals: each upper
# range excludes its lower bound, so a listing with exactly 5000 or 10000 views
# is drawn in one histogram only (previously it appeared in two).
completeRows %>%
  filter(times_viewed >= 0L & times_viewed <= 5000L) %>%
  viewsHistogram("< 5000")
completeRows %>%
  filter(times_viewed > 5000L & times_viewed <= 10000L) %>%
  viewsHistogram("> 5000 and < 10000")
completeRows %>%
  filter(times_viewed > 10000L & times_viewed <= 46500L) %>%
  viewsHistogram("> 10000")
# View counts for every make
viewsBoxes(completeRows, "Range of Views by Make")
# View counts for Maruti, Hyundai and Honda only
completeRows %>%
  filter(make %in% c("maruti", "hyundai", "honda")) %>%
  viewsBoxes("Range of Views among top 3 Makes")
# Rows used by every chart in this section: records with a blank ("") value in
# any of the seven categorical columns below are left out.
completeRows <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )
# Horizontal dodged count bars, coloured by the fill column in `mapping`.
dodgedCountBars <- function(mapping, plotTitle) {
  ggplot(completeRows, mapping) +
    geom_bar(position = "dodge") +
    scale_fill_hue() +
    coord_flip() +
    labs(title = plotTitle) +
    theme_minimal()
}
# Number of cars per city
ggplot(completeRows, aes(x = city)) +
  geom_bar(position = "dodge", fill = "#0c4c8a") +
  coord_flip() +
  labs(title = "Cars by City") +
  theme_minimal()
# Cars per city, broken down by make, body type and fuel type in turn
dodgedCountBars(aes(x = city, fill = make), "Cars by City and Make")
dodgedCountBars(aes(x = city, fill = body_type), "Cars by City and Body Type")
dodgedCountBars(aes(x = city, fill = fuel_type), "Cars by City and Fuel Type")
# Top 5 cities - Mumbai, Bengaluru, New Delhi, Pune and Chennai.
# Approximately 50% of the cars are in the top 3 cities.
# City / Make
# City / Body Type
# City / Fuel Type
# Rows used by every chart in this section: records with a blank ("") value in
# any of the seven categorical columns below are left out.
completeRows <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )
# Horizontal dodged count bars, coloured by the fill column in `mapping`.
dodgedCountBars <- function(mapping, plotTitle) {
  ggplot(completeRows, mapping) +
    geom_bar(position = "dodge") +
    scale_fill_hue() +
    coord_flip() +
    labs(title = plotTitle) +
    theme_minimal()
}
# 30-bin dodged histogram, coloured by the fill column in `mapping`.
dodgedHistogram <- function(mapping, plotTitle) {
  ggplot(completeRows, mapping) +
    geom_histogram(bins = 30L, position = "dodge") +
    scale_fill_hue() +
    labs(title = plotTitle) +
    theme_minimal()
}
# Distribution of the number of owners
ggplot(completeRows, aes(x = total_owners)) +
  geom_histogram(bins = 30L, position = "dodge", fill = "#0c4c8a") +
  labs(title = "Cars by Total Owners") +
  theme_minimal()
# is_hot broken down by city, make and body type
dodgedCountBars(aes(x = city, fill = is_hot), "Cars by City and Hotness")
dodgedCountBars(aes(x = make, fill = is_hot), "Cars by Make and Hotness")
dodgedCountBars(aes(x = body_type, fill = is_hot), "Body Type and Hotness")
# is_hot against manufacturing year and number of owners
dodgedHistogram(
  aes(x = yr_mfr, fill = is_hot),
  "Cars by Year of Manufacturing and Hotness"
)
dodgedHistogram(
  aes(x = total_owners, fill = is_hot),
  "Cars by Total Owners and Hotness"
)
# Rows used by every chart in this section: records with a blank ("") value in
# any of the seven categorical columns below are left out.
completeRows <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )
# Horizontal dodged count bars, coloured by the fill column in `mapping`.
dodgedCountBars <- function(mapping, plotTitle) {
  ggplot(completeRows, mapping) +
    geom_bar(position = "dodge") +
    scale_fill_hue() +
    coord_flip() +
    labs(title = plotTitle) +
    theme_minimal()
}
# assured_buy broken down by city, make and body type
dodgedCountBars(
  aes(x = city, fill = assured_buy),
  "Cars by City and Assured Buy"
)
dodgedCountBars(
  aes(x = make, fill = assured_buy),
  "Cars by Make and Assured Buy"
)
dodgedCountBars(
  aes(x = body_type, fill = assured_buy),
  "Cars by Body Type and Assured Buy"
)
# warranty_avail broken down by city, make and body type
dodgedCountBars(
  aes(x = city, fill = warranty_avail),
  "Cars by City and Warranty Availability"
)
dodgedCountBars(
  aes(x = make, fill = warranty_avail),
  "Cars by Make and Warranty Availability"
)
dodgedCountBars(
  aes(x = body_type, fill = warranty_avail),
  "Cars by Body Type and Warranty Availability"
)
# Rows used by every chart in this section: records with a blank ("") value in
# any of the seven categorical columns below are left out.
completeRows <- trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  )
# Horizontal dodged count bars, coloured by the fill column in `mapping`.
dodgedCountBars <- function(mapping, plotTitle) {
  ggplot(completeRows, mapping) +
    geom_bar(position = "dodge") +
    scale_fill_hue() +
    coord_flip() +
    labs(title = plotTitle) +
    theme_minimal()
}
# Number of cars per rating
ggplot(completeRows, aes(x = car_rating)) +
  geom_bar(fill = "#0c4c8a") +
  coord_flip() +
  labs(title = "Cars by Rating") +
  theme_minimal()
# Rating broken down by city, make and body type
dodgedCountBars(aes(x = city, fill = car_rating), "Cars by City and Rating")
dodgedCountBars(aes(x = make, fill = car_rating), "Cars by Make and Rating")
dodgedCountBars(
  aes(x = body_type, fill = car_rating),
  "Cars by Body Type and Rating"
)
# Bar chart: number of cars per registered state, in a single fill colour.
# Rows holding an empty string in any of the seven screened columns are
# dropped; `%in%` is used so NA values are not treated as empty strings.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = registered_state)) +
  geom_bar(fill = "#0c4c8a") +
  labs(title = "Cars by Registered State") +
  # Flip the axes so the state names are listed along the vertical axis.
  coord_flip() +
  theme_minimal()
# Dodged bar chart: number of cars per registered state, one bar per make.
# Rows holding an empty string in any of the seven screened columns are
# dropped; `%in%` is used so NA values are not treated as empty strings.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = registered_state, fill = make)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by Registered State and Make") +
  # Flip the axes so the state names are listed along the vertical axis.
  coord_flip() +
  theme_minimal()
# Dodged bar chart: number of cars per registered state, one bar per body
# type.
# Rows holding an empty string in any of the seven screened columns are
# dropped; `%in%` is used so NA values are not treated as empty strings.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = registered_state, fill = body_type)) +
  geom_bar(position = "dodge") +
  scale_fill_hue() +
  labs(title = "Cars by Registered State and Body Type") +
  # Flip the axes so the state names are listed along the vertical axis.
  coord_flip() +
  theme_minimal()
# Scatter plot of manufacturing year (y) against times viewed (x), with one
# panel per value of `is_hot`.
# Rows holding an empty string in any of the seven screened columns are
# dropped; `%in%` is used so NA values are not treated as empty strings.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = times_viewed, y = yr_mfr)) +
  geom_point(size = 1L, colour = "#0c4c8a") +
  labs(
    title = "Year of Manufacturing vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  facet_wrap(vars(is_hot))
# Scatter plot of sale price (y) against times viewed (x), with one panel
# per value of `is_hot`.
# Rows holding an empty string in any of the seven screened columns are
# dropped; `%in%` is used so NA values are not treated as empty strings.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = times_viewed, y = sale_price)) +
  geom_point(size = 1L, colour = "#0c4c8a") +
  labs(
    title = "Sale Price vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  facet_wrap(vars(is_hot))
# Scatter plot of kilometres run (y) against times viewed (x), with one
# panel per value of `is_hot`.
# Rows holding an empty string in any of the seven screened columns are
# dropped; `%in%` is used so NA values are not treated as empty strings.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = times_viewed, y = kms_run)) +
  geom_point(size = 1L, colour = "#0c4c8a") +
  labs(
    title = "KMs Run vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  facet_wrap(vars(is_hot))
# Scatter plot of total owners (y) against times viewed (x), with one panel
# per value of `is_hot`.
# Rows holding an empty string in any of the seven screened columns are
# dropped; `%in%` is used so NA values are not treated as empty strings.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = times_viewed, y = total_owners)) +
  geom_point(size = 1L, colour = "#0c4c8a") +
  labs(
    title = "Owners vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  facet_wrap(vars(is_hot))
# Box plot of times viewed for each body type, with one panel per value of
# `is_hot`.
# Rows holding an empty string in any of the seven screened columns are
# dropped; `%in%` is used so NA values are not treated as empty strings.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = body_type, y = times_viewed)) +
  geom_boxplot(fill = "#0c4c8a") +
  labs(
    title = "Body Type vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  facet_wrap(vars(is_hot))
# Box plot of times viewed for each city, with one panel per value of
# `is_hot`.
# Rows holding an empty string in any of the seven screened columns are
# dropped; `%in%` is used so NA values are not treated as empty strings.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = city, y = times_viewed)) +
  geom_boxplot(fill = "#0c4c8a") +
  labs(
    title = "City vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  # Flip the axes so the city names are listed along the vertical axis.
  coord_flip() +
  facet_wrap(vars(is_hot))
# Box plot of times viewed for each registered state, with one panel per
# value of `is_hot`.
# Rows holding an empty string in any of the seven screened columns are
# dropped; `%in%` is used so NA values are not treated as empty strings.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = registered_state, y = times_viewed)) +
  geom_boxplot(fill = "#0c4c8a") +
  labs(
    title = "Registered State vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  # Flip the axes so the state names are listed along the vertical axis.
  coord_flip() +
  facet_wrap(vars(is_hot))
# Box plot of times viewed for each make, with one panel per value of
# `is_hot`.
# Rows holding an empty string in any of the seven screened columns are
# dropped; `%in%` is used so NA values are not treated as empty strings.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = make, y = times_viewed)) +
  geom_boxplot(fill = "#0c4c8a") +
  labs(
    title = "Make vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  # Flip the axes so the make names are listed along the vertical axis.
  coord_flip() +
  facet_wrap(vars(is_hot))
# Box plot of times viewed for each car rating, with one panel per value of
# `is_hot`.
# Rows holding an empty string in any of the seven screened columns are
# dropped; `%in%` is used so NA values are not treated as empty strings.
trainData %>%
  filter(
    !(body_type %in% ""),
    !(transmission %in% ""),
    !(registered_state %in% ""),
    !(source %in% ""),
    !(car_availability %in% ""),
    !(car_rating %in% ""),
    !(fitness_certificate %in% "")
  ) %>%
  ggplot(mapping = aes(x = car_rating, y = times_viewed)) +
  geom_boxplot(fill = "#0c4c8a") +
  labs(
    title = "Rating vs Times Viewed",
    subtitle = "Grouped by Hotness"
  ) +
  theme_minimal() +
  facet_wrap(vars(is_hot))