library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Load the Cars24 used-car listings from the comma-separated file.
df <- read.csv("Cars24.csv")
Cars24 is a leading AutoTech company focused on the sale, purchase, and financing of pre-owned cars. The company offers an online marketplace for buying and selling used cars, complemented by a suite of services including car financing, quality checks, warranties, and seamless documentation for transactions. Cars24 primarily serves the automotive industry with a customer base looking for pre-owned vehicle solutions.
I found this dataset on Kaggle. Click here to view the source. The dataset has the columns listed below, each with a short description.
Car Brand: The brand or make of the car (e.g., Toyota, Honda).
Model: The specific model of the car.
Price: The selling price of the used car.
Model year: The year the car was manufactured.
Location: The city or location where the car is bought.
Fuel: The type of fuel the car uses (e.g., petrol, diesel, CNG).
Driven: The total kilometers the car has been driven.
Gear: The transmission type (manual or automatic).
Ownership: The number of previous owners the car has had.
EMI: The monthly installment (EMI) required to buy the car.
The main idea behind using this dataset was to find a way to predict the selling price of a used car from its brand, model, age, number of previous owners, fuel type, kilometers driven, and transmission type. In this dataset every row already has a selling price, but when a new used car is added, someone has to assess the car and decide what its selling price should be. I am trying to automate that assessment using linear regression.
# Tally listings per brand and keep the five brands with the most listings.
# count(sort = TRUE) orders brands by descending Cars_count; slice_max()
# replaces the superseded top_n() and, like it, keeps ties at the cut-off.
top5_CarBrand <- df |>
  count(Car.Brand, name = "Cars_count", sort = TRUE) |>
  slice_max(Cars_count, n = 5)
# Bar chart of listing counts per transmission type, with bars filled by
# fuel type. Only rows whose Gear is "Automatic" or "Manual" are plotted.
df |>
  filter(Gear %in% c("Automatic", "Manual")) |>
  ggplot(aes(x = Gear, fill = Fuel)) +
  geom_bar() +
  scale_fill_brewer(palette = "Dark2") +
  labs(
    title = "Breakdown of cars based on Gears and Fuel type",
    x = "Gear type",
    y = "Number of Cars"
  ) +
  theme_minimal()
The plot suggests that manual transmission cars are much more prevalent than automatic cars in this dataset, with the majority of manual cars running on petrol and a significant portion on diesel. Automatic cars, while fewer, are also mostly petrol-driven. Alternative fuel types like CNG and LPG are rare in both transmission types, and electric cars appear to be either non-existent or very few in number in the dataset.
# Average price per model year for each brand in top5_CarBrand, drawn as
# one line per brand.
df |>
  filter(Car.Brand %in% top5_CarBrand$Car.Brand) |>
  group_by(Model.Year, Car.Brand) |>
  # Drop the grouping explicitly so summarize() does not emit its
  # "grouped output" message.
  summarize(Avg_price = mean(Price), .groups = "drop") |>
  ggplot() +
  # `linewidth` replaces the `size` aesthetic for lines, which was
  # deprecated in ggplot2 3.4.0.
  geom_line(
    mapping = aes(x = Model.Year, y = Avg_price, color = Car.Brand),
    linewidth = 1.1
  ) +
  labs(
    title = "Trend on Average price of top 5 brands over years",
    x = "Year",
    y = "Average price",
    color = "Car Brand"
  ) +
  # Abbreviate large prices on the axis (K, M, ...).
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_minimal()
## `summarise()` has grouped output by 'Model.Year'. You can override using the
## `.groups` argument.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
This plot reveals the increasing trend in car prices over time for the top 5 brands, with Toyota and Honda leading in terms of average price, while Maruti consistently remains the most affordable. Hyundai and Volkswagen show steady increases, though Volkswagen experiences some volatility. The significant rise in Honda’s prices after 2018 and the drop in Toyota’s prices after its peak suggest changes in market demand or model offerings during those years.
The upward trend across all brands suggests an overall increase in the cost of cars, possibly driven by factors such as inflation, new technologies, and evolving consumer preferences.
Null Hypothesis: There is no relationship between the kilometers driven by a car and its average price.
Alternative Hypothesis: There is a relationship between the kilometers driven by a car and its average price, such that the price decreases as the car’s km driven increases.
# Bucket each car's odometer reading into five labelled ranges.
# include.lowest = TRUE keeps a reading of exactly 0 km in the first bucket,
# and the open upper bound keeps readings above 920,000 km in "> 147K"
# instead of silently turning them into NA.
df$kmsDriven_category <- cut(
  df$Driven..Kms.,
  breaks = c(0, 36800, 73600, 110400, 147200, Inf),
  labels = c("< 36.8K", "36.8K-73.6K", "73.6K-110K", "110K-147K", "> 147K"),
  include.lowest = TRUE
)
# Preview the first rows to confirm the new column.
head(df, n = 5)
## X Car.Brand Model Price Model.Year Location Fuel Driven..Kms.
## 1 0 Hyundai EonERA PLUS 330399 2016 Hyderabad Petrol 10674
## 2 1 Maruti Wagon R 1.0LXI 350199 2011 Hyderabad Petrol 20979
## 3 2 Maruti Alto K10LXI 229199 2011 Hyderabad Petrol 47330
## 4 3 Maruti RitzVXI BS IV 306399 2011 Hyderabad Petrol 19662
## 5 4 Tata NanoTWIST XTA 208699 2015 Hyderabad Petrol 11256
## Gear Ownership EMI..monthly. kmsDriven_category
## 1 Manual 2 7350 < 36.8K
## 2 Manual 1 7790 < 36.8K
## 3 Manual 2 5098 36.8K-73.6K
## 4 Manual 1 6816 < 36.8K
## 5 Automatic 1 4642 < 36.8K
# Bar chart of listing counts per kilometers-driven bucket, with bars
# filled by fuel type. The stray second theme_minimal() call that followed
# the plot was removed: it was a separate statement, not part of the plot.
df |>
  ggplot() +
  geom_bar(mapping = aes(x = kmsDriven_category, fill = Fuel)) +
  labs(
    title = "Distribution of Cars by Kilometers Driven",
    x = "Kilometers driven",
    y = "Number of cars"
  ) +
  theme_minimal()
# Scatter plot of price (x) against kilometers driven (y) for the brands
# in top5_CarBrand, one colour per brand.
df |>
  filter(Car.Brand %in% top5_CarBrand$Car.Brand) |>
  ggplot(aes(x = Price, y = Driven..Kms., color = Car.Brand)) +
  geom_point() +
  # Abbreviate large values on both axes (K, M, ...).
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  scale_color_discrete(name = "Car Brand") +
  labs(
    title = "Scatter Plot of Car Price vs Kilometers Driven for top 5 brands",
    x = "Price (INR)",
    y = "Kms driven"
  ) +
  theme_minimal()
The plot below shows only the cars priced under 2M and driven less than 250K kilometers.
# Same price-vs-kilometers scatter for the brands in top5_CarBrand, limited
# to cars priced below 2,000,000 and driven less than 250,000 km.
df |>
  filter(
    Car.Brand %in% top5_CarBrand$Car.Brand &
      Price < 2000000 &
      Driven..Kms. < 250000
  ) |>
  ggplot(aes(x = Price, y = Driven..Kms., color = Car.Brand)) +
  geom_point() +
  labs(
    title = "Scatter Plot of Car Price vs Kilometers Driven for top 5 brands",
    subtitle = "(For cars with price < 2M and kms driven < 250K)",
    x = "Price (INR)",
    y = "Kms driven"
  ) +
  scale_color_discrete(name = "Car Brand") +
  # Abbreviate large values on both axes (K, M, ...).
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_minimal()
Null Hypothesis: There is no relationship between the age of a car and its average price, meaning the age of the car does not affect its price.
Alternative Hypothesis: There is a relationship between the age of a car and its average price, such that the price decreases as the car’s age increases.
# Add an 'Age' column: the current calendar year minus the model year.
# NOTE(review): because this uses the current date, ages (and therefore the
# categories below) shift every year the script is re-run.
df$Age <- year(now()) - df$Model.Year
# Bucket the age into four labelled ranges with cut().
# include.lowest = TRUE keeps an age of 0 in the first bucket, and the open
# upper bound keeps cars older than 20 years in "> 16 years" instead of
# turning them into NA.
df$Age_category <- cut(
  df$Age,
  breaks = c(0, 5, 10, 15, Inf),
  labels = c("< 5 years", "6-10 years", "11-15 years", "> 16 years"),
  include.lowest = TRUE
)
# Preview the first rows to confirm the new columns.
head(df, n = 5)
## X Car.Brand Model Price Model.Year Location Fuel Driven..Kms.
## 1 0 Hyundai EonERA PLUS 330399 2016 Hyderabad Petrol 10674
## 2 1 Maruti Wagon R 1.0LXI 350199 2011 Hyderabad Petrol 20979
## 3 2 Maruti Alto K10LXI 229199 2011 Hyderabad Petrol 47330
## 4 3 Maruti RitzVXI BS IV 306399 2011 Hyderabad Petrol 19662
## 5 4 Tata NanoTWIST XTA 208699 2015 Hyderabad Petrol 11256
## Gear Ownership EMI..monthly. kmsDriven_category Age Age_category
## 1 Manual 2 7350 < 36.8K 8 6-10 years
## 2 Manual 1 7790 < 36.8K 13 11-15 years
## 3 Manual 2 5098 36.8K-73.6K 13 11-15 years
## 4 Manual 1 6816 < 36.8K 13 11-15 years
## 5 Automatic 1 4642 < 36.8K 9 6-10 years
# Box plot of price for each age category.
# The scale_fill_brewer() call was removed: no `fill` aesthetic is mapped
# in this plot, so the fill scale had nothing to apply to.
df |>
  ggplot() +
  geom_boxplot(mapping = aes(x = Age_category, y = Price)) +
  labs(
    title = "Age vs Price",
    x = "Age Category",
    y = "Price (INR)"
  ) +
  # Abbreviate large prices on the axis (K, M, ...).
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_minimal()