library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
df <- read.delim("./Cars24.csv", sep = ",")
head(df, n = 5)
## X Car.Brand Model Price Model.Year Location Fuel Driven..Kms.
## 1 0 Hyundai EonERA PLUS 330399 2016 Hyderabad Petrol 10674
## 2 1 Maruti Wagon R 1.0LXI 350199 2011 Hyderabad Petrol 20979
## 3 2 Maruti Alto K10LXI 229199 2011 Hyderabad Petrol 47330
## 4 3 Maruti RitzVXI BS IV 306399 2011 Hyderabad Petrol 19662
## 5 4 Tata NanoTWIST XTA 208699 2015 Hyderabad Petrol 11256
## Gear Ownership EMI..monthly.
## 1 Manual 2 7350
## 2 Manual 1 7790
## 3 Manual 2 5098
## 4 Manual 1 6816
## 5 Automatic 1 4642
In the below code I am grouping based on Car brand and getting the number of occurances for each brand and creating a new column where I populate the probability of picking each brand.
brand_probability <- df |>
group_by(Car.Brand) |>
summarise(Brand_count = n()) |>
mutate(probability_of_picking_brand = (Brand_count / nrow(df)) * 100 ) |>
arrange((Brand_count))
head(brand_probability, n = 5)
## # A tibble: 5 × 3
## Car.Brand Brand_count probability_of_picking_brand
## <chr> <int> <dbl>
## 1 ISUZU 1 0.0169
## 2 TOYOTA 1 0.0169
## 3 Volvo 1 0.0169
## 4 Mitsubishi 2 0.0338
## 5 Jaguar 3 0.0507
Using the cut function I have created three tags namely “Less than 1 percent”, “Between 1 and 10 percent” and “More than 10 percent”.
brand_probability$Tag <- cut(brand_probability$probability_of_picking_brand,
breaks=c(0, 1, 10, 100),
labels=c('Less than 1 percent', 'Between 1 and 10 percent', 'More than 10 percent'))
brand_probability |>
ggplot() +
geom_bar(mapping = aes(Tag)) +
labs(
x = "Probability of picking a brand",
y = "Number of Brands"
) +
scale_fill_brewer(palette = 'Dark2') +
theme_minimal() +
ggtitle("Number of brands that falls under which category")
Here I am getting only the brands where the probability of picking it would by less than 1 percent.
brand_probability |>
filter(probability_of_picking_brand < 1)
## # A tibble: 18 × 4
## Car.Brand Brand_count probability_of_picking_brand Tag
## <chr> <int> <dbl> <fct>
## 1 ISUZU 1 0.0169 Less than 1 percent
## 2 TOYOTA 1 0.0169 Less than 1 percent
## 3 Volvo 1 0.0169 Less than 1 percent
## 4 Mitsubishi 2 0.0338 Less than 1 percent
## 5 Jaguar 3 0.0507 Less than 1 percent
## 6 Ssangyong 3 0.0507 Less than 1 percent
## 7 Landrover 5 0.0845 Less than 1 percent
## 8 Fiat 12 0.203 Less than 1 percent
## 9 Jeep 15 0.253 Less than 1 percent
## 10 KIA 21 0.355 Less than 1 percent
## 11 MG 21 0.355 Less than 1 percent
## 12 Datsun 24 0.406 Less than 1 percent
## 13 Audi 25 0.422 Less than 1 percent
## 14 BMW 26 0.439 Less than 1 percent
## 15 Chevrolet 32 0.541 Less than 1 percent
## 16 Nissan 33 0.558 Less than 1 percent
## 17 Mercedes 34 0.575 Less than 1 percent
## 18 Skoda 51 0.862 Less than 1 percent
From the above analysis it is found that, there are 18 brands of cars which people in India doesn’t buy. And those brands are ISUZU, TOYOTA, Volvo, Mitsubishi, Jaguar, Ssangyong, Landrover, Fiat, Jeep, KIA, MG, Datsun, Audi, BMW, Chevrolet, Nissan, Mercedes, Skoda.
Indians usually prefer to buy native brand cars rather than other country brand.
df |>
group_by(Fuel) |>
summarise(Fuel_preferred = n()) |>
mutate(probability_of_picking_fueltype = (Fuel_preferred / nrow(df)) * 100 ) |>
arrange((Fuel_preferred))
## # A tibble: 5 × 3
## Fuel Fuel_preferred probability_of_picking_fueltype
## <chr> <int> <dbl>
## 1 Electric 2 0.0338
## 2 Petrol + LPG 18 0.304
## 3 Petrol + CNG 147 2.48
## 4 Diesel 1964 33.2
## 5 Petrol 3787 64.0
df |>
ggplot() +
geom_bar(mapping = aes(Fuel)) +
labs(
x = "Fuel type",
y = "Number of Cars"
) +
scale_fill_brewer(palette = 'Dark2') +
theme_minimal() +
ggtitle("Breakdown of cars")
From the above graph it is evident that a lot of Indians prefer their car to have petrol as fuel type rather than others. And there are really few people who are using electric cars.
Indian car buyers prefer petrol cars over other fuel types, and electric cars are significantly less popular than petrol cars.
Want to know the average age of each brand. To do this we need to calculate the age by finding the difference between Model.Year(year of manufacturing) and 2024.
df |>
mutate(age = 2024 - Model.Year) |> #this creates a new column age and populates the value
group_by(Car.Brand) |>
summarise(Avg_age = mean(age),
Avg_price = mean(Price)) |>
arrange(desc(Avg_age)) |>
slice_head(n = 5)
## # A tibble: 5 × 3
## Car.Brand Avg_age Avg_price
## <chr> <dbl> <dbl>
## 1 Chevrolet 12.2 210459.
## 2 TOYOTA 12 6500000
## 3 Volvo 12 940499
## 4 BMW 11.5 1163238.
## 5 Mercedes 11.2 1328252.
#Adding a column named 'age' and populating the age of each car.
df$Age = 2024 - df$Model.Year
df |>
slice_head(n = 5)
## X Car.Brand Model Price Model.Year Location Fuel Driven..Kms.
## 1 0 Hyundai EonERA PLUS 330399 2016 Hyderabad Petrol 10674
## 2 1 Maruti Wagon R 1.0LXI 350199 2011 Hyderabad Petrol 20979
## 3 2 Maruti Alto K10LXI 229199 2011 Hyderabad Petrol 47330
## 4 3 Maruti RitzVXI BS IV 306399 2011 Hyderabad Petrol 19662
## 5 4 Tata NanoTWIST XTA 208699 2015 Hyderabad Petrol 11256
## Gear Ownership EMI..monthly. Age
## 1 Manual 2 7350 8
## 2 Manual 1 7790 13
## 3 Manual 2 5098 13
## 4 Manual 1 6816 13
## 5 Automatic 1 4642 9
Segregating the age as 0 to 5, 6 to 10, 11 to 15 and 16 to 20 using the ‘cut’ function.
df$Age_category <- cut(df$Age,
breaks=c(0, 5, 10, 15, 20),
labels=c('Less than 5 years', 'Between 6 and 10 years', 'Between 11 and 15 years', 'More than 16 years'))
head(df, n = 5)
## X Car.Brand Model Price Model.Year Location Fuel Driven..Kms.
## 1 0 Hyundai EonERA PLUS 330399 2016 Hyderabad Petrol 10674
## 2 1 Maruti Wagon R 1.0LXI 350199 2011 Hyderabad Petrol 20979
## 3 2 Maruti Alto K10LXI 229199 2011 Hyderabad Petrol 47330
## 4 3 Maruti RitzVXI BS IV 306399 2011 Hyderabad Petrol 19662
## 5 4 Tata NanoTWIST XTA 208699 2015 Hyderabad Petrol 11256
## Gear Ownership EMI..monthly. Age Age_category
## 1 Manual 2 7350 8 Between 6 and 10 years
## 2 Manual 1 7790 13 Between 11 and 15 years
## 3 Manual 2 5098 13 Between 11 and 15 years
## 4 Manual 1 6816 13 Between 11 and 15 years
## 5 Automatic 1 4642 9 Between 6 and 10 years
df |>
ggplot() +
geom_boxplot(mapping = aes(x = Age_category, y = Price)) +
labs(
title = "Age vs Price",
x = "Age Category",
y = "Price (INR)"
) +
scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
scale_fill_brewer(palette = 'Dark2') +
theme_minimal()
From the above graph it is clear that the average price of a car decreases as it ages. In other words, the average price of the car and its age is indirectly proportional.
The average price of a car decreases as the age of the car increases, indicating an inverse relationship between car price and age.
Choosing the Fuel and Gear as the two categorical variables
unique_combinations <- unique(df[, c("Fuel", "Gear")])
unique_combinations
## Fuel Gear
## 1 Petrol Manual
## 5 Petrol Automatic
## 48 Petrol
## 53 Petrol + CNG Manual
## 83 Diesel Manual
## 210 Petrol + LPG Manual
## 251 Diesel Automatic
## 400 Diesel
## 651 Petrol + CNG Automatic
## 3146 Petrol + CNG
## 4921 Electric Manual
## 5377 Petrol + LPG
combination_count <- df |>
group_by(Fuel, Gear) |>
count() |>
arrange(desc(n))
combination_count
## # A tibble: 12 × 3
## # Groups: Fuel, Gear [12]
## Fuel Gear n
## <chr> <chr> <int>
## 1 Petrol "Manual" 3229
## 2 Diesel "Manual" 1686
## 3 Petrol "Automatic" 382
## 4 Diesel "Automatic" 191
## 5 Petrol "" 176
## 6 Petrol + CNG "Manual" 141
## 7 Diesel "" 87
## 8 Petrol + LPG "Manual" 17
## 9 Petrol + CNG "Automatic" 5
## 10 Electric "Manual" 2
## 11 Petrol + CNG "" 1
## 12 Petrol + LPG "" 1
From the above table it is clear that Petrol + Manual is popular combo and Electric + Manual is the least popular.
df |>
filter(Gear == "Automatic" | Gear == "Manual") |>
ggplot() +
geom_bar(mapping = aes(Gear, fill = Fuel)) +
labs(
x = "Gear type",
y = "Number of Cars"
) +
scale_fill_brewer(palette = 'Dark2') +
theme_minimal() +
ggtitle("Breakdown of cars based on Gears and Fuel type")