#Load Necessary Libraries

library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.3

## Warning: package 'ggplot2' was built under R version 4.5.3

## Warning: package 'lubridate' was built under R version 4.5.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ purrr     1.2.1
## ✔ ggplot2   4.0.2     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(GGally)

## Warning: package 'GGally' was built under R version 4.5.3

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.5.3

## corrplot 0.95 loaded

library(ggpubr)

## Warning: package 'ggpubr' was built under R version 4.5.3

library(cluster)
library(janitor)

## Warning: package 'janitor' was built under R version 4.5.3

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

bikes<-read.csv("C:/Users/offic/OneDrive/Desktop/bike_sales_india.csv")
View(bikes)

Level 1: Understanding the Data

Question 1.1: What is the structure of the dataset?

str(bikes)

## 'data.frame':    10000 obs. of  15 variables:
##  $ State                  : chr  "Karnataka" "Rajasthan" "Madhya Pradesh" "Karnataka" ...
##  $ Avg.Daily.Distance..km.: num  68.8 23.8 27.7 62.9 10.6 ...
##  $ Brand                  : chr  "Royal Enfield" "Bajaj" "KTM" "Kawasaki" ...
##  $ Model                  : chr  "Hunter 350" "Dominar 400" "125 Duke" "Ninja 300" ...
##  $ Price..INR.            : int  252816 131100 201016 132482 231796 60839 349740 193781 145538 227055 ...
##  $ Year.of.Manufacture    : int  2021 2020 2020 2021 2019 2019 2022 2019 2022 2024 ...
##  $ Engine.Capacity..cc.   : int  672 769 216 556 298 386 970 375 330 626 ...
##  $ Fuel.Type              : chr  "Electric" "Hybrid" "Hybrid" "Petrol" ...
##  $ Mileage..km.l.         : num  78.4 90 71.5 51.7 54.7 ...
##  $ Owner.Type             : chr  "Second" "Third" "Second" "Third" ...
##  $ Registration.Year      : int  2024 2023 2023 2021 2019 2019 2022 2022 2023 2024 ...
##  $ Insurance.Status       : chr  "Active" "Active" "Active" "Active" ...
##  $ Seller.Type            : chr  "Individual" "Individual" "Dealer" "Dealer" ...
##  $ Resale.Price..INR.     : num  149934 66960 141523 56057 132538 ...
##  $ City.Tier              : chr  "Tier 3" "Tier 3" "Tier 3" "Tier 1" ...

Question 1.2: Are there any missing values?

colSums(is.na(bikes))

##                   State Avg.Daily.Distance..km.                   Brand 
##                       0                       0                       0 
##                   Model             Price..INR.     Year.of.Manufacture 
##                       0                       0                       0 
##    Engine.Capacity..cc.               Fuel.Type          Mileage..km.l. 
##                       0                       0                       0 
##              Owner.Type       Registration.Year        Insurance.Status 
##                       0                       0                       0 
##             Seller.Type      Resale.Price..INR.               City.Tier 
##                       0                       0                       0

Question 1.3: Cleaning Column names

bikes<-clean_names(bikes)

Question 1.4: What is the average price of bikes in each state in descending order ?

state_price <- bikes %>%
  group_by(state) %>%
  summarise(avg_price = mean(price_inr, na.rm = TRUE)) %>%
  arrange(desc(avg_price))
state_price

## # A tibble: 10 × 2
##    state          avg_price
##    <chr>              <dbl>
##  1 Delhi            228110.
##  2 Uttar Pradesh    227511.
##  3 Tamil Nadu       226715.
##  4 Punjab           226566.
##  5 Gujarat          226441.
##  6 Karnataka        225220.
##  7 West Bengal      221925.
##  8 Rajasthan        220820.
##  9 Maharashtra      220032.
## 10 Madhya Pradesh   219937.

Level 2: Data Extraction and Filtering

Question 2.1: Which 5 brands have the highest average resale price ?

top_5_brands <- bikes %>%
  group_by(brand) %>%
  summarise(avg_resale_price = mean(resale_price_inr, na.rm = TRUE)) %>%
  arrange(desc(avg_resale_price)) %>%
  slice(1:5)
print(top_5_brands)

## # A tibble: 5 × 2
##   brand    avg_resale_price
##   <chr>               <dbl>
## 1 Bajaj             136662.
## 2 Yamaha            135535.
## 3 Hero              134885.
## 4 Kawasaki          134373.
## 5 TVS               133497.

Question 2.2: What are the top 10 bikes (model-wise) with the highest mileage ?

top_10_mileage_models <- bikes %>%
  group_by(model) %>%
  summarise(avg_mileage = mean(mileage_km_l, na.rm = TRUE)) %>%
  slice_max(avg_mileage, n = 10)
top_10_mileage_models

## # A tibble: 10 × 2
##    model         avg_mileage
##    <chr>               <dbl>
##  1 NTorq 125            71.2
##  2 Unicorn              70.6
##  3 Versys 650           69.2
##  4 Splendor Plus        69.1
##  5 Hunter 350           69.0
##  6 Xtreme 160R          68.8
##  7 Pulsar 150           68.8
##  8 Vulcan S             68.7
##  9 Glamour              68.4
## 10 RC 390               68.4

Question 2.3: What are the top 10 brands with the largest average engine capacity ?

top_10_engine_brands <- bikes %>%
  group_by(brand) %>%
  summarise(avg_engine_capacity = mean(engine_capacity_cc, na.rm = TRUE)) %>%
  arrange(desc(avg_engine_capacity)) %>%
  slice(1:10)
top_10_engine_brands

## # A tibble: 8 × 2
##   brand         avg_engine_capacity
##   <chr>                       <dbl>
## 1 Kawasaki                     559.
## 2 Yamaha                       558.
## 3 Royal Enfield                555.
## 4 Hero                         554.
## 5 Honda                        550.
## 6 TVS                          549.
## 7 Bajaj                        548.
## 8 KTM                          548.

Level 3: Grouping and Summarization

Question 3.1: Which is the most common registration year by counting records per year ?

most_common_year <- bikes %>%
  count(registration_year) %>%
  arrange(desc(n)) %>%
  slice(1)
most_common_year

##   registration_year    n
## 1              2024 2296

Question 3.2: Which state has the most second-hand (Second owner) bikes ?

top_state_second_owner <- bikes %>%
  filter(owner_type == "Second") %>%
  group_by(state) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
top_state_second_owner[1, ]

## # A tibble: 1 × 2
##   state       count
##   <chr>       <int>
## 1 Maharashtra   368

Question 3.3: Which fuel type has the highest average mileage ?

fuel_mileage <- bikes %>%
  group_by(fuel_type) %>%
  summarise(avg_mileage = mean(mileage_km_l, na.rm = TRUE)) %>%
  arrange(desc(avg_mileage))
fuel_mileage[1, ]

## # A tibble: 1 × 2
##   fuel_type avg_mileage
##   <chr>           <dbl>
## 1 Electric         80.1

Level 4: Sorting and Ranking Data

Question 4.1: Rank bike models based on mileage (km/l) from highest to lowest.

model_mileage_rank <- bikes %>%
  group_by(model) %>%
  summarise(avg_mileage = mean(mileage_km_l, na.rm = TRUE)) %>%
  mutate(rank = dense_rank(desc(avg_mileage))) %>%
  arrange(rank)
model_mileage_rank

## # A tibble: 40 × 3
##    model         avg_mileage  rank
##    <chr>               <dbl> <int>
##  1 NTorq 125            71.2     1
##  2 Unicorn              70.6     2
##  3 Versys 650           69.2     3
##  4 Splendor Plus        69.1     4
##  5 Hunter 350           69.0     5
##  6 Xtreme 160R          68.8     6
##  7 Pulsar 150           68.8     7
##  8 Vulcan S             68.7     8
##  9 Glamour              68.4     9
## 10 RC 390               68.4    10
## # ℹ 30 more rows

Question 4.2: What are the top 5 states with the highest number of premium bikes (price > ₹2,00,000) ?

top_5_premium_states <- bikes %>%
  filter(price_inr > 200000) %>%
  group_by(state) %>%
  summarise(count = n()) %>%
  slice_max(count, n = 5)
top_5_premium_states

## # A tibble: 5 × 2
##   state         count
##   <chr>         <int>
## 1 Punjab          603
## 2 Gujarat         588
## 3 Tamil Nadu      588
## 4 Maharashtra     580
## 5 Uttar Pradesh   579

Question 4.3: Which are the top 5 brands with the lowest depreciation (best resale value) ?

top_5_low_depreciation <- bikes %>%
  mutate(depreciation = price_inr - resale_price_inr) %>%
  group_by(brand) %>%
  summarise(avg_depreciation = mean(depreciation, na.rm = TRUE)) %>%
  slice_min(avg_depreciation, n = 5)
top_5_low_depreciation

## # A tibble: 5 × 2
##   brand         avg_depreciation
##   <chr>                    <dbl>
## 1 Kawasaki                88790.
## 2 TVS                     88972.
## 3 Honda                   89979.
## 4 Royal Enfield           90295.
## 5 Yamaha                  90387.

Question 4.4: Create a depreciation percentage column where depreciation percentage=(price_inr - resale_price_inr) / price_inr) * 100

bikes <- bikes %>%
  mutate(depreciation_pct = ((price_inr - resale_price_inr) / price_inr) * 100)
head(bikes)

##            state avg_daily_distance_km         brand         model price_inr
## 1      Karnataka                 68.84 Royal Enfield    Hunter 350    252816
## 2      Rajasthan                 23.80         Bajaj   Dominar 400    131100
## 3 Madhya Pradesh                 27.67           KTM      125 Duke    201016
## 4      Karnataka                 62.85      Kawasaki     Ninja 300    132482
## 5 Madhya Pradesh                 10.62        Yamaha         FZ V3    231796
## 6    Maharashtra                 47.71          Hero Splendor Plus     60839
##   year_of_manufacture engine_capacity_cc fuel_type mileage_km_l owner_type
## 1                2021                672  Electric        78.41     Second
## 2                2020                769    Hybrid        89.98      Third
## 3                2020                216    Hybrid        71.46     Second
## 4                2021                556    Petrol        51.73      Third
## 5                2019                298    Petrol        54.72      Third
## 6                2019                386  Electric        92.08      First
##   registration_year insurance_status seller_type resale_price_inr city_tier
## 1              2024           Active  Individual        149934.18    Tier 3
## 2              2023           Active  Individual         66960.30    Tier 3
## 3              2023           Active      Dealer        141522.64    Tier 3
## 4              2021           Active      Dealer         56057.22    Tier 1
## 5              2019    Not Available  Individual        132538.36    Tier 3
## 6              2019    Not Available      Dealer         47466.45    Tier 3
##   depreciation_pct
## 1         40.69435
## 2         48.92426
## 3         29.59633
## 4         57.68692
## 5         42.82112
## 6         21.98023

Question 4.5: Create a categorical column bike_category:

Budget (< ₹1,00,000)

Mid-range (₹1,00,000 – ₹2,50,000)

Premium (> ₹2,50,000)

bikes <- bikes %>%
  mutate(bike_category = case_when(
    price_inr < 100000 ~ "Budget",
    price_inr >= 100000 & price_inr <= 250000 ~ "Mid-range",
    price_inr > 250000 ~ "Premium"
  ))
head(bikes)

##            state avg_daily_distance_km         brand         model price_inr
## 1      Karnataka                 68.84 Royal Enfield    Hunter 350    252816
## 2      Rajasthan                 23.80         Bajaj   Dominar 400    131100
## 3 Madhya Pradesh                 27.67           KTM      125 Duke    201016
## 4      Karnataka                 62.85      Kawasaki     Ninja 300    132482
## 5 Madhya Pradesh                 10.62        Yamaha         FZ V3    231796
## 6    Maharashtra                 47.71          Hero Splendor Plus     60839
##   year_of_manufacture engine_capacity_cc fuel_type mileage_km_l owner_type
## 1                2021                672  Electric        78.41     Second
## 2                2020                769    Hybrid        89.98      Third
## 3                2020                216    Hybrid        71.46     Second
## 4                2021                556    Petrol        51.73      Third
## 5                2019                298    Petrol        54.72      Third
## 6                2019                386  Electric        92.08      First
##   registration_year insurance_status seller_type resale_price_inr city_tier
## 1              2024           Active  Individual        149934.18    Tier 3
## 2              2023           Active  Individual         66960.30    Tier 3
## 3              2023           Active      Dealer        141522.64    Tier 3
## 4              2021           Active      Dealer         56057.22    Tier 1
## 5              2019    Not Available  Individual        132538.36    Tier 3
## 6              2019    Not Available      Dealer         47466.45    Tier 3
##   depreciation_pct bike_category
## 1         40.69435       Premium
## 2         48.92426     Mid-range
## 3         29.59633     Mid-range
## 4         57.68692     Mid-range
## 5         42.82112     Mid-range
## 6         21.98023        Budget

Level 5: Data Visualization

Question 5.1: Histogram to visualize the distribution of bike prices.

ggplot(bikes, aes(x = price_inr)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  ggtitle("Distribution of Bike Prices") +
  xlab("Price (INR)") +
  ylab("Count")

print(NULL)

## NULL

Question 5.2: Bar Chart showing state-wise analysis of bike prices

state_price <- bikes %>%
  group_by(state) %>%
  summarise(avg_price = mean(price_inr, na.rm = TRUE)) %>%
  arrange(desc(avg_price))

ggplot(state_price, aes(x = reorder(state, avg_price), y = avg_price)) +
  geom_bar(stat = "identity", fill = "darkgreen") +
  coord_flip() +
  ggtitle("Average Price by State") +
  xlab("State") +
  ylab("Avg Price")

print(NULL)

## NULL

Question 5.3: Scatter plot to analyze the relationship between mileage and price

ggplot(bikes, aes(x = mileage_km_l, y = price_inr)) +
  geom_point(alpha = 0.5, color = "purple") +
  ggtitle("Mileage vs Price")

Statewise Bike Sales data in India

Anuj Kumar

2026-04-13

Level 1: Understanding the Data

Question 1.1: What is the structure of the dataset?

Question 1.2: Are there any missing values?

Question 1.3: Cleaning Column names

Question 1.4: What is the average price of bikes in each state in descending order ?

Level 2: Data Extraction and Filtering

Question 2.1: Which 5 brands have the highest average resale price ?

Question 2.2: What are the top 10 bikes (model-wise) with the highest mileage ?

Question 2.3: What are the top 10 brands with the largest average engine capacity ?

Level 3: Grouping and Summarization

Question 3.1: Which is the most common registration year by counting records per year ?

Question 3.2: Which state has the most second-hand (Second owner) bikes ?

Question 3.3: Which fuel type has the highest average mileage ?

Level 4: Sorting and Ranking Data

Question 4.1: Rank bike models based on mileage (km/l) from highest to lowest.

Question 4.2: What are the top 5 states with the highest number of premium bikes (price > ₹2,00,000) ?

Question 4.3: Which are the top 5 brands with the lowest depreciation (best resale value) ?

Question 4.4: Create a depreciation percentage column where depreciation percentage=(price_inr - resale_price_inr) / price_inr) * 100

Question 4.5: Create a categorical column bike_category:

Budget (< ₹1,00,000)

Mid-range (₹1,00,000 – ₹2,50,000)

Premium (> ₹2,50,000)

Level 5: Data Visualization

Question 5.1: Histogram to visualize the distribution of bike prices.

Question 5.2: Bar Chart showing state-wise analysis of bike prices

Question 5.3: Scatter plot to analyze the relationship between mileage and price