#Load Necessary Libraries

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ purrr     1.2.1
## ✔ ggplot2   4.0.3     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.5.3
library(cluster)
library(janitor)
## Warning: package 'janitor' was built under R version 4.5.3
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(scales)
## Warning: package 'scales' was built under R version 4.5.3
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
bikes<-read.csv("C:/Users/offic/OneDrive/Desktop/bike_sales_india.csv")
View(bikes)

Level 1: Understanding the Data

Question 1.1: What is the structure of the dataset?

str(bikes)
## 'data.frame':    10000 obs. of  15 variables:
##  $ State                  : chr  "Karnataka" "Rajasthan" "Madhya Pradesh" "Karnataka" ...
##  $ Avg.Daily.Distance..km.: num  68.8 23.8 27.7 62.9 10.6 ...
##  $ Brand                  : chr  "Royal Enfield" "Bajaj" "KTM" "Kawasaki" ...
##  $ Model                  : chr  "Hunter 350" "Dominar 400" "125 Duke" "Ninja 300" ...
##  $ Price..INR.            : int  252816 131100 201016 132482 231796 60839 349740 193781 145538 227055 ...
##  $ Year.of.Manufacture    : int  2021 2020 2020 2021 2019 2019 2022 2019 2022 2024 ...
##  $ Engine.Capacity..cc.   : int  672 769 216 556 298 386 970 375 330 626 ...
##  $ Fuel.Type              : chr  "Electric" "Hybrid" "Hybrid" "Petrol" ...
##  $ Mileage..km.l.         : num  78.4 90 71.5 51.7 54.7 ...
##  $ Owner.Type             : chr  "Second" "Third" "Second" "Third" ...
##  $ Registration.Year      : int  2024 2023 2023 2021 2019 2019 2022 2022 2023 2024 ...
##  $ Insurance.Status       : chr  "Active" "Active" "Active" "Active" ...
##  $ Seller.Type            : chr  "Individual" "Individual" "Dealer" "Dealer" ...
##  $ Resale.Price..INR.     : num  149934 66960 141523 56057 132538 ...
##  $ City.Tier              : chr  "Tier 3" "Tier 3" "Tier 3" "Tier 1" ...

Question 1.2: Are there any missing values?

colSums(is.na(bikes))
##                   State Avg.Daily.Distance..km.                   Brand 
##                       0                       0                       0 
##                   Model             Price..INR.     Year.of.Manufacture 
##                       0                       0                       0 
##    Engine.Capacity..cc.               Fuel.Type          Mileage..km.l. 
##                       0                       0                       0 
##              Owner.Type       Registration.Year        Insurance.Status 
##                       0                       0                       0 
##             Seller.Type      Resale.Price..INR.               City.Tier 
##                       0                       0                       0

Question 1.3: Cleaning Column names

bikes<-clean_names(bikes)

Question 1.4: What is the average price of bikes in each state in descending order ?

state_price <- bikes %>%
  group_by(state) %>%
  summarise(avg_price = mean(price_inr, na.rm = TRUE)) %>%
  arrange(desc(avg_price))
state_price
## # A tibble: 10 × 2
##    state          avg_price
##    <chr>              <dbl>
##  1 Delhi            228110.
##  2 Uttar Pradesh    227511.
##  3 Tamil Nadu       226715.
##  4 Punjab           226566.
##  5 Gujarat          226441.
##  6 Karnataka        225220.
##  7 West Bengal      221925.
##  8 Rajasthan        220820.
##  9 Maharashtra      220032.
## 10 Madhya Pradesh   219937.

Level 2: Data Extraction and Filtering

Question 2.1: Which 5 brands have the highest average resale price ?

top_5_brands <- bikes %>%
  group_by(brand) %>%
  summarise(avg_resale_price = mean(resale_price_inr, na.rm = TRUE)) %>%
  arrange(desc(avg_resale_price)) %>%
  slice(1:5)
print(top_5_brands)
## # A tibble: 5 × 2
##   brand    avg_resale_price
##   <chr>               <dbl>
## 1 Bajaj             136662.
## 2 Yamaha            135535.
## 3 Hero              134885.
## 4 Kawasaki          134373.
## 5 TVS               133497.

Question 2.2: What are the top 10 bikes (model-wise) with the highest mileage ?

top_10_mileage_models <- bikes %>%
  group_by(model) %>%
  summarise(avg_mileage = mean(mileage_km_l, na.rm = TRUE)) %>%
  slice_max(avg_mileage, n = 10)
top_10_mileage_models
## # A tibble: 10 × 2
##    model         avg_mileage
##    <chr>               <dbl>
##  1 NTorq 125            71.2
##  2 Unicorn              70.6
##  3 Versys 650           69.2
##  4 Splendor Plus        69.1
##  5 Hunter 350           69.0
##  6 Xtreme 160R          68.8
##  7 Pulsar 150           68.8
##  8 Vulcan S             68.7
##  9 Glamour              68.4
## 10 RC 390               68.4

Question 2.3: What are the top 10 brands with the largest average engine capacity ?

top_10_engine_brands <- bikes %>%
  group_by(brand) %>%
  summarise(avg_engine_capacity = mean(engine_capacity_cc, na.rm = TRUE)) %>%
  arrange(desc(avg_engine_capacity)) %>%
  slice(1:10)
top_10_engine_brands
## # A tibble: 8 × 2
##   brand         avg_engine_capacity
##   <chr>                       <dbl>
## 1 Kawasaki                     559.
## 2 Yamaha                       558.
## 3 Royal Enfield                555.
## 4 Hero                         554.
## 5 Honda                        550.
## 6 TVS                          549.
## 7 Bajaj                        548.
## 8 KTM                          548.

Level 3: Grouping and Summarization

Question 3.1: Which is the most common registration year by counting records per year ?

most_common_year <- bikes %>%
  count(registration_year) %>%
  arrange(desc(n)) %>%
  slice(1)
most_common_year
##   registration_year    n
## 1              2024 2296

Question 3.2: Which state has the most second-hand (Second owner) bikes ?

top_state_second_owner <- bikes %>%
  filter(owner_type == "Second") %>%
  group_by(state) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
top_state_second_owner[1, ]
## # A tibble: 1 × 2
##   state       count
##   <chr>       <int>
## 1 Maharashtra   368

Question 3.3: Which fuel type has the highest average mileage ?

fuel_mileage <- bikes %>%
  group_by(fuel_type) %>%
  summarise(avg_mileage = mean(mileage_km_l, na.rm = TRUE)) %>%
  arrange(desc(avg_mileage))
fuel_mileage[1, ]
## # A tibble: 1 × 2
##   fuel_type avg_mileage
##   <chr>           <dbl>
## 1 Electric         80.1

Level 4: Sorting and Ranking Data

Question 4.1: Rank bike models based on mileage (km/l) from highest to lowest.

model_mileage_rank <- bikes %>%
  group_by(model) %>%
  summarise(avg_mileage = mean(mileage_km_l, na.rm = TRUE)) %>%
  mutate(rank = dense_rank(desc(avg_mileage))) %>%
  arrange(rank)
model_mileage_rank
## # A tibble: 40 × 3
##    model         avg_mileage  rank
##    <chr>               <dbl> <int>
##  1 NTorq 125            71.2     1
##  2 Unicorn              70.6     2
##  3 Versys 650           69.2     3
##  4 Splendor Plus        69.1     4
##  5 Hunter 350           69.0     5
##  6 Xtreme 160R          68.8     6
##  7 Pulsar 150           68.8     7
##  8 Vulcan S             68.7     8
##  9 Glamour              68.4     9
## 10 RC 390               68.4    10
## # ℹ 30 more rows

Question 4.2: What are the top 5 states with the highest number of premium bikes (price > ₹2,00,000) ?

top_5_premium_states <- bikes %>%
  filter(price_inr > 200000) %>%
  group_by(state) %>%
  summarise(count = n()) %>%
  slice_max(count, n = 5)
top_5_premium_states
## # A tibble: 5 × 2
##   state         count
##   <chr>         <int>
## 1 Punjab          603
## 2 Gujarat         588
## 3 Tamil Nadu      588
## 4 Maharashtra     580
## 5 Uttar Pradesh   579

Question 4.3: Which are the top 5 brands with the lowest depreciation (best resale value) ?

top_5_low_depreciation <- bikes %>%
  mutate(depreciation = price_inr - resale_price_inr) %>%
  group_by(brand) %>%
  summarise(avg_depreciation = mean(depreciation, na.rm = TRUE)) %>%
  slice_min(avg_depreciation, n = 5)
top_5_low_depreciation
## # A tibble: 5 × 2
##   brand         avg_depreciation
##   <chr>                    <dbl>
## 1 Kawasaki                88790.
## 2 TVS                     88972.
## 3 Honda                   89979.
## 4 Royal Enfield           90295.
## 5 Yamaha                  90387.

Question 4.4: Create a depreciation percentage column where depreciation percentage=(price_inr - resale_price_inr) / price_inr) * 100

bikes <- bikes %>%
  mutate(depreciation_pct = ((price_inr - resale_price_inr) / price_inr) * 100)
head(bikes)
##            state avg_daily_distance_km         brand         model price_inr
## 1      Karnataka                 68.84 Royal Enfield    Hunter 350    252816
## 2      Rajasthan                 23.80         Bajaj   Dominar 400    131100
## 3 Madhya Pradesh                 27.67           KTM      125 Duke    201016
## 4      Karnataka                 62.85      Kawasaki     Ninja 300    132482
## 5 Madhya Pradesh                 10.62        Yamaha         FZ V3    231796
## 6    Maharashtra                 47.71          Hero Splendor Plus     60839
##   year_of_manufacture engine_capacity_cc fuel_type mileage_km_l owner_type
## 1                2021                672  Electric        78.41     Second
## 2                2020                769    Hybrid        89.98      Third
## 3                2020                216    Hybrid        71.46     Second
## 4                2021                556    Petrol        51.73      Third
## 5                2019                298    Petrol        54.72      Third
## 6                2019                386  Electric        92.08      First
##   registration_year insurance_status seller_type resale_price_inr city_tier
## 1              2024           Active  Individual        149934.18    Tier 3
## 2              2023           Active  Individual         66960.30    Tier 3
## 3              2023           Active      Dealer        141522.64    Tier 3
## 4              2021           Active      Dealer         56057.22    Tier 1
## 5              2019    Not Available  Individual        132538.36    Tier 3
## 6              2019    Not Available      Dealer         47466.45    Tier 3
##   depreciation_pct
## 1         40.69435
## 2         48.92426
## 3         29.59633
## 4         57.68692
## 5         42.82112
## 6         21.98023

Question 4.5: Create a categorical column bike_category:

Budget (< ₹1,00,000)

Mid-range (₹1,00,000 – ₹2,50,000)

Premium (> ₹2,50,000)

bikes <- bikes %>%
  mutate(bike_category = case_when(
    price_inr < 100000 ~ "Budget",
    price_inr >= 100000 & price_inr <= 250000 ~ "Mid-range",
    price_inr > 250000 ~ "Premium"
  ))
head(bikes)
##            state avg_daily_distance_km         brand         model price_inr
## 1      Karnataka                 68.84 Royal Enfield    Hunter 350    252816
## 2      Rajasthan                 23.80         Bajaj   Dominar 400    131100
## 3 Madhya Pradesh                 27.67           KTM      125 Duke    201016
## 4      Karnataka                 62.85      Kawasaki     Ninja 300    132482
## 5 Madhya Pradesh                 10.62        Yamaha         FZ V3    231796
## 6    Maharashtra                 47.71          Hero Splendor Plus     60839
##   year_of_manufacture engine_capacity_cc fuel_type mileage_km_l owner_type
## 1                2021                672  Electric        78.41     Second
## 2                2020                769    Hybrid        89.98      Third
## 3                2020                216    Hybrid        71.46     Second
## 4                2021                556    Petrol        51.73      Third
## 5                2019                298    Petrol        54.72      Third
## 6                2019                386  Electric        92.08      First
##   registration_year insurance_status seller_type resale_price_inr city_tier
## 1              2024           Active  Individual        149934.18    Tier 3
## 2              2023           Active  Individual         66960.30    Tier 3
## 3              2023           Active      Dealer        141522.64    Tier 3
## 4              2021           Active      Dealer         56057.22    Tier 1
## 5              2019    Not Available  Individual        132538.36    Tier 3
## 6              2019    Not Available      Dealer         47466.45    Tier 3
##   depreciation_pct bike_category
## 1         40.69435       Premium
## 2         48.92426     Mid-range
## 3         29.59633     Mid-range
## 4         57.68692     Mid-range
## 5         42.82112     Mid-range
## 6         21.98023        Budget

Level 5: Data Visualization

Question 5.1: Histogram to visualize the distribution of bike prices.

ggplot(bikes, aes(x = price_inr)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  ggtitle("Distribution of Bike Prices") +
  xlab("Price (INR)") +
  ylab("Count")

print(NULL)
## NULL

Question 5.2: Bar Chart showing state-wise analysis of bike prices

state_price <- bikes %>%
  group_by(state) %>%
  summarise(avg_price = mean(price_inr, na.rm = TRUE)) %>%
  arrange(desc(avg_price))

ggplot(state_price, aes(x = reorder(state, avg_price), y = avg_price)) +
  geom_bar(stat = "identity", fill = "darkgreen") +
  coord_flip() +
  ggtitle("Average Price by State") +
  xlab("State") +
  ylab("Avg Price")

print(NULL)
## NULL

Question 5.3: Scatter plot to analyze the relationship between mileage and price

ggplot(bikes, aes(x = mileage_km_l, y = price_inr)) +
  geom_point(alpha = 0.5, color = "purple") +
  scale_y_continuous(labels=comma)+
  ggtitle("Mileage vs Price")

Level 6: Advanced Data Visualizations

Question 6.1: Facet Bar Chart for Count of Bikes

bike_count <- bikes %>%
  group_by(brand, fuel_type) %>%
  summarise(count = n())
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by brand and fuel_type.
## ℹ Output is grouped by brand.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(brand, fuel_type))` for per-operation grouping
##   (`?dplyr::dplyr_by`) instead.
ggplot(bike_count, aes(x = brand, y = count, fill = brand)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Number of Bikes by Brand and Fuel Type") +
  facet_wrap(~fuel_type) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Question 6.2: Scatter Plot with Regression Line Showing Relationship Between Mileage and Price

ggplot(bikes, aes(x = mileage_km_l, y = price_inr, color = factor(fuel_type))) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = TRUE, aes(linetype = "Regression Line")) +
  scale_y_continuous(labels=comma)+
  ggtitle("Mileage vs Price by Fuel Type") +
  labs(color = "Fuel Type", linetype = "Line Type")
## `geom_smooth()` using formula = 'y ~ x'

Question 6.3: Boxplot Showing Distribution of Mileage Across Different Bike Brands

ggplot(bikes, aes(x = factor(brand), y = mileage_km_l, fill = factor(brand))) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 16) +
  labs(title = "Mileage Distribution by Brand",
       x = "Brand",
       y = "Mileage (km/l)") +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Question 6.4 Distribution of Bike Prices Using Histogram and Density Plot

ggplot(bikes, aes(x = price_inr)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "skyblue",color="black") +
  geom_density(color = "red", linewidth = 1) +
  scale_y_continuous(labels=comma)+
  scale_x_continuous(labels=comma)+
  ggtitle("Distribution of Bike Prices") +
  labs(x = "Price (INR)", y = "Density")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Question 6.5: Average Bike Price Across Different Brands

bike_summary <- bikes %>%
  group_by(brand) %>%
  summarise(avg_price = mean(price_inr, na.rm = TRUE))
ggplot(bike_summary, aes(x = brand, y = avg_price, fill = brand)) +
  geom_bar(stat = "identity") +
  ggtitle("Average Price by Brand") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Level-7 Correlation Analysis

Question 7.1: Simple Correlation between Price and Performance Variables

num_data <- bikes %>%
  select(price_inr, mileage_km_l, engine_capacity_cc, resale_price_inr)
num_data <- na.omit(num_data)
corr_matrix <- cor(num_data)
corrplot(corr_matrix, method = "color")

Question 7.2: Advanced correlation heatmap with coffecient values

num_data <- bikes %>%
  select(price_inr, mileage_km_l, engine_capacity_cc, resale_price_inr) %>%
  na.omit()
corr_matrix <- cor(num_data)
corrplot(corr_matrix,
         method = "color",
         col = colorRampPalette(c("blue", "white", "red"))(200),
         addCoef.col = "black",
         number.cex = 0.8,
         tl.col = "black",
         tl.srt = 45,
         title = "Correlation Heatmap of Bike Dataset",
         mar = c(0,0,2,0))

Level 8: Regression Analysis

Level 8.1: How does mileage influence resale price of bikes using simple linear regression

reg_data <- bikes %>%
  select(resale_price_inr, depreciation_pct) %>%
  na.omit()
model1 <- lm(resale_price_inr ~ depreciation_pct, data = reg_data)
summary(model1)
## 
## Call:
## lm(formula = resale_price_inr ~ depreciation_pct, data = reg_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -137677  -49570    -172   49866  137606 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      225659.41    2234.77  100.98   <2e-16 ***
## depreciation_pct  -2274.47      53.23  -42.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 61260 on 9998 degrees of freedom
## Multiple R-squared:  0.1544, Adjusted R-squared:  0.1543 
## F-statistic:  1826 on 1 and 9998 DF,  p-value: < 2.2e-16

Level 8.2 How does multiple variables influence resale price of bikes

multi_reg_data <- bikes %>%
  select(resale_price_inr, avg_daily_distance_km,depreciation_pct,city_tier) %>%
  na.omit()
model2 <- lm(resale_price_inr ~ + avg_daily_distance_km + depreciation_pct+city_tier,
             data = multi_reg_data)
summary(model2)
## 
## Call:
## lm(formula = resale_price_inr ~ +avg_daily_distance_km + depreciation_pct + 
##     city_tier, data = multi_reg_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -138088  -49477     -72   49812  137528 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           224259.64    2739.17  81.872   <2e-16 ***
## avg_daily_distance_km     29.45      28.28   1.042    0.298    
## depreciation_pct       -2275.50      53.25 -42.734   <2e-16 ***
## city_tierTier 1          -13.76    1751.30  -0.008    0.994    
## city_tierTier 2         -264.30    1739.71  -0.152    0.879    
## city_tierTier 3          984.71    1718.02   0.573    0.567    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 61260 on 9994 degrees of freedom
## Multiple R-squared:  0.1546, Adjusted R-squared:  0.1541 
## F-statistic: 365.4 on 5 and 9994 DF,  p-value: < 2.2e-16
# Interpretation: The regression analysis shows that depreciation percentage
# is the only statistically significant predictor of resale price (p < 0.05),
# while average daily distance and city tier are not significant. The R² value
# remains approximately 0.154, indicating that additional variables do not
# improve the model. Therefore, depreciation is the primary factor influencing
# resale value.

Level 8.3: Resale price value prediction using multiple linear regression model

new_data <- data.frame(
  avg_daily_distance_km = 30,
  depreciation_pct = 20,
  city_tier = bikes$city_tier[1]   # takes a valid level automatically
)
predict(model2, newdata = new_data)
##      1 
## 180618

Level 8.4: Evaluating Model Performance Using Actual vs Predicted Resale Price Plot

multi_reg_data$Predicted <- predict(model2)
ggplot(multi_reg_data, aes(x = Predicted, y = resale_price_inr)) +
  geom_point(color = "yellow", alpha = 0.3) +
  scale_y_continuous(labels=comma)+
  geom_abline(slope = 1, intercept = 0, color = "red") +
  labs(title = "Actual vs Predicted Resale Price")

# The Actual vs Predicted Resale Price plot compares the observed resale prices
# with the values predicted by the multiple linear regression model. The red
# diagonal line represents perfect predictions where actual and predicted values
# are equal. In the plot, most data points are scattered away from this line,
# indicating that the model does not predict resale prices very accurately.
# This suggests that the model has limited explanatory power. The spread of
# points implies that important factors affecting resale price are not included
# in the model. Overall, the model provides a general trend but lacks precision
# in prediction.