#Load Necessary Libraries
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ purrr 1.2.1
## ✔ ggplot2 4.0.3 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'GGally' was built under R version 4.5.3
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
## Warning: package 'ggpubr' was built under R version 4.5.3
## Warning: package 'janitor' was built under R version 4.5.3
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
## Warning: package 'scales' was built under R version 4.5.3
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
## 'data.frame': 10000 obs. of 15 variables:
## $ State : chr "Karnataka" "Rajasthan" "Madhya Pradesh" "Karnataka" ...
## $ Avg.Daily.Distance..km.: num 68.8 23.8 27.7 62.9 10.6 ...
## $ Brand : chr "Royal Enfield" "Bajaj" "KTM" "Kawasaki" ...
## $ Model : chr "Hunter 350" "Dominar 400" "125 Duke" "Ninja 300" ...
## $ Price..INR. : int 252816 131100 201016 132482 231796 60839 349740 193781 145538 227055 ...
## $ Year.of.Manufacture : int 2021 2020 2020 2021 2019 2019 2022 2019 2022 2024 ...
## $ Engine.Capacity..cc. : int 672 769 216 556 298 386 970 375 330 626 ...
## $ Fuel.Type : chr "Electric" "Hybrid" "Hybrid" "Petrol" ...
## $ Mileage..km.l. : num 78.4 90 71.5 51.7 54.7 ...
## $ Owner.Type : chr "Second" "Third" "Second" "Third" ...
## $ Registration.Year : int 2024 2023 2023 2021 2019 2019 2022 2022 2023 2024 ...
## $ Insurance.Status : chr "Active" "Active" "Active" "Active" ...
## $ Seller.Type : chr "Individual" "Individual" "Dealer" "Dealer" ...
## $ Resale.Price..INR. : num 149934 66960 141523 56057 132538 ...
## $ City.Tier : chr "Tier 3" "Tier 3" "Tier 3" "Tier 1" ...
## State Avg.Daily.Distance..km. Brand
## 0 0 0
## Model Price..INR. Year.of.Manufacture
## 0 0 0
## Engine.Capacity..cc. Fuel.Type Mileage..km.l.
## 0 0 0
## Owner.Type Registration.Year Insurance.Status
## 0 0 0
## Seller.Type Resale.Price..INR. City.Tier
## 0 0 0
state_price <- bikes %>%
group_by(state) %>%
summarise(avg_price = mean(price_inr, na.rm = TRUE)) %>%
arrange(desc(avg_price))
state_price## # A tibble: 10 × 2
## state avg_price
## <chr> <dbl>
## 1 Delhi 228110.
## 2 Uttar Pradesh 227511.
## 3 Tamil Nadu 226715.
## 4 Punjab 226566.
## 5 Gujarat 226441.
## 6 Karnataka 225220.
## 7 West Bengal 221925.
## 8 Rajasthan 220820.
## 9 Maharashtra 220032.
## 10 Madhya Pradesh 219937.
top_5_brands <- bikes %>%
group_by(brand) %>%
summarise(avg_resale_price = mean(resale_price_inr, na.rm = TRUE)) %>%
arrange(desc(avg_resale_price)) %>%
slice(1:5)
print(top_5_brands)## # A tibble: 5 × 2
## brand avg_resale_price
## <chr> <dbl>
## 1 Bajaj 136662.
## 2 Yamaha 135535.
## 3 Hero 134885.
## 4 Kawasaki 134373.
## 5 TVS 133497.
top_10_mileage_models <- bikes %>%
group_by(model) %>%
summarise(avg_mileage = mean(mileage_km_l, na.rm = TRUE)) %>%
slice_max(avg_mileage, n = 10)
top_10_mileage_models## # A tibble: 10 × 2
## model avg_mileage
## <chr> <dbl>
## 1 NTorq 125 71.2
## 2 Unicorn 70.6
## 3 Versys 650 69.2
## 4 Splendor Plus 69.1
## 5 Hunter 350 69.0
## 6 Xtreme 160R 68.8
## 7 Pulsar 150 68.8
## 8 Vulcan S 68.7
## 9 Glamour 68.4
## 10 RC 390 68.4
top_10_engine_brands <- bikes %>%
group_by(brand) %>%
summarise(avg_engine_capacity = mean(engine_capacity_cc, na.rm = TRUE)) %>%
arrange(desc(avg_engine_capacity)) %>%
slice(1:10)
top_10_engine_brands## # A tibble: 8 × 2
## brand avg_engine_capacity
## <chr> <dbl>
## 1 Kawasaki 559.
## 2 Yamaha 558.
## 3 Royal Enfield 555.
## 4 Hero 554.
## 5 Honda 550.
## 6 TVS 549.
## 7 Bajaj 548.
## 8 KTM 548.
most_common_year <- bikes %>%
count(registration_year) %>%
arrange(desc(n)) %>%
slice(1)
most_common_year## registration_year n
## 1 2024 2296
top_state_second_owner <- bikes %>%
filter(owner_type == "Second") %>%
group_by(state) %>%
summarise(count = n()) %>%
arrange(desc(count))
top_state_second_owner[1, ]## # A tibble: 1 × 2
## state count
## <chr> <int>
## 1 Maharashtra 368
fuel_mileage <- bikes %>%
group_by(fuel_type) %>%
summarise(avg_mileage = mean(mileage_km_l, na.rm = TRUE)) %>%
arrange(desc(avg_mileage))
fuel_mileage[1, ]## # A tibble: 1 × 2
## fuel_type avg_mileage
## <chr> <dbl>
## 1 Electric 80.1
model_mileage_rank <- bikes %>%
group_by(model) %>%
summarise(avg_mileage = mean(mileage_km_l, na.rm = TRUE)) %>%
mutate(rank = dense_rank(desc(avg_mileage))) %>%
arrange(rank)
model_mileage_rank## # A tibble: 40 × 3
## model avg_mileage rank
## <chr> <dbl> <int>
## 1 NTorq 125 71.2 1
## 2 Unicorn 70.6 2
## 3 Versys 650 69.2 3
## 4 Splendor Plus 69.1 4
## 5 Hunter 350 69.0 5
## 6 Xtreme 160R 68.8 6
## 7 Pulsar 150 68.8 7
## 8 Vulcan S 68.7 8
## 9 Glamour 68.4 9
## 10 RC 390 68.4 10
## # ℹ 30 more rows
top_5_low_depreciation <- bikes %>%
mutate(depreciation = price_inr - resale_price_inr) %>%
group_by(brand) %>%
summarise(avg_depreciation = mean(depreciation, na.rm = TRUE)) %>%
slice_min(avg_depreciation, n = 5)
top_5_low_depreciation## # A tibble: 5 × 2
## brand avg_depreciation
## <chr> <dbl>
## 1 Kawasaki 88790.
## 2 TVS 88972.
## 3 Honda 89979.
## 4 Royal Enfield 90295.
## 5 Yamaha 90387.
bikes <- bikes %>%
mutate(depreciation_pct = ((price_inr - resale_price_inr) / price_inr) * 100)
head(bikes)## state avg_daily_distance_km brand model price_inr
## 1 Karnataka 68.84 Royal Enfield Hunter 350 252816
## 2 Rajasthan 23.80 Bajaj Dominar 400 131100
## 3 Madhya Pradesh 27.67 KTM 125 Duke 201016
## 4 Karnataka 62.85 Kawasaki Ninja 300 132482
## 5 Madhya Pradesh 10.62 Yamaha FZ V3 231796
## 6 Maharashtra 47.71 Hero Splendor Plus 60839
## year_of_manufacture engine_capacity_cc fuel_type mileage_km_l owner_type
## 1 2021 672 Electric 78.41 Second
## 2 2020 769 Hybrid 89.98 Third
## 3 2020 216 Hybrid 71.46 Second
## 4 2021 556 Petrol 51.73 Third
## 5 2019 298 Petrol 54.72 Third
## 6 2019 386 Electric 92.08 First
## registration_year insurance_status seller_type resale_price_inr city_tier
## 1 2024 Active Individual 149934.18 Tier 3
## 2 2023 Active Individual 66960.30 Tier 3
## 3 2023 Active Dealer 141522.64 Tier 3
## 4 2021 Active Dealer 56057.22 Tier 1
## 5 2019 Not Available Individual 132538.36 Tier 3
## 6 2019 Not Available Dealer 47466.45 Tier 3
## depreciation_pct
## 1 40.69435
## 2 48.92426
## 3 29.59633
## 4 57.68692
## 5 42.82112
## 6 21.98023
ggplot(bikes, aes(x = price_inr)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
ggtitle("Distribution of Bike Prices") +
xlab("Price (INR)") +
ylab("Count")## NULL
state_price <- bikes %>%
group_by(state) %>%
summarise(avg_price = mean(price_inr, na.rm = TRUE)) %>%
arrange(desc(avg_price))
ggplot(state_price, aes(x = reorder(state, avg_price), y = avg_price)) +
geom_bar(stat = "identity", fill = "darkgreen") +
coord_flip() +
ggtitle("Average Price by State") +
xlab("State") +
ylab("Avg Price")## NULL
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by brand and fuel_type.
## ℹ Output is grouped by brand.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(brand, fuel_type))` for per-operation grouping
## (`?dplyr::dplyr_by`) instead.
ggplot(bike_count, aes(x = brand, y = count, fill = brand)) +
geom_bar(stat = "identity", position = "dodge") +
ggtitle("Number of Bikes by Brand and Fuel Type") +
facet_wrap(~fuel_type) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot(bikes, aes(x = mileage_km_l, y = price_inr, color = factor(fuel_type))) +
geom_point(size = 3) +
geom_smooth(method = "lm", se = TRUE, aes(linetype = "Regression Line")) +
scale_y_continuous(labels=comma)+
ggtitle("Mileage vs Price by Fuel Type") +
labs(color = "Fuel Type", linetype = "Line Type")## `geom_smooth()` using formula = 'y ~ x'
ggplot(bikes, aes(x = factor(brand), y = mileage_km_l, fill = factor(brand))) +
geom_boxplot(outlier.colour = "red", outlier.shape = 16) +
labs(title = "Mileage Distribution by Brand",
x = "Brand",
y = "Mileage (km/l)") +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot(bikes, aes(x = price_inr)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "skyblue",color="black") +
geom_density(color = "red", linewidth = 1) +
scale_y_continuous(labels=comma)+
scale_x_continuous(labels=comma)+
ggtitle("Distribution of Bike Prices") +
labs(x = "Price (INR)", y = "Density")## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
bike_summary <- bikes %>%
group_by(brand) %>%
summarise(avg_price = mean(price_inr, na.rm = TRUE))
ggplot(bike_summary, aes(x = brand, y = avg_price, fill = brand)) +
geom_bar(stat = "identity") +
ggtitle("Average Price by Brand") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))num_data <- bikes %>%
select(price_inr, mileage_km_l, engine_capacity_cc, resale_price_inr)
num_data <- na.omit(num_data)
corr_matrix <- cor(num_data)
corrplot(corr_matrix, method = "color")num_data <- bikes %>%
select(price_inr, mileage_km_l, engine_capacity_cc, resale_price_inr) %>%
na.omit()
corr_matrix <- cor(num_data)
corrplot(corr_matrix,
method = "color",
col = colorRampPalette(c("blue", "white", "red"))(200),
addCoef.col = "black",
number.cex = 0.8,
tl.col = "black",
tl.srt = 45,
title = "Correlation Heatmap of Bike Dataset",
mar = c(0,0,2,0))reg_data <- bikes %>%
select(resale_price_inr, depreciation_pct) %>%
na.omit()
model1 <- lm(resale_price_inr ~ depreciation_pct, data = reg_data)
summary(model1)##
## Call:
## lm(formula = resale_price_inr ~ depreciation_pct, data = reg_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -137677 -49570 -172 49866 137606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 225659.41 2234.77 100.98 <2e-16 ***
## depreciation_pct -2274.47 53.23 -42.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 61260 on 9998 degrees of freedom
## Multiple R-squared: 0.1544, Adjusted R-squared: 0.1543
## F-statistic: 1826 on 1 and 9998 DF, p-value: < 2.2e-16
multi_reg_data <- bikes %>%
select(resale_price_inr, avg_daily_distance_km,depreciation_pct,city_tier) %>%
na.omit()
model2 <- lm(resale_price_inr ~ + avg_daily_distance_km + depreciation_pct+city_tier,
data = multi_reg_data)
summary(model2)##
## Call:
## lm(formula = resale_price_inr ~ +avg_daily_distance_km + depreciation_pct +
## city_tier, data = multi_reg_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -138088 -49477 -72 49812 137528
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 224259.64 2739.17 81.872 <2e-16 ***
## avg_daily_distance_km 29.45 28.28 1.042 0.298
## depreciation_pct -2275.50 53.25 -42.734 <2e-16 ***
## city_tierTier 1 -13.76 1751.30 -0.008 0.994
## city_tierTier 2 -264.30 1739.71 -0.152 0.879
## city_tierTier 3 984.71 1718.02 0.573 0.567
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 61260 on 9994 degrees of freedom
## Multiple R-squared: 0.1546, Adjusted R-squared: 0.1541
## F-statistic: 365.4 on 5 and 9994 DF, p-value: < 2.2e-16
# Interpretation: The regression analysis shows that depreciation percentage
# is the only statistically significant predictor of resale price (p < 0.05),
# while average daily distance and city tier are not significant. The R² value
# remains approximately 0.154, indicating that additional variables do not
# improve the model. Therefore, depreciation is the primary factor influencing
# resale value.new_data <- data.frame(
avg_daily_distance_km = 30,
depreciation_pct = 20,
city_tier = bikes$city_tier[1] # takes a valid level automatically
)
predict(model2, newdata = new_data)## 1
## 180618
multi_reg_data$Predicted <- predict(model2)
ggplot(multi_reg_data, aes(x = Predicted, y = resale_price_inr)) +
geom_point(color = "yellow", alpha = 0.3) +
scale_y_continuous(labels=comma)+
geom_abline(slope = 1, intercept = 0, color = "red") +
labs(title = "Actual vs Predicted Resale Price")# The Actual vs Predicted Resale Price plot compares the observed resale prices
# with the values predicted by the multiple linear regression model. The red
# diagonal line represents perfect predictions where actual and predicted values
# are equal. In the plot, most data points are scattered away from this line,
# indicating that the model does not predict resale prices very accurately.
# This suggests that the model has limited explanatory power. The spread of
# points implies that important factors affecting resale price are not included
# in the model. Overall, the model provides a general trend but lacks precision
# in prediction.