#Load Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(tidyr)
library(readr)
library(tibble)
library(stringr)
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the used-car listings from the local CSV export into `data`.
data <- read.csv(file = "C:/Users/ACER/Downloads/used_cars_data.csv")

# Echo the data frame, then list every column with its type and first values.
data
str(object = data)
## 'data.frame': 7253 obs. of 14 variables:
## $ S.No. : int 0 1 2 3 4 5 6 7 8 9 ...
## $ Name : chr "Maruti Wagon R LXI CNG" "Hyundai Creta 1.6 CRDi SX Option" "Honda Jazz V" "Maruti Ertiga VDI" ...
## $ Location : chr "Mumbai" "Pune" "Chennai" "Chennai" ...
## $ Year : int 2010 2015 2011 2012 2013 2012 2013 2016 2013 2012 ...
## $ Kilometers_Driven: int 72000 41000 46000 87000 40670 75000 86999 36000 64430 65932 ...
## $ Fuel_Type : chr "CNG" "Diesel" "Petrol" "Diesel" ...
## $ Transmission : chr "Manual" "Manual" "Manual" "Manual" ...
## $ Owner_Type : chr "First" "First" "First" "First" ...
## $ Mileage : chr "26.6 km/kg" "19.67 kmpl" "18.2 kmpl" "20.77 kmpl" ...
## $ Engine : chr "998 CC" "1582 CC" "1199 CC" "1248 CC" ...
## $ Power : chr "58.16 bhp" "126.2 bhp" "88.7 bhp" "88.76 bhp" ...
## $ Seats : int 5 5 5 7 5 5 5 8 5 5 ...
## $ New_Price : chr "" "" "8.61 Lakh" "" ...
## $ Price : num 1.75 12.5 4.5 6 17.74 ...
Answer: Displays the structure of the dataset including data types and sample values
# Per-column summary: quartiles/mean for numeric columns, length/class for
# character columns, plus NA counts where present.
summary(data)
## S.No. Name Location Year
## Min. : 0 Length:7253 Length:7253 Min. :1996
## 1st Qu.:1813 Class :character Class :character 1st Qu.:2011
## Median :3626 Mode :character Mode :character Median :2014
## Mean :3626 Mean :2013
## 3rd Qu.:5439 3rd Qu.:2016
## Max. :7252 Max. :2019
##
## Kilometers_Driven Fuel_Type Transmission Owner_Type
## Min. : 171 Length:7253 Length:7253 Length:7253
## 1st Qu.: 34000 Class :character Class :character Class :character
## Median : 53416 Mode :character Mode :character Mode :character
## Mean : 58699
## 3rd Qu.: 73000
## Max. :6500000
##
## Mileage Engine Power Seats
## Length:7253 Length:7253 Length:7253 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 5.00
## Mode :character Mode :character Mode :character Median : 5.00
## Mean : 5.28
## 3rd Qu.: 5.00
## Max. :10.00
## NA's :53
## New_Price Price
## Length:7253 Min. : 0.440
## Class :character 1st Qu.: 3.500
## Mode :character Median : 5.640
## Mean : 9.479
## 3rd Qu.: 9.950
## Max. :160.000
## NA's :1234
Answer: Provides summary statistics like mean, median, min, max, and missing values.
# Count NA entries in every column, then transpose the one-row result so each
# column name becomes a row label.
t(
  summarise(
    data,
    across(everything(), function(col) sum(is.na(col)))
  )
)
## [,1]
## S.No. 0
## Name 0
## Location 0
## Year 0
## Kilometers_Driven 0
## Fuel_Type 0
## Transmission 0
## Owner_Type 0
## Mileage 0
## Engine 0
## Power 0
## Seats 53
## New_Price 0
## Price 1234
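Answer: Only two columns contain NA values: Seats (53) and Price (1234), so Price has the most missing data. New_Price reports 0 here because its blank entries are read as empty strings ("") rather than NA, so is.na() does not count them.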
# Strip the unit suffixes ("kmpl"/"km/kg", "CC", "bhp") from the text columns
# and keep only the leading number. Values without a number become NA.
data <- mutate(
  data,
  # Mileage and Power may carry a decimal part.
  across(
    c(Mileage, Power),
    function(txt) as.numeric(str_extract(txt, "\\d+\\.?\\d*"))
  ),
  # Engine displacement is a whole number.
  Engine = as.numeric(str_extract(Engine, "\\d+"))
)
data
Answer: Text-based numerical features were cleaned and converted into numeric format for proper analysis
# Count duplicated listings.
# S.No. is a running row index (0, 1, 2, ...), so including it makes every row
# look unique and the count is always 0. It is excluded from the comparison.
# any_of() keeps the step working if the index column is absent.
data %>%
  select(-any_of("S.No.")) %>%
  summarise(duplicates = sum(duplicated(.)))
Answer: Duplicate records are counted here to check data consistency and reliability; this step only reports the count and does not remove any rows.
# Derive vehicle age in years, measured against the fixed reference year 2026.
data[["Car_Age"]] <- 2026 - data[["Year"]]
Answer: Car_Age was derived from the Year column (2026 − Year) to capture the effect of vehicle age on price.
# Histogram of listing prices (30 bins); rows with missing Price are dropped
# by ggplot with a warning.
ggplot(data, aes(Price)) +
  geom_histogram(bins = 30, fill = "maroon", color = "black") +
  ggtitle("Distribution of Car Prices") +
  xlab("Price (in Lakhs)") +
  ylab("Count") +
  theme_minimal()
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_bin()`).
Answer: The histogram shows that car prices are positively skewed, with a majority of vehicles clustered in the lower price range.
# Histogram of odometer readings (30 bins).
ggplot(data, aes(Kilometers_Driven)) +
  geom_histogram(bins = 30, fill = "darkorange", color = "black") +
  ggtitle("Distribution of Kilometers Driven") +
  xlab("Kilometers Driven") +
  ylab("Count") +
  theme_minimal()
Answer The histogram shows that most vehicles have moderate mileage, while a few exhibit very high usage, indicating the presence of outliers
# Bar chart of listings per fuel type. Counts are pre-computed with count(),
# so the bars are drawn from the `n` column directly.
ggplot(count(data, Fuel_Type), aes(Fuel_Type, n, fill = Fuel_Type)) +
  geom_col() +
  ggtitle("Fuel Type Frequency") +
  xlab("Fuel Type") +
  ylab("Count") +
  theme_minimal() +
  # The fill legend duplicates the x axis, so hide it.
  theme(legend.position = "none")
Answer: The frequency analysis shows that petrol and diesel vehicles are the most common in the dataset.
# Histogram of manufacturing year (20 bins).
ggplot(data, aes(Year)) +
  geom_histogram(bins = 20, fill = "purple", color = "black") +
  ggtitle("Distribution of Manufacturing Year") +
  xlab("Year") +
  ylab("Count") +
  theme_minimal()
Answer The histogram indicates that the dataset is dominated by cars manufactured in recent years, with fewer older vehicles.
# Histogram of the derived Car_Age column (20 bins).
ggplot(data, aes(Car_Age)) +
  geom_histogram(bins = 20, fill = "seagreen", color = "black") +
  ggtitle("Distribution of Car Age") +
  xlab("Car Age") +
  ylab("Count") +
  theme_minimal()
Answer The distribution of car age shows that most vehicles are less than 10 years old, with fewer older vehicles in the dataset.
# Scatter of price against car age with a straight-line (lm) fit overlaid;
# the confidence band is switched off.
ggplot(data, aes(Car_Age, Price)) +
  geom_point(alpha = 0.6, color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("Car Age vs Price") +
  xlab("Car Age") +
  ylab("Price") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1234 rows containing missing values or values outside the scale range
## (`geom_point()`).
Answer The scatter plot shows that price decreases as car age increases, reflecting the depreciation of vehicles over time.
# Scatter of price against kilometers driven with a straight-line (lm) fit
# overlaid; the confidence band is switched off.
ggplot(data, aes(Kilometers_Driven, Price)) +
  geom_point(alpha = 0.6, color = "darkgreen") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("Kilometers Driven vs Price") +
  xlab("Kilometers Driven") +
  ylab("Price") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1234 rows containing missing values or values outside the scale range
## (`geom_point()`).
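Answer: The fitted line has only a very slight negative slope, and the simple regression of Price on Kilometers_Driven fitted later (model2) is not statistically significant (p ≈ 0.37). On its own, kilometers driven therefore shows no clear linear relationship with price in this dataset.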
# Box plot of price for each fuel type.
ggplot(data, aes(Fuel_Type, Price, fill = Fuel_Type)) +
  geom_boxplot() +
  ggtitle("Fuel Type vs Price") +
  xlab("Fuel Type") +
  ylab("Price") +
  theme_minimal() +
  # The fill legend repeats the x-axis categories, so hide it.
  theme(legend.position = "none")
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Answer The analysis indicates that fuel type plays a significant role in determining resale price, with diesel vehicles generally priced higher.
# Box plot of price by transmission, with one panel per owner type.
ggplot(data, aes(Transmission, Price, fill = Transmission)) +
  geom_boxplot() +
  facet_wrap(~Owner_Type) +
  ggtitle("Transmission & Owner Type vs Price") +
  xlab("Transmission") +
  ylab("Price") +
  theme_minimal() +
  # The fill legend repeats the x-axis categories, so hide it.
  theme(legend.position = "none")
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Answer The analysis shows that both transmission type and ownership significantly influence car price, with automatic and first-owner vehicles commanding higher values
# Treat the first word of the listing name as the brand and tally listings per
# brand, most frequent first.
count(
  mutate(data, Brand = word(Name, 1)),
  Brand,
  sort = TRUE
)
Answer This identifies the most common car brands in the dataset. Helps understand market dominance and popularity.
### Q17: What is the average price by fuel type?
# Mean price per fuel type, ignoring listings with a missing price.
summarise(
  group_by(data, Fuel_Type),
  avg_price = mean(Price, na.rm = TRUE)
)
Answer 👉 Compares average prices across fuel types. 👉 Shows how fuel type influences car pricing.
# Number of listings per city, largest first.
count(data, Location, sort = TRUE)
Answer 👉 Identifies cities with highest number of listings. 👉 Reflects regional availability and demand.
# Mean price per city, ignoring listings with a missing price.
summarise(
  group_by(data, Location),
  avg_price = mean(Price, na.rm = TRUE)
)
Answer 👉 Shows price variation across cities. 👉 Indicates regional price differences.
# Ten most expensive listings: sort by price descending, keep the first 10.
head(arrange(data, desc(Price)), n = 10)
Answer 👉 Identifies high-end vehicles. 👉 Useful for understanding premium segment.
# Ten cheapest listings: sort by price ascending, keep the first 10.
head(arrange(data, Price), n = 10)
Answer 👉 Highlights budget-friendly cars. 👉 Shows low-price segment of market.
# Mean kilometers driven for each owner type.
summarise(
  group_by(data, Owner_Type),
  avg_km = mean(Kilometers_Driven, na.rm = TRUE)
)
Answer 👉 Compares usage across owners. 👉 Indicates usage pattern based on ownership
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.5.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Correlation heatmap of the numeric columns.
# cor() uses only rows complete in all five columns. melt() turns the
# correlation matrix into long form (Var1, Var2, value) for geom_tile().
ggplot(
  melt(
    cor(
      select(data, Price, Kilometers_Driven, Engine, Power, Car_Age),
      use = "complete.obs"
    )
  ),
  aes(Var1, Var2, fill = value)
) +
  geom_tile()
Answer Visual representation of correlations.
# Scatter of price against engine displacement with a linear fit.
ggplot(data, aes(Engine, Price)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1270 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1270 rows containing missing values or values outside the scale range
## (`geom_point()`).
Answer 👉 Shows relationship between engine capacity and price. 👉 Indicates that larger engine capacity is generally associated with a higher price.
# Scatter-plot matrix across the four numeric columns of interest.
pairs(data[c("Price", "Engine", "Power", "Kilometers_Driven")])
Answer Shows relationships between all variable pairs.
# Simple linear regression of Price on Car_Age. summary() prints the
# coefficient table, residual standard error, R-squared and F-test. Rows with
# a missing Price are dropped by lm() before fitting.
model1 <- lm(Price ~ Car_Age, data = data)
summary(model1)
##
## Call:
## lm(formula = Price ~ Car_Age, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.354 -5.660 -3.561 1.147 146.716
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.6867 0.5485 41.36 <2e-16 ***
## Car_Age -1.0447 0.0420 -24.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.65 on 6017 degrees of freedom
## (1234 observations deleted due to missingness)
## Multiple R-squared: 0.09322, Adjusted R-squared: 0.09307
## F-statistic: 618.6 on 1 and 6017 DF, p-value: < 2.2e-16
Answer: Car_Age significantly affects Price (p < 2e-16), and the negative coefficient indicates depreciation. Coefficients: the intercept (22.69) is the predicted price at Car_Age = 0; the Car_Age coefficient (−1.04) is the change in price per additional year, i.e. the price falls by about 1.04 lakh for each extra year of age. R-squared: how much of the price variation is explained by age (for example, 0.60 would mean 60% explained); here it is 0.093, so age alone explains only about 9%. p-value: tells whether the relationship is statistically significant; a small p-value is strong evidence that the relationship is real.
# Simple linear regression of Price on Kilometers_Driven. summary() prints the
# coefficient table, R-squared and F-test. Rows with a missing Price are
# dropped by lm() before fitting.
model2 <- lm(Price ~ Kilometers_Driven, data = data)
summary(model2)
##
## Call:
## lm(formula = Price ~ Kilometers_Driven, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.095 -5.991 -3.871 0.459 150.473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.562e+00 1.715e-01 55.757 <2e-16 ***
## Kilometers_Driven -1.409e-06 1.580e-06 -0.892 0.373
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.19 on 6017 degrees of freedom
## (1234 observations deleted due to missingness)
## Multiple R-squared: 0.0001321, Adjusted R-squared: -3.409e-05
## F-statistic: 0.7949 on 1 and 6017 DF, p-value: 0.3727
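Answer ✔ Coefficients: show the direction and size of the relationship; here the Kilometers_Driven slope is −1.409e-06, i.e. very slightly negative. ✔ R-squared: the share of price variation explained by kilometers driven (for example, 0.45 would mean 45%); here it is 0.00013, i.e. essentially none. ✔ p-value: a small value would mean the relationship is significant; here p = 0.373, so it is not statistically significant. 👉 This model therefore does not support the statement “As kilometers driven increases, the price of the car decreases.”
➡ Weak negative slope, not statistically significant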
# Multiple linear regression of Price on age, usage, engine displacement and
# power. Rows missing any of these variables are dropped by lm(), so this
# model is fitted on fewer rows than the single-predictor models.
model3 <- lm(Price ~ Car_Age + Kilometers_Driven + Engine + Power, data = data)
summary(model3)
##
## Call:
## lm(formula = Price ~ Car_Age + Kilometers_Driven + Engine + Power,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.919 -3.049 -0.682 1.962 124.050
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.998e+00 3.947e-01 10.128 <2e-16 ***
## Car_Age -1.045e+00 2.687e-02 -38.904 <2e-16 ***
## Kilometers_Driven 1.574e-06 9.189e-07 1.713 0.0868 .
## Engine 8.465e-04 2.810e-04 3.013 0.0026 **
## Power 1.521e-01 3.123e-03 48.705 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.357 on 5871 degrees of freedom
## (1377 observations deleted due to missingness)
## Multiple R-squared: 0.6807, Adjusted R-squared: 0.6805
## F-statistic: 3129 on 4 and 5871 DF, p-value: < 2.2e-16
Answer 🔹 3) What model is learning
👉 It checks combined effect of all variables
➡ Not just one factor ➡ Real-world prediction (more accurate)
🔹 4) Output (summary(model3)) ✔ Coefficients Each variable has its own effect Example: Car_Age → negative Power → positive ✔ R-squared 👉 Higher than simple regression ➡ Model explains more variation
✔ p-values
👉 Shows which variables are important
Small p-value → significant Large p-value → not useful
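Car price is influenced by multiple factors: age, engine capacity, and power are statistically significant in this model, while usage (kilometers driven) is only marginally significant (p ≈ 0.087).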
# Mean car age for each fuel type.
summarise(
  group_by(data, Fuel_Type),
  avg_age = mean(Car_Age, na.rm = TRUE)
)
Answer Shows age distribution across fuel types
# Median price for each owner type, ignoring listings with a missing price.
summarise(
  group_by(data, Owner_Type),
  median_price = median(Price, na.rm = TRUE)
)
Answer Helps understand typical price per ownership level
# Bucket listings into price segments (in lakhs) and count each segment:
#   Low    : Price < 5
#   Medium : 5 <= Price < 10
#   High   : Price >= 10
# Listings with a missing Price are kept in their own NA group. Without the
# explicit is.na() branch they would fall through to the catch-all and be
# miscounted as "High".
data %>%
  mutate(Price_Category = case_when(
    is.na(Price) ~ NA_character_,
    Price < 5 ~ "Low",
    Price < 10 ~ "Medium",
    TRUE ~ "High"
  )) %>%
  count(Price_Category)
Answer Categorizes cars into price segments
# Mean parsed Mileage value for each transmission type.
summarise(
  group_by(data, Transmission),
  avg_mileage = mean(Mileage, na.rm = TRUE)
)
Answer Compares fuel efficiency by transmission
# Mean price per manufacturing year, highest average first.
arrange(
  summarise(
    group_by(data, Year),
    avg_price = mean(Price, na.rm = TRUE)
  ),
  desc(avg_price)
)
Answer Shows most valuable manufacturing years
# Scatter of price against parsed Mileage with a linear fit.
ggplot(data, aes(Mileage, Price)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1236 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1236 rows containing missing values or values outside the scale range
## (`geom_point()`).
Answer Shows whether higher mileage affects price.
# Mean engine displacement for each fuel type.
summarise(
  group_by(data, Fuel_Type),
  avg_engine = mean(Engine, na.rm = TRUE)
)
Answer Compares engine capacity across fuels
# Bar chart of how many listings have each seat count.
ggplot(data, aes(Seats)) +
  geom_bar()
## Warning: Removed 53 rows containing non-finite outside the scale range
## (`stat_count()`).
Answer Shows common seating capacity
# Mean price for each seat count, ignoring listings with a missing price.
summarise(
  group_by(data, Seats),
  avg_price = mean(Price, na.rm = TRUE)
)
Answer Shows how capacity affects price
# Tally listings per fuel type / transmission combination, largest first.
count(data, Fuel_Type, Transmission, sort = TRUE)
Answer Identifies common car configurations
# Mean Power per brand, where the brand is the first word of the listing name.
summarise(
  group_by(mutate(data, Brand = word(Name, 1)), Brand),
  avg_power = mean(Power, na.rm = TRUE)
)
Answer Shows performance comparison by brand
# Share of listings per owner type, as a percentage of all listings.
mutate(
  count(data, Owner_Type),
  percent = n / sum(n) * 100
)
Answer Shows ownership distribution
# Listings driven under 20,000 km and priced above 10 (lakhs). Rows where
# either condition evaluates to NA are dropped by filter().
filter(data, Kilometers_Driven < 20000 & Price > 10)
Answer Identifies premium low-usage cars