library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(tidyr)
library(readr)
library(tibble)
library(stringr)
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Read the used-car listings into `data`, echo the full data frame, then
# show each column's type and first values.
# NOTE(review): the path is absolute and machine-specific, so this only runs
# where that exact file exists.
data <- read.csv(
  file = "C:/Users/ACER/Downloads/used_cars_data.csv",
  header = TRUE
)
data
str(data)
## 'data.frame': 7253 obs. of 14 variables:
## $ S.No. : int 0 1 2 3 4 5 6 7 8 9 ...
## $ Name : chr "Maruti Wagon R LXI CNG" "Hyundai Creta 1.6 CRDi SX Option" "Honda Jazz V" "Maruti Ertiga VDI" ...
## $ Location : chr "Mumbai" "Pune" "Chennai" "Chennai" ...
## $ Year : int 2010 2015 2011 2012 2013 2012 2013 2016 2013 2012 ...
## $ Kilometers_Driven: int 72000 41000 46000 87000 40670 75000 86999 36000 64430 65932 ...
## $ Fuel_Type : chr "CNG" "Diesel" "Petrol" "Diesel" ...
## $ Transmission : chr "Manual" "Manual" "Manual" "Manual" ...
## $ Owner_Type : chr "First" "First" "First" "First" ...
## $ Mileage : chr "26.6 km/kg" "19.67 kmpl" "18.2 kmpl" "20.77 kmpl" ...
## $ Engine : chr "998 CC" "1582 CC" "1199 CC" "1248 CC" ...
## $ Power : chr "58.16 bhp" "126.2 bhp" "88.7 bhp" "88.76 bhp" ...
## $ Seats : int 5 5 5 7 5 5 5 8 5 5 ...
## $ New_Price : chr "" "" "8.61 Lakh" "" ...
## $ Price : num 1.75 12.5 4.5 6 17.74 ...
Answer: Displays the structure of the dataset including data types and sample values
# Per-column summary: quartiles/mean for numeric columns, length/class for
# character columns, plus NA counts where present.
summary(data)
## S.No. Name Location Year
## Min. : 0 Length:7253 Length:7253 Min. :1996
## 1st Qu.:1813 Class :character Class :character 1st Qu.:2011
## Median :3626 Mode :character Mode :character Median :2014
## Mean :3626 Mean :2013
## 3rd Qu.:5439 3rd Qu.:2016
## Max. :7252 Max. :2019
##
## Kilometers_Driven Fuel_Type Transmission Owner_Type
## Min. : 171 Length:7253 Length:7253 Length:7253
## 1st Qu.: 34000 Class :character Class :character Class :character
## Median : 53416 Mode :character Mode :character Mode :character
## Mean : 58699
## 3rd Qu.: 73000
## Max. :6500000
##
## Mileage Engine Power Seats
## Length:7253 Length:7253 Length:7253 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 5.00
## Mode :character Mode :character Mode :character Median : 5.00
## Mean : 5.28
## 3rd Qu.: 5.00
## Max. :10.00
## NA's :53
## New_Price Price
## Length:7253 Min. : 0.440
## Class :character 1st Qu.: 3.500
## Mode :character Median : 5.640
## Mean : 9.479
## 3rd Qu.: 9.950
## Max. :160.000
## NA's :1234
Answer: Provides summary statistics like mean, median, min, max, and missing values.
# Count NA entries in every column, then transpose so each column name
# becomes a row label. Only true NA values are counted; empty strings are not.
data |>
  summarise(across(everything(), \(col) sum(is.na(col)))) |>
  t()
## [,1]
## S.No. 0
## Name 0
## Location 0
## Year 0
## Kilometers_Driven 0
## Fuel_Type 0
## Transmission 0
## Owner_Type 0
## Mileage 0
## Engine 0
## Power 0
## Seats 53
## New_Price 0
## Price 1234
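Answer: Only two columns contain NA values: Price (1234) and Seats (53). New_Price shows 0 here because its blank entries are read as empty strings ("") rather than NA, so is.na() does not count them even though many of its values appear blank in the str() output.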
# Convert the text columns that carry a unit suffix (e.g. "19.67 kmpl",
# "1582 CC", "126.2 bhp") to plain numbers by keeping only the first numeric
# token. Entries with no numeric token become NA.
data <- mutate(
  data,
  Mileage = as.numeric(str_extract(Mileage, "\\d+\\.?\\d*")),
  Engine = as.numeric(str_extract(Engine, "\\d+")),
  Power = as.numeric(str_extract(Power, "\\d+\\.?\\d*"))
)
# Echo the converted data frame.
data
Answer: Text-based numerical features were cleaned and converted into numeric format for proper analysis
# Count duplicated listings (rows identical to an earlier row).
# S.No. is a running row index (0, 1, 2, ...), so it is left out of the
# comparison: with it included no two rows can ever match and the count is
# always 0. This only reports the number of duplicates; `data` is unchanged.
data %>%
  summarise(duplicates = sum(duplicated(select(., -S.No.))))
Answer: Duplicate records were counted to check data consistency and reliability; this step only reports how many duplicates exist and does not remove them.
# Derive the vehicle age in years, measured against the fixed reference
# year 2026.
data <- mutate(data, Car_Age = 2026 - Year)
Answer: Car_Age was derived from the Year column (as 2026 − Year) to capture the effect of vehicle age on price.
# Histogram of listing prices (30 bins). Rows with a missing Price are
# dropped by ggplot2 with a warning.
ggplot(data, aes(x = Price)) +
  geom_histogram(bins = 30, fill = "maroon", color = "black") +
  labs(
    title = "Distribution of Car Prices",
    x = "Price (in Lakhs)",
    y = "Count"
  ) +
  theme_minimal()
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_bin()`).
Answer: The histogram shows that car prices are positively skewed, with a majority of vehicles clustered in the lower price range.
# Histogram of odometer readings (30 bins).
ggplot(data, aes(x = Kilometers_Driven)) +
  geom_histogram(bins = 30, fill = "darkorange", color = "black") +
  labs(
    title = "Distribution of Kilometers Driven",
    x = "Kilometers Driven",
    y = "Count"
  ) +
  theme_minimal()
Answer: The histogram shows that most vehicles have a moderate number of kilometers driven, while a few exhibit very high usage (up to 6,500,000 km), indicating the presence of outliers.
# Bar chart of how many listings use each fuel type. The counts are
# pre-computed with count(), so the bars are drawn directly from column `n`.
# The legend is hidden because the x axis already names each fuel type.
data |>
  count(Fuel_Type) |>
  ggplot(aes(x = Fuel_Type, y = n, fill = Fuel_Type)) +
  geom_col() +
  labs(
    title = "Fuel Type Frequency",
    x = "Fuel Type",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
Answer The frequency analysis shows that
petrol and diesel vehicles are the most common in the dataset.
# Histogram of manufacturing year (20 bins).
ggplot(data, aes(x = Year)) +
  geom_histogram(bins = 20, fill = "purple", color = "black") +
  labs(
    title = "Distribution of Manufacturing Year",
    x = "Year",
    y = "Count"
  ) +
  theme_minimal()
Answer The histogram indicates that the dataset is dominated by cars manufactured in recent years, with fewer older vehicles.
# Histogram of the derived Car_Age column (20 bins).
ggplot(data, aes(x = Car_Age)) +
  geom_histogram(bins = 20, fill = "seagreen", color = "black") +
  labs(
    title = "Distribution of Car Age",
    x = "Car Age",
    y = "Count"
  ) +
  theme_minimal()
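Answer: Because Car_Age is computed as 2026 − Year and the newest cars are from 2019, every vehicle is at least 7 years old. Most fall roughly between 10 and 15 years (median about 12), with fewer much older vehicles in the dataset.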
# Scatter of price against car age with a straight-line (lm) fit overlaid;
# the confidence band is switched off.
ggplot(data, aes(x = Car_Age, y = Price)) +
  geom_point(alpha = 0.6, color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Car Age vs Price",
    x = "Car Age",
    y = "Price"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1234 rows containing missing values or values outside the scale range
## (`geom_point()`).
Answer The scatter plot shows that price decreases as car age increases, reflecting the depreciation of vehicles over time.
# Scatter of price against kilometers driven with a straight-line (lm) fit
# overlaid; the confidence band is switched off.
ggplot(data, aes(x = Kilometers_Driven, y = Price)) +
  geom_point(alpha = 0.6, color = "darkgreen") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Kilometers Driven vs Price",
    x = "Kilometers Driven",
    y = "Price"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1234 rows containing missing values or values outside the scale range
## (`geom_point()`).
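Answer: The scatter plot shows almost no linear relationship between kilometers driven and price: the fitted line is nearly flat, consistent with a correlation of about −0.01, and a few extreme kilometer readings stretch the x-axis. Kilometers driven alone therefore says little about vehicle value in this dataset.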
# Box plot of price for each fuel type. The legend is hidden because the
# x axis already labels every box.
ggplot(data, aes(x = Fuel_Type, y = Price, fill = Fuel_Type)) +
  geom_boxplot() +
  labs(
    title = "Fuel Type vs Price",
    x = "Fuel Type",
    y = "Price"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Answer The analysis indicates that fuel type plays a significant role in determining resale price, with diesel vehicles generally priced higher.
# Box plot of price by transmission, drawn in one panel per owner type.
ggplot(data, aes(x = Transmission, y = Price, fill = Transmission)) +
  geom_boxplot() +
  facet_wrap(vars(Owner_Type)) +
  labs(
    title = "Transmission & Owner Type vs Price",
    x = "Transmission",
    y = "Price"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Answer The analysis shows that both transmission type and ownership significantly influence car price, with automatic and first-owner vehicles commanding higher values
# Centre and spread of Price, ignoring rows where Price is missing.
summarise(
  data,
  mean_price = mean(Price, na.rm = TRUE),
  median_price = median(Price, na.rm = TRUE),
  sd_price = sd(Price, na.rm = TRUE)
)
Answer Mean > Median → Right Skewness It means: Most cars are cheap to mid-range Few cars are very expensive (luxury cars)
👉 Those few expensive cars: ➡ Pull the mean upward
# Interquartile range of Price (Q3 - Q1), ignoring missing prices.
IQR(data[["Price"]], na.rm = TRUE)
## [1] 6.45
Answer Measures range of middle 50% data 👉 Small IQR → consistent prices 👉 Large IQR → high variability
# Flag price outliers with the 1.5 * IQR rule: keep rows whose Price lies
# below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
# Rows with a missing Price are dropped by filter().
Q1 <- quantile(data$Price, probs = 0.25, na.rm = TRUE)
Q3 <- quantile(data$Price, probs = 0.75, na.rm = TRUE)
IQR_val <- Q3 - Q1
filter(
  data,
  Price < (Q1 - 1.5 * IQR_val) | Price > (Q3 + 1.5 * IQR_val)
)
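Answer Finds extreme values outside the normal range (more than 1.5 × IQR beyond the quartiles) 👉 These can distort analysis. With Q1 = 3.5 and IQR = 6.45 the lower fence is negative (3.5 − 9.675), so no car can be a lower outlier — even very cheap cars (old Maruti / Nano) are not flagged. All outliers are upper outliers priced above about 19.6 lakh: luxury cars such as Lamborghini, BMW, Audi.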
# Flag price outliers by z-score: standardise Price with its mean and
# standard deviation (missing prices ignored) and keep rows more than three
# standard deviations from the mean.
data |>
  mutate(
    z = (Price - mean(Price, na.rm = TRUE)) / sd(Price, na.rm = TRUE)
  ) |>
  filter(abs(z) > 3)
Answer Standardizes data Finds values far from mean Values beyond ±3 → unusual Since your data is right-skewed
Z-score may detect:
Mostly high-price outliers May miss some low-end ones
# Mean price per fuel type, ignoring missing prices.
data |>
  group_by(Fuel_Type) |>
  summarise(avg_price = mean(Price, na.rm = TRUE))
Answer Compares average price across groups 👉 Diesel cars → usually higher resale value
# Kernel density estimate of Price, drawn as a semi-transparent filled curve.
ggplot(data, aes(x = Price)) +
  geom_density(alpha = 0.5, fill = "blue")
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_density()`).
Answer Creates a smooth curve of price distribution Unlike histogram (bars), this shows a continuous distribution shape
Peak on left (low prices) Long tail on right (expensive cars)
Meaning: ➡ Most cars are cheap ➡ Few luxury cars stretch the distribution
Half of the cars are priced between about 3.5 and 10 lakhs (median ≈ 5.6 lakhs)
# Single box plot of Price across the whole dataset.
ggplot(data, aes(y = Price)) +
  geom_boxplot(fill = "orange")
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Answer Median is closer to bottom of box ➡ Data is right-skewed
many dots above the whiskers These are luxury / high-priced cars Market has premium extremes more than ultra-cheap ones
# Pearson correlation between Price and Engine, using only rows where both
# values are present.
with(data, cor(Price, Engine, use = "complete.obs"))
## [1] 0.6583536
Answer The correlation between price and engine is positive, indicating that vehicles with larger engine capacity tend to have higher prices. This reflects that engine size is an important factor influencing car value
# Correlation matrix of the five numeric variables of interest, computed on
# rows that are complete across all five columns.
cor(
  data[, c("Price", "Engine", "Power", "Kilometers_Driven", "Car_Age")],
  use = "complete.obs"
)
## Price Engine Power Kilometers_Driven
## Price 1.000000000 0.65753749 0.77256580 -0.008310286
## Engine 0.657537489 1.00000000 0.86618471 0.092989979
## Power 0.772565800 0.86618471 1.00000000 0.033503290
## Kilometers_Driven -0.008310286 0.09298998 0.03350329 1.000000000
## Car_Age -0.299511071 0.06826054 -0.01452523 0.169369261
## Car_Age
## Price -0.29951107
## Engine 0.06826054
## Power -0.01452523
## Kilometers_Driven 0.16936926
## Car_Age 1.00000000
Answer The correlation matrix shows that power has a strong positive relationship with price (0.77) and engine a moderate positive one (0.66). Car age has only a weak-to-moderate negative relationship (−0.30), whereas kilometers driven is essentially uncorrelated with price (−0.01), indicating limited linear influence on price.
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.5.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Heatmap of the correlation matrix of the five numeric variables.
# melt() reshapes the matrix to long form (Var1, Var2, value), one row per
# cell, which is what geom_tile() needs.
# A diverging fill scale centred on 0 with limits fixed at [-1, 1] is used so
# that positive and negative correlations are visually distinct; the default
# single-hue gradient only shows magnitude ordering, not sign.
data %>%
  select(Price, Engine, Power, Kilometers_Driven, Car_Age) %>%
  cor(use = "complete.obs") %>%
  melt() %>%
  ggplot(aes(Var1, Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = 0,
    limits = c(-1, 1)
  )
Answer The heatmap visually represents the correlation between variables. It shows that power has a strong positive relationship with price, while car age has only a weak-to-moderate negative relationship (about −0.30). Engine shows moderate positive correlation, and kilometers driven has almost no linear relationship with price. The heatmap helps quickly identify key factors affecting price
# Scatter-plot matrix of the numeric variables of interest.
# Kilometers_Driven is included so the matrix covers the same five variables
# as the correlation matrix and heatmap.
pairs(data[, c("Price", "Engine", "Power", "Kilometers_Driven", "Car_Age")])
Answer The pair plot visualizes relationships between multiple variables. It shows that price decreases with car age and increases with power and engine size. Kilometers driven shows weak association. It also indicates a strong relationship between engine and power, suggesting possible multicollinearity
# Simple linear regression of Price on Car_Age. lm() drops rows with a
# missing Price. The fit is stored in `model1` and its summary printed.
model1 <- lm(formula = Price ~ Car_Age, data = data)
summary(model1)
##
## Call:
## lm(formula = Price ~ Car_Age, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.354 -5.660 -3.561 1.147 146.716
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.6867 0.5485 41.36 <2e-16 ***
## Car_Age -1.0447 0.0420 -24.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.65 on 6017 degrees of freedom
## (1234 observations deleted due to missingness)
## Multiple R-squared: 0.09322, Adjusted R-squared: 0.09307
## F-statistic: 618.6 on 1 and 6017 DF, p-value: < 2.2e-16
Answer ➡ Every 1 year increase ➡ Price decreases by ~1.04 lakh
p-value < 0.05 ➡ Relationship is statistically significant
R² ≈ 0.093 ➡ Only 9.3% of price variation explained ➡ Age alone is not enough to predict price
Price depends on:
Power Engine Brand
# Simple linear regression of Price on Kilometers_Driven. The fit is stored
# in `model2` and its summary printed.
model2 <- lm(formula = Price ~ Kilometers_Driven, data = data)
summary(model2)
##
## Call:
## lm(formula = Price ~ Kilometers_Driven, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.095 -5.991 -3.871 0.459 150.473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.562e+00 1.715e-01 55.757 <2e-16 ***
## Kilometers_Driven -1.409e-06 1.580e-06 -0.892 0.373
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.19 on 6017 degrees of freedom
## (1234 observations deleted due to missingness)
## Multiple R-squared: 0.0001321, Adjusted R-squared: -3.409e-05
## F-statistic: 0.7949 on 1 and 6017 DF, p-value: 0.3727
Answer The regression shows that kilometers driven has a very weak and statistically insignificant effect on price. The extremely low R-squared value indicates that it does not explain price variation.
# Multiple linear regression of Price on age, usage, engine size and power.
# Rows missing any of these variables are dropped by lm(). The fit is stored
# in `model3` and its summary printed.
model3 <- lm(
  formula = Price ~ Car_Age + Kilometers_Driven + Engine + Power,
  data = data
)
summary(model3)
##
## Call:
## lm(formula = Price ~ Car_Age + Kilometers_Driven + Engine + Power,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.919 -3.049 -0.682 1.962 124.050
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.998e+00 3.947e-01 10.128 <2e-16 ***
## Car_Age -1.045e+00 2.687e-02 -38.904 <2e-16 ***
## Kilometers_Driven 1.574e-06 9.189e-07 1.713 0.0868 .
## Engine 8.465e-04 2.810e-04 3.013 0.0026 **
## Power 1.521e-01 3.123e-03 48.705 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.357 on 5871 degrees of freedom
## (1377 observations deleted due to missingness)
## Multiple R-squared: 0.6807, Adjusted R-squared: 0.6805
## F-statistic: 3129 on 4 and 5871 DF, p-value: < 2.2e-16
Answer The multiple regression model shows that car age negatively impacts price, while power and engine positively influence it. Power is the most significant factor. Kilometers driven is not statistically significant. The model explains about 68% of the variation in price, indicating a good fit. Multiple regression captures real-world complexity better than simple regression.
# Residual diagnostic for model3: residuals against fitted values, with a
# dashed reference line at zero. Plotting against fitted values (rather than
# row index) is what exposes non-constant variance or curvature in the fit.
plot(
  fitted(model3),
  resid(model3),
  xlab = "Fitted values",
  ylab = "Residuals"
)
abline(h = 0, lty = 2)
Answer Checks model accuracy Residual range: -50 to +124 ➡ Some large errors exist ➡ Likely due to:
Luxury cars Outliers Residual analysis is essential to validate regression assumptions and model reliability.
Balanced above & below zero ➡ Model is unbiased
# Mean price per manufacturing year (missing prices ignored), sorted from
# the highest average to the lowest.
data |>
  group_by(Year) |>
  summarise(avg_price = mean(Price, na.rm = TRUE)) |>
  arrange(desc(avg_price))
Answer Shows most valuable manufacturing years
# Scatter of price against the numeric Mileage column with a straight-line
# (lm) fit; the confidence band is left on (geom_smooth default).
ggplot(data, aes(x = Mileage, y = Price)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1236 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1236 rows containing missing values or values outside the scale range
## (`geom_point()`).
Answer The scatter plot with regression line shows a weak relationship between mileage and price. Although there is a slight trend, the wide dispersion of points indicates that mileage is not a strong predictor of price.
# Mean engine size per fuel type, ignoring missing Engine values.
data |>
  group_by(Fuel_Type) |>
  summarise(avg_engine = mean(Engine, na.rm = TRUE))
Answer Compares engine capacity across fuels
# Bar chart of how many listings have each seat count.
ggplot(data, aes(x = Seats)) +
  geom_bar()
## Warning: Removed 53 rows containing non-finite outside the scale range
## (`stat_count()`).
Answer Shows common seating capacity
# Mean price per seat count, ignoring missing prices. Rows with a missing
# Seats value form their own NA group.
data |>
  group_by(Seats) |>
  summarise(avg_price = mean(Price, na.rm = TRUE))
Answer Shows how capacity affects price
# Number of listings for every fuel type / transmission combination, most
# frequent first.
count(data, Fuel_Type, Transmission, sort = TRUE)
Answer Identifies common car configurations
# Mean power per brand, where the brand is taken to be the first
# space-separated word of Name. Missing Power values are ignored.
# NOTE(review): a brand name made of several words would be cut to its first
# word by this rule.
data |>
  mutate(Brand = word(Name, start = 1)) |>
  group_by(Brand) |>
  summarise(avg_power = mean(Power, na.rm = TRUE))
Answer Shows performance comparison by brand
# Number of listings per owner type, plus each type's share of all listings
# as a percentage.
data |>
  count(Owner_Type) |>
  mutate(percent = n / sum(n) * 100)
Answer Shows ownership distribution
# Listings with under 20,000 km driven that are priced above 10.
# Rows with a missing Price are dropped by filter().
filter(data, Kilometers_Driven < 20000 & Price > 10)
Answer Identifies premium low-usage cars