#Load Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(tidyr)
library(readr)
library(tibble)
library(stringr)
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Level 1: BASIC DATA UNDERSTANDING

Q1: What is the structure of the dataset?

# Read the used-car listings from a local CSV file.
# NOTE(review): this absolute path is specific to one machine; adjust it
# before running elsewhere.
data <- read.csv(file = "C:/Users/ACER/Downloads/used_cars_data.csv")
# Print the data frame, then show each column's type and leading values.
data
str(object = data)
## 'data.frame':    7253 obs. of  14 variables:
##  $ S.No.            : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ Name             : chr  "Maruti Wagon R LXI CNG" "Hyundai Creta 1.6 CRDi SX Option" "Honda Jazz V" "Maruti Ertiga VDI" ...
##  $ Location         : chr  "Mumbai" "Pune" "Chennai" "Chennai" ...
##  $ Year             : int  2010 2015 2011 2012 2013 2012 2013 2016 2013 2012 ...
##  $ Kilometers_Driven: int  72000 41000 46000 87000 40670 75000 86999 36000 64430 65932 ...
##  $ Fuel_Type        : chr  "CNG" "Diesel" "Petrol" "Diesel" ...
##  $ Transmission     : chr  "Manual" "Manual" "Manual" "Manual" ...
##  $ Owner_Type       : chr  "First" "First" "First" "First" ...
##  $ Mileage          : chr  "26.6 km/kg" "19.67 kmpl" "18.2 kmpl" "20.77 kmpl" ...
##  $ Engine           : chr  "998 CC" "1582 CC" "1199 CC" "1248 CC" ...
##  $ Power            : chr  "58.16 bhp" "126.2 bhp" "88.7 bhp" "88.76 bhp" ...
##  $ Seats            : int  5 5 5 7 5 5 5 8 5 5 ...
##  $ New_Price        : chr  "" "" "8.61 Lakh" "" ...
##  $ Price            : num  1.75 12.5 4.5 6 17.74 ...

Answer: Displays the structure of the dataset including data types and sample values

Q2: What are the summary statistics of the dataset?

# Per-column summary: quantiles for numeric columns, length/class for text.
summary(data)
##      S.No.          Name             Location              Year     
##  Min.   :   0   Length:7253        Length:7253        Min.   :1996  
##  1st Qu.:1813   Class :character   Class :character   1st Qu.:2011  
##  Median :3626   Mode  :character   Mode  :character   Median :2014  
##  Mean   :3626                                         Mean   :2013  
##  3rd Qu.:5439                                         3rd Qu.:2016  
##  Max.   :7252                                         Max.   :2019  
##                                                                     
##  Kilometers_Driven  Fuel_Type         Transmission        Owner_Type       
##  Min.   :    171   Length:7253        Length:7253        Length:7253       
##  1st Qu.:  34000   Class :character   Class :character   Class :character  
##  Median :  53416   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :  58699                                                           
##  3rd Qu.:  73000                                                           
##  Max.   :6500000                                                           
##                                                                            
##    Mileage             Engine             Power               Seats      
##  Length:7253        Length:7253        Length:7253        Min.   : 0.00  
##  Class :character   Class :character   Class :character   1st Qu.: 5.00  
##  Mode  :character   Mode  :character   Mode  :character   Median : 5.00  
##                                                           Mean   : 5.28  
##                                                           3rd Qu.: 5.00  
##                                                           Max.   :10.00  
##                                                           NA's   :53     
##   New_Price             Price        
##  Length:7253        Min.   :  0.440  
##  Class :character   1st Qu.:  3.500  
##  Mode  :character   Median :  5.640  
##                     Mean   :  9.479  
##                     3rd Qu.:  9.950  
##                     Max.   :160.000  
##                     NA's   :1234

Answer: Provides summary statistics like mean, median, min, max, and missing values.

Q3: How many missing values are present in each column?

# Count NA values in every column and transpose so that each column of the
# data set becomes one row of the result. Only true NA values are counted;
# empty strings are not.
t(summarise(data, across(everything(), \(column) sum(is.na(column)))))
##                   [,1]
## S.No.                0
## Name                 0
## Location             0
## Year                 0
## Kilometers_Driven    0
## Fuel_Type            0
## Transmission         0
## Owner_Type           0
## Mileage              0
## Engine               0
## Power                0
## Seats               53
## New_Price            0
## Price             1234

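Answer: Only Seats (53) and Price (1234) contain NA values. New_Price also has many blank entries, but they are read in as empty strings rather than NA, so they do not show up in this NA count even though New_Price is likely the column with the largest share of missing data.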

LEVEL 2: DATA PREPROCESSING

Q4: Convert data types

# Strip the unit suffixes from Mileage, Engine and Power by keeping only the
# leading number, then convert to numeric. Values without any digits
# become NA.
data <- mutate(
  data,
  Mileage = as.numeric(str_extract(Mileage, "\\d+\\.?\\d*")),
  Engine  = as.numeric(str_extract(Engine, "\\d+")),
  Power   = as.numeric(str_extract(Power, "\\d+\\.?\\d*"))
)
# Print the converted data frame.
data

Answer: Text-based numerical features were cleaned and converted into numeric format for proper analysis

Q5: Check and remove duplicates

# Flag rows that repeat an earlier row in every column except S.No.
# S.No. appears to be a per-row serial number (0 to 7252 for 7253 rows), so
# comparing whole rows with S.No. included can never find a duplicate.
is_duplicate <- duplicated(data[setdiff(names(data), "S.No.")])

# Report how many duplicate records were found.
data.frame(duplicates = sum(is_duplicate))

# Keep only the first occurrence of each record.
data <- data[!is_duplicate, , drop = FALSE]

Answer: The dataset is checked for duplicate records so that repeated listings do not undermine data consistency and reliability.

Q6: Create a new feature (Car_Age)

# Derive the age of each car in years, measured from the fixed reference
# year 2026.
data <- mutate(data, Car_Age = 2026 - Year)

Answer: Car_Age was derived from the Year column (2026 - Year) to capture the effect of vehicle age on price.

LEVEL 3: Univariate Analysis

Q7: What is the distribution of car prices?

# Histogram of Price using 30 bins.
ggplot(data, aes(x = Price)) +
  geom_histogram(bins = 30, fill = "maroon", color = "black") +
  labs(
    title = "Distribution of Car Prices",
    x = "Price (in Lakhs)",
    y = "Count"
  ) +
  theme_minimal()
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_bin()`).

Answer: The histogram shows that car prices are positively skewed, with a majority of vehicles clustered in the lower price range.

Q8: What is the distribution of kilometers driven?

# Histogram of Kilometers_Driven using 30 bins.
ggplot(data, aes(x = Kilometers_Driven)) +
  geom_histogram(bins = 30, fill = "darkorange", color = "black") +
  labs(
    title = "Distribution of Kilometers Driven",
    x = "Kilometers Driven",
    y = "Count"
  ) +
  theme_minimal()

Answer The histogram shows that most vehicles have moderate mileage, while a few exhibit very high usage, indicating the presence of outliers

Q9: What is the frequency of fuel types?

# Bar chart of the number of listings per fuel type. The counts are computed
# first, so the bars are drawn directly from the `n` column.
ggplot(count(data, Fuel_Type), aes(x = Fuel_Type, y = n, fill = Fuel_Type)) +
  geom_col() +
  labs(
    title = "Fuel Type Frequency",
    x = "Fuel Type",
    y = "Count"
  ) +
  theme_minimal() +
  # The x-axis already names each fuel type, so the fill legend is dropped.
  theme(legend.position = "none")

Answer The frequency analysis shows that petrol and diesel vehicles are the most common in the dataset.

Q10: What is the distribution of manufacturing year?

# Histogram of manufacturing Year using 20 bins.
ggplot(data, aes(x = Year)) +
  geom_histogram(bins = 20, fill = "purple", color = "black") +
  labs(
    title = "Distribution of Manufacturing Year",
    x = "Year",
    y = "Count"
  ) +
  theme_minimal()

Answer The histogram indicates that the dataset is dominated by cars manufactured in recent years, with fewer older vehicles.

Q11: What is the distribution of car age?

# Histogram of the derived Car_Age column using 20 bins.
ggplot(data, aes(x = Car_Age)) +
  geom_histogram(bins = 20, fill = "seagreen", color = "black") +
  labs(
    title = "Distribution of Car Age",
    x = "Car Age",
    y = "Count"
  ) +
  theme_minimal()

Answer The distribution of car age shows that most vehicles are less than 10 years old, with fewer older vehicles in the dataset.

LEVEL 4: Bivariate Analysis

Q12: How does car age affect price?

# Scatter plot of Price against Car_Age with a straight-line (lm) fit and no
# confidence band.
ggplot(data, aes(x = Car_Age, y = Price)) +
  geom_point(alpha = 0.6, color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Car Age vs Price",
    x = "Car Age",
    y = "Price"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1234 rows containing missing values or values outside the scale range
## (`geom_point()`).

Answer The scatter plot shows that price decreases as car age increases, reflecting the depreciation of vehicles over time.

Q13: How do kilometers driven affect price?

# Scatter plot of Price against Kilometers_Driven with a straight-line (lm)
# fit and no confidence band.
ggplot(data, aes(x = Kilometers_Driven, y = Price)) +
  geom_point(alpha = 0.6, color = "darkgreen") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Kilometers Driven vs Price",
    x = "Kilometers Driven",
    y = "Price"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1234 rows containing missing values or values outside the scale range
## (`geom_point()`).

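Answer: The fitted line slopes only very slightly downward as kilometers driven increases. A few extreme Kilometers_Driven values (maximum 6,500,000 against a third quartile of 73,000) stretch the x-axis, and the simple regression of Price on Kilometers_Driven later in this report gives a slope of about -1.4e-06 with p = 0.373, so this plot alone does not establish that usage lowers price.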

Q14: Does fuel type affect price?

# Box plot of Price for each fuel type.
ggplot(data, aes(x = Fuel_Type, y = Price, fill = Fuel_Type)) +
  geom_boxplot() +
  labs(
    title = "Fuel Type vs Price",
    x = "Fuel Type",
    y = "Price"
  ) +
  theme_minimal() +
  # The x-axis already names each fuel type, so the fill legend is dropped.
  theme(legend.position = "none")
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Answer The analysis indicates that fuel type plays a significant role in determining resale price, with diesel vehicles generally priced higher.

Q15: How do transmission and owner type affect price?

# Box plot of Price by transmission, drawn in one panel per owner type.
ggplot(data, aes(x = Transmission, y = Price, fill = Transmission)) +
  geom_boxplot() +
  facet_wrap(~ Owner_Type) +
  labs(
    title = "Transmission & Owner Type vs Price",
    x = "Transmission",
    y = "Price"
  ) +
  theme_minimal() +
  # The x-axis already names each transmission, so the fill legend is dropped.
  theme(legend.position = "none")
## Warning: Removed 1234 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Answer The analysis shows that both transmission type and ownership significantly influence car price, with automatic and first-owner vehicles commanding higher values

LEVEL 5: ADVANCED DATA ANALYSIS

Q16: Which car brands appear most frequently?

# Take the first word of Name as the brand and count listings per brand,
# most frequent first.
data %>%
  count(Brand = word(Name, 1), sort = TRUE)

Answer This identifies the most common car brands in the dataset. Helps understand market dominance and popularity.

Q17: What is the average price by fuel type?

# Mean Price per fuel type, ignoring rows where Price is NA.
summarise(
  group_by(data, Fuel_Type),
  avg_price = mean(Price, na.rm = TRUE)
)

Answer 👉 Compares average prices across fuel types. 👉 Shows how fuel type influences car pricing.

Q19: Which locations have the most cars?

# Number of listings per location, largest first.
count(data, Location, sort = TRUE)

Answer 👉 Identifies cities with highest number of listings. 👉 Reflects regional availability and demand.

Q20: What is the average price by location?

# Mean Price per location, ignoring rows where Price is NA.
summarise(
  group_by(data, Location),
  avg_price = mean(Price, na.rm = TRUE)
)

Answer 👉 Shows price variation across cities. 👉 Indicates regional price differences.

Q21: Top 10 most expensive cars

# Sort by Price from highest to lowest and keep the first ten rows.
head(arrange(data, desc(Price)), 10)

Answer 👉 Identifies high-end vehicles. 👉 Useful for understanding premium segment.

Q22: Top 10 cheapest cars

# Sort by Price from lowest to highest and keep the first ten rows.
head(arrange(data, Price), 10)

Answer 👉 Highlights budget-friendly cars. 👉 Shows low-price segment of market.

Q23: Average KM driven by owner type

# Mean Kilometers_Driven per owner type, ignoring NA values.
summarise(
  group_by(data, Owner_Type),
  avg_km = mean(Kilometers_Driven, na.rm = TRUE)
)

Answer 👉 Compares usage across owners. 👉 Indicates usage pattern based on ownership

Q24: Correlation heatmap

library(reshape2)
## Warning: package 'reshape2' was built under R version 4.5.3
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Correlation matrix of the numeric features, computed on rows with no
# missing value in any of the five columns, melted to long form
# (Var1, Var2, value) and drawn as a tile heatmap.
ggplot(
  melt(
    cor(
      select(data, Price, Kilometers_Driven, Engine, Power, Car_Age),
      use = "complete.obs"
    )
  ),
  aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile()

Answer Visual representation of correlations.

Q25: Relationship between engine and price

# Scatter plot of Price against Engine with a straight-line (lm) fit.
ggplot(data, aes(x = Engine, y = Price)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1270 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1270 rows containing missing values or values outside the scale range
## (`geom_point()`).

Answer 👉 Shows relationship between engine capacity and price. 👉 Indicates that higher engine capacity generally increases price.

Q26: Pairwise scatterplot?

# Scatter-plot matrix of every pair among the four numeric columns.
pairs(data[c("Price", "Engine", "Power", "Kilometers_Driven")])

Answer Shows relationships between all variable pairs.

Q27: Simple Linear Regression (Price vs Car Age)?

# Simple linear regression of Price on Car_Age; the summary reports the
# coefficients, R-squared and p-values.
model1 <- lm(formula = Price ~ Car_Age, data = data)
summary(model1)
## 
## Call:
## lm(formula = Price ~ Car_Age, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.354  -5.660  -3.561   1.147 146.716 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.6867     0.5485   41.36   <2e-16 ***
## Car_Age      -1.0447     0.0420  -24.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.65 on 6017 degrees of freedom
##   (1234 observations deleted due to missingness)
## Multiple R-squared:  0.09322,    Adjusted R-squared:  0.09307 
## F-statistic: 618.6 on 1 and 6017 DF,  p-value: < 2.2e-16

Answer: Car_Age significantly affects Price, and the negative coefficient indicates depreciation. Coefficients: the intercept (22.69) is the predicted price at Car_Age = 0, and the Car_Age coefficient (-1.04) is the change in price per additional year of age; because it is negative, price decreases with age (depreciation). R-squared: the share of price variation explained by age; here it is 0.093, so age alone explains about 9% of the variation. p-value: tells whether the relationship is statistically significant; the very small p-value (< 2e-16) is strong evidence that the relationship is real.

KM Vs Price

# Simple linear regression of Price on Kilometers_Driven; the summary reports
# the coefficients, R-squared and p-values.
model2 <- lm(formula = Price ~ Kilometers_Driven, data = data)
summary(model2)
## 
## Call:
## lm(formula = Price ~ Kilometers_Driven, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -9.095  -5.991  -3.871   0.459 150.473 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        9.562e+00  1.715e-01  55.757   <2e-16 ***
## Kilometers_Driven -1.409e-06  1.580e-06  -0.892    0.373    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.19 on 6017 degrees of freedom
##   (1234 observations deleted due to missingness)
## Multiple R-squared:  0.0001321,  Adjusted R-squared:  -3.409e-05 
## F-statistic: 0.7949 on 1 and 6017 DF,  p-value: 0.3727

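Answer ✔ Coefficients: show the direction and size of the relationship. The Kilometers_Driven coefficient is about -1.4e-06, i.e. slightly negative but practically zero. ✔ R-squared: 0.00013, so kilometers driven alone explains almost none of the price variation. ✔ p-value: 0.373, which is well above 0.05, so the relationship is not statistically significant in this model.

➡ The estimated relationship is negative, but this model gives no reliable evidence that price decreases as kilometers driven increases.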

Multiple regression model

# Multiple linear regression of Price on age, usage, engine capacity and
# power together; the summary reports each predictor's coefficient and
# p-value.
model3 <- lm(
  formula = Price ~ Car_Age + Kilometers_Driven + Engine + Power,
  data = data
)
summary(model3)
## 
## Call:
## lm(formula = Price ~ Car_Age + Kilometers_Driven + Engine + Power, 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.919  -3.049  -0.682   1.962 124.050 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        3.998e+00  3.947e-01  10.128   <2e-16 ***
## Car_Age           -1.045e+00  2.687e-02 -38.904   <2e-16 ***
## Kilometers_Driven  1.574e-06  9.189e-07   1.713   0.0868 .  
## Engine             8.465e-04  2.810e-04   3.013   0.0026 ** 
## Power              1.521e-01  3.123e-03  48.705   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.357 on 5871 degrees of freedom
##   (1377 observations deleted due to missingness)
## Multiple R-squared:  0.6807, Adjusted R-squared:  0.6805 
## F-statistic:  3129 on 4 and 5871 DF,  p-value: < 2.2e-16

Answer 🔹 3) What model is learning

👉 It checks combined effect of all variables

➡ Not just one factor ➡ Real-world prediction (more accurate)

🔹 4) Output (summary(model3)) ✔ Coefficients Each variable has its own effect Example: Car_Age → negative Power → positive ✔ R-squared 👉 Higher than simple regression ➡ Model explains more variation

✔ p-values

👉 Shows which variables are important

Small p-value → significant Large p-value → not useful

Car price is influenced by multiple factors including age, engine capacity, and power; in this model the effect of usage (Kilometers_Driven, p = 0.0868) is not significant at the 5% level.

Q28: What is the average car age by fuel type?

# Mean Car_Age per fuel type, ignoring NA values.
summarise(
  group_by(data, Fuel_Type),
  avg_age = mean(Car_Age, na.rm = TRUE)
)

Answer Shows age distribution across fuel types

Q29: What is the median price by owner type?

# Median Price per owner type, ignoring rows where Price is NA.
summarise(
  group_by(data, Owner_Type),
  median_price = median(Price, na.rm = TRUE)
)

Answer Helps understand typical price per ownership level

Q30: How many cars fall in each price range?

# Bucket each car into a price segment and count the cars per segment.
# The "High" branch tests Price explicitly instead of using a catch-all
# TRUE, so rows with a missing Price get an NA category and are counted
# separately rather than being labelled "High".
data %>%
  mutate(Price_Category = case_when(
    Price < 5 ~ "Low",
    Price < 10 ~ "Medium",
    Price >= 10 ~ "High"
  )) %>%
  count(Price_Category)

Answer Categorizes cars into price segments

Q31: What is the average mileage by transmission type?

# Mean Mileage per transmission type, ignoring NA values.
summarise(
  group_by(data, Transmission),
  avg_mileage = mean(Mileage, na.rm = TRUE)
)

Answer Compares fuel efficiency by transmission

Q32: Which year has the highest average price?

# Mean Price per manufacturing year (NA prices ignored), ordered from the
# highest average to the lowest.
arrange(
  summarise(
    group_by(data, Year),
    avg_price = mean(Price, na.rm = TRUE)
  ),
  desc(avg_price)
)

Answer Shows most valuable manufacturing years

Q33: What is the relationship between mileage and price?

# Scatter plot of Price against Mileage with a straight-line (lm) fit.
ggplot(data, aes(x = Mileage, y = Price)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1236 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1236 rows containing missing values or values outside the scale range
## (`geom_point()`).

Answer Shows whether higher mileage affects price.

Q34: What is the average engine size by fuel type?

# Mean Engine value per fuel type, ignoring NA values.
summarise(
  group_by(data, Fuel_Type),
  avg_engine = mean(Engine, na.rm = TRUE)
)

Answer Compares engine capacity across fuels

Q35: What is the distribution of seats?

# Bar chart of the number of listings for each Seats value.
ggplot(data, aes(x = Seats)) +
  geom_bar()
## Warning: Removed 53 rows containing non-finite outside the scale range
## (`stat_count()`).

Answer Shows common seating capacity

Q36: What is the average price by number of seats?

# Mean Price for each Seats value, ignoring rows where Price is NA.
summarise(
  group_by(data, Seats),
  avg_price = mean(Price, na.rm = TRUE)
)

Answer Shows how capacity affects price

Q37: Which combinations of fuel and transmission are most common?

# Count listings for every fuel type / transmission pair, most common first.
count(data, Fuel_Type, Transmission, sort = TRUE)

Answer Identifies common car configurations

Q38: What is the average power by brand?

# Take the first word of Name as the brand, then average Power per brand,
# ignoring NA values.
data %>%
  group_by(Brand = word(Name, 1)) %>%
  summarise(avg_power = mean(Power, na.rm = TRUE))

Answer Shows performance comparison by brand

Q39: What percentage of cars belong to each owner type?

# Count listings per owner type and express each count as a percentage of
# all listings.
mutate(
  count(data, Owner_Type),
  percent = n / sum(n) * 100
)

Answer Shows ownership distribution

Q40: Which cars have both low KM and high price?

# Keep cars driven less than 20000 km whose Price is above 10.
filter(data, Kilometers_Driven < 20000, Price > 10)

Answer Identifies premium low-usage cars