library(stringr)
library(moments)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
#Loading the data
usedCars<-read.csv("used_cars_dataset.csv", stringsAsFactors = FALSE)
str(usedCars)
## 'data.frame':    14856 obs. of  11 variables:
##  $ Brand       : chr  "Honda" "Toyota" "Volkswagen" "Maruti Suzuki" ...
##  $ model       : chr  "City" "Innova" "VentoTest" "Swift" ...
##  $ Year        : int  2001 2009 2010 2017 2019 2014 2014 2019 2020 2017 ...
##  $ Age         : int  23 15 14 7 5 10 10 5 4 7 ...
##  $ kmDriven    : chr  "98,000 km" "190000.0 km" "77,246 km" "83,500 km" ...
##  $ Transmission: chr  "Manual" "Manual" "Manual" "Manual" ...
##  $ Owner       : chr  "second" "second" "first" "second" ...
##  $ FuelType    : chr  "Petrol" "Diesel" "Diesel" "Diesel" ...
##  $ PostedDate  : chr  "Nov-24" "Jul-24" "Nov-24" "Nov-24" ...
##  $ AdditionInfo: chr  "Honda City v teck in mint condition, valid genuine car," "Toyota Innova 2.5 G (Diesel) 7 Seater, 2009, Diesel" "Volkswagen Vento 2010-2013 Diesel Breeze, 2010, Diesel" "Maruti Suzuki Swift 2017 Diesel Good Condition" ...
##  $ AskPrice    : chr  "₹ 1,95,000" "₹ 3,75,000" "₹ 1,84,999" "₹ 5,65,000" ...

The dataset initially contained numerical fields such as price and mileage in text format, which required cleaning and conversion for analysis.

summary(usedCars)
##     Brand              model                Year           Age         
##  Length:14856       Length:14856       Min.   :1900   Min.   :  0.000  
##  Class :character   Class :character   1st Qu.:2014   1st Qu.:  5.000  
##  Mode  :character   Mode  :character   Median :2017   Median :  7.000  
##                                        Mean   :2016   Mean   :  7.666  
##                                        3rd Qu.:2019   3rd Qu.: 10.000  
##                                        Max.   :2024   Max.   :124.000  
##    kmDriven         Transmission          Owner             FuelType        
##  Length:14856       Length:14856       Length:14856       Length:14856      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   PostedDate        AdditionInfo         AskPrice        
##  Length:14856       Length:14856       Length:14856      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
## 
#Checking missing values
colSums(is.na(usedCars))
##        Brand        model         Year          Age     kmDriven Transmission 
##            0            0            0            0            0            0 
##        Owner     FuelType   PostedDate AdditionInfo     AskPrice 
##            0            0            0            0            0

No missing values were detected, ensuring data completeness and reliability for further analysis.

#Cleaning the price column
usedCars$AskPrice <- str_replace_all(usedCars$AskPrice, "₹ ","")
usedCars$AskPrice <- str_replace_all(usedCars$AskPrice, ",", "")
usedCars$AskPrice <- as.numeric(usedCars$AskPrice)
str(usedCars)
## 'data.frame':    14856 obs. of  11 variables:
##  $ Brand       : chr  "Honda" "Toyota" "Volkswagen" "Maruti Suzuki" ...
##  $ model       : chr  "City" "Innova" "VentoTest" "Swift" ...
##  $ Year        : int  2001 2009 2010 2017 2019 2014 2014 2019 2020 2017 ...
##  $ Age         : int  23 15 14 7 5 10 10 5 4 7 ...
##  $ kmDriven    : chr  "98,000 km" "190000.0 km" "77,246 km" "83,500 km" ...
##  $ Transmission: chr  "Manual" "Manual" "Manual" "Manual" ...
##  $ Owner       : chr  "second" "second" "first" "second" ...
##  $ FuelType    : chr  "Petrol" "Diesel" "Diesel" "Diesel" ...
##  $ PostedDate  : chr  "Nov-24" "Jul-24" "Nov-24" "Nov-24" ...
##  $ AdditionInfo: chr  "Honda City v teck in mint condition, valid genuine car," "Toyota Innova 2.5 G (Diesel) 7 Seater, 2009, Diesel" "Volkswagen Vento 2010-2013 Diesel Breeze, 2010, Diesel" "Maruti Suzuki Swift 2017 Diesel Good Condition" ...
##  $ AskPrice    : num  195000 375000 184999 565000 685000 ...

The price column was cleaned by removing currency symbols and commas, and converted into numeric format to enable quantitative analysis.

# Cleaning the km driven column
usedCars$kmDriven <- str_replace_all(usedCars$kmDriven, ",","")
usedCars$kmDriven <- str_replace_all(usedCars$kmDriven, " km", "")
usedCars$kmDriven <- as.numeric(usedCars$kmDriven)
str(usedCars)
## 'data.frame':    14856 obs. of  11 variables:
##  $ Brand       : chr  "Honda" "Toyota" "Volkswagen" "Maruti Suzuki" ...
##  $ model       : chr  "City" "Innova" "VentoTest" "Swift" ...
##  $ Year        : int  2001 2009 2010 2017 2019 2014 2014 2019 2020 2017 ...
##  $ Age         : int  23 15 14 7 5 10 10 5 4 7 ...
##  $ kmDriven    : num  98000 190000 77246 83500 45000 ...
##  $ Transmission: chr  "Manual" "Manual" "Manual" "Manual" ...
##  $ Owner       : chr  "second" "second" "first" "second" ...
##  $ FuelType    : chr  "Petrol" "Diesel" "Diesel" "Diesel" ...
##  $ PostedDate  : chr  "Nov-24" "Jul-24" "Nov-24" "Nov-24" ...
##  $ AdditionInfo: chr  "Honda City v teck in mint condition, valid genuine car," "Toyota Innova 2.5 G (Diesel) 7 Seater, 2009, Diesel" "Volkswagen Vento 2010-2013 Diesel Breeze, 2010, Diesel" "Maruti Suzuki Swift 2017 Diesel Good Condition" ...
##  $ AskPrice    : num  195000 375000 184999 565000 685000 ...

The mileage column was standardized by removing text and converting it into numeric format for accurate analysis.

#Standardizing the text data
usedCars$Transmission <- str_to_title(usedCars$Transmission)
usedCars$Owner <- str_to_title(usedCars$Owner)
usedCars$FuelType <- str_to_title(usedCars$FuelType)

Text variables were standardized (converted to title case) to avoid inconsistencies caused by different capitalizations.

# Removing duplicates
usedCars <- unique(usedCars)
str(usedCars)
## 'data.frame':    13850 obs. of  11 variables:
##  $ Brand       : chr  "Honda" "Toyota" "Volkswagen" "Maruti Suzuki" ...
##  $ model       : chr  "City" "Innova" "VentoTest" "Swift" ...
##  $ Year        : int  2001 2009 2010 2017 2019 2014 2014 2019 2020 2017 ...
##  $ Age         : int  23 15 14 7 5 10 10 5 4 7 ...
##  $ kmDriven    : num  98000 190000 77246 83500 45000 ...
##  $ Transmission: chr  "Manual" "Manual" "Manual" "Manual" ...
##  $ Owner       : chr  "Second" "Second" "First" "Second" ...
##  $ FuelType    : chr  "Petrol" "Diesel" "Diesel" "Diesel" ...
##  $ PostedDate  : chr  "Nov-24" "Jul-24" "Nov-24" "Nov-24" ...
##  $ AdditionInfo: chr  "Honda City v teck in mint condition, valid genuine car," "Toyota Innova 2.5 G (Diesel) 7 Seater, 2009, Diesel" "Volkswagen Vento 2010-2013 Diesel Breeze, 2010, Diesel" "Maruti Suzuki Swift 2017 Diesel Good Condition" ...
##  $ AskPrice    : num  195000 375000 184999 565000 685000 ...

Duplicate records were removed to improve data quality and prevent biased analysis.

# Handling outliers
boxplot(usedCars$kmDriven)

usedCars <- usedCars[usedCars$kmDriven > 1000 & usedCars$kmDriven < 150000, ]
# Handling outliers
boxplot(usedCars$Year)

usedCars <- usedCars[usedCars$Year > 1999, ]
usedCars <- usedCars[usedCars$AskPrice < 15000000, ]

Outliers were filtered to ensure realistic data ranges and improve model accuracy.

#Saving cleaned dataset
write.csv(usedCars, "used_cars_clean.csv", row.names = FALSE)
#EDA
#1. Overall summary, mean, median, mode, standard deviation
summary(usedCars)
##     Brand              model                Year           Age        
##  Length:13050       Length:13050       Min.   :2000   Min.   : 0.000  
##  Class :character   Class :character   1st Qu.:2014   1st Qu.: 5.000  
##  Mode  :character   Mode  :character   Median :2017   Median : 7.000  
##                                        Mean   :2016   Mean   : 7.524  
##                                        3rd Qu.:2019   3rd Qu.:10.000  
##                                        Max.   :2024   Max.   :24.000  
##     kmDriven      Transmission          Owner             FuelType        
##  Min.   :  1022   Length:13050       Length:13050       Length:13050      
##  1st Qu.: 44330   Class :character   Class :character   Class :character  
##  Median : 65000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 65047                                                           
##  3rd Qu.: 84515                                                           
##  Max.   :149500                                                           
##   PostedDate        AdditionInfo          AskPrice       
##  Length:13050       Length:13050       Min.   :   15000  
##  Class :character   Class :character   1st Qu.:  350000  
##  Mode  :character   Mode  :character   Median :  570000  
##                                        Mean   :  960956  
##                                        3rd Qu.:  995000  
##                                        Max.   :14900000

The dataset shows a wide range of values across price, mileage, and manufacturing year. The spread indicates a diverse mix of vehicles from different segments, ranging from low-cost older cars to relatively newer and higher-priced vehicles.

mean(usedCars$AskPrice)
## [1] 960956.2
median(usedCars$AskPrice)
## [1] 570000
sd(usedCars$AskPrice)
## [1] 1293931

The mean price is higher than the median, indicating that the distribution of car prices is positively skewed. This suggests the presence of a few high-value cars that pull the average upward. The standard deviation is relatively high, reflecting significant variation in car prices across the dataset.

#2
quantile(usedCars$AskPrice, probs = c(0.25, 0.5, 0.75))
##    25%    50%    75% 
## 350000 570000 995000

The interquartile values show that 50% of the cars lie within a moderate price range. This confirms that most vehicles fall in the mid-market segment, with fewer cars in extreme high or low price ranges.

#3. Skewness
skewness(usedCars$AskPrice)
## [1] 4.451038

The skewness value is positive, means that the price distribution is right-skewed, confirming the presence of a few high-priced cars influencing the distribution.

#4
range(usedCars$AskPrice)
## [1]    15000 14900000

Price range shows a large gap between minimum and maximum values, indicating a wide market spectrum.

range(usedCars$kmDriven)
## [1]   1022 149500

kmDriven varies significantly, suggesting different levels of vehicle usage.

range(usedCars$Year)
## [1] 2000 2024

Year range confirms presence of both older and relatively newer vehicles.

#5. Inter Quartile Range
IQR(usedCars$AskPrice)
## [1] 645000
IQR(usedCars$kmDriven)
## [1] 40184.75

The IQR values indicate the spread of the middle 50% of the data. A relatively high IQR for price suggests variability even within the central portion of the dataset, highlighting market diversity.

#6. Correlation
cor(usedCars$AskPrice, usedCars$kmDriven)    # negative relation- the more kms driven, lesser the price
## [1] -0.2403548
cor(usedCars$AskPrice, usedCars$Year)    # positive relation- latest year, higher price
## [1] 0.3497914

Price decreases with higher mileage and increases with newer manufacturing year, confirming expected depreciation trends.

#7. frequency table
table(usedCars$FuelType)
## 
##     Diesel     Hybrid Hybrid/Cng     Petrol 
##       4378       1735       1699       5238

Petrol cars dominate the dataset, followed by diesel

table(usedCars$Transmission)
## 
## Automatic    Manual 
##      6462      6588

Manual transmission vehicles are slightly more common than automatic

#8. proportion
prop.table(table(usedCars$Brand))
## 
##    Ambassador          Audi         Bajaj           BMW     Chevrolet 
##  7.662835e-05  2.084291e-02  3.065134e-04  2.842912e-02  8.735632e-03 
##       Citroen        Datsun          Fiat         Force          Ford 
##  1.532567e-04  3.524904e-03  1.992337e-03  3.831418e-04  2.758621e-02 
##         Honda        Hummer       Hyundai         Isuzu        Jaguar 
##  8.865900e-02  7.662835e-05  1.704981e-01  8.429119e-04  3.601533e-03 
##          Jeep           Kia    Land Rover         Lexus      Mahindra 
##  7.432950e-03  1.578544e-02  6.973180e-03  1.992337e-03  4.996169e-02 
## Maruti Suzuki      Maserati Mercedes-Benz            MG          Mini 
##  3.115709e-01  7.662835e-05  3.954023e-02  8.122605e-03  3.065134e-03 
##    Mitsubishi        Nissan          Opel       Porsche       Renault 
##  1.379310e-03  8.505747e-03  1.532567e-04  1.992337e-03  2.643678e-02 
##         Skoda     Ssangyong          Tata        Toyota   Toyota Land 
##  1.938697e-02  3.831418e-04  4.038314e-02  6.337165e-02  1.532567e-04 
##    Volkswagen         Volvo 
##  3.310345e-02  4.521073e-03

A few brands dominate the market, indicating brand concentration in the used car segment.

prop.table(table(usedCars$FuelType))
## 
##     Diesel     Hybrid Hybrid/Cng     Petrol 
##  0.3354789  0.1329502  0.1301916  0.4013793

Fuel distribution shows petrol vehicles forming the majority segment followed by Diesel

#9. average price by fuel, brand
aggregate(AskPrice ~ FuelType, data=usedCars, mean)
##     FuelType  AskPrice
## 1     Diesel 1395971.7
## 2     Hybrid  530185.3
## 3 Hybrid/Cng  550119.2
## 4     Petrol  873308.5
# which fuel type or brand is most expensive

Diesel cars have a higher average resale value compared to petrol and other fuel types. This may be due to better fuel efficiency and higher demand for long-distance usage.

aggregate(AskPrice ~ Brand, usedCars, mean)
##            Brand  AskPrice
## 1     Ambassador  421000.0
## 2           Audi 1850168.9
## 3          Bajaj  219750.0
## 4            BMW 2917848.1
## 5      Chevrolet  246030.2
## 6        Citroen 1950000.0
## 7         Datsun  240826.0
## 8           Fiat  255957.7
## 9          Force 1252200.0
## 10          Ford 1030233.0
## 11         Honda  510758.3
## 12        Hummer 9800000.0
## 13       Hyundai  578536.4
## 14         Isuzu 1696272.7
## 15        Jaguar 2614553.1
## 16          Jeep 1758600.3
## 17           Kia 1466524.7
## 18    Land Rover 5133880.3
## 19         Lexus 4851923.0
## 20      Mahindra  961130.5
## 21 Maruti Suzuki  485533.8
## 22      Maserati 8000000.0
## 23 Mercedes-Benz 3098073.0
## 24            MG 1605113.3
## 25          Mini 2921975.0
## 26    Mitsubishi  777344.4
## 27        Nissan  442872.1
## 28          Opel  129000.0
## 29       Porsche 7371884.6
## 30       Renault  439164.0
## 31         Skoda  960030.6
## 32     Ssangyong  508000.6
## 33          Tata  670779.5
## 34        Toyota 1775640.0
## 35   Toyota Land 5700000.0
## 36    Volkswagen  670246.2
## 37         Volvo 2739775.8

Certain brands command higher average prices, indicating stronger brand value and perceived reliability in the resale market.

#10. median price by fuel
aggregate(AskPrice ~ FuelType, data = usedCars, median)
##     FuelType AskPrice
## 1     Diesel 899999.5
## 2     Hybrid 410000.0
## 3 Hybrid/Cng 425000.0
## 4     Petrol 515000.0

Median prices confirm that diesel cars consistently lie in a higher price range, reinforcing the trend observed in mean values.

#11. avg price by transmission
aggregate(AskPrice ~ Transmission, data=usedCars, mean)
##   Transmission AskPrice
## 1    Automatic  1399869
## 2       Manual   530438

Automatic transmission vehicles have higher average prices compared to manual ones. This reflects growing consumer preference for convenience.

#12. avg price by owner type
aggregate(AskPrice ~ Owner, data = usedCars, mean)
##    Owner  AskPrice
## 1  First 1147832.4
## 2 Second  776752.9

First-owner vehicles retain higher value, while resale price decreases with additional ownership.

#13. fuel vs kmdriven
aggregate(kmDriven ~ FuelType, usedCars, mean)
##     FuelType kmDriven
## 1     Diesel 76463.51
## 2     Hybrid 67121.10
## 3 Hybrid/Cng 66378.24
## 4     Petrol 54386.93

Diesel cars are driven more on average.

#14. multi-dimensional analysis-fuel+transmission
aggregate(AskPrice ~ FuelType + Transmission, data = usedCars, mean)
##     FuelType Transmission  AskPrice
## 1     Diesel    Automatic 2039212.1
## 2     Hybrid    Automatic  617058.8
## 3 Hybrid/Cng    Automatic  674619.5
## 4     Petrol    Automatic 1320305.8
## 5     Diesel       Manual  708977.6
## 6     Hybrid       Manual  445974.3
## 7 Hybrid/Cng       Manual  441661.4
## 8     Petrol       Manual  447311.0

Diesel + Automatic combinations tend to have the highest prices, indicating a premium segment combination.

Petrol + Manual combinations dominate lower price ranges

#15. age impact on price
cor(usedCars$AskPrice, usedCars$Age)
## [1] -0.3497914

There is a negative correlation between Age and Price. Older cars are priced lower due to depreciation

#16. grouping by age buckets
usedCars$Age_Group <- cut(usedCars$Age,
                          breaks = c(0,3,7,15),
                          labels = c("New", "Mid", "Old"))

aggregate(AskPrice ~ Age_Group, data = usedCars, mean)
##   Age_Group  AskPrice
## 1       New 1703621.3
## 2       Mid 1122099.9
## 3       Old  580364.6

New cars are priced highest, followed by mid-age and old cars, confirming depreciation patterns.

#17. segmenting by price range
usedCars$Price_Category <- cut(usedCars$AskPrice,
                               breaks = c(0,300000, 700000, 2000000),
                               labels = c("Low", "Medium", "High"))

#counting by segment
table(usedCars$Price_Category)
## 
##    Low Medium   High 
##   2650   5462   3679

Most cars fall in the medium price segment, indicating a balanced market with fewer luxury vehicles.

#18. transmission by segment
table(usedCars$Transmission, usedCars$Price_Category)
##            
##              Low Medium High
##   Automatic  741   2180 2330
##   Manual    1909   3282 1349
# owner by segment
table(usedCars$Owner, usedCars$Price_Category)
##         
##           Low Medium High
##   First   731   2696 2251
##   Second 1919   2766 1428

Premium segments are dominated by automatic and first-owner vehicles.

#19. fuel distribution in each segment
table(usedCars$FuelType, usedCars$Price_Category)
##             
##               Low Medium High
##   Diesel      314   1317 1900
##   Hybrid      580    872  245
##   Hybrid/Cng  536    887  233
##   Petrol     1220   2386 1301

Diesel cars are more common in high-price segments, while petrol dominates lower segments.

#20. coefficient of variation- measuring price volatility
sd(usedCars$AskPrice)/mean(usedCars$AskPrice)
## [1] 1.346504

High variation in price indicates a heterogeneous market with diverse pricing structures.

#21. most expensive cars
usedCars[which.max(usedCars$AskPrice), ]
##         Brand         model Year Age kmDriven Transmission Owner FuelType
## 13487 Porsche Cayenne Coupe 2022   2     9000    Automatic First   Petrol
##       PostedDate                        AdditionInfo AskPrice Age_Group
## 13487     Nov-24 Porsche Cayenne Coupe, 2022, Petrol 14900000       New
##       Price_Category
## 13487           <NA>
# top 5 expensive cars-
head(usedCars[order(-usedCars$AskPrice), ], 5)
##               Brand             model Year Age kmDriven Transmission  Owner
## 13487       Porsche     Cayenne Coupe 2022   2     9000    Automatic  First
## 1022  Mercedes-Benz           G-Class 2017   7    45000    Automatic Second
## 6986     Land Rover Range Rover Sport 2020   4    30000    Automatic  First
## 7130     Land Rover       Range Rover 2019   5   110000    Automatic Second
## 6028  Mercedes-Benz           S-Class 2023   1    22000    Automatic  First
##       FuelType PostedDate
## 13487   Petrol     Nov-24
## 1022    Petrol     Nov-24
## 6986    Petrol     Sep-24
## 7130    Diesel     Nov-24
## 6028    Diesel     Nov-24
##                                                        AdditionInfo AskPrice
## 13487                           Porsche Cayenne Coupe, 2022, Petrol 14900000
## 1022     Mercedes-Benz G-Class AMG G 63 Grand Edition, 2017, Petrol 14500000
## 6986                       Land Rover Range Sport HSE, 2020, Petrol 14500000
## 7130  Land Rover Range 4.4 Diesel LWB SVAutobiography, 2019, Diesel 14000000
## 6028                     Mercedes-Benz S-Class S 350d, 2023, Diesel 13800000
##       Age_Group Price_Category
## 13487       New           <NA>
## 1022        Mid           <NA>
## 6986        Mid           <NA>
## 7130        Mid           <NA>
## 6028        New           <NA>

The most expensive car represents a premium segment vehicle with likely low mileage, recent manufacturing year, and desirable features.

# 5 cheapest cars
head(usedCars[order(usedCars$AskPrice), ], 5)
##               Brand    model Year Age kmDriven Transmission  Owner FuelType
## 1736       Mahindra  Marshal 2000  24    55000       Manual Second   Diesel
## 13470           BMW 5 Series 2014  10    90000    Automatic Second   Diesel
## 14315       Renault   Duster 2013  11    80000       Manual Second   Diesel
## 3014  Maruti Suzuki   Baleno 2024   0     9000       Manual  First   Petrol
## 270   Maruti Suzuki Alto-800 2006  18    20024       Manual  First   Petrol
##       PostedDate                               AdditionInfo AskPrice Age_Group
## 1736      Nov-24 Want NOC for Mahindra Marshal or Commander    15000      <NA>
## 13470     Nov-24                          BMW 5 Series 2014    15000       Old
## 14315     Nov-24        2013 duster 110 all dpare available    15000       Old
## 3014      Nov-24   Panasound brand Daimond 2k androd stereo    18500      <NA>
## 270       Nov-24       पुरानी स्क्रैप की गाड़ी बेचने के लिए संपर्क करें    20000      <NA>
##       Price_Category
## 1736             Low
## 13470            Low
## 14315            Low
## 3014             Low
## 270              Low
#22. avg price by segment
aggregate(AskPrice ~ Price_Category, data = usedCars, mean)
##   Price_Category  AskPrice
## 1            Low  204319.4
## 2         Medium  492294.7
## 3           High 1148201.7

Clear separation exists between low, medium, and high segments, validating the segmentation approach.

#23. which combo gives the highest price
aggregate(AskPrice ~ FuelType + Transmission, data = usedCars, max)
##     FuelType Transmission AskPrice
## 1     Diesel    Automatic 14000000
## 2     Hybrid    Automatic 11000000
## 3 Hybrid/Cng    Automatic 11900000
## 4     Petrol    Automatic 14900000
## 5     Diesel       Manual  4550000
## 6     Hybrid       Manual  1450000
## 7 Hybrid/Cng       Manual  1550000
## 8     Petrol       Manual  6500000

The highest priced vehicles are typically diesel automatic cars, indicating strong demand for this combination in the resale market.

library(ggplot2)
ggplot(usedCars, aes(x=AskPrice/100000)) +
  geom_histogram(bins=15) +
  labs(x = "Price (in Lakhs)", y="Number of cars")

The price distribution is right-skewed, with most cars concentrated in lower price ranges.

# Relationship Analysis
# Price vs kmDriven (Regression)

ggplot(usedCars, aes(x=kmDriven, y=AskPrice/100000)) +
  geom_point() +
  geom_smooth(method="lm") +
  labs(y= "Price (in Lakhs)", x="Number of Kilometers")
## `geom_smooth()` using formula = 'y ~ x'

A negative relationship is observed, where higher mileage leads to lower resale value.

# price vs year - Depreciation Trend
ggplot(usedCars, aes(x=Year, y=AskPrice/100000)) + geom_point() + labs(y = "Price in lakhs", x="Year")

Newer cars command higher prices, indicating depreciation over time.

# Price by Transmission (Boxplot)
ggplot(usedCars, aes(x=Transmission, y=AskPrice/100000)) + geom_boxplot() + labs(y = "Price in lakhs")

# Price by Fuel (Boxplot)
ggplot(usedCars, aes(x=FuelType, y=AskPrice/100000)) + geom_boxplot() + labs(y="Price in lakhs")

# Price by owner
ggplot(usedCars, aes(x=Owner, y=AskPrice/100000)) + geom_boxplot() + labs(y="Price in lakhs")

Price varies significantly across fuel types, transmission, and ownership categories.

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
corrplot(cor(usedCars[sapply(usedCars, is.numeric)]))

The correlation plot shows that price increases with newer manufacturing year and decreases with higher age and mileage. It confirms that age and usage are the primary factors affecting car prices.

# Price vs kmDriven by Fuel
ggplot(usedCars, aes(kmDriven, (AskPrice/100000), color=FuelType)) + geom_point() + labs(y="Price in lakhs", x="Kms Driven")

Price decreases as km driven increases, showing depreciation. Diesel cars retain higher value, while petrol and hybrid cars are mostly in lower price ranges.

# faceted scatter plot by fuel
ggplot(usedCars, aes(kmDriven, AskPrice/100000)) + geom_point() + facet_wrap(~FuelType) + labs(y="Price in lakhs")

Price decreases with mileage across all fuel types. Diesel cars retain higher value, while petrol and hybrid/CNG cars are mostly in lower price ranges.

# Building a linear relationship model
# Convert categorical variables
usedCars$FuelType <- as.factor(usedCars$FuelType)
usedCars$Transmission <- as.factor(usedCars$Transmission)
usedCars$Owner <- as.factor(usedCars$Owner)
usedCars$Brand <- as.factor(usedCars$Brand)
# TRAIN-TEST SPLIT (70–30)
set.seed(123)

train_index <- sample(1:nrow(usedCars), 0.7 * nrow(usedCars))

train <- usedCars[train_index, ]
test <- usedCars[-train_index, ]
model <- lm(AskPrice ~ Age + kmDriven + FuelType + Transmission + Owner, data=train)
summary(model)
## 
## Call:
## lm(formula = AskPrice ~ Age + kmDriven + FuelType + Transmission + 
##     Owner, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2153555  -506765  -169664   214052 12981638 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2.773e+06  3.738e+04   74.19   <2e-16 ***
## Age                -7.389e+04  3.486e+03  -21.20   <2e-16 ***
## kmDriven           -6.526e+00  4.650e-01  -14.03   <2e-16 ***
## FuelTypeHybrid     -8.725e+05  3.679e+04  -23.71   <2e-16 ***
## FuelTypeHybrid/Cng -8.813e+05  3.727e+04  -23.64   <2e-16 ***
## FuelTypePetrol     -6.483e+05  2.848e+04  -22.76   <2e-16 ***
## TransmissionManual -6.918e+05  2.298e+04  -30.11   <2e-16 ***
## OwnerSecond        -4.969e+03  2.490e+04   -0.20    0.842    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1082000 on 9127 degrees of freedom
## Multiple R-squared:  0.2806, Adjusted R-squared:   0.28 
## F-statistic: 508.5 on 7 and 9127 DF,  p-value: < 2.2e-16

The model shows that age, mileage, fuel type, and transmission significantly influence price. However, the model explains only ~28% of variation, indicating limited predictive power.

pred <- predict(model, newdata=test)

Linear regression can sometimes produce unrealistic values such as negative prices because it assumes a linear relationship without constraints. To address this, log transformation was applied to stabilize predictions and ensure meaningful outputs.

# Comparing actual vs predicted price
head(data.frame(
  Actual = test$AskPrice,
  Predicted = pred
))
##     Actual Predicted
## 4   565000 1014343.7
## 5   685000 1461760.0
## 10  649000  455972.7
## 11  395000 1224724.4
## 21 2399000 1896308.2
## 24 1050000 1829142.8
train$log_price <- log(train$AskPrice)

model_log <- lm(log_price ~ Age + kmDriven + FuelType + Transmission + Owner, data=train)

pred_log <- predict(model_log, test)

pred_final <- exp(pred_log)

Log transformation improves model stability by reducing skewness and handling extreme values.

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
train <- na.omit(train)

rf_model <- randomForest(
  AskPrice ~ Age + kmDriven + FuelType + Transmission + Owner + Brand,
  data = train,
  ntree = 500
)

print(rf_model)
## 
## Call:
##  randomForest(formula = AskPrice ~ Age + kmDriven + FuelType +      Transmission + Owner + Brand, data = train, ntree = 500) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 43380191661
##                     % Var explained: 75.6

The Random Forest model captures non-linear relationships and improves prediction accuracy, explaining ~74% of variation in prices.

importance(rf_model)
##              IncNodePurity
## Age           3.194804e+14
## kmDriven      1.109744e+14
## FuelType      1.573757e+14
## Transmission  9.263559e+13
## Owner         2.190703e+13
## Brand         4.397035e+14
varImpPlot(rf_model)

Brand, age, and mileage are the most important predictors of price.

Final Conclusion: The analysis demonstrates that used car prices are influenced by multiple factors, with age, mileage, brand, and fuel type being the most significant. While linear regression provides basic insights, Random Forest delivers superior predictive performance by capturing complex relationships in the data.