Problems Statement:
Skyye, a general merchandise retail company, sells consumer, corporate and home office products across 88 countries around the world.
The results and insights obtained after data analysis would be used to devise strategies to open new stores and further investigate the reason for high returns and losses.
Importing the libraries required for the analysis
library(tidyverse)
library(grid)
library(gridExtra)
library(dplyr)
library(ggplot2)
library(reshape2)
library(DT)
library(RColorBrewer)
library(data.table)
library(knitr)
library(caret)Importing the data set for analysis
car_sales <- read.csv("CarDekho Sales.csv")Summary of data
# View the head and summary of the data
DT::datatable(head(car_sales, 5), rownames = FALSE)Dimension and summary of data
# View the dimension and summary of the data
dim(car_sales)## [1] 8128 13
summary(car_sales)## name year selling_price km_driven
## Length:8128 Min. :1983 Min. : 29999 Min. : 1
## Class :character 1st Qu.:2011 1st Qu.: 254999 1st Qu.: 35000
## Mode :character Median :2015 Median : 450000 Median : 60000
## Mean :2014 Mean : 638272 Mean : 69820
## 3rd Qu.:2017 3rd Qu.: 675000 3rd Qu.: 98000
## Max. :2020 Max. :10000000 Max. :2360457
##
## fuel seller_type transmission owner
## Length:8128 Length:8128 Length:8128 Length:8128
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## mileage engine max_power torque
## Length:8128 Length:8128 Length:8128 Length:8128
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## seats
## Min. : 2.000
## 1st Qu.: 5.000
## Median : 5.000
## Mean : 5.417
## 3rd Qu.: 5.000
## Max. :14.000
## NA's :221
Extract value from the mileage column
# Replace the 'kmpl, km/kg' string from data points in the 'mileage' feature
car_sales$mileage <- gsub('[a-zA-Z/ ]', '', car_sales$mileage)
# Convert the data type of 'mileage' feature from Character to Numeric.
car_sales$mileage <- as.numeric(car_sales$mileage)Extract value from the engine column
# Replace the 'CC' string from data points in the 'engine' feature
car_sales$engine <- gsub('[a-zA-Z/ ]', '', car_sales$engine)
# Convert the data type of 'engine' feature from Character to Numeric.
car_sales$engine <- as.numeric(car_sales$engine)Extract value from the max power column
car_sales$max_power <- as.numeric(str_remove(car_sales$max_power, " [a-z]+"))Extract torque value from the torque column
car_sales$torque_val <- as.numeric(str_sub(car_sales$torque, rep(1, nrow(car_sales)),
str_locate(car_sales$torque, "\\D+")[,1]-1))
head(car_sales)## name year selling_price km_driven fuel seller_type
## 1 Maruti Swift Dzire VDI 2014 450000 145500 Diesel Individual
## 2 Skoda Rapid 1.5 TDI Ambition 2014 370000 120000 Diesel Individual
## 3 Honda City 2017-2020 EXi 2006 158000 140000 Petrol Individual
## 4 Hyundai i20 Sportz Diesel 2010 225000 127000 Diesel Individual
## 5 Maruti Swift VXI BSIII 2007 130000 120000 Petrol Individual
## 6 Hyundai Xcent 1.2 VTVT E Plus 2017 440000 45000 Petrol Individual
## transmission owner mileage engine max_power torque
## 1 Manual First Owner 23.40 1248 74.00 190Nm@ 2000rpm
## 2 Manual Second Owner 21.14 1498 103.52 250Nm@ 1500-2500rpm
## 3 Manual Third Owner 17.70 1497 78.00 12.7@ 2,700(kgm@ rpm)
## 4 Manual First Owner 23.00 1396 90.00 22.4 kgm at 1750-2750rpm
## 5 Manual First Owner 16.10 1298 88.20 11.5@ 4,500(kgm@ rpm)
## 6 Manual First Owner 20.14 1197 81.86 113.75nm@ 4000rpm
## seats torque_val
## 1 5 190
## 2 5 250
## 3 5 12
## 4 5 22
## 5 5 11
## 6 5 113
Extract RPM value from the torque column
num_length <- str_length(gsub("\\D+", "", car_sales$torque))
car_sales$rpm <- as.numeric(str_sub(as.numeric(gsub("\\D+", "", car_sales$torque)),num_length-3, num_length))
head(car_sales)## name year selling_price km_driven fuel seller_type
## 1 Maruti Swift Dzire VDI 2014 450000 145500 Diesel Individual
## 2 Skoda Rapid 1.5 TDI Ambition 2014 370000 120000 Diesel Individual
## 3 Honda City 2017-2020 EXi 2006 158000 140000 Petrol Individual
## 4 Hyundai i20 Sportz Diesel 2010 225000 127000 Diesel Individual
## 5 Maruti Swift VXI BSIII 2007 130000 120000 Petrol Individual
## 6 Hyundai Xcent 1.2 VTVT E Plus 2017 440000 45000 Petrol Individual
## transmission owner mileage engine max_power torque
## 1 Manual First Owner 23.40 1248 74.00 190Nm@ 2000rpm
## 2 Manual Second Owner 21.14 1498 103.52 250Nm@ 1500-2500rpm
## 3 Manual Third Owner 17.70 1497 78.00 12.7@ 2,700(kgm@ rpm)
## 4 Manual First Owner 23.00 1396 90.00 22.4 kgm at 1750-2750rpm
## 5 Manual First Owner 16.10 1298 88.20 11.5@ 4,500(kgm@ rpm)
## 6 Manual First Owner 20.14 1197 81.86 113.75nm@ 4000rpm
## seats torque_val rpm
## 1 5 190 2000
## 2 5 250 2500
## 3 5 12 2700
## 4 5 22 2750
## 5 5 11 4500
## 6 5 113 4000
Extract brand from the can name column
# Extract the brand name (first word) into another feature from the 'name' feature.
car_sales$brand <- word(car_sales$name, start = 1, end = 1)
# Extract the brand name (first 2 words) into another feature from the 'name' feature.
car_sales$brand[car_sales$brand == "Ashok"] <- word(car_sales$name[car_sales$brand == "Ashok"], start = 1, end = 2)
car_sales$brand[car_sales$brand == "Land"] <- word(car_sales$name[car_sales$brand == "Land"], start = 1, end = 2)
car_sales$brand[car_sales$brand == "Range"] <- word(car_sales$name[car_sales$brand == "Range"], start = 1, end = 2)
#car_sales$brand <- ifelse( grepl( "Ashok Leyland", car_sales$name), word(car_sales$name, start = 0, end = 2), word(car_sales$name, 1))
#car_sales$brand <- ifelse( grepl( "Land Rover", car_sales$name), word(car_sales$name, start = 0, end = 2), word(car_sales$name, 1))
#car_sales$brand <- ifelse( grepl( "Range Rover", car_sales$name), word(car_sales$name, start = 0, end = 2), word(car_sales$name, 1))
#car_sales$brand <- word(car_sales$name, 1)Filter out the values for CNG and LPG
car_sales <- filter(car_sales, !fuel %in% c("LPG", "CNG"))Identify missing and NA values in the data
colSums(is.na(car_sales))## name year selling_price km_driven fuel
## 0 0 0 0 0
## seller_type transmission owner mileage engine
## 0 0 0 214 214
## max_power torque seats torque_val rpm
## 208 0 214 214 214
## brand
## 0
Remove unnecessary columns in the dataframe
car_sales$name <- NULL
car_sales$torque <- NULL
names(car_sales)[12] <- "torque"Impute missing and NA values in the data using kNN
preProcValues <- preProcess(car_sales %>%
dplyr::select(year, selling_price, km_driven, fuel, seller_type,
transmission, owner, mileage, engine, max_power, seats, torque, rpm),
method = c("knnImpute"),
k = 20,
knnSummary = mean)
impute_cars_info <- predict(preProcValues, car_sales, na.action = na.pass)
procNames <- data.frame(col = names(preProcValues$mean), mean = preProcValues$mean, sd = preProcValues$std)
for(i in procNames$col){
car_sales[i] <- impute_cars_info[i]*preProcValues$std[i]+preProcValues$mean[i]
}Create categorical column to indicate the size of the car
car_sales$size <- ifelse(car_sales$seats %in% c(2:5), "Small", ifelse(car_sales$seats %in% c(6:8), "Medium", ifelse(car_sales$seats %in% c(9:14), "Large", "Unknown")))
#car_sales$size[c(2:5) %in% car_sales$seats] <- "Small"Create a numeric column to obtain the age of the car
car_sales$age <- as.numeric(format(Sys.Date(), "%Y")) - car_sales$year
head(car_sales, 5)## year selling_price km_driven fuel seller_type transmission owner
## 1 2014 450000 145500 Diesel Individual Manual First Owner
## 2 2014 370000 120000 Diesel Individual Manual Second Owner
## 3 2006 158000 140000 Petrol Individual Manual Third Owner
## 4 2010 225000 127000 Diesel Individual Manual First Owner
## 5 2007 130000 120000 Petrol Individual Manual First Owner
## mileage engine max_power seats torque rpm brand size age
## 1 23.40 1248 74.00 5 190 2000 Maruti Small 7
## 2 21.14 1498 103.52 5 250 2500 Skoda Small 7
## 3 17.70 1497 78.00 5 12 2700 Honda Small 15
## 4 23.00 1396 90.00 5 22 2750 Hyundai Small 11
## 5 16.10 1298 88.20 5 11 4500 Maruti Small 14
colSums(is.na(car_sales))## year selling_price km_driven fuel seller_type
## 0 0 0 0 0
## transmission owner mileage engine max_power
## 0 0 0 0 0
## seats torque rpm brand size
## 0 0 0 0 0
## age
## 0
Obtain the descriptive statistics of the variables
Frequency plot of the categorical variables
Histogram and boxplot of numeric variables
Identify and cap the outliers in numeric variables
Check for the normality of numeric variables
Correlation matrix of numeric variables
#install.packages("ggcorrplot")
library(ggcorrplot)
data_corr <- car_sales[, c("selling_price", "km_driven", "mileage", "engine",
"max_power", "seats", "torque", "rpm", "age")]
corr <- round(cor(data_corr), 2)
ggcorrplot(corr, hc.order = TRUE, outline.col = "white",
ggtheme = ggplot2::theme_gray, colors = c("#6D9EC1", "white", "#E46726"))Pairplot of all the numeric variables
#install.packages("GGally")
library(GGally)
ggpairs(car_sales, columns = c(2, 3, 8:13), aes(color = transmission, alpha = 0.5))Yearly trend of price
#install.packages("gganimate")
#install.packages("gifski")
#install.packages("av")
library(ggplot2)
library(gganimate)
# Make a ggplot, but add frame=year: one image per year
gif1 <- ggplot(car_sales, aes(km_driven, selling_price, color = transmission)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
theme_bw() +
# gganimate specific bits:
labs(title = 'Year: {frame_time}', x = 'Kilometers driven', y = 'Price of car') +
transition_time(year) +
ease_aes('linear')
# Save at gif:
animate(gif1, nframes = 100, fps = 5, end_pause = 20, renderer=gifski_renderer("test.gif"))Analyzing the transaction data helped reveal various insights and questions for further deep dive. In this study, the variables Sales_Total, Returns, Profits and LossPerReturn were analyzed across various markets and specifically in the Asia Pacific and Europe markets. These variables were analyzed across categorical variables like market, department, division and segment.
Xie, Y. C. D. (2021, October 7). R Markdown Cookbook. R Markdown. Retrieved October 30, 2021, from https://bookdown.org/yihui/rmarkdown-cookbook/
Bluman, A. (2017). Elementary Statistics: A Step By Step Approach (10th ed.). McGraw-Hill Education.
Kabacoff, R., I. (2022). R in Action, Third Edition. Manning.
Hillier, W. (2021, October 4). A Step-by-Step Guide to the Data Analysis Process. CareerFoundry. Retrieved October 30, 2021, from https://careerfoundry.com/en/blog/data-analytics/the-data-analysis-process-step-by-step/
F. (2021, April 2). tidyverse in r – Complete Tutorial. R-Bloggers. Retrieved October 30, 2021, from https://www.r-bloggers.com/2021/04/tidyverse-in-r-complete-tutorial/
The RMD file of the analysis is ‘Raj_M6_Project.Rmd’ is included with the analysis report.