#use sind data 22.04.2024
## Warning: package 'mice' was built under R version 4.2.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
## Warning: package 'corrplot' was built under R version 4.2.3
## corrplot 0.92 loaded
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'Amelia' was built under R version 4.2.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.2.3
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## Warning: package 'sqldf' was built under R version 4.2.3
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 4.2.3
## Loading required package: proto
## Warning: package 'proto' was built under R version 4.2.3
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 4.2.3
## Warning: package 'reshape2' was built under R version 4.2.3
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
## Warning: package 'Hmisc' was built under R version 4.2.3
##
## Attaching package: 'Hmisc'
##
## The following object is masked from 'package:psych':
##
## describe
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
## Warning: package 'doBy' was built under R version 4.2.3
##
## Attaching package: 'doBy'
##
## The following object is masked from 'package:dplyr':
##
## order_by
## Warning: package 'gmodels' was built under R version 4.2.3
## Warning: package 'car' was built under R version 4.2.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.3
##
## Attaching package: 'car'
##
## The following object is masked from 'package:psych':
##
## logit
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
## Warning: package 'effects' was built under R version 4.2.3
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
## Warning: package 'polycor' was built under R version 4.2.3
##
## Attaching package: 'polycor'
##
## The following object is masked from 'package:psych':
##
## polyserial
##
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:car':
##
## logit
##
## The following object is masked from 'package:psych':
##
## logit
##
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
## Warning: package 'modelr' was built under R version 4.2.3
## Warning: package 'gapminder' was built under R version 4.2.3
## Warning: package 'tree' was built under R version 4.2.3
## Warning: package 'gbm' was built under R version 4.2.3
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:psych':
##
## outlier
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
## Warning: package 'ggrepel' was built under R version 4.2.3
## Warning: package 'fastDummies' was built under R version 4.2.3
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
## Warning: package 'vcd' was built under R version 4.2.3
## Loading required package: grid
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
#load data
pc <- read.csv("Pak_cars.csv")
dim(pc)
## [1] 24973 10
colnames(pc)
## [1] "X_id" "Brand" "Condition" "Fuel"
## [5] "KMs.Driven" "Model" "Price" "Registered.City"
## [9] "Transaction.Type" "Year"
md.pattern(pc, rotate.names = T)
## X_id Brand Condition Fuel Model Price Registered.City Transaction.Type
## 22687 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1
## 2284 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## Year KMs.Driven
## 22687 1 1 0
## 2 1 0 1
## 2284 0 0 2
## 2284 2286 4570
summary(pc)
## X_id Brand Condition Fuel
## Min. : 1 Length:24973 Length:24973 Length:24973
## 1st Qu.: 6244 Class :character Class :character Class :character
## Median :12487 Mode :character Mode :character Mode :character
## Mean :12487
## 3rd Qu.:18730
## Max. :24973
##
## KMs.Driven Model Price Registered.City
## Min. : 1 Length:24973 Min. : 50000 Length:24973
## 1st Qu.: 16000 Class :character 1st Qu.: 380000 Class :character
## Median : 66510 Mode :character Median : 650000 Mode :character
## Mean : 127811 Mean : 912895
## 3rd Qu.: 100000 3rd Qu.: 1145000
## Max. :10000000 Max. :87654321
## NA's :2286
## Transaction.Type Year
## Length:24973 Min. :1915
## Class :character 1st Qu.:2002
## Mode :character Median :2008
## Mean :2006
## 3rd Qu.:2013
## Max. :2020
## NA's :2284
head(pc)
## X_id Brand Condition Fuel KMs.Driven Model Price
## 1 1 Toyota Used Diesel 1 Prado 2100000
## 2 2 Suzuki Used Petrol 100000 Bolan 380000
## 3 3 Suzuki Used CNG 12345 Bolan 340000
## 4 4 Suzuki Used Petrol 94000 Alto 535000
## 5 5 Toyota Used Petrol 100000 Corolla XLI 1430000
## 6 6 Toyota Used Petrol 80000 Corrolla Altis 1620000
## Registered.City Transaction.Type Year
## 1 Karachi Cash 1997
## 2 Karachi Cash 2006
## 3 Karachi Cash 1998
## 4 Karachi Cash 2010
## 5 Karachi Cash 2013
## 6 Karachi Cash 2012
colnames(pc)
## [1] "X_id" "Brand" "Condition" "Fuel"
## [5] "KMs.Driven" "Model" "Price" "Registered.City"
## [9] "Transaction.Type" "Year"
dim(pc)
## [1] 24973 10
find unique data in variables
pc %>% group_by(X_id) %>% unique() %>% summarise(count = n())
## # A tibble: 24,973 × 2
## X_id count
## <int> <int>
## 1 1 1
## 2 2 1
## 3 3 1
## 4 4 1
## 5 5 1
## 6 6 1
## 7 7 1
## 8 8 1
## 9 9 1
## 10 10 1
## # ℹ 24,963 more rows
#all unique ids found which says no entry duplication
pc %>% group_by(Brand) %>% unique() %>% summarise(count = n())
## # A tibble: 24 × 2
## Brand count
## <chr> <int>
## 1 "" 2137
## 2 "Audi" 18
## 3 "BMW" 31
## 4 "Changan" 9
## 5 "Chevrolet" 47
## 6 "Classic & Antiques" 13
## 7 "Daewoo" 72
## 8 "Daihatsu" 2476
## 9 "FAW" 80
## 10 "Honda" 3324
## # ℹ 14 more rows
#2137 brand names are unlabelled
pc %>% group_by(Condition) %>% unique() %>% summarise(count = n())
## # A tibble: 3 × 2
## Condition count
## <chr> <int>
## 1 "" 2136
## 2 "New" 4365
## 3 "Used" 18472
#2136 car's condition are unlabelled
pc %>% group_by(Fuel) %>% unique() %>% summarise(count = n())
## # A tibble: 6 × 2
## Fuel count
## <chr> <int>
## 1 "" 2445
## 2 "CNG" 6797
## 3 "Diesel" 450
## 4 "Hybrid" 943
## 5 "LPG" 29
## 6 "Petrol" 14309
#2445 car's condition are unlabelled
pc %>% group_by(KMs.Driven) %>% unique() %>% summarise(count = n())
## # A tibble: 3,146 × 2
## KMs.Driven count
## <int> <int>
## 1 1 1699
## 2 2 56
## 3 3 11
## 4 4 16
## 5 5 89
## 6 6 10
## 7 7 14
## 8 8 31
## 9 9 13
## 10 10 165
## # ℹ 3,136 more rows
pc %>% group_by(Model) %>% unique() %>% summarise(count = n())
## # A tibble: 304 × 2
## Model count
## <chr> <int>
## 1 "" 2448
## 2 "120 Y" 24
## 3 "2 Series" 2
## 4 "200 D" 8
## 5 "240 Gd" 3
## 6 "250 D" 2
## 7 "3 Series" 11
## 8 "323" 12
## 9 "350Z" 3
## 10 "5 Series" 6
## # ℹ 294 more rows
#2448 car's model are unlabelled
pc %>% group_by(Price) %>% unique() %>% summarise(count = n())
## # A tibble: 1,131 × 2
## Price count
## <int> <int>
## 1 50000 19
## 2 52000 1
## 3 55000 13
## 4 57000 1
## 5 60000 11
## 6 62000 1
## 7 65000 6
## 8 68000 2
## 9 70000 21
## 10 72000 1
## # ℹ 1,121 more rows
pc %>% group_by(Registered.City) %>% unique() %>% summarise(count = n())
## # A tibble: 62 × 2
## Registered.City count
## <chr> <int>
## 1 "" 4636
## 2 "Abbottabad" 13
## 3 "Ali Masjid" 4
## 4 "Askoley" 3
## 5 "Attock" 6
## 6 "Badin" 3
## 7 "Bagh" 3
## 8 "Bahawalnagar" 3
## 9 "Bahawalpur" 39
## 10 "Bela" 5
## # ℹ 52 more rows
#4636 city are unlabelled
pc %>% group_by(Transaction.Type) %>% unique() %>% summarise(count = n())
## # A tibble: 3 × 2
## Transaction.Type count
## <chr> <int>
## 1 "" 2445
## 2 "Cash" 21513
## 3 "Installment/Leasing" 1015
#2445 transactions are unlabelled
pc %>% group_by(Year) %>% unique() %>% summarise(count = n())
## # A tibble: 66 × 2
## Year count
## <int> <int>
## 1 1915 1
## 2 1925 1
## 3 1943 2
## 4 1951 2
## 5 1952 4
## 6 1956 1
## 7 1960 1
## 8 1962 2
## 9 1963 2
## 10 1964 2
## # ℹ 56 more rows
#2284 year is missing
#check missing data
options(repr.plot.height = 10, repr.plot.width = 12)
psych::describe(pc)
## vars n mean sd median trimmed mad
## X_id 1 24973 12487.00 7209.23 12487 12487.00 9255.87
## Brand* 2 24973 17.38 7.88 23 18.50 1.48
## Condition* 3 24973 2.65 0.63 3 2.80 0.00
## Fuel* 4 24973 4.29 2.05 6 4.49 0.00
## KMs.Driven 5 22687 127811.16 599672.62 66510 63807.94 61542.73
## Model* 6 24973 121.27 80.51 110 117.28 99.33
## Price 7 24973 912895.28 1537133.61 650000 739939.16 518910.00
## Registered.City* 8 24973 22.27 10.97 26 23.04 0.00
## Transaction.Type* 9 24973 1.94 0.37 2 2.00 0.00
## Year 10 22689 2005.90 9.56 2008 2007.06 7.41
## min max range skew kurtosis se
## X_id 1 24973 24972 0.00 -1.20 45.62
## Brand* 1 24 23 -0.83 -0.84 0.05
## Condition* 1 3 2 -1.62 1.33 0.00
## Fuel* 1 6 5 -0.46 -1.64 0.01
## KMs.Driven 1 10000000 9999999 14.37 222.85 3981.31
## Model* 1 304 303 0.37 -0.69 0.51
## Price 50000 87654321 87604321 26.44 1158.16 9726.94
## Registered.City* 1 62 61 -0.94 0.51 0.07
## Transaction.Type* 1 3 2 -0.68 4.00 0.00
## Year 1915 2020 105 -1.13 1.42 0.06
#visualize missing data
missmap(pc)
#2% missing data is there in years and km-driven variables
#exclude all non-numeric columns
num.cols <- sapply(pc, is.numeric)
pc2 <- pc[, num.cols]
colnames(pc2)
## [1] "X_id" "KMs.Driven" "Price" "Year"
#now have 2 data sets pc (all variables) and pc2 (only numeric data)
md.pattern(pc2, rotate.names = T)
## X_id Price Year KMs.Driven
## 22687 1 1 1 1 0
## 2 1 1 1 0 1
## 2284 1 1 0 0 2
## 0 0 2284 2286 4570
library(tidyverse)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.2.3
library(maps)
## Warning: package 'maps' was built under R version 4.2.3
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:Hmisc':
##
## subplot
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(DT)
## Warning: package 'DT' was built under R version 4.2.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.2.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
#library(factoextra)
introduce(pc)
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 24973 10 6 4 0
## total_missing_values complete_rows total_observations memory_usage
## 1 4570 22687 249730 1624944
plot_intro(pc)
plot_missing(pc)
dim(pc)
## [1] 24973 10
head(pc)
## X_id Brand Condition Fuel KMs.Driven Model Price
## 1 1 Toyota Used Diesel 1 Prado 2100000
## 2 2 Suzuki Used Petrol 100000 Bolan 380000
## 3 3 Suzuki Used CNG 12345 Bolan 340000
## 4 4 Suzuki Used Petrol 94000 Alto 535000
## 5 5 Toyota Used Petrol 100000 Corolla XLI 1430000
## 6 6 Toyota Used Petrol 80000 Corrolla Altis 1620000
## Registered.City Transaction.Type Year
## 1 Karachi Cash 1997
## 2 Karachi Cash 2006
## 3 Karachi Cash 1998
## 4 Karachi Cash 2010
## 5 Karachi Cash 2013
## 6 Karachi Cash 2012
#Visualize data
t <- pc %>% group_by(Brand) %>% summarise(AvgPrice = mean(Price), MaxPrice = max(Price), minPrice = min(Price))
options(repr.plot.height = 8, repr.plot.width = 8)
ggplot() +
geom_histogram(t, mapping = aes(AvgPrice, fill = Brand))+
geom_vline(t, mapping = aes(xintercept = AvgPrice), color = "red", size = 1.5) +
geom_text(t, mapping = aes(x = AvgPrice+3, y = 65, label = round(AvgPrice,digits = 2)))+
facet_wrap(Brand~.)+
theme_minimal()+
theme(legend.position = "bottom")+
labs(y = "Frequency", title = "Distribution & The Average Price of The Cars in each Brand", caption = "PBS-data")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
colnames(pc)
## [1] "X_id" "Brand" "Condition" "Fuel"
## [5] "KMs.Driven" "Model" "Price" "Registered.City"
## [9] "Transaction.Type" "Year"
#total value in each brand (market share)
pc %>%
group_by(Brand) %>%
summarise(Total.share = mean(Price), na.rm = TRUE) %>%
ggplot(aes(reorder(Brand, Total.share),Total.share, fill = Total.share))+
geom_col(show.legend = FALSE)+
coord_flip()+
theme_minimal()+
labs(y = "Average price", x = "Brand names", title = "Average price of car as per Brand", caption = "PBS-data")+
scale_fill_gradient(low = "khaki", high = "seagreen")+
theme(axis.line.y = element_line(colour = "darkslategray"),
axis.ticks.x = element_line(colour = "darkslategray"))
scale_x_continuous(label = scales::comma_format())
## <ScaleContinuousPosition>
## Range:
## Limits: 0 -- 1
pc %>%
group_by(Condition) %>%
summarise(Total.share = mean(Price), na.rm = TRUE) %>%
ggplot(aes(reorder(Condition, Total.share),Total.share, fill = Total.share))+
geom_col(show.legend = FALSE)+
#coord_flip()+
theme_minimal()+
labs(y = "Average price", x = "Condition",title = "Average price of car as per Condition", caption = "PBS-data")+
scale_fill_gradient(low = "khaki", high = "seagreen")+
theme(axis.line.y = element_line(colour = "darkslategray"),
axis.ticks.x = element_line(colour = "darkslategray"))
scale_x_continuous(label = scales::comma_format())
## <ScaleContinuousPosition>
## Range:
## Limits: 0 -- 1
colnames(pc)
## [1] "X_id" "Brand" "Condition" "Fuel"
## [5] "KMs.Driven" "Model" "Price" "Registered.City"
## [9] "Transaction.Type" "Year"
ggplot(pc, aes(Price, Brand, color = Condition)) +
geom_point()+
facet_wrap(~Condition)+
theme(legend.title = element_text(size = 14),
legend.text = element_text(size = 13),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10),
plot.title = element_text(hjust = .5,lineheight = .20, size = 15),
plot.subtitle = element_text(hjust = .5,lineheight = .20, size = 12),
strip.text = element_text(size = 12)) +
scale_x_continuous(label = scales::comma_format()) +
labs(title = "Car brands average prices as per Condition)",
subtitle = "The most expensive ones are Toyota and Suzuki irrespective of conditions", caption = "PBS-data")
Take out data with no brands
noBrands <- pc %>% filter(Brand == "")
head(noBrands)
## X_id Brand Condition Fuel KMs.Driven Model Price Registered.City
## 1 3566 Used 99999 350000
## 2 3570 NA 1040000
## 3 3571 NA 1065000
## 4 3573 NA 765000
## 5 3575 NA 428000
## 6 3577 NA 1725000
## Transaction.Type Year
## 1 1995
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
md.pattern(noBrands, rotate.names = T)
## X_id Brand Condition Fuel Model Price Registered.City Transaction.Type
## 1 1 1 1 1 1 1 1 1
## 2136 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## KMs.Driven Year
## 1 1 1 0
## 2136 0 0 2
## 2136 2136 4272
dim(noBrands)
## [1] 2137 10
#2136 data of id and price only no other information
options(repr.plot.height = 10, repr.plot.width = 12)
psych::describe(noBrands)
## vars n mean sd median trimmed mad
## X_id 1 2137 14485.31 7456.64 16254 14603.36 10584.28
## Brand* 2 2137 1.00 0.00 1 1.00 0.00
## Condition* 3 2137 1.00 0.02 1 1.00 0.00
## Fuel* 4 2137 1.00 0.00 1 1.00 0.00
## KMs.Driven 5 1 99999.00 NA 99999 99999.00 0.00
## Model* 6 2137 1.00 0.00 1 1.00 0.00
## Price 7 2137 843451.37 777637.72 656000 732799.79 527805.60
## Registered.City* 8 2137 1.00 0.00 1 1.00 0.00
## Transaction.Type* 9 2137 1.00 0.00 1 1.00 0.00
## Year 10 1 1995.00 NA 1995 1995.00 0.00
## min max range skew kurtosis se
## X_id 3566 24968 21402 -0.13 -1.5 161.3
## Brand* 1 1 0 NaN NaN 0.0
## Condition* 1 2 1 46.16 2130.0 0.0
## Fuel* 1 1 0 NaN NaN 0.0
## KMs.Driven 99999 99999 0 NA NA NA
## Model* 1 1 0 NaN NaN 0.0
## Price 50000 10600000 10550000 4.63 40.4 16821.9
## Registered.City* 1 1 0 NaN NaN 0.0
## Transaction.Type* 1 1 0 NaN NaN 0.0
## Year 1995 1995 0 NA NA NA
#visualize missing data
missmap(noBrands)
#take out data with brands
withBrands <- pc %>% filter(Brand != "")
head(withBrands)
## X_id Brand Condition Fuel KMs.Driven Model Price
## 1 1 Toyota Used Diesel 1 Prado 2100000
## 2 2 Suzuki Used Petrol 100000 Bolan 380000
## 3 3 Suzuki Used CNG 12345 Bolan 340000
## 4 4 Suzuki Used Petrol 94000 Alto 535000
## 5 5 Toyota Used Petrol 100000 Corolla XLI 1430000
## 6 6 Toyota Used Petrol 80000 Corrolla Altis 1620000
## Registered.City Transaction.Type Year
## 1 Karachi Cash 1997
## 2 Karachi Cash 2006
## 3 Karachi Cash 1998
## 4 Karachi Cash 2010
## 5 Karachi Cash 2013
## 6 Karachi Cash 2012
md.pattern(withBrands, rotate.names = T)
## X_id Brand Condition Fuel Model Price Registered.City Transaction.Type
## 22686 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1
## 148 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## Year KMs.Driven
## 22686 1 1 0
## 2 1 0 1
## 148 0 0 2
## 148 150 298
dim(withBrands)
## [1] 22836 10
introduce(withBrands)
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 22836 10 6 4 0
## total_missing_values complete_rows total_observations memory_usage
## 1 298 22686 228360 1488048
plot_intro(withBrands)
plot_missing(withBrands)
From now on work on with Brands
#check missing data
options(repr.plot.height = 10, repr.plot.width = 12)
psych::describe(withBrands)
## vars n mean sd median trimmed mad
## X_id 1 22836 12300.00 7157.32 12312.5 12291.41 9198.05
## Brand* 2 22836 17.91 6.36 22.0 18.67 1.48
## Condition* 3 22836 1.81 0.39 2.0 1.89 0.00
## Fuel* 4 22836 4.60 1.87 6.0 4.77 0.00
## KMs.Driven 5 22686 127812.39 599685.81 66505.0 63805.94 61535.31
## Model* 6 22836 132.52 74.88 110.0 127.86 90.44
## Price 7 22836 919393.87 1589603.60 650000.0 740675.65 518910.00
## Registered.City* 8 22836 24.26 9.23 26.0 25.49 0.00
## Transaction.Type* 9 22836 2.03 0.24 2.0 2.00 0.00
## Year 10 22688 2005.90 9.56 2008.0 2007.06 7.41
## min max range skew kurtosis se
## X_id 1 24973 24972 0.00 -1.18 47.36
## Brand* 1 23 22 -0.86 -1.08 0.04
## Condition* 1 2 1 -1.57 0.47 0.00
## Fuel* 1 6 5 -0.65 -1.48 0.01
## KMs.Driven 1 10000000 9999999 14.37 222.84 3981.48
## Model* 1 304 303 0.44 -0.65 0.50
## Price 50000 87654321 87604321 26.09 1106.63 10519.11
## Registered.City* 1 62 61 -1.23 2.80 0.06
## Transaction.Type* 1 3 2 1.89 13.77 0.00
## Year 1915 2020 105 -1.13 1.42 0.06
#visualize missing data
missmap(withBrands) #(150 values of Km and 148 of year is missing)
withBrands %>%
group_by(Condition) %>%
summarise(Total.share = mean(Price), na.rm = TRUE) %>%
ggplot(aes(reorder(Condition, Total.share),Total.share, fill = Total.share))+
geom_col(show.legend = FALSE)+
coord_flip()+
theme_minimal()+
labs(y = "Average price", x = "Condition",title = "Average price of car as per Condition", caption = "PBS-data")+
scale_fill_gradient(low = "khaki", high = "seagreen")+
theme(axis.line.y = element_line(colour = "darkslategray"),
axis.ticks.x = element_line(colour = "darkslategray"))
scale_x_continuous(label = scales::comma_format())
## <ScaleContinuousPosition>
## Range:
## Limits: 0 -- 1
withBrands %>%
group_by(Brand) %>%
summarise(Total.share = mean(Price), na.rm = TRUE) %>%
ggplot(aes(reorder(Brand, Total.share),Total.share, fill = Total.share))+
geom_col(show.legend = FALSE)+
coord_flip()+
theme_minimal()+
labs(y = "Average price", x = "Brand",title = "Average price of car as per Condition", caption = "PBS-data")+
scale_fill_gradient(low = "khaki", high = "seagreen")+
theme(axis.line.y = element_line(colour = "darkslategray"),
axis.ticks.x = element_line(colour = "darkslategray"))
scale_x_continuous(label = scales::comma_format())
## <ScaleContinuousPosition>
## Range:
## Limits: 0 -- 1
colnames(withBrands)
## [1] "X_id" "Brand" "Condition" "Fuel"
## [5] "KMs.Driven" "Model" "Price" "Registered.City"
## [9] "Transaction.Type" "Year"
ordered_price <- withBrands %>% arrange(desc(Price))
#most expensive
high_price <- ordered_price %>% filter(Price > mean(Price)) %>%
dplyr::select(Price,Brand, Condition, Fuel, Model, Registered.City, Transaction.Type, Year, KMs.Driven)
ordered_price %>%
group_by(Brand) %>%
summarise(mean = round(mean(Price),0), sd = round(sd(Price), 0)) %>%
arrange(desc(mean))
## # A tibble: 23 × 3
## Brand mean sd
## <chr> <dbl> <dbl>
## 1 Range Rover 4657000 4339836
## 2 Audi 4124167 4553829
## 3 Lexus 3200870 3921879
## 4 Porsche 2500000 2404163
## 5 BMW 2252419 2724229
## 6 Mercedes 2046467 1923814
## 7 Toyota 1653264 2311591
## 8 Honda 1137342 979474
## 9 Nissan 772144 593684
## 10 Mazda 762704 405140
## # ℹ 13 more rows
table(high_price$Brand)
##
## Audi BMW Chevrolet Classic & Antiques
## 12 19 1 4
## Daewoo Daihatsu FAW Honda
## 3 720 19 1633
## KIA Lexus Mazda Mercedes
## 3 16 36 60
## Mitsubishi Nissan Other Brands Porsche
## 124 284 35 1
## Range Rover Subaru Suzuki Toyota
## 5 4 1176 3634
#plot cars about mean price
ggplot(high_price, aes(Price, reorder(Brand, Price))) +
geom_point() +
facet_grid(
factor(Brand, levels =
c("Audi", 'BMW', 'Chevrolet','Classic & Antiques', 'Daewoo','Daihatsu','FAW','Honda','KIA','Lexus','Mazda','Mercedes','Mitsubishi','Nissan','Other Brands','Porsche','Range Rover','Subaru','Suzuki','Toyota')) ~., scales = "free", space = "free") +
theme(strip.text.y = element_text(angle = 0),strip.text= element_text(size=13),axis.text.y = element_text(size = 11),
plot.title = element_text(hjust = .5,lineheight = .20, size = 15),
plot.subtitle = element_text(hjust = .5,lineheight = .20, size = 12),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
axis.text.x = element_text(size = 10)) +
scale_x_continuous(label = scales::comma_format()) +
labs(title="Car brands destributed as per Price",
subtitle = "The cars plotted here - are above the average price of cars in our data set") + ylab("Brandname")
ggplot(withBrands, aes(Price, Fuel, color = Condition))+
geom_point()+
facet_wrap(~Fuel) +
theme(strip.text.y = element_text(angle = 0),strip.text= element_text(size=13),axis.text.y = element_text(size = 11),
plot.title = element_text(hjust = .5,lineheight = .20, size = 15),
plot.subtitle = element_text(hjust = .5,lineheight = .20, size = 12),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
axis.text.x = element_text(size = 10)) +
scale_x_continuous(label = scales::comma_format()) +
labs(title="Cars Fuel type and Condition sold",
subtitle = "The cars plotted here - are destributed by Fuel type and condition") + ylab("Brandname")+
theme(axis.text.x = element_text(angle = 90,vjust=0.5, size=7))
brand_nested <- ordered_price %>%
group_by(Brand) %>%
summarise(mean_price = mean(Price), sd = round(sd(Price),0),
mean_mileage = mean(KMs.Driven), sd = round(sd(KMs.Driven),0)) %>%
arrange(desc(mean_price))
#make a dataframe
brand_nested <- as.data.frame(brand_nested, header = TRUE)
#use brand as row name
row.names(brand_nested) <- brand_nested$Brand
#remove brand names since have made row
brand_nested <- brand_nested[,-1]
head(brand_nested, 5)
## mean_price sd mean_mileage
## Range Rover 4657000 41955 66697.00
## Audi 4124167 64111 50485.44
## Lexus 3200870 205711 130533.09
## Porsche 2500000 54447 88500.00
## BMW 2252419 59641 61499.35
clustering_brand <- brand_nested
#scale the data
scaled_brand <- scale(clustering_brand)
#perform hierarchical clustering
HC_brand <- hclust(dist(scaled_brand))
#canvas size - plot size
options(repr.plot.width = 8, repr.plot.height = 8)
#plot
plot(HC_brand, col = "red", main = "Car brands that share the same branch have a lot in common across all variables")
options(repr.plot.width = 12, repr.plot.height = 8)
withBrands %>%
group_by(Fuel, Condition) %>%
summarise(Total.Value = mean(Price), na.rm = TRUE) %>%
ggplot(aes(reorder(Fuel, Total.Value), Total.Value, fill = Total.Value))+
geom_col(show.legend = FALSE)+
facet_grid(~Condition)+
coord_flip()+
theme_minimal()+
labs(x = NULL, y = "Market Values of rhe Leagues")+
scale_fill_gradient(low = "khaki", high = "seagreen")+
theme(axis.line.y = element_line(colour = "darkslategray"),
axis.ticks.x = element_line(colour = "darkslategray"))+
scale_y_continuous(label = scales::comma_format())+
labs(title="Cars Fuel type and Condition sold",
subtitle = "The cars plotted here - are destributed by Fuel type and condition") + ylab("Price")
## `summarise()` has grouped output by 'Fuel'. You can override using the
## `.groups` argument.
#Comparison of two brands
options(repr.plot.width = 12, repr.plot.height = 8)
withBrands %>%
group_by(Registered.City, Condition) %>%
summarise(Total.Value = mean(Price)) %>%
ggplot(aes(reorder(Registered.City, Total.Value), Total.Value, fill = Total.Value))+
geom_col(show.legend = FALSE)+
facet_grid(~Condition)+
coord_flip()+
theme_minimal()+
labs(x = NULL, y = "Market Values of rhe Leagues")+
scale_fill_gradient(low = "khaki", high = "seagreen")+
theme(axis.line.y = element_line(colour = "darkslategray"),
axis.ticks.x = element_line(colour = "darkslategray"))+
scale_y_continuous(label = scales::comma_format())+
labs(title="Cars Fuel type and Condition of sold cars",
subtitle = "The cars plotted here - are destributed by Fuel type and condition") + ylab("Price")
## `summarise()` has grouped output by 'Registered.City'. You can override using
## the `.groups` argument.
withBrands %>% group_by(Registered.City, Transaction.Type) %>% filter(Condition == "New") %>% summarise(count = n()) %>% arrange(desc(count)) %>%
ggplot(aes(mean(count), Registered.City, color = Transaction.Type))+
geom_point()+
facet_grid(~Transaction.Type)+
theme_minimal()+
labs(x = NULL, y = "Market Values of rhe Leagues")+
scale_fill_gradient(low = "blue", high = "seagreen")+
theme(axis.line.y = element_line(colour = "darkslategray"),
axis.ticks.x = element_line(colour = "darkslategray"))+
scale_x_continuous(label = scales::comma_format())+
labs(title="Number of New cars sold as per city using different transaction types",
subtitle = "Cash is most predominant type of transaction type") + ylab("Registered cities") + xlab("Number of Cars")+
theme(axis.text.x = element_text(angle = 90,vjust=0.5, size=7))
## `summarise()` has grouped output by 'Registered.City'. You can override using
## the `.groups` argument.
withBrands %>% group_by(Registered.City, Transaction.Type) %>% filter(Condition == "Used") %>% summarise(count = n()) %>% arrange(desc(count)) %>%
ggplot(aes(mean(count), Registered.City, color = Transaction.Type))+
geom_point()+
facet_grid(~Transaction.Type)+
theme_minimal()+
labs(x = NULL, y = "Market Values of rhe Leagues")+
scale_fill_gradient(low = "blue", high = "seagreen")+
theme(axis.line.y = element_line(colour = "darkslategray"),
axis.ticks.x = element_line(colour = "darkslategray"))+
scale_x_continuous(label = scales::comma_format())+
labs(title="Number of Used cars sold as per city using different transaction types",
subtitle = "Cash is most predominant type of transaction type") + ylab("Registered cities") +
xlab("Number of Cars")+
theme(axis.text.x = element_text(angle = 90,vjust=0.5, size=7))
## `summarise()` has grouped output by 'Registered.City'. You can override using
## the `.groups` argument.