Introduction
This notebook uses Germany Cars sample dataset from Zenrow, by way of Kaggle. The dataset contains information of 46405 old and new vehicles that are registered between 2011 and 2021 in Germany, collected on 2021/06/09, scraped from AutoScout24 by Zenrow.
Data dictionary
| No. | Feature | Description |
|---|---|---|
| 1 | mileage | kilometres traveled by the vehicle |
| 2 | make | make of the car |
| 3 | model | model of the car |
| 4 | fuel | fuel type |
| 5 | gear | manual or automatic |
| 6 | offerType | type of offer (new, used, …) |
| 7 | price | sale price of the vehicle |
| 8 | hp | horse power |
| 9 | year | the vehicle registration year |
# load libraries
library(tidyverse, warn.conflicts = F)
library(scales)
library(skimr)
library(janitor)
library(psych)
library(gghalves)
library(ggdist)
library(ggbump)
library(ggstatsplot)
library(gt)
library(ggsci)
library(colorspace)
library(factoextra)
# set theme
theme_set(theme_minimal(base_size = 10))
theme_update(panel.grid.minor=element_blank(),
plot.title.position="plot",
axis.title=element_text(size=9),
legend.title=element_text(size=9),
plot.margin=ggplot2::margin(1,1,1,1,"cm"))
# suppress summarise info
options(dplyr.summarise.inform = FALSE)
# import data
de_cars = read_csv("autoscout24-germany-dataset.csv") %>%
clean_names()
── Column specification ─────────────────────────────────────────────────────────────────────────
cols(
mileage = col_double(),
make = col_character(),
model = col_character(),
fuel = col_character(),
gear = col_character(),
offerType = col_character(),
price = col_double(),
hp = col_double(),
year = col_double()
)
glimpse(de_cars)
Rows: 46,405
Columns: 9
$ mileage <dbl> 235000, 92800, 149300, 96200, 156000, 147000, 91894, 127500, 115000, 104, 59…
$ make <chr> "BMW", "Volkswagen", "SEAT", "Renault", "Peugeot", "Toyota", "Renault", "Ope…
$ model <chr> "316", "Golf", "Exeo", "Megane", "308", "Auris", "Scenic", "Zafira", "3", "T…
$ fuel <chr> "Diesel", "Gasoline", "Gasoline", "Gasoline", "Gasoline", "Electric/Gasoline…
$ gear <chr> "Manual", "Manual", "Manual", "Manual", "Manual", "Automatic", "Manual", "Ma…
$ offer_type <chr> "Used", "Used", "Used", "Used", "Used", "Used", "Used", "Used", "Used", "Use…
$ price <dbl> 6800, 6877, 6900, 6950, 6950, 6950, 6970, 6972, 6980, 6990, 6990, 6990, 6990…
$ hp <dbl> 116, 122, 160, 110, 156, 99, 131, 116, 150, 86, 101, 105, 204, 141, 120, 60,…
$ year <dbl> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
# data summary
skim(de_cars)
── Data Summary ────────────────────────
Values
Name de_cars
Number of rows 46405
Number of columns 9
_______________________
Column type frequency:
character 5
numeric 4
________________________
Group variables None
── Variable type: character ─────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 make 0 1 2 16 0 77 0
2 model 143 0.997 1 28 0 841 0
3 fuel 0 1 3 17 0 11 0
4 gear 182 0.996 6 14 0 3 0
5 offer_type 0 1 3 14 0 5 0
── Variable type: numeric ───────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
1 mileage 0 1 71178. 62625. 0 19800 60000 105000 1111111 ▇▁▁▁▁
2 price 0 1 16572. 19305. 1100 7490 10999 19490 1199900 ▇▁▁▁▁
3 hp 29 0.999 133. 75.4 1 86 116 150 850 ▇▂▁▁▁
4 year 0 1 2016. 3.16 2011 2013 2016 2019 2021 ▇▅▆▆▆
# missing data
sapply(de_cars, function(x) sum(is.na(x)))
mileage make model fuel gear offer_type price hp
0 0 143 0 182 0 0 29
year
0
# check for duplicates (rows)
# de_cars %>% get_dupes()
# drop duplicates and NA in hp
cars_cln = de_cars %>% distinct() %>% drop_na(hp)
dim(cars_cln)
[1] 44241 9
# count of observations by registration year
cars_cln %>% count(year)
# boxplot (hp, mileage, price)
cars_cln %>% mutate(id=row_number()) %>%
select(where(is.numeric)) %>%
select(-year) %>%
pivot_longer(!id) %>%
ggplot(aes(y=value)) +
geom_half_violin(aes(fill=name, color=name),side="r", nudge = 0.05) +
geom_half_boxplot(aes(color=name),outlier.shape = 21, outlier.alpha = 0.6, outlier.size = 1, outlier.fill = NA) +
#geom_boxplot(outlier.alpha = 0.8, outlier.shape = 21, show.legend = F) +
facet_wrap(~name, scales = "free", ncol=1, strip.position = "left") +
theme(legend.position="none",
axis.text.y=element_blank(),
panel.grid.major.y=element_blank(),
strip.text.y.left=element_text(angle=0)) +
scale_y_continuous(labels=comma_format()) +
scale_x_continuous(limits=c(-0.4,0.45)) +
scale_color_npg() +
scale_fill_npg() +
coord_flip()
# price summary by offer_type
psych::describeBy(cars_cln$price, cars_cln$offer_type, mat=T) %>% arrange(desc(n))
cars_cln %>% group_by(year, offer_type) %>% tally() %>%
mutate(prop=round(n/sum(n)*100,1)) %>%
ggplot(aes(x=factor(year), y=offer_type, fill=prop)) +
geom_tile(size=2, color="white") +
geom_text(aes(label=paste(prop,"%"), color=I(ifelse(prop>90,"white","black"))),size=2.8) +
scale_x_discrete(position="top") +
scale_fill_continuous_sequential(palette="heat") +
theme(legend.position="none",
panel.grid=element_line(size=.2),
axis.title=element_text(face="bold")) +
labs(x="Registration year\n", y="Offer type", subtitle="Proportion of offer type by registration year\n")
# used vehicle subset
used = cars_cln %>% filter(offer_type=="Used")
# count of used vehicle by registration year
used %>% count(year)
# make count
cars_cln %>% group_by(make) %>% tally(sort=T)
# make count by registration year
# get top 7 makes by count
top_make = cars_cln %>% filter(year<2021) %>%
group_by(make) %>% tally(sort=T) %>% slice(1:7)
make = cars_cln %>% filter(year<2021) %>%
filter(make %in% top_make$make) %>%
group_by(year,make) %>% tally()
# plot
make %>%
ggplot(aes(year, n, color=make)) +
geom_bump() +
geom_point(size=2) +
geom_text(data = make %>% filter(year==2020), aes(label=make, x=2020.2),size=3, hjust=0) +
scale_color_d3() +
scale_x_continuous(limits=c(2011,2021)) +
theme(legend.position = "none",
plot.title=element_text(size=10),
plot.subtitle=element_text(size=8),
axis.title = element_text(color="grey30")) +
labs(x="Registration year", y="Vehicle count",
title="Count of vehicle make by registration year",
subtitle="7 makes with highest vehicle count from 2011 to 2020")
# sum of cars in 2020
cars_cln %>% filter(year==2020) %>% count()
# proportion of car makes in 2020
make_sum = cars_cln %>% filter(year==2020) %>%
mutate(make_grp = fct_lump(factor(make), 10)) %>%
count(make_grp, sort=T) %>%
mutate(make_grp = fct_rev(fct_inorder(make_grp)),
make_grp=fct_relevel(make_grp,"Other",after=0),
perc= paste0(sprintf("%2.1f", n/sum(n)*100),"%"),
color=case_when(row_number()==1 ~"grey",
row_number()==2 ~"#ee9b00",
row_number()==3 ~"#4DBBD5FF",
row_number()==4 ~"#00A087FF",
TRUE~"grey55")
)
ggplot(make_sum, aes(n, make_grp, fill=color)) +
geom_col(width=0.8) +
geom_text(aes(label=perc), hjust=1.2, nudge_x = -.5, size=3.2, fontface="bold") +
geom_text(aes(x=-10,label=make_grp, color=color),
size=3.5, hjust=1, nudge_x=-0.5, fontface="bold") +
scale_x_continuous(expand = expansion(mult = c(.2,.1))) +
scale_fill_identity(guide="none") +
scale_color_identity(guide="none") +
theme_void(base_size=10) +
theme(plot.title=element_text(hjust=0.03),
plot.subtitle=element_text(hjust=0.03, color="grey55",size=10),
plot.margin=ggplot2::margin(1,1,1,1,"cm")
) +
labs(title="Proportion of car makes in 2020", subtitle = "(4118 new and old cars)\n")
# most popular car by registration year
cars_cln %>%
filter(year<2021) %>%
group_by(year) %>%
count(make, model, sort=T) %>%
slice(1)
# Volkswagen golf price by registration year
cars_cln %>%
filter(year<2021) %>%
filter(model=="Golf") %>%
ggplot(aes(x=factor(year), y=price)) +
ggdist::stat_halfeye(width = .6, .width = 0, justification = -.2, point_colour = NA, alpha=0.7) +
geom_boxplot(width = .1, outlier.shape = NA, color="#f77f00", fill=NA) +
gghalves::geom_half_point(side = "l", range_scale = .5, alpha = .3,size=.9, shape=21) +
theme(panel.grid.major.y=element_blank()) +
coord_flip() +
theme(legend.position="none",
axis.title.y=element_text(margin=margin(r=5)),
axis.title.x=element_text(margin=margin(t=5))) +
labs(x="Price", y="Registration year", subtitle="Volkswagen golf price by registration year")
medprice = cars_cln %>%
mutate(make_grp = fct_lump(factor(make), 10)) %>%
group_by(make_grp,year) %>%
summarise(median_price = median(price)) %>%
ungroup() %>%
group_by(make_grp) %>%
mutate(max_price = max(median_price)) %>%
arrange(max_price) %>%
ungroup() %>%
mutate(make_grp=fct_inorder(make_grp),
make_grp=fct_relevel(make_grp,"Other",after=0))
medprice %>%
ggplot(aes(y=make_grp, x=median_price, color=year)) +
geom_line(aes(group=make_grp),color="grey") +
geom_point() +
scale_color_continuous_sequential(palette="batlow") +
scale_x_continuous(breaks=seq(5000,60000,10000), expand=c(.01,.01),
labels=scales::comma_format()) +
theme(legend.position="top",
panel.grid.major.y=element_blank(),
panel.grid.major.x=element_line(size=.3),
plot.margin=ggplot2::margin(.5,.5,.5,.5,"cm")) +
guides(color = guide_colorbar(title.position = "top",
title.hjust = .5,
barwidth = unit(20, "lines"),
barheight = unit(.3, "lines"))) +
labs(color="Registration year", subtitle="Median price by make and registration year",
x="Median Price",y="Make")
# price summary by gear
psych::describeBy(cars_cln$price, cars_cln$gear, mat=T, ) %>% arrange(desc(n))
# proportion of manual and automatic gear 2011-2020
cars_cln %>%
filter(gear=="Manual"| gear=="Automatic") %>%
filter(year<2021) %>%
group_by(year, gear) %>% tally() %>%
mutate(prop=n/sum(n)) %>%
ggplot(aes(y=fct_rev(factor(year)), x=prop, fill=fct_rev(gear))) +
geom_col(alpha=0.9,width=0.6) +
theme(panel.grid.major.y=element_blank()) +
theme(axis.title=element_blank(),
plot.margin=ggplot2::margin(1,2,1,2,"cm"),
legend.position = "top",
legend.justification = "left",
legend.margin = margin(l=3, t=3.5,b=0),
legend.text = element_text(size=9),
legend.title=element_blank()) +
scale_fill_npg() +
scale_x_continuous(expand=c(.01,.01), labels=percent_format(), position="top") +
guides(fill=guide_legend(keyheight=0.8,keywidth=0.4, reverse=T)) +
ggtitle("Porportion of manual and automatic cars, 2011 to 2020")
# median price by gear (manual, automatic) and year
gear_agg = cars_cln %>% filter(gear=="Manual"| gear=="Automatic") %>%
group_by(gear, year) %>%
summarise(med = median(price)) %>%
ungroup() %>%
group_by(gear) %>%
mutate(current = med[which(year == 2021)])
gear_agg %>%
ggplot(aes(x=gear, y=med)) +
geom_col(
aes(color = factor(year), fill = I(ifelse(year==2021,"#0077b6","grey"))),
position = position_dodge(), size = .001
) +
theme(legend.position="none",
panel.grid=element_line(size=.3),
axis.text.y=element_blank(),
axis.title.y=element_blank(),
axis.text.x.bottom = element_blank(),
axis.title.x.bottom = element_blank(),
panel.grid.major.y=element_blank()) +
geom_text(
data = gear_agg %>% filter(gear == "Manual") %>%
mutate(year_lab = if_else(year %in% c(2011, 2016, 2021), as.character(year), "•")),
aes(y = 0, label = year_lab, color = factor(year), size = year_lab == "•"),
position = position_dodge(width = .9),
hjust = 2
) +
geom_text(
data = gear_agg %>% filter(gear == "Automatic") %>%
mutate(year_lab = if_else(year %in% c(2011, 2016, 2021), as.character(year), "•")),
aes(y = 0, label = year_lab, color = factor(year), size = year_lab == "•"),
position = position_dodge(width = .9),
hjust = 2
) +
scale_color_manual(values = c("grey50","grey50","grey50","grey50","grey50",
"grey50","grey50","grey50","grey50","grey50","#0077b6")) +
coord_flip() +
scale_y_continuous(label=scales::comma_format(),limits=c(-4000,38000),
sec.axis = dup_axis(name = "Median price"),
breaks=seq(0,30000,10000)) +
annotate("text", y=-3500, x=1.15, label="Automatic", hjust=1,size=3, angle=90) +
annotate("text", y=-3500, x=2.15, label="Manual", hjust=1,size=3, angle=90) +
labs(subtitle="Median price by registration year and gear")
# price summary by fuel
psych::describeBy(cars_cln$price, cars_cln$fuel, mat=T) %>% arrange(desc(n))
# median price by fuel and registration year
cars_cln %>% mutate(fuel_grp = fct_lump(factor(fuel), 2)) %>%
group_by(fuel_grp, year) %>%
summarise(med = median(price)) %>%
ggplot(aes(y=factor(year),x=med,
fill=factor(fuel_grp, levels=c("Gasoline","Other","Diesel", ordered=T)))) +
geom_line(aes(group=year), color="grey") +
geom_point(size=2.5, alpha=0.75, shape=21, color="black") +
scale_fill_d3() +
scale_x_continuous(labels=scales::comma_format(),
limits=c(5000,42000),
breaks=seq(5000,42000,10000)) +
theme(legend.position = "top",
panel.grid=element_line(size=.3),
axis.title.y=element_text(margin=margin(r=10)),
axis.title.x=element_text(margin=margin(t=8))) +
labs(fill="Fuel type",x="Median price", y="Registration Year",
subtitle="Median price by fuel and registration year")
# price and registration year (2011-2020)
cars_cln %>%
filter(year<2021) %>%
ggplot(aes(x=factor(year), y=price, color=fct_rev(factor(year)))) +
geom_half_violin(size=0.5, alpha=0.5, show.legend=F, side="r") +
geom_half_point(size=1, alpha=0.5, show.legend=F, side="l") +
theme(panel.grid.major.y=element_blank(),
plot.margin=ggplot2::margin(.1,.5,.1,.1,"cm")) +
scale_y_continuous(labels = unit_format(unit = "K", scale = 1e-3))+
labs(subtitle="Price and registration year\n",y="Price",x="Year") +
scale_color_futurama() +
coord_flip()
# boxplot (without outliers)
# filtering function - turns outliers into NAs to be removed
filter_lims <- function(x){
l <- boxplot.stats(x)$stats[1]
u <- boxplot.stats(x)$stats[5]
for (i in 1:length(x)){
x[i] <- ifelse(x[i]>l & x[i]<u, x[i], NA)
}
return(x)
}
# plot
cars_cln %>% filter(year<2021) %>%
group_by(year) %>%
mutate(price2= filter_lims(price)) %>%
ggplot(aes(x=factor(year), y=price2, color=fct_rev(factor(year)))) +
geom_boxplot(na.rm = TRUE, coef = 5) +
scale_y_continuous(labels = scales::comma_format())+
theme(panel.grid.major.y=element_blank(),
legend.position="none") +
coord_flip() +
scale_color_futurama() +
labs(x="Year",y="Price", subtitle="Boxplot without outliers: price and registration year\n")
# proportion of price groups by registration year
summary(cars_cln$price)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1100 7490 10990 16554 19490 1199900
cars2 = cars_cln %>% filter(year<2021)
cars2$price_group = cut(cars2$price, breaks=c(1100,7490,10990,19490,1199900),
labels=c("1,100 to 7,490","7,490 to 10,990","10,990 to 19,490","19,490 to 119,900"),
include.lowest = T)
cars2 %>% count(price_group)
cars2 %>% group_by(year, price_group) %>% tally() %>%
mutate(prop=n/sum(n)) %>%
ggplot(aes(x=factor(year), y=prop, fill=price_group)) +
geom_col(width=0.8, alpha=0.9) +
coord_flip() +
theme(legend.position="top") +
scale_fill_npg(guide=guide_legend(reverse=T)) +
labs(fill="Price group",y="Proportion", x="Registration year",
subtitle="Proportion of price group by registration year")
# used vehicle prices by registration year
used = cars_cln %>% filter(offer_type=="Used") %>% filter(year<2021)
used_table = psych::describeBy(used_2021$price, used_2021$year, mat=T)
used_table %>% select(group1, mean, sd, median, min, max, range, skew) %>%
rename(year = group1) %>%
gt() %>%
fmt_number(
columns = c("mean","sd","median","min","max","range","skew"),
decimals=0) %>%
data_color(
columns =c("mean","sd","median","min","max","range","skew"),
colors = scales::col_numeric(
palette = c("#ffffff", "#f2fbd2", "#c9ecb4", "#93d3ab", "#35b0ab"),
domain = NULL)) %>%
tab_style(
style = list(
cell_borders(
sides = "bottom",
color = "black",
weight = px(3)
)
),
locations = list(
cells_column_labels(
columns = gt::everything()
)
)
) %>%
tab_options(table.font.size=13) %>%
tab_header(title=md("**Used Vehicle Prices by Registration Year**"))
| Used Vehicle Prices by Registration Year | |||||||
|---|---|---|---|---|---|---|---|
| year | mean | sd | median | min | max | range | skew |
| 2011 | 7,066 | 6,738 | 5,975 | 1,100 | 220,000 | 218,900 | 14 |
| 2012 | 8,273 | 7,218 | 6,900 | 1,250 | 149,000 | 147,750 | 10 |
| 2013 | 9,288 | 9,011 | 7,900 | 1,900 | 295,000 | 293,100 | 18 |
| 2014 | 9,968 | 9,271 | 8,497 | 1,500 | 349,000 | 347,500 | 20 |
| 2015 | 11,170 | 13,103 | 9,000 | 1,832 | 465,000 | 463,168 | 21 |
| 2016 | 12,741 | 12,664 | 9,900 | 3,000 | 399,999 | 396,999 | 13 |
| 2017 | 18,370 | 24,588 | 12,948 | 2,300 | 1,199,900 | 1,197,600 | 29 |
| 2018 | 22,019 | 19,385 | 17,410 | 3,979 | 499,800 | 495,821 | 8 |
| 2019 | 24,632 | 25,773 | 18,770 | 2,450 | 717,078 | 714,628 | 13 |
| 2020 | 30,129 | 22,400 | 24,723 | 2,499 | 349,000 | 346,501 | 5 |
# mileage and registration year
cars_cln %>% filter(year<2021) %>%
ggplot(aes(x=factor(year), y=mileage)) +
geom_half_boxplot(width=.5, outlier.color = NA) +
geom_half_point(side="r", range_scale=.5, alpha=0.3, size=1, color="#3C5488FF") +
scale_y_continuous(labels = scales::comma_format()) +
theme(panel.grid.major.x=element_blank(),
axis.title.y.left = element_text(margin=margin(r=5)),
axis.title=element_text(size=9, face="bold", color="grey50")) +
labs(x="Year",y="Mileage (in kilometers)",subtitle="Mileage and registration year")
# median mileage and registration year
cars_cln %>% filter(year<2021) %>%
filter(offer_type=="Used") %>%
group_by(year) %>%
summarise(median_mileage = median(mileage)) %>%
ggplot(aes(y=fct_rev(factor(year)), x=median_mileage)) +
geom_col(aes(fill=I(if_else(year==2011,"#3C5488FF","grey70"))),width=0.7, show.legend=F) +
geom_vline(color="white",xintercept=c(seq(0,100000,25000)), size=0.4) +
scale_x_continuous(limits=c(0,125000), expand=c(0,0), position = "top",
breaks=seq(0,100000,25000)) +
#scale_fill_gradientn(colours = wes_palette("Zissou1", 10, type = "continuous"), trans="reverse") +
theme_light(base_size=10) +
theme(panel.grid.major.x=element_blank(),
axis.title=element_text(size=9, face="bold", color="grey50"),
axis.title.y.left = element_text(margin=margin(r=5)),
axis.text.x.top = element_text(margin=margin(b=5,t=-5), vjust=-1),
axis.ticks.y=element_blank(),
panel.border = element_blank(),
panel.grid.minor = element_blank(),
axis.ticks.length=unit(.2, "cm"),
plot.margin=ggplot2::margin(1,1,1,1,"cm"),
plot.title.position = "plot"
) +
labs(y="Year",x="Median Mileage\n", subtitle = "Used vehicles: Median mileage by registration year\n")
# hp and price
cars_cln %>% filter(year<2021) %>%
ggplot(aes(x=factor(year), y=hp)) +
geom_half_boxplot(width=.5, outlier.color = NA) +
geom_half_point(side="r", range_scale=.5, alpha=0.2, size=1, color="#E64B35FF") +
theme(panel.grid.major.x=element_blank(),
axis.title.y.left = element_text(margin=margin(r=5)),
axis.title=element_text(size=9, face="bold", color="grey50")) +
labs(x="Year",y="Horsepower",subtitle="Horsepower and registration year")
cars_cln2 = cars_cln %>% mutate(id=row_number())
#used cars with registration year from 2011 to 2020
cars_n = cars_cln %>%
filter(offer_type=="Used") %>%
filter(year<2021) %>%
select(year, price, mileage, hp) %>%
drop_na()
# correlation
set.seed(123)
ggcorrmat(
data=cars_n,
cor.vars=c(year:hp),
title="Correlation",
)
cars_n2 = cars_num %>%
# get zscore
mutate(zscore_p =(price- mean(price))/ sd(price),
zscore_m =(mileage- mean(mileage))/ sd(mileage),
zscore_hp =(hp- mean(hp))/ sd(hp)) %>%
# drop outliers
filter(between(zscore_p,-3,3)) %>%
filter(between(zscore_m,-3,3)) %>%
filter(between(zscore_hp,-3,3)) %>%
# select variables
select(year, price, mileage,hp)
dim(cars_n)
[1] 38285 4
dim(cars_n2)
[1] 37028 4
# correlation after dropping outliers
set.seed(123)
ggcorrmat(
data=cars_n2,
cor.vars=c(year:hp),
title="Correlation",
)
# used cars registered between 2017 to 2019
used2 = cars_cln %>%
filter(offer_type=="Used") %>%
filter(between(year, 2017,2019)) %>%
drop_na(year, price, mileage,hp)
# drop outliers
used2b = used2 %>%
mutate(zscore_p =(price- mean(price))/ sd(price),
zscore_m =(mileage- mean(mileage))/ sd(mileage),
zscore_hp =(hp- mean(hp))/ sd(hp)) %>%
filter(between(zscore_p,-3,3)) %>%
filter(between(zscore_m,-3,3)) %>%
filter(between(zscore_hp,-3,3)) %>%
select(-zscore_p,-zscore_m,-zscore_hp,-offer_type)
dim(used2)
[1] 11813 9
dim(used2b)
[1] 11334 8
used3 = used2b %>% select(price, mileage,hp)
# scale
used3_scaled = scale(used3)
#hierarchical clustering
set.seed(1234)
hc= hclust(dist(used3_scaled))
plot(hc)
# check optimal clusters: elbow method
set.seed(123)
fviz_nbclust(used3_scaled,kmeans,method="wss")
#k means: 4 clusters
set.seed(1234)
k4= kmeans(used3_scaled,centers=4,nstart=50)
k4
K-means clustering with 4 clusters of sizes 1855, 1288, 3005, 5186
Cluster means:
price mileage hp
1 -0.3570169 1.73860167 -0.03468078
2 1.9694059 0.02292972 2.05642370
3 0.5368099 -0.30873698 0.43343275
4 -0.6724724 -0.44868588 -0.74948058
Clustering vector:
[1] 3 3 3 2 2 4 4 4 1 4 4 1 4 4 4 4 1 4 4 2 3 3 3 3 2 2 2 2 3 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3
[46] 3 3 3 3 3 3 3 3 4 4 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 3 2 4 4 1 4 4 1
[91] 4 2 2 2 2 2 2 2 2 2 2 3 3 2 3 2 3 3 2 3 2 2 2 2 3 2 2 3 2 3 3 2 1 3 2 3 3 2 3 2 3 3 2 2 2
[136] 2 3 2 3 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 4 4 1 4 1 4 1 4 4 1 4 4 3 4 4 3 3 3 4 2 2
[181] 2 2 2 2 2 2 2 2 2 2 2 1 4 4 4 4 4 4 4 4 1 4 4 4 1 1 4 1 4 4 4 1 4 1 4 4 1 1 4 4 4 4 4 4 4
[226] 4 4 4 4 1 4 4 4 4 1 4 4 3 3 3 3 3 1 3 1 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 1 4 3 4 1 3 3 3 4
[271] 3 4 4 4 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
[316] 4 4 4 4 1 4 4 4 4 4 4 4 4 4 1 4 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 1 4 1 1 4 4 1 1
[361] 4 4 1 1 1 4 4 3 4 1 3 2 2 3 1 3 3 2 2 2 3 2 2 2 2 2 2 3 3 3 1 3 3 3 3 3 3 3 3 2 3 3 2 2 2
[406] 4 1 1 4 4 4 4 4 1 4 4 4 4 1 1 4 4 4 1 2 2 2 2 2 2 4 4 4 1 4 4 4 4 1 4 4 3 3 3 2 3 3 3 3 2
[451] 2 2 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 1 4 4 4 4 4 4 4 4 4
[496] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 3 4 2 3 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 3 2 2 2 3
[541] 2 2 2 2 2 2 2 2 2 3 3 3 3 3 1 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
[586] 3 1 3 3 1 3 1 3 3 1 1 1 1 3 3 3 3 1 4 4 4 4 4 4 1 4 4 4 4 4 4 4 3 3 1 1 4 4 1 1 1 1 4 4 1
[631] 1 4 4 1 4 1 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 3 3 1 2 3 2 3 2 3 2 1 3 3 2 3 3 3 3 2 1 1
[676] 3 1 1 3 1 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 1 4 4 4 4 4 4 4
[721] 4 4 1 4 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 1 4 4 1 1 1 4 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
[766] 4 4 4 4 4 3 3 3 2 3 3 2 4 1 4 4 4 4 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 3 2 3 2 3
[811] 3 3 2 2 3 2 3 3 2 2 2 2 4 4 1 4 4 1 1 1 4 1 4 4 4 4 4 4 1 4 4 4 3 2 2 3 3 2 4 4 1 1 1 4 4
[856] 4 4 4 4 3 2 3 2 3 3 2 2 2 3 2 3 2 2 2 2 2 2 4 4 4 4 4 1 4 4 4 4 4 4 1 1 4 1 4 4 1 1 4 4 1
[901] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 3 4 3 4 3 3 3 3 3 4 3 3 3 3 3 3 3 4 1 4 4 4 4 1 4 1 4 4 4
[946] 2 2 2 2 2 2 2 2 2 2 2 4 4 4 3 3 3 3 3 3 3 3 2 3 2 3 2 3 2 2 3 3 3 4 1 4 4 4 4 4 4 4 4 1 4
[991] 4 4 4 1 1 4 4 4 3 3
[ reached getOption("max.print") -- omitted 10334 entries ]
Within cluster sum of squares by cluster:
[1] 2208.972 2777.538 2287.290 2417.063
(between_SS / total_SS = 71.5 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss"
[7] "size" "iter" "ifault"
fviz_cluster(k4, data=used3_scaled, labelsize=0) #cluster plot
with(used3,pairs(used3_scaled,col=(1:4)[k4$cluster])) #pair plot
colnames(used2b)
[1] "mileage" "make" "model" "fuel" "gear" "offer_type" "price"
[8] "hp" "year"
# summary by cluster id
used2c = used2b %>% mutate_at(vars(make, model, fuel, gear, year), list(factor))
used2c$clustid=as.factor(k4$cluster)
by(used2c,used2c$clustid,summary)
used2c$clustid: 1
mileage make model fuel gear
Min. : 65197 Volkswagen:339 Astra : 187 Diesel :1443 Automatic : 833
1st Qu.: 86826 Opel :306 Focus : 101 Gasoline : 384 Manual :1014
Median :102434 Ford :244 Golf : 94 Electric/Gasoline: 17 Semi-automatic: 2
Mean :108217 Skoda :205 Octavia: 82 LPG : 7 NA's : 6
3rd Qu.:126521 Audi :128 Tiguan : 44 Electric : 2
Max. :171423 BMW :122 (Other):1343 -/- (Fuel) : 1
(Other) :511 NA's : 4 (Other) : 1
price hp year clustid
Min. : 2300 Min. : 58.0 2017:1071 1:1855
1st Qu.: 9850 1st Qu.:110.0 2018: 675 2: 0
Median :13950 Median :136.0 2019: 109 3: 0
Mean :15392 Mean :138.6 4: 0
3rd Qu.:19950 3rd Qu.:160.0
Max. :44990 Max. :313.0
------------------------------------------------------------------------
used2c$clustid: 2
mileage make model fuel
Min. : 10 Audi :302 XC90 : 88 Diesel :691
1st Qu.: 25583 BMW :271 A6 : 79 Gasoline :551
Median : 43766 Volvo :205 530 : 65 Electric/Gasoline: 30
Mean : 47934 Mercedes-Benz:158 XC60 : 40 Electric : 7
3rd Qu.: 66115 Volkswagen :128 Touareg: 36 Electric/Diesel : 7
Max. :160000 Land : 41 (Other):976 LPG : 2
(Other) :183 NA's : 4 (Other) : 0
gear price hp year clustid
Automatic :1268 Min. :23550 Min. :140.0 2017:363 1: 0
Manual : 19 1st Qu.:34999 1st Qu.:250.0 2018:519 2:1288
Semi-automatic: 0 Median :39980 Median :272.0 2019:406 3: 0
NA's : 1 Mean :43204 Mean :284.7 4: 0
3rd Qu.:47990 3rd Qu.:320.0
Max. :89900 Max. :417.0
------------------------------------------------------------------------
used2c$clustid: 3
mileage make model fuel
Min. : 10 Volkswagen :547 Tiguan : 138 Diesel :1450
1st Qu.:19490 Audi :427 A4 : 117 Gasoline :1412
Median :33719 BMW :352 Golf : 94 Electric/Gasoline: 96
Mean :36281 Ford :282 Kuga : 94 Electric : 28
3rd Qu.:52300 Mercedes-Benz:267 A3 : 64 CNG : 8
Max. :90450 Volvo :165 (Other):2490 Electric/Diesel : 5
(Other) :965 NA's : 8 (Other) : 6
gear price hp year clustid
Automatic :2272 Min. :14330 Min. : 68.0 2017: 724 1: 0
Manual : 727 1st Qu.:21980 1st Qu.:150.0 2018:1043 2: 0
Semi-automatic: 0 Median :25350 Median :170.0 2019:1238 3:3005
NA's : 6 Mean :26078 Mean :171.3 4: 0
3rd Qu.:29490 3rd Qu.:190.0
Max. :55900 Max. :288.0
------------------------------------------------------------------------
used2c$clustid: 4
mileage make model fuel gear
Min. : 0 Opel : 827 Fiesta : 398 Gasoline :4438 Automatic : 775
1st Qu.:16436 Volkswagen: 742 Corsa : 362 Diesel : 490 Manual :4381
Median :28300 Ford : 641 up! : 349 Electric : 158 Semi-automatic: 2
Mean :31363 Renault : 431 Polo : 197 Electric/Gasoline: 61 NA's : 28
3rd Qu.:44164 smart : 305 forTwo : 191 CNG : 18
Max. :80400 Fiat : 274 (Other):3661 Others : 13
(Other) :1966 NA's : 28 (Other) : 8
price hp year clustid
Min. : 2450 Min. : 1.00 2017:1701 1: 0
1st Qu.: 8990 1st Qu.: 71.00 2018:1637 2: 0
Median :10980 Median : 86.00 2019:1848 3: 0
Mean :11621 Mean : 88.65 4:5186
3rd Qu.:13795 3rd Qu.:102.00
Max. :23820 Max. :150.00
cluster size: c4 (n=5186) > c3 (n=3005) > c1 (n=1855) > c2 (n=1288)
mean price: c2 > c3 > c1 > c4
mean mileage: c1 > c2 > c3 > c4
mean hp: c2 > c3 > c1 > c4
summary of cluster mean
summary of other features by cluster ID