Kaggle’s database

分析二手車品牌的價格性能、註冊量、油品價格

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(corrplot)
library(scales)
## Warning: package 'scales' was built under R version 3.3.2
library(showtext)
## Warning: package 'showtext' was built under R version 3.3.2
## Loading required package: sysfonts

Clearing Data

showtext.begin()
autos <- read.csv("~/Desktop/used-cars-database/autos/autos.csv",encoding = "UTF-8", stringsAsFactors = FALSE)
autos[which(autos$vehicleType == ""),]$vehicleType = "無"
autos[which(autos$vehicleType == "andere"),]$vehicleType = "其他"
autos[which(autos$vehicleType == "bus"),]$vehicleType = "巴士"
autos[which(autos$vehicleType == "cabrio"),]$vehicleType = "敞篷車"
autos[which(autos$vehicleType == "coupe"),]$vehicleType = "四門轎車"
autos[which(autos$vehicleType == "kleinwagen"),]$vehicleType = "小型車"
autos[which(autos$vehicleType == "kombi"),]$vehicleType = "旅行車"
autos[which(autos$vehicleType == "limousine"),]$vehicleType = "豪華轎車"
autos[which(autos$vehicleType == "suv"),]$vehicleType = "運動休旅車"
autos$vehicleType <- factor(autos$vehicleType,levels = c("無","其他","小型車","巴士","四門轎車",
                                    "旅行車","運動休旅車","敞篷車","豪華轎車"))
autos[which(autos$fuelType == ""),]$fuelType = "無"
autos[which(autos$fuelType == "andere"),]$fuelType = "其他"
autos[which(autos$fuelType == "benzin"),]$fuelType = "輕質汽油"
autos[which(autos$fuelType == "cng"),]$fuelType = "天然氣"
autos[which(autos$fuelType == "diesel"),]$fuelType = "柴油"
autos[which(autos$fuelType == "elektro"),]$fuelType = "電動"
autos[which(autos$fuelType == "hybrid"),]$fuelType = "混合動力"
autos[which(autos$fuelType == "lpg"),]$fuelType = "液化石油氣"
autos$fuelType <- factor(autos$fuelType, levels = c("無","其他","輕質汽油","天然氣",
                                                    "柴油","電動","混合動力","液化石油氣"))
autos[which(autos$notRepairedDamage == ""),]$notRepairedDamage = "無"
autos[which(autos$notRepairedDamage == "ja"),]$notRepairedDamage = "有"
autos[which(autos$notRepairedDamage == "nein"),]$notRepairedDamage = "沒有"
autos$notRepairedDamage <- factor(autos$notRepairedDamage, levels = c("無","沒有","有"))
autos$dateCrawled <- ymd_hms(autos$dateCrawled)
autos$dateCreated <- ymd_hms(autos$dateCreated)
autos$lastSeen <- ymd_hms(autos$lastSeen)
auto <- autos[,c(-3,-4,-18)]
auto <- autos %>% filter(price<100000, price>100,
                        yearOfRegistration>1990,yearOfRegistration<2017,
                        powerPS<1500,powerPS>30)
auto1930 <- autos[,c(-3,-4,-18)] %>% 
    filter(price<100000, price>100,
           yearOfRegistration>1930,yearOfRegistration<2017,
           powerPS<1500,powerPS>30)
auto$model <- strsplit(auto$name, split = "_") %>% 
    sapply("[[", 1)
auto1930$model <- strsplit(auto1930$name,split = "_") %>%
    sapply("[[",1)
showtext.end()

Correlation plot

好奇的點:價格跟性能相關性不高

corrdf <- auto[,c("price","powerPS","kilometer","monthOfRegistration","yearOfRegistration")]
corrmodel <- cor(corrdf) %>% corrplot.mixed()

Vehicle Price

ggplot(auto, aes(x = price)) +
    stat_density(fill = "lightpink") +
    scale_x_log10(labels = dollar_format(prefix = "",suffix = "€")) +
    ggtitle("Vehicle Price")

PowerPS

ggplot(auto, aes(x = powerPS)) +
    stat_density(fill = "lightpink") + 
    scale_x_log10() +
    ggtitle("Vehicle PowerPS")

Kilometers

ggplot(auto, aes(x = kilometer)) +
    stat_density(fill = "lightpink") +
    ggtitle("Vehicle Kilometer")

Vehicle by month and year of registration

group_by(auto, yearOfRegistration, monthOfRegistration) %>%
    summarise(month = length(monthOfRegistration)) %>%
    ggplot(aes(x = monthOfRegistration, y = month,fill = month)) +
    geom_bar(stat = "identity") +
    facet_wrap(~yearOfRegistration) +
    ggtitle(expression(atop("Vehicle by month and year of registration",
                            atop(plain("Number of Vehicle"),"")))) +
    theme(plot.title = element_text(size = 25, face = "plain",vjust = -1)) +
    labs(x = "Month", y = NULL)

每年註冊數成長量變化

group_by(auto1930, yearOfRegistration) %>%
    summarise(year = length(yearOfRegistration)) %>%
    ggplot(aes(x = yearOfRegistration,y = year, fill = year)) +
    geom_bar(stat = "identity") + 
    ggtitle(expression(atop("Vehicles by year of registration",
                            atop(plain("Number of Vehicle"),"")))) +
    theme(plot.title = element_text(size = 25, face = "plain")) +
    labs(x = "Year", y = NULL)

車種的金額

不分群,發現其實高消費力的人不少,所以決定分成幾個族群

ggplot(auto, aes(x = vehicleType, y = price, fill = vehicleType)) + 
    geom_boxplot() +
    theme(text = element_text(family = "楷體-繁 黑體",size = 12))

價格介於100~15000之間

filter(auto,price<15000) %>%
    ggplot(aes(x = vehicleType, y = price, fill = vehicleType)) +
    geom_boxplot() +
    theme(text = element_text(family = "楷體-繁 黑體",size = 12)) +
    ggtitle(expression(atop("Vehicle by Price",atop(plain("Price"),"")))) +
    theme(plot.title = element_text(size = 25,face = "plain")) + 
    labs(x = NULL,y = NULL)

價格介於15000~40000之間

filter(auto,price>15000,price<40000) %>%
    ggplot(aes(x = vehicleType, y = price, fill = vehicleType)) +
    geom_boxplot() +
    theme(text = element_text(family = "楷體-繁 黑體",size = 12)) +
    ggtitle(expression(atop("Vehicle by Price",atop(plain("Price"),"")))) +
    theme(plot.title = element_text(size = 25,face = "plain")) + 
    labs(x = NULL,y = NULL)

大於40000以上,已經沒有小型車的價格

filter(auto,price>40000) %>%
    ggplot(aes(x = vehicleType, y = price, fill = vehicleType)) +
    geom_boxplot() +
    theme(text = element_text(family = "楷體-繁 黑體",size = 12)) +
    ggtitle(expression(atop("Vehicle by Price",atop(plain("Price"),"")))) +
    theme(plot.title = element_text(size = 25,face = "plain")) + 
    labs(x = NULL,y = NULL)

價格取對數減少價格之間的差距對圖的影響

ggplot(auto, aes(x = vehicleType, y = price, fill = vehicleType)) + 
    geom_boxplot() +
    theme(text = element_text(family = "楷體-繁 黑體",size = 12)) +
    scale_y_log10(labels = dollar_format(prefix = "",suffix = "€")) +
    ggtitle(expression(atop("Vehicle by Price",atop(plain("Price"),"")))) +
    theme(plot.title = element_text(size = 25,face = "plain")) + 
    labs(x = NULL,y = NULL)

常識?手排通常比自排貴?

filter(auto, gearbox == c("manuell","automatik")) %>%
    ggplot(aes(x = gearbox,y = price, fill = gearbox)) +
    geom_boxplot() + 
    scale_y_log10(labels = dollar_format(prefix = "",suffix = "€")) +
    ggtitle(expression(atop("Gearbox by Price",atop(plain("Price"),"")))) +
    theme(plot.title = element_text(size = 25,face = "plain")) + 
    labs(x = NULL,y = NULL)

油品價格分布

ggplot(auto, aes(x = fuelType, y = price, fill = fuelType)) + 
    geom_boxplot() +
    scale_y_log10(labels = dollar_format(prefix = "",suffix = "€")) +
    theme(text = element_text(family = "楷體-繁 黑體",size = 12)) +
    ggtitle(expression(atop("FuelType by price",atop(plain("Price"),"")))) +
    theme(plot.title = element_text(size = 25,face = "plain")) +
    labs(x = NULL,y = NULL)

維修狀況

ggplot(auto, aes(x = notRepairedDamage, y = price, fill = notRepairedDamage)) +
    geom_boxplot() +
    scale_y_log10(labels = dollar_format(prefix = "",suffix = "€")) +
    theme(text = element_text(family = "楷體-繁 黑體",size = 15)) +
    ggtitle(expression(atop("Vehicle by notRepairedDamage",atop(plain("Price"),"")))) +
    theme(plot.title = element_text(size = 20,face = "plain")) +
    labs(x = NULL,y = NULL)

沒有去維修的人比較多,客人可能想說壞了就算了

品牌的價格與性能

總共有40個品牌,價格由低到高,這裡拿前13個品牌的價格與性能做比較

看出其實價格最低的dewoo的性能反而比價格較高的某些品牌性能要好

價格最高的jaguar性能跟價格低一些的jeep性能差不多

x <- group_by(auto, brand) %>%
     summarise(meanprice = mean(price),meanpowerPS = mean(powerPS)) %>%
     arrange(meanprice) %>%
     slice(1:13)
x$brand <- factor(x$brand,levels = x$brand)
ggplot(x,aes(x = brand,y = meanprice,color = brand,group = meanprice)) +
     geom_point(size = 8) +
     scale_y_log10() +
     geom_point(aes(y = meanpowerPS),size = 8) +
     ggtitle("Brand by meamprice and meanpowerPS from 1 to 13") +
     theme(plot.title = element_text(size = 15,face = "plain"),
           axis.text.x = element_text(size = 5,face = "bold",color = "blue")) +
     labs(x = NULL,y = NULL) +
    guides(col = guide_legend(nrow = 10))

14~26個品牌價格,看出到skoda品牌為止價格都是差不多的

像saab這品牌價格跟其他差不多,不過性能卻比其他出眾,可以考慮

y <- group_by(auto, brand) %>%
     summarise(meanprice = mean(price),meanpowerPS = mean(powerPS)) %>%
     arrange(meanprice) %>%
     slice(14:26)
y$brand <- factor(y$brand,levels = y$brand)
ggplot(y,aes(x = brand,y = meanprice,color = brand,group = meanprice)) +
    geom_point(size = 8) +
    scale_y_log10() +
    geom_point(aes(y = meanpowerPS),size = 8) +
    ggtitle("Brand by meamprice and meanpowerPS from 14 to 26") +
    theme(plot.title = element_text(size = 15,face = "plain"),
          axis.text.x = element_text(size = 5,face = "bold",color = "blue")) +
    labs(x = NULL,y = NULL) +
    guides(col = guide_legend(nrow = 10))

z <- group_by(auto, brand) %>%
    summarise(meanprice = mean(price),meanpowerPS = mean(powerPS)) %>%
    arrange(meanprice) %>%
    slice(27:40)
z$brand <- factor(z$brand,levels = z$brand)
ggplot(z,aes(x = brand,y = meanprice,color = brand,group = meanprice)) +
    geom_point(size = 8) +
    scale_y_log10() +
    geom_point(aes(y = meanpowerPS),size = 8) +
    ggtitle("Brand by meamprice and meanpowerPS from 27 to 40") +
    theme(plot.title = element_text(size = 15,face = "plain"),
          axis.text.x = element_text(size = 5,face = "bold",color = "blue")) +
    labs(x = NULL,y = NULL) +
    guides(col = guide_legend(nrow = 10))