分析二手車品牌的價格性能、註冊量、油品價格
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(corrplot)
library(scales)
## Warning: package 'scales' was built under R version 3.3.2
library(showtext)
## Warning: package 'showtext' was built under R version 3.3.2
## Loading required package: sysfonts
showtext.begin()
autos <- read.csv("~/Desktop/used-cars-database/autos/autos.csv",encoding = "UTF-8", stringsAsFactors = FALSE)
autos[which(autos$vehicleType == ""),]$vehicleType = "無"
autos[which(autos$vehicleType == "andere"),]$vehicleType = "其他"
autos[which(autos$vehicleType == "bus"),]$vehicleType = "巴士"
autos[which(autos$vehicleType == "cabrio"),]$vehicleType = "敞篷車"
autos[which(autos$vehicleType == "coupe"),]$vehicleType = "四門轎車"
autos[which(autos$vehicleType == "kleinwagen"),]$vehicleType = "小型車"
autos[which(autos$vehicleType == "kombi"),]$vehicleType = "旅行車"
autos[which(autos$vehicleType == "limousine"),]$vehicleType = "豪華轎車"
autos[which(autos$vehicleType == "suv"),]$vehicleType = "運動休旅車"
autos$vehicleType <- factor(autos$vehicleType,levels = c("無","其他","小型車","巴士","四門轎車",
"旅行車","運動休旅車","敞篷車","豪華轎車"))
autos[which(autos$fuelType == ""),]$fuelType = "無"
autos[which(autos$fuelType == "andere"),]$fuelType = "其他"
autos[which(autos$fuelType == "benzin"),]$fuelType = "輕質汽油"
autos[which(autos$fuelType == "cng"),]$fuelType = "天然氣"
autos[which(autos$fuelType == "diesel"),]$fuelType = "柴油"
autos[which(autos$fuelType == "elektro"),]$fuelType = "電動"
autos[which(autos$fuelType == "hybrid"),]$fuelType = "混合動力"
autos[which(autos$fuelType == "lpg"),]$fuelType = "液化石油氣"
autos$fuelType <- factor(autos$fuelType, levels = c("無","其他","輕質汽油","天然氣",
"柴油","電動","混合動力","液化石油氣"))
autos[which(autos$notRepairedDamage == ""),]$notRepairedDamage = "無"
autos[which(autos$notRepairedDamage == "ja"),]$notRepairedDamage = "有"
autos[which(autos$notRepairedDamage == "nein"),]$notRepairedDamage = "沒有"
autos$notRepairedDamage <- factor(autos$notRepairedDamage, levels = c("無","沒有","有"))
autos$dateCrawled <- ymd_hms(autos$dateCrawled)
autos$dateCreated <- ymd_hms(autos$dateCreated)
autos$lastSeen <- ymd_hms(autos$lastSeen)
auto <- autos[,c(-3,-4,-18)]
auto <- autos %>% filter(price<100000, price>100,
yearOfRegistration>1990,yearOfRegistration<2017,
powerPS<1500,powerPS>30)
auto1930 <- autos[,c(-3,-4,-18)] %>%
filter(price<100000, price>100,
yearOfRegistration>1930,yearOfRegistration<2017,
powerPS<1500,powerPS>30)
auto$model <- strsplit(auto$name, split = "_") %>%
sapply("[[", 1)
auto1930$model <- strsplit(auto1930$name,split = "_") %>%
sapply("[[",1)
showtext.end()
好奇的點:價格跟性能相關性不高
corrdf <- auto[,c("price","powerPS","kilometer","monthOfRegistration","yearOfRegistration")]
corrmodel <- cor(corrdf) %>% corrplot.mixed()
ggplot(auto, aes(x = price)) +
stat_density(fill = "lightpink") +
scale_x_log10(labels = dollar_format(prefix = "",suffix = "€")) +
ggtitle("Vehicle Price")
ggplot(auto, aes(x = powerPS)) +
stat_density(fill = "lightpink") +
scale_x_log10() +
ggtitle("Vehicle PowerPS")
ggplot(auto, aes(x = kilometer)) +
stat_density(fill = "lightpink") +
ggtitle("Vehicle Kilometer")
group_by(auto, yearOfRegistration, monthOfRegistration) %>%
summarise(month = length(monthOfRegistration)) %>%
ggplot(aes(x = monthOfRegistration, y = month,fill = month)) +
geom_bar(stat = "identity") +
facet_wrap(~yearOfRegistration) +
ggtitle(expression(atop("Vehicle by month and year of registration",
atop(plain("Number of Vehicle"),"")))) +
theme(plot.title = element_text(size = 25, face = "plain",vjust = -1)) +
labs(x = "Month", y = NULL)
group_by(auto1930, yearOfRegistration) %>%
summarise(year = length(yearOfRegistration)) %>%
ggplot(aes(x = yearOfRegistration,y = year, fill = year)) +
geom_bar(stat = "identity") +
ggtitle(expression(atop("Vehicles by year of registration",
atop(plain("Number of Vehicle"),"")))) +
theme(plot.title = element_text(size = 25, face = "plain")) +
labs(x = "Year", y = NULL)
不分群,發現其實高消費力的人不少,所以決定分成幾個族群
ggplot(auto, aes(x = vehicleType, y = price, fill = vehicleType)) +
geom_boxplot() +
theme(text = element_text(family = "楷體-繁 黑體",size = 12))
filter(auto,price<15000) %>%
ggplot(aes(x = vehicleType, y = price, fill = vehicleType)) +
geom_boxplot() +
theme(text = element_text(family = "楷體-繁 黑體",size = 12)) +
ggtitle(expression(atop("Vehicle by Price",atop(plain("Price"),"")))) +
theme(plot.title = element_text(size = 25,face = "plain")) +
labs(x = NULL,y = NULL)
filter(auto,price>15000,price<40000) %>%
ggplot(aes(x = vehicleType, y = price, fill = vehicleType)) +
geom_boxplot() +
theme(text = element_text(family = "楷體-繁 黑體",size = 12)) +
ggtitle(expression(atop("Vehicle by Price",atop(plain("Price"),"")))) +
theme(plot.title = element_text(size = 25,face = "plain")) +
labs(x = NULL,y = NULL)
filter(auto,price>40000) %>%
ggplot(aes(x = vehicleType, y = price, fill = vehicleType)) +
geom_boxplot() +
theme(text = element_text(family = "楷體-繁 黑體",size = 12)) +
ggtitle(expression(atop("Vehicle by Price",atop(plain("Price"),"")))) +
theme(plot.title = element_text(size = 25,face = "plain")) +
labs(x = NULL,y = NULL)
ggplot(auto, aes(x = vehicleType, y = price, fill = vehicleType)) +
geom_boxplot() +
theme(text = element_text(family = "楷體-繁 黑體",size = 12)) +
scale_y_log10(labels = dollar_format(prefix = "",suffix = "€")) +
ggtitle(expression(atop("Vehicle by Price",atop(plain("Price"),"")))) +
theme(plot.title = element_text(size = 25,face = "plain")) +
labs(x = NULL,y = NULL)
filter(auto, gearbox == c("manuell","automatik")) %>%
ggplot(aes(x = gearbox,y = price, fill = gearbox)) +
geom_boxplot() +
scale_y_log10(labels = dollar_format(prefix = "",suffix = "€")) +
ggtitle(expression(atop("Gearbox by Price",atop(plain("Price"),"")))) +
theme(plot.title = element_text(size = 25,face = "plain")) +
labs(x = NULL,y = NULL)
ggplot(auto, aes(x = fuelType, y = price, fill = fuelType)) +
geom_boxplot() +
scale_y_log10(labels = dollar_format(prefix = "",suffix = "€")) +
theme(text = element_text(family = "楷體-繁 黑體",size = 12)) +
ggtitle(expression(atop("FuelType by price",atop(plain("Price"),"")))) +
theme(plot.title = element_text(size = 25,face = "plain")) +
labs(x = NULL,y = NULL)
ggplot(auto, aes(x = notRepairedDamage, y = price, fill = notRepairedDamage)) +
geom_boxplot() +
scale_y_log10(labels = dollar_format(prefix = "",suffix = "€")) +
theme(text = element_text(family = "楷體-繁 黑體",size = 15)) +
ggtitle(expression(atop("Vehicle by notRepairedDamage",atop(plain("Price"),"")))) +
theme(plot.title = element_text(size = 20,face = "plain")) +
labs(x = NULL,y = NULL)
沒有去維修的人比較多,客人可能想說壞了就算了
總共有40個品牌,價格由低到高,這裡拿前13個品牌的價格與性能做比較
看出其實價格最低的dewoo的性能反而比價格較高的某些品牌性能要好
價格最高的jaguar性能跟價格低一些的jeep性能差不多
x <- group_by(auto, brand) %>%
summarise(meanprice = mean(price),meanpowerPS = mean(powerPS)) %>%
arrange(meanprice) %>%
slice(1:13)
x$brand <- factor(x$brand,levels = x$brand)
ggplot(x,aes(x = brand,y = meanprice,color = brand,group = meanprice)) +
geom_point(size = 8) +
scale_y_log10() +
geom_point(aes(y = meanpowerPS),size = 8) +
ggtitle("Brand by meamprice and meanpowerPS from 1 to 13") +
theme(plot.title = element_text(size = 15,face = "plain"),
axis.text.x = element_text(size = 5,face = "bold",color = "blue")) +
labs(x = NULL,y = NULL) +
guides(col = guide_legend(nrow = 10))
14~26個品牌價格,看出到skoda品牌為止價格都是差不多的
像saab這品牌價格跟其他差不多,不過性能卻比其他出眾,可以考慮
y <- group_by(auto, brand) %>%
summarise(meanprice = mean(price),meanpowerPS = mean(powerPS)) %>%
arrange(meanprice) %>%
slice(14:26)
y$brand <- factor(y$brand,levels = y$brand)
ggplot(y,aes(x = brand,y = meanprice,color = brand,group = meanprice)) +
geom_point(size = 8) +
scale_y_log10() +
geom_point(aes(y = meanpowerPS),size = 8) +
ggtitle("Brand by meamprice and meanpowerPS from 14 to 26") +
theme(plot.title = element_text(size = 15,face = "plain"),
axis.text.x = element_text(size = 5,face = "bold",color = "blue")) +
labs(x = NULL,y = NULL) +
guides(col = guide_legend(nrow = 10))
z <- group_by(auto, brand) %>%
summarise(meanprice = mean(price),meanpowerPS = mean(powerPS)) %>%
arrange(meanprice) %>%
slice(27:40)
z$brand <- factor(z$brand,levels = z$brand)
ggplot(z,aes(x = brand,y = meanprice,color = brand,group = meanprice)) +
geom_point(size = 8) +
scale_y_log10() +
geom_point(aes(y = meanpowerPS),size = 8) +
ggtitle("Brand by meamprice and meanpowerPS from 27 to 40") +
theme(plot.title = element_text(size = 15,face = "plain"),
axis.text.x = element_text(size = 5,face = "bold",color = "blue")) +
labs(x = NULL,y = NULL) +
guides(col = guide_legend(nrow = 10))