In summary, we will see overall car price deprication by year, how car mileage and model becomes less important the older the car. Basically car prices converge as cars get older, which makes sense because the scale of prices is gettting smaller.
Apply some filters to the data and bin the mileage numerical values . . .
#filter years
data1$Year=as.numeric(data1$Year)
data2<-subset(data1, Year>2000)
data2$Year=as.factor(data2$Year)
#Filter Price
data2<-subset(data2, Price<=30000)
#bin mileage
data2$mileageM=data2$mileageM/1000
data2$MileageBin<-cut(data2$mileageM, seq(0,200,50), right=FALSE)
Let me have a general look at prices by Year. Wow! look at the depreciation of a new car.
#main barplot
plot(data2$Year, data2$Price)
###Frequency Distribution Table for Car Make As before . . . create a frequency table so that we can focus on the the top 10 Makes.
#count Make
data2$Make=as.factor(data2$Make)
y=count(data2, 'Make')
y1=y[order(y$freq, decreasing=TRUE), ]
y2=y1[1:10,]
y2
## Make freq
## 52 Volkswagen 4580
## 13 Ford 3103
## 49 Toyota 2887
## 3 BMW 2741
## 2 Audi 2531
## 34 Opel 1464
## 35 Peugeot 1390
## 33 Nissan 1332
## 38 Renault 1236
## 29 Mercedes-Benz 1157
Lets look at the frequency distribution by the top 10 manufacturers by Year. . . The distributions all look pretty similar. Perhaps Audi is a little flatter than the others.
par(mfrow = c(3, 4))
for (i in 1:10){
data2a<-subset(data2, Make==y2[i,1])
#plot(data2a$Year, data2a$Price, main=y2[i,1])
plot(data2a$Year, main=y2[i,1])
}
Lets look at depreciation by make. All look pretty similar at a first review.
par(mfrow = c(3, 4))
for (i in 1:10){
data2a<-subset(data2, Make==y2[i,1])
plot(data2a$Year, data2a$Price, main=y2[i,1])
}
Combine the frequency distribution and the depreciation graphs together for ease of review . . . .
par(mfrow = c(1, 2))
for (i in 1:10){
data2a<-subset(data2, Make==y2[i,1])
plot(data2a$Year, data2a$Price, main=y2[i,1])
plot(data2a$Year, main=y2[i,1])
}
A look at the distribution by Year of registration . . .
plot(data2a$Year)#, main=y2[i,1])
Note the lack of outliers for 2003/2002 registered cars considering the fequency for this year.
#all data binned
ggplot(data2a, aes(x=Year, y=Price, fill=MileageBin))+
geom_boxplot()+
facet_grid(.~Year, scales="free")+
labs(x="X (binned)")+
theme(axis.text.x=element_blank())+
ggtitle("Price Vs Year/Mileage")+
labs(x="Year",y="Price(Euros)")
Re-creating the same plot, except this time by Make. And ill look at the top 3 makes for simplicity. What is becoming obvious, is that mileage appears to have less influence over the price of a car as it becomes older.
plot_list = list()
for (i in 1:10){
data3a<-subset(data2, Make==y2[i,1])
p=ggplot(data3a, aes(x=Year, y=Price, fill=MileageBin))+
geom_boxplot()+
facet_grid(.~Year, scales="free")+
labs(x="X (binned)")+
theme(axis.text.x=element_blank())+
ggtitle(y2[i,1])+
labs(x="Year",y="Price(Euros)")
plot_list[[i]] = p
}
plot_list[[1]]
plot_list[[2]]
plot_list[[3]]
#title1=textGrob("Car Price by Make, Year and Mileage", gp=gpar(fontface="bold"))
#grid.arrange(plot_list[[1]], plot_list[[2]], ncol=1, newpage = TRUE, top=title1)
#grid.arrange(plot_list[[3]], plot_list[[4]], ncol=1, newpage = TRUE, top=title1)
Ill do the same again, this time looking at specific models.
#count Make Model
data2$Make=as.factor(data2$make_model1)
x=count(data2, 'make_model1')
x1=x[order(x$freq, decreasing=TRUE), ]
x2=x1[1:10,]
x2
## make_model1 freq
## 543 Volkswagen-Golf 1838
## 547 Volkswagen-Passat 1501
## 132 Ford-Focus 1500
## 33 BMW-3Series 1432
## 12 Audi-A4 1375
## 35 BMW-5Series 879
## 484 Toyota-Corolla 857
## 480 Toyota-Avensis 835
## 139 Ford-Mondeo 567
## 353 Opel-Astra 517
#unbinned by makemodel
par(mfrow = c(2, 2))
for (i in 1:4){
data3a<-subset(data2, make_model1==x2[i,1])
plot(data3a$Year, data3a$Price, main=x2[i,1])
}
###Multivariate plot of Price by Year and mileage bucket ###by model
plot_list = list()
for (i in 1:4){
data3a<-subset(data2, make_model1==x2[i,1])
p=ggplot(data3a, aes(x=Year, y=Price, fill=MileageBin))+
geom_boxplot()+
facet_grid(.~Year, scales="free")+
labs(x="X (binned)")+
theme(axis.text.x=element_blank())+
ggtitle(x2[i,1])+
labs(x="Year",y="Price(Euros)")+
ylim(0, 30000)
plot_list[[i]] = p
}
plot_list[[1]]
plot_list[[2]]
plot_list[[3]]
plot_list[[4]]
#title1=textGrob("Car Price by Model, Year and Mileage", gp=gpar(fontface="bold"))
#grid.arrange(plot_list[[1]], plot_list[[2]], ncol=1, newpage = TRUE, top=title1)
#grid.arrange(plot_list[[3]], plot_list[[4]], ncol=1, newpage = TRUE, top=title1)
Comparing Price of the top 5 models by Year of Registeration. Its clear to see that there are greater differences in price for newer car models compared to older cars. The order stays pretty consistant over the years also.
x3=x2[1:5,]
selectedRows <- (data2$make_model1 %in% x3$make_model1)
dfReduced <- data2[selectedRows,]
p=ggplot(dfReduced, aes(x=Year, y=Price, fill=make_model1))+
geom_boxplot()+
facet_grid(.~Year, scales="free")+
labs(x="X (binned)")+
theme(axis.text.x=element_blank())+
ggtitle("Comparison of Top 5 models by Year")+
labs(x="Year",y="Price(Euros)")+
ylim(0, 30000)
p