Summary

In summary, we will see overall car price deprication by year, how car mileage and model becomes less important the older the car. Basically car prices converge as cars get older, which makes sense because the scale of prices is gettting smaller.

Filters & Binning

Apply some filters to the data and bin the mileage numerical values . . .

#filter years
data1$Year=as.numeric(data1$Year)
data2<-subset(data1, Year>2000)
data2$Year=as.factor(data2$Year)

#Filter Price
data2<-subset(data2, Price<=30000)

#bin mileage
data2$mileageM=data2$mileageM/1000
data2$MileageBin<-cut(data2$mileageM, seq(0,200,50), right=FALSE)

Car Price vs Year (revisited)

Let me have a general look at prices by Year. Wow! look at the depreciation of a new car.

#main barplot
plot(data2$Year, data2$Price)

###Frequency Distribution Table for Car Make As before . . . create a frequency table so that we can focus on the the top 10 Makes.

#count Make
data2$Make=as.factor(data2$Make)
y=count(data2, 'Make')
y1=y[order(y$freq, decreasing=TRUE), ]
y2=y1[1:10,]
y2
##             Make freq
## 52    Volkswagen 4580
## 13          Ford 3103
## 49        Toyota 2887
## 3            BMW 2741
## 2           Audi 2531
## 34          Opel 1464
## 35       Peugeot 1390
## 33        Nissan 1332
## 38       Renault 1236
## 29 Mercedes-Benz 1157

Frequency Distribution by Make and Year

Lets look at the frequency distribution by the top 10 manufacturers by Year. . . The distributions all look pretty similar. Perhaps Audi is a little flatter than the others.

par(mfrow = c(3, 4))
for (i in 1:10){
  data2a<-subset(data2, Make==y2[i,1])
  #plot(data2a$Year, data2a$Price, main=y2[i,1])
  plot(data2a$Year, main=y2[i,1])
  }

Depreciation by Make

Lets look at depreciation by make. All look pretty similar at a first review.

par(mfrow = c(3, 4))
for (i in 1:10){
  data2a<-subset(data2, Make==y2[i,1])
  plot(data2a$Year, data2a$Price, main=y2[i,1])
  }

Depreciation by Make along with Frequency by Year

Combine the frequency distribution and the depreciation graphs together for ease of review . . . .

par(mfrow = c(1, 2))
for (i in 1:10){
  data2a<-subset(data2, Make==y2[i,1])
  plot(data2a$Year, data2a$Price, main=y2[i,1])
  plot(data2a$Year, main=y2[i,1])
  }

Frequency Distribution by Year (Revisited)

A look at the distribution by Year of registration . . .

plot(data2a$Year)#, main=y2[i,1])

Multivariate plot of Price by Year and mileage bucket

Note the lack of outliers for 2003/2002 registered cars considering the fequency for this year.

#all data binned
ggplot(data2a, aes(x=Year, y=Price, fill=MileageBin))+
  geom_boxplot()+
  facet_grid(.~Year, scales="free")+
  labs(x="X (binned)")+
  theme(axis.text.x=element_blank())+
    ggtitle("Price Vs Year/Mileage")+
    labs(x="Year",y="Price(Euros)")

Multivariate plot of Price by Year and mileage bucket

by Make

Re-creating the same plot, except this time by Make. And ill look at the top 3 makes for simplicity. What is becoming obvious, is that mileage appears to have less influence over the price of a car as it becomes older.

plot_list = list()
for (i in 1:10){
  
  data3a<-subset(data2, Make==y2[i,1])
  
  p=ggplot(data3a, aes(x=Year, y=Price, fill=MileageBin))+
    geom_boxplot()+
    facet_grid(.~Year, scales="free")+
    labs(x="X (binned)")+
    theme(axis.text.x=element_blank())+
    ggtitle(y2[i,1])+
    labs(x="Year",y="Price(Euros)")
  plot_list[[i]] = p
}

plot_list[[1]]

plot_list[[2]]

plot_list[[3]]

#title1=textGrob("Car Price by Make, Year and Mileage", gp=gpar(fontface="bold"))
#grid.arrange(plot_list[[1]], plot_list[[2]], ncol=1, newpage = TRUE, top=title1)
#grid.arrange(plot_list[[3]], plot_list[[4]], ncol=1, newpage = TRUE, top=title1)

Multivariate plot of Price by Year and mileage bucket

Ill do the same again, this time looking at specific models.

#count Make Model
data2$Make=as.factor(data2$make_model1)
x=count(data2, 'make_model1')
x1=x[order(x$freq, decreasing=TRUE), ]
x2=x1[1:10,]
x2
##           make_model1 freq
## 543   Volkswagen-Golf 1838
## 547 Volkswagen-Passat 1501
## 132        Ford-Focus 1500
## 33        BMW-3Series 1432
## 12            Audi-A4 1375
## 35        BMW-5Series  879
## 484    Toyota-Corolla  857
## 480    Toyota-Avensis  835
## 139       Ford-Mondeo  567
## 353        Opel-Astra  517
#unbinned by makemodel
par(mfrow = c(2, 2))

for (i in 1:4){
  data3a<-subset(data2, make_model1==x2[i,1])
  plot(data3a$Year, data3a$Price, main=x2[i,1])
}

###Multivariate plot of Price by Year and mileage bucket ###by model

plot_list = list()
for (i in 1:4){
  
  data3a<-subset(data2, make_model1==x2[i,1])
  
  p=ggplot(data3a, aes(x=Year, y=Price, fill=MileageBin))+
    geom_boxplot()+
    facet_grid(.~Year, scales="free")+
    labs(x="X (binned)")+
    theme(axis.text.x=element_blank())+
    ggtitle(x2[i,1])+
    labs(x="Year",y="Price(Euros)")+ 
    ylim(0, 30000)
  plot_list[[i]] = p
}

plot_list[[1]]

plot_list[[2]]

plot_list[[3]]

plot_list[[4]]

#title1=textGrob("Car Price by Model, Year and Mileage", gp=gpar(fontface="bold"))
#grid.arrange(plot_list[[1]], plot_list[[2]], ncol=1, newpage = TRUE, top=title1)
#grid.arrange(plot_list[[3]], plot_list[[4]], ncol=1, newpage = TRUE, top=title1)

Price vs Year by Top 5 Models

Comparing Price of the top 5 models by Year of Registeration. Its clear to see that there are greater differences in price for newer car models compared to older cars. The order stays pretty consistant over the years also.

x3=x2[1:5,]

selectedRows <- (data2$make_model1 %in% x3$make_model1)
dfReduced <- data2[selectedRows,]

p=ggplot(dfReduced, aes(x=Year, y=Price, fill=make_model1))+
    geom_boxplot()+
    facet_grid(.~Year, scales="free")+
    labs(x="X (binned)")+
    theme(axis.text.x=element_blank())+
    ggtitle("Comparison of Top 5 models by Year")+
    labs(x="Year",y="Price(Euros)")+ 
    ylim(0, 30000)

p