Load data and parse dates

# Read the raw listings as UTF-8; character columns stay character
# (no factor conversion).
autos <- read.csv(
  "~/Desktop/used-cars-database/autos/autos.csv",
  encoding = "UTF-8",
  stringsAsFactors = FALSE
)

# Working copy without columns 2, 3, 4, 6 and 18 (dropped by position).
auto <- autos[, -c(2, 3, 4, 6, 18)]

# Parse the three timestamp columns from text into date-times.
# NOTE(review): ymd_hms() is not defined in this section -- presumably
# lubridate::ymd_hms; confirm the package is attached earlier.
auto[c("dateCrawled", "dateCreated", "lastSeen")] <-
  lapply(auto[c("dateCrawled", "dateCreated", "lastSeen")], ymd_hms)

Price Cleaning

# Price deciles. The recorded output below shows a minimum of 0 and a
# maximum of 99999999, so both tails need trimming.
quantile(auto$price, probs = seq(0, 1, by = 0.1))
##       0%      10%      20%      30%      40%      50%      60%      70% 
##        0      500      900     1400     2000     2950     4150     5999 
##      80%      90%     100% 
##     8800    14000 99999999

# Three candidate trimming windows, drawn side by side:
#   p1: 10%-95%, p2: 5%-90%, p3: 5%-95%.
# The y limits are set from the price quantiles of the untrimmed data.
p1 <- ggplot(auto, aes(x = "price", y = price)) +
  geom_boxplot() +
  ylim(quantile(auto$price, c(0.1, 0.95)))

p2 <- ggplot(auto, aes(x = "price", y = price)) +
  geom_boxplot() +
  ylim(quantile(auto$price, c(0.05, 0.90)))

p3 <- ggplot(auto, aes(x = "price", y = price)) +
  geom_boxplot() +
  ylim(quantile(auto$price, c(0.05, 0.95)))

grid.arrange(p1, p2, p3, nrow = 1)

# Keep only rows whose price lies strictly between the 5% and 95% quantiles.
# Both bounds are taken from the data as it is before this filter runs.
auto <- local({
  price_bounds <- quantile(auto$price, c(0.05, 0.95))
  auto[auto$price > price_bounds[[1]] & auto$price < price_bounds[[2]], ]
})

PowerPS Cleaning

# powerPS in 5% steps. The recorded output shows the 5% quantile is still 0
# and the maximum is 19208.
quantile(auto$powerPS, probs = seq(0, 1, by = 0.05))
##    0%    5%   10%   15%   20%   25%   30%   35%   40%   45%   50%   55% 
##     0     0    26    56    60    72    75    85    90   101   105   111 
##   60%   65%   70%   75%   80%   85%   90%   95%  100% 
##   116   125   136   143   150   170   180   218 19208

# Left: full range. Right: y axis limited to the 5%-95% quantile window.
p1 <- ggplot(auto, aes(x = "powerPS", y = powerPS)) +
  geom_boxplot()

p2 <- ggplot(auto, aes(x = "powerPS", y = powerPS)) +
  geom_boxplot() +
  ylim(quantile(auto$powerPS, c(0.05, 0.95)))

grid.arrange(p1, p2, nrow = 1)

# Keep rows with powerPS strictly inside the 5%-95% quantile window.
# which() turns the mask into row positions, so NA comparisons are dropped.
auto <- local({
  ps_bounds <- quantile(auto$powerPS, c(0.05, 0.95))
  inside <- auto$powerPS > ps_bounds[[1]] & auto$powerPS < ps_bounds[[2]]
  auto[which(inside), ]
})

# Re-check the distribution after trimming (recorded range: 1 to 217).
quantile(auto$powerPS)
##   0%  25%  50%  75% 100% 
##    1   75  109  140  217

VehicleType Cleaning

# Discard listings whose vehicleType is the empty string.
auto <- auto[which(auto[["vehicleType"]] != ""), ]

Choose registration year range

# Listings per registration year. The recorded counts are sparse before 1960
# and after 2016.
table(auto[["yearOfRegistration"]])
## 
## 1910 1929 1930 1933 1935 1937 1940 1942 1943 1945 1947 1949 1950 1951 1952 
##    2    1    1    1    1    1    1    1    2    1    1    1    3    3    1 
## 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 
##    6    6    2    4    5    6    8   18    8    8   16   21   17   33   25 
## 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 
##   32   43   54   66   78   59   59   42   53   62  101  102   87   99  132 
## 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 
##  167  170  182  188  275  383  501  793 1072 1199 1321 1865 2966 4232 5799 
## 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 
## 7238 9402 8257 8538 8268 8561 8253 8136 8218 7059 6482 6540 4972 4563 3482 
## 2013 2014 2015 2016 2017 2018 
## 1959 1145  506   64    5    1

# p1: overall spread of registration years.
p1 <- ggplot(auto, aes(x = "year", y = yearOfRegistration)) +
  geom_boxplot()

# p2: the same spread, split by vehicle type.
p2 <- ggplot(auto, aes(x = vehicleType, y = yearOfRegistration)) +
  geom_boxplot()

# p3: price against registration year for 1960-2016, one panel per
# vehicle type, with a smoothed trend on top of the points.
p3 <- ggplot(auto, aes(x = yearOfRegistration, y = price)) +
  geom_point() +
  geom_smooth() +
  xlim(c(1960, 2016)) +
  facet_wrap(~vehicleType)

grid.arrange(p1, p2, p3)
## `geom_smooth()` using method = 'gam'

# Restrict to registration years 1960 through 2016, both ends included.
auto <- local({
  reg_year <- auto$yearOfRegistration
  auto[which(reg_year >= 1960 & reg_year <= 2016), ]
})

Remove rows with registration month 0 or missing values

# Drop rows with registration month 0 or an empty fuelType, gearbox, model
# or notRepairedDamage. A row survives only if every test is TRUE; which()
# discards rows where the combined test is FALSE or NA.
auto <- auto[which(
  auto$monthOfRegistration != 0 &
    auto$fuelType != "" &
    auto$gearbox != "" &
    auto$model != "" &
    auto$notRepairedDamage != ""
), ]

Remove rows where the time between dateCreated and lastSeen exceeds 30 days

# diff_time: days elapsed between dateCreated and lastSeen, stored as a
# plain number.
auto <- mutate(
  auto,
  diff_time = as.numeric(difftime(lastSeen, dateCreated, units = "days"))
)

# Distribution in 5% steps; the recorded 95% quantile is about 27 days.
quantile(auto$diff_time, probs = seq(0, 1, by = 0.05))
##           0%           5%          10%          15%          20% 
##   0.02518519   0.60150463   0.82170602   1.45036806   2.28293287 
##          25%          30%          35%          40%          45% 
##   2.67510417   3.13694444   4.26140046   4.76307407   5.99321528 
##          50%          55%          60%          65%          70% 
##   6.76231481   8.13616898   9.47132407  10.90708565  12.55429167 
##          75%          80%          85%          90%          95% 
##  14.32403935  16.78133796  19.30232639  22.88797685  27.23921296 
##         100% 
## 239.86590278

# Keep rows with at most 30 days, then round to two decimals. The filter
# runs on the unrounded values.
auto <- auto[which(auto$diff_time <= 30), ]
auto$diff_time <- round(auto$diff_time, digits = 2)

Postal code

# Normalise the postalCode column in place.
# NOTE(review): clean.zipcodes() is not defined in this section --
# presumably zipcode::clean.zipcodes; confirm it behaves as intended for
# this dataset's postal codes.
auto[["postalCode"]] <- clean.zipcodes(auto[["postalCode"]])

VehicleType

# Bar chart of listings per vehicle type, bars ordered from most to least
# frequent. The counts are precomputed, hence stat = "identity".
auto %>%
  group_by(vehicleType) %>%
  summarise(vehicleTypes = n()) %>%
  ggplot(aes(x = reorder(vehicleType, desc(vehicleTypes)), y = vehicleTypes)) +
  geom_bar(aes(fill = vehicleType), stat = "identity", color = "Black") +
  scale_fill_brewer(type = "seq") +
  ggtitle("VehicleType")

Mean price point

# Price distribution per vehicle type, with the group mean overlaid as a
# point on each box.
# NOTE(review): `fun.y` is deprecated in newer ggplot2 releases in favour of
# `fun`; it is kept because the installed version is not visible here.
auto %>%
  ggplot(aes(x = vehicleType, y = price)) +
  geom_boxplot(mapping = aes(fill = vehicleType)) +
  stat_summary(geom = "point", fun.y = mean, size = 3)

PowerPS and price by vehicleType

# Price against engine power, one panel per vehicle type. The points are
# jittered and highly transparent (alpha = 1/25), with a smoothed trend
# drawn on top.
auto %>%
  ggplot(aes(x = powerPS, y = price)) +
  geom_point(position = "jitter", color = "red", alpha = 1 / 25) +
  geom_smooth() +
  ggtitle("Price vs powerPS by vehicleType") +
  facet_wrap(~vehicleType, nrow = 3)
## `geom_smooth()` using method = 'gam'

Price vs vehicleType by gearbox

For all vehicle types, prices are dominated by automatik.

# Price per vehicle type, with one box per gearbox value inside each type.
# The fill mapping is local to the boxplot layer; the mean-point layer only
# sees x = vehicleType.
# NOTE(review): this presumably gives one mean point per vehicle type rather
# than per gearbox -- confirm that is intended.
auto %>%
  ggplot(aes(x = vehicleType, y = price)) +
  geom_boxplot(mapping = aes(fill = gearbox)) +
  stat_summary(geom = "point", fun.y = mean, size = 3) +
  ggtitle("Price vs vehicleType by gearbox")

Price vs diff_time by vehicleType

# Mean price, median price and listing count for each combination of
# vehicleType and diff_time.
x <- auto %>%
  group_by(vehicleType, diff_time) %>%
  summarise(
    meanprice = mean(price),
    medianprice = median(price),
    count = n()
  )

# One smoothed curve per vehicle type for each summary, stacked vertically.
p1 <- ggplot(x, aes(x = diff_time, y = meanprice, color = vehicleType)) +
  geom_smooth()
p2 <- ggplot(x, aes(x = diff_time, y = medianprice, color = vehicleType)) +
  geom_smooth()
p3 <- ggplot(x, aes(x = diff_time, y = count, color = vehicleType)) +
  geom_smooth()

grid.arrange(p1, p2, p3)