# Load the raw listings; text columns stay character rather than factor.
autos <- read.csv(
  "~/Desktop/used-cars-database/autos/autos.csv",
  encoding = "UTF-8",
  stringsAsFactors = FALSE
)

# Work on a copy without columns 2, 3, 4, 6 and 18 (dropped by position).
auto <- autos[, -c(2, 3, 4, 6, 18)]

# Parse the three timestamp columns into date-times.
auto[["dateCrawled"]] <- ymd_hms(auto[["dateCrawled"]])
auto[["dateCreated"]] <- ymd_hms(auto[["dateCreated"]])
auto[["lastSeen"]] <- ymd_hms(auto[["lastSeen"]])
# Deciles of the asking price. The recorded output below shows a minimum of 0
# and a maximum far above the 90th percentile.
quantile(auto$price, probs = seq(0, 1, by = 0.1))
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0 500 900 1400 2000 2950 4150 5999
## 80% 90% 100%
## 8800 14000 99999999

# Box plots of price restricted to three candidate percentile windows,
# shown side by side for comparison.
# Window 1: 10th to 95th percentile.
p1 <- ggplot(auto, aes(x = "price", y = price)) +
  geom_boxplot() +
  ylim(quantile(auto$price, c(0.1, 0.95)))
# Window 2: 5th to 90th percentile.
p2 <- ggplot(auto, aes(x = "price", y = price)) +
  geom_boxplot() +
  ylim(quantile(auto$price, c(0.05, 0.90)))
# Window 3: 5th to 95th percentile.
p3 <- ggplot(auto, aes(x = "price", y = price)) +
  geom_boxplot() +
  ylim(quantile(auto$price, c(0.05, 0.95)))
grid.arrange(p1, p2, p3, nrow = 1)

# Keep only rows whose price lies strictly between the 5th and 95th
# percentiles (both bounds are computed on the untrimmed data).
auto <- auto[
  with(auto, price > quantile(price, 0.05) & price < quantile(price, 0.95)),
]
# Distribution of engine power in 5% steps. The recorded output below shows
# zeros at the low end and a maximum of 19208.
quantile(auto$powerPS, probs = seq(0, 1, by = 0.05))
## 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55%
## 0 0 26 56 60 72 75 85 90 101 105 111
## 60% 65% 70% 75% 80% 85% 90% 95% 100%
## 116 125 136 143 150 170 180 218 19208

# Box plot of the full range next to one limited to the 5th-95th percentiles.
p1 <- ggplot(auto, aes(x = "powerPS", y = powerPS)) +
  geom_boxplot()
p2 <- ggplot(auto, aes(x = "powerPS", y = powerPS)) +
  geom_boxplot() +
  ylim(quantile(auto$powerPS, c(0.05, 0.95)))
grid.arrange(p1, p2, nrow = 1)

# Keep rows strictly inside the 5th-95th percentile range of powerPS;
# which() drops any comparison that evaluates to NA.
auto <- auto[
  which(with(
    auto,
    powerPS > quantile(powerPS, 0.05) & powerPS < quantile(powerPS, 0.95)
  )),
]

# Quartiles after trimming.
quantile(auto$powerPS)
## 0% 25% 50% 75% 100%
## 1 75 109 140 217
# Discard listings whose vehicleType is an empty string.
auto <- auto[which(with(auto, vehicleType != "")), ]

# Frequency of each registration year. The recorded output below includes
# years from 1910 up to 2018.
table(auto$yearOfRegistration)
##
## 1910 1929 1930 1933 1935 1937 1940 1942 1943 1945 1947 1949 1950 1951 1952
## 2 1 1 1 1 1 1 1 2 1 1 1 3 3 1
## 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967
## 6 6 2 4 5 6 8 18 8 8 16 21 17 33 25
## 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982
## 32 43 54 66 78 59 59 42 53 62 101 102 87 99 132
## 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997
## 167 170 182 188 275 383 501 793 1072 1199 1321 1865 2966 4232 5799
## 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
## 7238 9402 8257 8538 8268 8561 8253 8136 8218 7059 6482 6540 4972 4563 3482
## 2013 2014 2015 2016 2017 2018
## 1959 1145 506 64 5 1
# Registration year: overall box plot, box plot per vehicle type, and price
# against year (1960-2016 on the x axis) with a smoother, faceted by type.
p1 <- ggplot(auto, aes(x = "year", y = yearOfRegistration)) +
  geom_boxplot()
p2 <- ggplot(auto, aes(x = vehicleType, y = yearOfRegistration)) +
  geom_boxplot()
p3 <- ggplot(auto, aes(x = yearOfRegistration, y = price)) +
  geom_point() +
  geom_smooth() +
  xlim(1960, 2016) +
  facet_wrap(~vehicleType)
grid.arrange(p1, p2, p3)
## `geom_smooth()` using method = 'gam'

# Keep registrations from 1960 through 2016, inclusive on both ends.
auto <- auto[
  which(with(auto, yearOfRegistration >= 1960 & yearOfRegistration <= 2016)),
]
# Keep only rows where monthOfRegistration is non-zero and fuelType, gearbox,
# model and notRepairedDamage are all non-empty strings.
# NOTE(review): presumably month 0 encodes "unknown" -- confirm against the
# data dictionary.
auto <- auto[
  which(with(
    auto,
    monthOfRegistration != 0 &
      fuelType != "" &
      gearbox != "" &
      model != "" &
      notRepairedDamage != ""
  )),
]
# diff_time: elapsed days from dateCreated to lastSeen, as a plain number.
auto <- auto %>%
  mutate(
    diff_time = as.numeric(difftime(lastSeen, dateCreated, units = "days"))
  )

# Distribution of diff_time in 5% steps. In the recorded output below the
# 95th percentile is about 27 days and the maximum about 240 days.
quantile(auto$diff_time, probs = seq(0, 1, by = 0.05))
## 0% 5% 10% 15% 20%
## 0.02518519 0.60150463 0.82170602 1.45036806 2.28293287
## 25% 30% 35% 40% 45%
## 2.67510417 3.13694444 4.26140046 4.76307407 5.99321528
## 50% 55% 60% 65% 70%
## 6.76231481 8.13616898 9.47132407 10.90708565 12.55429167
## 75% 80% 85% 90% 95%
## 14.32403935 16.78133796 19.30232639 22.88797685 27.23921296
## 100%
## 239.86590278

# Keep rows with diff_time of at most 30 days, then round the kept values to
# two decimals (filtering happens before rounding).
auto <- auto[which(with(auto, diff_time <= 30)), ]
auto$diff_time <- round(auto$diff_time, digits = 2)
# Normalise postal codes with clean.zipcodes().
# NOTE(review): clean.zipcodes is not defined in this file; presumably it comes
# from the `zipcode` package -- confirm it is loaded.
auto[["postalCode"]] <- clean.zipcodes(auto[["postalCode"]])
# Bar chart of the number of listings per vehicle type, with bars ordered
# from the most to the least frequent type.
auto %>%
  group_by(vehicleType) %>%
  summarise(vehicleTypes = n()) %>%
  ggplot(aes(x = reorder(vehicleType, desc(vehicleTypes)), y = vehicleTypes)) +
  geom_bar(aes(fill = vehicleType), color = "Black", stat = "identity") +
  ggtitle("VehicleType") +
  scale_fill_brewer(type = "seq")
# Price distribution per vehicle type, with each type's mean price overlaid
# as a point on top of its box.
# `fun` replaces `fun.y`, which ggplot2 deprecated in 3.3.0 (requires
# ggplot2 >= 3.3.0).
ggplot(auto, aes(x = vehicleType, y = price)) +
  geom_boxplot(aes(fill = vehicleType)) +
  stat_summary(fun = mean, geom = "point", size = 3)
# Price against engine power, one panel per vehicle type (three rows of
# panels). Points are jittered and highly transparent to cope with
# overplotting; a smoother is drawn on top.
ggplot(data = auto, mapping = aes(x = powerPS, y = price)) +
  geom_point(position = "jitter", color = "red", alpha = 1/25) +
  geom_smooth() +
  ggtitle("Price vs powerPS by vehicleType") +
  facet_wrap(~vehicleType, nrow = 3)
## `geom_smooth()` using method = 'gam'
# Within every vehicle type, prices are dominated by automatic ("automatik") gearbox cars.
# Price distribution per vehicle type, split by gearbox via the fill colour.
# The overlaid point is the mean price computed from the plot-level mapping
# (vehicleType on x), not from the gearbox fill of the box plot layer.
# `fun` replaces `fun.y`, which ggplot2 deprecated in 3.3.0 (requires
# ggplot2 >= 3.3.0).
ggplot(auto, aes(x = vehicleType, y = price)) +
  geom_boxplot(aes(fill = gearbox)) +
  ggtitle("Price vs vehicleType by gearbox") +
  stat_summary(fun = mean, geom = "point", size = 3)
# Mean price, median price and number of listings for every
# (vehicleType, diff_time) combination.
x <- auto %>%
  group_by(vehicleType, diff_time) %>%
  summarise(
    meanprice = mean(price),
    medianprice = median(price),
    count = n()
  )

# Smoothed trend of each summary against diff_time, one coloured line per
# vehicle type, with the three plots arranged on a single page.
p1 <- ggplot(x) +
  geom_smooth(aes(x = diff_time, y = meanprice, color = vehicleType))
p2 <- ggplot(x) +
  geom_smooth(aes(x = diff_time, y = medianprice, color = vehicleType))
p3 <- ggplot(x) +
  geom_smooth(aes(x = diff_time, y = count, color = vehicleType))
grid.arrange(p1, p2, p3)