library(readxl)
library(lubridate)
library(ggplot2)
library(tidyr)
library(dplyr)
library(corrplot)
library(caret)
cars <- read_excel("/Users/aditya/Desktop/Capstone/autos.xlsx", sheet = "autos")
cars$dateCrawled <- ymd_hms(cars$dateCrawled)
cars$lastSeen <- as.Date(cars$lastSeen)
cars$dateCreated <- as.Date(cars$dateCreated)
cars$nrOfPictures <- NULL
cars$abtest <- NULL
cars$postalCode <- NULL
sapply(cars, function(x)sum(is.na(x)))
## dateCrawled name seller
## 1 15 1
## offerType price vehicleType
## 1 1 37870
## yearOfRegistration gearbox powerPS
## 2 20210 1
## model kilometer monthOfRegistration
## 20485 1 2
## fuelType brand notRepairedDamage
## 33388 2 72062
## dateCreated lastSeen
## 2 2
cars <- cars %>% drop_na(price)
cars <- cars %>% drop_na(powerPS)
cars <- cars %>% drop_na(vehicleType)
cars <- cars %>% drop_na(gearbox)
cars <- cars %>% drop_na(name)
cars <- cars %>% filter(price > quantile(cars$price,0.1,na.rm = T) & price < quantile(cars$price,0.99,na.rm = T))
cars <- cars %>% filter(powerPS > quantile(cars$powerPS,0.1) & powerPS < quantile(cars$powerPS,0.99))
cars <- cars %>% filter(vehicleType != "")
cars <- cars %>% filter(yearOfRegistration > 1980 & yearOfRegistration < 2016)
cars <- cars %>% filter(monthOfRegistration > 0)
cars <- cars %>% filter(fuelType != "")
cars <- cars %>% drop_na(model)
cars$daysonMarket <- as.integer(cars$lastSeen - cars$dateCreated)
cars$age <- as.factor(year(today()) - cars$yearOfRegistration)
cars <- cars %>% filter(brand != "trabant")
Embed Plots
#Price Histogram
ggplot(data = cars, aes(x = price)) +
geom_histogram(fill = 'deepskyblue4', bins = 200) +
scale_x_continuous(trans = "log10", breaks = c(100, 300, 1000, 3.e3, 1.e4, 3.e4, 1.e5, 3.e5)) +
geom_vline(xintercept = median(cars$price), colour = 'red', size = 1) +
geom_vline(xintercept = mean(cars$price), colour = 'green', size = 1)
#Power Histogram
ggplot(data = cars, aes(x = powerPS)) +
geom_histogram(binwidth = 5, fill = 'deepskyblue4') +
scale_x_continuous(breaks = seq(0, 350, 20)) +
geom_vline(xintercept = mean(cars$powerPS), size = 1, colour = 'red') +
geom_vline(xintercept = median(cars$powerPS), size = 1, colour = 'green')
#Year of Registration
ggplot(data = cars,aes(x = yearOfRegistration)) +
geom_histogram(binwidth = 1, fill = 'deepskyblue4') +
scale_x_continuous(breaks = seq(1981, 2016, 5)) +
geom_vline(xintercept = mean(cars$yearOfRegistration),size = 1, colour = 'red') +
geom_vline(xintercept = median(cars$yearOfRegistration),size = 1, colour = 'green')
ggplot(data = cars,aes(x = kilometer)) +
geom_histogram(binwidth = 5000, fill = 'deepskyblue4') +
scale_x_continuous(breaks = seq(5000, 150000, 15000)) +
geom_vline(xintercept = mean(cars$kilometer),size = 1, colour = 'red') +
geom_vline(xintercept = median(cars$kilometer),size = 1, colour = 'green')
ggplot(data = cars, aes(x = daysonMarket)) +
geom_histogram(binwidth = 5, fill = 'deepskyblue4') +
geom_vline(xintercept = mean(cars$daysonMarket), colour = 'red', size = 1) +
geom_vline(xintercept = median(cars$daysonMarket), colour = 'green', size = 1)
cars %>% count(brand) %>% top_n(10) %>% arrange(-n,brand) %>% ggplot(aes(x = reorder(brand,n), y = n)) +
geom_bar(stat = 'identity', fill = 'red',color='black') + coord_flip()
#Price vs Engine Facet by vehicle
ggplot(data = cars, aes(x = powerPS, y = log(price))) +
geom_point(fill = "deepskyblue4",shape = 21, alpha = .15, size = 1, position = 'jitter') +
geom_smooth() + facet_wrap(~vehicleType)
#Price vs Mileage
ggplot(data = cars,aes(x = kilometer, y = log(price))) +
geom_point(colour = 'deepskyblue4', shape = '.', alpha = .25, position = 'jitter') +
geom_smooth()
#Price vs Vehicle
ggplot(cars,aes(x=vehicleType, y=price)) +
geom_boxplot(aes(fill = vehicleType)) + stat_summary(fun.y = mean, geom="point", size=2)
#Price vs Fuel
ggplot(cars,aes(x=fuelType, y=price)) +
geom_boxplot(aes(fill = fuelType)) + stat_summary(fun.y = mean, geom="point", size=2)
#Price vs FuelType
ggplot(data = subset(cars, fuelType %in% c("benzin", "diesel")),
aes(x = powerPS, y = log(price), fill = fuelType)) +
geom_point(shape = 21, alpha = .15, size = 1, position = 'jitter') +
scale_fill_brewer(type = 'qual',guide = guide_legend(reverse = TRUE)) +
geom_smooth(aes(colour = fuelType))
#price vs fuel & gears
ggplot(data = subset(cars, fuelType %in% c("benzin", "diesel","lpg")),
aes(x = powerPS, y = log(price))) +
geom_point(shape = 21, fill = 'deepskyblue1', alpha = .05, position = 'jitter') +
geom_smooth() + facet_grid(gearbox ~ fuelType)
#Price vs Age of car
ggplot(data = cars,aes(x = yearOfRegistration, y = price)) +
geom_point(fill = 'deepskyblue4', shape = 21, alpha = .05,
position = 'jitter') +
coord_trans(x = 'identity', y = 'log10', limx = c(1980, 2016)) +
geom_smooth()
#Price vs VehicleType with gears
ggplot(aes(x= vehicleType, y=price), data=cars) +
geom_boxplot(aes(fill=gearbox), color='black')
#Price vs Age of vehicle
ggplot(aes(x=as.integer(age), y=price, alpha = 1/100), data = cars) +
geom_point() +
facet_wrap(~vehicleType) +
geom_smooth() +
labs(x= "Age of Vehicle", y= "Price",
title= "Price vs. Age by Vehicle Type")
#linear regression - base
cars_num <- cars[,c('kilometer','price','age','powerPS','daysonMarket')]
cars_num$age <- as.integer(cars_num$age)
corrplot(cor(cars_num),method='number')
#Dividing the data
set.seed(13579)
trainIndex <- createDataPartition(cars$price, p = 0.70, list = FALSE)
train <- cars[trainIndex, ]
test <- cars[-trainIndex, ]
#Linear Regression
fit <- lm(price ~ kilometer + as.numeric(age) + powerPS +
daysonMarket + gearbox +
vehicleType + fuelType, data=train)
summary(fit)
##
## Call:
## lm(formula = price ~ kilometer + as.numeric(age) + powerPS +
## daysonMarket + gearbox + vehicleType + fuelType, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24293 -2123 -300 1600 36488
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.018e+04 8.226e+02 12.376 < 2e-16 ***
## kilometer -5.387e-02 2.844e-04 -189.440 < 2e-16 ***
## as.numeric(age) -4.059e+02 2.091e+00 -194.114 < 2e-16 ***
## powerPS 5.150e+01 2.275e-01 226.372 < 2e-16 ***
## daysonMarket 4.299e+01 1.023e+00 42.012 < 2e-16 ***
## gearboxmanuell -7.302e+02 2.360e+01 -30.934 < 2e-16 ***
## vehicleTypebus 3.380e+02 1.108e+02 3.052 0.00228 **
## vehicleTypecabrio 1.890e+03 1.128e+02 16.752 < 2e-16 ***
## vehicleTypecoupe 1.090e+03 1.149e+02 9.487 < 2e-16 ***
## vehicleTypekleinwagen -5.482e+02 1.100e+02 -4.984 6.24e-07 ***
## vehicleTypekombi -7.543e+02 1.092e+02 -6.910 4.85e-12 ***
## vehicleTypelimousine -1.320e+01 1.087e+02 -0.121 0.90335
## vehicleTypesuv 1.798e+03 1.151e+02 15.618 < 2e-16 ***
## fuelTypebenzin 6.496e+02 8.146e+02 0.797 0.42523
## fuelTypecng 6.139e+02 8.410e+02 0.730 0.46541
## fuelTypediesel 2.482e+03 8.147e+02 3.046 0.00232 **
## fuelTypeelektro 1.995e+03 1.226e+03 1.627 0.10373
## fuelTypehybrid 2.323e+03 8.683e+02 2.676 0.00746 **
## fuelTypelpg -4.906e+02 8.175e+02 -0.600 0.54841
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3550 on 162103 degrees of freedom
## Multiple R-squared: 0.6903, Adjusted R-squared: 0.6903
## F-statistic: 2.007e+04 on 18 and 162103 DF, p-value: < 2.2e-16
plot(fit)