Packages Required

library(readxl)
library(lubridate)
library(ggplot2)
library(tidyr)
library(dplyr)
library(corrplot)
library(caret)

cars <- read_excel("/Users/aditya/Desktop/Capstone/autos.xlsx", sheet = "autos")
cars$dateCrawled <- ymd_hms(cars$dateCrawled) 
cars$lastSeen <- as.Date(cars$lastSeen)
cars$dateCreated <- as.Date(cars$dateCreated)

cars$nrOfPictures <- NULL
cars$abtest <- NULL
cars$postalCode <- NULL
sapply(cars, function(x)sum(is.na(x)))
##         dateCrawled                name              seller 
##                   1                  15                   1 
##           offerType               price         vehicleType 
##                   1                   1               37870 
##  yearOfRegistration             gearbox             powerPS 
##                   2               20210                   1 
##               model           kilometer monthOfRegistration 
##               20485                   1                   2 
##            fuelType               brand   notRepairedDamage 
##               33388                   2               72062 
##         dateCreated            lastSeen 
##                   2                   2
cars <- cars %>% drop_na(price)
cars <- cars %>% drop_na(powerPS)
cars <- cars %>% drop_na(vehicleType)
cars <- cars %>% drop_na(gearbox)
cars <- cars %>% drop_na(name)

cars <- cars %>% filter(price > quantile(cars$price,0.1,na.rm = T) & price < quantile(cars$price,0.99,na.rm = T))

cars <- cars %>% filter(powerPS > quantile(cars$powerPS,0.1) & powerPS < quantile(cars$powerPS,0.99))

cars <- cars %>% filter(vehicleType != "")

cars <- cars %>% filter(yearOfRegistration > 1980 & yearOfRegistration < 2016)

cars <- cars %>% filter(monthOfRegistration > 0)

cars <- cars %>% filter(fuelType != "")

cars <- cars %>% drop_na(model)

cars$daysonMarket <- as.integer(cars$lastSeen - cars$dateCreated)

cars$age <- as.factor(year(today()) - cars$yearOfRegistration)

cars <- cars %>% filter(brand != "trabant")

Including Plots

Embed Plots

#Price Histogram
ggplot(data = cars, aes(x = price)) + 
  geom_histogram(fill = 'deepskyblue4', bins = 200) +
  scale_x_continuous(trans = "log10", breaks = c(100, 300, 1000, 3.e3, 1.e4, 3.e4, 1.e5, 3.e5)) +
  geom_vline(xintercept = median(cars$price), colour = 'red', size = 1) +
  geom_vline(xintercept = mean(cars$price), colour = 'green', size = 1) 

#Power Histogram
ggplot(data = cars, aes(x = powerPS)) +
  geom_histogram(binwidth = 5, fill = 'deepskyblue4') +
  scale_x_continuous(breaks = seq(0, 350, 20)) +
  geom_vline(xintercept = mean(cars$powerPS), size = 1, colour = 'red') +
  geom_vline(xintercept = median(cars$powerPS), size = 1, colour = 'green')

#Year of Registration
ggplot(data = cars,aes(x = yearOfRegistration)) +
  geom_histogram(binwidth = 1, fill = 'deepskyblue4') +
  scale_x_continuous(breaks = seq(1981, 2016, 5)) +
  geom_vline(xintercept = mean(cars$yearOfRegistration),size = 1, colour = 'red') +
  geom_vline(xintercept = median(cars$yearOfRegistration),size = 1, colour = 'green')

ggplot(data = cars,aes(x = kilometer)) +
  geom_histogram(binwidth = 5000, fill = 'deepskyblue4') +
  scale_x_continuous(breaks = seq(5000, 150000, 15000)) +
  geom_vline(xintercept = mean(cars$kilometer),size = 1, colour = 'red') +
  geom_vline(xintercept = median(cars$kilometer),size = 1, colour = 'green')

ggplot(data = cars, aes(x = daysonMarket)) +
  geom_histogram(binwidth = 5, fill = 'deepskyblue4') +
  geom_vline(xintercept = mean(cars$daysonMarket), colour = 'red', size = 1) +
  geom_vline(xintercept = median(cars$daysonMarket), colour = 'green', size = 1)

cars %>% count(brand) %>% top_n(10) %>% arrange(-n,brand) %>% ggplot(aes(x = reorder(brand,n), y = n)) +
  geom_bar(stat = 'identity', fill = 'red',color='black') + coord_flip()

#Price vs Engine Facet by vehicle
ggplot(data = cars, aes(x = powerPS, y = log(price))) + 
  geom_point(fill = "deepskyblue4",shape = 21, alpha = .15, size = 1, position = 'jitter') +
  geom_smooth() + facet_wrap(~vehicleType)

#Price vs Mileage
ggplot(data = cars,aes(x = kilometer, y = log(price))) +
  geom_point(colour = 'deepskyblue4', shape = '.', alpha = .25, position = 'jitter') +
  geom_smooth()

#Price vs Vehicle
ggplot(cars,aes(x=vehicleType, y=price)) +
  geom_boxplot(aes(fill = vehicleType)) + stat_summary(fun.y = mean, geom="point", size=2)

#Price vs Fuel
ggplot(cars,aes(x=fuelType, y=price)) +
  geom_boxplot(aes(fill = fuelType)) + stat_summary(fun.y = mean, geom="point", size=2)

#Price vs FuelType
ggplot(data = subset(cars, fuelType %in% c("benzin", "diesel")),
       aes(x = powerPS, y = log(price), fill = fuelType)) +
  geom_point(shape = 21, alpha = .15, size = 1, position = 'jitter') +
  scale_fill_brewer(type = 'qual',guide = guide_legend(reverse = TRUE)) +
  geom_smooth(aes(colour = fuelType))

#price vs fuel & gears
ggplot(data = subset(cars, fuelType %in% c("benzin", "diesel","lpg")),
  aes(x = powerPS, y = log(price))) +
  geom_point(shape = 21, fill = 'deepskyblue1', alpha = .05, position = 'jitter') +
  geom_smooth() + facet_grid(gearbox ~ fuelType)

#Price vs Age of car
ggplot(data = cars,aes(x = yearOfRegistration, y = price)) +
  geom_point(fill = 'deepskyblue4', shape = 21, alpha = .05, 
             position = 'jitter') + 
  coord_trans(x = 'identity', y = 'log10', limx = c(1980, 2016)) + 
  geom_smooth()

#Price vs VehicleType with gears
ggplot(aes(x= vehicleType, y=price), data=cars) + 
  geom_boxplot(aes(fill=gearbox), color='black')

#Price vs Age of vehicle
ggplot(aes(x=as.integer(age), y=price, alpha = 1/100), data = cars) + 
  geom_point() +
  facet_wrap(~vehicleType) +
  geom_smooth() +
  labs(x= "Age of Vehicle", y= "Price", 
       title= "Price vs. Age by Vehicle Type")

#linear regression - base
cars_num <- cars[,c('kilometer','price','age','powerPS','daysonMarket')]
cars_num$age <- as.integer(cars_num$age)
corrplot(cor(cars_num),method='number')

#Dividing the data
set.seed(13579)
trainIndex <- createDataPartition(cars$price, p = 0.70, list = FALSE)
train <- cars[trainIndex, ]
test <- cars[-trainIndex, ]

#Linear Regression
fit <- lm(price ~ kilometer + as.numeric(age) + powerPS +
            daysonMarket   + gearbox + 
            vehicleType + fuelType, data=train)
summary(fit)
## 
## Call:
## lm(formula = price ~ kilometer + as.numeric(age) + powerPS + 
##     daysonMarket + gearbox + vehicleType + fuelType, data = train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -24293  -2123   -300   1600  36488 
## 
## Coefficients:
##                         Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)            1.018e+04  8.226e+02   12.376  < 2e-16 ***
## kilometer             -5.387e-02  2.844e-04 -189.440  < 2e-16 ***
## as.numeric(age)       -4.059e+02  2.091e+00 -194.114  < 2e-16 ***
## powerPS                5.150e+01  2.275e-01  226.372  < 2e-16 ***
## daysonMarket           4.299e+01  1.023e+00   42.012  < 2e-16 ***
## gearboxmanuell        -7.302e+02  2.360e+01  -30.934  < 2e-16 ***
## vehicleTypebus         3.380e+02  1.108e+02    3.052  0.00228 ** 
## vehicleTypecabrio      1.890e+03  1.128e+02   16.752  < 2e-16 ***
## vehicleTypecoupe       1.090e+03  1.149e+02    9.487  < 2e-16 ***
## vehicleTypekleinwagen -5.482e+02  1.100e+02   -4.984 6.24e-07 ***
## vehicleTypekombi      -7.543e+02  1.092e+02   -6.910 4.85e-12 ***
## vehicleTypelimousine  -1.320e+01  1.087e+02   -0.121  0.90335    
## vehicleTypesuv         1.798e+03  1.151e+02   15.618  < 2e-16 ***
## fuelTypebenzin         6.496e+02  8.146e+02    0.797  0.42523    
## fuelTypecng            6.139e+02  8.410e+02    0.730  0.46541    
## fuelTypediesel         2.482e+03  8.147e+02    3.046  0.00232 ** 
## fuelTypeelektro        1.995e+03  1.226e+03    1.627  0.10373    
## fuelTypehybrid         2.323e+03  8.683e+02    2.676  0.00746 ** 
## fuelTypelpg           -4.906e+02  8.175e+02   -0.600  0.54841    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3550 on 162103 degrees of freedom
## Multiple R-squared:  0.6903, Adjusted R-squared:  0.6903 
## F-statistic: 2.007e+04 on 18 and 162103 DF,  p-value: < 2.2e-16
plot(fit)