library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.4.2
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:zoo':
##
## index
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
theme_set(theme_minimal())
options(scipen = 6)
df <- read.csv("C:/Users/toyha/Downloads/vehicle/car details v4.csv")
#converting non-american stuff to american stuff
df <- df |> mutate(years_since = year(now()) - Year) |> mutate(PriceUSD = Price * 0.012) |> mutate(Miles = Kilometer * 0.621371) |> mutate(LengthInch = Length * 0.0393701) |> mutate(WidthInch = Width * 0.0393701) |> mutate(HeightInch = Height * 0.0393701) |> mutate(FuelGallons = Fuel.Tank.Capacity * 0.264172) |> mutate(Volume = LengthInch * WidthInch * HeightInch)
#Cleaning up Owner attribute
df['Owner'][df['Owner'] == 'Fourth'] <- '4 or More'
df$Engine <- sub(x = df$Engine, pattern = " cc", replacement = "")
df$Engine <- strtoi(df$Engine)
The model year seems like it could be decent to work with. While it would be better to work with the date of a vehicle’s sale, I think looking at trends for a certain year of manufacture may be worth looking at, as the prices of cars with certain model years may vary.
An earlier attempt had problems with using years that occured more than once as the key, so I decided to use the mean price of each year as a starting point. I also removed both years before 2000 since the gaps were too big with what I wanted to do.
df_mean <- aggregate(x= df$PriceUSD,
# Specify group indicator
by = list(df$Year),
# Specify function (i.e. mean)
FUN = mean)
colnames(df_mean)[colnames(df_mean) == 'Group.1'] <- 'Year'
colnames(df_mean)[colnames(df_mean) == 'x'] <- 'MeanPrice'
df_mean$Year <- lubridate::ymd(df_mean$Year, truncated = 2L)
df_mean <- df_mean[-c(1:2), ]
print(df_mean)
## Year MeanPrice
## 3 2000-01-01 14400.000
## 4 2002-01-01 1200.000
## 5 2004-01-01 852.012
## 6 2006-01-01 2640.000
## 7 2007-01-01 8865.998
## 8 2008-01-01 2568.000
## 9 2009-01-01 3756.364
## 10 2010-01-01 6096.444
## 11 2011-01-01 11756.506
## 12 2012-01-01 11845.955
## 13 2013-01-01 8124.187
## 14 2014-01-01 12435.750
## 15 2015-01-01 13613.527
## 16 2016-01-01 17165.326
## 17 2017-01-01 18573.893
## 18 2018-01-01 23628.492
## 19 2019-01-01 23590.513
## 20 2020-01-01 34521.181
## 21 2021-01-01 39131.538
## 22 2022-01-01 44214.370
#TODO: Create tsibble and plot
year_ts <- as_tsibble(df_mean, index=Year) |>
index_by(date = date(Year))
#print(year_ts)
year_ts |> ggplot() + geom_line(mapping = aes(x=date, y=MeanPrice)) + theme_hc()
As seen in previous weeks, there’s a very low number of data points earlier in the chart, but this line’s shape may indicate an upward curve in mean price starting from the 2000’s. I noticed several spikes in the cart that may indicate seasonality, but I’ll have to use LOESS to be sure.
#TODO: Create linear regression
model <- lm(MeanPrice ~ Year, data = year_ts)
summary(model)
##
## Call:
## lm(formula = MeanPrice ~ Year, data = year_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8109 -5344 -2315 2341 19012
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -52706.1170 11288.8849 -4.669 0.000191 ***
## Year 4.3894 0.7245 6.059 0.00001 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7406 on 18 degrees of freedom
## Multiple R-squared: 0.671, Adjusted R-squared: 0.6527
## F-statistic: 36.71 on 1 and 18 DF, p-value: 0.000009997
There is indeed an upwards trend associated with the year, but it’s very slight. The starting year not being 0 might be throwing things off just a bit, but this is still a serviceable model.
price_xts <- xts(x = df_mean$MeanPrice, order.by = df_mean$Year)
price_xts <- setNames(price_xts, "price")
year_ts |>
index_by(year = floor_date(date, 'year')) |>
ggplot(mapping = aes(x = Year, y = MeanPrice)) +
geom_line() +
geom_smooth(span = 0.3, color = 'blue', se=FALSE, ) +
labs(title = "Yearly Mean Price of Used Cars",
subtitle = "LOESS") +
scale_x_date(breaks = "1 year", labels = \(x) year(x)) +
theme_hc()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
The years in the x-axis are a bit crowded, but I can still make out a trough between 2003 and 2004, a peak at 2007, another trough at 2009, a peak at around 2011, and a very slight trough between 2013 and 2014. From there, there appears to be exponential growth that makes it difficult to assign peaks and troughs to our line, but I can very faintly make out peaks at 2015 and 2017 and troughs at 2016 and 2018. It seems from our model that there are peaks followed by troughs about 2 to 4 years apart from each other, though the upward curve at the end of the line would make it difficult to assign seasonality beyond that point.
#TODO: use ACF & PACF
year_ts |>
mutate(price_lag2 = lag(MeanPrice, 1)) |>
drop_na()
## # A tsibble: 0 x 4 [?]
## # Groups: @ date [?]
## # ℹ 4 variables: Year <date>, MeanPrice <dbl>, date <date>, price_lag2 <dbl>
#acf(year_ts, ci = 0.95)
price_xts <- xts(year_ts$MeanPrice,
order.by = year_ts$date,
frequency = 1)
price_xts <- setNames(price_xts, "price")
pacf(price_xts, na.action = na.exclude,
xlab = "Lag (yearly)", main = "PACF for Yearly Mean Price")
I had technical difficulties with the ACF function, and I was unable to calculate autocorrelation. However, I was still able to calculate partial autocorrelation, and a lag of 1 seems to be most correlated with the time periods.