```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
# Loading libraries for analysis
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.4.2
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
##
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(forecast)
## Warning: package 'forecast' was built under R version 4.4.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Load the Ames housing data.
# The file location can be overridden with the AMES_CSV environment variable so
# the analysis is not tied to one machine; otherwise the original path is used.
ames_path <- Sys.getenv("AMES_CSV", unset = "D:/Stats for DS/ames.csv")
if (!file.exists(ames_path)) {
  # Fail early with the resolved path rather than deep inside read.csv()
  stop("Ames data file not found: ", ames_path, call. = FALSE)
}
ames <- read.csv(ames_path, header = TRUE)
# Build a first-of-the-month sale date from the year and month columns
ames <- mutate(
  ames,
  Sale.Date = as.Date(paste(Yr.Sold, Mo.Sold, "01", sep = "-"))
)
# Keep only the sale date and price, ordered chronologically
time_series_data <- arrange(
  select(ames, Sale.Date, SalePrice),
  Sale.Date
)
# Mean sale price for each sale month (missing prices are ignored)
monthly_data <- summarise(
  group_by(time_series_data, Sale.Date),
  Average_SalePrice = mean(SalePrice, na.rm = TRUE)
)
# Convert to a tsibble for time series analysis.
# With a Date index tsibble infers a daily interval ([1D]) even though there is
# one row per month, so the series is treated as daily data with gaps.
# Indexing on a yearmonth value gives the intended monthly interval ([1M]).
# Sale.Date is kept as a Date column for the plots and regression that use it.
monthly_tsibble <- monthly_data %>%
  mutate(Sale.Month = yearmonth(Sale.Date)) %>%
  as_tsibble(index = Sale.Month)

# Inspect the tsibble (the header should now report a [1M] interval)
monthly_tsibble
## # A tsibble: 55 x 2 [1D]
## Sale.Date Average_SalePrice
## <date> <dbl>
## 1 2006-01-01 202997.
## 2 2006-02-01 188865.
## 3 2006-03-01 182009.
## 4 2006-04-01 158864.
## 5 2006-05-01 169166.
## 6 2006-06-01 174234.
## 7 2006-07-01 178638.
## 8 2006-08-01 202379.
## 9 2006-09-01 219365.
## 10 2006-10-01 165379.
## # ℹ 45 more rows
# Line chart of the monthly average sale price
ggplot(monthly_tsibble, aes(Sale.Date, Average_SalePrice)) +
  geom_line(colour = "blue") +
  ggtitle("Average Sale Price Over Time") +
  xlab("Date") +
  ylab("Average Sale Price") +
  theme_minimal()
What stands out immediately?
# Fit a straight-line trend of average price against sale date.
# Sale.Date is a Date, so the slope is the change in average price per day.
lm_trend <- lm(
  formula = Average_SalePrice ~ Sale.Date,
  data = monthly_tsibble
)

# Print coefficient estimates, significance and fit statistics
summary(lm_trend)
##
## Call:
## lm(formula = Average_SalePrice ~ Sale.Date, data = monthly_tsibble)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29732 -8242 597 6995 32633
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 317156.227 54315.001 5.839 3.26e-07 ***
## Sale.Date -9.739 3.886 -2.506 0.0153 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13930 on 53 degrees of freedom
## Multiple R-squared: 0.106, Adjusted R-squared: 0.0891
## F-statistic: 6.282 on 1 and 53 DF, p-value: 0.0153
# Overlay the fitted linear trend (without a confidence band) on the series
ggplot(monthly_tsibble, aes(Sale.Date, Average_SalePrice)) +
  geom_line(colour = "blue") +
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  ggtitle("Trend Detection in Sale Prices") +
  xlab("Date") +
  ylab("Average Sale Price") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Do you need to subset the data for multiple trends?
How strong are the trends?
# Smooth with a centred 12-month moving average.
# A plain 12-term filter has even length, so stats::filter() cannot centre it
# and puts more of the window forward in time than backward. The 2x12 weights
# below (half weight on the two end months) are symmetric, so each smoothed
# value is centred on its own month and every calendar month gets equal weight.
# This averages out a yearly cycle: the smoothed line shows the trend, and
# seasonality appears as the raw series moving around that line.
monthly_tsibble <- monthly_tsibble %>%
  mutate(
    # as.numeric() drops the ts class that stats::filter() returns
    Smoothed = as.numeric(
      stats::filter(
        Average_SalePrice,
        filter = c(0.5, rep(1, 11), 0.5) / 12,
        sides = 2
      )
    )
  )
# Raw series (solid blue) against the smoothed series (dashed red)
ggplot(monthly_tsibble, aes(x = Sale.Date)) +
  geom_line(aes(y = Average_SalePrice), colour = "blue") +
  geom_line(aes(y = Smoothed), colour = "red", linetype = "dashed") +
  ggtitle("Seasonality in Sale Prices") +
  xlab("Date") +
  ylab("Average Sale Price") +
  theme_minimal()
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Autocorrelation of the monthly averages for lags up to 24 months
forecast::ggAcf(monthly_tsibble$Average_SalePrice, lag.max = 24) +
  labs(title = "ACF of Sale Prices")

# Partial autocorrelation over the same range of lags
forecast::ggPacf(monthly_tsibble$Average_SalePrice, lag.max = 24) +
  labs(title = "PACF of Sale Prices")
Can you illustrate the seasonality using ACF or PACF?
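There is a weak but statistically significant downward trend in average sale prices: the fitted slope is about -9.74 per day (roughly -3,550 per year; p = 0.015), but the linear model explains only about 11% of the variation (R-squared = 0.106).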
Seasonal patterns are evident, suggesting external influences like market cycles or seasonal demand in the housing market.