# Load the necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(xts)
## Warning: package 'xts' was built under R version 4.4.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.2
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or      #
## # source() into this session won't work correctly.                           #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add#
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop          #
## # dplyr from breaking base R's lag() function.                               #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism#
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## #                                                                             #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
library(tsibble)
## Registered S3 method overwritten by 'tsibble':
##   method               from
##   as_tibble.grouped_df dplyr
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:zoo':
##
## index
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(boot)
library(broom)
library(lindia)
# Load the dataset
df_ufo <- read.csv('./ufo-web-page-views.csv')
head(df_ufo)
# Ensure the date column is parsed as class Date
df_ufo <- df_ufo |>
  mutate(date = as.Date(date, format = "%m/%d/%Y"))
head(df_ufo)
str(df_ufo)
## 'data.frame':    3319 obs. of  2 variables:
##  $ date     : Date, format: "2015-10-01" "2015-10-02" ...
##  $ ufo_views: int  1317 1459 1295 1484 1621 2147 1401 1586 1786 1739 ...
# Filter out rows where the date column is NA
df_ufo <- df_ufo |>
  filter(!is.na(date))
# Create the UFO time series as a tsibble
ufo_tsibble <- df_ufo |>
  as_tsibble(index = date)
head(ufo_tsibble)
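Before plotting, it is worth confirming that the daily index is regular, i.e. that no calendar days are missing. A minimal sketch using tsibble's gap helpers, assuming the ufo_tsibble built above:
# Check whether any calendar days are missing from the daily index
has_gaps(ufo_tsibble)
# If any gaps are reported, they can be listed and filled with explicit NAs:
# scan_gaps(ufo_tsibble)
# ufo_tsibble <- fill_gaps(ufo_tsibble)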
# Plot the time series of the UFO data set
ggplot(data = ufo_tsibble) +
  geom_line(mapping = aes(x = date, y = ufo_views), linewidth = 0.8, color = "blue") +
  labs(title = "UFO Time Series - Wikipedia",
       x = "Date",
       y = "UFO Page Views") +
  theme_hc()
From the plot above we notice a few prominent spikes, one of them around 2017-2018. These spikes could correspond to specific events, such as UFO-related news stories, that drive up Wikipedia page views on the topic. We can zoom into this period by plotting the time series for that specific window:
ufo_tsibble |>
  filter(date >= as.Date("2017-01-01") & date <= as.Date("2018-12-31")) |>
  ggplot(aes(x = date, y = ufo_views)) +
  geom_line(color = "green", linewidth = 0.5) +
  labs(
    title = "UFO Time Series 2017-2018",
    x = "Date",
    y = "UFO views"
  ) +
  theme_minimal()
From the plot above we can narrow down when the most prominent spike occurred: it falls around July 2017. The neighboring time frames seem to show comparatively little UFO page-view activity.
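To pin down the exact day of the spike, one option is dplyr's slice_max(); a quick sketch over the same window as the plot above:
# Find the single day with the highest page views in the 2017-2018 window
ufo_tsibble |>
  filter(date >= as.Date("2017-01-01") & date <= as.Date("2018-12-31")) |>
  as_tibble() |>   # drop the time series index so plain dplyr verbs apply
  slice_max(ufo_views, n = 1)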
# Fit a linear trend model
trend_model <- lm(ufo_views ~ date, data=ufo_tsibble)
summary(trend_model)
##
## Call:
## lm(formula = ufo_views ~ date, data = ufo_tsibble)
##
## Residuals:
##    Min     1Q Median     3Q    Max
##  -1121   -434   -260    -27  94233
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.132e+03  7.188e+02  -2.966  0.00304 **
## date         2.094e-01  3.908e-02   5.359 8.93e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2157 on 3317 degrees of freedom
## Multiple R-squared: 0.008584, Adjusted R-squared: 0.008285
## F-statistic: 28.72 on 1 and 3317 DF, p-value: 8.935e-08
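The slope of roughly 0.21 views per day is easier to read on an annual scale. Since the Date index advances by one unit per day, a quick conversion sketch:
# Convert the fitted daily slope to an approximate change per year
coef(trend_model)["date"] * 365.25
# 0.2094 * 365.25 is roughly 76 extra daily views per year, a weak upward trend
This weak trend is consistent with the very low R-squared above: the trend explains less than 1% of the variance in page views.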
# Add the fitted linear trend to the plot
ufo_tsibble |>
  ggplot(aes(x = date, y = ufo_views)) +
  geom_line(color = "green", linewidth = 0.5) +
  geom_smooth(method = "lm", se = TRUE, color = "blue", linewidth = 0.7) +
  labs(
    title = "UFO Page Views - Linear Trend",
    x = "Date",
    y = "UFO views"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
The blue trend line above was added with the geom_smooth function.
# Smooth the series using the LO(W)ESS method
ufo_tsibble |>
  ggplot(mapping = aes(x = date, y = ufo_views)) +
  geom_point(size = 0.2, shape = 0) +
  geom_smooth(method = "loess", span = 0.2, color = "blue", se = FALSE) +
  labs(
    title = "LO(W)ESS Smoothing Method",
    x = "Date",
    y = "UFO Views") +
  theme_hc()
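The span argument controls how much of the data each local fit uses; smaller values track short-lived spikes more closely at the risk of chasing noise. A sketch contrasting two illustrative span values:
# Compare a narrow and a wide LOESS span on the same plot
ufo_tsibble |>
  ggplot(aes(x = date, y = ufo_views)) +
  geom_point(size = 0.2, alpha = 0.3) +
  geom_smooth(method = "loess", span = 0.1, se = FALSE, color = "red") +
  geom_smooth(method = "loess", span = 0.75, se = FALSE, color = "blue") +
  labs(title = "LOESS with span 0.1 (red) vs 0.75 (blue)",
       x = "Date", y = "UFO Views") +
  theme_hc()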
# Plot the ACF of the UFO page views
acf(ufo_tsibble$ufo_views, ci = 0.95, main="ACF for UFO Page Views")
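If the series had weekly seasonality, the ACF would show peaks at lags 7, 14, 21, and so on. A sketch extending the lag window (lag.max = 60 is an illustrative choice) makes such a pattern easier to spot:
# Extend the lag window to several weeks; weekly seasonality would appear
# as regularly spaced peaks every 7 lags
acf(ufo_tsibble$ufo_views, lag.max = 60, main = "ACF for UFO Page Views (60 lags)")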
During this week's Data Dive we explored time series data of Wikipedia page views for the UFO article. We built a tsibble from the raw data, plotted the series, and smoothed it using the LO(W)ESS method. Finally, we used the ACF to look for seasonality. The series turned out to be fairly stable over the whole period rather than strongly seasonal, so we did not detect any clear seasonal pattern. The trend was also quite weak, as the fitted model produced only a small slope. Notwithstanding, it was a great learning exercise in exploring time series data.