# Install any packages this analysis needs that are not yet available.
# The CRAN mirror is set once here; install.packages() below picks it up
# from the "repos" option.
options(repos = c(CRAN = "https://cran.r-project.org"))

# Packages are checked in the same order as they are loaded further down.
required_packages <- c("pageviews", "tsibble", "ggplot2", "forecast")

# requireNamespace() returns FALSE (instead of erroring) when a package
# cannot be loaded, so only the missing ones are installed.
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Load the libraries
library(pageviews)
## Warning: package 'pageviews' was built under R version 4.4.2
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.4.2
## 
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(ggplot2)
library(forecast)
## Warning: package 'forecast' was built under R version 4.4.2
# Download the daily view counts of the "Education" article on English
# Wikipedia, from 2023-01-01 through 2024-11-01.
education_pageviews <- article_pageviews(
  article = "Education",
  project = "en.wikipedia",
  granularity = "daily",
  start = as.Date("2023-01-01"),
  end = as.Date("2024-11-01")
)

# Preview the leading rows of the result
head(education_pageviews)
##     project language   article     access      agent granularity       date
## 1 wikipedia       en Education all-access all-agents       daily 2023-01-01
## 2 wikipedia       en Education all-access all-agents       daily 2023-01-02
## 3 wikipedia       en Education all-access all-agents       daily 2023-01-03
## 4 wikipedia       en Education all-access all-agents       daily 2023-01-04
## 5 wikipedia       en Education all-access all-agents       daily 2023-01-05
## 6 wikipedia       en Education all-access all-agents       daily 2023-01-06
##   views
## 1  6581
## 2  8176
## 3  7799
## 4  7688
## 5  7910
## 6  6982
# Coerce the `date` column to class Date, then inspect the column types
education_pageviews[["date"]] <- as.Date(education_pageviews[["date"]])
str(education_pageviews)
## 'data.frame':    671 obs. of  8 variables:
##  $ project    : chr  "wikipedia" "wikipedia" "wikipedia" "wikipedia" ...
##  $ language   : chr  "en" "en" "en" "en" ...
##  $ article    : chr  "Education" "Education" "Education" "Education" ...
##  $ access     : chr  "all-access" "all-access" "all-access" "all-access" ...
##  $ agent      : chr  "all-agents" "all-agents" "all-agents" "all-agents" ...
##  $ granularity: chr  "daily" "daily" "daily" "daily" ...
##  $ date       : Date, format: "2023-01-01" "2023-01-02" ...
##  $ views      : num  6581 8176 7799 7688 7910 ...
# Build a tsibble from the data frame, using `date` as the time index
education_tsibble <- as_tsibble(education_pageviews, index = date)

# Line chart of the daily pageviews across the whole period
ggplot(data = education_pageviews, mapping = aes(x = date, y = views)) +
  geom_line(colour = "blue") +
  labs(
    x = "Date",
    y = "Pageviews",
    title = "Wikipedia Pageviews for 'Education'"
  ) +
  theme_minimal()

# Regress daily views on the date to estimate a straight-line trend,
# then print the coefficient table and fit statistics.
linear_trend <- lm(formula = views ~ date, data = education_tsibble)
summary(linear_trend)
## 
## Call:
## lm(formula = views ~ date, data = education_tsibble)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9963.2 -1641.6  -311.1  1990.4  9012.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.371e+05  1.168e+04   28.86   <2e-16 ***
## date        -1.665e+01  5.931e-01  -28.07   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2976 on 669 degrees of freedom
## Multiple R-squared:  0.5408, Adjusted R-squared:  0.5401 
## F-statistic: 787.8 on 1 and 669 DF,  p-value: < 2.2e-16
# Daily pageviews (blue) with a fitted linear trend line (red) on top
ggplot(data = education_tsibble, mapping = aes(x = date, y = views)) +
  geom_line(colour = "blue") +
  geom_smooth(colour = "red", se = FALSE, method = "lm") +
  labs(
    x = "Date",
    y = "Pageviews",
    title = "Trend in Wikipedia Pageviews for 'Education'"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Detecting Seasonality Using Smoothing and ACF/PACF:

# Overlay a LOESS smoother on the daily series.
# The data is passed straight to ggplot(), as in the other plots, so this
# block does not depend on an attached package happening to export `%>%`.
ggplot(education_tsibble, aes(x = date, y = views)) +
  geom_line() +
  # `span` is only used by the loess smoother. With the method left on its
  # default, geom_smooth() switches to a GAM for 1,000 or more observations
  # and `span` is then ignored, so the method and formula are set explicitly.
  geom_smooth(
    method = "loess",
    formula = y ~ x,
    span = 0.2,
    color = "green"
  ) +
  labs(title = "Smoothed Trend in Wikipedia Pageviews for 'Education'",
       x = "Date",
       y = "Pageviews") +
  theme_minimal()

# ACF and PACF to detect seasonality
# Autocorrelation of the daily view counts, using acf()'s default lag range.
# NOTE(review): for daily data, recurring peaks at multiples of 7 lags would
# point to a weekly cycle; confirm against the plot.
acf(education_tsibble$views, main = "ACF of 'Education' Pageviews")

# ACF Observations:

# Partial autocorrelation of the daily view counts, using pacf()'s default
# lag range.
pacf(education_tsibble$views, main = "PACF of 'Education' Pageviews")

# PACF Observations: