# Load libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggrepel)
# `data()` returns only the *name* of the dataset (a character vector), not
# the data itself, so assigning its result would leave `data_1` holding the
# string "midwest". Load the dataset from ggplot2 explicitly, then bind the
# data frame to `data_1`.
data("midwest", package = "ggplot2")
data_1 <- midwest

# time series toolkits
library(xts)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(tsibble)
## 
## Attaching package: 'tsibble'
## 
## The following object is masked from 'package:zoo':
## 
##     index
## 
## The following object is masked from 'package:lubridate':
## 
##     interval
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(pageviews)
library(wikipediatrend)
## 
##   [wikipediatrend]
##     
##   Note:
##     
##     - Data before 2016-01-01 
##       * is provided by petermeissner.de and
##       * was prepared in a project commissioned by the Hertie School of Governance (Prof. Dr. Simon Munzert)
##       * and supported by the Daimler and Benz Foundation.
##     
##     - Data from 2016-01-01 onwards 
##       * is provided by the Wikipedia Foundation
##       * via its pageviews package and API.
## 
# Fetch Wikipedia page-view counts for three state articles with wp_trend().
# All three requests use the same date window so the series can be compared.
IL <- wp_trend(
  "Illinois",            # page requested: "Illinois"
  from = "2021-01-01",
  to = "2023-11-01"
)

IN <- wp_trend(
  "Indiana",             # page requested: "Indiana"
  from = "2021-01-01",
  to = "2023-11-01"
)

MI <- wp_trend(
  "Michigan",            # page requested: "Michigan"
  from = "2021-01-01",
  to = "2023-11-01"
)

We use page views as the response variable.

# Tag each series with its state so rows remain distinguishable once stacked.
MI$state <- "Michigan"
IN$state <- "Indiana"
IL$state <- "Illinois"

# Stack the three per-state tables (IL, IN, MI order) into one long table.
combined_data <- do.call(rbind, list(IL, IN, MI))

# One line per state: views over time, coloured by state.
ggplot(
  data = combined_data,
  mapping = aes(x = date, y = views, colour = state)
) +
  geom_line() +
  labs(
    title = "Page Views of States (IL, IN, MI) on Wikipedia",
    x = "Date",
    y = "Page Views"
  ) +
  theme_minimal()

- Michigan shows a large number of hits in late 2021, which could be related to the sports (game) season.
- Illinois shows a large spike in hits in 2022, which may be related to events happening in Chicago.
- Overall, the number of hits in ascending order is: Indiana < Michigan < Illinois.
- This ordering may be related to each state's total population.

# Total population per state: sum `poptotal` within each `state` group.
pop_data <- summarise(
  group_by(midwest, state),
  total_population = sum(poptotal)
)

# Bar chart with one bar per state; bar height is the summed population.
ggplot(pop_data, aes(x = state, y = total_population, fill = state)) +
  geom_col() +
  labs(
    title = "Total Population of Midwest States",
    x = "Midwest States",
    y = "Total Population"
  )

# The page-view counts themselves serve as the response-like variable.
hits <- wpd_get_exact(
  page = "Midwestern_United_States",
  from = "2013-01-01",
  to = "2023-01-01",
  lang = "en",
  warn = TRUE
)

# Keep only the two columns the analysis uses, then index the table by date.
hits <- hits |> select(date, views)
hits_ts <- hits |> as_tsibble(index = date)

# Plot the raw series as a line chart.
ggplot(hits_ts, aes(x = date, y = views)) +
  geom_line() +
  labs(title = '"Time Series" Page Views on Wikipedia') +
  theme_minimal()

# Raw views (light blue) overlaid with a LOESS smoother (purple) to expose the
# long-run trend; the x axis is labelled once per year.
hits_ts %>%
  ggplot(aes(x = date, y = views)) +
  geom_line(color = "lightblue") +
  # `linewidth` replaces the `size` aesthetic for lines, which was deprecated
  # in ggplot2 3.4.0.
  geom_smooth(
    method = "loess",
    color = "purple",
    se = FALSE,
    linewidth = 1
  ) +
  ylim(0, 13000) +
  labs(
    title = 'Trend of "Midwest" Page Views on Wikipedia',
    x = "Date",
    y = "Page Views"
  ) +
  theme_minimal() +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y")
## `geom_smooth()` using formula = 'y ~ x'

# Fit a straight-line trend: views regressed on date.
trend_model <- lm(formula = views ~ date, data = hits_ts)
# Report the coefficients, residual spread, and fit statistics.
summary(object = trend_model)
## 
## Call:
## lm(formula = views ~ date, data = hits_ts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2182.3  -509.0   115.0   519.2  5671.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.087e+04  1.469e+03   21.02   <2e-16 ***
## date        -1.793e+00  9.035e-02  -19.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 945.1 on 1093 degrees of freedom
## Multiple R-squared:  0.2648, Adjusted R-squared:  0.2642 
## F-statistic: 393.8 on 1 and 1093 DF,  p-value: < 2.2e-16

The date coefficient is negative, indicating a downward trend: with each additional day, page views decrease by about 1.79 on average (estimate: -1.793e+00). We should also consider that changes in overall user behavior, such as a shift to alternative sources of information rather than relying solely on Wikipedia, could affect page views.

# Autocorrelation function of the page-view series with 95% confidence bounds.
acf(x = hits_ts, na.action = na.exclude, ci = 0.95)

- The vertical bars in the ACF plot represent the correlation between the time series and its lagged versions.
- A bar at lag k shows the correlation between the series at time t and the series at time t-k.
- The dashed horizontal lines represent confidence intervals for the correlation values.
- Recurring peaks at regular intervals suggest seasonality or periodic behavior.
- Consecutive values appear to follow one another fairly closely, suggesting that an autoregressive model could be appropriate.
- Here we notice a significant spike at lag 1 and much lower spikes at subsequent lags. Thus, an AR(1) model would likely be feasible for this data set.

# Convert the series to an xts object ordered by date.
hits_xts <- xts(
  hits_ts$views,
  order.by = hits_ts$date,
  # 7 matches the weekly lag label used in the PACF plot below; the previous
  # value of 12 contradicted the stated intent to keep this at 7.
  # NOTE(review): assumes one observation per day -- confirm.
  frequency = 7
)
hits_xts <- setNames(hits_xts, "views")

# Partial autocorrelation function of the page-view series.
pacf(hits_xts, na.action = na.exclude,
     xlab = "Lag (Weekly)", main = "PACF for Wikipedia Hits")

- As in ACF plots, the blue dashed horizontal lines in a PACF plot represent confidence intervals.
- Points outside these bounds may indicate statistically significant partial correlations.
- Significant spikes at specific lags indicate direct relationships between observations separated by those lags.
- Non-significant spikes may represent correlations that are already explained by shorter lags.