Wikipedia API

# Install pageviews from the local source archive only when the package is not
# already available. Running the install unconditionally repeats it on every
# render (the recorded run below shows it exiting with a non-zero status).
if (!requireNamespace("pageviews", quietly = TRUE)) {
  install.packages("pageviews_0.5.0.tar", repos = NULL, type = "source")
}
Installing package into 'C:/Users/msubstudent/AppData/Local/R/win-library/4.3'
(as 'lib' is unspecified)
Warning in install.packages("pageviews_0.5.0.tar", repos = NULL, type =
"source"): installation of package 'pageviews_0.5.0.tar' had non-zero exit
status
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pageviews)        # This package gets data on Wikipedia viewing
library(DT)               # DT stands for datatable, and creates interactive tables
library(infer)            # for some stats like t_test
library(devtools)
Loading required package: usethis
# Install pageviews from GitHub only when it is missing, so that rendering the
# document does not contact GitHub on every run.
# NOTE(review): this runs after library(pageviews) above, so on a machine
# without the package the script stops before reaching this line; consider
# moving installation into a one-time setup step ahead of the library() calls.
if (!requireNamespace("pageviews", quietly = TRUE)) {
  devtools::install_github("ironholds/pageviews")
}
Skipping install of 'pageviews' from a github remote, the SHA1 (d32c629c) has not changed since last install.
  Use `force = TRUE` to force installation
# Page views of the "Gun control" article, requested for
# 2017-01-01 through 2023-12-31.
gun_control <- article_pageviews(
  article = "Gun control",
  start = as.Date("2017-1-1"),
  end = as.Date("2023-12-31")
)

# Column-by-column overview of what was downloaded.
gun_control |>
  glimpse()
Rows: 2,556
Columns: 8
$ project     <chr> "wikipedia", "wikipedia", "wikipedia", "wikipedia", "wikip…
$ language    <chr> "en", "en", "en", "en", "en", "en", "en", "en", "en", "en"…
$ article     <chr> "Gun_control", "Gun_control", "Gun_control", "Gun_control"…
$ access      <chr> "all-access", "all-access", "all-access", "all-access", "a…
$ agent       <chr> "all-agents", "all-agents", "all-agents", "all-agents", "a…
$ granularity <chr> "daily", "daily", "daily", "daily", "daily", "daily", "dai…
$ date        <dttm> 2017-01-01, 2017-01-02, 2017-01-03, 2017-01-04, 2017-01-0…
$ views       <dbl> 285, 393, 434, 544, 635, 620, 431, 464, 610, 595, 686, 688…
# Top-article rankings requested for 2017-10-02 and 2017-04-19.
topLV <- top_articles(
  start = as.Date("2017-10-2")
)
topM <- top_articles(
  start = as.Date("2017-4-19")
)

# Page views of the "2017 Las Vegas shooting" article over the same
# 2017-2023 request window used for the other articles.
Las_Vegas_shooting <- article_pageviews(
  article = "2017 Las Vegas shooting",
  start = as.Date("2017-1-1"),
  end = as.Date("2023-12-31")
)

Las_Vegas_shooting |>
  glimpse()
Rows: 2,282
Columns: 8
$ project     <chr> "wikipedia", "wikipedia", "wikipedia", "wikipedia", "wikip…
$ language    <chr> "en", "en", "en", "en", "en", "en", "en", "en", "en", "en"…
$ article     <chr> "2017_Las_Vegas_shooting", "2017_Las_Vegas_shooting", "201…
$ access      <chr> "all-access", "all-access", "all-access", "all-access", "a…
$ agent       <chr> "all-agents", "all-agents", "all-agents", "all-agents", "a…
$ granularity <chr> "daily", "daily", "daily", "daily", "daily", "daily", "dai…
$ date        <dttm> 2017-10-02, 2017-10-03, 2017-10-04, 2017-10-05, 2017-10-0…
$ views       <dbl> 6994, 314, 414, 233, 238, 235, 40, 22, 2844, 35329, 31019,…
# Page views of the "Fresno shootings" article, requested for
# 2017-01-01 through 2023-12-31.
Fresno_shootings <- article_pageviews(
  article = "Fresno shootings",
  start = as.Date("2017-1-1"),
  end = as.Date("2023-12-31")
)

Fresno_shootings |>
  glimpse()
Rows: 1,284
Columns: 8
$ project     <chr> "wikipedia", "wikipedia", "wikipedia", "wikipedia", "wikip…
$ language    <chr> "en", "en", "en", "en", "en", "en", "en", "en", "en", "en"…
$ article     <chr> "Fresno_shootings", "Fresno_shootings", "Fresno_shootings"…
$ access      <chr> "all-access", "all-access", "all-access", "all-access", "a…
$ agent       <chr> "all-agents", "all-agents", "all-agents", "all-agents", "a…
$ granularity <chr> "daily", "daily", "daily", "daily", "daily", "daily", "dai…
$ date        <dttm> 2017-11-03, 2017-11-04, 2017-11-05, 2017-11-06, 2017-11-0…
$ views       <dbl> 73, 33, 89, 226, 84, 73, 70, 49, 53, 40, 51, 82, 99, 56, 6…
# The ten rows of gun_control with the highest view counts.
slice_max(gun_control, views, n = 10)
     project language     article     access      agent granularity       date
1  wikipedia       en Gun_control all-access all-agents       daily 2022-05-25
2  wikipedia       en Gun_control all-access all-agents       daily 2018-02-16
3  wikipedia       en Gun_control all-access all-agents       daily 2018-02-15
4  wikipedia       en Gun_control all-access all-agents       daily 2022-05-26
5  wikipedia       en Gun_control all-access all-agents       daily 2018-02-22
6  wikipedia       en Gun_control all-access all-agents       daily 2017-10-03
7  wikipedia       en Gun_control all-access all-agents       daily 2018-02-20
8  wikipedia       en Gun_control all-access all-agents       daily 2018-02-19
9  wikipedia       en Gun_control all-access all-agents       daily 2018-02-21
10 wikipedia       en Gun_control all-access all-agents       daily 2018-02-18
   views
1   9666
2   6549
3   6117
4   5587
5   5142
6   4342
7   4233
8   4229
9   3914
10  3760
# Page views of the "Mass shootings in the United States" article, requested
# for 2017-01-01 through 2023-12-31.
Mass_shootings_in_the_United_States <- article_pageviews(
  article = "Mass shootings in the United States",
  start = as.Date("2017-1-1"),
  end = as.Date("2023-12-31")
)

Mass_shootings_in_the_United_States |>
  glimpse()
Rows: 2,554
Columns: 8
$ project     <chr> "wikipedia", "wikipedia", "wikipedia", "wikipedia", "wikip…
$ language    <chr> "en", "en", "en", "en", "en", "en", "en", "en", "en", "en"…
$ article     <chr> "Mass_shootings_in_the_United_States", "Mass_shootings_in_…
$ access      <chr> "all-access", "all-access", "all-access", "all-access", "a…
$ agent       <chr> "all-agents", "all-agents", "all-agents", "all-agents", "a…
$ granularity <chr> "daily", "daily", "daily", "daily", "daily", "daily", "dai…
$ date        <dttm> 2017-01-01, 2017-01-02, 2017-01-03, 2017-01-04, 2017-01-0…
$ views       <dbl> 13, 9, 7, 13, 21, 54, 49, 32, 16, 19, 24, 14, 9, 11, 6, 21…
# Stack the two article series into one long table; the `article` column
# tells the two series apart.
gun_Mass <- bind_rows(gun_control, Mass_shootings_in_the_United_States)

# Views over time for the mass-shootings article alone.
ggplot(Mass_shootings_in_the_United_States, aes(x = date, y = views)) +
  geom_line()

# Views over time for the gun-control article alone.
ggplot(gun_control, aes(x = date, y = views)) +
  geom_line()

# Both series on shared axes, one colour per article.
ggplot(gun_Mass, aes(x = date, y = views, color = article)) +
  geom_line()

# Widen to one views column per article, then plot one article's views against
# the other's with a fitted linear regression line.
gun_Mass |>
  pivot_wider(
    names_from = article,
    values_from = views
  ) |>
  ggplot(aes(x = Mass_shootings_in_the_United_States, y = Gun_control)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(
    title = "Relationship between Wikipedia article views",
    x = "Views of the Wikipedia Mass shootings in the United States article",
    y = "Views of the Wikipedia Gun control article"
  )
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 2 rows containing missing values (`geom_point()`).

# Page names excluded from the top-article rankings.
non_article_pages <- c("Main_Page", "Special:Search")

# Return the ten most-viewed rows of a top_articles() result, keeping only the
# `article` and `views` columns and dropping the excluded page names.
# slice_max() replaces the superseded top_n() and returns rows ordered from
# most to least viewed, which the bar chart's factor ordering relies on.
top_ten_articles <- function(top_df) {
  top_df |>
    select(article, views) |>
    filter(!article %in% non_article_pages) |>
    slice_max(views, n = 10)
}

# Horizontal bar chart of the ten most-viewed articles in `top_df`, with the
# most-viewed article at the top; `title` is used as the plot title.
plot_top_articles <- function(top_df, title) {
  top_ten_articles(top_df) |>
    # as_factor() keeps row order; fct_rev() + coord_flip() put row 1 on top.
    ggplot(aes(x = fct_rev(as_factor(article)), y = views)) +
    geom_col(fill = "blue") +
    coord_flip() +
    scale_y_continuous(labels = scales::comma) +
    labs(
      y = "Number of Views",
      x = "Article",
      title = title
    )
}

# Interactive table and bar chart for the 2017-10-02 ranking.
topLV |>
  top_ten_articles() |>
  datatable()
plot_top_articles(topLV, "Top Wikipedia articles, Oct. 2, 2017")

# Interactive table and bar chart for the 2017-04-19 ranking.
topM |>
  top_ten_articles() |>
  datatable()
plot_top_articles(topM, "Top Wikipedia articles, Apr. 19, 2017")

# Views of the Gun_control article from 2017-09-17 through 2017-10-15.
# NOTE: this overwrites the earlier contents of `Las_Vegas_shooting`; from here
# on the object holds Gun_control views, not views of the shooting's article.
Las_Vegas_shooting <- article_pageviews(
  article = "Gun_control",
  start = as.Date("2017-9-17"),
  end = as.Date("2017-10-15")
)

# Label each row with its offset in days (-14 to 14) and with the event name.
# The 29-value sequence must match the number of rows returned.
Las_Vegas_shooting <- mutate(
  Las_Vegas_shooting,
  day = -14:14,
  event = "Las Vegas shooting"
)

# Views against day offset for this event window.
ggplot(Las_Vegas_shooting, aes(x = day, y = views)) +
  geom_line()

# Views of the Gun_control article from 2017-04-04 through 2017-05-02.
# NOTE: this overwrites the earlier contents of `Fresno_shootings`; from here
# on the object holds Gun_control views, not views of the shootings' article.
Fresno_shootings <- article_pageviews(
  article = "Gun_control",
  start = as.Date("2017-4-4"),
  end = as.Date("2017-5-2")
)

# Label each row with its offset in days (-14 to 14) and with the event name.
# The 29-value sequence must match the number of rows returned.
Fresno_shootings <- mutate(
  Fresno_shootings,
  day = -14:14,
  event = "Fresno shootings"
)

# Views against day offset for this event window.
ggplot(Fresno_shootings, aes(x = day, y = views)) +
  geom_line()

# Combine the two event windows; `event` identifies the source window.
shootings <- bind_rows(Las_Vegas_shooting, Fresno_shootings)

# Overlay both windows on a common day-offset axis, one colour per event.
ggplot(shootings, aes(x = day, y = views, color = event)) +
  geom_line() +
  labs(
    title = "Views of the Wikipedia Gun Control Article before and after Two Mass Shootings",
    x = "Days before/after Shooting",
    y = "Wikipedia Views",
    color = "Event"
  ) +
  theme_minimal()

# Two-sample t-test comparing views after the events with views before them.
# `after_event` is TRUE for day offsets 1..14 and FALSE for -14..0, so the
# day-0 rows are counted in the "before" group.
# `order` is given explicitly so the estimate is mean(TRUE) - mean(FALSE)
# (after minus before); this is the order infer otherwise assumes with a
# warning.
shootings |>
  mutate(after_event = day > 0) |>
  t_test(views ~ after_event, order = c("TRUE", "FALSE"))
Warning: The statistic is based on a difference or ratio; by default, for
difference-based statistics, the explanatory variable is subtracted in the
order "TRUE" - "FALSE", or divided in the order "TRUE" / "FALSE" for
ratio-based statistics. To specify this order yourself, supply `order =
c("TRUE", "FALSE")`.
# A tibble: 1 × 7
  statistic  t_df  p_value alternative estimate lower_ci upper_ci
      <dbl> <dbl>    <dbl> <chr>          <dbl>    <dbl>    <dbl>
1      3.84  27.6 0.000651 two.sided       778.     363.    1193.
# Mean, standard deviation and row count of views for each side of the
# events (after_event is TRUE for day offsets above zero, FALSE otherwise).
shootings |>
  mutate(after_event = day > 0) |>
  group_by(after_event) |>
  summarise(
    Mean = mean(views),
    StdDev = sd(views),
    N = n()
  )
# A tibble: 2 × 4
  after_event  Mean StdDev     N
  <lgl>       <dbl>  <dbl> <int>
1 FALSE        459.   119.    30
2 TRUE        1237.  1065.    28

The average number of daily views of the Wikipedia Gun Control article in the 14 days before and the day of the two shootings (M = 458.7, SD = 118.57) was significantly different from the average number of daily views in the 14 days after the shootings (M = 1236.7, SD = 1065.25), t(27.6) = 3.84, p < .001.