##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'xts' was built under R version 4.3.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.2
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
## Warning: package 'tsibble' was built under R version 4.3.2
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:zoo':
##
## index
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
Superstore_data=read.csv("SampleSuperstore_final.csv")
head(Superstore_data)
## Ship.Mode Segment Country City State Postal.Code
## 1 Second Class Consumer United States Henderson Kentucky 42420
## 2 Second Class Consumer United States Henderson Kentucky 42420
## 3 Second Class Corporate United States Los Angeles California 90036
## 4 Standard Class Consumer United States Fort Lauderdale Florida 33311
## 5 Standard Class Consumer United States Fort Lauderdale Florida 33311
## 6 Standard Class Consumer United States Los Angeles California 90032
## Region Category Sub.Category Sales Quantity Discount Profit
## 1 South Furniture Bookcases 261.9600 2 0.00 41.9136
## 2 South Furniture Chairs 731.9400 3 0.00 219.5820
## 3 West Office Supplies Labels 14.6200 2 0.00 6.8714
## 4 South Furniture Tables 957.5775 5 0.45 -383.0310
## 5 South Office Supplies Storage 22.3680 2 0.20 2.5164
## 6 West Furniture Furnishings 48.8600 7 0.00 14.1694
library(wikipediatrend)
## Warning: package 'wikipediatrend' was built under R version 4.3.2
##
## [wikipediatrend]
##
## Note:
##
## - Data before 2016-01-01
## * is provided by petermeissner.de and
## * was prepared in a project commissioned by the Hertie School of Governance (Prof. Dr. Simon Munzert)
## * and supported by the Daimler and Benz Foundation.
##
## - Data from 2016-01-01 onwards
## * is provided by the Wikipedia Foundation
## * via its pageviews package and API.
##
exercise_pageviews <- wpd_get_exact(page="Exercise", from="2006-12-01", to="2015-12-31", lang="en", warn=TRUE)
tail(exercise_pageviews)
## language article date views
## 726 en exercise 2015-12-26 0
## 727 en exercise 2015-12-27 0
## 728 en exercise 2015-12-28 0
## 729 en exercise 2015-12-29 0
## 730 en exercise 2015-12-30 0
## 731 en exercise 2015-12-31 0
ex_pgViews <- select(exercise_pageviews, date, views)
ex_pgViewsTS <- as_tsibble(ex_pgViews, index = date)
head(ex_pgViewsTS)
## # A tsibble: 6 x 2 [1D]
## date views
## <date> <int>
## 1 2007-12-10 1725
## 2 2007-12-11 1546
## 3 2007-12-12 1548
## 4 2007-12-13 1568
## 5 2007-12-14 1248
## 6 2007-12-15 803
ex_pgViewsTS %>%
ggplot() +
geom_line(mapping = aes(x = date, y = views)) +
labs(title = "Exercise Page Views on Wikipedia from 2008-2016") +
theme_bw() +
scale_x_date(breaks = "1 year", labels = \(x) year(x))
Here from the above plot can see drastic reduction of pageview/search for “exercise” over time within the range 2008 - 2016 . But overall the search about exercise is reducing. These dips maybe associated with the increase in how various technological advances happened over the years and maybe people have not been that health conscious.
Following is the plot when tech was just evolving i.e. between 2008 - 2011.
ex_pgViewsTS %>%
filter(date < as.Date('2011/01/01')) |>
ggplot() +
geom_line(mapping = aes(x = date, y = views)) +
ylim(0,2500) +
labs(title = ' 2008-2011',
subtitle = 'Pre Tech evolution',
x = 'Years',
y = 'Total views/curiosity about Exercise') +
theme_bw() +
scale_x_date(breaks = "1 year", labels = \(x) year(x))
This graph plots the window of time before Technology overtook and exercise was important and people took time to understand it better.
ex_pgViewsTS %>%
filter(date >= as.Date('2011/01/01')) |>
ggplot() +
geom_line(mapping = aes(x = date, y = views)) +
ylim(0,2500) +
labs(title = ' 2011-2016',
subtitle = 'Time where "Exercise" was known post Technology evolution',
x = 'Years',
y = 'Total views/curiosity about Exercise') +
theme_bw() +
scale_x_date(breaks = "1 year", labels = \(x) year(x))
This graph plots the window of time post Technology overtook and exercise was already known to most people and hence maybe they stoped searching for it.
ex_pgViewsTS %>%
ggplot(mapping = aes(x = date, y = views)) +
geom_line() +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Exercise Page Views on Wikipedia from 2008-2016",
x = 'Season',
y = 'Total 3 Point Attempts')
## `geom_smooth()` using formula = 'y ~ x'
As we could already tell visually there is a down trend which is linear exists. From 2007 - 2010, this trend looks pretty consistent over time, however, from 2010-2016 the drop appears to be more rapid than the previous years. Like it is seen to be reducing by certain amount each phase of the years to come.
Using Linear regression it is so easy to detect the downwards trends over the years.
pre2010 <- ex_pgViewsTS %>%
filter(date < as.Date('2010/01/01'))
post2010 <- ex_pgViewsTS %>%
filter(date >= as.Date('2010/01/01'))
ggplot() +
geom_line(mapping = aes(x = pre2010$date, y = pre2010$views)) +
geom_smooth(mapping = aes(x = pre2010$date, y = pre2010$views),method = "lm", se = FALSE) +
geom_line(mapping = aes(x = post2010$date, y = post2010$views)) +
geom_smooth(mapping = aes(x = post2010$date, y = post2010$views),method = "lm", se = FALSE) +
labs(title = "Wikipedia page view for time period 2008 - 2016",
x = 'years',
y = 'Views/Curiosity for Exercise ')
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
ex_pgViewsTS %>%
ggplot(mapping = aes(x= date, y = views)) +
geom_point(size = 1, shape = 'o') +
geom_smooth(span = 0.4, se = FALSE) +
labs(title = "Exercise page View by the year",
subtitle = "loess smoothing - span = 0.4",
x = 'years',
y = 'Total views')
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ex_pgViewsTS|>
index_by(year = floor_date(date, 'quarter')) |>
summarise(avg_views= mean(views, na.rm = TRUE)) |>
ggplot(mapping = aes(x = year, y = avg_views)) + geom_line(color="blue",size=1.5) +geom_smooth(span = 0.3, color = 'red', se=FALSE, size=1.5) +labs(title = 'Average Number of "Exercise" Wikipedia Page Views Over Time', subtitle = "by quarter year",y="Average Views",x="Year") + scale_x_date(breaks = "1 year", labels = \(x) year(x)) + theme_bw()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
acf(ex_pgViewsTS, ci = 0.95, na.action = na.exclude,main='ACF for "Exercise" Wikipedia Page Views',xlab="Lag (Weekly)")
From the analysis of the “Exercise” Wikipedia page views between January 2007 and December 2017, we could see 2 clear trends emerge. Between January 2007 and December 2010, there was a sharp decreasing trend, and between Jan 2010 and December 2017, there was a slow decreasing trend in the number of page views. It is unclear why the sharp decreasing trend switches to a mild decreasing trend during the course from 2010 to 2017.
Furthermore, the page views data seems to exhibit both weekly and annual seasonality; that is, page views fluctuate in a comparable way throughout the week. Some broad weekly trends in page views are to be expected, as the weekly seasonality may be explained by people’s tendency to spend more time on the weekends and less time during the week searching the internet for information. The yearly seasonality may be explained by the fact that, as a result of their New Year’s Resolutions, which may include starting an exercise regimen, people tend to be more health-conscious at the start of the year.