library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(ggrepel)
library(xts)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(tsibble)
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## 
## Attaching package: 'tsibble'
## The following object is masked from 'package:zoo':
## 
##     index
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:tsibble':
## 
##     interval
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
df <- read.csv('~/Downloads/pageviews-20220222-20241118.csv')
summary(df)
##      Date                BMI        
##  Length:1001        Min.   : 25.00  
##  Class :character   1st Qu.: 53.00  
##  Mode  :character   Median : 61.00  
##                     Mean   : 61.62  
##                     3rd Qu.: 69.00  
##                     Max.   :124.00  
##                     NA's   :1
which(is.na(df),arr.ind = TRUE)
##       row col
## [1,] 1001   2
df <- df[-1001,]

Choose a column of data to analyze over time. This should be a “response-like” variable that is of particular interest.

Create a tsibble object of just the date and response variable. Then, plot your data over time. Consider different windows of time.

df$Date <- as.Date(df$Date)
df_ts <- df |> as_tsibble(index = Date)
df_xts <- xts(x = df_ts$BMI, order.by = df_ts$Date)
df_xts <- setNames(df_xts, "BMI")

Visualizing BMI page views over time

df_ts |> ggplot(mapping = aes(x = Date, y = BMI)) + geom_line() +
  geom_smooth(method = lm, se = FALSE, color = 'red')+ theme_classic()
## `geom_smooth()` using formula = 'y ~ x'

  • From above subsets, the trend slope is slightly different but the overall trend is downwards (negative slope)

Considering different time windows

  • For time period of one week (7 days)
df_ts7 <- df_ts |> 
  mutate(W = floor_date(Date, "week"))


df_ts7_D <- df_ts7 |>
  select(-Date) |>
  group_by(W) |>
  summarize(Weekly_BMI = sum(BMI))


df_ts7_D |> 
  ggplot(mapping = aes(x = W, y = Weekly_BMI)) +
  geom_line() +
  geom_smooth(method = lm, se = FALSE, color = 'red') +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'

- For time period of one month (~30 days)

df_ts30 <- df_ts |> 
  mutate(Month = floor_date(Date, "month")) |>
  group_by(Month) |> 
  summarize(Month_BMI = sum(BMI), .groups = 'drop')

df_ts30 |> ggplot(mapping = aes(x = Month, y = Month_BMI)) + geom_line() +
  geom_smooth(method = lm, se = FALSE, color = 'red')+ theme_classic()
## `geom_smooth()` using formula = 'y ~ x'

What stands out immediately?

Use linear regression to detect any upwards or downwards trends.

df_ts |> ggplot(mapping = aes(x = Date, y = BMI)) + geom_line() +
  geom_smooth(method = lm, se = FALSE, color = 'red')+ theme_classic()
## `geom_smooth()` using formula = 'y ~ x'

Use smoothing to detect at least one season in your data, and interpret your results.

df_xts |> 
  rollapply(width = 30, FUN = mean, fill = FALSE) |>
  ggplot(mapping = aes(x = Index, y = BMI)) + 
  geom_line(mapping = aes(x = Index, y = BMI)) + theme_classic()

df_ts |> ggplot(mapping = aes(x = Date, y = BMI)) + 
  geom_point(size = 1, alpha = 0.4) + 
  geom_smooth(span = 0.2, color = 'blue', se = FALSE)+theme_classic()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Can you illustrate the seasonality using ACF or PACF?

acf(df_ts, ci = 0.95, na.action = na.exclude)

pacf(df_xts, na.action = na.exclude, xlab = 'lag', main = "PACF for BMI pageviews" )

df_diff7 <- diff(df_xts, lag = 7, na.pad = FALSE)

df_diff7 |> ggplot(mapping = aes(x = Index, y = BMI)) + 
  geom_line(mapping = aes(x = Index, y = BMI)) + theme_classic()+
  geom_smooth(method = 'lm', se = FALSE, color ='red')
## `geom_smooth()` using formula = 'y ~ x'

If you choose this option, find ways to tie your results from the below analysis into what you’re seeing with your own data!