library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(ggrepel)
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(tsibble)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
##
## Attaching package: 'tsibble'
## The following object is masked from 'package:zoo':
##
## index
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:tsibble':
##
## interval
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
df <- read.csv('~/Downloads/pageviews-20220222-20241118.csv')
summary(df)
## Date BMI
## Length:1001 Min. : 25.00
## Class :character 1st Qu.: 53.00
## Mode :character Median : 61.00
## Mean : 61.62
## 3rd Qu.: 69.00
## Max. :124.00
## NA's :1
which(is.na(df),arr.ind = TRUE)
## row col
## [1,] 1001 2
df <- df[-1001,]
df$Date <- as.Date(df$Date)
df_ts <- df |> as_tsibble(index = Date)
df_xts <- xts(x = df_ts$BMI, order.by = df_ts$Date)
df_xts <- setNames(df_xts, "BMI")
df_ts |> ggplot(mapping = aes(x = Date, y = BMI)) + geom_line() +
geom_smooth(method = lm, se = FALSE, color = 'red')+ theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
df_ts7 <- df_ts |>
mutate(W = floor_date(Date, "week"))
df_ts7_D <- df_ts7 |>
select(-Date) |>
group_by(W) |>
summarize(Weekly_BMI = sum(BMI))
df_ts7_D |>
ggplot(mapping = aes(x = W, y = Weekly_BMI)) +
geom_line() +
geom_smooth(method = lm, se = FALSE, color = 'red') +
theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
- For time period of one month (~30 days)
df_ts30 <- df_ts |>
mutate(Month = floor_date(Date, "month")) |>
group_by(Month) |>
summarize(Month_BMI = sum(BMI), .groups = 'drop')
df_ts30 |> ggplot(mapping = aes(x = Month, y = Month_BMI)) + geom_line() +
geom_smooth(method = lm, se = FALSE, color = 'red')+ theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
df_ts |> ggplot(mapping = aes(x = Date, y = BMI)) + geom_line() +
geom_smooth(method = lm, se = FALSE, color = 'red')+ theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
df_ts |> filter(Date< as.Date('2024-01-01')) |>
ggplot(mapping = aes(x = Date, y = BMI)) +
geom_line()+
geom_smooth(method = 'lm', se = FALSE, color = 'red')+ theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
* The downward trend is very high until 2024-01-01 (compare to
afterwards) but the downward trend continous even after 2024-01-01
df_ts |> filter(Date > as.Date('2024-01-01')) |>
ggplot(mapping = aes(x = Date, y = BMI)) +
geom_line()+
geom_smooth(method = 'lm', se = FALSE, color = 'blue')+ theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
model_lm <- lm(df_ts$BMI ~ df_ts$Date)
summary(model_lm) |> coefficients()
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 385.6471466 25.491438247 15.1285 1.048118e-46
## df_ts$Date -0.0165792 0.001304135 -12.7128 2.000051e-34
df_xts |>
rollapply(width = 30, FUN = mean, fill = FALSE) |>
ggplot(mapping = aes(x = Index, y = BMI)) +
geom_line(mapping = aes(x = Index, y = BMI)) + theme_classic()
We can observe that the plot is a lot smooth compared to simple plot of data.
The seasonality is evident but the seasonality is different at different points of time.
Another smoothing method is LOWESS as shown below.
LO(W)ESS
df_ts |> ggplot(mapping = aes(x = Date, y = BMI)) +
geom_point(size = 1, alpha = 0.4) +
geom_smooth(span = 0.2, color = 'blue', se = FALSE)+theme_classic()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
acf(df_ts, ci = 0.95, na.action = na.exclude)
pacf(df_xts, na.action = na.exclude, xlab = 'lag', main = "PACF for BMI pageviews" )
df_diff7 <- diff(df_xts, lag = 7, na.pad = FALSE)
df_diff7 |> ggplot(mapping = aes(x = Index, y = BMI)) +
geom_line(mapping = aes(x = Index, y = BMI)) + theme_classic()+
geom_smooth(method = 'lm', se = FALSE, color ='red')
## `geom_smooth()` using formula = 'y ~ x'