Learning change point analysis using examples from the following blog post: Otto, S. A. (2019, Sept. 28). Comparison of change point detection methods [Blog post]. Retrieved from https://www.marinedatascience.co/blog/2019/09/28/comparison-of-change-point-detection-methods/. The following packages need to be loaded:
# Loading packages
library(changepoint)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Successfully loaded changepoint package version 2.2.4
## See NEWS for details of changes.
library(bcp)
## Loading required package: grid
library(strucchange)
## Loading required package: sandwich
library(segmented)
## Loading required package: MASS
## Loading required package: nlme
library(tree)
I am learning with the Nile dataset, which contains annual flow measurements of the river Nile from 1871 to 1970:
library(datasets)
data(Nile)
str(Nile)
## Time-Series [1:100] from 1871 to 1970: 1120 1160 963 1210 1160 1160 813 1230 1370 1140 ...
Now plot the Nile dataset:
plot(Nile)
Looking for 1 change point using the “AMOC” method (which is also the
default):
cpt.mean(Nile, method = "AMOC")
## Class 'cpt' : Changepoint Object
## ~~ : S4 class containing 12 slots with names
## cpttype date version data.set method test.stat pen.type pen.value minseglen cpts ncpts.max param.est
##
## Created on : Sun Nov 13 23:03:19 2022
##
## summary(.) :
## ----------
## Created Using changepoint version 2.2.4
## Changepoint type : Change in mean
## Method of analysis : AMOC
## Test Statistic : Normal
## Type of penalty : MBIC with value, 13.81551
## Minimum Segment Length : 1
## Maximum no. of cpts : 1
## Changepoint Locations : 28
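The printed summary gives the changepoint location as observation 28. To work with it programmatically, here is a minimal sketch (the variable name cpt1 is mine, not from the original post) that stores the fit, extracts the index with cpts(), and maps it to a year:
# Store the AMOC fit and extract the changepoint location
cpt1 <- cpt.mean(Nile, method = "AMOC")
cpts(cpt1)              # observation index of the changepoint
time(Nile)[cpts(cpt1)]  # corresponding year
plot(cpt1)              # data with the fitted segment means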
The method correctly identifies a change point in 1898 (observation #28). What would happen if we used a setting that allows multiple change points (i.e. if we did not know the exact number beforehand)?
cpt.mean(Nile, method = "PELT", Q = 10)
## Class 'cpt' : Changepoint Object
## ~~ : S4 class containing 12 slots with names
## cpttype date version data.set method test.stat pen.type pen.value minseglen cpts ncpts.max param.est
##
## Created on : Sun Nov 13 23:03:19 2022
##
## summary(.) :
## ----------
## Created Using changepoint version 2.2.4
## Changepoint type : Change in mean
## Method of analysis : PELT
## Test Statistic : Normal
## Type of penalty : MBIC with value, 13.81551
## Minimum Segment Length : 1
## Maximum no. of cpts : Inf
## Number of changepoints: 94
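With the default settings PELT flags 94 changepoints, i.e. almost every observation. My reading (an assumption, not stated in the original post) is that the Normal cost in cpt.mean() effectively treats the variance as known and equal to 1, so for a series with a variance this large the MBIC penalty is far too small; standardizing the series first is one common workaround, sketched below. The original workflow instead explores a whole range of penalties with the CROPS approach, which follows.
# Assumption (my reading): rescaling the series makes the default penalty
# meaningful again; the changepoint locations are unchanged by scaling.
nile_std <- (Nile - mean(Nile)) / sd(Nile)
cpt_pelt <- cpt.mean(nile_std, method = "PELT")
cpts(cpt_pelt)              # changepoint locations on the standardized series
time(Nile)[cpts(cpt_pelt)]  # corresponding years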
cpt2 <- cpt.mean(Nile, method = "PELT", penalty = "CROPS", pen.value = c(1,25))
## [1] "Maximum number of runs of algorithm = 6"
## [1] "Completed runs = 2"
## [1] "Completed runs = 3"
## [1] "Completed runs = 5"
summary(cpt2)
## Created Using changepoint version 2.2.4
## Changepoint type : Change in mean
## Method of analysis : PELT
## Test Statistic : Normal
## Type of penalty : CROPS with value, 1 25
## Minimum Segment Length : 1
## Maximum no. of cpts : Inf
## Changepoint Locations :
## Range of segmentations:
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 1 2 3 4 6 7 8 9 10 11 12 13 14 15
## [2,] 1 2 3 4 6 7 8 9 10 11 12 13 14 15
## [3,] 1 2 3 4 6 7 8 9 10 11 12 13 14 15
## [4,] 1 2 3 4 6 7 8 9 10 11 12 13 14 15
## [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26]
## [1,] 16 17 18 19 20 21 22 23 24 25 26 27
## [2,] 16 17 18 19 20 21 22 23 24 25 26 27
## [3,] 16 17 18 19 20 21 22 23 24 25 26 27
## [4,] 16 17 18 19 20 21 22 23 24 25 26 27
## [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37] [,38]
## [1,] 28 29 30 31 32 33 34 35 36 37 38 39
## [2,] 28 29 30 31 32 33 34 35 36 37 38 39
## [3,] 28 29 30 31 32 33 34 35 36 37 38 39
## [4,] 28 29 30 31 32 33 34 35 36 37 38 39
## [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46] [,47] [,48] [,49] [,50]
## [1,] 40 41 42 43 44 45 46 47 48 49 50 51
## [2,] 40 41 42 43 44 45 46 47 48 49 50 51
## [3,] 40 41 42 43 44 45 46 47 48 49 50 51
## [4,] 40 41 42 43 44 45 46 47 48 49 50 51
## [,51] [,52] [,53] [,54] [,55] [,56] [,57] [,58] [,59] [,60] [,61] [,62]
## [1,] 52 53 54 55 56 57 58 59 60 61 62 63
## [2,] 52 54 55 56 57 58 59 60 61 62 63 64
## [3,] 52 54 55 56 57 58 59 60 61 62 63 64
## [4,] 52 54 55 56 57 58 59 60 61 62 63 64
## [,63] [,64] [,65] [,66] [,67] [,68] [,69] [,70] [,71] [,72] [,73] [,74]
## [1,] 64 65 66 67 68 69 70 71 72 73 74 75
## [2,] 65 66 67 68 69 70 71 72 73 74 75 76
## [3,] 65 66 67 68 69 70 71 72 73 74 75 76
## [4,] 65 66 67 68 69 70 71 72 73 74 75 76
## [,75] [,76] [,77] [,78] [,79] [,80] [,81] [,82] [,83] [,84] [,85] [,86]
## [1,] 76 77 78 79 80 81 82 83 84 85 86 87
## [2,] 77 78 79 80 81 82 83 84 85 86 87 88
## [3,] 77 78 79 80 81 82 83 84 85 86 87 88
## [4,] 77 78 79 80 82 83 84 85 86 87 88 89
## [,87] [,88] [,89] [,90] [,91] [,92] [,93] [,94] [,95] [,96] [,97] [,98]
## [1,] 88 89 90 91 92 93 94 95 96 97 98 99
## [2,] 89 90 91 92 93 94 95 96 97 98 99 NA
## [3,] 89 90 91 92 93 94 95 96 97 99 NA NA
## [4,] 90 91 93 94 95 96 97 99 NA NA NA NA
##
## For penalty values: 1 2 8 12.5
We can also plot the diagnostics, which show the number of changepoints in each segmentation against the change in the test statistic when that changepoint is added. The plot is read like a scree plot in principal component analysis: adding a true changepoint changes the cost sharply, whereas adding a changepoint that only fits noise changes it very little:
plot(cpt2, diagnostic = TRUE)
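The changepoint package also provides accessors for the full set of CROPS segmentations, and the plot method can display the segmentation with a chosen number of changepoints. A sketch (picking 1 changepoint is my reading of the diagnostic plot, not a value given in the original output):
cpts.full(cpt2)        # matrix of changepoint locations, one row per penalty
pen.value.full(cpt2)   # penalty values corresponding to those rows
plot(cpt2, ncpts = 1)  # data with the 1-changepoint segmentation overlaid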
bcp (Bayesian change point analysis) is another method; let's see how it performs:
x <- Nile
bcp_x <- bcp(x, return.mcmc = TRUE)
plot(bcp_x)
The lower posterior probability plot shows that at one location (around #28) the probability of a change is very high. We can get the exact locations where the probabilities are high (e.g. > 70%) with the following code:
bcp_sum <- as.data.frame(summary(bcp_x))
##
## Bayesian Change Point (bcp) summary:
##
##
## Probability of a change in mean and posterior means:
##
## Probability X1
## 1 0.004 1087.4
## 2 0.024 1087.3
## 3 0.010 1086.1
## 4 0.014 1086.6
## 5 0.008 1086.2
## 6 0.014 1085.7
## 7 0.012 1084.0
## 8 0.002 1087.2
## 9 0.024 1087.1
## 10 0.020 1083.6
## 11 0.016 1081.5
## 12 0.016 1080.2
## 13 0.008 1079.8
## 14 0.014 1079.3
## 15 0.006 1078.7
## 16 0.012 1078.5
## 17 0.020 1079.2
## 18 0.026 1076.6
## 19 0.044 1079.6
## 20 0.028 1085.9
## 21 0.052 1088.9
## 22 0.026 1094.9
## 23 0.014 1097.0
## 24 0.018 1098.3
## 25 0.004 1099.2
## 26 0.090 1099.1
## 27 0.120 1078.1
## 28 0.724 1050.2
## 29 0.090 867.3
## 30 0.012 850.2
## 31 0.006 850.2
## 32 0.014 849.9
## 33 0.000 850.3
## 34 0.010 850.3
## 35 0.008 850.4
## 36 0.012 850.7
## 37 0.024 850.6
## 38 0.010 852.9
## 39 0.012 853.7
## 40 0.014 853.4
## 41 0.050 851.8
## 42 0.030 841.9
## 43 0.042 837.5
## 44 0.012 846.9
## 45 0.048 847.7
## 46 0.014 858.1
## 47 0.024 857.4
## 48 0.024 853.1
## 49 0.008 851.0
## 50 0.018 851.0
## 51 0.004 850.9
## 52 0.014 851.0
## 53 0.008 851.1
## 54 0.014 851.1
## 55 0.016 851.2
## 56 0.012 851.5
## 57 0.016 851.8
## 58 0.008 852.2
## 59 0.006 852.5
## 60 0.008 852.6
## 61 0.006 852.8
## 62 0.006 853.0
## 63 0.010 853.2
## 64 0.008 853.7
## 65 0.010 853.9
## 66 0.010 853.7
## 67 0.008 853.5
## 68 0.014 853.6
## 69 0.010 852.2
## 70 0.012 851.4
## 71 0.012 851.6
## 72 0.010 852.3
## 73 0.014 852.8
## 74 0.022 853.2
## 75 0.022 854.5
## 76 0.016 856.3
## 77 0.026 856.5
## 78 0.006 857.3
## 79 0.012 857.5
## 80 0.024 857.8
## 81 0.010 858.3
## 82 0.030 858.9
## 83 0.052 860.8
## 84 0.016 865.1
## 85 0.016 865.8
## 86 0.010 866.8
## 87 0.020 866.8
## 88 0.012 868.2
## 89 0.012 869.0
## 90 0.018 869.5
## 91 0.012 870.4
## 92 0.006 870.1
## 93 0.020 870.4
## 94 0.048 871.1
## 95 0.050 864.0
## 96 0.024 856.4
## 97 0.032 854.0
## 98 0.022 848.8
## 99 0.008 846.0
## 100 NA 845.3
# Let's filter the data frame and identify the year:
bcp_sum$id <- 1:length(x)
(sel <- bcp_sum[which(bcp_x$posterior.prob > 0.7), ])
# Get the year:
time(x)[sel$id]
## [1] 1898
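The bcp object also stores the posterior mean for each observation and the posterior probability of a change, which can be plotted directly against the raw series (a minimal sketch using the returned posterior.mean and posterior.prob components; the 0.7 threshold mirrors the filter above):
# Overlay the bcp posterior mean on the observed series
plot(as.numeric(x), type = "l", xlab = "Index", ylab = "Flow")
lines(bcp_x$posterior.mean[, 1], col = "red", lwd = 2)
# Posterior probability of a change at each position
plot(bcp_x$posterior.prob, type = "h", xlab = "Index", ylab = "Posterior probability")
abline(h = 0.7, lty = 2)  # threshold used in the filter above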
strucchange: first we compute an empirical fluctuation process with an OLS-based CUSUM test using efp():
ocus_nile <- efp(Nile ~ 1, type = "OLS-CUSUM")
op <- par(mfrow = c(1,2))
plot(ocus_nile)
plot(ocus_nile, alpha = 0.01, alt.boundary = TRUE)
sctest(ocus_nile)
##
## OLS-based CUSUM test
##
## data: ocus_nile
## S0 = 2.9518, p-value = 5.409e-08
fs_nile <- Fstats(Nile ~ 1)
plot(fs_nile)
#Both tests suggest a significant change in the time series.
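The evidence from the F statistics can also be expressed as a formal test; sctest() has a method for Fstats objects (a sketch, the original only shows the plot):
# supF test on the sequence of F statistics
sctest(fs_nile, type = "supF")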
# Test for 1 breakpoint using the breakpoints() function → Note that the number of breakpoints has to be defined beforehand for this method!
bp_nile <- breakpoints(Nile ~ 1)
bp_nile
##
## Optimal 2-segment partition:
##
## Call:
## breakpoints.formula(formula = Nile ~ 1)
##
## Breakpoints at observation number:
## 28
##
## Corresponding to breakdates:
## 1898
To check for multiple breaks, the breakpoints() function can also be applied to a breakpoints object with an explicit breaks argument (so you effectively nest one breakpoints() call inside another):
breakpoints(bp_nile, breaks = 2)
##
## Optimal 3-segment partition:
##
## Call:
## breakpoints.breakpointsfull(obj = bp_nile, breaks = 2)
##
## Breakpoints at observation number:
## 28 83
##
## Corresponding to breakdates:
## 1898 1953
# Nested syntax with 3 breaks:
breakpoints(breakpoints(Nile ~ 1), breaks = 3)
##
## Optimal 4-segment partition:
##
## Call:
## breakpoints.breakpointsfull(obj = breakpoints(Nile ~ 1), breaks = 3)
##
## Breakpoints at observation number:
## 28 68 83
##
## Corresponding to breakdates:
## 1898 1938 1953
So how does one know how many breakpoints exist in the time series? Here, comparing the BIC estimates for different numbers of breakpoints is useful:
plot(bp_nile)
Based on the BIC we should choose one breakpoint.
To get the optimal number numerically instead of inspecting the plot each time, here is a solution:
# get best model: find the local minima of the BIC curve
opt_bpts <- function(x) {
  # x: named vector of BIC values, e.g. bpts_sum$RSS["BIC", ]
  n <- length(x)
  lowest <- vector("logical", length = n)
  # the endpoints have only one neighbour, so they are never flagged here
  lowest[1] <- FALSE
  lowest[n] <- FALSE
  for (i in 2:(n - 1)) {
    lowest[i] <- x[i] < x[i - 1] & x[i] < x[i + 1]
  }
  out <- as.integer(names(x)[lowest])
  return(out)
}
bpts_sum <- summary(bp_nile)
opt_brks <- opt_bpts(bpts_sum$RSS["BIC",])
opt_brks
## [1] 1
One can also visualize the breakpoints in the time series plot with confidence intervals using confint(), for which strucchange provides a method for breakpoints objects:
ci_nile <- confint(bp_nile, breaks = opt_brks)
plot(Nile)
lines(ci_nile)
We can also add the regression lines of the null model and of our model with 1 breakpoint for comparison:
# null model
fm0 <- lm(Nile ~ 1)
coef(fm0)
## (Intercept)
## 919.35
# breakpoint model
nile_fac <- breakfactor(bp_nile, breaks = 1)
fm1 <- lm(Nile ~ nile_fac - 1)
coef(fm1)
## nile_facsegment1 nile_facsegment2
## 1097.7500 849.9722
plot(Nile)
lines(ci_nile)
lines(ts(fitted(fm0), start = 1871), col = 3)
lines(ts(fitted(fm1), start = 1871), col = 4)
lines(bp_nile)
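To quantify how much the one-breakpoint model improves on the constant-mean null model, the two lm fits can also be compared directly (a sketch, not part of the original post):
anova(fm0, fm1)  # F test of the segment structure against the constant mean
AIC(fm0, fm1)    # information criteria for both fits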
segmented: with this method the number of breakpoints also has to be specified beforehand (via starting values for the break locations), similar to the strucchange::breakpoints() function:
Nile_df <- data.frame(Nile = as.numeric(Nile), Year = 1871:1970)
seg_nile <- segmented(lm(Nile ~ 1, data = Nile_df), ~ Year,
psi = list(Year = c(1900))) # using 1900 as starting value
summary(seg_nile)
##
## ***Regression Model with Segmented Relationship(s)***
##
## Call:
## segmented.lm(obj = lm(Nile ~ 1, data = Nile_df), seg.Z = ~Year,
## psi = list(Year = c(1900)))
##
## Estimated Break-Point(s):
## Est. St.Err
## psi1.Year 1875.95 27.328
##
## Meaningful coefficients of the linear terms:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1041.1365 67.7584 15.365 <2e-16 ***
## U1.Year -2.7247 0.5669 -4.807 NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 151.5 on 97 degrees of freedom
## Multiple R-Squared: 0.2146, Adjusted R-squared: 0.1984
##
## Boot restarting based on 6 samples. Last fit:
## Convergence attained in 2 iterations (rel. change 1.4674e-08)
summary(seg_nile)$psi
## Initial Est. St.Err
## psi1.Year 1900 1875.95 27.32767
plot(Nile ~ Year, data = Nile_df, type = "l")
plot(seg_nile, add = TRUE)
lines(seg_nile, col = "red")
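The segmented fit puts the break around 1876 with a large standard error, which differs noticeably from the other methods. A sketch comparing the two estimates, using existing helpers (confint() for segmented fits, breakdates() for the strucchange fit):
confint(seg_nile)    # confidence interval for the estimated break year
breakdates(bp_nile)  # strucchange estimate (observation 28, i.e. 1898)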
tree: regression trees can also be used; splitting on Year partitions the series into segments with roughly constant mean, so the split points act as breakpoints:
Nile_df <- data.frame(Nile = as.numeric(Nile), Year = 1871:1970)
tree_nile <- tree(Nile ~ Year, data = Nile_df)
summary(tree_nile)
##
## Regression tree:
## tree(formula = Nile ~ Year, data = Nile_df)
## Number of terminal nodes: 6
## Residual mean deviance: 13750 = 1293000 / 94
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -380.100 -62.160 -3.645 0.000 54.840 283.900
plot(tree_nile)
text(tree_nile, pretty = 0)
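The fitted tree has 6 terminal nodes, i.e. more splits than the single change favoured by the other methods. One option (a sketch, not from the original post) is to apply cost-complexity pruning with cv.tree() and prune.tree(), for example pruning to two leaves so that the single remaining split acts as the breakpoint:
# Cross-validated deviance for trees of different sizes
cv_nile <- cv.tree(tree_nile)
plot(cv_nile$size, cv_nile$dev, type = "b",
     xlab = "Number of terminal nodes", ylab = "Deviance")
# Prune to two terminal nodes, i.e. a single split/breakpoint
pruned_nile <- prune.tree(tree_nile, best = 2)
plot(pruned_nile)
text(pruned_nile, pretty = 0)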
Finally, let's test the methods on simulated data where the true change point is known:
set.seed(1)
# Normally distributed data with a single change in mean (between x = 25 and x = 26)
df <- data.frame(
x = 1:50,
z = c(rnorm(25,0,1), rnorm(25,5,1))
)
plot(df$x, df$z, type = 'l', xlab = '', ylab = '')
lines(x = 1:25, y = rep(0,25), col = 'red', lwd = 3)
lines(x=26:50, y = rep(5,25), col = 'red', lwd = 3)
z_ts <- as.ts(df$z)
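With the true change point known, we can check whether a simple change-in-mean search recovers it (a sketch closing the exercise; cpt_sim is my variable name):
# AMOC on the simulated series should place the change near observation 25
cpt_sim <- cpt.mean(z_ts, method = "AMOC")
cpts(cpt_sim)
plot(cpt_sim)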