Change Detection using CUSUM

Preparing the data

library(reshape2)
library(ggplot2)
library(plotly)
library(dplyr)
library(knitr)

rm(list = ls())

data <- read.delim('/Users/helenalindsay/Documents/Fall_23/ISYE6501/hw3/temps.txt')
colnames(data) <- sub("X", "", colnames(data))
set.seed(1)

Visualizing data

years <- melt(data, id.vars = c("DAY"),variable.name = "YEAR",value.name="TEMP")
fig1 <- plot_ly(years, y = ~TEMP, x = ~YEAR, type = "box")
fig1

Q6.2_1 CUSUM to detect unofficial end of summer

With a fixed C-value of 20, we conducted experiments across different T-values. We observed that a T-value of 75 yielded dates that closely matched the ‘realistic’ unofficial end of summer, although it missed the year 2013 because no days had a cusum value above the specified threshold. On the other hand, when employing a T-value of 70, the model produced dates for all years, but a majority of these dates fell in July, which appeared less realistic when considering the context.

avg <- mean(years$TEMP)
for (i in 2:(nrow(years)-1)) {
  years$cusum[i] <- max(0, (years$TEMP[i-1] + (avg- years$TEMP[i]- 20)))
}

T=75

subset_data_higherT <- years[years$cusum > 75, ]
first_rows_per_year_higherT <- subset_data_higherT %>%
  group_by(YEAR) %>%
  slice(1) %>%
  ungroup()
first_rows_per_year_higherT$DAY <- as.Date(first_rows_per_year_higherT$DAY, format = "%d-%b")

kable(head(first_rows_per_year_higherT,19))

DAY	YEAR	TEMP	cusum
2023-10-04	1996	70	77.33902
2023-07-31	1997	72	79.33902
2023-10-08	1998	69	76.33902
2023-09-09	1999	77	77.33902
2023-09-06	2000	66	78.33902
2023-09-24	2001	69	82.33902
2023-09-13	2002	75	78.33902
2023-09-08	2004	73	76.33902
2023-10-22	2005	67	76.33902
2023-09-13	2006	70	75.33902
2023-10-11	2007	67	77.33902
2023-10-01	2008	74	75.33902
2023-10-05	2009	62	75.33902
2023-08-02	2010	84	75.33902
2023-08-04	2011	85	75.33902
2023-07-02	2012	93	75.33902
2023-09-21	2013	73	76.33902
2023-10-04	2014	65	76.33902
2023-09-21	2015	77	75.33902

ggplot(data=first_rows_per_year_higherT, aes(x=DAY, y=cusum, group=1)) +geom_point() + scale_x_date(date_labels = "%m-%d")

T=70

subset_data <- years[years$cusum > 70, ]
first_rows_per_year <- subset_data %>%
  group_by(YEAR) %>%
  slice(1) %>%
  ungroup()
first_rows_per_year$DAY <- as.Date(first_rows_per_year$DAY, format = "%d-%b")

kable(head(first_rows_per_year,20))

DAY	YEAR	TEMP	cusum
2023-07-04	1996	90	70.33902
2023-07-05	1997	84	70.33902
2023-07-27	1998	80	71.33902
2023-07-07	1999	82	72.33902
2023-07-23	2000	87	72.33902
2023-08-28	2001	81	73.33902
2023-07-11	2002	84	70.33902
2023-09-06	2003	73	74.33902
2023-07-15	2004	84	70.33902
2023-07-06	2005	82	70.33902
2023-07-06	2006	81	72.33902
2023-07-02	2007	85	73.33902
2023-07-13	2008	85	71.33902
2023-07-05	2009	80	74.33902
2023-08-02	2010	84	75.33902
2023-07-14	2011	90	70.33902
2023-07-02	2012	93	75.33902
2023-07-03	2013	76	72.33902
2023-07-20	2014	76	72.33902
2023-07-03	2015	79	71.33902

# Create a plot of 'cusum' values for the subset
ggplot(data=first_rows_per_year, aes(x=DAY, y=cusum, group=1)) +geom_point() + scale_x_date(date_labels = "%m-%d")

Q6.2_2 CUSUM to detect whether Atlanta has gotten hotter over the years

##### Q6.2_2
cusum <- aggregate(TEMP~YEAR, data=years, mean)
ggplot(data=cusum, aes(x=YEAR, y=TEMP, group=1)) + geom_line(color="blue")+geom_point()

mu <- mean(cusum$TEMP)
cusum$cusum <- 0
for (i in 2:(nrow(cusum)-1)) {
  year <- cusum$YEAR[i]
  x <- cusum$TEMP[i]
  cusum$cusum[i] <- max(0,cusum$cusum[i - 1] + (x - mu - 0))}
kable(head(cusum))

YEAR	TEMP	cusum
1996	83.71545	0.0000000
1997	81.67480	0.0000000
1998	84.26016	0.9211382
1999	83.35772	0.9398374
2000	84.03252	1.6333333
2001	81.55285	0.0000000

#plot(cusum$S, type = "l")
ggplot(data=cusum, aes(x=YEAR, y=cusum, group=1)) + geom_line(color="blue")+geom_point()

mu <- mean(cusum$TEMP)

results <- data.frame()

for (t in c(5, 10, 15, 20)) {
  for (c in c(0, 2, 4, 6, 8)) {
    S <- 0
    changeyear <- NA
    for (i in 1:nrow(cusum)) {
      year <- cusum$YEAR[i]
      x <- cusum$TEMP[i]
      
      S <- max(0, S + (x - mu -c))
      
      if (S > t) {
        changeyear <- year
        break
      }
    }
    results <- rbind(results, data.frame(c, t, changeyear))
  }
}

print(results)

##    c  t changeyear
## 1  0  5       2011
## 2  2  5       <NA>
## 3  4  5       <NA>
## 4  6  5       <NA>
## 5  8  5       <NA>
## 6  0 10       <NA>
## 7  2 10       <NA>
## 8  4 10       <NA>
## 9  6 10       <NA>
## 10 8 10       <NA>
## 11 0 15       <NA>
## 12 2 15       <NA>
## 13 4 15       <NA>
## 14 6 15       <NA>
## 15 8 15       <NA>
## 16 0 20       <NA>
## 17 2 20       <NA>
## 18 4 20       <NA>
## 19 6 20       <NA>
## 20 8 20       <NA>