Preparing the data

library(reshape2)
library(ggplot2)
library(plotly)
library(dplyr)
library(knitr)

rm(list = ls())

data <- read.delim('/Users/helenalindsay/Documents/Fall_23/ISYE6501/hw3/temps.txt')
colnames(data) <- sub("X", "", colnames(data))
set.seed(1)

Visualizing data

years <- melt(data, id.vars = c("DAY"),variable.name = "YEAR",value.name="TEMP")
fig1 <- plot_ly(years, y = ~TEMP, x = ~YEAR, type = "box")
fig1

Q6.2_1 CUSUM to detect unofficial end of summer

With a fixed C-value of 20, we conducted experiments across different T-values. We observed that a T-value of 75 yielded dates that closely matched the ‘realistic’ unofficial end of summer, although it missed the year 2013 because no days had a cusum value above the specified threshold. On the other hand, when employing a T-value of 70, the model produced dates for all years, but a majority of these dates fell in July, which appeared less realistic when considering the context.

avg <- mean(years$TEMP)
for (i in 2:(nrow(years)-1)) {
  years$cusum[i] <- max(0, (years$TEMP[i-1] + (avg- years$TEMP[i]- 20)))
}

T=75

subset_data_higherT <- years[years$cusum > 75, ]
first_rows_per_year_higherT <- subset_data_higherT %>%
  group_by(YEAR) %>%
  slice(1) %>%
  ungroup()
first_rows_per_year_higherT$DAY <- as.Date(first_rows_per_year_higherT$DAY, format = "%d-%b")

kable(head(first_rows_per_year_higherT,19))
DAY YEAR TEMP cusum
2023-10-04 1996 70 77.33902
2023-07-31 1997 72 79.33902
2023-10-08 1998 69 76.33902
2023-09-09 1999 77 77.33902
2023-09-06 2000 66 78.33902
2023-09-24 2001 69 82.33902
2023-09-13 2002 75 78.33902
2023-09-08 2004 73 76.33902
2023-10-22 2005 67 76.33902
2023-09-13 2006 70 75.33902
2023-10-11 2007 67 77.33902
2023-10-01 2008 74 75.33902
2023-10-05 2009 62 75.33902
2023-08-02 2010 84 75.33902
2023-08-04 2011 85 75.33902
2023-07-02 2012 93 75.33902
2023-09-21 2013 73 76.33902
2023-10-04 2014 65 76.33902
2023-09-21 2015 77 75.33902
ggplot(data=first_rows_per_year_higherT, aes(x=DAY, y=cusum, group=1)) +geom_point() + scale_x_date(date_labels = "%m-%d")

T=70

subset_data <- years[years$cusum > 70, ]
first_rows_per_year <- subset_data %>%
  group_by(YEAR) %>%
  slice(1) %>%
  ungroup()
first_rows_per_year$DAY <- as.Date(first_rows_per_year$DAY, format = "%d-%b")

kable(head(first_rows_per_year,20))
DAY YEAR TEMP cusum
2023-07-04 1996 90 70.33902
2023-07-05 1997 84 70.33902
2023-07-27 1998 80 71.33902
2023-07-07 1999 82 72.33902
2023-07-23 2000 87 72.33902
2023-08-28 2001 81 73.33902
2023-07-11 2002 84 70.33902
2023-09-06 2003 73 74.33902
2023-07-15 2004 84 70.33902
2023-07-06 2005 82 70.33902
2023-07-06 2006 81 72.33902
2023-07-02 2007 85 73.33902
2023-07-13 2008 85 71.33902
2023-07-05 2009 80 74.33902
2023-08-02 2010 84 75.33902
2023-07-14 2011 90 70.33902
2023-07-02 2012 93 75.33902
2023-07-03 2013 76 72.33902
2023-07-20 2014 76 72.33902
2023-07-03 2015 79 71.33902
# Create a plot of 'cusum' values for the subset
ggplot(data=first_rows_per_year, aes(x=DAY, y=cusum, group=1)) +geom_point() + scale_x_date(date_labels = "%m-%d")

Q6.2_2 CUSUM to detect whether Atlanta has gotten hotter over the years

##### Q6.2_2
cusum <- aggregate(TEMP~YEAR, data=years, mean)
ggplot(data=cusum, aes(x=YEAR, y=TEMP, group=1)) + geom_line(color="blue")+geom_point()

mu <- mean(cusum$TEMP)
cusum$cusum <- 0
for (i in 2:(nrow(cusum)-1)) {
  year <- cusum$YEAR[i]
  x <- cusum$TEMP[i]
  cusum$cusum[i] <- max(0,cusum$cusum[i - 1] + (x - mu - 0))}
kable(head(cusum))
YEAR TEMP cusum
1996 83.71545 0.0000000
1997 81.67480 0.0000000
1998 84.26016 0.9211382
1999 83.35772 0.9398374
2000 84.03252 1.6333333
2001 81.55285 0.0000000
#plot(cusum$S, type = "l")
ggplot(data=cusum, aes(x=YEAR, y=cusum, group=1)) + geom_line(color="blue")+geom_point()

mu <- mean(cusum$TEMP)

results <- data.frame()

for (t in c(5, 10, 15, 20)) {
  for (c in c(0, 2, 4, 6, 8)) {
    S <- 0
    changeyear <- NA
    for (i in 1:nrow(cusum)) {
      year <- cusum$YEAR[i]
      x <- cusum$TEMP[i]
      
      S <- max(0, S + (x - mu -c))
      
      if (S > t) {
        changeyear <- year
        break
      }
    }
    results <- rbind(results, data.frame(c, t, changeyear))
  }
}

print(results)
##    c  t changeyear
## 1  0  5       2011
## 2  2  5       <NA>
## 3  4  5       <NA>
## 4  6  5       <NA>
## 5  8  5       <NA>
## 6  0 10       <NA>
## 7  2 10       <NA>
## 8  4 10       <NA>
## 9  6 10       <NA>
## 10 8 10       <NA>
## 11 0 15       <NA>
## 12 2 15       <NA>
## 13 4 15       <NA>
## 14 6 15       <NA>
## 15 8 15       <NA>
## 16 0 20       <NA>
## 17 2 20       <NA>
## 18 4 20       <NA>
## 19 6 20       <NA>
## 20 8 20       <NA>