Preparing the data
library(reshape2)
library(ggplot2)
library(plotly)
library(dplyr)
library(knitr)
rm(list = ls())
data <- read.delim('/Users/helenalindsay/Documents/Fall_23/ISYE6501/hw3/temps.txt')
colnames(data) <- sub("X", "", colnames(data))
set.seed(1)
Visualizing data
years <- melt(data, id.vars = c("DAY"),variable.name = "YEAR",value.name="TEMP")
fig1 <- plot_ly(years, y = ~TEMP, x = ~YEAR, type = "box")
fig1
Q6.2_1 CUSUM to detect unofficial end of summer
With a fixed C-value of 20, we conducted experiments across different
T-values. We observed that a T-value of 75 yielded dates that closely
matched the ‘realistic’ unofficial end of summer, although it missed the
year 2013 because no days had a cusum value above the specified
threshold. On the other hand, when employing a T-value of 70, the model
produced dates for all years, but a majority of these dates fell in
July, which appeared less realistic when considering the context.
avg <- mean(years$TEMP)
for (i in 2:(nrow(years)-1)) {
years$cusum[i] <- max(0, (years$TEMP[i-1] + (avg- years$TEMP[i]- 20)))
}
T=75
subset_data_higherT <- years[years$cusum > 75, ]
first_rows_per_year_higherT <- subset_data_higherT %>%
group_by(YEAR) %>%
slice(1) %>%
ungroup()
first_rows_per_year_higherT$DAY <- as.Date(first_rows_per_year_higherT$DAY, format = "%d-%b")
kable(head(first_rows_per_year_higherT,19))
| 2023-10-04 |
1996 |
70 |
77.33902 |
| 2023-07-31 |
1997 |
72 |
79.33902 |
| 2023-10-08 |
1998 |
69 |
76.33902 |
| 2023-09-09 |
1999 |
77 |
77.33902 |
| 2023-09-06 |
2000 |
66 |
78.33902 |
| 2023-09-24 |
2001 |
69 |
82.33902 |
| 2023-09-13 |
2002 |
75 |
78.33902 |
| 2023-09-08 |
2004 |
73 |
76.33902 |
| 2023-10-22 |
2005 |
67 |
76.33902 |
| 2023-09-13 |
2006 |
70 |
75.33902 |
| 2023-10-11 |
2007 |
67 |
77.33902 |
| 2023-10-01 |
2008 |
74 |
75.33902 |
| 2023-10-05 |
2009 |
62 |
75.33902 |
| 2023-08-02 |
2010 |
84 |
75.33902 |
| 2023-08-04 |
2011 |
85 |
75.33902 |
| 2023-07-02 |
2012 |
93 |
75.33902 |
| 2023-09-21 |
2013 |
73 |
76.33902 |
| 2023-10-04 |
2014 |
65 |
76.33902 |
| 2023-09-21 |
2015 |
77 |
75.33902 |
ggplot(data=first_rows_per_year_higherT, aes(x=DAY, y=cusum, group=1)) +geom_point() + scale_x_date(date_labels = "%m-%d")

T=70
subset_data <- years[years$cusum > 70, ]
first_rows_per_year <- subset_data %>%
group_by(YEAR) %>%
slice(1) %>%
ungroup()
first_rows_per_year$DAY <- as.Date(first_rows_per_year$DAY, format = "%d-%b")
kable(head(first_rows_per_year,20))
| 2023-07-04 |
1996 |
90 |
70.33902 |
| 2023-07-05 |
1997 |
84 |
70.33902 |
| 2023-07-27 |
1998 |
80 |
71.33902 |
| 2023-07-07 |
1999 |
82 |
72.33902 |
| 2023-07-23 |
2000 |
87 |
72.33902 |
| 2023-08-28 |
2001 |
81 |
73.33902 |
| 2023-07-11 |
2002 |
84 |
70.33902 |
| 2023-09-06 |
2003 |
73 |
74.33902 |
| 2023-07-15 |
2004 |
84 |
70.33902 |
| 2023-07-06 |
2005 |
82 |
70.33902 |
| 2023-07-06 |
2006 |
81 |
72.33902 |
| 2023-07-02 |
2007 |
85 |
73.33902 |
| 2023-07-13 |
2008 |
85 |
71.33902 |
| 2023-07-05 |
2009 |
80 |
74.33902 |
| 2023-08-02 |
2010 |
84 |
75.33902 |
| 2023-07-14 |
2011 |
90 |
70.33902 |
| 2023-07-02 |
2012 |
93 |
75.33902 |
| 2023-07-03 |
2013 |
76 |
72.33902 |
| 2023-07-20 |
2014 |
76 |
72.33902 |
| 2023-07-03 |
2015 |
79 |
71.33902 |
# Create a plot of 'cusum' values for the subset
ggplot(data=first_rows_per_year, aes(x=DAY, y=cusum, group=1)) +geom_point() + scale_x_date(date_labels = "%m-%d")

Q6.2_2 CUSUM to detect whether Atlanta has gotten hotter over the
years
##### Q6.2_2
cusum <- aggregate(TEMP~YEAR, data=years, mean)
ggplot(data=cusum, aes(x=YEAR, y=TEMP, group=1)) + geom_line(color="blue")+geom_point()

mu <- mean(cusum$TEMP)
cusum$cusum <- 0
for (i in 2:(nrow(cusum)-1)) {
year <- cusum$YEAR[i]
x <- cusum$TEMP[i]
cusum$cusum[i] <- max(0,cusum$cusum[i - 1] + (x - mu - 0))}
kable(head(cusum))
| 1996 |
83.71545 |
0.0000000 |
| 1997 |
81.67480 |
0.0000000 |
| 1998 |
84.26016 |
0.9211382 |
| 1999 |
83.35772 |
0.9398374 |
| 2000 |
84.03252 |
1.6333333 |
| 2001 |
81.55285 |
0.0000000 |
#plot(cusum$S, type = "l")
ggplot(data=cusum, aes(x=YEAR, y=cusum, group=1)) + geom_line(color="blue")+geom_point()

mu <- mean(cusum$TEMP)
results <- data.frame()
for (t in c(5, 10, 15, 20)) {
for (c in c(0, 2, 4, 6, 8)) {
S <- 0
changeyear <- NA
for (i in 1:nrow(cusum)) {
year <- cusum$YEAR[i]
x <- cusum$TEMP[i]
S <- max(0, S + (x - mu -c))
if (S > t) {
changeyear <- year
break
}
}
results <- rbind(results, data.frame(c, t, changeyear))
}
}
print(results)
## c t changeyear
## 1 0 5 2011
## 2 2 5 <NA>
## 3 4 5 <NA>
## 4 6 5 <NA>
## 5 8 5 <NA>
## 6 0 10 <NA>
## 7 2 10 <NA>
## 8 4 10 <NA>
## 9 6 10 <NA>
## 10 8 10 <NA>
## 11 0 15 <NA>
## 12 2 15 <NA>
## 13 4 15 <NA>
## 14 6 15 <NA>
## 15 8 15 <NA>
## 16 0 20 <NA>
## 17 2 20 <NA>
## 18 4 20 <NA>
## 19 6 20 <NA>
## 20 8 20 <NA>