EPH-505 CodeBuild#3 Evidence of Climate Change

Set working directory

Read datasets

noaa <- read.csv("C:/Users/Chenyi Huang/OneDrive - Yale University/22Fall/EPH 505/Code Build/CodeBuild_3/noaa_bigdata.csv")  
noaadig <- read.csv("C:/Users/Chenyi Huang/OneDrive - Yale University/22Fall/EPH 505/Code Build/CodeBuild_3/noaa_bigdigest.csv")

Data Cleaning

Set as date

library(tidyverse)
library(lubridate)
noaa$year=str_sub(noaa$DATE, 1, 4)  #Get the year of each line
noaa$year<-noaa$year %>% as.numeric() 
noaa<-merge(noaa,noaadig,by.x="STATION",by.y="STATION",all=TRUE)  #Get the State and location information for each station

Subset the dataset

noaa_4s=noaa %>% filter(STATE=="CA" |STATE=="NJ" |STATE=="MO" |STATE=="VA")  #Subset only 4 states for regression analysis
noaa_4s <- noaa_4s %>% filter(year >= "1922")  #Subset only the recent 100 years
noaa_p1 <- noaa %>% filter(year >= "1981")  #Subset only the 1981-2010 period as the reference level for the overview of normals
noaa_p1 <- noaa_p1 %>% filter(year <= "2010") 
noaa_p2 <- noaa %>% filter(year >= "1991")  #Subset only the recent 30 period for the overview of normals
noaa_p2 <- noaa_p2 %>% filter(year <= "2020")

Get the dataset of TAVG for visualization

noaa_tavg<-noaa_p2[!is.na(noaa_p2$TAVG),]  
noaa_tavg_p1<-noaa_p1[!is.na(noaa_p1$TAVG),]  #Delete NA values 
noaa_tavg=aggregate(x=noaa_tavg$TAVG,
          by=list(noaa_tavg$STATION),
          FUN=mean)   #Get the mean of TAVG during the 1991-2020 period 
noaa_tavg_p1=aggregate(x=noaa_tavg_p1$TAVG,
          by=list(noaa_tavg_p1$STATION),
          FUN=mean)  #Get the mean of TAVG during the 1981-2010 period 
noaa_tavg <- rename(noaa_tavg,Station="Group.1",TAVG=x)
noaa_tavg_p1<- rename(noaa_tavg_p1,Station="Group.1",TAVG_p1=x)
noaa_tavg<-merge(noaa_tavg,noaadig,by.x="Station",by.y="STATION",all=TRUE) #Merge to get the State and location information for each station
noaa_tavg <- noaa_tavg %>% na.omit()
noaa_tavg_p1 <- noaa_tavg_p1 %>% na.omit()
tavg_dif<-merge(noaa_tavg,noaa_tavg_p1,by.x="Station",by.y="Station",all=TRUE)
tavg_dif<-tavg_dif %>%   group_by(Station) %>% mutate(change = TAVG-TAVG_p1)  #Add a variable measuring the difference between two 30-yr periods. This would be a comparison on an “apples to apples” basis
tavg_dif<-tavg_dif[!is.na(tavg_dif$TAVG),] 

Get the dataset of Precipitation for visualization

noaa_prcp<-noaa_p2[!is.na(noaa_p2$PRCP),]
noaa_prcp_p1<-noaa_p1[!is.na(noaa_p1$PRCP),]
noaa_prcp=aggregate(x=noaa_prcp$PRCP,
          by=list(noaa_prcp$STATION),
          FUN=mean)
noaa_prcp_p1=aggregate(x=noaa_prcp_p1$PRCP,
          by=list(noaa_prcp_p1$STATION),
          FUN=mean)
noaa_prcp <- rename(noaa_prcp,Station="Group.1",PRCP=x)
noaa_prcp_p1<- rename(noaa_prcp_p1,Station="Group.1",PRCP_p1=x)
noaa_prcp<-merge(noaa_prcp,noaadig,by.x="Station",by.y="STATION",all=TRUE)
noaa_prcp <- noaa_prcp %>% na.omit()
noaa_prcp_p1 <- noaa_prcp_p1 %>% na.omit()
prcp_dif<-merge(noaa_prcp,noaa_prcp_p1,by.x="Station",by.y="Station",all=TRUE)
prcp_dif<-prcp_dif %>%  group_by(Station) %>% mutate(pct_change = (PRCP-PRCP_p1)/PRCP_p1*100)  #Add a variable measuring the percentage change between two 30-yr periods.
prcp_dif<-prcp_dif[!is.na(prcp_dif$PRCP),]
prcp_dif<-prcp_dif[!is.na(prcp_dif$pct_change),]

Visualization

We use the ggplot2 and usmap packages to plot 2 climate normals (TAVG and PRCP) during the 1991-2020 period on the US map. Check these 2 links for more about the plot_usmap function: https://cran.r-project.org/web/packages/usmap/vignettes/mapping.html https://cran.r-project.org/web/packages/usmap/vignettes/advanced-mapping.html

For the plot_usmap function, it is necessary to coordinate date frame to usmap projection. Check the link for more information. Therefore, we need to use the ‘usmap_transform’ function. Check the link for more information: https://rdrr.io/cran/usmap/man/usmap_transform.html

Annual TACG normals

library(usmap)
library(ggplot2)
tavg_transformed<-usmap_transform(noaa_tavg,
  input_names = c("LONGITUDE", "LATITUDE"),
  output_names = c("x", "y")  
)  #Convert coordinate date frame to usmap projection. 

To customize the color palette for our graphs, we use the wesanderson packackge. Installation required. Check the link for more information: https://www.datanovia.com/en/blog/top-r-color-palettes-to-know-for-great-data-visualization/

library(wesanderson)
pal <- wes_palette("Zissou1", 100, type = "continuous")
plot_usmap(regions = "states",fill = "light yellow",labels = TRUE) +
  geom_point(data =  tavg_transformed,
             aes(x = x, y = y,col=TAVG)) +
scale_colour_gradientn(colours =pal)+
  labs(title = "a) Annual Tavg (°C) normals for 1991–2020") +
  theme(legend.position = "right")+
  ggeasy::easy_center_title() 

ggsave("../map-TAVG.png")

ggeasy::easy_center_title() is a great function to center the ggplot title. But package ‘ggeasy’ need to be installed.

Annual TACG normals Difference

To illustrate the potential change over time of the avereage temperature, we use the TAVG normals during the 1981-2010 period to make a comparison on an “apple-to-apple” basis. The red color means the area is hotter over time. While the blue color means the opposite direction.

tavg_dif<-usmap_transform(tavg_dif,
  input_names = c("LONGITUDE", "LATITUDE"),
  output_names = c("x", "y"))
plot_usmap(regions = "states",fill = "light yellow",labels = TRUE) +
  geom_point(data =  tavg_dif,
             aes(x = x, y = y,col=change),size=2) +
scale_color_gradient2(midpoint = 0, low = "blue", mid = "white",
                            high = "red", space = "Lab" ,name='TAVG Change (°C)')+
  labs(title = "c) Difference between the new Tavg normals for the 1991–2020 period \n and comparable averages for the 1981–2010 period.") +
  theme(legend.position = "right")+
  ggeasy::easy_center_title()  

ggsave("../map-TAVG-dif.png")

You can customize the color gradient for continuous variable using the scale_color_gradient2 function, including the specific value and color for midpoint.

Annual PRCP normals

prcp_transformed<-usmap_transform(noaa_prcp,
  input_names = c("LONGITUDE", "LATITUDE"),
  output_names = c("x", "y")
)
plot_usmap(regions = "states",fill = "light yellow",labels = TRUE) +
  geom_point(data =  prcp_transformed,
             aes(x = x, y = y,col=PRCP)) +
scale_color_gradient2(midpoint = mean(prcp_transformed$PRCP), low = "brown", mid = "white",
                            high = "blue", space = "Lab" )+
  labs(title = "b) Annual Precipitation (°C) normals for 1991–2020") +
  theme(legend.position = "right")+
  ggeasy::easy_center_title()  #Use the scale_color_gradient2 function

ggsave("../map-PRCP.png")

Annual PRCP normals Difference

We calculate the percentage of change in precipitation. The blue color means the area is more humid over time. While the brown color means the opposite direction. For the name of colors in R, check the link for an overview: https://r-graph-gallery.com/42-colors-names.html You can also find a cheatsheet for R color here: https://www.nceas.ucsb.edu/sites/default/files/2020-04/colorPaletteCheatsheet.pdf

prcp_dif<-usmap_transform(prcp_dif,
  input_names = c("LONGITUDE", "LATITUDE"),
  output_names = c("x", "y"))
plot_usmap(regions = "states",fill = "light yellow",labels = TRUE) +
  geom_point(data =  prcp_dif,
             aes(x = x, y = y,col=pct_change),size=2) +
scale_color_gradient2(midpoint = 0, low = "brown", mid = "white",
                            high = "deepskyblue2", space = "Lab" ,name='Precipitation Change (%)')+
  labs(title = "d) Percent difference between the new precipitation normals for 1991–2020 \n and comparable averages for the 1981–2010 period.") +
  theme(legend.position = "right")+
  ggeasy::easy_center_title()

ggsave("../map-PRCP-dif.png")

Change of TAVG in 4 states over time

After selecting 4 states for further analysis, we visualize the temperature during the 1922-2022 period. The dashed lines illustrate the annual average of temperature. While the solid lines show the 11-year running mean of annual average temperature. Lines are colored by state.

A moving average is commonly used with time series data to smooth out short-term fluctuations and highlight longer-term trends or cycles. Therefore, we also plot the 11-yr moving means in 4 states. To calculate the moving average, we use the rollmean() function from the zoo package (Installation required). Check the link for more information: https://www.storybench.org/how-to-calculate-a-rolling-average-in-r/

To overlay multiple geom_line in one graph, we need to set x-axis in the ggplot() and appoint different y-axes in two geom_line(). Check the link for more explanation: https://www.datanovia.com/en/blog/how-to-create-a-ggplot-with-multiple-lines/

noaa_4s<-noaa_4s[!is.na(noaa_4s$TAVG),]
tavg_4s=aggregate(x=noaa_4s$TAVG,
          by=list(noaa_4s$STATION,noaa_4s$year),
          FUN=mean)
tavg_4s<- rename(tavg_4s,Station="Group.1",year="Group.2",TAVG=x)
tavg_4s<-merge(tavg_4s,noaadig,by.x="Station",by.y="STATION",all=TRUE)
tavg_4s<-tavg_4s[!is.na(tavg_4s$TAVG),]
tavg_4s=aggregate(x=noaa_4s$TAVG,
          by=list(noaa_4s$STATE,noaa_4s$year),
          FUN=mean)
tavg_4s<- rename(tavg_4s,State="Group.1",year="Group.2",TAVG=x)
prcp_4s=aggregate(x=noaa_4s$PRCP,
          by=list(noaa_4s$STATION,noaa_4s$year),
          FUN=mean)
prcp_4s<- rename(prcp_4s,Station="Group.1",year="Group.2",PRCP=x)
prcp_4s<-merge(prcp_4s,noaadig,by.x="Station",by.y="STATION",all=TRUE)
prcp_4s<-prcp_4s[!is.na(prcp_4s$PRCP),]
prcp_4s=aggregate(x=noaa_4s$PRCP,
          by=list(noaa_4s$STATE,noaa_4s$year),
          FUN=mean)
prcp_4s<- rename(prcp_4s,State="Group.1",year="Group.2",PRCP=x)
tavg_4s <- tavg_4s %>%
    dplyr::arrange(year) %>% 
    dplyr::group_by(State) %>% 
    dplyr::mutate(tavg_11yr = zoo::rollmean(TAVG, k = 11, fill = NA))
noaa4s<-merge(tavg_4s,prcp_4s,by=c("State","year"),all=TRUE)
png(filename="../TAVG-4states.png", width=800, height=600)
ggplot(noaa4s,aes(x=year))+geom_line(aes(y=TAVG, col=State),linetype ="twodash",size=0.8)+
theme_classic()+
  geom_line(aes(y=tavg_11yr, col=State),size=0.8)+
  labs(title="Fig.2. Statewide Annual Average Temperature",x="Year",y="TAVG (°C)")+
  ggeasy::easy_center_title()
ggsave("../TAVG-4states.png")

To make the graph wider, we set the width and height of the graph in advance. A possible improvement in the future can be extra legend on the graph explaining that dashed lines show annual average while solid lines illustrate 11-yr running mean.

Linear Regression

Then, we select 4 states for further analysis. These states are CA, NJ, VA, and MO. Because the writers for visualization and linear regression have different style in naming datasets and variables, we respect each’s work, clearing the previous one’s environment. ## Clear the Environment and Read-in the Data

# close all, clear all
rm(list=ls())
graphics.off()

# set working directory
bigdata<-read.csv("noaa_bigdata.csv")
bigdigest<-read.csv("noaa_bigdigest.csv")

## count NA
output <- vector("double", ncol(bigdata))    
for (i in seq_along(bigdata)) {               
  output[[i]] <- sum(is.na(bigdata[[i]]))       
}
output 
##  [1]      0      0      0      0      0      0   7382   7382   7382  73125
## [11]  43149  20443  20443  20021  20021  20021 165493  20443   7382  20021
## [21]   6335  43119  21988  20021  20443 193021 194953 196468 196176

Data cleaning

library(tidyverse)
# seperate city, state, and country 
bigdata= separate(bigdata,col="NAME",into=c("City","State"),sep=",",remove=TRUE) %>%
   separate(col="State",into=c("blank","State","Country"),sep=" ",remove=TRUE) 
bigdata=  bigdata[,-7]

# 1. subset data to NV (West America), MO (Middle America), and NJ (East America)
subdata<- bigdata %>%
  filter(State=="CA" |State=="NJ" |State=="MO" |State=="VA") %>%
  filter(DATE >= "1922-01-01") ### restrict date to the recent 100 years

 ### convert date format ##
library(zoo)
subdata$DATE=as.Date(as.yearmon(subdata$DATE))

Regression Models

For regression models, we explore whether climate change happens in different states.

## Subset the data
subdataCA<- bigdata %>%
  filter(State=="CA") 
subdataNJ<- bigdata %>%
  filter(State=="NJ")
subdataVA<- bigdata %>%
  filter(State=="VA")
subdataMO<- bigdata %>%
  filter(State=="MO")

Our first explanation variable is the Number of days with >= 1 inch/25.4 millimeters in the month. Then we use the Highest daily total of precipitation in the month (EMXP), Total Monthly Precipitation (PRCP), Extreme maximum temperature for the month (in Degrees Fahrenheit) (EMXT), Extreme minimum temperature for the month (in Degrees Fahrenheit) (EMNT) as the dependent variable.

## We change the subdata here with each state's subset (subdataCA,subdataNJ,subdataVA, and subdataMO,) to get estimation in each state.
lm(DP1X ~ State, data = subdata) %>% summary()
## 
## Call:
## lm(formula = DP1X ~ State, data = subdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9971 -0.6832 -0.4753  0.3168 10.5247 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.47527    0.01846  25.748  < 2e-16 ***
## StateMO      0.46135    0.02293  20.122  < 2e-16 ***
## StateNJ      0.52182    0.02606  20.024  < 2e-16 ***
## StateVA      0.20795    0.02594   8.016 1.16e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.079 on 16652 degrees of freedom
##   (因为不存在,368个观察量被删除了)
## Multiple R-squared:  0.03289,    Adjusted R-squared:  0.03271 
## F-statistic: 188.7 on 3 and 16652 DF,  p-value: < 2.2e-16
table(subdata$DP1X, subdata$State)
##     
##        CA   MO   NJ   VA
##   0  2684 2814 1333 1867
##   1   357 1960 1190 1094
##   2   157  914  621  388
##   3    98  400  211  109
##   4    44  148   65   34
##   5    33   42   15   11
##   6    19   13    5    1
##   7    11    4    0    0
##   8     5    0    0    0
##   9     6    0    0    0
##   11    3    0    0    0
lm(EMXP ~ State, data = subdata) %>% summary()
## 
## Call:
## lm(formula = EMXP ~ State, data = subdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.210 -14.273  -5.102   8.098 200.127 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  17.2032     0.3668   46.90   <2e-16 ***
## StateMO      15.0696     0.4556   33.08   <2e-16 ***
## StateNJ      17.3063     0.5179   33.42   <2e-16 ***
## StateVA      10.7985     0.5155   20.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.44 on 16652 degrees of freedom
##   (因为不存在,368个观察量被删除了)
## Multiple R-squared:  0.07824,    Adjusted R-squared:  0.07807 
## F-statistic: 471.1 on 3 and 16652 DF,  p-value: < 2.2e-16
lm(PRCP ~ State, data = subdata) %>% summary()
## 
## Call:
## lm(formula = PRCP ~ State, data = subdata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96.14 -43.24 -14.52  25.59 707.35 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   51.953      1.064   48.82   <2e-16 ***
## StateMO       34.663      1.319   26.28   <2e-16 ***
## StateNJ       44.482      1.497   29.72   <2e-16 ***
## StateVA       27.932      1.495   18.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 62.22 on 16795 degrees of freedom
##   (因为不存在,225个观察量被删除了)
## Multiple R-squared:  0.05674,    Adjusted R-squared:  0.05657 
## F-statistic: 336.8 on 3 and 16795 DF,  p-value: < 2.2e-16
lm(EMXT ~ State, data = subdata) %>% summary()
## 
## Call:
## lm(formula = EMXT ~ State, data = subdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.540  -6.040   1.281   6.281  18.735 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  28.8404     0.1320 218.566  < 2e-16 ***
## StateMO      -0.8756     0.1746  -5.015 5.36e-07 ***
## StateNJ      -3.6489     0.1884 -19.364  < 2e-16 ***
## StateVA      -1.2216     0.1859  -6.570 5.20e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.75 on 14860 degrees of freedom
##   (因为不存在,2160个观察量被删除了)
## Multiple R-squared:  0.02715,    Adjusted R-squared:  0.02695 
## F-statistic: 138.2 on 3 and 14860 DF,  p-value: < 2.2e-16
lm(EMNT ~ State,data = subdata) %>% summary()
## 
## Call:
## lm(formula = EMNT ~ State, data = subdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.088  -7.242   0.212   7.728  22.758 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.8876     0.1669   5.318 1.06e-07 ***
## StateMO      -3.6456     0.2217 -16.441  < 2e-16 ***
## StateNJ      -2.5153     0.2374 -10.596  < 2e-16 ***
## StateVA      -3.5022     0.2358 -14.854  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.85 on 14934 degrees of freedom
##   (因为不存在,2086个观察量被删除了)
## Multiple R-squared:  0.02112,    Adjusted R-squared:  0.02092 
## F-statistic: 107.4 on 3 and 14934 DF,  p-value: < 2.2e-16