EPH-505 CodeBuild#3 Evidence of Climate Change
Set working directory
Read datasets
noaa <- read.csv("C:/Users/Chenyi Huang/OneDrive - Yale University/22Fall/EPH 505/Code Build/CodeBuild_3/noaa_bigdata.csv")
noaadig <- read.csv("C:/Users/Chenyi Huang/OneDrive - Yale University/22Fall/EPH 505/Code Build/CodeBuild_3/noaa_bigdigest.csv")Data Cleaning
Set as date
library(tidyverse)
library(lubridate)
noaa$year=str_sub(noaa$DATE, 1, 4) #Get the year of each line
noaa$year<-noaa$year %>% as.numeric()
noaa<-merge(noaa,noaadig,by.x="STATION",by.y="STATION",all=TRUE) #Get the State and location information for each stationSubset the dataset
noaa_4s=noaa %>% filter(STATE=="CA" |STATE=="NJ" |STATE=="MO" |STATE=="VA") #Subset only 4 states for regression analysis
noaa_4s <- noaa_4s %>% filter(year >= "1922") #Subset only the recent 100 years
noaa_p1 <- noaa %>% filter(year >= "1981") #Subset only the 1981-2010 period as the reference level for the overview of normals
noaa_p1 <- noaa_p1 %>% filter(year <= "2010")
noaa_p2 <- noaa %>% filter(year >= "1991") #Subset only the recent 30 period for the overview of normals
noaa_p2 <- noaa_p2 %>% filter(year <= "2020")Get the dataset of TAVG for visualization
noaa_tavg<-noaa_p2[!is.na(noaa_p2$TAVG),]
noaa_tavg_p1<-noaa_p1[!is.na(noaa_p1$TAVG),] #Delete NA values
noaa_tavg=aggregate(x=noaa_tavg$TAVG,
by=list(noaa_tavg$STATION),
FUN=mean) #Get the mean of TAVG during the 1991-2020 period
noaa_tavg_p1=aggregate(x=noaa_tavg_p1$TAVG,
by=list(noaa_tavg_p1$STATION),
FUN=mean) #Get the mean of TAVG during the 1981-2010 period
noaa_tavg <- rename(noaa_tavg,Station="Group.1",TAVG=x)
noaa_tavg_p1<- rename(noaa_tavg_p1,Station="Group.1",TAVG_p1=x)
noaa_tavg<-merge(noaa_tavg,noaadig,by.x="Station",by.y="STATION",all=TRUE) #Merge to get the State and location information for each station
noaa_tavg <- noaa_tavg %>% na.omit()
noaa_tavg_p1 <- noaa_tavg_p1 %>% na.omit()
tavg_dif<-merge(noaa_tavg,noaa_tavg_p1,by.x="Station",by.y="Station",all=TRUE)
tavg_dif<-tavg_dif %>% group_by(Station) %>% mutate(change = TAVG-TAVG_p1) #Add a variable measuring the difference between two 30-yr periods. This would be a comparison on an “apples to apples” basis
tavg_dif<-tavg_dif[!is.na(tavg_dif$TAVG),] Get the dataset of Precipitation for visualization
noaa_prcp<-noaa_p2[!is.na(noaa_p2$PRCP),]
noaa_prcp_p1<-noaa_p1[!is.na(noaa_p1$PRCP),]
noaa_prcp=aggregate(x=noaa_prcp$PRCP,
by=list(noaa_prcp$STATION),
FUN=mean)
noaa_prcp_p1=aggregate(x=noaa_prcp_p1$PRCP,
by=list(noaa_prcp_p1$STATION),
FUN=mean)
noaa_prcp <- rename(noaa_prcp,Station="Group.1",PRCP=x)
noaa_prcp_p1<- rename(noaa_prcp_p1,Station="Group.1",PRCP_p1=x)
noaa_prcp<-merge(noaa_prcp,noaadig,by.x="Station",by.y="STATION",all=TRUE)
noaa_prcp <- noaa_prcp %>% na.omit()
noaa_prcp_p1 <- noaa_prcp_p1 %>% na.omit()
prcp_dif<-merge(noaa_prcp,noaa_prcp_p1,by.x="Station",by.y="Station",all=TRUE)
prcp_dif<-prcp_dif %>% group_by(Station) %>% mutate(pct_change = (PRCP-PRCP_p1)/PRCP_p1*100) #Add a variable measuring the percentage change between two 30-yr periods.
prcp_dif<-prcp_dif[!is.na(prcp_dif$PRCP),]
prcp_dif<-prcp_dif[!is.na(prcp_dif$pct_change),]Visualization
We use the ggplot2 and usmap packages to plot 2 climate normals (TAVG and PRCP) during the 1991-2020 period on the US map. Check these 2 links for more about the plot_usmap function: https://cran.r-project.org/web/packages/usmap/vignettes/mapping.html https://cran.r-project.org/web/packages/usmap/vignettes/advanced-mapping.html
For the plot_usmap function, it is necessary to coordinate date frame to usmap projection. Check the link for more information. Therefore, we need to use the ‘usmap_transform’ function. Check the link for more information: https://rdrr.io/cran/usmap/man/usmap_transform.html
Annual TACG normals
library(usmap)
library(ggplot2)
tavg_transformed<-usmap_transform(noaa_tavg,
input_names = c("LONGITUDE", "LATITUDE"),
output_names = c("x", "y")
) #Convert coordinate date frame to usmap projection. To customize the color palette for our graphs, we use the wesanderson packackge. Installation required. Check the link for more information: https://www.datanovia.com/en/blog/top-r-color-palettes-to-know-for-great-data-visualization/
library(wesanderson)
pal <- wes_palette("Zissou1", 100, type = "continuous")
plot_usmap(regions = "states",fill = "light yellow",labels = TRUE) +
geom_point(data = tavg_transformed,
aes(x = x, y = y,col=TAVG)) +
scale_colour_gradientn(colours =pal)+
labs(title = "a) Annual Tavg (°C) normals for 1991–2020") +
theme(legend.position = "right")+
ggeasy::easy_center_title() ggsave("../map-TAVG.png")ggeasy::easy_center_title() is a great function to center the ggplot title. But package ‘ggeasy’ need to be installed.
Annual TACG normals Difference
To illustrate the potential change over time of the avereage temperature, we use the TAVG normals during the 1981-2010 period to make a comparison on an “apple-to-apple” basis. The red color means the area is hotter over time. While the blue color means the opposite direction.
tavg_dif<-usmap_transform(tavg_dif,
input_names = c("LONGITUDE", "LATITUDE"),
output_names = c("x", "y"))
plot_usmap(regions = "states",fill = "light yellow",labels = TRUE) +
geom_point(data = tavg_dif,
aes(x = x, y = y,col=change),size=2) +
scale_color_gradient2(midpoint = 0, low = "blue", mid = "white",
high = "red", space = "Lab" ,name='TAVG Change (°C)')+
labs(title = "c) Difference between the new Tavg normals for the 1991–2020 period \n and comparable averages for the 1981–2010 period.") +
theme(legend.position = "right")+
ggeasy::easy_center_title() ggsave("../map-TAVG-dif.png")You can customize the color gradient for continuous variable using the scale_color_gradient2 function, including the specific value and color for midpoint.
Annual PRCP normals
prcp_transformed<-usmap_transform(noaa_prcp,
input_names = c("LONGITUDE", "LATITUDE"),
output_names = c("x", "y")
)
plot_usmap(regions = "states",fill = "light yellow",labels = TRUE) +
geom_point(data = prcp_transformed,
aes(x = x, y = y,col=PRCP)) +
scale_color_gradient2(midpoint = mean(prcp_transformed$PRCP), low = "brown", mid = "white",
high = "blue", space = "Lab" )+
labs(title = "b) Annual Precipitation (°C) normals for 1991–2020") +
theme(legend.position = "right")+
ggeasy::easy_center_title() #Use the scale_color_gradient2 functionggsave("../map-PRCP.png")Annual PRCP normals Difference
We calculate the percentage of change in precipitation. The blue color means the area is more humid over time. While the brown color means the opposite direction. For the name of colors in R, check the link for an overview: https://r-graph-gallery.com/42-colors-names.html You can also find a cheatsheet for R color here: https://www.nceas.ucsb.edu/sites/default/files/2020-04/colorPaletteCheatsheet.pdf
prcp_dif<-usmap_transform(prcp_dif,
input_names = c("LONGITUDE", "LATITUDE"),
output_names = c("x", "y"))
plot_usmap(regions = "states",fill = "light yellow",labels = TRUE) +
geom_point(data = prcp_dif,
aes(x = x, y = y,col=pct_change),size=2) +
scale_color_gradient2(midpoint = 0, low = "brown", mid = "white",
high = "deepskyblue2", space = "Lab" ,name='Precipitation Change (%)')+
labs(title = "d) Percent difference between the new precipitation normals for 1991–2020 \n and comparable averages for the 1981–2010 period.") +
theme(legend.position = "right")+
ggeasy::easy_center_title()ggsave("../map-PRCP-dif.png")Change of TAVG in 4 states over time
After selecting 4 states for further analysis, we visualize the temperature during the 1922-2022 period. The dashed lines illustrate the annual average of temperature. While the solid lines show the 11-year running mean of annual average temperature. Lines are colored by state.
A moving average is commonly used with time series data to smooth out short-term fluctuations and highlight longer-term trends or cycles. Therefore, we also plot the 11-yr moving means in 4 states. To calculate the moving average, we use the rollmean() function from the zoo package (Installation required). Check the link for more information: https://www.storybench.org/how-to-calculate-a-rolling-average-in-r/
To overlay multiple geom_line in one graph, we need to set x-axis in the ggplot() and appoint different y-axes in two geom_line(). Check the link for more explanation: https://www.datanovia.com/en/blog/how-to-create-a-ggplot-with-multiple-lines/
noaa_4s<-noaa_4s[!is.na(noaa_4s$TAVG),]
tavg_4s=aggregate(x=noaa_4s$TAVG,
by=list(noaa_4s$STATION,noaa_4s$year),
FUN=mean)
tavg_4s<- rename(tavg_4s,Station="Group.1",year="Group.2",TAVG=x)
tavg_4s<-merge(tavg_4s,noaadig,by.x="Station",by.y="STATION",all=TRUE)
tavg_4s<-tavg_4s[!is.na(tavg_4s$TAVG),]
tavg_4s=aggregate(x=noaa_4s$TAVG,
by=list(noaa_4s$STATE,noaa_4s$year),
FUN=mean)
tavg_4s<- rename(tavg_4s,State="Group.1",year="Group.2",TAVG=x)
prcp_4s=aggregate(x=noaa_4s$PRCP,
by=list(noaa_4s$STATION,noaa_4s$year),
FUN=mean)
prcp_4s<- rename(prcp_4s,Station="Group.1",year="Group.2",PRCP=x)
prcp_4s<-merge(prcp_4s,noaadig,by.x="Station",by.y="STATION",all=TRUE)
prcp_4s<-prcp_4s[!is.na(prcp_4s$PRCP),]
prcp_4s=aggregate(x=noaa_4s$PRCP,
by=list(noaa_4s$STATE,noaa_4s$year),
FUN=mean)
prcp_4s<- rename(prcp_4s,State="Group.1",year="Group.2",PRCP=x)
tavg_4s <- tavg_4s %>%
dplyr::arrange(year) %>%
dplyr::group_by(State) %>%
dplyr::mutate(tavg_11yr = zoo::rollmean(TAVG, k = 11, fill = NA))
noaa4s<-merge(tavg_4s,prcp_4s,by=c("State","year"),all=TRUE)
png(filename="../TAVG-4states.png", width=800, height=600)
ggplot(noaa4s,aes(x=year))+geom_line(aes(y=TAVG, col=State),linetype ="twodash",size=0.8)+
theme_classic()+
geom_line(aes(y=tavg_11yr, col=State),size=0.8)+
labs(title="Fig.2. Statewide Annual Average Temperature",x="Year",y="TAVG (°C)")+
ggeasy::easy_center_title()
ggsave("../TAVG-4states.png")To make the graph wider, we set the width and height of the graph in advance. A possible improvement in the future can be extra legend on the graph explaining that dashed lines show annual average while solid lines illustrate 11-yr running mean.
Linear Regression
Then, we select 4 states for further analysis. These states are CA, NJ, VA, and MO. Because the writers for visualization and linear regression have different style in naming datasets and variables, we respect each’s work, clearing the previous one’s environment. ## Clear the Environment and Read-in the Data
# close all, clear all
rm(list=ls())
graphics.off()
# set working directory
bigdata<-read.csv("noaa_bigdata.csv")
bigdigest<-read.csv("noaa_bigdigest.csv")
## count NA
output <- vector("double", ncol(bigdata))
for (i in seq_along(bigdata)) {
output[[i]] <- sum(is.na(bigdata[[i]]))
}
output ## [1] 0 0 0 0 0 0 7382 7382 7382 73125
## [11] 43149 20443 20443 20021 20021 20021 165493 20443 7382 20021
## [21] 6335 43119 21988 20021 20443 193021 194953 196468 196176
Data cleaning
library(tidyverse)
# seperate city, state, and country
bigdata= separate(bigdata,col="NAME",into=c("City","State"),sep=",",remove=TRUE) %>%
separate(col="State",into=c("blank","State","Country"),sep=" ",remove=TRUE)
bigdata= bigdata[,-7]
# 1. subset data to NV (West America), MO (Middle America), and NJ (East America)
subdata<- bigdata %>%
filter(State=="CA" |State=="NJ" |State=="MO" |State=="VA") %>%
filter(DATE >= "1922-01-01") ### restrict date to the recent 100 years
### convert date format ##
library(zoo)
subdata$DATE=as.Date(as.yearmon(subdata$DATE))Regression Models
For regression models, we explore whether climate change happens in different states.
## Subset the data
subdataCA<- bigdata %>%
filter(State=="CA")
subdataNJ<- bigdata %>%
filter(State=="NJ")
subdataVA<- bigdata %>%
filter(State=="VA")
subdataMO<- bigdata %>%
filter(State=="MO")Our first explanation variable is the Number of days with >= 1 inch/25.4 millimeters in the month. Then we use the Highest daily total of precipitation in the month (EMXP), Total Monthly Precipitation (PRCP), Extreme maximum temperature for the month (in Degrees Fahrenheit) (EMXT), Extreme minimum temperature for the month (in Degrees Fahrenheit) (EMNT) as the dependent variable.
## We change the subdata here with each state's subset (subdataCA,subdataNJ,subdataVA, and subdataMO,) to get estimation in each state.
lm(DP1X ~ State, data = subdata) %>% summary()##
## Call:
## lm(formula = DP1X ~ State, data = subdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9971 -0.6832 -0.4753 0.3168 10.5247
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.47527 0.01846 25.748 < 2e-16 ***
## StateMO 0.46135 0.02293 20.122 < 2e-16 ***
## StateNJ 0.52182 0.02606 20.024 < 2e-16 ***
## StateVA 0.20795 0.02594 8.016 1.16e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.079 on 16652 degrees of freedom
## (因为不存在,368个观察量被删除了)
## Multiple R-squared: 0.03289, Adjusted R-squared: 0.03271
## F-statistic: 188.7 on 3 and 16652 DF, p-value: < 2.2e-16
table(subdata$DP1X, subdata$State)##
## CA MO NJ VA
## 0 2684 2814 1333 1867
## 1 357 1960 1190 1094
## 2 157 914 621 388
## 3 98 400 211 109
## 4 44 148 65 34
## 5 33 42 15 11
## 6 19 13 5 1
## 7 11 4 0 0
## 8 5 0 0 0
## 9 6 0 0 0
## 11 3 0 0 0
lm(EMXP ~ State, data = subdata) %>% summary()##
## Call:
## lm(formula = EMXP ~ State, data = subdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.210 -14.273 -5.102 8.098 200.127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.2032 0.3668 46.90 <2e-16 ***
## StateMO 15.0696 0.4556 33.08 <2e-16 ***
## StateNJ 17.3063 0.5179 33.42 <2e-16 ***
## StateVA 10.7985 0.5155 20.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.44 on 16652 degrees of freedom
## (因为不存在,368个观察量被删除了)
## Multiple R-squared: 0.07824, Adjusted R-squared: 0.07807
## F-statistic: 471.1 on 3 and 16652 DF, p-value: < 2.2e-16
lm(PRCP ~ State, data = subdata) %>% summary()##
## Call:
## lm(formula = PRCP ~ State, data = subdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96.14 -43.24 -14.52 25.59 707.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.953 1.064 48.82 <2e-16 ***
## StateMO 34.663 1.319 26.28 <2e-16 ***
## StateNJ 44.482 1.497 29.72 <2e-16 ***
## StateVA 27.932 1.495 18.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62.22 on 16795 degrees of freedom
## (因为不存在,225个观察量被删除了)
## Multiple R-squared: 0.05674, Adjusted R-squared: 0.05657
## F-statistic: 336.8 on 3 and 16795 DF, p-value: < 2.2e-16
lm(EMXT ~ State, data = subdata) %>% summary()##
## Call:
## lm(formula = EMXT ~ State, data = subdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.540 -6.040 1.281 6.281 18.735
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.8404 0.1320 218.566 < 2e-16 ***
## StateMO -0.8756 0.1746 -5.015 5.36e-07 ***
## StateNJ -3.6489 0.1884 -19.364 < 2e-16 ***
## StateVA -1.2216 0.1859 -6.570 5.20e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.75 on 14860 degrees of freedom
## (因为不存在,2160个观察量被删除了)
## Multiple R-squared: 0.02715, Adjusted R-squared: 0.02695
## F-statistic: 138.2 on 3 and 14860 DF, p-value: < 2.2e-16
lm(EMNT ~ State,data = subdata) %>% summary()##
## Call:
## lm(formula = EMNT ~ State, data = subdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.088 -7.242 0.212 7.728 22.758
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.8876 0.1669 5.318 1.06e-07 ***
## StateMO -3.6456 0.2217 -16.441 < 2e-16 ***
## StateNJ -2.5153 0.2374 -10.596 < 2e-16 ***
## StateVA -3.5022 0.2358 -14.854 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.85 on 14934 degrees of freedom
## (因为不存在,2086个观察量被删除了)
## Multiple R-squared: 0.02112, Adjusted R-squared: 0.02092
## F-statistic: 107.4 on 3 and 14934 DF, p-value: < 2.2e-16