Formulate hypotheses for the data combinations suggested in the table. Indicate if you believe there is a direct causal relationship, indirect or none.
Answer: …
Variable 1 | Variable 2 | Hypothesis |
---|---|---|
Troposphere anomaly | Surface Temperature | B |
Cherry blossom date | Surface Temperature | A |
CO2 | Ice extent | B |
Methane | CO2 | C |
Cherry blossom date | Ice extent | C |
Surface temperature | CO2 | A |
require(readxl) # to read Excel files
## Loading required package: readxl
surfTemp <- read_xlsx("Correlations.xlsx", sheet = "Global_Surface", na="NA")
# exploring the dataset to confirm data loading went well
head(surfTemp)
## # A tibble: 6 × 2
## year temp
## <dbl> <dbl>
## 1 1880 13.4
## 2 1881 13.5
## 3 1882 13.4
## 4 1883 13.4
## 5 1884 13.3
## 6 1885 13.2
summary(surfTemp)
## year temp
## Min. :1880 Min. :13.06
## 1st Qu.:1915 1st Qu.:13.34
## Median :1950 Median :13.47
## Mean :1950 Mean :13.59
## 3rd Qu.:1985 3rd Qu.:13.77
## Max. :2020 Max. :14.56
# setting up data loading function to add all datasets
loadSheet <- function(file="Correlations.xlsx", sheet=NA){
df <- read_xlsx(file,
sheet=sheet,
na="NA")
print(head(df,2))
print(str(df))
return(df)
}
# loading all datasets into R
surfTemp <- loadSheet(sheet="Global_Surface")
## # A tibble: 2 × 2
## year temp
## <dbl> <dbl>
## 1 1880 13.4
## 2 1881 13.5
## tibble [141 × 2] (S3: tbl_df/tbl/data.frame)
## $ year: num [1:141] 1880 1881 1882 1883 1884 ...
## $ temp: num [1:141] 13.4 13.5 13.4 13.4 13.3 ...
## NULL
tropTemp <- loadSheet(sheet="Troposphere_Temp")
## # A tibble: 2 × 2
## year anomaly
## <dbl> <dbl>
## 1 1978 NA
## 2 1979 -0.140
## tibble [44 × 2] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:44] 1978 1979 1980 1981 1982 ...
## $ anomaly: num [1:44] NA -0.14017 0.01242 -0.00192 -0.20267 ...
## NULL
co2PPM <- loadSheet(sheet="CO2_PPM")
## # A tibble: 2 × 2
## year ppm
## <dbl> <dbl>
## 1 13 277.
## 2 30 278.
## tibble [250 × 2] (S3: tbl_df/tbl/data.frame)
## $ year: num [1:250] 13 30 56 104 136 168 202 228 274 302 ...
## $ ppm : num [1:250] 277 278 277 278 278 ...
## NULL
ch4PPB <- loadSheet(sheet="Methane_PPB")
## # A tibble: 2 × 2
## year ppb
## <dbl> <dbl>
## 1 14 648.
## 2 31 634.
## tibble [331 × 2] (S3: tbl_df/tbl/data.frame)
## $ year: num [1:331] 14 31 57 105 137 169 203 229 275 303 ...
## $ ppb : num [1:331] 648 634 628 632 639 ...
## NULL
cherKy <- loadSheet(sheet="Cherry_Kyoto")
## # A tibble: 2 × 2
## year doy
## <dbl> <dbl>
## 1 801 NA
## 2 802 NA
## tibble [1,215 × 2] (S3: tbl_df/tbl/data.frame)
## $ year: num [1:1215] 801 802 803 804 805 806 807 808 809 810 ...
## $ doy : num [1:1215] NA NA NA NA NA NA NA NA NA NA ...
## NULL
arcIce <- loadSheet(sheet="Arctic_Ice")
## # A tibble: 2 × 2
## year ice
## <dbl> <dbl>
## 1 1870 10.2
## 2 1871 10.0
## tibble [139 × 2] (S3: tbl_df/tbl/data.frame)
## $ year: num [1:139] 1870 1871 1872 1873 1874 ...
## $ ice : num [1:139] 10.2 10 10.2 10.7 10.1 ...
## NULL
# plotting to check for relationships (for hypotheses)
par(cex.main = 1.5, mar = c(1, 5.6, 1, .1) + 0.1, mgp = c(4, 1, 0), cex.lab = 1.4,
font.lab = 2, cex.axis = 1.5, bty = "n", las = 1, mfrow=c(6,1),xaxt="n")
# global surface temp
degreeSym <- expression(paste("Temperature [",degree,"C]")) # obtaining degree symbol for graph
plot(temp~year, data=surfTemp, xlab="year", ylab=degreeSym, type="l",
lwd=2,col="darkblue", xlim=c(801,2020))
# limiting the start date to the beginning of cherry blossom festival since levels of co2 (ppm) and ch4 (ppb) remained relatively constant before this period anyways
# troposphere temp
plot(anomaly~year,data=tropTemp,xlab="year",ylab="temperature anomaly", type="l",
lwd=2,col="darkgreen", xlim=c(801,2020))
text(1950,14,"Troposphere Temperature", cex=2)
# CO2 ppm
plot(ppm~year, data=co2PPM, xlab="year", ylab="CO2 (ppm)", type="l",
lwd=2, col="red", xlim=c(801,2020))
text(1950,14,"Atmospheric CO2", cex=2)
# CH4 ppb
plot(ppb~year, data=ch4PPB, xlab="year", ylab="CH4 (ppb)", type="l",
lwd=2, col="brown", xlim=c(801,2020))
text(1950,14,"Atmospheric CH4", cex=2)
# cherry festival date
plot(doy~year, data=cherKy, xlab="year", ylab="day of year", type="l",
lwd=2, col="pink", xlim=c(801,2020))
text(1950,14,"Cherry Blossom Bloom Date", cex=2)
# arctic ice extent
plot(ice~year, data=arcIce, xlab="year", ylab="ice extent",
type="l", lwd=2, col="cyan", xlim=c(801,2020))
text(1950,14,"Arctic Sea Ice Cover",cex=2)
# another variation: from 1840 (industrial revolution) onwards
table(is.na(cherKy$doy)) # lots of missing data!
##
## FALSE TRUE
## 827 388
par(cex.main = 1.5, mar = c(1, 5.6, 1, .1) + 0.1, mgp = c(4, 1, 0), cex.lab = 1.4,
font.lab = 2, cex.axis = 1.5, bty = "n", las = 1, mfrow=c(6,1),xaxt="n")
# global surface temp
degreeSym <- expression(paste("Temperature [",degree,"C]")) # obtaining degree symbol for graph
plot(temp~year, data=surfTemp, xlab="year", ylab=degreeSym, type="l",
lwd=2,col="darkblue", xlim=c(1840,2020))
# troposphere temp
plot(anomaly~year,data=tropTemp,xlab="year",ylab="temperature anomaly", type="l",
lwd=2,col="darkgreen", xlim=c(1840,2020))
text(1950,14,"Troposphere Temperature", cex=2)
# CO2 ppm
plot(ppm~year, data=co2PPM, xlab="year", ylab="CO2 (ppm)", type="l",
lwd=2, col="red", xlim=c(1840,2020))
text(1950,14,"Atmospheric CO2", cex=2)
# CH4 ppb
plot(ppb~year, data=ch4PPB, xlab="year", ylab="CH4 (ppb)", type="l",
lwd=2, col="brown", xlim=c(1840,2020))
text(1950,14,"Atmospheric CH4", cex=2)
# cherry festival date
plot(doy~year, data=cherKy, xlab="year", ylab="day of year", type="l",
lwd=2, col="pink", xlim=c(1840,2020))
text(1950,14,"Cherry Blossom Bloom Date", cex=2)
# arctic ice extent
plot(ice~year, data=arcIce, xlab="year", ylab="ice extent",
type="l", lwd=2, col="cyan", xlim=c(1840,2020))
text(1950,14,"Arctic Sea Ice Cover",cex=2)
Use the same approach as we did and evaluate for each pair in the table above if the assumptions are satisfied.
Variables have been recorded during different time frames, which is an important consideration before testing for any assumptions or running the correlation analysis.
Variable 1 | Variable 2 | Date range |
---|---|---|
Troposphere anomaly | Surface Temperature | 1978 - 2020 |
Cherry blossom date | Surface Temperature | 1880 - 2015 |
CO2 | Ice extent | 1870 - 2006 |
Methane | CO2 | 14 - 2004 |
Cherry blossom date | Ice extent | 1870 - 2008 |
Surface temperature | CO2 | 1880 - 2006 |
The common data range for all variables is? 1978 to 2004: this is the common subset.
Note: Always remember to check for duplicate observations!
require(car) # loading car package
## Loading required package: car
## Loading required package: carData
# first, figuring out the dataset (no one-size fits all)
# checking the 1,2,3: mean, spread, distribution
range(surfTemp$year) # range of the years in dataset
## [1] 1880 2020
range(tropTemp$year)
## [1] 1978 2021
range(co2PPM$year)
## [1] 13 2006
range(ch4PPB$year)
## [1] 14 2004
range(cherKy$year)
## [1] 801 2015
range(arcIce$year)
## [1] 1870 2008
# then, looking at the distributions
par(mfrow=c(6,2), mar=c(2,4,2,1))
# surface temperature
hist(surfTemp$temp, main="Surface Temperature")
qqPlot(surfTemp$temp, main="Surface Temperature")
## [1] 141 137
# troposphere temperature
hist(tropTemp$anomaly, main="Troposphere Temperature")
qqPlot(tropTemp$anomaly, main="Troposphere Temperature")
## [1] 43 39
# CO2 ppm
hist(co2PPM$ppm, main="CO2 (ppm)")
qqPlot(co2PPM$ppm, main="CO2 (ppm)")
## [1] 250 249
# CH4 ppb
hist(ch4PPB$ppb, main="Methane (ppb)")
qqPlot(ch4PPB$ppb, main="Methane (ppb)")
## [1] 324 329
# cherry festival date
hist(cherKy$doy, main="Cherry_Kyoto")
qqPlot(cherKy$doy, main="Cherry_Kyoto")
## [1] 523 609
# arctic ice extent
hist(arcIce$ice, main="Arctic_Ice")
qqPlot(arcIce$ice, main="Arctic_Ice")
## [1] 138 139
# subsetting for each pair to their common range and testing assumptions
any(duplicated(co2PPM$year)) # Returns TRUE if duplicates exist
## [1] TRUE
any(duplicated(ch4PPB$year))
## [1] TRUE
# error due to duplicates! solving by removing them
co2PPMnew <- tapply(co2PPM$ppm, co2PPM$year, mean) # calculates mean ppm for each year
# making new data frame
co2PPMnew <- data.frame(co2PPMnew, names(co2PPMnew))
# renaming columns
colnames(co2PPMnew) <- c("ppm", "year")
# now for ch4 ppb
ch4PPBnew <- tapply(ch4PPB$ppb, ch4PPB$year, mean)
ch4PPBnew <- data.frame(ch4PPBnew, names(ch4PPBnew))
colnames(ch4PPBnew) <- c("ppb", "year")
For correlation, we need to check a few assumptions:
- Linearity: do the points follow a straight line?
- Homoscedasticity: is the spread constant across values?
- Normality: are both variables normally distributed? (checked with
QQ-plots).
We’ll check all six pairs by merging them on their common years. Merging
ensures we are actually comparing the same time points – this avoids
mismatched vector errors and maximizes statistical power.
If you create fulldf by merging all six variables, the data frame only contains years where ALL six have data. Even if you select just two columns later, you’re still limited to those restricted years. Therefore, you should merge the data pairwise to improve reliability of results.
# merging all pairwise datasets to their common year ranges
par(mfrow=c(6,3), mar=c(2,4,2,1))
# tropTemp x surfaceTemp
surfTrop <- merge(tropTemp, surfTemp, by="year")
colnames(surfTrop) <- c("year", "anomaly", "temp")
plot(surfTrop$anomaly, surfTrop$temp, main="Trop. anomaly vs Surface temp.",
xlab=expression("anomaly ("*degree*C*")"), ylab=expression("surface temp. ("*degree*C*")"))
qqPlot(surfTrop$anomaly, main="QQ: Trop. anomaly")
## [1] 43 39
qqPlot(surfTrop$temp, main="QQ: Surface temp.")
## [1] 43 39
# cherKy x surfaceTemp
cherSur <- merge(cherKy, surfTemp, by="year")
colnames(cherSur) <- c("year", "doy", "temp")
plot(cherSur$doy, cherSur$temp, main="Cherry bloom vs Surface temp.",
xlab="day of year", ylab=expression("surface temp. ("*degree*C*")"))
qqPlot(cherSur$doy, main="QQ: Cherry bloom")
## [1] 13 111
qqPlot(cherSur$temp, main="QQ: Surface temp.")
## [1] 136 135
# co2ppm x arcIce
co2Ice <- merge(co2PPMnew, arcIce, by="year")
colnames(co2Ice) <- c("year", "ppm", "ice")
plot(co2Ice$ppm, co2Ice$ice, main="CO2 (ppm) vs Arc. ice cov.",
xlab="co2 (ppm)", ylab="ice extent")
qqPlot(co2Ice$ppm, main="QQ: CO2 (ppm)")
## [1] 96 95
qqPlot(co2Ice$ice, main="QQ: Arc. ice cov.")
## [1] 95 85
# ch4ppb x co2ppm
ch4co2 <- merge(ch4PPBnew, co2PPMnew, by="year")
colnames(ch4co2) <- c("year", "ppb", "ppm")
plot(ch4co2$ppb, ch4co2$ppm, main="CO2 (ppm) vs Arc. ice cov.",
xlab="ch4 (ppb)", ylab="co2 (ppm)")
qqPlot(ch4co2$ppb, main="QQ: CH4 (ppb)")
## [1] 1 2
qqPlot(ch4co2$ppm, main="QQ: CO2 (ppm)")
## [1] 77 76
# cherKy x arcIce
cherIce <- merge(cherKy, arcIce, by="year")
colnames(cherIce) <- c("year", "doy", "ice")
plot(cherIce$doy, cherIce$ice, main="Cherry bloom vs Arc. ice cov.",
xlab="day of year", ylab="ice extent")
qqPlot(cherIce$doy, main="QQ: Cherry bloom")
## [1] 121 23
qqPlot(cherIce$ice, main="QQ: Arc. ice cov.")
## [1] 138 139
# surfaceTemp x co2ppm
surCo2 <- merge(surfTemp, co2PPMnew, by="year")
colnames(surCo2) <- c("year", "temp", "ppm")
plot(surCo2$temp, surCo2$ppm, main="Surface temp. vs CO2 (ppm)",
xlab=expression("surface temp. ("*degree*C*")"), ylab="co2 (ppm)")
qqPlot(surCo2$temp, main="QQ: Surface temp.")
## [1] 91 92
qqPlot(surCo2$ppm, main="QQ: CO2 (ppm)")
## [1] 92 91
Answer: QQ plot analysis reveals that most variables deviate from normality, with the exception of the troposphere anomaly and surface temperature pair, which show adherence to the normal distribution. Due to these deviations, Spearman’s rank correlation is the appropriate choice for most pairwise comparisons, as it does not require normality assumptions. Only the troposphere-surface temperature pair meets the normality requirement for Pearson’s correlation.
To test the hypothesis you developed based on table 2.2 how should you set your α level? Hint: apply the Bonferroni correction.
# setting the corrected alpha level
alphaLev <- 0.05/6 # due to N = 6 tests
Answer: Setting the Bonferroni corrected \(\alpha\) level at 0.05 divided by 6 (which is 0.0083333) to ensure the “family-wise erros rate remains at 0.05. If the P-value falls below this alpha level, the test can be judged significantly.
Which of the unique pairs in table 2.2 showed a significant relationship after Bonferroni correction - and apply the correct test (Pearson or Spearman)?
# running pairwise correlations (Pearson and Spearman)
# tropTemp x surfaceTemp
cor.test(surfTrop$temp, surfTrop$anomaly) # default is Pearson's!
##
## Pearson's product-moment correlation
##
## data: surfTrop$temp and surfTrop$anomaly
## t = 29.807, df = 40, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9595877 0.9883150
## sample estimates:
## cor
## 0.9782219
# cherKy x surfaceTemp
cor.test(cherSur$doy, cherSur$temp, method="spearman") # now using Spearman's!
## Warning in cor.test.default(cherSur$doy, cherSur$temp, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: cherSur$doy and cherSur$temp
## S = 597856, p-value = 3.012e-12
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.5597366
# co2ppm x arcIce
cor.test(co2Ice$ppm, co2Ice$ice, method="spearman") # now using Spearman's!
##
## Spearman's rank correlation rho
##
## data: co2Ice$ppm and co2Ice$ice
## S = 252148, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.7101736
# ch4ppb x co2ppm
cor.test(ch4co2$ppb, ch4co2$ppm, method="spearman") # now using Spearman's!
##
## Spearman's rank correlation rho
##
## data: ch4co2$ppb and ch4co2$ppm
## S = 252, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9966875
# cherKy x arcIce
cor.test(cherIce$doy, cherIce$ice, method="spearman") # now using Spearman's!
## Warning in cor.test.default(cherIce$doy, cherIce$ice, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: cherIce$doy and cherIce$ice
## S = 230382, p-value = 2.979e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.425473
# surfaceTemp x co2ppm
cor.test(surCo2$temp, surCo2$ppm, method="spearman") # now using Spearman's!
## Warning in cor.test.default(surCo2$temp, surCo2$ppm, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: surCo2$temp and surCo2$ppm
## S = 14739, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8864169
# bringing all datasets together to inspect correlations in one overview
require(corrplot) # loading correlation package
## Loading required package: corrplot
## corrplot 0.95 loaded
fulldf<-merge(surfTemp, tropTemp, by="year", all=TRUE)
fulldf<-merge(fulldf, co2PPMnew, by="year", all=TRUE)
fulldf<-merge(fulldf, ch4PPBnew, by="year", all=TRUE)
fulldf<-merge(fulldf, cherKy, by="year", all=TRUE)
fulldf<-merge(fulldf, arcIce, by="year", all=TRUE)
colnames(fulldf)<-c("year", "temp", "anomaly", "ppm", "ppb", "doy", "ice")
corrMatrix <- cor(fulldf[,-1], use="pairwise.complete.obs", method="spearman")
# creating a correlation matrix
cols <- c('#ffffd9','#edf8b1','#c7e9b4','#7fcdbb','#41b6c4','#1d91c0','#225ea8','#253494','#081d58')
corCols <- colorRampPalette(cols)
corrplot(corrMatrix, col=corCols(100), method="circle") # applying color palette to correlation
Answer: Correlation analysis reveals strong evidence of interconnected climate change indicators. Temperature variables (surface and troposphere) show very strong positive correlations with atmospheric greenhouse gases (CO2 and CH4), with r > 0.89 for all pairings. Arctic ice extent demonstrates strong negative correlations with both surface temperature and CO2 (r ≈ -0.67 to -0.71), consistent with warming-driven ice loss. Cherry blossom timing shows moderate negative correlations with surface temperature (r ≈ -0.56), indicating earlier blooming with warming, and moderate positive correlation with ice extent (r = 0.43). The near-perfect correlation between CO2 and CH4 (r = 0.997) suggests common anthropogenic forcing. All correlations remain statistically significant after Bonferroni correction (α = 0.0083), supporting the hypothesis that these climate variables are systematically linked through anthropogenic climate change processes.