Correlation Exercises

Exercise 1

Formulate hypotheses for the data combinations suggested in the table. Indicate if you believe there is a direct causal relationship, indirect or none.

Answer: …

Variable 1	Variable 2	Hypothesis
Troposphere anomaly	Surface Temperature	B
Cherry blossom date	Surface Temperature	A
CO2	Ice extent	B
Methane	CO2	C
Cherry blossom date	Ice extent	C
Surface temperature	CO2	A

require(readxl) # to read Excel files

## Loading required package: readxl

surfTemp <- read_xlsx("Correlations.xlsx", sheet = "Global_Surface", na="NA")

# exploring the dataset to confirm data loading went well
head(surfTemp)

## # A tibble: 6 × 2
##    year  temp
##   <dbl> <dbl>
## 1  1880  13.4
## 2  1881  13.5
## 3  1882  13.4
## 4  1883  13.4
## 5  1884  13.3
## 6  1885  13.2

summary(surfTemp)

##       year           temp      
##  Min.   :1880   Min.   :13.06  
##  1st Qu.:1915   1st Qu.:13.34  
##  Median :1950   Median :13.47  
##  Mean   :1950   Mean   :13.59  
##  3rd Qu.:1985   3rd Qu.:13.77  
##  Max.   :2020   Max.   :14.56

# setting up data loading function to add all datasets
loadSheet <- function(file="Correlations.xlsx", sheet=NA){
  df <- read_xlsx(file,
                  sheet=sheet,
                  na="NA")
  print(head(df,2))
  print(str(df))
  return(df)
}

# loading all datasets into R
surfTemp <- loadSheet(sheet="Global_Surface")

## # A tibble: 2 × 2
##    year  temp
##   <dbl> <dbl>
## 1  1880  13.4
## 2  1881  13.5
## tibble [141 × 2] (S3: tbl_df/tbl/data.frame)
##  $ year: num [1:141] 1880 1881 1882 1883 1884 ...
##  $ temp: num [1:141] 13.4 13.5 13.4 13.4 13.3 ...
## NULL

tropTemp <- loadSheet(sheet="Troposphere_Temp")

## # A tibble: 2 × 2
##    year anomaly
##   <dbl>   <dbl>
## 1  1978  NA    
## 2  1979  -0.140
## tibble [44 × 2] (S3: tbl_df/tbl/data.frame)
##  $ year   : num [1:44] 1978 1979 1980 1981 1982 ...
##  $ anomaly: num [1:44] NA -0.14017 0.01242 -0.00192 -0.20267 ...
## NULL

co2PPM <- loadSheet(sheet="CO2_PPM")

## # A tibble: 2 × 2
##    year   ppm
##   <dbl> <dbl>
## 1    13  277.
## 2    30  278.
## tibble [250 × 2] (S3: tbl_df/tbl/data.frame)
##  $ year: num [1:250] 13 30 56 104 136 168 202 228 274 302 ...
##  $ ppm : num [1:250] 277 278 277 278 278 ...
## NULL

ch4PPB <- loadSheet(sheet="Methane_PPB")

## # A tibble: 2 × 2
##    year   ppb
##   <dbl> <dbl>
## 1    14  648.
## 2    31  634.
## tibble [331 × 2] (S3: tbl_df/tbl/data.frame)
##  $ year: num [1:331] 14 31 57 105 137 169 203 229 275 303 ...
##  $ ppb : num [1:331] 648 634 628 632 639 ...
## NULL

cherKy <- loadSheet(sheet="Cherry_Kyoto")

## # A tibble: 2 × 2
##    year   doy
##   <dbl> <dbl>
## 1   801    NA
## 2   802    NA
## tibble [1,215 × 2] (S3: tbl_df/tbl/data.frame)
##  $ year: num [1:1215] 801 802 803 804 805 806 807 808 809 810 ...
##  $ doy : num [1:1215] NA NA NA NA NA NA NA NA NA NA ...
## NULL

arcIce <- loadSheet(sheet="Arctic_Ice")

## # A tibble: 2 × 2
##    year   ice
##   <dbl> <dbl>
## 1  1870  10.2
## 2  1871  10.0
## tibble [139 × 2] (S3: tbl_df/tbl/data.frame)
##  $ year: num [1:139] 1870 1871 1872 1873 1874 ...
##  $ ice : num [1:139] 10.2 10 10.2 10.7 10.1 ...
## NULL

# plotting to check for relationships (for hypotheses)
par(cex.main = 1.5, mar = c(1, 5.6, 1, .1) + 0.1, mgp = c(4, 1, 0), cex.lab = 1.4, 
    font.lab = 2, cex.axis = 1.5, bty = "n", las = 1, mfrow=c(6,1),xaxt="n")

# global surface temp
degreeSym <- expression(paste("Temperature [",degree,"C]")) # obtaining degree symbol for graph
plot(temp~year, data=surfTemp, xlab="year", ylab=degreeSym, type="l", 
     lwd=2,col="darkblue", xlim=c(801,2020)) 

# limiting the start date to the beginning of cherry blossom festival since levels of co2 (ppm) and ch4 (ppb) remained relatively constant before this period anyways

# troposphere temp
plot(anomaly~year,data=tropTemp,xlab="year",ylab="temperature anomaly", type="l", 
     lwd=2,col="darkgreen", xlim=c(801,2020))
text(1950,14,"Troposphere Temperature", cex=2)
# CO2 ppm
plot(ppm~year, data=co2PPM, xlab="year", ylab="CO2 (ppm)", type="l", 
     lwd=2, col="red", xlim=c(801,2020))
text(1950,14,"Atmospheric CO2", cex=2)
# CH4 ppb
plot(ppb~year, data=ch4PPB, xlab="year", ylab="CH4 (ppb)", type="l", 
     lwd=2, col="brown", xlim=c(801,2020))
text(1950,14,"Atmospheric CH4", cex=2)
# cherry festival date
plot(doy~year, data=cherKy, xlab="year", ylab="day of year", type="l", 
     lwd=2, col="pink", xlim=c(801,2020))
text(1950,14,"Cherry Blossom Bloom Date", cex=2)
# arctic ice extent
plot(ice~year, data=arcIce, xlab="year", ylab="ice extent", 
     type="l", lwd=2, col="cyan", xlim=c(801,2020))
text(1950,14,"Arctic Sea Ice Cover",cex=2)

# another variation: from 1840 (industrial revolution) onwards
table(is.na(cherKy$doy)) # lots of missing data!

## 
## FALSE  TRUE 
##   827   388

par(cex.main = 1.5, mar = c(1, 5.6, 1, .1) + 0.1, mgp = c(4, 1, 0), cex.lab = 1.4, 
    font.lab = 2, cex.axis = 1.5, bty = "n", las = 1, mfrow=c(6,1),xaxt="n")
# global surface temp
degreeSym <- expression(paste("Temperature [",degree,"C]")) # obtaining degree symbol for graph
plot(temp~year, data=surfTemp, xlab="year", ylab=degreeSym, type="l", 
     lwd=2,col="darkblue", xlim=c(1840,2020)) 
# troposphere temp
plot(anomaly~year,data=tropTemp,xlab="year",ylab="temperature anomaly", type="l", 
     lwd=2,col="darkgreen", xlim=c(1840,2020))
text(1950,14,"Troposphere Temperature", cex=2)
# CO2 ppm
plot(ppm~year, data=co2PPM, xlab="year", ylab="CO2 (ppm)", type="l", 
     lwd=2, col="red", xlim=c(1840,2020))
text(1950,14,"Atmospheric CO2", cex=2)
# CH4 ppb
plot(ppb~year, data=ch4PPB, xlab="year", ylab="CH4 (ppb)", type="l", 
     lwd=2, col="brown", xlim=c(1840,2020))
text(1950,14,"Atmospheric CH4", cex=2)
# cherry festival date
plot(doy~year, data=cherKy, xlab="year", ylab="day of year", type="l", 
     lwd=2, col="pink", xlim=c(1840,2020))
text(1950,14,"Cherry Blossom Bloom Date", cex=2)
# arctic ice extent
plot(ice~year, data=arcIce, xlab="year", ylab="ice extent", 
     type="l", lwd=2, col="cyan", xlim=c(1840,2020))
text(1950,14,"Arctic Sea Ice Cover",cex=2)

Exercise 2

Use the same approach as we did and evaluate for each pair in the table above if the assumptions are satisfied.

Variables have been recorded during different time frames, which is an important consideration before testing for any assumptions or running the correlation analysis.

Variable 1	Variable 2	Date range
Troposphere anomaly	Surface Temperature	1978 - 2020
Cherry blossom date	Surface Temperature	1880 - 2015
CO2	Ice extent	1870 - 2006
Methane	CO2	14 - 2004
Cherry blossom date	Ice extent	1870 - 2008
Surface temperature	CO2	1880 - 2006

The common data range for all variables is? 1978 to 2004: this is the common subset.

Note: Always remember to check for duplicate observations!

require(car) # loading car package

## Loading required package: car

## Loading required package: carData

# first, figuring out the dataset (no one-size fits all)
# checking the 1,2,3: mean, spread, distribution
range(surfTemp$year)  # range of the years in dataset

## [1] 1880 2020

range(tropTemp$year)

## [1] 1978 2021

range(co2PPM$year)

## [1]   13 2006

range(ch4PPB$year)

## [1]   14 2004

range(cherKy$year)

## [1]  801 2015

range(arcIce$year)

## [1] 1870 2008

# then, looking at the distributions
par(mfrow=c(6,2), mar=c(2,4,2,1))
# surface temperature
hist(surfTemp$temp, main="Surface Temperature")
qqPlot(surfTemp$temp, main="Surface Temperature")

## [1] 141 137

# troposphere temperature
hist(tropTemp$anomaly, main="Troposphere Temperature")
qqPlot(tropTemp$anomaly, main="Troposphere Temperature")

## [1] 43 39

# CO2 ppm
hist(co2PPM$ppm, main="CO2 (ppm)")
qqPlot(co2PPM$ppm, main="CO2 (ppm)")

## [1] 250 249

# CH4 ppb
hist(ch4PPB$ppb, main="Methane (ppb)")
qqPlot(ch4PPB$ppb, main="Methane (ppb)")

## [1] 324 329

# cherry festival date
hist(cherKy$doy, main="Cherry_Kyoto")
qqPlot(cherKy$doy, main="Cherry_Kyoto")

## [1] 523 609

# arctic ice extent
hist(arcIce$ice, main="Arctic_Ice")
qqPlot(arcIce$ice, main="Arctic_Ice")

## [1] 138 139

# subsetting for each pair to their common range and testing assumptions
any(duplicated(co2PPM$year))  # Returns TRUE if duplicates exist

## [1] TRUE

any(duplicated(ch4PPB$year))

## [1] TRUE

# error due to duplicates! solving by removing them
co2PPMnew <- tapply(co2PPM$ppm, co2PPM$year, mean) # calculates mean ppm for each year
# making new data frame
co2PPMnew <- data.frame(co2PPMnew, names(co2PPMnew)) 
# renaming columns
colnames(co2PPMnew) <- c("ppm", "year")
# now for ch4 ppb
ch4PPBnew <- tapply(ch4PPB$ppb, ch4PPB$year, mean)
ch4PPBnew <- data.frame(ch4PPBnew, names(ch4PPBnew))
colnames(ch4PPBnew) <- c("ppb", "year")

For correlation, we need to check a few assumptions:
- Linearity: do the points follow a straight line?
- Homoscedasticity: is the spread constant across values?
- Normality: are both variables normally distributed? (checked with QQ-plots).
We’ll check all six pairs by merging them on their common years. Merging ensures we are actually comparing the same time points – this avoids mismatched vector errors and maximizes statistical power.

If you create fulldf by merging all six variables, the data frame only contains years where ALL six have data. Even if you select just two columns later, you’re still limited to those restricted years. Therefore, you should merge the data pairwise to improve reliability of results.

# merging all pairwise datasets to their common year ranges
par(mfrow=c(6,3), mar=c(2,4,2,1))

# tropTemp x surfaceTemp
surfTrop <- merge(tropTemp, surfTemp, by="year")
colnames(surfTrop) <- c("year", "anomaly", "temp")
plot(surfTrop$anomaly, surfTrop$temp, main="Trop. anomaly vs Surface temp.", 
     xlab=expression("anomaly ("*degree*C*")"), ylab=expression("surface temp. ("*degree*C*")"))
qqPlot(surfTrop$anomaly, main="QQ: Trop. anomaly")

## [1] 43 39

qqPlot(surfTrop$temp, main="QQ: Surface temp.")

## [1] 43 39

# cherKy x surfaceTemp
cherSur <- merge(cherKy, surfTemp, by="year")
colnames(cherSur) <- c("year", "doy", "temp")
plot(cherSur$doy, cherSur$temp, main="Cherry bloom vs Surface temp.", 
     xlab="day of year", ylab=expression("surface temp. ("*degree*C*")"))
qqPlot(cherSur$doy, main="QQ: Cherry bloom")

## [1]  13 111

qqPlot(cherSur$temp, main="QQ: Surface temp.")

## [1] 136 135

# co2ppm x arcIce
co2Ice <- merge(co2PPMnew, arcIce, by="year")
colnames(co2Ice) <- c("year", "ppm", "ice")
plot(co2Ice$ppm, co2Ice$ice, main="CO2 (ppm) vs Arc. ice cov.", 
     xlab="co2 (ppm)", ylab="ice extent")
qqPlot(co2Ice$ppm, main="QQ: CO2 (ppm)")

## [1] 96 95

qqPlot(co2Ice$ice, main="QQ: Arc. ice cov.")

## [1] 95 85

# ch4ppb x co2ppm
ch4co2 <- merge(ch4PPBnew, co2PPMnew, by="year")
colnames(ch4co2) <- c("year", "ppb", "ppm")
plot(ch4co2$ppb, ch4co2$ppm, main="CO2 (ppm) vs Arc. ice cov.", 
     xlab="ch4 (ppb)", ylab="co2 (ppm)")
qqPlot(ch4co2$ppb, main="QQ: CH4 (ppb)")

## [1] 1 2

qqPlot(ch4co2$ppm, main="QQ: CO2 (ppm)")

## [1] 77 76

# cherKy x arcIce
cherIce <- merge(cherKy, arcIce, by="year")
colnames(cherIce) <- c("year", "doy", "ice")
plot(cherIce$doy, cherIce$ice, main="Cherry bloom vs Arc. ice cov.", 
     xlab="day of year", ylab="ice extent")
qqPlot(cherIce$doy, main="QQ: Cherry bloom")

## [1] 121  23

qqPlot(cherIce$ice, main="QQ: Arc. ice cov.")

## [1] 138 139

# surfaceTemp x co2ppm
surCo2 <- merge(surfTemp, co2PPMnew, by="year")
colnames(surCo2) <- c("year", "temp", "ppm")
plot(surCo2$temp, surCo2$ppm, main="Surface temp. vs CO2 (ppm)", 
     xlab=expression("surface temp. ("*degree*C*")"), ylab="co2 (ppm)")
qqPlot(surCo2$temp, main="QQ: Surface temp.")

## [1] 91 92

qqPlot(surCo2$ppm, main="QQ: CO2 (ppm)")

## [1] 92 91

Answer: QQ plot analysis reveals that most variables deviate from normality, with the exception of the troposphere anomaly and surface temperature pair, which show adherence to the normal distribution. Due to these deviations, Spearman’s rank correlation is the appropriate choice for most pairwise comparisons, as it does not require normality assumptions. Only the troposphere-surface temperature pair meets the normality requirement for Pearson’s correlation.

Exercise 3

To test the hypothesis you developed based on table 2.2 how should you set your α level? Hint: apply the Bonferroni correction.

# setting the corrected alpha level
alphaLev <- 0.05/6 # due to N = 6 tests

Answer: Setting the Bonferroni corrected \(\alpha\) level at 0.05 divided by 6 (which is 0.0083333) to ensure the “family-wise erros rate remains at 0.05. If the P-value falls below this alpha level, the test can be judged significantly.

Exercise 4

Which of the unique pairs in table 2.2 showed a significant relationship after Bonferroni correction - and apply the correct test (Pearson or Spearman)?

# running pairwise correlations (Pearson and Spearman)
# tropTemp x surfaceTemp
cor.test(surfTrop$temp, surfTrop$anomaly) # default is Pearson's!

## 
##  Pearson's product-moment correlation
## 
## data:  surfTrop$temp and surfTrop$anomaly
## t = 29.807, df = 40, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9595877 0.9883150
## sample estimates:
##       cor 
## 0.9782219

# cherKy x surfaceTemp
cor.test(cherSur$doy, cherSur$temp, method="spearman") # now using Spearman's!

## Warning in cor.test.default(cherSur$doy, cherSur$temp, method = "spearman"):
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  cherSur$doy and cherSur$temp
## S = 597856, p-value = 3.012e-12
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.5597366

# co2ppm x arcIce
cor.test(co2Ice$ppm, co2Ice$ice, method="spearman") # now using Spearman's!

## 
##  Spearman's rank correlation rho
## 
## data:  co2Ice$ppm and co2Ice$ice
## S = 252148, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.7101736

# ch4ppb x co2ppm
cor.test(ch4co2$ppb, ch4co2$ppm, method="spearman") # now using Spearman's!

## 
##  Spearman's rank correlation rho
## 
## data:  ch4co2$ppb and ch4co2$ppm
## S = 252, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.9966875

# cherKy x arcIce
cor.test(cherIce$doy, cherIce$ice, method="spearman") # now using Spearman's!

## Warning in cor.test.default(cherIce$doy, cherIce$ice, method = "spearman"):
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  cherIce$doy and cherIce$ice
## S = 230382, p-value = 2.979e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##      rho 
## 0.425473

# surfaceTemp x co2ppm
cor.test(surCo2$temp, surCo2$ppm, method="spearman") # now using Spearman's!

## Warning in cor.test.default(surCo2$temp, surCo2$ppm, method = "spearman"):
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  surCo2$temp and surCo2$ppm
## S = 14739, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8864169

# bringing all datasets together to inspect correlations in one overview
require(corrplot) # loading correlation package

## Loading required package: corrplot

## corrplot 0.95 loaded

fulldf<-merge(surfTemp, tropTemp, by="year", all=TRUE)
fulldf<-merge(fulldf, co2PPMnew, by="year", all=TRUE)
fulldf<-merge(fulldf, ch4PPBnew, by="year", all=TRUE)
fulldf<-merge(fulldf, cherKy, by="year", all=TRUE)
fulldf<-merge(fulldf, arcIce, by="year", all=TRUE)
colnames(fulldf)<-c("year", "temp", "anomaly", "ppm", "ppb", "doy", "ice")
corrMatrix <- cor(fulldf[,-1], use="pairwise.complete.obs", method="spearman")

# creating a correlation matrix
cols <- c('#ffffd9','#edf8b1','#c7e9b4','#7fcdbb','#41b6c4','#1d91c0','#225ea8','#253494','#081d58')
corCols <- colorRampPalette(cols)
corrplot(corrMatrix, col=corCols(100), method="circle") # applying color palette to correlation

Answer: Correlation analysis reveals strong evidence of interconnected climate change indicators. Temperature variables (surface and troposphere) show very strong positive correlations with atmospheric greenhouse gases (CO2 and CH4), with r > 0.89 for all pairings. Arctic ice extent demonstrates strong negative correlations with both surface temperature and CO2 (r ≈ -0.67 to -0.71), consistent with warming-driven ice loss. Cherry blossom timing shows moderate negative correlations with surface temperature (r ≈ -0.56), indicating earlier blooming with warming, and moderate positive correlation with ice extent (r = 0.43). The near-perfect correlation between CO2 and CH4 (r = 0.997) suggests common anthropogenic forcing. All correlations remain statistically significant after Bonferroni correction (α = 0.0083), supporting the hypothesis that these climate variables are systematically linked through anthropogenic climate change processes.

Key Takeaways

Writing a small function (load.sheet) saves repetition and keeps code tidy.
Always merge datasets by year before comparing – this guarantees aligned pairs and avoids subtle mistakes.
Most Earth system indicators are not normally distributed → Spearman’s correlation is the safe choice.
Bonferroni correction protects against inflated false positives when testing many pairs.
And most importantly: correlation is not causation – keep on thinking carefully about mechanisms and how to interpret the correlation.