# always load tidyverse :)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
# pretty colors
library(PNWColors)

Why is this plot bad?

foo <- anscombe %>% 
  dplyr::select(x4,y4) %>% 
  mutate(x4=jitter(x4,factor=0.1)) %>% 
  rename(y=y4,x=x4) 

foo %>% ggplot(mapping = aes(x,y)) + geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

foo %>% ggplot(mapping = aes(x,y)) + geom_smooth(method="lm",se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

These plots are great examples of how misleading smoothing can be if it is not done appropriately. For both plots, there are no variable names or units, so from the get-go we don’t know what we’re looking at. The original data are not plotted in addition to the trendlines, nor are there error bars or any other indication of how well the smoothing fit the original data. It is evident from the wild differences between the two that at least one of the models is a very poor fit, but we don’t know anything about the actual data. In the very, very least, we need some error or data points for this plot to make any sense.

foo %>% ggplot(aes(x, y)) + 
  geom_smooth(linewidth = 0.5) +
  geom_smooth(method = "lm", linewidth = 0.5, color = "coral2") +
  geom_point()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

We can see now that the weird trends and massive differences between the two smoothing methods were caused by a single outlier - one value of x is over 2x all other x values. Hence, neither model is really all that informative for the data.

Note: When I originally ran this code copied from the assignment, my trendline plots were the same as yours. However, when I knitted it, the loess smooth line flipped, and now renders that way every time. Not sure what changed…

Load Paleoclimate Data

temps <- read.csv("3Proxies.csv")
summary(temps)
##     Proxy                kya                 C         
##  Length:2191        Min.   : 0.03837   Min.   :-65.43  
##  Class :character   1st Qu.: 3.10221   1st Qu.:-55.48  
##  Mode  :character   Median : 7.45747   Median :-39.87  
##                     Mean   : 8.01993   Mean   :-38.87  
##                     3rd Qu.:12.17729   3rd Qu.:-30.63  
##                     Max.   :19.97922   Max.   : 28.40
head(temps)
##       Proxy       kya        C
## 1 Greenland 0.0951409 -31.5913
## 2 Greenland 0.1071300 -31.6220
## 3 Greenland 0.1131490 -31.6026
## 4 Greenland 0.1192050 -31.6002
## 5 Greenland 0.1192050 -31.5980
## 6 Greenland 0.1254510 -31.6656
str(temps)
## 'data.frame':    2191 obs. of  3 variables:
##  $ Proxy: chr  "Greenland" "Greenland" "Greenland" "Greenland" ...
##  $ kya  : num  0.0951 0.1071 0.1131 0.1192 0.1192 ...
##  $ C    : num  -31.6 -31.6 -31.6 -31.6 -31.6 ...

Smoothing Methods

Loess

Span = 1

# basic plot to look at data
p1.1 <- ggplot(data = temps, aes(x = -kya, y = C)) +
  geom_path(aes(group = Proxy, color = Proxy), alpha = 0.5) +
  geom_smooth(aes(group = Proxy, color = Proxy), 
              method = "loess",
              span = 1)
p1.1
## `geom_smooth()` using formula = 'y ~ x'

Span = 0.5

# basic plot to look at data
p1.2 <- ggplot(data = temps, aes(x = -kya, y = C)) +
  geom_path(aes(group = Proxy, color = Proxy), alpha = 0.5) +
  geom_smooth(aes(group = Proxy, color = Proxy), 
              method = "loess",
              span = 0.5)
p1.2
## `geom_smooth()` using formula = 'y ~ x'

Span = 0.25

# basic plot to look at data
p1.3 <- ggplot(data = temps, aes(x = -kya, y = C)) +
  geom_path(aes(group = Proxy, color = Proxy), alpha = 0.5) +
  geom_smooth(aes(group = Proxy, color = Proxy), 
              method = "loess",
              span = 0.25)
p1.3
## `geom_smooth()` using formula = 'y ~ x'

Span = 0.1

# basic plot to look at data
p1.4 <- ggplot(data = temps, aes(x = -kya, y = C)) +
  geom_path(aes(group = Proxy, color = Proxy), alpha = 0.5) +
  geom_smooth(aes(group = Proxy, color = Proxy), 
              method = "loess",
              span = 0.1)
p1.4
## `geom_smooth()` using formula = 'y ~ x'

GAM

p2 <- ggplot(data = temps, aes(x = -kya, y = C)) +
  geom_path(aes(group = Proxy, color = Proxy), alpha = 0.5) +
  geom_smooth(aes(group = Proxy, color = Proxy), 
              method = "gam")
p2
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'

GLM

p3 <- ggplot(data = temps, aes(x = -kya, y = C)) +
  geom_path(aes(group = Proxy, color = Proxy), alpha = 0.5) +
  geom_smooth(aes(group = Proxy, color = Proxy), 
              formula = y ~ x, 
              method = "glm") 
p3

I played around with a few different methods, mainly a Generalized Additive Model (GAM) and different spans for the loess model. I decided on a loess model with a span of 0.25 because it highlights the overall trends as well as the Younger Dryas approximately 12,000 years ago, but is not overfit like a span of 0.1. For my final plot, I also included a linear model to better show the overall warming trends for each of the three proxies.

Final Plot

# sample palette to pull specific colors from
pal1 <- pnw_palette("Bay", n = 2191)
# used pal1[i] to pull hex-codes for specific colors, combined here
pal2 <- c("#E69B99", "#73AA77", "#238C97")

proxy_plot <- ggplot(data = temps, aes(x = -kya, y = C, group = Proxy)) +
  # original data - important to show how well trends fit
  geom_path(aes(color = Proxy), alpha = 0.75) +
  # violin density plots, oriented at the far right of the plot
  geom_violin(aes(x = 0.5, fill = Proxy, color = Proxy)) +
  # loess smoothed trendline 
  geom_smooth(aes(color = Proxy), formula = y ~ x,
              method = "loess", span = 0.25, se = FALSE,
              linewidth = 0.75) +
  # linear model trendline
  geom_smooth(method = "glm", formula = y ~ x, se = FALSE,
              color = "darkgrey", alpha = 0.75, linewidth = 0.5) +
  # change color and fill palettes, rearranging by breaks
  scale_color_manual(values = pal2, 
                     breaks = c("Cariaco Basin", "Greenland", "Antarctica")) +
  scale_fill_manual(values = pal2, 
                    breaks = c("Cariaco Basin", "Greenland", "Antarctica")) +
  # axis and title labels
  labs(x = "Thousands of Years Ago",
       y = "Average Annual Temperature (\u00B0C)",
       title = "Paleoclimate Proxies for Average Annual Temperature",
       subtitle = "20,000 Years Ago to Pre-Industrialization") +
  # theme options
  theme_classic() +
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        legend.title = element_blank()) 

proxy_plot