knitr::opts_chunk$set(echo = TRUE)
# Loading packages
pacman::p_load(tidyverse, ggrepel)
# Loading the data in:
temps <-
read.csv("bton weather2.csv")
Before we create the graph, we need to do a little data cleaning:
clean_names()
function in the
janitor
packagefilter()
and is.na()
mutate()
, yday()
, and
year()
function in the lubridate
packageFirst, let’s make sure there aren’t any duplicate days:
temps |>
# Counting how many times each date appears
count(
DATE,
name = "occurrences"
) |>
# Counting the number of times a day occurs:
count(occurrences)
## occurrences n
## 1 1 30236
There are 30236 days that only appear once, out of 30236 rows in the data. So no duplicates!
Now let’s clean our data
temps |>
# Making all the names lowercase:
janitor::clean_names() |>
# Only keeping the days with a non-missing tavg
filter(!is.na(tmax)) |>
# Converting the date column into days and year
mutate(
# Need to convert date column to a date type column with mdy()
date = mdy(date),
# find the day of the year with yday()
day = yday(date),
# find the year with year()
year = year(date)
) ->
temps2
tibble(temps2)
## # A tibble: 30,236 × 11
## station name date prcp snow snwd tavg tmax tmin day year
## <chr> <chr> <date> <dbl> <dbl> <dbl> <int> <int> <int> <dbl> <dbl>
## 1 USW00014742 BURLI… 1940-12-01 0.19 NA NA NA 27 5 336 1940
## 2 USW00014742 BURLI… 1940-12-02 0.03 NA NA NA 19 -3 337 1940
## 3 USW00014742 BURLI… 1940-12-03 0.01 NA NA NA 10 -18 338 1940
## 4 USW00014742 BURLI… 1940-12-04 0.04 NA NA NA 22 -20 339 1940
## 5 USW00014742 BURLI… 1940-12-05 0.06 NA NA NA 29 13 340 1940
## 6 USW00014742 BURLI… 1940-12-06 0.01 NA NA NA 22 -3 341 1940
## 7 USW00014742 BURLI… 1940-12-07 0.01 NA NA NA 38 22 342 1940
## 8 USW00014742 BURLI… 1940-12-08 0 NA NA NA 40 27 343 1940
## 9 USW00014742 BURLI… 1940-12-09 0 NA NA NA 27 6 344 1940
## 10 USW00014742 BURLI… 1940-12-10 0 NA NA NA 41 9 345 1940
## # ℹ 30,226 more rows
Let’s keep the last 10 years to get a clearer picture of what a spaghetti plot is: 2013 to 2023
temps2013 <-
temps2 |>
filter(year >= 2013)
tibble(temps2013)
## # A tibble: 3,910 × 11
## station name date prcp snow snwd tavg tmax tmin day year
## <chr> <chr> <date> <dbl> <dbl> <dbl> <int> <int> <int> <dbl> <dbl>
## 1 USW00014742 BURLI… 2013-01-01 0.03 1 15 NA 33 3 1 2013
## 2 USW00014742 BURLI… 2013-01-02 0.05 2.4 14 NA 16 -6 2 2013
## 3 USW00014742 BURLI… 2013-01-03 0 0 14 NA 16 -10 3 2013
## 4 USW00014742 BURLI… 2013-01-04 0 0 12 NA 36 15 4 2013
## 5 USW00014742 BURLI… 2013-01-05 0 0.1 11 NA 35 10 5 2013
## 6 USW00014742 BURLI… 2013-01-06 0.02 1.2 11 NA 34 14 6 2013
## 7 USW00014742 BURLI… 2013-01-07 0.01 0.8 10 NA 25 10 7 2013
## 8 USW00014742 BURLI… 2013-01-08 0 0 10 NA 36 18 8 2013
## 9 USW00014742 BURLI… 2013-01-09 0 0 10 NA 43 12 9 2013
## 10 USW00014742 BURLI… 2013-01-10 0.01 0 8 NA 40 25 10 2013
## # ℹ 3,900 more rows
To create a spaghetti plot, you’ll need 3 aesthetics: x
,
y
, and group
. We’ll map x = day
,
y = tmax
, and group = year
gg_temps <-
ggplot(
data = temps2013,
mapping = aes(
x = day,
y = tmax
)
) +
# Adding the separate lines by mapping group = year
geom_line(
mapping = aes(group = year),
color = "grey80",
linewidth = 0.25
) +
# Making the graph look better
labs(
x = "Day of the year",
y = "Max Daily Temperature (F)",
title = "Daily High Temperatures",
subtitle = "Burlington, VT: 2013 - 2023"
) +
# Centering all the text
theme(
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)
) +
# Changing the breaks for every 30 days
scale_x_continuous(
breaks = seq(from = 0, to = 360, by = 30),
#labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
# "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", ),
minor_breaks = NULL
)
gg_temps
Typically with spaghetti plots, we want to show 1 to 3 different
groups (lines) in contrast to the other groups. Let’s add a “typical”
daily high temp line using geom_smooth()
.
gg_temps2 <-
gg_temps +
# Adding the "average" line
geom_smooth(
se = F,
method = "loess",
color = "black",
formula = y~x
) +
# Labeling the average line
annotate(
geom = "text",
x = 360,
y = 17,
fontface = "bold",
label = "Average\nMax Temp"
)
gg_temps2
We can also show a specific year by adding another
geom_line()
by including data =
and give it a
data frame with only the year we want to show. Let’s show 2022:
gg_temps2 +
# Adding the smoothed line for only 2023
geom_line(
data = temps2013 |> filter(year == 2022), # keeping just the days in 2022
color = "tomato"
) +
# Labeling the 2022 line
annotate(
geom = "text",
x = 375,
hjust = 0.35,
y = temps2013 |> filter(year == 2022, day == 365) |> pull(tmax),
fontface = "bold",
label = "2022",
color = "tomato"
)
We can replace the data set used in a saved ggplot()
graph with the %+%
operator. Let’s replace the last 10
years data set with all 70+ years
gg_temps_full <-
gg_temps %+%
temps2
gg_temps_full
Next we’ll add the 5 most recent full years: 2018 - 2022
gg_temps_full +
geom_line(
data = temps2 |> filter(between(year, 2018, 2022)),
mapping = aes(color = factor(year)),
show.legend = F,
linewidth = 0.5
) +
# Adding the year at the end of each line
geom_text_repel(
data = temps2 |> filter(between(year, 2018, 2022)) |> group_by(year) |> slice_max(day),
mapping = aes(color = factor(year),
label = year),
show.legend = F,
nudge_x = 11,
direction = "y",
max.overlaps = 15,
min.segment.length = 10
) +
labs(subtitle = "Burlington, VT: 2013 - 2022")