This is my work for DataViz Battle in Reddit’s (r/dataisbeautiful)[www.reddit.com/r/dataisbeautiful].
Disclaimer
* All the data was provide by r/dataisbeautiful;
* was done in R, with a lot of ggplot2 and dplyr;
* This is an original content
* Database;
* Database prerared in Excel.
library(ggplot2)
library(dplyr)
library(gghighlight)
library(lubridate)
library(tidyquant)
library(ggthemes)
library(scales)
library(ggpubr)theme_set(theme_fivethirtyeight())To highlight the time series, the scale Viridis will be used:
To highlight the values range, the scale Plasmawill be used:
lake <- read.csv("C:/Users/marra/Documents/NASA/LAKE MENDOTA/data_mendota.csv", sep = ";", header = TRUE)If you want to check the data:
head(lake)
str(lake)
summary(lake)Create some variables:
lake$closed_date <- as.Date(paste(lake$closed_day, lake$closed_month, lake$closed_year, sep = "/"), "%d/%m/%Y")
lake$opened_date <- as.Date(paste(lake$opened_day, lake$opened_month, lake$opened_year, sep = "/"), "%d/%m/%Y")
lake$days <- as.numeric(lake$opened_date - lake$closed_date)
lake$decade <- as.numeric(substr(lake$WINTER, 1, 3))*10
lake$closed_date2 <- as.Date(paste(lake$closed_day, lake$closed_month,
ifelse(lake$closed_month > 6, 2018, 2019),
sep = "/"), "%d/%m/%Y")
lake$opened_date2 <- as.Date(paste(lake$opened_day, lake$opened_month,
ifelse(lake$opened_month > 6, 2018, 2019),
sep = "/"), "%d/%m/%Y")Gather data by winter:
lake_gather <- lake %>%
group_by(WINTER, closed_end, decade) %>%
summarise(days = sum(days),
n = length(WINTER),
early_closed = min(closed_date2),
later_opened = max(opened_date2))
lake_gather$percent_days <- percent_rank(lake_gather$days)
lake_gather$extremes <- ifelse(lake_gather$percent_days >= 0.9, "upper 10%",
ifelse(lake_gather$percent_days <= 0.1, "lower 10%", NA))
lake_gather$last_winter <- lag(lake_gather$days)g1 <- ggplot(lake) +
geom_segment(aes(x=closed_end, xend=closed_end, y=closed_date2, yend=opened_date2, color=decade), size = 2) +
viridis::scale_color_viridis() + theme(legend.position="right", legend.direction='vertical') +
labs(title="Closing and Opening by Winter", subtitle = "Segment chart", color = "Decade")
g1g1 +
geom_smooth(data = lake_gather, aes(x=closed_end, y=later_opened), method = "loess", se = FALSE, colour = "red") +
geom_smooth(data = lake_gather, aes(x=closed_end, y=early_closed), method = "loess", se = FALSE, colour = "red")It seems that the lake is freezing later and melting sooner as the past years:
closing <- ggplot(lake_gather, aes(x=closed_end, y=early_closed, color=decade)) +
geom_point() +
viridis::scale_color_viridis() +
# theme(legend.position="right", legend.direction='vertical') +
geom_smooth(method = "loess", se = FALSE, colour = "red") +
labs(title="Closing dates by winter", subtitle = "Point chart", color = "Decade")
opening <- ggplot(lake_gather, aes(x=closed_end, y=later_opened, color=decade)) +
geom_point() +
viridis::scale_color_viridis() +
# theme(legend.position="right", legend.direction='vertical') +
geom_smooth(method = "loess", se = FALSE, colour = "red") +
labs(title="Opening dates by winter", subtitle = "Point chart", color = "Decade")
ggarrange(closing, opening)Now we can visualize the total time the lake was frozen.
polar_column <- ggplot(lake_gather, aes(x=closed_end, y=days, fill=days)) +
geom_col() +
scale_x_continuous(breaks = unique(lake_gather$decade)) +
ylim(-20, NA) +
viridis::scale_fill_viridis(option="plasma") +
coord_polar() +
# theme(legend.position="right", legend.direction='vertical') +
labs(title="Time closed by winter", subtitle = "Column chart with polar coordinates", fill = "# days") +
theme(axis.title = element_text(), axis.title.x = element_blank()) + ylab("Time closed")
polar_point <- ggplot(lake_gather, aes(x=closed_end, y=days, color=days)) +
geom_point(size = 3) +
scale_x_continuous(breaks = unique(lake_gather$decade)) +
ylim(-20, NA) +
viridis::scale_color_viridis(option="plasma") +
coord_polar() +
# theme(legend.position="right", legend.direction='vertical') +
labs(title="Time closed by winter", subtitle = "Point chart with polar coordinates", color = "# days") +
theme(axis.title = element_text(), axis.title.x = element_blank()) + ylab("Time closed")
ggarrange(polar_column, polar_point)h_column <- ggplot(lake_gather, aes(x=closed_end, y=days, fill=days)) +
geom_col() +
viridis::scale_fill_viridis(option="plasma") +
labs(title = "Time closed by winter", subtitle = "Columns chart + smooth line", fill = "# days") +
theme(axis.title = element_text(), axis.title.x = element_text()) + ylab("Days closed") + xlab("Winter") +
geom_smooth(method = "loess", se = FALSE)
h_point <- ggplot(lake_gather, aes(x=closed_end, y=days, color=days)) +
geom_point(size = 3) +
viridis::scale_color_viridis(option="plasma") +
geom_smooth(method = "loess", se = FALSE) +
labs(title="Time closed by winter", subtitle = "Scatterplot + smooth line", color = "# days") +
theme(axis.title = element_text(), axis.title.x = element_text()) + ylab("Days closed") + xlab("Winter")
ggarrange(h_column, h_point)extreme_column <- ggplot(lake_gather, aes(x=closed_end, y=days, fill=extremes)) +
geom_col() +
theme(legend.position="top") +
labs(title="10% Extreme", subtitle = "Columns chart + highlight", fill = "Extremes # days") +
theme(axis.title = element_text(), axis.title.x = element_text()) + ylab("Days closed") + xlab("Winter") +
gghighlight(!is.na(extremes))
extreme_point <- ggplot(lake_gather, aes(x=closed_end, y=days, colour = as.factor(extremes))) +
geom_point(size = 3) +
theme(legend.position="top") +
gghighlight(!is.na(extremes)) +
labs(title="10% Extreme", subtitle = "Scatterplot + highlight", colour = "Extremes # days") +
theme(axis.title = element_text(), axis.title.x = element_blank()) + ylab("Time closed")
ggarrange(extreme_column, extreme_point)The historical average frozen time is 102 days per winter.
ggplot(lake_gather, aes(x=days, fill=factor(decade))) +
geom_histogram(binwidth = 10) +
viridis::scale_fill_viridis(discrete=TRUE) +
geom_vline(xintercept = mean(lake_gather$days), size = 1, colour = "#FF3721", linetype = "dashed") +
theme(legend.position="none") +
labs(title="Histogram of frozen time",
subtitle = "Histogram + historic average frozen time") +
theme(axis.title = element_text(), axis.title.x = element_blank()) + ylab("Time closed")If we seperate this in 4 periods of (approximately) 40 years, maybe we can see some pattern:
lake_gather$period <- cut(lake_gather$closed_end, breaks = c(1855, 1899, 1939, 1979, 2019),
labels = c("1856 - 1899", "1900 - 1939", "1940 - 1979", "1980 - 2018"))
ggplot(lake_gather, aes(x=days, fill=factor(decade))) +
geom_histogram(binwidth = 10) +
viridis::scale_fill_viridis(discrete=TRUE) +
geom_vline(xintercept = mean(lake_gather$days), size = 1, colour = "#FF3721", linetype = "dashed") +
theme(legend.position="none") +
labs(title="Histogram of frozen time, for 4 periods",
subtitle = "Histogram + historic average frozen time") +
theme(axis.title = element_text(), axis.title.x = element_blank()) + ylab("Time closed") +
facet_wrap(~period)Let’s do some data manipulation first:
cont_dates <- data.frame(date = seq(as.Date(min(lake_gather$early_closed)), as.Date(max(lake_gather$later_opened)), "days"),
freq = 0)
lake$interval <- interval(lake$closed_date2, lake$opened_date2)
for (i in 1:nrow(cont_dates)) {
cont_dates$freq[[i]] <- sum(cont_dates$date[[i]] %within% lake$interval)
}
cont_dates$prop <- cont_dates$freq / nrow(cont_dates)
cont_dates$month <- month(cont_dates$date)
cont_dates_month <- cont_dates %>% group_by(month) %>% summarise(freq = sum(freq), n = length(month))
cont_dates_month$freq_average <- cont_dates_month$freq / cont_dates_month$n
cont_dates_month$prop <- cont_dates_month$freq_average / nrow(cont_dates)
cont_dates_month$month <- factor(cont_dates_month$month,
levels = c(11, 12, 01, 02, 03, 04, 05))Now, there is a graph for that!
ggplot(cont_dates, aes(x=date, y=prop, fill=prop)) +
geom_col() +
viridis::scale_fill_viridis(option = "plasma") +
scale_y_continuous(labels = scales::percent) +
theme(legend.position="right", legend.direction='vertical') +
labs(title="Porportional time closed by date", subtitle = "Column chart", fill = "% closed") +
theme(axis.title = element_text(), axis.title.x = element_text()) + ylab("Proportion closed") +
xlab("Winter")Let’s have the same view, but focused only on the months:
ggplot(cont_dates_month, aes(x=month, y=prop, fill=prop)) +
geom_col() +
viridis::scale_fill_viridis(option = "plasma") +
scale_y_continuous(labels = scales::percent) +
theme(legend.position="right", legend.direction='vertical') +
labs(title="Porportional time closed by month", subtitle = "Column chart", fill = "% closed") +
theme(axis.title = element_text(), axis.title.x = element_text()) + ylab("Proportion closed") +
xlab("Month")The earliest closind date was on 23 November 1880.
And the latest was on 30 January 1931.
lake_gather %>% group_by(early_closed, decade) %>% summarise(n = length(early_closed)) %>%
ggplot(aes(x=early_closed, y=n, fill=decade)) +
geom_col() +
# coord_flip() +
viridis::scale_fill_viridis() +
scale_y_continuous(breaks = seq(0, 12, 3)) +
theme(legend.position="right", legend.direction='vertical') +
labs(title="Closing day frequency, colored by decade", subtitle = "Column chart", fill = "Decade") +
theme(axis.title = element_text(), axis.title.x = element_text()) + ylab("Frequency") +
xlab("Day")The earliest opening date was on 27 February 1998.
And the latest was on 6 May 1857.
lake_gather %>% group_by(later_opened, decade) %>% summarise(n = length(later_opened)) %>%
ggplot(aes(x=later_opened, y=n, fill=decade)) +
geom_col() +
viridis::scale_fill_viridis() +
scale_y_continuous(breaks = seq(0, 9, 3), limits = c(0, 9)) +
theme(legend.position="right", legend.direction='vertical') +
labs(title="Opening day frequency, colored by decade", subtitle = "Column chart", fill = "Decade") +
theme(axis.title = element_text(), axis.title.x = element_text()) + ylab("Frequency") +
xlab("Day")