In this milestone, you’ll start by creating a new `date` variable using existing columns in the `nasa_power` dataset. Then, you’ll write code to iterate over a custom plotting function.
For this recreation, you will use a custom plotting function called `plot_power_formatted()` that we’ve provided in the setup chunk at the top of this page. Make sure that chunk has been run before proceeding with this milestone.
You don’t need to understand how to write functions yet, but you should notice that most of the code is reused from Milestone 5, where you pivoted temperature variables to plot them.
If you’d like to learn about writing custom functions, take a look at Chapter 19 of R For Data Science.
Before using `plot_power_formatted()` to generate plots, you’ll need to add a `date` column to the `nasa_power` dataset, and save the result as an object called `power_formatted`.
Your `power_formatted` table should match the following output:
# Print the reference solution so your own table can be compared against it.
readr::read_csv(file = "data/solution_07.csv")
## # A tibble: 3,653 × 9
## lon lat date ps t2m t2m_max t2m_min ws2m wd2m
## <dbl> <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -67.5 -26.5 2012-01-01 64.2 9.65 17.3 3.73 4.23 155.
## 2 -67.5 -26.5 2012-01-02 64.2 9.65 18.0 3.12 3.6 161.
## 3 -67.5 -26.5 2012-01-03 64.1 9.25 18.4 1.76 2.64 176.
## 4 -67.5 -26.5 2012-01-04 64.1 7.94 14.2 2.93 2.33 125
## 5 -67.5 -26.5 2012-01-05 64.0 7.28 14.6 1.13 1.7 137.
## 6 -67.5 -26.5 2012-01-06 63.9 7.21 14.2 2.13 2.61 170.
## 7 -67.5 -26.5 2012-01-07 63.9 9.18 16.6 1.9 1.44 172.
## 8 -67.5 -26.5 2012-01-08 63.9 11.4 19.2 4.14 1.36 157.
## 9 -67.5 -26.5 2012-01-09 63.8 12.4 20.6 3.73 1.28 184.
## 10 -67.5 -26.5 2012-01-10 63.7 10.6 16.8 3.51 1.16 177.
## # … with 3,643 more rows
Write your code in the chunk below. Remember to save this table as an object called `power_formatted`, as this will be necessary for using the custom function in part two.
# Build `power_formatted`: read data/power.csv, combine its separate `year`,
# `mm` and `dd` columns into a single `date` column, and keep only the nine
# columns shown in the reference output, in that order.
power_formatted <- readr::read_csv(
  here::here("data", "power.csv"),
  # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
  show_col_types = FALSE
) %>%
  # paste() joins the three parts with spaces (e.g. "2012 1 1"); ymd() then
  # parses that string into a Date.
  mutate(date = ymd(paste(year, mm, dd))) %>%
  select(lon, lat, date, ps, t2m, t2m_max, t2m_min, ws2m, wd2m)

print(power_formatted)
## # A tibble: 3,653 × 9
## lon lat date ps t2m t2m_max t2m_min ws2m wd2m
## <dbl> <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -67.5 -26.5 2012-01-01 64.2 9.65 17.3 3.73 4.23 155.
## 2 -67.5 -26.5 2012-01-02 64.2 9.65 18.0 3.12 3.6 161.
## 3 -67.5 -26.5 2012-01-03 64.1 9.25 18.4 1.76 2.64 176.
## 4 -67.5 -26.5 2012-01-04 64.1 7.94 14.2 2.93 2.33 125
## 5 -67.5 -26.5 2012-01-05 64.0 7.28 14.6 1.13 1.7 137.
## 6 -67.5 -26.5 2012-01-06 63.9 7.21 14.2 2.13 2.61 170.
## 7 -67.5 -26.5 2012-01-07 63.9 9.18 16.6 1.9 1.44 172.
## 8 -67.5 -26.5 2012-01-08 63.9 11.4 19.2 4.14 1.36 157.
## 9 -67.5 -26.5 2012-01-09 63.8 12.4 20.6 3.73 1.28 184.
## 10 -67.5 -26.5 2012-01-10 63.7 10.6 16.8 3.51 1.16 177.
## # … with 3,643 more rows
#
Now that you’ve formatted the `date` column in `power_formatted`, you’re ready to use iteration to generate multiple plots.
The `plot_power_formatted()` function takes a year to plot as its only argument: `plot_power_formatted(year_to_plot)`.
First, practice running the function without any iteration. Use the chunk below to run the `plot_power_formatted()` function for the year 2020.
# Single, non-iterated call: draw the plot for 2020 only.
plot_power_formatted(year_to_plot = 2020)
Then, use `purrr::map()` to generate three different plots of the data. Plot the data for the years stored in the vector `years`: 2012, 2016, and 2020.
# Years to draw, one plot per element.
years <- c(2012, 2016, 2020)

# Call the plotting function once per year; map() collects the value returned
# for each year into a list, in the same order as `years`.
yrs_gp <- map(years, plot_power_formatted)
For your extension, consider exploring a different iteration task (it can be unrelated to the `nasa_power` dataset). Alternatively, try embellishing or changing the plot in the `plot_power_formatted()` function.
Recall that you still have access to `precip.xlsx` if you’d like to join tables for your extension.
library(parallel)
library(pbapply)
library(data.table)
library(dtplyr)
# data.table variant of the temperature plot: draws the three temperature
# series (t2m, t2m_max, t2m_min) of one calendar year as points joined by
# lines, coloured by series.
#
# Arguments:
#   year_to_plot  A single year. Rows of `power_formatted` whose `date` falls
#                 in that year are plotted.
#
# Returns: a ggplot object.
#
# Reads the global `power_formatted`, so that object must exist in whichever
# R session evaluates the function (including a parallel worker).
plot_power_formatted_dt <- function(year_to_plot) {
  # A longer vector would be recycled by `==` below and select the wrong rows,
  # so insist on exactly one year.
  stopifnot(length(year_to_plot) == 1L)

  # Attaches magrittr so `%>%` resolves below.
  # NOTE(review): presumably needed for fresh sessions such as cluster workers,
  # where no pipe-providing package is attached yet - confirm before removing.
  library(magrittr)

  power_formatted %>%
    # Keep only the requested year *before* reshaping, so melt() handles one
    # year of rows instead of the whole table. The surviving rows and their
    # order are the same as when filtering after the melt.
    dplyr::filter(lubridate::year(date) == year_to_plot) %>%
    data.table::as.data.table() %>%
    # Wide -> long: one row per (date, temperature variable).
    data.table::melt.data.table(
      measure.vars = c("t2m", "t2m_max", "t2m_min"),
      variable.name = "temp_var",
      value.name = "temp_value"
    ) %>%
    tibble::as_tibble() %>%
    ggplot2::ggplot(
      mapping = ggplot2::aes(date, temp_value, color = temp_var)
    ) +
    ggplot2::geom_point() +
    ggplot2::geom_line(mapping = ggplot2::aes(group = temp_var)) +
    ggplot2::labs(
      title = stringr::str_glue("Temperature metrics for the year {year_to_plot}"),
      x = "Date",
      y = "Temperature (C)",
      color = "Temperature variable"
    )
}
# Benchmark: build one plot per year (2012-2021) with several iteration
# strategies, each using the tibble-based and the data.table-based plotting
# function, on a single process and on a worker cluster.
#
# Benchmark labels follow "<method>.<tb|dt>_<single|multi>"; the plotting code
# further down splits them on "." and "_", so keep that shape when editing.
years <- 2012:2021

# Parallel back end. detectCores() may return NA, and "cores - 1" is 0 on a
# single-core machine; either value makes makeCluster() fail, so fall back to
# one worker in those cases.
cl <- parallel::makeCluster(
  max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
)

# Copy the functions and data the workers need into their global environments.
# `varlist` is documented as a character vector of object names.
parallel::clusterExport(
  cl,
  varlist = c(
    "plot_power_formatted",
    "plot_power_formatted_dt",
    "years",
    "power_formatted"
  )
)

set.seed(342131)

bench_out <- tryCatch(
  microbenchmark::microbenchmark(
    # purrr::map, tibble / data.table
    map.tb_single = map(.x = years, .f = plot_power_formatted),
    map.dt_single = map(.x = years, .f = plot_power_formatted_dt),
    # base sapply, tibble / data.table
    sapply.tb_single = sapply(
      X = years, FUN = function(x) {
        plot_power_formatted(x)
      }
    ),
    sapply.dt_single = sapply(
      X = years, FUN = function(x) {
        plot_power_formatted_dt(x)
      }
    ),
    # sapply on the cluster, tibble / data.table
    sapply.tb_multi = parallel::parSapply(
      cl = cl, X = years, FUN = function(x) {
        plot_power_formatted(x)
      }
    ),
    sapply.dt_multi = parallel::parSapply(
      cl = cl, X = years, FUN = function(x) {
        plot_power_formatted_dt(x)
      }
    ),
    # pbapply on a single process, tibble / data.table
    pbsapply.tb_single = pbapply::pbsapply(
      X = years, FUN = function(x) {
        plot_power_formatted(x)
      }
    ),
    pbsapply.dt_single = pbapply::pbsapply(
      X = years, FUN = function(x) {
        plot_power_formatted_dt(x)
      }
    ),
    # pbapply on the cluster, tibble / data.table
    pbsapply.tb_multi = pbapply::pbsapply(
      cl = cl, X = years, FUN = function(x) {
        plot_power_formatted(x)
      }
    ),
    pbsapply.dt_multi = pbapply::pbsapply(
      cl = cl, X = years, FUN = function(x) {
        plot_power_formatted_dt(x)
      }
    ),
    # plain for loop, tibble / data.table
    for_loop.tb_single = for (i in years) plot_power_formatted(i),
    for_loop.dt_single = for (i in years) plot_power_formatted_dt(i),
    times = 100
  ),
  # Shut the workers down even when a benchmarked expression errors, so no
  # orphaned R processes are left behind.
  finally = parallel::stopCluster(cl)
)
library(ggrepel)
# Violin plot of every individual timing: one violin per iteration method and
# data structure, one panel per value of the `cores` tag.
bench_out %>%
  mutate(expr = as.character(expr)) %>%
  data.frame() %>%
  # Benchmark labels look like "<method>.<tb|dt>_<single|multi>"; peel the
  # three parts off into their own columns.
  separate(
    col = expr,
    into = c("method", "data_structure"),
    sep = "\\."
  ) %>%
  separate(
    col = data_structure,
    into = c("data_structure", "cores"),
    sep = "_"
  ) %>%
  mutate(
    data_structure = case_when(
      data_structure == "dt" ~ "data.table",
      data_structure == "tb" ~ "tibble"
    )
  ) %>%
  arrange(time) %>%
  # `time` is divided by 1e6 before plotting.
  ggplot(aes(x = method, y = time / 1e6, fill = data_structure)) +
  geom_violin() +
  scale_y_continuous(limits = c(0, NA)) +
  facet_wrap(~cores) +
  theme_light() +
  theme(legend.position = "bottom") +
  labs(
    title = "Benchmark - Running 2012:2021 graph function",
    y = "time",
    fill = "data structure:",
    caption = "Notes:small time is better"
  )
# Bar chart of the median timing per iteration method, data structure and
# `cores` tag, with each bar's value printed in a repelled label.
bench_out %>%
  mutate(expr = as.character(expr)) %>%
  data.frame() %>%
  # Benchmark labels look like "<method>.<tb|dt>_<single|multi>"; peel the
  # three parts off into their own columns.
  separate(
    col = expr,
    into = c("method", "data_structure"),
    sep = "\\."
  ) %>%
  separate(
    col = data_structure,
    into = c("data_structure", "cores"),
    sep = "_"
  ) %>%
  mutate(
    data_structure = case_when(
      data_structure == "dt" ~ "data.table",
      data_structure == "tb" ~ "tibble"
    )
  ) %>%
  group_by(method, data_structure, cores) %>%
  # Drop the grouping explicitly: otherwise the result stays grouped by
  # method/data_structure and summarize() emits a message about it.
  summarize(time = median(time), .groups = "drop") %>%
  arrange(time) %>%
  # `time` is divided by 1e6 before plotting and labelling.
  ggplot(aes(x = method, y = time / 1e6, fill = data_structure)) +
  scale_y_continuous(limits = c(0, NA)) +
  # geom_col() is the dedicated geom for bars whose height is already in the
  # data (same as geom_bar(stat = "identity")).
  geom_col(position = "dodge") +
  geom_label_repel(
    aes(label = signif(time / 1e6, digits = 3)),
    show.legend = FALSE
  ) +
  facet_wrap(~cores) +
  theme_light() +
  theme(legend.position = "bottom") +
  labs(
    caption = "Notes:small time is better",
    fill = "data structure:",
    title = "Benchmark - Running 2012:2021 graph function",
    y = "time"
  )