Iteration and dates

In this milestone, you’ll start by creating a new date variable using existing columns in the nasa_power dataset. Then, you’ll write code to iterate over a custom plotting function.

Recreation

For this recreation, you will use a custom plotting function called plot_power_formatted() that we’ve provided in the setup chunk at the top of this page. Make sure that chunk has been run before proceeding with this milestone.

You don’t need to worry about how writing functions works, but you should notice that most of the code is reused from Milestone 5, where you pivoted temperature variables to plot them.

If you’d like to learn about writing custom functions, take a look at Chapter 19 of R For Data Science.

Part one: Add a date column

Before using plot_power_formatted() to generate plots, you’ll need to add a date column to the nasa_power dataset, and save the result as an object called power_formatted.

Your power_formatted table should match the following output:

# Read the reference table stored in data/solution_07.csv; its printed form is
# the target that power_formatted is expected to match.
readr::read_csv("data/solution_07.csv")
## # A tibble: 3,653 × 9
##      lon   lat date          ps   t2m t2m_max t2m_min  ws2m  wd2m
##    <dbl> <dbl> <date>     <dbl> <dbl>   <dbl>   <dbl> <dbl> <dbl>
##  1 -67.5 -26.5 2012-01-01  64.2  9.65    17.3    3.73  4.23  155.
##  2 -67.5 -26.5 2012-01-02  64.2  9.65    18.0    3.12  3.6   161.
##  3 -67.5 -26.5 2012-01-03  64.1  9.25    18.4    1.76  2.64  176.
##  4 -67.5 -26.5 2012-01-04  64.1  7.94    14.2    2.93  2.33  125 
##  5 -67.5 -26.5 2012-01-05  64.0  7.28    14.6    1.13  1.7   137.
##  6 -67.5 -26.5 2012-01-06  63.9  7.21    14.2    2.13  2.61  170.
##  7 -67.5 -26.5 2012-01-07  63.9  9.18    16.6    1.9   1.44  172.
##  8 -67.5 -26.5 2012-01-08  63.9 11.4     19.2    4.14  1.36  157.
##  9 -67.5 -26.5 2012-01-09  63.8 12.4     20.6    3.73  1.28  184.
## 10 -67.5 -26.5 2012-01-10  63.7 10.6     16.8    3.51  1.16  177.
## # … with 3,643 more rows

Write your code in the chunk below. Remember to save this table as an object called power_formatted, as this will be necessary for using the custom function in part two.

# Read the raw table, build a Date column from the separate year / mm / dd
# columns, and keep only the columns used by the plotting function.
power_formatted <- readr::read_csv(
  here::here("data", "power.csv"),
  show_col_types = FALSE # spelled out: `F` is an ordinary, reassignable variable
) %>%
  # paste() joins the three parts with spaces; ymd() parses that into a Date
  mutate(date = ymd(paste(year, mm, dd))) %>%
  select(lon, lat, date, ps, t2m, t2m_max, t2m_min, ws2m, wd2m)

print(power_formatted)
## # A tibble: 3,653 × 9
##      lon   lat date          ps   t2m t2m_max t2m_min  ws2m  wd2m
##    <dbl> <dbl> <date>     <dbl> <dbl>   <dbl>   <dbl> <dbl> <dbl>
##  1 -67.5 -26.5 2012-01-01  64.2  9.65    17.3    3.73  4.23  155.
##  2 -67.5 -26.5 2012-01-02  64.2  9.65    18.0    3.12  3.6   161.
##  3 -67.5 -26.5 2012-01-03  64.1  9.25    18.4    1.76  2.64  176.
##  4 -67.5 -26.5 2012-01-04  64.1  7.94    14.2    2.93  2.33  125 
##  5 -67.5 -26.5 2012-01-05  64.0  7.28    14.6    1.13  1.7   137.
##  6 -67.5 -26.5 2012-01-06  63.9  7.21    14.2    2.13  2.61  170.
##  7 -67.5 -26.5 2012-01-07  63.9  9.18    16.6    1.9   1.44  172.
##  8 -67.5 -26.5 2012-01-08  63.9 11.4     19.2    4.14  1.36  157.
##  9 -67.5 -26.5 2012-01-09  63.8 12.4     20.6    3.73  1.28  184.
## 10 -67.5 -26.5 2012-01-10  63.7 10.6     16.8    3.51  1.16  177.
## # … with 3,643 more rows
# 

Part two: Iterate

Now that you’ve formatted the date column in power_formatted, you’re ready to use iteration to generate multiple plots.

The plot_power_formatted() function takes a year to plot as its only argument: plot_power_formatted(year_to_plot).

First, practice running the function without any iteration. Use the chunk below to run the plot_power_formatted() function for the year 2020.

# Single, non-iterated call: draw the plot for the year 2020 only.
plot_power_formatted(year_to_plot = 2020)

Then, use purrr::map() to generate three different plots of the data. Plot the data for the years stored in the vector years: 2012, 2016, and 2020.

# Years to iterate over: one plot per element.
years <- c(2012, 2016, 2020)

# Apply the plotting function to each year in turn; map() returns the three
# results together as a list.
yrs_gp <- map(years, plot_power_formatted)

Extension

For your extension, consider exploring a different iteration task (it can be unrelated to the nasa_power dataset). Alternatively, try embellishing or changing the plot in the plot_power_formatted() function.

Recall that you still have access to precip.xlsx if you’d like to join tables for your extension.

library(parallel)
library(pbapply)
library(data.table)
library(dtplyr)

#' Plot one year of temperature metrics, reshaping with data.table
#'
#' Keeps the rows of `data` whose `date` falls in `year_to_plot`, melts the
#' three temperature columns into long form with data.table, and draws them as
#' points joined by lines, one colour per temperature variable.
#'
#' @param year_to_plot A single numeric year to plot.
#' @param data Table holding a `date` column plus `t2m`, `t2m_max` and
#'   `t2m_min`. Defaults to the global `power_formatted`, so existing
#'   one-argument calls keep working.
#' @return A ggplot object.
plot_power_formatted_dt <- function(year_to_plot, data = power_formatted) {
  # A longer vector would be recycled silently inside `==` below.
  stopifnot(is.numeric(year_to_plot), length(year_to_plot) == 1L)

  # Filter first so only one year of rows is melted instead of the full table.
  one_year <- dplyr::filter(data, lubridate::year(date) == year_to_plot)

  long <- data.table::melt.data.table(
    data.table::as.data.table(one_year),
    measure.vars = c("t2m", "t2m_max", "t2m_min"),
    variable.name = "temp_var",
    value.name = "temp_value"
  )

  # Every call is namespace-qualified and no pipe is used, so the function does
  # not need to attach any package (no library() side effect on the caller).
  ggplot2::ggplot(
    tibble::as_tibble(long),
    mapping = ggplot2::aes(date, temp_value, color = temp_var)
  ) +
    ggplot2::geom_point() +
    ggplot2::geom_line(mapping = ggplot2::aes(group = temp_var)) +
    ggplot2::labs(
      title = stringr::str_glue("Temperature metrics for the year {year_to_plot}"),
      x = "Date",
      y = "Temperature (C)",
      color = "Temperature variable"
    )
}

# Benchmark: time several ways of iterating the two plotting functions over
# ten years. Expression names follow "<method>.<tb|dt>_<single|multi>"; keep
# that pattern, since the plots further down split the names on "." and "_".
years <- 2012:2021

# Parallel computing
# Leave one core free, but never ask for fewer than one worker:
# detectCores() may return 1 (giving 0 workers) or NA.
n_workers <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
cl <- parallel::makeCluster(n_workers)

# Copy the functions and data the workers need into each worker session.
# NOTE(review): workers start as fresh R sessions; if plot_power_formatted()
# relies on attached packages they must also be loaded there (clusterEvalQ) --
# confirm against its definition in the setup chunk.
parallel::clusterExport(
  cl,
  varlist = c(
    "plot_power_formatted", "plot_power_formatted_dt",
    "years", "power_formatted"
  )
)

# microbenchmark runs the expressions in random order by default; the seed
# makes that order reproducible.
set.seed(342131)

# tryCatch(finally = ) guarantees the workers are shut down even when one of
# the benchmarked expressions throws.
bench_out <- tryCatch(
  microbenchmark::microbenchmark(

    # purrr::map
    map.tb_single = map(.x = years, .f = plot_power_formatted),
    map.dt_single = map(.x = years, .f = plot_power_formatted_dt),

    # base sapply (kept as sapply on purpose: it is the thing being timed)
    sapply.tb_single = sapply(
      X = years, FUN = function(x) plot_power_formatted(x)
    ),
    sapply.dt_single = sapply(
      X = years, FUN = function(x) plot_power_formatted_dt(x)
    ),

    # sapply with parallel computing
    sapply.tb_multi = parallel::parSapply(
      cl = cl, X = years, FUN = function(x) plot_power_formatted(x)
    ),
    sapply.dt_multi = parallel::parSapply(
      cl = cl, X = years, FUN = function(x) plot_power_formatted_dt(x)
    ),

    # pbapply (progress-bar sapply), single process
    pbsapply.tb_single = pbapply::pbsapply(
      X = years, FUN = function(x) plot_power_formatted(x)
    ),
    pbsapply.dt_single = pbapply::pbsapply(
      X = years, FUN = function(x) plot_power_formatted_dt(x)
    ),

    # pbapply on the cluster
    pbsapply.tb_multi = pbapply::pbsapply(
      cl = cl, X = years, FUN = function(x) plot_power_formatted(x)
    ),
    pbsapply.dt_multi = pbapply::pbsapply(
      cl = cl, X = years, FUN = function(x) plot_power_formatted_dt(x)
    ),

    # plain for loops
    for_loop.tb_single = for (i in years) plot_power_formatted(i),
    for_loop.dt_single = for (i in years) plot_power_formatted_dt(i),

    times = 100
  ),
  finally = parallel::stopCluster(cl)
)
library(ggrepel)

# Distribution of the individual timings per iteration method, one panel per
# core setting. Expression names look like "map.tb_single": the part before
# "." is the method, then the data structure, then the core setting.
bench_out %>%
  as.data.frame() %>%
  mutate(expr = as.character(expr)) %>%
  separate(col = expr, into = c("method", "data_structure"), sep = "\\.") %>%
  separate(
    col = data_structure,
    into = c("data_structure", "cores"),
    sep = "_"
  ) %>%
  mutate(
    data_structure = case_when(
      data_structure == "dt" ~ "data.table",
      data_structure == "tb" ~ "tibble"
    )
  ) %>%
  # microbenchmark records time in nanoseconds; / 1e6 converts to milliseconds
  ggplot(aes(x = method, y = time / 1e6, fill = data_structure)) +
  geom_violin() +
  scale_y_continuous(limits = c(0, NA)) +
  facet_wrap(~cores) +
  theme_light() +
  theme(legend.position = "bottom") +
  labs(
    caption = "Notes:small time is better",
    fill = "data structure:",
    title = "Benchmark - Running 2012:2021 graph function",
    y = "time (ms)" # state the unit actually plotted
  )

# annotation_logticks(sides = "b")


# Median timing per method / data structure / core setting, drawn as dodged
# bars with the value printed on each bar.
bench_out %>%
  as.data.frame() %>%
  mutate(expr = as.character(expr)) %>%
  separate(col = expr, into = c("method", "data_structure"), sep = "\\.") %>%
  separate(
    col = data_structure,
    into = c("data_structure", "cores"),
    sep = "_"
  ) %>%
  mutate(
    data_structure = case_when(
      data_structure == "dt" ~ "data.table",
      data_structure == "tb" ~ "tibble"
    )
  ) %>%
  group_by(method, data_structure, cores) %>%
  # .groups = "drop" returns an ungrouped table and silences the regrouping
  # message summarize() would otherwise print
  summarize(time = median(time), .groups = "drop") %>%
  # microbenchmark records time in nanoseconds; / 1e6 converts to milliseconds
  ggplot(aes(x = method, y = time / 1e6, fill = data_structure)) +
  scale_y_continuous(limits = c(0, NA)) +
  geom_col(position = position_dodge(width = 0.9)) +
  # dodge the labels by the same width as the bars so each sits on its own bar
  geom_label_repel(
    aes(label = signif(time / 1e6, digits = 3)),
    position = position_dodge(width = 0.9),
    show.legend = FALSE
  ) +
  facet_wrap(~cores) +
  theme_light() +
  theme(legend.position = "bottom") +
  labs(
    caption = "Notes:small time is better",
    fill = "data structure:",
    title = "Benchmark - Running 2012:2021 graph function",
    y = "time (ms)" # state the unit actually plotted
  )