Carpentries github page: https://jtvanleuven.github.io/2020-08-10-CAES/

Bootcamp overview page: https://caesenergy.org/remote-summer-boot-camp-2-computing-data-visualization/#agenda

Emergency use etherpad: https://pad.carpentries.org/08-10-2020-CAES

Carpentries workshops survey (one survey for all workshops):

https://carpentries.typeform.com/to/UgVdRQ?slug=2020-08-10-CAES

Part 1: Review of R and RStudio IDE

#panels in RStudio

#rprojects
##open new project
##start new script
##integrate with git

# Arithmetic: type an expression and R prints the result
1 + 1
## [1] 2
# Function calls: with a single argument, round() drops the decimals here
round(3.14)
## [1] 3
# Arguments can be passed by name; digits = 1 keeps one decimal place
round(x = 3.14, digits = 1)
## [1] 3.1
# Object assignment with `<-`
## remember keyboard shortcuts
some_name <- 1
# Assigning again replaces the old value; the right-hand side can be any expression
some_name <- 1 + 1 + round(x = 3.14, digits = 1)

# Comparisons give back a logical value
some_name > 10
## [1] FALSE
  #more complex object assignment
# Build a small data frame: one named column per vector, five rows each.
# NOTE(review): mean(3.5, 3.6, 4.0, 2.0) does NOT average the four numbers.
# mean() averages only its first argument; the remaining values are taken as
# other arguments, and the call evaluates to 3.5 (the gpa printed for "job??"
# below). Averaging all four would be mean(c(3.5, 3.6, 4.0, 2.0)), which gives
# a different value than the outputs recorded in these notes, so the call is
# left unchanged here.
some_name <- data.frame(
  first_vec = c("fresh", "soph", "junior", "senior", "job??"),
  year = c(2004, 2005, 2006, 2007, 2008),
  gpa = c(2.5, 3.0, 3.5, 4.0, mean(3.5, 3.6, 4.0, 2.0)),
  # FALSE is spelled out: `F` is an ordinary variable that can be reassigned
  stringsAsFactors = FALSE
)
# Typing the object's name prints it
some_name
##   first_vec year gpa
## 1     fresh 2004 2.5
## 2      soph 2005 3.0
## 3    junior 2006 3.5
## 4    senior 2007 4.0
## 5     job?? 2008 3.5
#View(some_name)
# head() with n = 1 shows only the first row
head(some_name, 1)
##   first_vec year gpa
## 1     fresh 2004 2.5
# Quick base-graphics scatter plot of two columns
plot(x = some_name$year, y = some_name$gpa)

# Characteristics of objects
str(some_name)  ## see also class(), typeof()
## 'data.frame':    5 obs. of  3 variables:
##  $ first_vec: chr  "fresh" "soph" "junior" "senior" ...
##  $ year     : num  2004 2005 2006 2007 2008
##  $ gpa      : num  2.5 3 3.5 4 3.5
# Common types:
#   character
#   number
#   logical
#   factor

# In a data.frame, all elements of a vector (column) must be the same type
some_name$gpa[4]
## [1] 4
# Putting text into a numeric column turns the whole column into character
some_name$gpa[4] <- "fail"
str(some_name)  # see how we changed the type of the "gpa" vector
## 'data.frame':    5 obs. of  3 variables:
##  $ first_vec: chr  "fresh" "soph" "junior" "senior" ...
##  $ year     : num  2004 2005 2006 2007 2008
##  $ gpa      : chr  "2.5" "3" "3.5" "fail" ...
# Restoring the value as text still leaves the column as character
some_name$gpa[4] <- "4.0"
str(some_name)
## 'data.frame':    5 obs. of  3 variables:
##  $ first_vec: chr  "fresh" "soph" "junior" "senior" ...
##  $ year     : num  2004 2005 2006 2007 2008
##  $ gpa      : chr  "2.5" "3" "3.5" "4.0" ...
# Convert the column back to numeric explicitly
some_name$gpa <- as.numeric(some_name$gpa)  ## change type
str(some_name)
## 'data.frame':    5 obs. of  3 variables:
##  $ first_vec: chr  "fresh" "soph" "junior" "senior" ...
##  $ year     : num  2004 2005 2006 2007 2008
##  $ gpa      : num  2.5 3 3.5 4 3.5
#install.packages('ggplot2')
#load packages
library(ggplot2)

# Base-graphics scatter plot, for comparison with ggplot below
plot(x = some_name$year, y = some_name$gpa)

# Use ggplot to plot first_vec vs. gpa.
# Do you notice anything wrong with this plot?
ggplot(data = some_name, aes(x = first_vec, y = gpa)) + geom_point()

### The order of x is all messed up. We'll work on this later.

# Summary statistics for a single numeric column
summary(some_name$gpa)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.5     3.0     3.5     3.3     3.5     4.0
# Fit a linear model of gpa against year and print the fit summary
mod <- lm(gpa ~ year, data = some_name)
summary(mod)
## 
## Call:
## lm(formula = gpa ~ year, data = some_name)
## 
## Residuals:
##          1          2          3          4          5 
## -2.000e-01  8.368e-15  2.000e-01  4.000e-01 -4.000e-01 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -598.5000   231.6330  -2.584   0.0815 .
## year           0.3000     0.1155   2.598   0.0805 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3651 on 3 degrees of freedom
## Multiple R-squared:  0.6923, Adjusted R-squared:  0.5897 
## F-statistic:  6.75 on 1 and 3 DF,  p-value: 0.08051
# Export data (left commented out so the notes do not write files)
#write.csv(some_name, file = "dummy_data.csv", row.names = FALSE, quote = FALSE)

# Open up data directly from the web
### copy this link into zoom chat for people to copy
# FALSE is spelled out: `F` is an ordinary variable that can be reassigned
gapminder <- read.csv(
  "https://raw.githubusercontent.com/datacarpentry/r-intro-geospatial/master/_episodes_rmd/data/gapminder_data.csv",
  stringsAsFactors = FALSE
)
str(gapminder)
## 'data.frame':    1704 obs. of  6 variables:
##  $ country  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ continent: chr  "Asia" "Asia" "Asia" "Asia" ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
# Exercise: What year has the lowest life expectancy? Hint: try plotting.
# What is the r2 value between year and life expectancy?
plot(x = gapminder$year, y = gapminder$lifeExp)

# The data frame is passed by name (data =) rather than by position
mod2 <- lm(lifeExp ~ year, data = gapminder)
summary(mod2)
## 
## Call:
## lm(formula = lifeExp ~ year, data = gapminder)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.949  -9.651   1.697  10.335  22.158 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -585.65219   32.31396  -18.12   <2e-16 ***
## year           0.32590    0.01632   19.96   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.63 on 1702 degrees of freedom
## Multiple R-squared:  0.1898, Adjusted R-squared:  0.1893 
## F-statistic: 398.6 on 1 and 1702 DF,  p-value: < 2.2e-16
# Exercise: Practice writing and reading data using the gapminder dataset.
# Put the data into the "data" folder.
#write.csv(gapminder, "data/gapminder.csv", row.names = FALSE, quote = FALSE)
#tmp <- read.csv("data/gapminder.csv")

Part 2: Plotting

http://swcarpentry.github.io/r-novice-gapminder/08-plot-ggplot2/index.html

library(ggplot2)

# Same scatter plot as in Part 1, with the x categories listed in the order
# they should appear. The stray empty `by=` argument that was inside aes() is
# removed: it mapped nothing.
ggplot(data = some_name, aes(x = first_vec, y = gpa)) +
  geom_point() +
  scale_x_discrete(limits = c("fresh", "soph", "junior", "senior", "job??"))  ## just a vector

# Reload the gapminder data so Part 2 can be run on its own.
# FALSE is spelled out: `F` is an ordinary variable that can be reassigned
gapminder <- read.csv(
  "https://raw.githubusercontent.com/datacarpentry/r-intro-geospatial/master/_episodes_rmd/data/gapminder_data.csv",
  stringsAsFactors = FALSE
)


# Scatter plot of life expectancy against GDP per capita
ggplot(data = gapminder, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

# Understanding ggplot layers
ggplot()  # empty plotting layer

# Empty plotting layer with correct scales
ggplot(data = gapminder, mapping = aes(x = gdpPercap, y = lifeExp))

# Adding a geom layer draws the points
ggplot(data = gapminder, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

# Data and mapping can also be given to the layer itself;
# can be really useful when plotting multiple things at once
ggplot() +
  geom_point(data = gapminder, mapping = aes(x = gdpPercap, y = lifeExp))

## Exercise: modify the plot to show life expectancy over time
## while you're at it, color the dots by continent
ggplot(data = gapminder, mapping = aes(x = year, y = lifeExp, color = continent)) +
  geom_point()

# Need to make it more informative. Make a line plot with one line per country.
# `group` is the ggplot2 aesthetic for this; the earlier `by=country` is not
# a ggplot2 aesthetic name.
ggplot(data = gapminder, mapping = aes(x = year, y = lifeExp, color = continent, group = country)) +
  geom_line()

# Layers are added on top of the last layer
ggplot(data = gapminder, mapping = aes(x = year, y = lifeExp, color = continent, group = country)) +
  geom_line() +
  geom_point(color = "black")

# Fix up the plot with some edits: axis labels, a theme, and a color palette.
# Lines are drawn per country via the `group` aesthetic (replaces `by=country`,
# which is not a ggplot2 aesthetic name).
ggplot(data = gapminder, mapping = aes(x = year, y = lifeExp, color = continent, group = country)) +
  geom_line() +
  labs(x = "Year", y = "Life expectancy (years)") +
  theme_bw() +  ## lots of different pre-set themes
  scale_color_brewer(palette = "Dark2")  ## there are some nice colors https://www.datanovia.com/en/blog/ggplot-colors-best-tricks-you-will-love/

# Same plot with hand-picked colors and no legend title
ggplot(data = gapminder, mapping = aes(x = year, y = lifeExp, color = continent, group = country)) +
  geom_line() +
  labs(x = "Year", y = "Life expectancy (years)") +
  theme_bw() +  ## lots of different pre-set themes
  scale_color_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00")) +
  theme(legend.title = element_blank())  ## modify the legend

## Slopes look different between the continents. Find out if they really are.
# One row per continent, filled in by the loop below. Row names are set to the
# continent so rows can be addressed by name.
mod.table <- data.frame(
  continent = unique(gapminder$continent),
  slope = NA,
  r2 = NA,
  p = NA,
  stringsAsFactors = FALSE
)
row.names(mod.table) <- mod.table$continent
for (i in mod.table$continent) {
  # Fit lifeExp ~ year using only this continent's rows
  mod <- lm(lifeExp ~ year, data = gapminder[gapminder$continent == i, ])
  # Stored as `mod_summary` so the summary() function is not shadowed
  mod_summary <- summary(mod)
  # Coefficients are picked by row/column name rather than by position
  mod.table[i, "slope"] <- round(mod_summary$coefficients["year", "Estimate"], digits = 2)
  mod.table[i, "r2"] <- round(mod_summary$adj.r.squared, digits = 2)
  mod.table[i, "p"] <- formatC(mod_summary$coefficients["year", "Pr(>|t|)"], format = "e", digits = 2)
}
# Text label per continent, e.g. "<continent>, slope=<slope>, r2=<r2>"
mod.table$lab <- paste0(mod.table$continent, ", slope=", mod.table$slope, ", r2=", mod.table$r2)

# Look up each row's label by continent (mod.table rows are named by continent)
gapminder$lab <- mod.table[gapminder$continent, "lab"]
# One panel per label. Lines are drawn per country via the `group` aesthetic
# (replaces `by=country`, which is not a ggplot2 aesthetic name).
ggplot(data = gapminder, mapping = aes(x = year, y = lifeExp, color = continent, group = country)) +
  geom_line() +
  labs(x = "Year", y = "Life expectancy (years)") +
  theme_bw() +  ## lots of different pre-set themes
  scale_color_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00")) +
  facet_wrap(~ lab) +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 90),
    legend.position = "none"
  )

# Faceted line plot kept in `p` so it can be saved or combined later.
# Country lines use the `group` aesthetic (replaces `by=country`, which is not
# a ggplot2 aesthetic name). The stat_summary layer sets group = continent
# itself and draws the mean of lifeExp as a black line.
p <- ggplot(gapminder, aes(x = year, y = lifeExp, color = continent, group = country)) +
  geom_line() +
  labs(x = "Year", y = "Life expectancy (years)") +
  theme_bw() +  ## lots of different pre-set themes
  scale_color_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00")) +
  facet_wrap(~ lab) +
  stat_summary(data = gapminder, aes(group = continent), fun = mean, geom = "line", size = 1, color = "black") +
  #geom_smooth(method='lm', se = F, aes(group=continent), color="grey") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 90),
    legend.position = "none"
  )
p

#ggsave(filename = "plots/lifeExp.pdf", plot = p, width = 6.5, height = 4.5, dpi = 300, units = "in")

Part 3: Plotting with extra packages

### There are many packages that can help you with plotting.
### Some that I find useful and that you may want to look at:
### * cowplot
### * ggExtra
### * heatmap2
### * gridExtra
### * RColorBrewer

#install.packages('ggExtra')
library(ggExtra)


# Exercise: subset the gapminder data to the years from 2000 onward
# to narrow down the data a bit
gapminder_2000 <- gapminder[gapminder$year >= 2000, ]

# Exercise: make a scatter plot comparing GDP to life expectancy
p2 <- ggplot(gapminder_2000, aes(x = gdpPercap, y = lifeExp, color = continent)) +
  geom_point() +
  labs(x = "Per capita GDP ($)", y = "Life expectancy (years)") +
  theme_bw() +  ## lots of different pre-set themes
  scale_color_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00")) +
  theme(
    legend.title = element_blank(),
    legend.position = "bottom"
  )
# Add marginal plots, colored and filled per group
ggMarginal(p2, groupColour = TRUE, groupFill = TRUE)


# Drop the Oceania rows and redo the scatter plot with marginals
gapminder_2000_nooceania <- gapminder_2000[gapminder_2000$continent != "Oceania", ]
p2 <- ggplot(gapminder_2000_nooceania, aes(x = gdpPercap, y = lifeExp, color = continent)) +
  geom_point() +
  labs(x = "Per capita GDP ($)", y = "Life expectancy (years)") +
  theme_bw() +  ## lots of different pre-set themes
  scale_color_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00")) +
  theme(
    legend.title = element_blank(),
    legend.position = "bottom"
  )
# Keep the combined plot in p3 so it can be reused, then display it
p3 <- ggMarginal(p2, groupColour = TRUE, groupFill = TRUE)
p3

#install.packages('cowplot')
library(cowplot)
# Place the faceted line plot (p) and the marginal scatter plot (p3) in one row
plot_grid(p, p3, nrow = 1)

#ggsave(filename = "plots/cowplot.pdf", plot = plot_grid(p, p3, nrow = 1), width = 12, height = 6, dpi = 300, units = "in")


#install.packages("gganimate")
library(gganimate)
library(gapminder)

# NOTE(review): `country_colors` is not defined anywhere in these notes;
# presumably it comes from the gapminder package loaded above -- confirm.
# `gapminder` below still resolves to the data frame read from the CSV earlier
# in this script, since objects in the workspace are found before package
# objects.
# Note that p3 is reused here and replaces the ggMarginal plot stored earlier.
p3 <- ggplot(gapminder, aes(gdpPercap, lifeExp, size = pop, colour = country)) +
  geom_point(alpha = 0.7, show.legend = FALSE) +
  scale_colour_manual(values = country_colors) +
  scale_size(range = c(2, 12)) +
  scale_x_log10() +
  facet_wrap(~ continent) +
  # Here comes the gganimate specific bits
  labs(title = "Year: {frame_time}", x = "GDP per capita", y = "life expectancy") +
  transition_time(year) +
  ease_aes("linear")

# Render the animation
animate(p3)

#anim_save("animate.gif", p3)
## may run into rendering issues

#install.packages('gifski')
#library(gifski)
# close and reopen RStudio
#animate(p3, duration = 5, fps = 20, width = 400, height = 400, renderer = gifski_renderer())
#anim_save("animate.gif", p3)

List of useful resources:

Intro to R for Geospatial Data: https://datacarpentry.org/r-intro-geospatial/

R for Data Science: https://r4ds.had.co.nz/