Import
From csv
library(tidyverse)
data <- read_csv("folder/filename.csv")
From a googlesheet
library(googlesheets)
# Look the sheet up by title, then read one worksheet into a data frame.
# (String literals need straight quotes; curly quotes are a syntax error.)
sheet <- gs_title("Google sheet name")
data <- sheet %>% gs_read(ws = "Worksheet name")
From a table on a webpage
library(rvest)
url <- read_html('url')
data_raw <- url %>%
html_node("table") %>%
html_table(fill=TRUE)
From a PDF
library(SPARQL) # SPARQL querying package
library(ggplot2)
# Step 1 - Set up preliminaries and define query
# Define the data.gov endpoint
endpoint <- "http://services.data.gov/sparql"
# create query statement
query <-
"PREFIX dgp1187: <http://data-gov.tw.rpi.edu/vocab/p/1187/>
SELECT ?ye ?fi ?ac
WHERE {
?s dgp1187:year ?ye .
?s dgp1187:fires ?fi .
?s dgp1187:acres ?ac .
}"
# Step 2 - Use SPARQL package to submit query and save results to a data frame
qd <- SPARQL(endpoint,query)
df <- qd$results
From multiple csvs
- Create a vector of all the datasets
all_datasets <- tibble("file" = c('dataset_name_1', 'dataset_name_2'))
- Create a function to import one dataset
# Read one csv, given its file name without the ".csv" extension.
# df: a single dataset name (character); returns the tibble from read_csv().
import <- function(df){
  read_csv(str_c(df, ".csv"))
}
- Use purrr::map() to run the function on (each element) of all the datasets
all_data <- all_datasets %>% mutate(dataset = purrr::map(file, import))
- Run a ‘for loop’ to get R to assign each tibble to an object
# Create one object per dataset, named after its entry in the file column.
# seq_len() gives an empty sequence when all_data has no rows (1:0 would loop twice).
for (i in seq_len(nrow(all_data))) {
  assign(all_data$file[[i]], all_data$dataset[[i]])
}
From multiple webpages
- Create a vector with the URL paths needed, eg:
# Example values only: replace with the unique part of each URL.
# (An assignment followed only by a comment continues onto the next line,
# so url_unique would silently receive the value of url_base.)
url_unique <- seq(1, 10)
url_base <- "https://abc"
url_end <- "/abc.html"
# The dot places url_unique between the base and the end of each path
url_path <- url_unique %>% str_c(url_base, . , url_end)
data <- tibble(url_path)
- Define a function to scrape (and clean) data from one url
# Scrape the first <table> found at one url and return it as a data frame.
scrape <- function(url){
  read_html(url) %>%
    html_node("table") %>%
    html_table(fill = TRUE)
}
- (if needed) Prevent error messages from URL paths that don’t exist
# Pull the result element out of one list (eg the list returned by a safely()-wrapped function)
extract_results <- function(df) df$result
- Loop scrape function over all URLs as nested tibbles
data <- data %>%
mutate(new_column = purrr::map(url_path, safely(scrape)),
new_column = map(new_column, extract_results))
Save as csv
write_csv(dataset, "filename.csv")
Tidy
Inspect
Get list of all column headings in a dataset
colnames(dataset)
Get the number of rows and columns in a dataset
# dim() returns c(number of rows, number of columns); nrow() and ncol() give each separately
dim(dataset)
Get the breakdown of results within a column
table(dataset$column)
Get the number of values in a column
summary(dataset$column)
Get the number of NA values in a column
sum(is.na(dataset$column))
tidyr
Wide –> skinny
gather(dataset, "new_column_name1", "new_column_name2", column range eg '2:8')
Skinny –> wide
# key_column supplies the new column names, value_column supplies their values
spread(dataset, key_column, value_column)
Combine datasets by a common column
merge(dataset1, dataset2, by = "common column name")
Combine datasets by attaching rows
# bind_rows() stacks the rows of dataset2 under dataset1, matching columns by name
# (append() on data frames returns a list of columns instead)
bind_rows(dataset1, dataset2)
Create a new column
mutate(old_variable, new_variable = (calculation))
Rename a column
# Column names with spaces must be wrapped in backticks, eg `new name` = `old name`
rename(dataset, new_name = old_name)
Rename categories within a column
# Each pair is "new name" = "old name"; add as many pairs as needed
data %>%
  mutate(variable = fct_recode(variable, "new name" = "old name",
                               "new name" = "old name"))
Alternatives (more for correcting typos like Find & Replace): dataset$column[dataset$column == "old"] <- "new"
"new_name" <- length(grep("existing_label", dataset$column))
Drop column(s) / row(s)
subset(dataset, select = -c(columns))
subset(dataset, column != "")
Remove NAs
na.omit(dataset)
Arrange a column in descending order
# desc() reverses the sort order of the column
arrange(dataset, desc(column))
stringr
Filter a column if contains text string
# Returns TRUE/FALSE for each value of variable that contains the text string
grepl("text string", variable)
Remove symbols, eg % sign
dataset$column <- gsub("\\%","", dataset$column)
Convert character column to numeric
dataset$column <- as.numeric(as.character(dataset$column))
Round numbers
mutate(variable = round(variable))
Clean column heading names (to lowercase with underscores)
clean_names()
Create a quick pivot table to summarise variables
tabyl(variable1, variable2) %>%
adorn_*
Remove empty rows and columns
remove_empty()
Fix dates stored as serial numbers
excel_numeric_to_date()
Visualise 🤩
Basics
Classic code structure
# Template: swap viz_function for a geom (eg geom_col) and
# scale_x/y_discrete/continuous for one scale function (eg scale_y_continuous)
ggplot(dataset) +
  viz_function(aes(x = variable,
                   y = variable,
                   group = variable,
                   size = variable,
                   fill = variable)) +
  scale_x/y_discrete/continuous(breaks = c(),
                                limits = c(),
                                labels = c()) +
  labs(title = "...",
       subtitle = "...",
       x = "...",
       y = "...",
       caption = "...") +
  theme()

# ggsave() is a separate call, not a layer: by default it saves the last plot displayed
ggsave("name.filetype", height = , width = , units = "")
Additional layers
geom_hline()
geom_vline()
geom_segment()
geom_text()
geom_rect()
Transform data within a ggplot function
plot1 <- ggplot(filter(dataset, variable == criteria))
Legends
Reorder
- First create a factor:
# levels = sets the order in which the items will appear in the legend
dataset$newcolumn <- factor(dataset$column, levels = c("item1", "item2"), ordered = TRUE)
- Then in ggplot() aesthetic mappings, use this new column, rather than the pre-existing one.
Remove
theme(legend.position = "none")
Axes
Arrange in descending order
# reorder() sorts variable1 by variable2; the minus sign makes the order descending
ggplot(dataset, aes(x = reorder(variable1, -variable2), y = variable2))
Text
Remove / style axes labels
# Used inside theme(); element_text() styles the font with face = (eg "bold", "italic")
axis.title.x/y = element_blank() or element_text(size = , family = , face = )
axis.text.x/y = element_blank() or element_text(size = , family = , face = )
Wrap long labels
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))
Rotate labels 45 degrees
theme(axis.text.x=element_text(angle=45))
Themes
From ggplot
theme_gray() – signature ggplot2 theme
theme_bw() – dark on light ggplot2 theme
theme_linedraw() – uses black lines on white backgrounds only
theme_light() – similar to linedraw() but with grey lines as well
theme_dark() – lines on a dark background instead of light
theme_minimal() – no background annotations, minimal feel.
theme_classic() – theme with no grid lines
theme_void() – empty theme with no elements
From ggthemes
theme_solid() - Theme with nothing other than a background color
theme_map() - Clean theme for maps
theme_igray() - Inverse gray theme
theme_economist()
theme_fivethirtyeight()
theme_wsj()
theme_few()
theme_tufte()
theme_excel()
theme_gdocs()
theme_stata()
theme_solarized()
theme_hc() - Highcharts JS theme
Interactive ggplots
- ggiraph
library(ggiraph)
# The tooltip aesthetic is picked up by the geom_*_interactive() layers
plot <- ggplot(data, aes(x = , y = , tooltip = variable)) +
  geom_point_interactive() # or any other geom_*_interactive()
# Render the ggplot as an interactive widget
girafe(ggobj = plot)
- plotly
library(plotly)
ggplotly(plot, tooltip = variable)
highcharter
htmlwidgets
RShiny
To make a shiny app, you need to create script with a filename ending “app.R”
The basic structure of an R Shiny app:
library(shiny)
ui <- fluidPage()
input() functions, eg sliderInput()
output() functions, eg plotOutput()
server <- function(input, output) { }
output$xxxx <- render*(), eg renderPlot()
shinyApp(ui = ui, server = server)
Geographical maps
Introduction
There seems to be lots of different ways you can make maps in R: partly because the best approach will depend on what type of map you’re making; and partly just because there’s lots of competing packages available.
A note on shapefiles
Examples
- Create a basic map of the world (minus Antarctica)
library(maps)
library(ggplot2)
library(dplyr) # provides %>% and filter(), which this example uses
# World polygons minus Antarctica, drawn with a fixed aspect ratio and no axes
world_map <- map_data("world") %>%
  filter(region != "Antarctica") %>%
  ggplot(aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "light blue") +
  coord_fixed() +
  theme_void()
world_map
- Create a basic map of a country, eg UK
library(maps)
library(ggplot2)
library(dplyr) # provides %>%, which this example uses
# Polygons for the UK only, drawn with a fixed aspect ratio and no axes
UK_map <- map_data("world", region = "UK") %>%
  ggplot(aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "dark blue") +
  coord_fixed() +
  theme_void()
UK_map
- Create a detailed map of a country, eg counties within UK Using sf and ggplot packages as per the following tutorials:
library(maps)
library(tidyverse)
library(sf)
UK_map <- map_data("world", region = "UK") %>%
ggplot(aes(x = long, y = lat, group = group)) +
geom_polygon(fill = "dark blue") +
coord_fixed() +
theme_void()
UK_map
Tile grid maps
Package for creating tile grid maps for regions including:
Related: countrycode package
Model
“All models are wrong, but some are useful”
The goal of a model is not to uncover truth, but to discover a simple approximation that is useful (in order to understand the behaviour of something).
- linear model takes the general form:
y = a_0 + a_1 * x
where a_0 is the intercept and a_1 is the slope
(NB this is another way of expressing y = mx + c)
Linear models also assume that the ‘residuals’ (difference between observed and predicted values) have a normal distribution.
lm
or, how to apply a linear model to data
- Create a model (a type of list) with the function
lm(), where x_variable is the independent ‘cause’ and y_variable is the dependent ‘consequence’:
# Formula reads "y_variable explained by x_variable"; data = names the data frame holding both
model_name <- lm(y_variable ~ x_variable, data = dataset)
- Create a ‘grid’ of your data (evenly spaced grid of points from the data)
grid_name <- data %>% data_grid(x_variable)
- Add predictions to your gridded data using the model you created
# add_predictions() adds a column called pred holding the model's predicted values
grid_name <- grid_name %>% add_predictions(model_name)
- To visualise the linear model, do the usual ggplot of the distribution of the
dataset, then add a geom_line layer using the grid_name as its data, eg
ggplot(dataset, aes(x_variable, y_variable)) +
  geom_hex(bins = 50) + # choose a bin count that suits the data
  # The grid must be passed as data =, and its predictions live in the pred column
  geom_line(data = grid_name, aes(y = pred), colour = "red")
geom_smooth
or, modelling with ggplot (notes taken from Data visualization: a practical introduction).
Basic linear model:
geom_smooth(method = "lm")
Specify linear model:
# The formula is written in terms of the x and y aesthetics, not the column names,
# eg y ~ x for a straight line or y ~ poly(x, 2) for a quadratic
geom_smooth(method = "lm", formula = y ~ x)
Iterate
Using the book ‘R for Data Science’ and the tidyverse package
Functions
Functions allow you to automate tasks, as opposed to copy-and-pasting similar bits of code
There are huge advantages in using a function rather than copy-and-paste: if you want to make a change, you only need to do it once; and you’re less likely to make typos and mistakes.
Use base R to write functions
There are 3 basic building blocks for creating a function.
A name
The inputs (or arguments)
The ‘body’ of the function, ie code that goes between {}
…which comes together as:
function_name <- function(arguments){ function code }
What you get back from the function will by default be the last statement the function evaluates.
BUT you can also choose to use:
return() in the middle of the function code to signal an early exit, which can make the function easier to understand, eg if you’ve written very long if statements.
invisible() to ensure that something doesn’t get printed out as a result of the function, eg a new dataframe that you’re using to create a plot
Examples
If we were to create a function that adds 2 to the input, it would look like this:
# Add 2 to the input; works element-wise on vectors
add_two <- function(x) x + 2
and you can apply it like this
add_two(2)
you can also apply it to every element of a vector
b <- c(1:5)
add_two(b)
Here’s another example. We can create a fizzbuzz function that takes a single number as input and returns “fizz” if it’s divisible by three and “buzz” if it’s divisible by 5 and “fizzbuzz” if it’s divisible by 3 and 5.
fizzbuzz <- function(x) {
  # Assert what must be true of the input: exactly one numeric value
  stopifnot(length(x) == 1)
  stopifnot(is.numeric(x))
  # %% is 'remainder from division', so a remainder of 0 means 'divisible by'
  by_three <- x %% 3 == 0
  by_five <- x %% 5 == 0
  if (by_three && by_five) {
    return("fizzbuzz")
  }
  if (by_three) {
    return("fizz")
  }
  if (by_five) {
    return("buzz")
  }
  # Not divisible by 3 or 5: hand the number back unchanged
  x
}
fizzbuzz(378)
It’s useful to have some naming conventions for your function arguments. The following are recommended:
x, y and z for vectors
df for data frames
i for the row number (indices)
j for the column number (indices)
n for the number of rows
p for the number of columns
It’s also useful to add a generic error message to your functions with stopifnot(), which you can use to assert what SHOULD be true (rather than checking for what might be wrong). For example:
function_name <- function(x, y, na.rm = FALSE) {
stopifnot(is.logical(na.rm), length(na.rm) == 1)
stopifnot(length(x) == length(y))
function code
}
For loops
Functions help to reduce duplication of repeated patterns of code
Iteration helps to reduce duplication of repeated operations on multiple inputs, eg performing the same operation on different columns or different datasets
There are 2 types of iteration:
Imperative programming, eg for loops and while loops
Functional programming means even less duplication than you get in imperative programming
For loops: the basics
Examples An example of a simple for loop
# Start with a double vector of zeros: one element per mtcars column, named after that column
output <- setNames(numeric(ncol(mtcars)), names(mtcars))
for (col_name in names(mtcars)) { # For every column name in mtcars
  output[[col_name]] <- mean(mtcars[[col_name]]) # store that column's mean under its name
}
output # Print output
A similar example that doesn’t require the line names(output) <- names(mtcars) but gives results without column headings would be:
# A double vector with one element per column in mtcars
output <- vector("double", ncol(mtcars))
# seq_along() is the safe form of 1:ncol(): it gives an empty sequence for zero columns
for (i in seq_along(mtcars)) {
  output[i] <- mean(mtcars[[i]]) # the i'th element is the mean of the i'th column
}
output # Print output
For loops: variations
For loops aren’t limited to creating new objects; you can also use them to change existing ones by just removing the assignment at the start
For loops can loop over names and values (not just indices / number of columns)
For loops can handle outputs and sequences of unknown length with ‘while’ loops (although this is most useful in context of simulation, so unlikely to need)
Examples
How to load a directory full of csvs into a single data frame using a for loop.
- Create a vector with the filename paths
files <- dir("data/", pattern = "\\.csv$", full.names = TRUE)
- Use a for loop that preallocates a list to the filenames and loads them in with read_csv()
# Preallocate one list slot per file, then fill each slot with that file's data
df <- vector("list", length(files))
for (i in seq_along(files)) {
  df[[i]] <- read_csv(files[[i]])
}
- Combine them into a single dataframe
df <- bind_rows(df)
Purrr
purrr = better than for loops because it makes code easier to write and to read by focusing on the operation being performed (and not the bookkeeping required to loop over every element and store the output!)
Key idea of functional programming = passing one function to another function
It eliminates the need for many common for loops and is similar to the apply() family of functions
Key idea is to have functions that perform the common pattern of:
Looping over a vector
Doing something to each element
Saving the results
There are different purrr functions for different types of output, as follows:
map() to make a list
map_lgl() to make a logical vector
map_int() to make an integer vector
map_dbl() to make a double vector
map_chr() to make a character vector
There are also useful shortcuts you can use within purrr:
Here’s the basic structure for using a map function:
map_vectortype(dataset, function)
Examples Calculate the mean of every column in mtcars
map_dbl(mtcars, mean)
Compute the number of unique values in each column of iris
# V1 WITHOUT SHORTCUTS
map_int(iris, function(x) length(unique(x)))
# V2 WITH SHORTCUTS
map_int(iris, ~ length(unique(.)))
Use a map function within a mutate, using a function you’ve created, ie applying a function to every variable within a dataset
# Example dataset
a <- tibble(x = c(1:4), y = c(100:103))
a
# Example function (the old add_two created earlier in the Functions chapter notes)
# Add 2 to the input; works element-wise on vectors
add_two <- function(x) x + 2
# Example map function
c <- a %>%
mutate(z = map_dbl(x, add_two),
z2 = map(x, add_two))
c
c$z2
Apply multiple map functions to nested tibbles, ie applying a set of functions to separate datasets
# Create function, in this case to filter a dataset by Petal.Length > 5
filter_petal <- function(df){
  # Keep only the rows of df where Petal.Length is greater than 5
  filter(df, Petal.Length > 5)
}
# Turn a normal dataframe into a dataframe of nested tibbles, in this case the iris dataset nested by species
iris_mod <- iris %>%
group_by(Species) %>%
nest()
# Create 3 new columns in all of the nested tibbles
iris_mod %>%
# Use map to apply the new function filter_petal to each of the separate datasets for Species within iris_mod (We use map() because the output from applying filter_petal to data is a list.)
mutate(fl_petal = purrr::map(data, filter_petal),
# If we want to add the number of rows in each dataset, we would then use map_dbl(), because the output from nrow is a single number.
n_full_data = purrr::map_dbl(data, nrow),
# Ditto - use map_dbl() for this output, which is the number of rows in our new column, fl_petal, which we created above
n_petal_large = purrr::map_dbl(fl_petal, nrow))
Iterate with ggplot
Add creating a function to repeat the same ggplot & programmatically writing titles, captions etc <<
Iterate with walk
Iteratively save files
Use walk() and its variants walk2 and pwalk() when you want to call a function for its side effects, rather than its return value.
This is the case when you want to render output to the screen or save a file to disk.
For example, if you had a list of plots and a vector of filenames, you can use pwalk() as follows:
# Create a list of multiple plots using the map() function
plots <- mtcars %>%
split(.$cyl) %>%
map(~ggplot(., aes(mpg, wt)) + geom_point())
plots
# Create an iterative filenaming structure for each plot, in this case the names of the plots followed by .pdf
paths <- stringr::str_c(names(plots), ".pdf")
# Save each plot to the path names defined in 'paths' by combining the plots and the pathnames in a list and saving that list
pwalk(list(paths, plots), ggsave, path = tempdir())
Notes
Factors
Factors are used for categories
Factors can either have an order that’s “arbitrary” (eg hair colour) or “principled” (eg months)
They’re esp useful if you want to display character vectors in non-alphabetical order
Use the forcats package for working with factors (it has been part of the core tidyverse since tidyverse 1.2.0, so library(tidyverse) loads it)
For example, say you have a string of months
library(forcats)
x1 <- c("Dec", "Apr", "Jan", "Mar")
To create a “factor” of months over a year:
- First create a list of the different months, or levels, in the order they should appear
month_levels <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
- Then create a factor with the function factor() and levels =
y1 <- factor(x1, levels = month_levels)
You now have a factor that you can sort according to the order you set with sort()
sort(y1)
You can check the levels in your factor with levels()
levels(y1)
You can reorder your factor in at least 2 ways:
- By another variable, eg which month has the most birthdays, with fct_reorder():
fct_reorder(factor, variable)
- By moving one or more levels to the end, eg showing “Not applicable” at the end, with fct_relevel():
fct_relevel(factor, "level to pull out", "level to pull out")
You can also edit your factor with fct_recode. This is particularly useful for editing labels for publication. And you can use this technique to combine multiple “old” levels into the same “new” level, eg grouping several levels under “Other”.
data %>% mutate(variable = fct_recode(variable, "new name" = "old name", "new name" = "old name"))
Other modifications you can make to your factor include:
- fct_collapse(), which is like fct_recode(), but works so that for each new variable you can provide a vector of old levels, eg
# Each new level takes a vector of the old levels it should absorb
data %>% mutate(variable = fct_collapse(variable, "new name" = c("old name", "old name", "old name")))
- fct_lump, which crudely lumps together all the “small” groups to simplify a plot or table
data %>% mutate(variable = fct_lump(variable))
Vectors
Note about integers vs doubles: if you just enter 2 in R, it interprets this as 2.000000, ie a double. To enter 2 as an integer you have to write 2L.
typeof(2)
typeof(2L)
typeof(2.5L)
So what can you do with vectors?
You can find out what type of vector a vector is with `typeof()`
You can find out how many items there are in a vector with `length()`
You can specify how to read a vector when you import with readr by using col_types specifications, for example:
# col_types is an argument of read_csv(), so the call stays open until the final bracket.
# Replace each column_name with a real column; list only the columns you want to specify.
data <- read_csv("data.csv",
  col_types = cols(column_name = col_logical(),
    column_name = col_integer(),
    column_name = col_double(),
    column_name = col_character(),
    column_name = col_date(format = ""),
    column_name = col_time(format = ""),
    column_name = col_datetime(format = ""),
    column_name = col_number(), # Isn't fussy about numbers containing commas, $ etc
    column_name = col_skip(), # Skips importing this column
    column_name = col_guess() # Guesses how to parse based on input
  )
)
You can convert one type of vector to another with as.logical(), as.integer(), as.double() or as.character()
You can perform basic maths operations on vectors, ie don’t need to iterate for every item in the vector
You can create and name a vector with c()
Lists
Can contain a mix of atomic vector types and other lists
It’s useful to use str() because this will give you the structure of your list, rather than the contents
To subset items within a list use double brackets [[]] rather than single [].
Augmented vectors
Factors, dates, date-times and tibbles are augmented vectors, because they have additional attributes (on top of the usual vector attributes of names, dimensions and class)
Factors have a ‘levels’ attribute
Dates and date-times have additional ‘class’ attributes
Tibbles have additional ‘class’ attributes (tbl_df, tbl and data.frame) as well as column names and row.names
---
title: "Cookbook"
output:
  html_notebook:
    theme: darkly
    toc: yes
    toc_float: yes
---

# Setup

### Install
```
# Only needed once per machine; the function name is plural
install.packages("package_name")
```

### Load
```
library(package_name)
```

# Import

### From csv
```
library(tidyverse)
data <- read_csv("folder/filename.csv")
```

### From a googlesheet
```
library(googlesheets)
# Look the sheet up by title, then read one worksheet into a data frame.
# (String literals need straight quotes; curly quotes are a syntax error.)
sheet <- gs_title("Google sheet name")
data <- sheet %>% gs_read(ws = "Worksheet name")
```

### From a table on a webpage
```
library(rvest)
url <- read_html('url')
data_raw <- url %>%
              html_node("table") %>%
              html_table(fill=TRUE)
```

### From a PDF
- Try `pdftools` package & [this tutorial](https://www.brodrigues.co/blog/2018-06-10-scraping_pdfs/?utm_campaign=Data_Elixir&utm_medium=email&utm_source=Data_Elixir_187)

- Using SPARQL (code from [R bloggers](https://www.r-bloggers.com/sparql-with-r-in-less-than-5-minutes/amp/)):
```
library(SPARQL) # SPARQL querying package
library(ggplot2)

# Step 1 - Set up preliminaries and define query
# Define the data.gov endpoint
endpoint <- "http://services.data.gov/sparql"

# create query statement
query <-
"PREFIX  dgp1187: <http://data-gov.tw.rpi.edu/vocab/p/1187/>
SELECT ?ye ?fi ?ac
WHERE {
?s dgp1187:year ?ye .
?s dgp1187:fires ?fi .
?s dgp1187:acres ?ac .
}"

# Step 2 - Use SPARQL package to submit query and save results to a data frame
qd <- SPARQL(endpoint,query)
df <- qd$results
```

### From multiple csvs

  1. Create a vector of all the datasets
  
```
  all_datasets <- tibble("file" = c('dataset_name_1', 'dataset_name_2')) 
```

  2. Create a function to import one dataset
```
  # Read one csv, given its file name without the ".csv" extension.
  # df: a single dataset name (character); returns the tibble from read_csv().
  import <- function(df){
    read_csv(str_c(df, ".csv"))
  }
```

  3. Use purrr::map() to run the function on (each element) of all the datasets
```
  all_data <- all_datasets %>% mutate(dataset = purrr::map(file, import))
```
  
  4. Run a 'for loop' to get R to assign each tibble to an object
```
  # Create one object per dataset, named after its entry in the file column.
  # seq_len() gives an empty sequence when all_data has no rows (1:0 would loop twice).
  for (i in seq_len(nrow(all_data))) {
    assign(all_data$file[[i]], all_data$dataset[[i]])
  }
```
  
### From multiple webpages

  1. Create a vector with the URL paths needed, eg:
```
  # Example values only: replace with the unique part of each URL.
  # (An assignment followed only by a comment continues onto the next line,
  # so url_unique would silently receive the value of url_base.)
  url_unique <- seq(1, 10)
  url_base <- "https://abc"
  url_end <- "/abc.html"
  # The dot places url_unique between the base and the end of each path
  url_path <- url_unique %>% str_c(url_base, . , url_end)
  data <- tibble(url_path)
```
  
  2. Define a function to scrape (and clean) data from one url
```
  # Scrape the first <table> found at one url and return it as a data frame.
  scrape <- function(url){
    read_html(url) %>%
      html_node("table") %>%
      html_table(fill = TRUE)
  }
```
  
  3. (if needed) Prevent error messages from URL paths that don't exist
```
  # Pull the result element out of one list (eg the list returned by a safely()-wrapped function)
  extract_results <- function(df) df$result
```
  
  4. Loop scrape function over all URLs as nested tibbles
```
  data <- data %>%
            mutate(new_column = purrr::map(url_path, safely(scrape)),
            new_column = map(new_column, extract_results))
```

### Save as csv
```
write_csv(dataset, "filename.csv")
```

# Tidy
### Inspect
Get list of all column headings in a dataset
```
colnames(dataset)
```

Get the number of rows and columns in a dataset
```
# dim() returns c(number of rows, number of columns); nrow() and ncol() give each separately
dim(dataset)
```

Get the breakdown of results within a column
```
table(dataset$column)
```

Get the number of values in a column
```
summary(dataset$column)
```

Get the number of NA values in a column
```
sum(is.na(dataset$column))
```

### tidyr

Wide --> skinny
```
gather(dataset, "new_column_name1", "new_column_name2", column range eg '2:8')
```

Skinny --> wide
```
# key_column supplies the new column names, value_column supplies their values
spread(dataset, key_column, value_column)
``` 

Combine datasets by a common column
```
merge(dataset1, dataset2, by = "common column name")
``` 

Combine datasets by attaching rows
```
# bind_rows() stacks the rows of dataset2 under dataset1, matching columns by name
# (append() on data frames returns a list of columns instead)
bind_rows(dataset1, dataset2)
``` 

Create a new column
```
mutate(old_variable, new_variable = (calculation))
```

Rename a column
```
# Column names with spaces must be wrapped in backticks, eg `new name` = `old name`
rename(dataset, new_name = old_name)
```

Rename categories within a column
```
# Each pair is "new name" = "old name"; add as many pairs as needed
data %>%
  mutate(variable = fct_recode(variable, "new name" = "old name",
                                         "new name" = "old name"))
```

Alternatives (more for correcting typos like Find & Replace):
  ```
  dataset$column[dataset$column == "old"] <- "new"
  ```

  ```
  "new_name" <- length(grep("existing_label", dataset$column))
  ```

Drop column(s) / row(s)
``` 
subset(dataset, select = -c(columns))
subset(dataset, column != "")
``` 

Remove NAs
```
na.omit(dataset)
``` 

Arrange a column in descending order
```
# desc() reverses the sort order of the column
arrange(dataset, desc(column))
```

### stringr

**Filter a column if contains text string**

```
# Returns TRUE/FALSE for each value of variable that contains the text string
grepl("text string", variable)
``` 

**Remove symbols, eg % sign**

```
dataset$column <- gsub("\\%","", dataset$column)
```

**Convert character column to numeric**

```
dataset$column <- as.numeric(as.character(dataset$column))
```

**Round numbers**

```
mutate(variable = round(variable))
```

### [janitor](https://github.com/sfirke/janitor)

**Clean column heading names (to lowercase with underscores)**

```
clean_names()
```

**Create a quick pivot table to summarise variables**

```
tabyl(variable1, variable2) %>% 
adorn_*
```

**Remove empty rows and columns**

```
remove_empty() 
```

**Fix dates stored as serial numbers**

```
excel_numeric_to_date()
```

# Transform
### dplyr

Select columns

```
select(variable, variable, variable)
```

Select rows that meet criteria

```
filter(variable == "..." | variable == "...")
```

Analyse (like a pivot table)

```
group_by(column) %>% 
  summarise(calculation)
```

**Classic code structures**

- Pivot and perform calculations on data

```
# Each step is piped into the next with %>% (commas would end the statement)
data_analysis <- raw_data %>%
  filter() %>%
  group_by() %>%
  summarise()
```
...which sounds like:
"The original dataframe is XXX,
	now filter data to only include rows that satisfy the conditions YYY,
	now group the data at each level of the variable(s) ZZZ,
	now summarize the data and calculate summary functions XXX…"

- Pivot data to show percentage breakdown of each column

```
# Count rows per column_1/column_2 pair, turn the counts into shares within
# each column_1 group, then spread the column_2 values out into columns
data_analysis <- raw_data %>%
  group_by(column_1, column_2) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  spread(column_2, pct)
```

**Calculations**

```
sum()
count()
unique()
n_distinct()
```

Count unique values in a column and arrange in descending order with
```
data %>% count(column) %>% arrange(desc(n))
```

### apply

Pivot one column of data (use 2 for rows) with 
```
sapply(dataset, function, 1)
```

Get the number of unique values in every column
```
rapply(data,function(x)length(unique(x)))
```

### stringr

JOIN with `str_c()`

- `sep =` how you combine the columns, eg sep = "," for comma separated values

- `collapse =` how you combine the rows, eg collapse = "-", to add a hyphen between each vector

SPLIT with `str_split()`

GET MATCHES

- `str_view()` to see results in Viewer window

- `str_detect()` to get TRUE/FALSE response

- `str_count()` to get counted response

GET MATCHES WHEN WORKING WITH A TABLE with `tidyr::extract(old_column, "new_column", "regular expression")`

GET LOCATION with `str_locate()`

SUBSET with `str_sub()`

COUNT LENGTH with `str_length()`

WRAP with `str_wrap`

- `width =` line width in characters

- `indent =` indentation of first line in each paragraph

- `exdent =` indentation of following lines in each paragraph

### regex
aka "regular expressions"

	- `.` = any character (wildcard)
	
	- `^` = start of the string
	
	- `$` = end of the string
	
	- `match = TRUE` = view just those matches that meet criteria
	
	- `\d:` any digit
	
	- `\s:` any whitespace (space, tab, newline)
	
	- `[abc]:` matches a, b, or c
	
	- `[^abc]:` matches anything except a, b, or c

# Visualise 🤩 

### Basics

Classic code structure
```
# Template: swap viz_function for a geom (eg geom_col) and
# scale_x/y_discrete/continuous for one scale function (eg scale_y_continuous)
ggplot(dataset) +
  viz_function(aes(x = variable,
                   y = variable,
                   group = variable,
                   size = variable,
                   fill = variable)) +
  scale_x/y_discrete/continuous(breaks = c(),
                                limits = c(),
                                labels = c()) +
  labs(title = "...",
       subtitle = "...",
       x = "...",
       y = "...",
       caption = "...") +
  theme()

# ggsave() is a separate call, not a layer: by default it saves the last plot displayed
ggsave("name.filetype", height = , width = , units = "")
```

Additional layers
```
geom_hline() 
geom_vline()
geom_segment()
geom_text()
geom_rect()
```

Transform data within a ggplot function
```
plot1 <- ggplot(filter(dataset, variable == criteria))
```

### Legends

Reorder

  1. First create a factor:
  ```
  # levels = sets the order in which the items will appear in the legend
  dataset$newcolumn <- factor(dataset$column, levels = c("item1", "item2"), ordered = TRUE)
  ```
  
  2. Then in ggplot() aesthetic mappings, use this new column, rather than the pre-existing one.

Remove
```
theme(legend.position = "none")
``` 

### Axes

Arrange in descending order
```
# reorder() sorts variable1 by variable2; the minus sign makes the order descending
ggplot(dataset, aes(x = reorder(variable1, -variable2), y = variable2))
```

### Text

Remove / style axes labels
```
# Used inside theme(); element_text() styles the font with face = (eg "bold", "italic")
axis.title.x/y = element_blank() or element_text(size = , family = , face = )
axis.text.x/y = element_blank() or element_text(size = , family = , face = )
```

Wrap long labels
```
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))
```

Rotate labels 45 degrees
```
theme(axis.text.x=element_text(angle=45))
``` 

### Themes

From ggplot

- `theme_gray()` – signature ggplot2 theme

- `theme_bw()` – dark on light ggplot2 theme

- `theme_linedraw()` – uses black lines on white backgrounds only

- `theme_light()` – similar to linedraw() but with grey lines as well

- `theme_dark()` – lines on a dark background instead of light

- `theme_minimal()` – no background annotations, minimal feel.

- `theme_classic()` – theme with no grid lines

- `theme_void()` – empty theme with no elements

From ggthemes

- `theme_solid()` - Theme with nothing other than a background color

- `theme_map()` - Clean theme for maps

- `theme_igray()` -  Inverse gray theme

- `theme_economist()`

- `theme_fivethirtyeight()`

- `theme_wsj()` 

- `theme_few()`

- `theme_tufte()`

- `theme_excel()`

- `theme_gdocs()`

- `theme_stata()` 

- `theme_solarized()`

- `theme_hc()` - Highcharts JS theme

### Interactive ggplots

1. **ggiraph**
  ```
  library(ggiraph)
  # The tooltip aesthetic is picked up by the geom_*_interactive() layers
  plot <- ggplot(data, aes(x = , y = , tooltip = variable)) +
    geom_point_interactive() # or any other geom_*_interactive()
  # Render the ggplot as an interactive widget
  girafe(ggobj = plot)
  ```

2. **plotly**
  ```
  library(plotly)
  ggplotly(plot, tooltip = variable)
  ```

3. **[highcharter](http://jkunst.com/highcharter/index.html)**

4. **[htmlwidgets](http://www.htmlwidgets.org/)**

5. **RShiny**

  - To make a shiny app, you need to create script with a filename ending "app.R"
  
  - The basic structure of an R Shiny app:
  
  ```
  library(shiny)

  # UI: the page layout; input and output functions go INSIDE fluidPage()
  ui <- fluidPage(
    # input functions, eg sliderInput()
    # output functions, eg plotOutput()
  )

  # Server: the code that builds each output; render calls go INSIDE the function body
  server <- function(input, output) {
    # output$xxxx <- render*(), eg renderPlot()
  }

  # Combine the UI and server into a running app
  shinyApp(ui = ui, server = server)
  ```

### Geographical maps

Introduction

There seem to be lots of different ways you can make maps in R: partly because the best approach will depend on what type of map you're making; and partly just because there are lots of competing packages available.

- Ways of getting map data

    1. Vector image maps, eg sf package or maps package and map_data() function

    2. Raster image maps, eg raster package

- Packages for plotting map data

    1. ggplot, eg geom_polygon() and geom_sf()

    2. tmap and tmaptools (dunno)

    3. leaflet (dunno)

**A note on shapefiles**

- Raster

  - static image files generated previously by the mapping service, which limits your ability to redraw or change the appearance of the geographic map

  - tradeoff means you can immediately focus on incorporating additional data into the map; better for gridded data, like satellite imagery

- Vector 

  - spatial data files which contain detailed information necessary to draw all the components of a map (e.g. points, lines, polygons)

  - requires importing this data into R; once imported though, ggplot2 works with simple features (sf) data frames to easily generate geospatial visualizations using all the core elements and approaches of ggplot()

- Where to get shapefiles: http://www.naturalearthdata.com/downloads/

Examples

1. Create a basic map of the world (minus Antarctica)
```{r world map}
library(maps)
library(ggplot2)
library(dplyr) # provides %>% and filter(), which maps and ggplot2 do not

# Build a world map: country outlines from map_data(), minus Antarctica
world_map <- map_data("world") %>%
  filter(region != "Antarctica") %>%
  # group = group keeps each polygon's points together when drawing
  ggplot(aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "light blue") +
  coord_fixed() + # fixed aspect ratio between the x and y axes
  theme_void() # strip axes, grid lines and background
world_map
```

2. Create a basic map of a country, eg UK
```{r country map}
library(maps)
library(ggplot2)

# Outline of the UK only: map_data() is asked for the single region "UK"
# and the result is handed straight to ggplot() as its data
UK_map <- ggplot(
  map_data("world", region = "UK"),
  aes(x = long, y = lat, group = group)
) +
  geom_polygon(fill = "dark blue") +
  coord_fixed() +
  theme_void()
UK_map
```

3. Create a detailed map of a country, eg counties within UK 
Using sf and ggplot packages as per the following tutorials:

- https://www.r-spatial.org/r/2018/10/25/ggplot2-sf.html

- https://cfss.uchicago.edu/geoviz_plot.html
```{r shapefile map}
library(maps)
library(tidyverse)
library(sf)

# NOTE(review): sf is loaded but not used yet - this chunk currently draws the
# same basic UK outline as the previous example
UK_map <- ggplot(
  map_data("world", region = "UK"),
  aes(x = long, y = lat, group = group)
) +
  geom_polygon(fill = "dark blue") +
  coord_fixed() +
  theme_void()
UK_map
```

### Tile grid maps

#### [geofacet](https://hafen.github.io/geofacet/)

Package for creating tile grid maps for regions including:

- `london_boroughs_grid`

- `uk_regions1` 

- `world_86countries_grid` & `world_countries_grid1`

- `china_prov_grid1`

Related: [countrycode](https://github.com/vincentarelbundock/countrycode) package

# Model

**"All models are wrong, but some are useful"**

The goal of a model is not to uncover truth, but to discover a simple approximation that is useful (in order to understand the behaviour of something).

A **linear** model takes the general form:

```
y = a_0 + a_1 * x
```

where 

- `a_0` = intercept, ie the starting point (the value of y when x = 0)

- `a_1` = slope, ie how much y changes for each one-unit increase in x

(NB this is another way of expressing `y = mx + c`)

Linear models also assume that the 'residuals' (difference between observed and predicted values) have a normal distribution.

### lm

or, how to apply a linear model to data

1. Create a model (a type of list) with the function `lm()`, where `x_variable` is the independent 'cause' and `y_variable` is the dependent 'consequence':

```
# Formula reads "y_variable explained by x_variable"
model_name <- lm(y_variable ~ x_variable, data = dataset)
```

2. Create a 'grid' of your data (evenly spaced grid of points from the data)

```
# data_grid() comes from the modelr package
grid_name <- dataset %>% data_grid(x_variable)
```

3. Add predictions to your gridded data using the model you created

```
# add_predictions() (modelr) adds the model's predictions as a new column called `pred`
grid_name <- grid_name %>% add_predictions(model_name)
```

4. To visualise the linear model, do the usual ggplot of the distribution of the `dataset`, then add a `geom_line` layer using the `grid_name` as its data, eg

```
ggplot(dataset, aes(x_variable, y_variable)) +
  geom_hex(bins = 50) + # change 50 to the number of bins you want
  # The line layer uses the gridded predictions as its data; y comes from the
  # `pred` column that add_predictions() created
  geom_line(data = grid_name, aes(y = pred))
```

### geom_smooth

or, modelling with ggplot (notes taken from [Data visualization: a practical introduction](https://socviz.co/modeling.html)).

Basic linear model:
```
geom_smooth(method = "lm")
```

Specify linear model:
```
# The formula is written in terms of the x and y aesthetics (not the column
# names), eg formula = y ~ poly(x, 2) for a quadratic fit
geom_smooth(method = "lm", formula = y ~ x)
```

# Iterate

Using the book 'R for Data Science' and the tidyverse package

### Functions

- Functions allow you to automate tasks, as opposed to copy-and-pasting similar bits of code

- There are huge advantages in using a function rather than copy-and-paste: if you want to make a change, you only need to do it once; and you're less likely to make typos and mistakes.

- Use base R to write functions

There are **3 basic building blocks** for creating a function.

1. A name

2. The inputs (or arguments)

3. The 'body' of the function, ie code that goes between {}

...which comes together as:

```function_name <- function(arguments){ function code }```

What you get back from the function will by default be the last statement the function evaluates. 

BUT you can also choose to use:

- **`return()`** in the middle of the function code to signal an early exit, which can make the function easier to understand, eg if you've written very long `if` statements.

- **`invisible()`** to ensure that something doesn't get printed out as a result of the function, eg a new dataframe that you're using to create a plot

**Examples**

If we were to create a function that adds 2 to the input, it would look like this:
```{r}
# Add 2 to every element of a numeric vector `x` and return the result
add_two <- function(x) {
  x + 2
}
```

and you can apply it like this
```{r}
add_two(2)
```

you can also apply it to every element of a vector
```{r}
b <- c(1:5)
add_two(b)
```

Here's another example. We can create a `fizzbuzz` function that takes a single number as input and returns "fizz" if it's divisible by three and "buzz" if it's divisible by 5 and "fizzbuzz" if it's divisible by 3 and 5.
```{r}
# Return "fizzbuzz" if x is divisible by both 3 and 5, "fizz" if only by 3,
# "buzz" if only by 5, and x itself otherwise.
# x must be a single number.
fizzbuzz <- function(x) {
  # stopifnot() raises an error unless every condition is TRUE
  stopifnot(length(x) == 1)
  stopifnot(is.numeric(x))

  # %% is the arithmetic 'remainder from division' operator, so a remainder
  # of zero means x divides evenly; && is the scalar logical 'and'
  divisible_by_3 <- x %% 3 == 0
  divisible_by_5 <- x %% 5 == 0

  if (divisible_by_3 && divisible_by_5) {
    "fizzbuzz"
  } else if (divisible_by_3) {
    "fizz"
  } else if (divisible_by_5) {
    "buzz"
  } else {
    x
  }
}

fizzbuzz(378)
```

It's useful to have some **naming conventions** for your function arguments. The following are recommended:

- `x`, `y` and `z` for vectors

- `df` for data frames

- `i` for the row number (indices)

- `j` for the column number (indices)

- `n` for the number of rows

- `p` for the number of columns

It's also useful to add a generic error message to your functions with **`stopifnot()`**, which you can use to assert what SHOULD be true (rather than checking for what might be wrong). For example:

```
function_name <- function(x, y, na.rm = FALSE) {
  stopifnot(is.logical(na.rm), length(na.rm) == 1)
  stopifnot(length(x) == length(y))
  
  function code
}
```

### For loops

- Functions help to reduce duplication of repeated patterns of code

- Iteration helps to reduce duplication of repeated operations on multiple inputs, eg performing the same operation on different columns or different datasets

- There are 2 types of iteration:

  1. **Imperative programming**, eg for loops and while loops
  
  2. **Functional programming** means even less duplication than you get in imperative programming

**For loops: the basics**

- Every for loop has 3 components:

  1. An output, ie **specify what you want the output of your for loop to be**, eg a vector and details like its vector type; or you could also specify your output as being a tibble, a factor etc.
  
  2. A sequence, ie what to loop over & involves assigning `i` (like 'it') to a different value, so `for i in _____`
  
  3. The body {}, ie the code that does the work, each time with a different value for `i`

**Examples**
An example of a simple for loop  
```{r}
# 1. Output: a double vector with one element per column of mtcars,
#    named after those columns
output <- numeric(ncol(mtcars))
names(output) <- names(mtcars)

# 2. Sequence: loop over the column names
for (i in names(mtcars)) {
  # 3. Body: store the mean of column i in the element of output named i
  output[[i]] <- mean(mtcars[[i]])
}

output # Print output
```

A similar example that doesn't require the line `names(output) <- names(mtcars)` but gives results without column headings would be:
```{r}
# Output: a double vector with one (unnamed) element per column of mtcars
output <- vector("double", ncol(mtcars))

# Sequence: seq_along() gives the column positions 1, 2, 3, ... and, unlike
# 1:ncol(), yields an empty sequence if there are no columns
for (i in seq_along(mtcars)) {
  # Body: make the i'th element of output the mean of the i'th column
  output[i] <- mean(mtcars[[i]])
}

output # Print output
```

**For loops: variations**

- For loops aren't limited to creating new objects; you can also use them to change existing ones by just removing the assignment at the start

- For loops can loop over names and values (not just indices / number of columns)
  
  - loop over elements with `for (x in xs)`
  
  - loop over names with `for (nm in names(xs))`

- For loops can handle outputs and sequences of unknown length with 'while' loops (although this is most useful in context of simulation, so unlikely to need)

**Examples**

How to load a directory full of csvs into a single data frame using a for loop.

1) Create a vector with the filename paths
```
files <- dir("data/", pattern = "\\.csv$", full.names = TRUE)
```

2) Use a for loop that preallocates a list to the filenames and loads them in with read_csv()
```
# Preallocate a list with one slot per file
df <- vector("list", length(files))

# Loop over the file positions and read each csv into its slot
for (i in seq_along(files)) {
  df[[i]] <- read_csv(files[[i]])
}
```

3) Combine them into a single dataframe
```
df <- bind_rows(df)
```

### Purrr

- **purrr = better than for loops** because it makes code easier to write and to read by focusing on the operation being performed (and not the bookkeeping required to loop over every element and store the output!)

- Key idea of functional programming = passing one function to another function

- It eliminates the need for many common for loops and is similar to the `apply()` family of functions

- Key idea is to have functions that perform the common pattern of:
  
  1. Looping over a vector
  
  2. Doing something to each element
  
  3. Saving the results 

- There are different purrr functions for different types of output, as follows:

  - `map()` to make a list
  
  - `map_lgl()` to make a logical vector
  
  - `map_int()` to make an integer vector
  
  - `map_dbl()` to make a double vector
  
  - `map_chr()` to make a character vector
  
- There are also useful shortcuts you can use within purrr:

  - `~` will replace `function(x)` when you want to create an anonymous function
  
  - `.` will refer to the current list element (a bit like how `i` works in a for loop)
  
Here's the basic structure for using a map function:
```
map_vectortype(dataset, function)
```

**Examples**
Calculate the mean of every column in mtcars
```{r}
map_dbl(mtcars, mean)
```

Compute the number of unique values in each column of iris
```{r}
# V1 WITHOUT SHORTCUTS
map_int(iris, function(x) length(unique(x)))
# V2 WITH SHORTCUTS
map_int(iris, ~ length(unique(.)))
```

Use a map function within a mutate, using a function you've created, ie applying a function to every variable within a dataset
```{r}
# Example dataset: two integer columns, x and y
a <- tibble(x = 1:4, y = 100:103)
a

# Example function (same as add_two from the Functions notes): adds 2 to its input
add_two <- function(x) {
  x + 2
}

# Apply add_two to each value of x inside mutate():
# - map_dbl() returns a double vector, so z is an ordinary numeric column
# - map() returns a list, so z2 is a list-column
c <- mutate(
  a,
  z = map_dbl(x, add_two),
  z2 = map(x, add_two)
)
c
c$z2
```

Apply multiple map functions to nested tibbles, ie applying a set of functions to separate datasets
```{r}
# Helper: keep only the rows of `df` where Petal.Length is greater than 5
filter_petal <- function(df) {
  filter(df, Petal.Length > 5)
}

# Turn iris into a data frame of nested tibbles: one row per Species, with
# that species' rows stored in the list-column `data`
iris_mod <- nest(group_by(iris, Species))

# Create 3 new columns from the nested tibbles
mutate(
  iris_mod,
  # map() because filter_petal() returns a data frame, so the results must be
  # collected in a list
  fl_petal = purrr::map(data, filter_petal),
  # map_dbl() because nrow() returns a single number for each nested tibble
  n_full_data = purrr::map_dbl(data, nrow),
  # Same again, this time counting the rows of the filtered tibbles in fl_petal
  n_petal_large = purrr::map_dbl(fl_petal, nrow)
)
```

### Iterate with ggplot

>> Add creating a function to repeat the same ggplot & programmatically writing titles, captions etc <<

### Iterate with walk

**Iteratively save files**

Use **`walk()`** and its variants **`walk2`** and **`pwalk()`** when you want to call a function for its side effects, rather than its return value.

This is the case when you want to render output to the screen or save a file to disk.

For example, if you had a list of plots and a vector of filenames, you can use `pwalk()` as follows:
```{r}
# Split mtcars by number of cylinders and draw one scatter plot per group;
# the resulting list is named after the cyl values
plots <- map(
  split(mtcars, mtcars$cyl),
  function(cyl_df) ggplot(cyl_df, aes(mpg, wt)) + geom_point()
)
plots

# One filename per plot: the plot's name followed by .pdf
paths <- stringr::str_c(names(plots), ".pdf")

# pwalk() calls ggsave() once per (path, plot) pair purely for its side effect
# of writing a file; path = tempdir() is passed on to every ggsave() call
pwalk(list(paths, plots), ggsave, path = tempdir())
```

# Communicate

### RMarkdown

Choose a markdown theme on **[Bootswatch](https://bootswatch.com/3/)**, including:

- cosmo ⭐️

- journal

- darkly (black)

- flatly

- lumen

- paper

- readable

- sandstone

- simplex

- slate (grey)

- superhero (blue)

Create a website in RMarkdown with **[blogdown](https://bookdown.org/yihui/blogdown/get-started.html)**

# Bookmarks

- [R for data science](https://r4ds.had.co.nz/)

- [Data visualization: a practical introduction](https://socviz.co/index.html#preface)

- [R Markdown](https://bookdown.org/yihui/rmarkdown/)

- [R and Github](https://happygitwithr.com/git-client.html)

- [Tidyverse: ggplot](https://ggplot2.tidyverse.org/reference/index.html#)

# Notes
### Factors

- Factors are used for **categories**

- Factors can either have an order that's "arbitrary" (eg hair colour) or "principled" (eg months)

- They're esp useful if you want to display character vectors in non-alphabetical order

- Use **forcats** package (part of the core tidyverse since tidyverse 1.2.0, so `library(tidyverse)` loads it) for working with factors

For example, say you have a string of months
```{r}
library(forcats)
x1 <- c("Dec", "Apr", "Jan", "Mar")
```

To create a "factor" of months over a year:

  1. First create a list of the different months, or **levels**, in the order they should appear
```{r}
month_levels <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
```

  2. Then create a factor with the function factor() and levels = 
```{r}
y1 <- factor(x1, levels = month_levels)
```

You now have a factor that you can **sort** according to the order you set with sort()
```{r}
sort(y1)
```

You can **check** the levels in your factor with levels()
```{r}
levels(y1)
```

You can **reorder** your factor in at least 2 ways:

  1. By another variable, eg which month has the most birthdays, with **fct_reorder()**:
```{r}
fct_reorder(factor, variable)
```
  
  2. By moving one or more levels to the end, eg showing "Not applicable" at the end, with **fct_relevel()**:
```{r}
fct_relevel(factor, "level to pull out", "level to pull out")
```

You can also **edit** your factor with **fct_recode**. This is particularly useful for editing labels for publication. And you can use this technique to combine multiple "old" levels into the same "new" level, eg grouping several levels under "Other".

```{r}
data %>% mutate(variable = fct_recode(variable, "new name" = "old name", "new name" = "old name"))
```

Other modifications you can make to your factor include:

- **fct_collapse()**, which is like fct_recode(), but works so that for each new level you can provide a vector of old levels, eg
```{r}
# Each new level name is given a vector of the old levels it should replace
data %>% mutate(variable = fct_collapse(variable, "new name" = c("old name", "old name", "old name")))
```

- **fct_lump**, which crudely lumps together all the "small" groups to simplify a plot or table
```{r}
data %>% mutate(variable = fct_lump(variable))
```

### Vectors

- Vectors are essentially variables within a tibble

- There are 3 types of vector:
  
  1. **Atomic** which can be further subcategorised into:
  
      - **logical** (TRUE, FALSE and NA)
      
      - **integer** (numeric without decimal places, eg 243)
      
      - **double** (numeric with decimal places, eg 243.5678) 
      
      - **character** (made up of strings)
      
      - **complex** (rarely used during data analysis)
      
      - **raw** (ditto)
  
  2. **Lists** (aka 'recursive vectors' and can contain other lists, so they're good for representing hierarchies in data)

  3. **`NULL`** which is the absence of a vector (and distinct from `NA` which is the absence of a value in a vector)

**Note about integers vs doubles**: if you just enter `2` in R, it interprets this as `2.000000`, ie a double. To enter `2` as an integer you have to write `2L`. 
```{r}
typeof(2)
typeof(2L)
typeof(2.5L)
```

So what can you do with vectors?

- You can find out what type of vector a vector is with **`typeof()`**

- You can find out how many items there are in a vector with **`length()`**

- You can specify how to read a vector when you import with `readr` by using `col_types` specifications, for example:

```{r}
# col_types is an argument of read_csv(), so it goes inside the call after the
# file path; cols() maps each column name to the type it should be parsed as
data <- read_csv("data.csv",
                 col_types = cols(column_name = col_logical(),
                                  column_name = col_integer(),
                                  column_name = col_double(),
                                  column_name = col_character(),
                                  column_name = col_date(format = ""),
                                  column_name = col_time(format = ""),
                                  column_name = col_datetime(format = ""),
                                  column_name = col_number(), # Isn't fussy about numbers containing commas, $ etc
                                  column_name = col_skip(), # Skips importing this column
                                  column_name = col_guess() # Guesses how to parse based on input
                                  ))
```

- You can convert one type of vector to another with **`as.logical()`**, **`as.integer()`**, **`as.double()`** or **`as.character()`**

- You can perform basic maths operations on vectors, ie don't need to iterate for every item in the vector

- You can create and name a vector with `c()`

**Lists**

- Can contain a mix of atomic vector types and other lists

- It's useful to use `str()` because this will give you the structure of your list, rather than the contents

- To subset items within a list use double brackets `[[]]` rather than single `[]`.

**Augmented vectors**

- Factors, dates, date-times and tibbles are augmented vectors, because they have additional attributes (on top of the usual vector attributes of names, dimensions and class)

  - Factors have a 'levels' attribute
  
  - Dates and date-times have additional 'class' attributes
  
  - Tibbles have additional 'class' attributes (`tbl_df`, `tbl` and `data.frame`) as well as column `names` and `row.names`
