Import
From csv
library(tidyverse)
data <- read_csv("folder/filename.csv")
From a googlesheet
# Look the sheet up by its title, then read one worksheet into a data frame.
# Strings must use straight quotes; curly quotes do not parse in R.
library(googlesheets)
sheet <- gs_title("Google sheet name")
data <- sheet %>% gs_read(ws = "Worksheet name")
From a table on a webpage
library(rvest)
url <- read_html('url')
data_raw <- url %>%
html_node("table") %>%
html_table(fill=TRUE)
From a PDF
library(SPARQL) # SPARQL querying package
library(ggplot2)
# Step 1 - Set up preliminaries and define query
# Define the data.gov endpoint
endpoint <- "http://services.data.gov/sparql"
# create query statement
query <-
"PREFIX dgp1187: <http://data-gov.tw.rpi.edu/vocab/p/1187/>
SELECT ?ye ?fi ?ac
WHERE {
?s dgp1187:year ?ye .
?s dgp1187:fires ?fi .
?s dgp1187:acres ?ac .
}"
# Step 2 - Use SPARQL package to submit query and save results to a data frame
qd <- SPARQL(endpoint,query)
df <- qd$results
From multiple csvs
- Create a vector of all the datasets
all_datasets <- tibble("file" = c('dataset_name_1', 'dataset_name_2'))
- Create a function to import one dataset
# Read one csv. `df` is the dataset name as a string, without the ".csv"
# extension; the value returned is whatever read_csv() returns for that file.
import <- function(df) {
  read_csv(str_c(df, ".csv"))
}
- Use purrr::map() to run the function on (each element) of all the datasets
all_data <- all_datasets %>% mutate(dataset = purrr::map(file, import))
- Run a ‘for loop’ to get R to assign each tibble to an object
# Create one object per row of all_data, named after its `file` entry.
# seq_len() gives an empty sequence for zero rows, where 1:nrow() gives c(1, 0).
for (i in seq_len(nrow(all_data))) {
  assign(all_data$file[[i]], all_data$dataset[[i]])
}
From multiple webpages
- Create a vector with the URL paths needed, eg:
# Placeholder: replace with the part of the URL that varies, eg using seq() if
# numerical. (An assignment with nothing after `<-` would swallow the next line.)
url_unique <- seq(1, 10)
url_base <- "https://abc"
url_end <- "/abc.html"
url_path <- str_c(url_base, url_unique, url_end)
data <- tibble(url_path)
- Define a function to scrape (and clean) data from one url
# Download one page and return its first <table> as a data frame.
# `url` is the address of the page to read.
scrape <- function(url) {
  read_html(url) %>%
    html_node("table") %>%
    html_table(fill = TRUE)
}
- (if needed) Prevent error messages from URL paths that don’t exist
# Pull the `result` element out of `df` (here, one safely() output)
extract_results <- function(df) {
  return(df$result)
}
- Loop scrape function over all URLs as nested tibbles
data <- data %>%
mutate(new_column = purrr::map(url_path, safely(scrape)),
new_column = map(new_column, extract_results))
Save as csv
write_csv(dataset, "filename.csv")
Tidy
Inspect
Get list of all column headings in a dataset
colnames(dataset)
Get the number of rows and columns in a dataset
dim(dataset) # c(rows, columns); nrow(dataset) / ncol(dataset) give each alone
Get the breakdown of results within a column
table(dataset$column)
Get the number of values in a column
summary(dataset$column)
Get the number of NA values in a column
sum(is.na(dataset$column))
tidyr
Wide –> skinny
# The last argument is the range of columns to gather, eg 2:8
gather(dataset, key = "new_column_name1", value = "new_column_name2", 2:8)
Skinny –> wide
spread(dataset, key = keycolumn, value = valuecolumn)
Combine datasets by a common column
merge(dataset1, dataset2, by = "common column name")
Combine datasets by attaching rows
# bind_rows() stacks rows, matching columns by name; append() would instead
# join the two data frames column-by-column into a plain list
bind_rows(dataset1, dataset2)
Create a new column
mutate(dataset, new_variable = (calculation)) # first argument is the dataset
Rename a column
rename(dataset, new_name = old_name)
Rename categories within a column
# Each pair is "new name" = "old name"; add one pair per category to rename
data %>%
  mutate(variable = fct_recode(variable,
                               "new name" = "old name",
                               "new name" = "old name"))
Alternatives (more for correcting typos like Find & Replace): dataset$column[dataset$column == "old"] <- "new"
"new_name" <- length(grep("existing_label", dataset$column))
Drop column(s) / row(s)
subset(dataset, select = -c(columns))
subset(dataset, column != "")
Remove NAs
na.omit(dataset)
Arrange a column in descending order
arrange(dataset, desc(column))
stringr
Filter a column if contains text string
grepl("text string", variable) # TRUE / FALSE for each element of variable
Remove symbols, eg % sign
dataset$column <- gsub("\\%","", dataset$column)
Convert character column to numeric
dataset$column <- as.numeric(as.character(dataset$column))
Round numbers
mutate(variable = round(variable))
Clean column heading names (to lowercase with underscores)
clean_names()
Create a quick pivot table to summarise variables
tabyl(variable1, variable2) %>%
adorn_*
Remove empty rows and columns
remove_empty()
Fix dates stored as serial numbers
excel_numeric_to_date()
Visualise 🤩
Basics
Classic code structure
ggplot(dataset) +
  viz_function(aes(x = variable,
                   y = variable,
                   group = variable,
                   size = variable,
                   fill = variable)) +
  scale_x/y_discrete/continuous(breaks = c(),
                                limits = c(),
                                labels = c()) +
  labs(title = "...",
       subtitle = "...",
       x = "...",
       y = "...",
       caption = "...") +
  theme()
# ggsave() is a separate call, not a "+" layer; by default it saves the last
# plot that was displayed
ggsave("name.filetype", height = , width = , units = "")
Additional layers
geom_hline()
geom_vline()
geom_segment()
geom_text()
geom_rect()
Transform data within a ggplot function
plot1 <- ggplot(filter(dataset, variable == criteria))
Legends
Reorder
- First create a factor:
dataset$newcolumn <- factor(dataset$column, levels = c("item1", "item2"), ordered = TRUE)
- Then in ggplot() aesthetic mappings, use this new column, rather than the pre-existing one.
Remove
theme(legend.position = "none")
Axes
Arrange in descending order
ggplot(aes(x = reorder(variable1, -variable2), y = variable2))
Text
Remove / style axes labels
axis.title.x/y = element_blank() or element_text(size = , family = , face = )
axis.text.x/y = element_blank() or element_text(size = , family = , face = )
Wrap long labels
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))
Rotate labels 45 degrees
theme(axis.text.x=element_text(angle=45))
Themes
From ggplot
theme_gray() – signature ggplot2 theme
theme_bw() – dark on light ggplot2 theme
theme_linedraw() – uses black lines on white backgrounds only
theme_light() – similar to linedraw() but with grey lines as well
theme_dark() – lines on a dark background instead of light
theme_minimal() – no background annotations, minimal feel.
theme_classic() – theme with no grid lines
theme_void() – empty theme with no elements
From ggthemes
theme_solid() - Theme with nothing other than a background color
theme_map() - Clean theme for maps
theme_igray() - Inverse gray theme
theme_economist()
theme_fivethirtyeight()
theme_wsj()
theme_few()
theme_tufte()
theme_excel()
theme_gdocs()
theme_stata()
theme_solarized()
theme_hc() - Highcharts JS theme
Interactive ggplots
- ggiraph
# The geom_*_interactive() layers and the tooltip aesthetic come from ggiraph
library(ggiraph)
ggplot(data, aes(x = , y = , tooltip = variable)) +
  geom_???_interactive()
- plotly
library(plotly)
ggplotly(plot, tooltip = variable)
highcharter
htmlwidgets
RShiny
To make a shiny app, you need to create script with a filename ending “app.R”
The basic structure of an R Shiny app:
library(shiny)
# ui: describes the page layout
ui <- fluidPage(
  # input() functions, eg sliderInput()
  # output() functions, eg plotOutput()
)
# server: builds the outputs that the ui displays
server <- function(input, output) {
  # output$xxxx <- render*(), eg renderPlot()
}
shinyApp(ui = ui, server = server)
Geographical maps
Introduction
There seem to be lots of different ways you can make maps in R: partly because the best approach will depend on what type of map you’re making; and partly just because there are lots of competing packages available.
A note on shapefiles
Examples
- Create a basic map of the world (minus Antarctica)
library(maps)
library(ggplot2)
library(dplyr) # provides %>% and filter(), which this snippet uses
# Draw every region in the "world" map data except Antarctica
world_map <- map_data("world") %>%
  filter(region != "Antarctica") %>%
  ggplot(aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "light blue") +
  coord_fixed() +
  theme_void()
world_map
- Create a basic map of a country, eg UK
library(maps)
library(ggplot2)
library(dplyr) # provides %>%, which this snippet uses
# Draw only the polygons for the "UK" region of the "world" map data
UK_map <- map_data("world", region = "UK") %>%
  ggplot(aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "dark blue") +
  coord_fixed() +
  theme_void()
UK_map
- Create a detailed map of a country, eg counties within UK Using sf and ggplot packages as per the following tutorials:
library(maps)
library(tidyverse)
library(sf)
# NOTE(review): sf is loaded but not used below; this pipeline only draws the
# "UK" region from the "world" map data, so the county-level detail described
# in the heading still needs to be added - TODO confirm intended code
UK_map <- map_data("world", region = "UK") %>%
ggplot(aes(x = long, y = lat, group = group)) +
geom_polygon(fill = "dark blue") +
coord_fixed() +
theme_void()
UK_map
Tile grid maps
Package for creating tile grid maps for regions including:
Related: countrycode package
Model
“All models are wrong, but some are useful”
The goal of a model is not to uncover truth, but to discover a simple approximation that is useful (in order to understand the behaviour of something).
- linear model takes the general form:
y = a_0 + a_1 * x
where a_0 is the intercept and a_1 is the slope
(NB this is another way of expressing y = mx + c)
Linear models also assume that the ‘residuals’ (difference between observed and predicted values) have a normal distribution.
lm
or, how to apply a linear model to data
- Create a model (a type of list) with the function
lm(), where x_variable is the independent ‘cause’ and y_variable is the dependent ‘consequence’:
model_name <- lm(y_variable ~ x_variable, data = dataset)
- Create a ‘grid’ of your data (evenly spaced grid of points from the data)
grid_name <- dataset %>% data_grid(x_variable)
- Add predictions to your gridded data using the model you created
grid_name <- grid_name %>% add_predictions(model_name)
- To visualise the linear model, do the usual ggplot of the distribution of the
dataset, then add a geom_line layer using the grid_name as its data, eg
ggplot(dataset, aes(x_variable, y_variable)) +
  geom_hex(bins = 50) + # choose the number of bins to suit the data
  # add_predictions() stores the predictions in a column called `pred` by default
  geom_line(aes(y = pred), data = grid_name)
geom_smooth
or, modelling with ggplot (notes taken from Data visualization: a practical introduction).
Basic linear model:
geom_smooth(method = "lm")
Specify linear model:
# The formula is written in terms of the x and y aesthetics, not the column
# names, eg y ~ x or y ~ poly(x, 2)
geom_smooth(method = "lm", formula = y ~ x)
Iterate
Using the book ‘R for Data Science’ and the tidyverse package
Functions
Functions allow you to automate tasks, as opposed to copy-and-pasting similar bits of code
There are huge advantages in using a function rather than copy-and-paste: if you want to make a change, you only need to do it once; and you’re less likely to make typos and mistakes.
Use base R to write functions
There are 3 basic building blocks for creating a function.
A name
The inputs (or arguments)
The ‘body’ of the function, ie code that goes between {}
…which comes together as:
function_name <- function(arguments){ function code }
What you get back from the function will by default be the last statement the function evaluates.
BUT you can also choose to use:
return() in the middle of the function code to signal an early exit, which can make the function easier to understand, eg if you’ve written very long if statements.
invisible() to ensure that something doesn’t get printed out as a result of the function, eg a new dataframe that you’re using to create a plot
Examples
If we were to create a function that adds 2 to the input, it would look like this:
# Add 2 to every element of the numeric vector x
add_two <- function(x) x + 2
and you can apply it like this
add_two(2)
you can also apply it to every element of a vector
b <- c(1:5)
add_two(b)
Here’s another example. We can create a fizzbuzz function that takes a single number as input and returns “fizz” if it’s divisible by three and “buzz” if it’s divisible by 5 and “fizzbuzz” if it’s divisible by 3 and 5.
fizzbuzz <- function(x) {
  # x must be a single number; stopifnot() raises an error otherwise
  stopifnot(length(x) == 1)
  stopifnot(is.numeric(x))
  # %% is the arithmetic 'remainder from division' operator, so a remainder
  # of 0 means x is divisible; && is the scalar logical 'and'
  by_three <- x %% 3 == 0
  by_five <- x %% 5 == 0
  if (by_three && by_five) {
    return("fizzbuzz")
  }
  if (by_three) {
    return("fizz")
  }
  if (by_five) {
    return("buzz")
  }
  # not divisible by 3 or 5: hand the number back unchanged
  x
}
fizzbuzz(378)
It’s useful to have some naming conventions for your function arguments. The following are recommended:
x, y and z for vectors
df for data frames
i for the row number (indices)
j for the column number (indices)
n for the number of rows
p for the number of columns
It’s also useful to add a generic error message to your functions with stopifnot(), which you can use to assert what SHOULD be true (rather than checking for what might be wrong). For example:
function_name <- function(x, y, na.rm = FALSE) {
stopifnot(is.logical(na.rm), length(na.rm) == 1)
stopifnot(length(x) == length(y))
function code
}
For loops
Functions help to reduce duplication of repeated patterns of code
Iteration helps to reduce duplication of repeated operations on multiple inputs, eg performing the same operation on different columns or different datasets
There are 2 types of iteration:
Imperative programming, eg for loops and while loops
Functional programming means even less duplication than you get in imperative programming
For loops: the basics
Examples An example of a simple for loop
# A double vector with one element per mtcars column, named after the columns
output <- setNames(vector("double", ncol(mtcars)), names(mtcars))
# Visit the columns by name and store each column's mean under that name
for (i in names(output)) {
  output[[i]] <- mean(mtcars[[i]])
}
output # Print output
A similar example that doesn’t require the line names(output) <- names(mtcars) but gives results without column headings would be:
# A double vector with the same number of elements as mtcars has columns
output <- vector("double", ncol(mtcars))
# seq_along() visits each column position; unlike 1:ncol(), it gives an empty
# sequence (so the loop is skipped) for a data frame with no columns
for (i in seq_along(mtcars)) {
  output[[i]] <- mean(mtcars[[i]]) # i'th element = mean of the i'th column
}
output # Print output
For loops: variations
For loops aren’t limited to creating new objects; you can also use them to change existing ones by just removing the assignment at the start
For loops can loop over names and values (not just indices / number of columns)
For loops can handle outputs and sequences of unknown length with ‘while’ loops (although this is most useful in context of simulation, so unlikely to need)
Examples
How to load a directory full of csvs into a single data frame using a for loop.
- Create a vector with the filename paths
files <- dir("data/", pattern = "\\.csv$", full.names = TRUE)
- Use a for loop that preallocates a list to the filenames and loads them in with read_csv()
# Preallocate one list slot per file, then read each csv into its slot.
# The loop variable must be the same name that is used to index inside the loop.
df <- vector("list", length(files))
for (i in seq_along(files)) {
  df[[i]] <- read_csv(files[[i]])
}
- Combine them into a single dataframe
df <- bind_rows(df)
Purrr
purrr = better than for loops because it makes code easier to write and to read by focusing on the operation being performed (and not the bookkeeping required to loop over every element and store the output!)
Key idea of functional programming = passing one function to another function
It eliminates the need for many common for loops and is similar to the apply() family of functions
Key idea is to have functions that perform the common pattern of:
Looping over a vector
Doing something to each element
Saving the results
There are different purrr functions for different types of output, as follows:
map() to make a list
map_lgl() to make a logical vector
map_int() to make an integer vector
map_dbl() to make a double vector
map_chr() to make a character vector
There are also useful shortcuts you can use within purrr:
Here’s the basic structure for using a map function:
map_vectortype(dataset, function)
Examples Calculate the mean of every column in mtcars
map_dbl(mtcars, mean)
Compute the number of unique values in each column of iris
# V1 WITHOUT SHORTCUTS
map_int(iris, function(x) length(unique(x)))
# V2 WITH SHORTCUTS
map_int(iris, ~ length(unique(.)))
Use a map function within a mutate, using a function you’ve created, ie applying a function to every variable within a dataset
# Example dataset
a <- tibble(x = c(1:4), y = c(100:103))
a
# Example function (the old add_two created earlier in the Functions chapter notes)
add_two <- function(x) x + 2 # adds 2 to every element of x
# Example map function
c <- a %>%
mutate(z = map_dbl(x, add_two),
z2 = map(x, add_two))
c
c$z2
Apply multiple map functions to nested tibbles, ie applying a set of functions to separate datasets
# Create function, in this case to filter a dataset by Petal.Length > 5
filter_petal <- function(df) {
  # Keep only the rows of df whose Petal.Length is greater than 5
  filter(df, Petal.Length > 5)
}
# Turn a normal dataframe into a dataframe of nested tibbles: one row per
# Species, with that species' rows stored as a tibble in the list-column `data`
iris_mod <- iris %>%
  group_by(Species) %>%
  nest() %>%
  ungroup() # drop the grouping, which is no longer needed once nested
# Add 3 new columns to iris_mod, each computed from the nested tibbles
iris_mod %>%
  mutate(
    # map() applies filter_petal to each species' tibble; it returns a list
    # because each result is a whole data frame rather than a single value
    fl_petal = purrr::map(data, filter_petal),
    # nrow() gives a single number per tibble, so map_dbl() can return a
    # plain numeric column: the number of rows in each species' full dataset
    n_full_data = purrr::map_dbl(data, nrow),
    # Ditto for the number of rows left in fl_petal, the column created above
    n_petal_large = purrr::map_dbl(fl_petal, nrow)
  )
Iterate with ggplot
Add creating a function to repeat the same ggplot & programmatically writing titles, captions etc <<
Iterate with walk
Iteratively save files
Use walk() and its variants walk2 and pwalk() when you want to call a function for its side effects, rather than its return value.
This is the case when you want to render output to the screen or save a file to disk.
For example, if you had a list of plots and a vector of filenames, you can use pwalk() as follows:
# Create a list of multiple plots using the map() function
plots <- mtcars %>%
split(.$cyl) %>%
map(~ggplot(., aes(mpg, wt)) + geom_point())
plots
# Create an iterative filenaming structure for each plot, in this case the names of the plots followed by .pdf
paths <- stringr::str_c(names(plots), ".pdf")
# Save each plot to the path names defined in 'paths' by combining the plots and the pathnames in a list and saving that list
pwalk(list(paths, plots), ggsave, path = tempdir())
Notes
Factors
Factors are used for categories
Factors can either have an order that’s “arbitrary” (eg hair colour) or “principled” (eg months)
They’re esp useful if you want to display character vectors in non-alphabetical order
Use the forcats package (part of the core tidyverse since tidyverse 1.2.0, so library(tidyverse) loads it) for working with factors
For example, say you have a string of months
library(forcats)
x1 <- c("Dec", "Apr", "Jan", "Mar")
To create a “factor” of months over a year:
- First create a list of the different months, or levels, in the order they should appear
month_levels <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
- Then create a factor with the function factor() and levels =
y1 <- factor(x1, levels = month_levels)
You now have a factor that you can sort according to the order you set with sort()
sort(y1)
You can check the levels in your factor with levels()
levels(y1)
You can reorder your factor in at least 2 ways:
- By another variable, eg which month has the most birthdays, with fct_reorder():
fct_reorder(factor, variable)
- By moving one or more levels to the end, eg showing “Not applicable” at the end, with fct_relevel():
fct_relevel(factor, "level to pull out", "level to pull out")
You can also edit your factor with fct_recode. This is particularly useful for editing labels for publication. And you can use this technique to combine multiple “old” levels into the same “new” level, eg grouping several levels under “Other”.
data %>% mutate(variable = fct_recode(variable, "new name" = "old name", "new name" = "old name"))
Other modifications you can make to your factor include:
- fct_collapse(), which is like fct_recode(), but works so that for each new variable you can provide a vector of old levels, eg
data %>% mutate(variable = fct_collapse(variable, "new name" = c("old name", "old name", "old name")))
- fct_lump, which crudely lumps together all the “small” groups to simplify a plot or table
data %>% mutate(variable = fct_lump(variable))
Vectors
Note about integers vs doubles: if you just enter 2 in R, it interprets this as 2.000000, ie a double. To enter 2 as an integer you have to write 2L.
typeof(2)
typeof(2L)
typeof(2.5L)
So what can you do with vectors?
You can find out what type of vector a vector is with typeof()
You can find out how many items there are in a vector with length()
You can specify how to read a vector when you import with readr by using col_types specifications, for example:
# col_types is an argument of read_csv(), so it goes inside the same call.
# Use one entry per real column name, with the parser that column needs.
data <- read_csv(
  "data.csv",
  col_types = cols(
    column_name = col_logical(),
    column_name = col_integer(),
    column_name = col_double(),
    column_name = col_character(),
    column_name = col_date(format = ""),
    column_name = col_time(format = ""),
    column_name = col_datetime(format = ""),
    column_name = col_number(), # Isn't fussy about numbers containing commas, $ etc
    column_name = col_skip(), # Skips importing this column
    column_name = col_guess() # Guesses how to parse based on input
  )
)
You can convert one type of vector to another with as.logical(), as.integer(), as.double() or as.character()
You can perform basic maths operations on vectors, ie don’t need to iterate for every item in the vector
You can create and name a vector with c()
Lists
Can contain a mix of atomic vector types and other lists
It’s useful to use str() because this will give you the structure of your list, rather than the contents
To extract a single item from a list use double brackets [[]]; single brackets [] return a smaller list rather than the item itself.
Augmented vectors
Factors, dates, date-times and tibbles are augmented vectors, because they have additional attributes (on top of the usual vector attributes of names, dimensions and class)
Factors have a ‘levels’ attribute
Dates and date-times have additional ‘class’ attributes
Tibbles have additional ‘class’ attributes (tbl_df, tbl and data.frame) as well as column names and row.names
