Setup

Install

# The base R installer is install.packages() (plural); run once per machine
install.packages("package_name")

Load

library(package_name)

Import

From csv

library(tidyverse)
data <- read_csv("folder/filename.csv")

From a googlesheet

library(googlesheets)
# String arguments need straight quotes; curly "smart" quotes are a syntax error in R
sheet <- gs_title("Google sheet name")
data <- sheet %>% gs_read(ws = "Worksheet name")

From a table on a webpage

library(rvest)
url <- read_html('url')
data_raw <- url %>%
              html_node("table") %>%
              html_table(fill=TRUE)

From a PDF

library(SPARQL) # SPARQL querying package
library(ggplot2)

# Step 1 - Set up preliminaries and define query
# Define the data.gov endpoint
endpoint <- "http://services.data.gov/sparql"

# create query statement
query <-
"PREFIX  dgp1187: <http://data-gov.tw.rpi.edu/vocab/p/1187/>
SELECT ?ye ?fi ?ac
WHERE {
?s dgp1187:year ?ye .
?s dgp1187:fires ?fi .
?s dgp1187:acres ?ac .
}"

# Step 2 - Use SPARQL package to submit query and save results to a data frame
qd <- SPARQL(endpoint,query)
df <- qd$results

From multiple csvs

  1. Create a vector of all the datasets
  all_datasets <- tibble("file" = c('dataset_name_1', 'dataset_name_2')) 
  1. Create a function to import one dataset
  import <- function(df){
    # `df` is a file name without its extension: append ".csv" and read that file
    read_csv(str_c(df, ".csv"))
  }
  1. Use purrr::map() to run the function on (each element) of all the datasets
  all_data <- all_datasets %>% mutate(dataset = purrr::map(file, import))
  1. Run a ‘for loop’ to get R to assign each tibble to an object
  # seq_len() gives an empty sequence for zero rows (1:0 would loop over 1 and 0)
  for(i in seq_len(nrow(all_data))){
    # Create an object named after the file, holding the matching tibble
    assign(all_data$file[[i]], all_data$dataset[[i]])
  }

From multiple webpages

  1. Create a vector with the URL paths needed, eg:
  url_unique <- # function to list these, eg using seq() if numerical
  url_base <- "https://abc"
  url_end <- "/abc.html"
  url_path <- url_unique %>% str_c(url_base, . , url_end)
  data <- tibble(url_path)
  1. Define a function to scrape (and clean) data from one url
  scrape <- function(url){
    # Read the page, take the first <table> node and parse it into a data frame
    read_html(url) %>%
      html_node("table") %>%
      html_table(fill = TRUE)
  }
  1. (if needed) Prevent error messages from URL paths that don’t exist
  # Return the `result` element of its argument; it is mapped over the
  # safely(scrape) output column in the next step
  extract_results <- function(df){
    df$result
  }
  1. Loop scrape function over all URLs as nested tibbles
  data <- data %>%
            mutate(new_column = purrr::map(url_path, safely(scrape)),
            new_column = map(new_column, extract_results))

Save as csv

write_csv(dataset, "filename.csv")

Tidy

Inspect

Get list of all column headings in a dataset

colnames(dataset)

Get the number of rows and columns in a dataset

# dim() returns c(number of rows, number of columns)
dim(dataset)

Get the breakdown of results within a column

table(dataset$column)

Get the number of values in a column

summary(dataset$column)

Get the number of NA values in a column

sum(is.na(dataset$column))

tidyr

Wide –> skinny

gather(dataset, "new_column_name1", "new_column_name2", column range eg '2:8')

Skinny –> wide

spread(dataset, $keycolumn, $valuecolumn)

Combine datasets by a common column

merge(dataset1, dataset2, by = "common column name")

Combine datasets by attaching rows

# bind_rows() stacks the rows of both data frames, matching columns by name
# (append() would return a plain list of columns instead)
bind_rows(dataset1, dataset2)

Create a new column

mutate(old_variable, new_variable = (calculation))

Rename a column

# Names containing spaces must be wrapped in backticks, eg `new name` = `old name`
rename(dataset, new_name = old_name)

Rename categories within a column

# Each pair is "new level" = "old level"
data %>% 
  mutate(variable = fct_recode(variable, "new name" = "old name", 
                                         "new name" = "old name"))

Alternatives (more for correcting typos like Find & Replace): dataset$column[dataset$column == "old"] <- "new"

"new_name" <- length(grep("existing_label", dataset$column))

Drop column(s) / row(s)

subset(dataset, select = -c(columns))
subset(dataset, column != "")

Remove NAs

na.omit(dataset)

Arrange a column in descending order

# desc() reverses the sort order of the column
arrange(dataset, desc(column))

stringr

Filter a column if contains text string

# Returns TRUE/FALSE per element; use straight quotes around the pattern
grepl("text string", variable)

Remove symbols, eg % sign

dataset$column <- gsub("\\%","", dataset$column)

Convert character column to numeric

dataset$column <- as.numeric(as.character(dataset$column))

Round numbers

mutate(variable = round(variable))

janitor

Clean column heading names (to lowercase with underscores)

clean_names()

Create a quick pivot table to summarise variables

tabyl(variable1, variable2) %>% 
adorn_*

Remove empty rows and columns

remove_empty() 

Fix dates stored as serial numbers

excel_numeric_to_date()

Transform

dplyr

Select columns

select(variable, variable, variable)

Select rows that meet criteria

filter(variable == "..." | variable == "...")

Analyse (like a pivot table)

group_by(column) %>% 
  summarise(calculation)

Classic code structures

  • Pivot and perform calculations on data
# Each step is passed to the next with the pipe %>% (not separated by commas)
data_analysis <- raw_data %>%
                       filter() %>%
                       group_by() %>%
                       summarise()

…which sounds like: “The original dataframe is XXX, now filter data to only include rows that satisfy the conditions YYY, now group the data at each level of the variable(s) ZZZ, now summarize the data and calculate summary functions XXX…”

  • Pivot data to show percentage breakdown of each column
# Count rows per (column_1, column_2) pair, convert counts to shares,
# then spread column_2 into one column per category
data_analysis <- raw_data %>%
                       group_by(column_1, column_2) %>%
                       summarise(n = n()) %>%
                       mutate(pct = n / sum(n)) %>%
                       spread(column_2, pct)

Calculations

sum()
count()
unique()
n_distinct()

Count unique values in a column and arrange in descending order with

data %>% count(column) %>% arrange(desc(n))

apply

Pivot one column of data (use 2 for rows) with

sapply(dataset, function, 1)

Get the number of unique values in every column

rapply(data,function(x)length(unique(x)))

stringr

JOIN with str_c()

  • sep = how you combine the columns, eg sep = “,” for comma separated values

  • collapse = how you combine the rows, eg collapse = “-”, to add a hyphen between each vector

SPLIT with str_split()

GET MATCHES

  • str_view() to see results in Viewer window

  • str_detect() to get TRUE/FALSE response

  • str_count() to get counted response

GET MATCHES WHEN WORKING WITH A TABLE with tidyr::extract(old_column, "new_column", "regular expression")

GET LOCATION with str_locate()

SUBSET with str_sub()

COUNT LENGTH with str_length()

WRAP with str_wrap

  • width = line width in characters

  • indent = indentation of first line in each paragraph

  • exdent = indentation of following lines in each paragraph

regex

aka “regular expressions”

- `.` = any character (wildcard)

- `^` = start of the string

- `$` = end of the string

- `match = TRUE` = view just those matches that meet criteria

- `\d:` any digit

- `\s:` any whitespace (space, tab, newline)

- `[abc]:` matches a, b, or c

- `[^abc]:` matches anything except a, b, or c

Visualise 🤩

Basics

Classic code structure

# Template only: replace viz_function with a geom_*() and pick one scale_* function
ggplot(dataset) +
     viz_function(aes(x = variable,
                      y = variable,
                      group = variable,
                      size = variable,
                      fill = variable)) +
   scale_x/y_discrete/continuous(breaks = c(), 
                      limits = c(), 
                      labels = c()) +
   labs(title = "...",
        subtitle = "...",
        x = "...",
        y = "...",
        caption = "...") +
   theme()

# ggsave() is a separate call, not a layer added with +;
# by default it saves the last plot displayed
ggsave("name.filetype", height = , width = , units = "")

Additional layers

geom_hline() 
geom_vline()
geom_segment()
geom_text()
geom_rect()

Transform data within a ggplot function

plot1 <- ggplot(filter(dataset, variable == criteria))

Legends

Reorder

  1. First create a factor:
# levels sets the display order; straight quotes are required around each level
dataset$newcolumn <- factor(dataset$column, levels = c("item1", "item2"), ordered = TRUE)
  1. Then in ggplot() aesthetic mappings, use this new column, rather than the pre-existing one.

Remove

theme(legend.position = "none")

Axes

Arrange in descending order

# reorder() sorts variable1 by variable2; the minus sign makes it descending
ggplot(aes(x = reorder(variable1, -variable2), y = variable2))

Text

Remove / style axes labels

axis.title.x/y = element_blank() or element_text(size = , family = , font = )
axis.text.x/y = element_blank() or element_text(size = , family = , font = )

Wrap long labels

scale_x_discrete(labels = function(x) str_wrap(x, width = 10))

Rotate labels 45 degrees

theme(axis.text.x=element_text(angle=45))

Themes

From ggplot

  • theme_gray() – signature ggplot2 theme

  • theme_bw() – dark on light ggplot2 theme

  • theme_linedraw() – uses black lines on white backgrounds only

  • theme_light() – similar to linedraw() but with grey lines as well

  • theme_dark() – lines on a dark background instead of light

  • theme_minimal() – no background annotations, minimal feel.

  • theme_classic() – theme with no grid lines

  • theme_void() – empty theme with no elements

From ggthemes

  • theme_solid() - Theme with nothing other than a background color

  • theme_map() - Clean theme for maps

  • theme_igray() - Inverse gray theme

  • theme_economist()

  • theme_fivethirtyeight()

  • theme_wsj()

  • theme_few()

  • theme_tufte()

  • theme_excel()

  • theme_gdocs()

  • theme_stata()

  • theme_solarized()

  • theme_hc() - Highcharts JS theme

Interactive ggplots

  1. ggiraph
# The tooltip aesthetic and geom_*_interactive() layers come from the ggiraph package
library(ggiraph)
ggplot(data, aes(x = , y = , tooltip = variable)) +
    geom_???_interactive()
  1. plotly
library(plotly)
ggplotly(plot, tooltip = variable)
  1. highcharter

  2. htmlwidgets

  3. RShiny

  • To make a shiny app, you need to create a script with a filename ending “app.R”

  • The basic structure of an R Shiny app:

library(shiny)
ui <- fluidPage()
input() functions, eg sliderInput()
output() functions, eg plotOutput()

server <- function(input, output) { }
output$xxxx <- render*(), eg renderPlot()

shinyApp(ui = ui, server = server)

Geographical maps

Introduction

There seem to be lots of different ways you can make maps in R: partly because the best approach will depend on what type of map you’re making; and partly just because there are lots of competing packages available.

  • Ways of getting map data

    1. Vector image maps, eg sf package or maps package and map_data() function

    2. Raster image maps, eg raster package

  • Packages for plotting map data

    1. ggplot, eg geom_polygon() and geom_sf()

    2. tmap and tmaptools (dunno)

    3. leaflet (dunno)

A note on shapefiles

  • Raster

    • static image files generated previously by the mapping service, which limits your ability to redraw or change the appearance of the geographic map

    • tradeoff means you can immediately focus on incorporating additional data into the map; better for gridded data, like satellite imagery

  • Vector

    • spatial data files which contain detailed information necessary to draw all the components of a map (e.g. points, lines, polygons)

    • requires importing this data into R; once imported though, ggplot2 works with simple features (sf) data frames to easily generate geospatial visualizations using all the core elements and approaches of ggplot()

  • Where to get shapefiles: http://www.naturalearthdata.com/downloads/

Examples

  1. Create a basic map of the world (minus Antarctica)
library(maps)
library(ggplot2)
world_map <- map_data("world") %>% 
  filter(region != "Antarctica") %>% 
  ggplot(aes(x = long, y = lat, group = group)) + 
  geom_polygon(fill = "light blue") + 
  coord_fixed() +
  theme_void()
world_map
  1. Create a basic map of a country, eg UK
library(maps)
library(ggplot2)
UK_map <- map_data("world", region = "UK") %>% 
  ggplot(aes(x = long, y = lat, group = group)) + 
  geom_polygon(fill = "dark blue") + 
  coord_fixed() +
  theme_void()
UK_map
  1. Create a detailed map of a country, eg counties within UK Using sf and ggplot packages as per the following tutorials:
library(maps)
library(tidyverse)
library(sf)

UK_map <- map_data("world", region = "UK") %>%
  
  ggplot(aes(x = long, y = lat, group = group)) + 
  geom_polygon(fill = "dark blue") + 
  coord_fixed() +
  theme_void()
UK_map

Tile grid maps

geofacet

Package for creating tile grid maps for regions including:

  • london_boroughs_grid

  • uk_regions1

  • world_86countries_grid & world_countries_grid1

  • china_prov_grid1

Related: countrycode package

Model

“All models are wrong, but some are useful”

The goal of a model is not to uncover truth, but to discover a simple approximation that is useful (in order to understand the behaviour of something).

  • linear model takes the general form:
y = a_0 + a_1 * x

where

  • a_0 = the intercept (the value of y when x = 0)

  • a_1 = the slope (the change in y for each one-unit increase in x)

(NB this is another way of expressing y = mx + c)

Linear models also assume that the ‘residuals’ (difference between observed and predicted values) have a normal distribution.

lm

or, how to apply a linear model to data

  1. Create a model (a type of list) with the function lm(), where x_variable is the independent ‘cause’ and y_variable is the dependent ‘consequence’:
# Formula reads "y_variable explained by x_variable"
model_name <- lm(y_variable ~ x_variable, data = dataset)
  1. Create a ‘grid’ of your data (evenly spaced grid of points from the data)
grid_name <- data %>% data_grid(x_variable)
  1. Add predictions to your gridded data using the model you created
# add_predictions() appends the model's predictions as a new column (named pred by default)
grid_name <- grid_name %>% add_predictions(model_name)
  1. To visualise the linear model, do the usual ggplot of the distribution of the dataset, then add a geom_line layer using the grid_name as its data, eg
ggplot(dataset, aes(x_variable, y_variable)) +
  # bins is the number of hexagons in each direction; adjust to taste
  geom_hex(bins = 50) +
  # The grid must be passed as data = (the first argument of a geom is the mapping);
  # y comes from the pred column created by add_predictions()
  geom_line(data = grid_name, aes(y = pred))

geom_smooth

or, modelling with ggplot (notes taken from Data visualization: a practical introduction).

Basic linear model:

geom_smooth(method = "lm")

Specify linear model:

# The formula is written in terms of the x and y aesthetics, not the column names,
# eg y ~ x or y ~ poly(x, 2)
geom_smooth(method = "lm", formula = y ~ x)

Iterate

Using the book ‘R for Data Science’ and the tidyverse package

Functions

  • Functions allow you to automate tasks, as opposed to copy-and-pasting similar bits of code

  • There are huge advantages in using a function rather than copy-and-paste: if you want to make a change, you only need to do it once; and you’re less likely to make typos and mistakes.

  • Use base R to write functions

There are 3 basic building blocks for creating a function.

  1. A name

  2. The inputs (or arguments)

  3. The ‘body’ of the function, ie code that goes between {}

…which comes together as:

function_name <- function(arguments){ function code }

What you get back from the function will by default be the last statement the function evaluates.

BUT you can also choose to use:

  • return() to exit early from the middle of the function code, which can make it easier to understand, eg if you’ve written very long if statements.

  • invisible() to ensure that something doesn’t get printed out as a result of the function, eg a new dataframe that you’re using to create a plot

Examples

If we were to create a function that adds 2 to the input, it would look like this:

# Return the input with 2 added to every element
add_two <- function(x){
  increment <- 2
  x + increment
}

and you can apply it like this

add_two(2)

you can also apply it to every element of a vector

b <- c(1:5)
add_two(b)

Here’s another example. We can create a fizzbuzz function that takes a single number as input and returns “fizz” if it’s divisible by three and “buzz” if it’s divisible by 5 and “fizzbuzz” if it’s divisible by 3 and 5.

fizzbuzz <- function(x) {

  # Guard clauses: exactly one value, and it must be numeric
  stopifnot(length(x) == 1)
  stopifnot(is.numeric(x))

  # %% is the arithmetic remainder operator; a remainder of 0 means "divisible by"
  divisible_by_3 <- x %% 3 == 0
  divisible_by_5 <- x %% 5 == 0

  # && is the scalar logical 'and'
  if (divisible_by_3 && divisible_by_5) {
    return("fizzbuzz")
  }
  if (divisible_by_3) {
    return("fizz")
  }
  if (divisible_by_5) {
    return("buzz")
  }

  # Not divisible by 3 or 5: hand the number back unchanged
  x
}

fizzbuzz(378)

It’s useful to have some naming conventions for your function arguments. The following are recommended:

  • x, y and z for vectors

  • df for data frames

  • i for the row number (indices)

  • j for the column number (indices)

  • n for the number of rows

  • p for the number of columns

It’s also useful to add a generic error message to your functions with stopifnot(), which you can use to assert what SHOULD be true (rather than checking for what might be wrong). For example:

# Template: assert what must be true about the arguments, then run the body
function_name <- function(x, y, na.rm = FALSE) {
  # na.rm must be a single logical value
  stopifnot(is.logical(na.rm), length(na.rm) == 1)
  # x and y must have the same length
  stopifnot(length(x) == length(y))
  
  function code
}

For loops

  • Functions help to reduce duplication of repeated patterns of code

  • Iteration helps to reduce duplication of repeated operations on multiple inputs, eg performing the same operation on different columns or different datasets

  • There are 2 types of iteration:

    1. Imperative programming, eg for loops and while loops

    2. Functional programming means even less duplication than you get in imperative programming

For loops: the basics

  • Every for loop has 3 components:

    1. An output, ie specify what you want the output of your for loop to be, eg a vector and details like its vector type; or you could also specify your output as being a tibble, a factor etc.

    2. A sequence, ie what to loop over & involves assigning i (like ‘it’) to a different value, so for i in _____

    3. The body {}, ie the code that does the work, each time with a different value for i

Examples An example of a simple for loop

output <- vector("double", ncol(mtcars)) # The vector should be of the type 'double' and with the same number of elements as there are columns in mtcars
            names(output) <- names(mtcars) # The names of the elements should be the names of the columns in mtcars
            for (i in names(mtcars)) { # For every column in mtcars
                   output[i] <- mean(mtcars[[i]]) # make the i'th element of the vector output the mean of the i'th column in mtcars (where i'th = 1st, 2nd, 3rd etc)
            }
output # Print output

A similar example that doesn’t require the line names(output) <- names(mtcars) but gives results without column headings would be:

# Preallocate a double vector with one element per column of mtcars
output <- vector("double", ncol(mtcars))
# seq_along() is safe even when there are zero columns (1:0 would loop over 1 and 0)
for (i in seq_along(mtcars)) {
  # Store the mean of the i'th column in the i'th element of output
  output[i] <- mean(mtcars[[i]])
}
output # Print output

For loops: variations

  • For loops aren’t limited to creating new objects; you can also use them to change existing ones by just removing the assignment at the start

  • For loops can loop over names and values (not just indices / number of columns)

    • loop over elements with for (x in xs)

    • loop over names with for (nm in names(xs))

  • For loops can handle outputs and sequences of unknown length with ‘while’ loops (although this is most useful in context of simulation, so unlikely to need)

Examples

How to load a directory full of csvs into a single data frame using a for loop.

  1. Create a vector with the filename paths
files <- dir("data/", pattern = "\\.csv$", full.names = TRUE)
  1. Use a for loop that preallocates a list to the filenames and loads them in with read_csv()
# Preallocate one list slot per file, then fill each slot with the parsed csv
df <- vector("list", length(files))
for (i in seq_along(files)) {
  df[[i]] <- read_csv(files[[i]])
}
  1. Combine them into a single dataframe
df <- bind_rows(df)

Purrr

  • purrr = better than for loops because it makes code easier to write and to read by focusing on the operation being performed (and not the bookkeeping required to loop over every element and store the output!)

  • Key idea of functional programming = passing one function to another function

  • It eliminates the need for many common for loops and is similar to the apply() family of functions

  • Key idea is to have functions that perform the common pattern of:

    1. Looping over a vector

    2. Doing something to each element

    3. Saving the results

  • There are different purrr functions for different types of output, as follows:

    • map() to make a list

    • map_lgl() to make a logical vector

    • map_int() to make an integer vector

    • map_dbl() to make a double vector

    • map_chr() to make a character vector

  • There are also useful shortcuts you can use within purrr:

    • ~ will replace function(x) when you want to create an anonymous function

    • . will refer to the current list element (a bit like how i works in a for loop)

Here’s the basic structure for using a map function:

map_vectortype(dataset, function)

Examples Calculate the mean of every column in mtcars

map_dbl(mtcars, mean)

Compute the number of unique values in each column of iris

# V1 WITHOUT SHORTCUTS
map_int(iris, function(x) length(unique(x)))
# V2 WITH SHORTCUTS
map_int(iris, ~ length(unique(.)))

Use a map function within a mutate, using a function you’ve created, ie applying a function to every variable within a dataset

# Example dataset
a <- tibble(x = c(1:4), y = c(100:103))
a

# Example function (the old add_two created earlier in the Functions chapter notes)
add_two <- function(x){
  # Add a fixed increment of 2 to every element of x
  increment <- 2
  x + increment
}

# Example map function
c <- a %>%
  mutate(z = map_dbl(x, add_two),
         z2 = map(x, add_two))
c
c$z2

Apply multiple map functions to nested tibbles, ie applying a set of functions to separate datasets

# Create function, in this case to filter a dataset by Petal.Length > 5
filter_petal <- function(df){
  # Direct call instead of a pipe: keep only the rows with Petal.Length above 5
  filter(df, Petal.Length > 5)
}

# Turn a normal dataframe into a dataframe of nested tibbles, in this case the iris dataset nested by species
iris_mod <- iris %>%
  group_by(Species) %>%
  nest()

# Create 3 new columns in all of the nested tibbles
iris_mod %>%
  
  # Use map to apply the new function filter_petal to each of the separate datasets for Species within iris_mod (We use map() because the output from applying filter_petal to data is a list.)
  mutate(fl_petal = purrr::map(data, filter_petal), 
         
  # If we want to add the number of rows in each dataset, we would then use map_dbl(), because the output from nrow is a single number.
  n_full_data = purrr::map_dbl(data, nrow), 
  
  # Ditto - use map_dbl() for this output, which is the number of rows in our new column, fl_petal, which we created above
  n_petal_large = purrr::map_dbl(fl_petal, nrow)) 

Iterate with ggplot

Add creating a function to repeat the same ggplot & programmatically writing titles, captions etc <<

Iterate with walk

Iteratively save files

Use walk() and its variants walk2 and pwalk() when you want to call a function for its side effects, rather than its return value.

This is the case when you want to render output to the screen or save a file to disk.

For example, if you had a list of plots and a vector of filenames, you can use pwalk() as follows:

# Create a list of multiple plots using the map() function
plots <- mtcars %>% 
          split(.$cyl) %>% 
          map(~ggplot(., aes(mpg, wt)) + geom_point())
plots

# Create an iterative filenaming structure for each plot, in this case the names of the plots followed by .pdf
paths <- stringr::str_c(names(plots), ".pdf")

# Save each plot to the path names defined in 'paths' by combining the plots and the pathnames in a list and saving that list
pwalk(list(paths, plots), ggsave, path = tempdir())

Communicate

RMarkdown

Choose a markdown theme from Bootswatch, including:

  • cosmo ⭐️

  • journal

  • darkly (black)

  • flatly

  • lumen

  • paper

  • readable

  • sandstone

  • simplex

  • slate (grey)

  • superhero (blue)

Create a website in RMarkdown with bookdown

Notes

Factors

  • Factors are used for categories

  • Factors can either have an order that’s “arbitrary” (eg hair colour) or “principled” (eg months)

  • They’re esp useful if you want to display character vectors in non-alphabetical order

  • Use the forcats package (part of the core tidyverse since tidyverse 1.2.0) for working with factors

For example, say you have a string of months

library(forcats)
x1 <- c("Dec", "Apr", "Jan", "Mar")

To create a “factor” of months over a year:

  1. First create a list of the different months, or levels, in the order they should appear
month_levels <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
  1. Then create a factor with the function factor() and levels =
y1 <- factor(x1, levels = month_levels)

You now have a factor that you can sort according to the order you set with sort()

sort(y1)

You can check the levels in your factor with levels()

levels(y1)

You can reorder your factor in at least 2 ways:

  1. By another variable, eg which month has the most birthdays, with fct_reorder():
fct_reorder(factor, variable)
  1. By moving one or more levels to the end, eg showing “Not applicable” at the end, with fct_relevel():
fct_relevel(factor, "level to pull out", "level to pull out")

You can also edit your factor with fct_recode. This is particularly useful for editing labels for publication. And you can use this technique to combine multiple “old” levels into the same “new” level, eg grouping several levels under “Other”.

data %>% mutate(variable = fct_recode(variable, "new name" = "old name", "new name" = "old name"))

Other modifications you can make to your factor include:

  • fct_collapse(), which is like fct_recode(), but works so that for each new variable you can provide a vector of old levels, eg
# Each new level is given a vector of the old levels it replaces
data %>% mutate(variable = fct_collapse(variable, "new name" = c("old name", "old name", "old name")))
  • fct_lump, which crudely lumps together all the “small” groups to simplify a plot or table
data %>% mutate(variable = fct_lump(variable))

Vectors

  • Vectors are essentially variables within a tibble

  • There are 3 types of vector:

    1. Atomic which can be further subcategorised into:

      • logical (TRUE, FALSE and NA)

      • integer (numeric without decimal places, eg 243)

      • double (numeric with decimal places, eg 243.5678)

      • character (made up of strings)

      • complex (rarely used during data analysis)

      • raw (ditto)

    2. Lists (aka ‘recursive vectors’ and can contain other lists, so they’re good for representing hierarchies in data)

    3. NULL which is the absence of a vector (and distinct from NA which is the absence of a value in a vector)

Note about integers vs doubles: if you just enter 2 in R, it interprets this as 2.000000, ie a double. To enter 2 as an integer you have to write 2L.

typeof(2)
typeof(2L)
typeof(2.5L)

So what can you do with vectors?

  • You can find out what type of vector a vector is with `typeof()`

  • You can find out how many items there are in a vector with `length()`

  • You can specify how to read a vector when you import with readr by using col_types specifications, for example:

# col_types is an argument of read_csv(), so it goes inside the call;
# replace each column_name with a real column heading
data <- read_csv("data.csv",
              col_types = cols(column_name = col_logical(),
                               column_name = col_integer(),
                               column_name = col_double(),
                               column_name = col_character(),
                               column_name = col_date(format = ""),
                               column_name = col_time(format = ""),
                               column_name = col_datetime(format = ""),
                               column_name = col_number(), # Isn't fussy about numbers containing commas, $ etc
                               column_name = col_skip(), # Skips importing this column
                               column_name = col_guess() # Guesses how to parse based on input
                               ))
  • You can convert one type of vector to another with as.logical(), as.integer(), as.double() or as.character()

  • You can perform basic maths operations on vectors, ie don’t need to iterate for every item in the vector

  • You can create and name a vector with c()

Lists

  • Can contain a mix of atomic vector types and other lists

  • It’s useful to use str() because this will give you the structure of your list, rather than the contents

  • To subset items within a list use double brackets [[]] rather than single [].

Augmented vectors

  • Factors, dates, date-times and tibbles are augmented vectors, because they have additional attributes (on top of the usual vector attributes of names, dimensions and class)

    • Factors have a ‘levels’ attribute

    • Dates and date-times have additional ‘class’ attributes

    • Tibbles have additional ‘class’ attributes (tbl_df, tbl and data.frame) as well as column names and row.names

---
title: "Cookbook"
output:
  html_notebook:
    theme: darkly
    toc: yes
    toc_float: yes
---

# Setup

### Install
```
# The base R installer is install.packages() (plural); run once per machine
install.packages("package_name")
```

### Load
```
library(package_name)
```

# Import

### From csv
```
library(tidyverse)
data <- read_csv("folder/filename.csv")
```

### From a googlesheet
```
library(googlesheets)
# String arguments need straight quotes; curly "smart" quotes are a syntax error in R
sheet <- gs_title("Google sheet name")
data <- sheet %>% gs_read(ws = "Worksheet name")
```

### From a table on a webpage
```
library(rvest)
url <- read_html('url')
data_raw <- url %>%
              html_node("table") %>%
              html_table(fill=TRUE)
```

### From a PDF
- Try `pdftools` package & [this tutorial](https://www.brodrigues.co/blog/2018-06-10-scraping_pdfs/?utm_campaign=Data_Elixir&utm_medium=email&utm_source=Data_Elixir_187)

- Using SPARQL (code from [R bloggers](https://www.r-bloggers.com/sparql-with-r-in-less-than-5-minutes/amp/)):
```
library(SPARQL) # SPARQL querying package
library(ggplot2)

# Step 1 - Set up preliminaries and define query
# Define the data.gov endpoint
endpoint <- "http://services.data.gov/sparql"

# create query statement
query <-
"PREFIX  dgp1187: <http://data-gov.tw.rpi.edu/vocab/p/1187/>
SELECT ?ye ?fi ?ac
WHERE {
?s dgp1187:year ?ye .
?s dgp1187:fires ?fi .
?s dgp1187:acres ?ac .
}"

# Step 2 - Use SPARQL package to submit query and save results to a data frame
qd <- SPARQL(endpoint,query)
df <- qd$results
```

### From multiple csvs

  1. Create a vector of all the datasets
  
```
  all_datasets <- tibble("file" = c('dataset_name_1', 'dataset_name_2')) 
```

  2. Create a function to import one dataset
```
  import <- function(df){
    # `df` is a file name without its extension: append ".csv" and read that file
    read_csv(str_c(df, ".csv"))
  }
```

  3. Use purrr::map() to run the function on (each element) of all the datasets
```
  all_data <- all_datasets %>% mutate(dataset = purrr::map(file, import))
```
  
  4. Run a 'for loop' to get R to assign each tibble to an object
```
  # seq_len() gives an empty sequence for zero rows (1:0 would loop over 1 and 0)
  for(i in seq_len(nrow(all_data))){
    # Create an object named after the file, holding the matching tibble
    assign(all_data$file[[i]], all_data$dataset[[i]])
  }
```
  
### From multiple webpages

  1. Create a vector with the URL paths needed, eg:
```
  url_unique <- # function to list these, eg using seq() if numerical
  url_base <- "https://abc"
  url_end <- "/abc.html"
  url_path <- url_unique %>% str_c(url_base, . , url_end)
  data <- tibble(url_path)
```
  
  2. Define a function to scrape (and clean) data from one url
```
  scrape <- function(url){
    # Read the page, take the first <table> node and parse it into a data frame
    read_html(url) %>%
      html_node("table") %>%
      html_table(fill = TRUE)
  }
```
  
  3. (if needed) Prevent error messages from URL paths that don't exist
```
  # Return the `result` element of its argument; it is mapped over the
  # safely(scrape) output column in the next step
  extract_results <- function(df){
    df$result
  }
```
  
  4. Loop scrape function over all URLs as nested tibbles
```
  data <- data %>%
            mutate(new_column = purrr::map(url_path, safely(scrape)),
            new_column = map(new_column, extract_results))
```

### Save as csv
```
write_csv(dataset, "filename.csv")
```

# Tidy
### Inspect
Get list of all column headings in a dataset
```
colnames(dataset)
```

Get the number of rows and columns in a dataset
```
# dim() returns c(number of rows, number of columns)
dim(dataset)
```

Get the breakdown of results within a column
```
table(dataset$column)
```

Get the number of values in a column
```
summary(dataset$column)
```

Get the number of NA values in a column
```
sum(is.na(dataset$column))
```

### tidyr

Wide --> skinny
```
gather(dataset, "new_column_name1", "new_column_name2", column range eg '2:8')
```

Skinny --> wide
```
spread(dataset, $keycolumn, $valuecolumn)
``` 

Combine datasets by a common column
```
merge(dataset1, dataset2, by = "common column name")
``` 

Combine datasets by attaching rows
```
# bind_rows() stacks the rows of dataset2 underneath dataset1, matching columns by name
# (base append() would instead concatenate the two data frames' columns into a plain list)
bind_rows(dataset1, dataset2)
``` 

Create a new column
```
mutate(old_variable, new_variable = (calculation))
```

Rename a column
```
rename(dataset, new name = old name)
```

Rename categories within a column
```
data %>% 
  mutate(variable = fct_recode(variable, "new name" = "old name", 
                                         "new name" = "old name"))
```

Alternatives (more for correcting typos like Find & Replace):
  ```
  dataset$column[dataset$column == "old"] <- "new"
  ```

  ```
  "new_name" <- length(grep("existing_label", dataset$column))
  ```

Drop column(s) / row(s)
``` 
subset(dataset, select = -c(columns))
subset(dataset, column != "")
``` 

Remove NAs
```
na.omit(dataset)
``` 

Arrange a column in descending order
```
arrange(dataset, desc(column))
```

### stringr

**Filter a column if contains text string**

```
grepl("text string", variable) # TRUE/FALSE for each element of variable
``` 

**Remove symbols, eg % sign**

```
dataset$column <- gsub("\\%","", dataset$column)
```

**Convert character column to numeric**

```
dataset$column <- as.numeric(as.character(dataset$column))
```

**Round numbers**

```
mutate(variable = round(variable))
```

### [janitor](https://github.com/sfirke/janitor)

**Clean column heading names (to lowercase with underscores)**

```
clean_names()
```

**Create a quick pivot table to summarise variables**

```
tabyl(variable1, variable2) %>% 
adorn_*
```

**Remove empty rows and columns**

```
remove_empty() 
```

**Fix dates stored as serial numbers**

```
excel_numeric_to_date()
```

# Transform
### dplyr

Select columns

```
select(variable, variable, variable)
```

Select rows that meet criteria

```
filter(variable == "..." | variable == "...")
```

Analyse (like a pivot table)

```
group_by(column) %>% 
  summarise(calculation)
```

**Classic code structures**

- Pivot and perform calculations on data

```
# Each step is piped into the next one with %>%
data_analysis <- raw_data %>%
                   filter() %>%
                   group_by() %>%
                   summarise()
```
...which sounds like:
"The original dataframe is XXX,
	now filter data to only include rows that satisfy the conditions YYY,
	now group the data at each level of the variable(s) ZZZ,
	now summarize the data and calculate summary functions XXX…"

- Pivot data to show percentage breakdown of each column

```
# column_1 and column_2 are placeholders for the two grouping columns
data_analysis <- raw_data %>%
                       group_by(column_1, column_2) %>%
                       summarise(n = n()) %>%
                       mutate(pct = n / sum(n)) %>%
                       spread(column_2, pct)
```

**Calculations**

```
sum()
count()
unique()
n_distinct()
```

Count unique values in a column and arrange in descending order with
```
data %>% count(column) %>% arrange(desc(n))
```

### apply

Pivot one column of data (use 2 for rows) with 
```
sapply(dataset, function, 1)
```

Get the number of unique values in every column
```
rapply(data,function(x)length(unique(x)))
```

### stringr

JOIN with `str_c()`

- `sep =` how you combine the columns, eg sep = "," for comma separated values

- `collapse =` how you combine the rows, eg collapse = "-", to add a hyphen between each vector

SPLIT with `str_split()`

GET MATCHES

- `str_view()` to see results in Viewer window

- `str_detect()` to get TRUE/FALSE response

- `str_count()` to get counted response

GET MATCHES WHEN WORKING WITH A TABLE with `tidyr::extract(old_column, "new_column", "regular expression")`

GET LOCATION with `str_locate()`

SUBSET with `str_sub()`

COUNT LENGTH with `str_length()`

WRAP with `str_wrap`

- `width =` line width in characters

- `indent =` indentation of first line in each paragraph

- `exdent =` indentation of following lines in each paragraph

### regex
aka "regular expressions"

	- `.` = any character (wildcard)
	
	- `^` = start of the string
	
	- `$` = end of the string
	
	- `match = TRUE` = view just those matches that meet criteria
	
	- `\d:` any digit
	
	- `\s:` any whitespace (space, tab, newline)
	
	- `[abc]:` matches a, b, or c
	
	- `[^abc]:` matches anything except a, b, or c

# Visualise 🤩 

### Basics

Classic code structure
```
ggplot(dataset) +
  viz_function(aes(x = variable,
                   y = variable,
                   group = variable,
                   size = variable,
                   fill = variable)) +
  scale_x/y_discrete/continuous(breaks = c(),
                                limits = c(),
                                labels = c()) +
  labs(title = "...",
       subtitle = "...",
       x = "...",
       y = "...",
       caption = "...") +
  theme()

# ggsave() is a separate call, not a layer added with +;
# by default it saves the last plot that was displayed
ggsave("name.filetype", height = , width = , units = "")
```

Additional layers
```
geom_hline() 
geom_vline()
geom_segment()
geom_text()
geom_rect()
```

Transform data within a ggplot function
```
plot1 <- ggplot(filter(dataset, variable == criteria))
```

### Legends

Reorder

  1. First create a factor:
  ```
  dataset$newcolumn <- factor(dataset$column, levels = c("item1", "item2"), ordered = TRUE)
  ```
  
  2. Then in ggplot() aesthetic mappings, use this new column, rather than the pre-existing one.

Remove
```
theme(legend.position = "none")
``` 

### Axes

Arrange in descending order
```
ggplot(aes(x = reorder(variable1, -variable2), y = variable2))
```

### Text

Remove / style axes labels
```
axis.title.x/y = element_blank() or element_text(size = , family = , face = )
axis.text.x/y = element_blank() or element_text(size = , family = , face = )
```

Wrap long labels
```
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))
```

Rotate labels 45 degrees
```
theme(axis.text.x=element_text(angle=45))
``` 

### Themes

From ggplot

- `theme_gray()` – signature ggplot2 theme

- `theme_bw()` – dark on light ggplot2 theme

- `theme_linedraw()` – uses black lines on white backgrounds only

- `theme_light()` – similar to linedraw() but with grey lines as well

- `theme_dark()` – lines on a dark background instead of light

- `theme_minimal()` – no background annotations, minimal feel.

- `theme_classic()` – theme with no grid lines

- `theme_void()` – empty theme with no elements

From ggthemes

- `theme_solid()` - Theme with nothing other than a background color

- `theme_map()` - Clean theme for maps

- `theme_igray()` -  Inverse gray theme

- `theme_economist()`

- `theme_fivethirtyeight()`

- `theme_wsj()` 

- `theme_few()`

- `theme_tufte()`

- `theme_excel()`

- `theme_gdocs()`

- `theme_stata()` 

- `theme_solarized()`

- `theme_hc()` - Highcharts JS theme

### Interactive ggplots

1. **ggiraph**
  ```
  library(ggiraph)
  ggplot(data, aes(x = , y = , tooltip = variable)) +
      geom_???_interactive()
  ```

2. **plotly**
  ```
  library(plotly)
  ggplotly(plot, tooltip = variable)
  ```

3. **[highcharter](http://jkunst.com/highcharter/index.html)**

4. **[htmlwidgets](http://www.htmlwidgets.org/)**

5. **RShiny**

  - To make a shiny app, you need to create a script with a filename ending "app.R"
  
  - The basic structure of an R Shiny app:
  
  ```
  library(shiny)
  ui <- fluidPage()
  input() functions, eg sliderInput()
  output() functions, eg plotOutput()
  
  server <- function(input, output) { }
  output$xxxx <- render*(), eg renderPlot()
  
  shinyApp(ui = ui, server = server)
  ```

### Geographical maps

Introduction

There seem to be lots of different ways to make maps in R: partly because the best approach depends on what type of map you're making, and partly because there are lots of competing packages available.

- Ways of getting map data

    1. Vector image maps, eg sf package or maps package and map_data() function

    2. Raster image maps, eg raster package

- Packages for plotting map data

    1. ggplot, eg geom_polygon() and geom_sf()

    2. tmap and tmaptools (dunno)

    3. leaflet (dunno)

**A note on shapefiles**

- Raster

  - static image files generated previously by the mapping service, which limits your ability to redraw or change the appearance of the geographic map

  - tradeoff means you can immediately focus on incorporating additional data into the map; better for gridded data, like satellite imagery

- Vector 

  - spatial data files which contain detailed information necessary to draw all the components of a map (e.g. points, lines, polygons)

  - requires importing this data into R; once imported though, ggplot2 works with simple features (sf) data frames to easily generate geospatial visualizations using all the core elements and approaches of ggplot()

- Where to get shapefiles: http://www.naturalearthdata.com/downloads/

Examples

1. Create a basic map of the world (minus Antarctica)
```{r world map}
library(maps)
library(ggplot2)
library(dplyr) # provides %>% and filter(), both used below

# Take the "world" outlines, drop Antarctica and draw the remaining
# polygons with a fixed aspect ratio on an empty theme
world_map <- map_data("world") %>%
  filter(region != "Antarctica") %>%
  ggplot(aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "light blue") +
  coord_fixed() +
  theme_void()
world_map
```

2. Create a basic map of a country, eg UK
```{r country map}
library(maps)
library(ggplot2)
library(dplyr) # provides %>%, used below

# Keep only the "UK" region of the "world" outlines and draw it
UK_map <- map_data("world", region = "UK") %>%
  ggplot(aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "dark blue") +
  coord_fixed() +
  theme_void()
UK_map
```

3. Create a detailed map of a country, eg counties within UK 
Using sf and ggplot packages as per the following tutorials:

- https://www.r-spatial.org/r/2018/10/25/ggplot2-sf.html

- https://cfss.uchicago.edu/geoviz_plot.html
```{r shapefile map}
library(maps)
library(tidyverse)
library(sf)

# NOTE(review): sf is loaded but nothing below calls it yet; the code still uses
# map_data("world", region = "UK") and geom_polygon(), exactly like the basic
# country map in example 2. TODO: replace with the sf-based version from the
# tutorials linked above.
UK_map <- map_data("world", region = "UK") %>%
  
  ggplot(aes(x = long, y = lat, group = group)) + 
  geom_polygon(fill = "dark blue") + 
  coord_fixed() +
  theme_void()
# Print the plot
UK_map
```

### Tile grid maps

#### [geofacet](https://hafen.github.io/geofacet/)

Package for creating tile grid maps for regions including:

- `london_boroughs_grid`

- `uk_regions1` 

- `world_86countries_grid` & `world_countries_grid1`

- `china_prov_grid1`

Related: [countrycode](https://github.com/vincentarelbundock/countrycode) package

# Model

**"All models are wrong, but some are useful"**

The goal of a model is not to uncover truth, but to discover a simple approximation that is useful (in order to understand the behaviour of something).

A **linear** model takes the general form:

```
y = a_0 + a_1 * x
```

where 

- `a_0` = starting point

- `a_1` = slope, ie how much `y` changes for each one-unit increase in `x`

(NB this is another way of expressing `y = mx + c`)

Linear models also assume that the 'residuals' (difference between observed and predicted values) have a normal distribution.

### lm

or, how to apply a linear model to data

1. Create a model (a type of list) with the function `lm()`, where `x_variable` is the independent 'cause' and `y_variable` is the dependent 'consequence':

```
model_name <- lm(y_variable ~ x_variable, data = dataset)
```

2. Create a 'grid' of your data (evenly spaced grid of points from the data)

```
grid_name <- data %>% data_grid(x_variable)
```

3. Add predictions to your gridded data using the model you created

```
grid_name <- grid_name %>% add_predictions(model_name)
```

4. To visualise the linear model, do the usual ggplot of the distribution of the `dataset`, then add a `geom_line` layer using the `grid_name` as its data, eg

```
# n_bins is a placeholder for the number of hexagon bins you want.
# The line layer needs its own data (the grid) and its own y:
# add_predictions() stores the predicted values in a column called `pred` by default.
ggplot(dataset, aes(x_variable, y_variable)) +
  geom_hex(bins = n_bins) +
  geom_line(data = grid_name, aes(y = pred))
```

### geom_smooth

or, modelling with ggplot (notes taken from [Data visualization: a practical introduction](https://socviz.co/modeling.html)).

Basic linear model:
```
geom_smooth(method = "lm")
```

Specify linear model:
```
# The formula is written in terms of the x and y aesthetics, not the column names,
# eg formula = y ~ poly(x, 2) for a quadratic fit
geom_smooth(method = "lm", formula = y ~ x)
```

# Iterate

Using the book 'R for Data Science' and the tidyverse package

### Functions

- Functions allow you to automate tasks, as opposed to copy-and-pasting similar bits of code

- There are huge advantages in using a function rather than copy-and-paste: if you want to make a change, you only need to do it once; and you're less likely to make typos and mistakes.

- Use base R to write functions

There are **3 basic building blocks** for creating a function.

1. A name

2. The inputs (or arguments)

3. The 'body' of the function, ie code that goes between {}

...which comes together as:

```function_name <- function(arguments){ function code }```

What you get back from the function will by default be the last statement the function evaluates. 

BUT you can also choose to use:

- **`return()`** in the middle of the function code to signal an early exit and make the code easier to understand, eg if you've written very long `if` statements.

- **`invisible()`** to ensure that something doesn't get printed out as a result of the function, eg a new dataframe that you're using to create a plot

**Examples**

If we were to create a function that adds 2 to the input, it would look like this:
```{r}
# Add 2 to the input; works element-wise, so x can be a single number or a vector.
add_two <- function(x) {
  x + 2
}
```

and you can apply it like this
```{r}
add_two(2)
```

you can also apply it to every element of a vector
```{r}
b <- c(1:5)
add_two(b)
```

Here's another example. We can create a `fizzbuzz` function that takes a single number as input and returns "fizz" if it's divisible by three and "buzz" if it's divisible by 5 and "fizzbuzz" if it's divisible by 3 and 5.
```{r}
# Classify a single number: "fizzbuzz" if it is divisible by both 3 and 5,
# "fizz" if only by 3, "buzz" if only by 5; otherwise return the number itself.
fizzbuzz <- function(x) {

  # Guard clauses: stop with an error unless x is exactly one numeric value
  stopifnot(length(x) == 1)
  stopifnot(is.numeric(x))

  # %% gives the remainder from division, so a remainder of 0 means "divisible by"
  by_three <- x %% 3 == 0
  by_five <- x %% 5 == 0

  # && is the scalar 'and'; the most specific case is checked first
  if (by_three && by_five) {
    return("fizzbuzz")
  }
  if (by_three) {
    return("fizz")
  }
  if (by_five) {
    return("buzz")
  }
  x
}

fizzbuzz(378)
```

It's useful to have some **naming conventions** for your function arguments. The following are recommended:

- `x`, `y` and `z` for vectors

- `df` for data frames

- `i` for the row number (indices)

- `j` for the column number (indices)

- `n` for the number of rows

- `p` for the number of columns

It's also useful to add a generic error message to your functions with **`stopifnot()`**, which you can use to assert what SHOULD be true (rather than checking for what might be wrong). For example:

```
function_name <- function(x, y, na.rm = FALSE) {
  stopifnot(is.logical(na.rm), length(na.rm) == 1)
  stopifnot(length(x) == length(y))
  
  function code
}
```

### For loops

- Functions help to reduce duplication of repeated patterns of code

- Iteration helps to reduce duplication of repeated operations on multiple inputs, eg performing the same operation on different columns or different datasets

- There are 2 types of iteration:

  1. **Imperative programming**, eg for loops and while loops
  
  2. **Functional programming** means even less duplication than you get in imperative programming

**For loops: the basics**

- Every for loop has 3 components:

  1. An output, ie **specify what you want the output of your for loop to be**, eg a vector and details like its vector type; or you could also specify your output as being a tibble, a factor etc.
  
  2. A sequence, ie what to loop over & involves assigning `i` (like 'it') to a different value, so `for i in _____`
  
  3. The body {}, ie the code that does the work, each time with a different value for `i`

**Examples**
An example of a simple for loop  
```{r}
# Output: a double vector with one slot per column of mtcars, each slot named after its column
output <- setNames(vector("double", ncol(mtcars)), names(mtcars))

# Sequence: every column name in mtcars
for (i in names(mtcars)) {
  # Body: store the mean of the column called i in the slot of the same name
  output[i] <- mean(mtcars[[i]])
}

output # Print output
```

A similar example that doesn't require the line `names(output) <- names(mtcars)` but gives results without column headings would be:
```{r}
# Output: a double vector with one (unnamed) slot per column of mtcars
output <- vector("double", ncol(mtcars))

# Sequence: the column positions 1, 2, 3, ...
# seq_along() is used instead of 1:ncol() because it is also correct for zero columns
for (i in seq_along(mtcars)) {
  # Body: make the i'th element of output the mean of the i'th column of mtcars
  output[i] <- mean(mtcars[[i]])
}
output # Print output
```

**For loops: variations**

- For loops aren't limited to creating new objects; you can also use them to change existing ones by just removing the assignment at the start

- For loops can loop over names and values (not just indices / number of columns)
  
  - loop over elements with `for (x in xs)`
  
  - loop over names with `for (nm in names(xs))`

- For loops can handle outputs and sequences of unknown length with 'while' loops (although this is most useful in context of simulation, so unlikely to need)

**Examples**

How to load a directory full of csvs into a single data frame using a for loop.

1) Create a vector with the filename paths
```
files <- dir("data/", pattern = "\\.csv$", full.names = TRUE)
```

2) Use a for loop that preallocates a list to the filenames and loads them in with read_csv()
```
# Preallocate a list with one empty slot per file path in `files`
df <- vector("list", length(files))
# Read each csv into the slot that matches its position in `files`
for (i in seq_along(files)) {
  df[[i]] <- read_csv(files[[i]])
}
```

3) Combine them into a single dataframe
```
df <- bind_rows(df)
```

### Purrr

- **purrr = better than for loops** because it makes code easier to write and to read by focusing on the operation being performed (and not the bookkeeping required to loop over every element and store the output!)

- Key idea of functional programming = passing one function to another function

- It eliminates the need for many common for loops and is similar to the `apply()` family of functions

- Key idea is to have functions that perform the common pattern of:
  
  1. Looping over a vector
  
  2. Doing something to each element
  
  3. Saving the results 

- There are different purrr functions for different types of output, as follows:

  - `map()` to make a list
  
  - `map_lgl()` to make a logical vector
  
  - `map_int()` to make an integer vector
  
  - `map_dbl()` to make a double vector
  
  - `map_chr()` to make a character vector
  
- There are also useful shortcuts you can use within purrr:

  - `~` will replace `function(x)` when you want to create an anonymous function
  
  - `.` will refer to the current list element (a bit like how `i` works in a for loop)
  
Here's the basic structure for using a map function:
```
map_vectortype(dataset, function)
```

**Examples**
Calculate the mean of every column in mtcars
```{r}
map_dbl(mtcars, mean)
```

Compute the number of unique values in each column of iris
```{r}
# V1 WITHOUT SHORTCUTS
map_int(iris, function(x) length(unique(x)))
# V2 WITH SHORTCUTS
map_int(iris, ~ length(unique(.)))
```

Use a map function within a mutate, using a function you've created, ie applying a function to every variable within a dataset
```{r}
# Example dataset
a <- tibble(x = c(1:4), y = c(100:103))
a

# Example function (the old add_two created earlier in the Functions chapter notes)
# Returns its input plus 2
add_two <- function(x) x + 2

# Example map function
c <- a %>%
  mutate(z = map_dbl(x, add_two),
         z2 = map(x, add_two))
c
c$z2
```

Apply multiple map functions to nested tibbles, ie applying a set of functions to separate datasets
```{r}
# Create function, in this case to filter a dataset by Petal.Length > 5
# Keep only the rows of df whose Petal.Length is greater than 5
filter_petal <- function(df) {
  filter(df, Petal.Length > 5)
}

# Turn a normal dataframe into a dataframe of nested tibbles, in this case the iris dataset nested by species
iris_mod <- iris %>%
  group_by(Species) %>%
  nest()

# Create 3 new columns in all of the nested tibbles
iris_mod %>%
  
  # Use map to apply the new function filter_petal to each of the separate datasets for Species within iris_mod (We use map() because the output from applying filter_petal to data is a list.)
  mutate(fl_petal = purrr::map(data, filter_petal), 
         
  # If we want to add the number of rows in each dataset, we would then use map_dbl(), because the output from nrow is a single number.
  n_full_data = purrr::map_dbl(data, nrow), 
  
  # Ditto - use map_dbl() for this output, which is the number of rows in our new column, fl_petal, which we created above
  n_petal_large = purrr::map_dbl(fl_petal, nrow)) 
```

### Iterate with ggplot

>> Add creating a function to repeat the same ggplot & programmatically writing titles, captions etc <<

### Iterate with walk

**Iteratively save files**

Use **`walk()`** and its variants **`walk2`** and **`pwalk()`** when you want to call a function for its side effects, rather than its return value.

This is the case when you want to render output to the screen or save a file to disk.

For example, if you had a list of plots and a vector of filenames, you can use `pwalk()` as follows:
```{r}
# Create a list of multiple plots: split mtcars by cyl, then draw one scatterplot per piece.
# map() is written as purrr::map() because the maps package (attached earlier for the
# geographical maps) also exports a function called map().
plots <- mtcars %>% 
          split(.$cyl) %>% 
          purrr::map(~ggplot(., aes(mpg, wt)) + geom_point())
plots

# Create an iterative filenaming structure for each plot, in this case the names of the plots followed by .pdf
paths <- stringr::str_c(names(plots), ".pdf")

# Save each plot to the path names defined in 'paths': pwalk() calls ggsave() once per
# (path, plot) pair, and path = tempdir() is passed on to every ggsave() call
purrr::pwalk(list(paths, plots), ggsave, path = tempdir())
```

# Communicate

### RMarkdown

Choose a markdown theme from **[Bootswatch](https://bootswatch.com/3/)**, including:

- cosmo ⭐️

- journal

- darkly (black)

- flatly

- lumen

- paper

- readable

- sandstone

- simplex

- slate (grey)

- superhero (blue)

Create a website in RMarkdown with **[blogdown](https://bookdown.org/yihui/blogdown/get-started.html)**

# Bookmarks

- [R for data science](https://r4ds.had.co.nz/)

- [Data visualization: a practical introduction](https://socviz.co/index.html#preface)

- [R Markdown](https://bookdown.org/yihui/rmarkdown/)

- [R and Github](https://happygitwithr.com/git-client.html)

- [Tidyverse: ggplot](https://ggplot2.tidyverse.org/reference/index.html#)

# Notes
### Factors

- Factors are used for **categories**

- Factors can either have an order that's "arbitrary" (eg hair colour) or "principled" (eg months)

- They're esp useful if you want to display character vectors in non-alphabetical order

- Use the **forcats** package (part of the core tidyverse) for working with factors

For example, say you have a string of months
```{r}
library(forcats)
x1 <- c("Dec", "Apr", "Jan", "Mar")
```

To create a "factor" of months over a year:

  1. First create a list of the different months, or **levels**, in the order they should appear
```{r}
month_levels <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
```

  2. Then create a factor with the function factor() and levels = 
```{r}
y1 <- factor(x1, levels = month_levels)
```

You now have a factor that you can **sort** according to the order you set with sort()
```{r}
sort(y1)
```

You can **check** the levels in your factor with levels()
```{r}
levels(y1)
```

You can **reorder** your factor in at least 2 ways:

  1. By another variable, eg which month has the most birthdays, with **fct_reorder()**:
```{r}
fct_reorder(factor, variable)
```
  
  2. By moving one or more levels to the end, eg showing "Not applicable" at the end, with **fct_relevel()**:
```{r}
# after = Inf puts the named levels at the end; without it they are moved to the front
fct_relevel(factor, "level to pull out", "level to pull out", after = Inf)
```

You can also **edit** your factor with **fct_recode**. This is particularly useful for editing labels for publication. And you can use this technique to combine multiple "old" levels into the same "new" level, eg grouping several levels under "Other".

```{r}
data %>% mutate(variable = fct_recode(variable, "new name" = "old name", "new name" = "old name"))
```

Other modifications you can make to your factor include:

- **fct_collapse()**, which is like fct_recode(), but works so that for each new level you can provide a vector of old levels, eg
```{r}
data %>% mutate(variable = fct_collapse(variable, "new name" = c("old name", "old name", "old name")))
```

- **fct_lump**, which crudely lumps together all the "small" groups to simplify a plot or table
```{r}
data %>% mutate(variable = fct_lump(variable))
```

### Vectors

- Vectors are essentially variables within a tibble

- There are 3 types of vector:
  
  1. **Atomic** which can be further subcategorised into:
  
      - **logical** (TRUE, FALSE and NA)
      
      - **integer** (numeric without decimal places, eg 243)
      
      - **double** (numeric with decimal places, eg 243.5678) 
      
      - **character** (made up of strings)
      
      - **complex** (rarely used during data analysis)
      
      - **raw** (ditto)
  
  2. **Lists** (aka 'recursive vectors' and can contain other lists, so they're good for representing hierarchies in data)

  3. **`NULL`** which is the absence of a vector (and distinct from `NA` which is the absence of a value in a vector)

**Note about integers vs doubles**: if you just enter `2` in R, it interprets this as `2.000000`, ie a double. To enter `2` as an integer you have to write `2L`. 
```{r}
typeof(2)
typeof(2L)
typeof(2.5L)
```

So what can you do with vectors?

- You can find out what type of vector a vector is with **`typeof()`**

- You can find out how many items there are in a vector with **`length()`**

- You can specify how to read a vector when you import with `readr` by using `col_types` specifications, for example:

```{r}
# col_types is an argument of read_csv(), so it goes inside the read_csv() call.
# Each column_name below is a placeholder for a real column in the file.
data <- read_csv("data.csv",
              col_types = cols(column_name = col_logical(),
                               column_name = col_integer(),
                               column_name = col_double(),
                               column_name = col_character(),
                               column_name = col_date(format = ""),
                               column_name = col_time(format = ""),
                               column_name = col_datetime(format = ""),
                               column_name = col_number(), # Isn't fussy about numbers containing commas, $ etc
                               column_name = col_skip(), # Skips importing this column
                               column_name = col_guess() # Guesses how to parse based on input
                               ))
```

- You can convert one type of vector to another with **`as.logical()`**, **`as.integer()`**, **`as.double()`** or **`as.character()`**

- You can perform basic maths operations on vectors, ie don't need to iterate for every item in the vector

- You can create and name a vector with `c()`

**Lists**

- Can contain a mix of atomic vector types and other lists

- It's useful to use `str()` because this will give you the structure of your list, rather than the contents

- To subset items within a list use double brackets `[[]]` rather than single `[]`.

**Augmented vectors**

- Factors, dates, date-times and tibbles are augmented vectors, because they have additional attributes (on top of the usual vector attributes of names, dimensions and class)

  - Factors have a 'levels' attribute
  
  - Dates and date-times have additional 'class' attributes
  
  - Tibbles have additional 'class' attributes (`tbl_df`, `tbl` and `data.frame`) as well as column `names` and `row.names`
