Note: for some reason tidyverse has to be installed again on Rcloud every time, so here’s the code for easy access:
# Install tidyverse only when it is not already available in the current
# library, so re-running these notes does not trigger a full reinstall.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.0'
## (as 'lib' is unspecified)
Pressing Ctrl+L clears the console, which is useful for getting rid of all the messy output that the installation produces.
We can now use tidyverse!
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
using
# Base plot built from the mpg data only; no layers have been added yet.
ggplot(mpg)
we get an empty plot, because we haven’t yet told R what to draw with the data.
After creating the base plot with ggplot(data = mpg), we add a layer to it with +, using geom_point() and aes() to tell R what to put on the plot.
geom_point(): creates a scatterplot, which is best suited to showing the relationship between two continuous variables on the x and y axes
aes(): decides which variable goes where on the scatterplot (e.g. which one is on the x axis and which on the y axis)
For this example we’ll be using the displ and hwy variables.
# Scatterplot of the mpg data: displ on the x axis, hwy on the y axis.
picture <- ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy))  # one point per row of the data
print(picture)
Use size = cyl to make the size of the points vary depending on the variable:
# Same scatterplot, with point size mapped to cyl (number of cylinders).
picture <- ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, size = cyl))
print(picture)
Use color = factor(cyl) to make the colour of the points change depending on the variable:
# Scatterplot with point colour mapped to cyl. Wrapping cyl in factor()
# gives one distinct colour per level instead of a continuous gradient.
picture <- ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, color = factor(cyl)))
print(picture)
Use color = "the colour I want" (lower-case color, placed outside of aes()) to give every point the same fixed colour:
# Scatterplot in which every point is blue: color is set outside aes(),
# so it is a fixed value rather than a mapping to a variable.
picture <- ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy), color = "blue")
print(picture)
# Scatterplot coloured by cylinders, with a smoothed trend line and marginal
# rugs. Each layer spells out its own aesthetic mapping; `+` adds a layer.
picture <- ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, color = factor(cyl))) +
  # smoothed trend line drawn over the points
  geom_smooth(aes(x = displ, y = hwy)) +
  # rugs along the axes show the marginal distributions, coloured by cylinders
  geom_rug(aes(x = displ, y = hwy, color = factor(cyl)))
print(picture)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
We want to avoid redundancy in the code to keep it as concise as possible.
Using the example above, we can turn the x and y mappings into global mappings that every layer inherits by moving them into ggplot():
# x and y are mapped once inside ggplot(), so every layer inherits them;
# the layers only declare the aesthetics that are specific to them.
picture <- ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = factor(cyl))) +
  geom_smooth() +
  geom_rug(aes(color = factor(cyl)))
plot(picture)  # for ggplot objects, plot() and print() draw the same thing
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
The output looks the same!
Boxplots are good for when you have one continuous and one discrete variable
library(tidyverse)
# Read the forensic data set from the CSV file (path is relative to the
# current working directory).
forensic <- read.csv(file = "data_forensic.csv")
We use geom_boxplot to turn data into boxplots
# Boxplots of est for each band, drawn in one panel per value of
# handwriting_expert.
fbox <- ggplot(forensic, aes(x = band, y = est)) +
  # fill each box according to its band
  geom_boxplot(aes(fill = band)) +
  # one panel per group (handwriting expert / novice)
  facet_wrap(vars(handwriting_expert)) +
  # x axis: no title and no tick labels (the fill key already shows the bands)
  scale_x_discrete(name = NULL, labels = NULL) +
  # y axis: give it a clearer title
  scale_y_continuous(name = "Estimate") +
  # semi-transparent fills make the median line easier to see; key has no title
  scale_fill_viridis_d(alpha = 0.5, name = NULL) +
  # title and subtitle for the whole plot
  labs(
    title = "Handwriting feature probability for experts/ novices",
    subtitle = "Source: Martire et al."
  ) +
  # overall look of the plot
  theme_minimal()
print(fbox)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_boxplot()`).