Research Methods and Data Analysis
Week 1: Introduction
- Introduction to the module and to Quarto
Notes
- I have downloaded the penguin workbook and customized it
- I have also used the ‘visual’ tool to insert an image I believe works well with this module
Week 2: Meet the penguins
The `penguins` data from the palmerpenguins package contains size measurements for 344 penguins from three species observed on three islands in the Palmer Archipelago, Antarctica.
The plot below shows the relationship between flipper and bill lengths of these penguins.
Week 3: Data Wrangling
Penguins Data
library(tidyverse)
library(palmerpenguins)
data("penguins")
# Preview the data set, keeping only its first five columns.
penguins %>%
  select(1:5)
# A tibble: 344 × 5
species island bill_length_mm bill_depth_mm flipper_length_mm
<fct> <fct> <dbl> <dbl> <int>
1 Adelie Torgersen 39.1 18.7 181
2 Adelie Torgersen 39.5 17.4 186
3 Adelie Torgersen 40.3 18 195
4 Adelie Torgersen NA NA NA
5 Adelie Torgersen 36.7 19.3 193
6 Adelie Torgersen 39.3 20.6 190
7 Adelie Torgersen 38.9 17.8 181
8 Adelie Torgersen 39.2 19.6 195
9 Adelie Torgersen 34.1 18.1 193
10 Adelie Torgersen 42 20.2 190
# ℹ 334 more rows
Histogram
library(tidyverse)
library(palmerpenguins)
data("penguins")
# Histogram of bill length, with outline and fill coloured by species.
penguins %>%
  group_by(species) %>%
  ggplot(aes(x = bill_length_mm, color = species, fill = species)) +
  geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_bin()`).
Box Plots
library(tidyverse)
library(palmerpenguins)
data("penguins")
# Box plot of bill length for each species; axis text is enlarged for
# readability.
penguins %>%
  group_by(species) %>%
  ggplot(aes(
    x = species,
    y = bill_length_mm,
    color = species,
    fill = species
  )) +
  geom_boxplot(alpha = 0.5) +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_boxplot()`).
Checking Categorical Data
Species of Penguins
library(tidyverse)
library(palmerpenguins)
# Bar chart counting the observations recorded for each species.
penguins %>%
  ggplot(aes(
    x = species,
    color = species,
    fill = species
  )) +
  geom_bar(alpha = 0.5) +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )
Observations Per Year
library(tidyverse)
library(palmerpenguins)
# Bar chart counting observations per year, coloured by species.
penguins %>%
  ggplot(aes(
    x = year,
    color = species,
    fill = species
  )) +
  geom_bar() +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )
Observations Per Island
library(tidyverse)
library(palmerpenguins)
# Bar chart counting observations per island, coloured by species.
penguins %>%
  ggplot(aes(
    x = island,
    color = species,
    fill = species
  )) +
  geom_bar() +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )
Visualising Correlations
# Scatter plot of bill depth against bill length for all penguins.
penguins %>%
  ggplot(aes(
    x = bill_length_mm,
    y = bill_depth_mm
  )) +
  geom_point() +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).
Visualising Correlations per Species
# Same bill depth vs. bill length scatter plot, with points coloured by
# species.
penguins %>%
  ggplot(aes(
    x = bill_length_mm,
    y = bill_depth_mm,
    color = species,
    fill = species
  )) +
  geom_point() +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).
Body Mass per Sex
# Box plots of body mass by sex, split by species. Rows containing any NA
# are dropped first.
penguins %>%
  na.omit() %>%
  ggplot(aes(
    x = sex,
    y = body_mass_g,
    color = species,
    fill = species
  )) +
  geom_boxplot(alpha = 0.7) +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )
Body Mass per Sex (inverting groups)
# Box plots of body mass by species, split by sex (the grouping is inverted
# compared with the previous plot). Rows containing any NA are dropped first.
penguins %>%
  na.omit() %>%
  ggplot(aes(
    x = species,
    y = body_mass_g,
    color = sex,
    fill = sex
  )) +
  geom_boxplot(alpha = 0.7) +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )
Check Distributions
# Density curves of every numeric trait (bill_length_mm through body_mass_g),
# one facet per trait, coloured by species.
penguins %>%
  na.omit() %>%
  pivot_longer(bill_length_mm:body_mass_g, names_to = "trait") %>%
  ggplot(aes(
    x = value,
    group = species,
    fill = species,
    color = species
  )) +
  geom_density(alpha = 0.7) +
  facet_grid(~trait, scales = "free_x") +
  # Apply the complete theme first: adding theme_minimal() after theme()
  # would replace the axis text sizes set below.
  theme_minimal() +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )
Checking via Histogram
# Draw 100 values from a standard normal distribution (seeded so the result
# is reproducible) and plot their histogram. Dashed lines mark the sample
# mean and one standard deviation either side of it.
set.seed(999)
normal <- rnorm(100)
# tibble() replaces the deprecated as.tibble(); the column is named `value`
# so the aesthetic mapping below is unchanged.
tibble(value = normal) %>%
  ggplot(aes(value)) +
  geom_histogram(color = "#DD4A48", fill = "#DD4A48") +
  geom_vline(
    xintercept = c(
      mean(normal),
      mean(normal) + sd(normal),
      mean(normal) - sd(normal)
    ),
    linetype = "dashed"
  )
Warning: `as.tibble()` was deprecated in tibble 2.0.0.
ℹ Please use `as_tibble()` instead.
ℹ The signature and semantics have changed, see `?as_tibble`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
penguins %>% # take the object penguins
  select(1:3) # then, select the columns 1 to 3
# A tibble: 344 × 3
species island bill_length_mm
<fct> <fct> <dbl>
1 Adelie Torgersen 39.1
2 Adelie Torgersen 39.5
3 Adelie Torgersen 40.3
4 Adelie Torgersen NA
5 Adelie Torgersen 36.7
6 Adelie Torgersen 39.3
7 Adelie Torgersen 38.9
8 Adelie Torgersen 39.2
9 Adelie Torgersen 34.1
10 Adelie Torgersen 42
# ℹ 334 more rows
# Install vtable only when it is missing, so re-rendering the document does
# not download the package again.
if (!requireNamespace("vtable", quietly = TRUE)) {
  install.packages("vtable", repos = "https://cran.rstudio.com/")
}
The downloaded binary packages are in
/var/folders/y0/1zx51nnj0034fnt0zrf7sbtr0000gn/T//RtmplvVCBs/downloaded_packages
# Install gt only when it is missing, so re-rendering the document does not
# download the package again.
if (!requireNamespace("gt", quietly = TRUE)) {
  install.packages("gt", repos = "https://cran.rstudio.com/")
}
The downloaded binary packages are in
/var/folders/y0/1zx51nnj0034fnt0zrf7sbtr0000gn/T//RtmplvVCBs/downloaded_packages
library(vtable)
Loading required package: kableExtra
Attaching package: 'kableExtra'
The following object is masked from 'package:dplyr':
group_rows
library(gt)
# Variable overview table (class, values, missing count and summary
# statistics) for every column of penguins.
penguins %>%
  vtable(lush = TRUE)
Name | Class | Values | Missing | Summary |
---|---|---|---|---|
species | factor | 'Adelie' 'Chinstrap' 'Gentoo' | 0 | nuniq: 3 |
island | factor | 'Biscoe' 'Dream' 'Torgersen' | 0 | nuniq: 3 |
bill_length_mm | numeric | Num: 32.1 to 59.6 | 2 | mean: 43.922, sd: 5.46, nuniq: 164 |
bill_depth_mm | numeric | Num: 13.1 to 21.5 | 2 | mean: 17.151, sd: 1.975, nuniq: 80 |
flipper_length_mm | integer | Num: 172 to 231 | 2 | mean: 200.915, sd: 14.062, nuniq: 55 |
body_mass_g | integer | Num: 2700 to 6300 | 2 | mean: 4201.754, sd: 801.955, nuniq: 94 |
sex | factor | 'female' 'male' | 11 | nuniq: 2 |
year | integer | Num: 2007 to 2009 | 0 | mean: 2008.029, sd: 0.818, nuniq: 3 |
library(vtable)
library(gt)
# Mean, standard deviation and count of bill length per species, computed on
# rows without any missing values.
penguins %>%
  group_by(species) %>%
  na.omit() %>%
  summarise(
    mean = mean(bill_length_mm),
    sd = sd(bill_length_mm),
    n = n()
  )
# A tibble: 3 × 4
species mean sd n
<fct> <dbl> <dbl> <int>
1 Adelie 38.8 2.66 146
2 Chinstrap 48.8 3.34 68
3 Gentoo 47.6 3.11 119
library(vtable)
library(gt)
# Mean, standard deviation and count of bill length per island, computed on
# rows without any missing values.
penguins %>%
  group_by(island) %>%
  na.omit() %>%
  summarise(
    mean = mean(bill_length_mm),
    sd = sd(bill_length_mm),
    n = n()
  )
# A tibble: 3 × 4
island mean sd n
<fct> <dbl> <dbl> <int>
1 Biscoe 45.2 4.83 163
2 Dream 44.2 5.95 123
3 Torgersen 39.0 3.03 47
# Male Gentoo penguins with a bill longer than 50 mm: keep the two bill
# measurements and sort by bill depth (ascending).
penguins %>%
  filter(
    species == "Gentoo",
    bill_length_mm > 50,
    sex == "male"
  ) %>%
  select(
    bill_length_mm,
    bill_depth_mm
  ) %>%
  arrange(bill_depth_mm)
# A tibble: 21 × 2
bill_length_mm bill_depth_mm
<dbl> <dbl>
1 51.3 14.2
2 50.2 14.3
3 50.1 15
4 50.7 15
5 50.4 15.3
6 52.5 15.6
7 54.3 15.7
8 50.8 15.7
9 50.4 15.7
10 53.4 15.8
# ℹ 11 more rows
# Male Adelie penguins with a bill longer than 20 mm: keep bill length and
# flipper length and sort by flipper length (ascending).
penguins %>%
  filter(
    species == "Adelie",
    bill_length_mm > 20,
    sex == "male"
  ) %>%
  select(
    bill_length_mm,
    flipper_length_mm
  ) %>%
  arrange(flipper_length_mm)
# A tibble: 73 × 2
bill_length_mm flipper_length_mm
<dbl> <int>
1 37.2 178
2 37.7 180
3 38.8 180
4 40.5 180
5 39.1 181
6 41.1 182
7 40.6 183
8 40.9 184
9 39.8 184
10 37.2 184
# ℹ 63 more rows
# Reshape the two bill measurements into long format: one row per
# (observation, bill feature) pair, keeping year alongside.
penguins %>%
  select(
    bill_length_mm,
    bill_depth_mm,
    year
  ) %>%
  pivot_longer(
    # Spell out `cols` in full rather than relying on partial matching of
    # `col`.
    cols = c(bill_length_mm:bill_depth_mm),
    names_to = "bill_feature",
    values_to = "value"
  )
# A tibble: 688 × 3
year bill_feature value
<int> <chr> <dbl>
1 2007 bill_length_mm 39.1
2 2007 bill_depth_mm 18.7
3 2007 bill_length_mm 39.5
4 2007 bill_depth_mm 17.4
5 2007 bill_length_mm 40.3
6 2007 bill_depth_mm 18
7 2007 bill_length_mm NA
8 2007 bill_depth_mm NA
9 2007 bill_length_mm 36.7
10 2007 bill_depth_mm 19.3
# ℹ 678 more rows
# Spread body mass into one column per island.
penguins %>%
  mutate(row = row_number()) %>% # add a row number so each row is identified as a separate case
  select(row, species, island, body_mass_g) %>%
  pivot_wider(names_from = island, values_from = body_mass_g)
# A tibble: 344 × 5
row species Torgersen Biscoe Dream
<int> <fct> <int> <int> <int>
1 1 Adelie 3750 NA NA
2 2 Adelie 3800 NA NA
3 3 Adelie 3250 NA NA
4 4 Adelie NA NA NA
5 5 Adelie 3450 NA NA
6 6 Adelie 3650 NA NA
7 7 Adelie 3625 NA NA
8 8 Adelie 4675 NA NA
9 9 Adelie 3475 NA NA
10 10 Adelie 4250 NA NA
# ℹ 334 more rows
Week 4: Data Exploration & Scientific Hypotheses
Pre-Session Work
# Load the built-in mtcars data set and show its first six rows.
data("mtcars")
head(mtcars, n = 6L)
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
Scatter Plot
library(ggplot2)
# Scatter plot of fuel economy (mpg) against weight (wt).
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point()
Line Graph
library(ggplot2)
# Line graph of mpg against wt.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_line()
library(ggplot2)
# Same line graph with the individual observations drawn on top as points.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_line() +
  geom_point()
Bar Graph
library(ggplot2)
# Bar graph of mpg with one bar per distinct disp value (disp is treated as
# a factor so each value gets its own category).
ggplot(data = mtcars, mapping = aes(x = factor(disp), y = mpg)) +
  geom_col()
Histogram
library(ggplot2)
# Histogram of mpg using the default binning.
ggplot(data = mtcars, mapping = aes(x = mpg)) +
  geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of mpg with an explicit bin width of 4.
ggplot(data = mtcars, mapping = aes(x = mpg)) +
  geom_histogram(binwidth = 4)
Box Plot
library(ggplot2)
# Box plot of disp for each value of cyl.
ggplot(data = mtcars, mapping = aes(x = interaction(cyl), y = disp)) +
  geom_boxplot()