Research Methods and Data Analysis

Week 1: Introduction

  • Introduction to the module and to Quarto

Notes

  • I have downloaded the penguin workbook and customized it
  • I have also used the ‘visual’ tool to insert an image I believe works well with this module

Week 2: Meet the penguins

Illustration of three species of Palmer Archipelago penguins: Chinstrap, Gentoo, and Adelie. Artwork by @allison_horst.

The penguins data from the palmerpenguins package contains size measurements for 344 penguins from three species observed on three islands in the Palmer Archipelago, Antarctica.

The plot below shows the relationship between flipper and bill lengths of these penguins.

Week 3: Data Wrangling

Penguins Data

library(tidyverse)
library(palmerpenguins)

# Load the penguins dataset into the session, then preview its first five
# columns.
data("penguins")
select(penguins, 1:5)
# A tibble: 344 × 5
   species island    bill_length_mm bill_depth_mm flipper_length_mm
   <fct>   <fct>              <dbl>         <dbl>             <int>
 1 Adelie  Torgersen           39.1          18.7               181
 2 Adelie  Torgersen           39.5          17.4               186
 3 Adelie  Torgersen           40.3          18                 195
 4 Adelie  Torgersen           NA            NA                  NA
 5 Adelie  Torgersen           36.7          19.3               193
 6 Adelie  Torgersen           39.3          20.6               190
 7 Adelie  Torgersen           38.9          17.8               181
 8 Adelie  Torgersen           39.2          19.6               195
 9 Adelie  Torgersen           34.1          18.1               193
10 Adelie  Torgersen           42            20.2               190
# ℹ 334 more rows

Histogram

library(tidyverse)
library(palmerpenguins)

# Histogram of bill length, with bars coloured and filled by species.
# No group_by() is needed before ggplot(): ggplot2 splits the bars through the
# colour/fill aesthetics and does not use dplyr groups.
data("penguins")
penguins %>%
  ggplot(aes(x = bill_length_mm, color = species, fill = species)) +
  geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_bin()`).

Box Plots

library(tidyverse)
library(palmerpenguins)

# Box plot of bill length for each species.
# No group_by() is needed before ggplot(): the boxes are split by the x,
# colour and fill aesthetics, not by dplyr groups.
data("penguins")
penguins %>%
  ggplot(aes(x = species,
             y = bill_length_mm,
             color = species,
             fill = species)) +
  geom_boxplot(alpha = 0.5) +
  # Enlarge axis text and titles for readability.
  theme(axis.text = element_text(size = 16),
        axis.title = element_text(size = 16))
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_boxplot()`).

Checking Categorical Data

Species of Penguins
library(tidyverse)
library(palmerpenguins)

# Bar chart counting the observations recorded for each species.
ggplot(penguins, aes(x = species, color = species, fill = species)) +
  geom_bar(alpha = 0.5) +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )

Observations Per Year
library(tidyverse)
library(palmerpenguins)

# Bar chart of observations per year, with bars split by species.
ggplot(penguins, aes(x = year, color = species, fill = species)) +
  geom_bar() +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )

Observations Per Island
library(tidyverse)
library(palmerpenguins)

# Bar chart of observations per island, with bars split by species.
ggplot(penguins, aes(x = island, color = species, fill = species)) +
  geom_bar() +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )

Visualising Correlations

# Scatter plot of bill depth against bill length for all penguins.
ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point() +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

Visualising Correlations per Species
# Same bill depth vs. bill length scatter, now coloured by species.
ggplot(
  penguins,
  aes(x = bill_length_mm, y = bill_depth_mm, color = species, fill = species)
) +
  geom_point() +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

Body Mass per Sex
# Box plots of body mass by sex, one box per species within each sex.
# Rows containing any missing value are dropped before plotting.
ggplot(
  na.omit(penguins),
  aes(x = sex, y = body_mass_g, color = species, fill = species)
) +
  geom_boxplot(alpha = 0.7) +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )

Body Mass per Sex (inverting groups)
# Box plots of body mass by species, one box per sex within each species.
# Rows containing any missing value are dropped before plotting.
ggplot(
  na.omit(penguins),
  aes(x = species, y = body_mass_g, color = sex, fill = sex)
) +
  geom_boxplot(alpha = 0.7) +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16)
  )

Check Distributions

# Density curves of every measured trait (bill length through body mass),
# one facet per trait, with one curve per species.
penguins %>%
  na.omit() %>%
  # Stack the measurement columns so each trait can be drawn in its own facet.
  pivot_longer(bill_length_mm:body_mass_g, names_to = "trait") %>%
  ggplot(aes(x = value,
             group = species,
             fill = species,
             color = species)) +
  geom_density(alpha = 0.7) +
  facet_grid(~trait, scales = "free_x") +
  # A complete theme such as theme_minimal() replaces every theme setting made
  # before it, so it must come first for the text-size tweaks below to apply.
  theme_minimal() +
  theme(axis.text = element_text(size = 16),
        axis.title = element_text(size = 16))

Checking via Histogram
# Simulate 100 draws from a standard normal distribution and plot their
# histogram, with dashed lines at the sample mean and at one standard
# deviation on either side of it.
set.seed(999)
normal <- rnorm(100)
# Build the one-column tibble explicitly; as.tibble() is deprecated.
tibble(value = normal) %>%
  ggplot(aes(value)) +
  geom_histogram(color = "#DD4A48", fill = "#DD4A48") +
  geom_vline(xintercept = c(mean(normal),
                            mean(normal) + sd(normal),
                            mean(normal) - sd(normal)),
             linetype = "dashed")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Take the penguins data and keep only its first three columns.
select(penguins, 1:3)
# A tibble: 344 × 3
   species island    bill_length_mm
   <fct>   <fct>              <dbl>
 1 Adelie  Torgersen           39.1
 2 Adelie  Torgersen           39.5
 3 Adelie  Torgersen           40.3
 4 Adelie  Torgersen           NA  
 5 Adelie  Torgersen           36.7
 6 Adelie  Torgersen           39.3
 7 Adelie  Torgersen           38.9
 8 Adelie  Torgersen           39.2
 9 Adelie  Torgersen           34.1
10 Adelie  Torgersen           42  
# ℹ 334 more rows
# Install vtable only when it is missing, so the document does not download
# the package again on every render.
if (!requireNamespace("vtable", quietly = TRUE)) {
  install.packages("vtable", repos = "https://cran.rstudio.com/")
}

The downloaded binary packages are in
    /var/folders/y0/1zx51nnj0034fnt0zrf7sbtr0000gn/T//RtmplvVCBs/downloaded_packages
# Install gt only when it is missing, so the document does not download the
# package again on every render.
if (!requireNamespace("gt", quietly = TRUE)) {
  install.packages("gt", repos = "https://cran.rstudio.com/")
}

The downloaded binary packages are in
    /var/folders/y0/1zx51nnj0034fnt0zrf7sbtr0000gn/T//RtmplvVCBs/downloaded_packages
library(vtable)
Loading required package: kableExtra

Attaching package: 'kableExtra'
The following object is masked from 'package:dplyr':

    group_rows
library(gt)

# Variable overview table (class, values, missing count, summary statistics)
# for every column of penguins.
penguins %>% vtable(lush = TRUE)
.
Name Class Values Missing Summary
species factor 'Adelie' 'Chinstrap' 'Gentoo' 0 nuniq: 3
island factor 'Biscoe' 'Dream' 'Torgersen' 0 nuniq: 3
bill_length_mm numeric Num: 32.1 to 59.6 2 mean: 43.922, sd: 5.46, nuniq: 164
bill_depth_mm numeric Num: 13.1 to 21.5 2 mean: 17.151, sd: 1.975, nuniq: 80
flipper_length_mm integer Num: 172 to 231 2 mean: 200.915, sd: 14.062, nuniq: 55
body_mass_g integer Num: 2700 to 6300 2 mean: 4201.754, sd: 801.955, nuniq: 94
sex factor 'female' 'male' 11 nuniq: 2
year integer Num: 2007 to 2009 0 mean: 2008.029, sd: 0.818, nuniq: 3
library(vtable)
library(gt)

# Mean, standard deviation and count of bill length for each species,
# using only rows with no missing value in any column.
penguins %>%
  group_by(species) %>%
  na.omit() %>%
  summarise(
    mean = mean(bill_length_mm),
    sd = sd(bill_length_mm),
    n = n()
  )
# A tibble: 3 × 4
  species    mean    sd     n
  <fct>     <dbl> <dbl> <int>
1 Adelie     38.8  2.66   146
2 Chinstrap  48.8  3.34    68
3 Gentoo     47.6  3.11   119
library(vtable)
library(gt)

# Mean, standard deviation and count of bill length for each island,
# using only rows with no missing value in any column.
penguins %>%
  group_by(island) %>%
  na.omit() %>%
  summarise(
    mean = mean(bill_length_mm),
    sd = sd(bill_length_mm),
    n = n()
  )
# A tibble: 3 × 4
  island     mean    sd     n
  <fct>     <dbl> <dbl> <int>
1 Biscoe     45.2  4.83   163
2 Dream      44.2  5.95   123
3 Torgersen  39.0  3.03    47
# Bill measurements of male Gentoo penguins whose bill is longer than 50 mm,
# ordered from the shallowest to the deepest bill.
penguins %>%
  filter(species == "Gentoo" & bill_length_mm > 50 & sex == "male") %>%
  select(bill_length_mm, bill_depth_mm) %>%
  arrange(bill_depth_mm)
# A tibble: 21 × 2
   bill_length_mm bill_depth_mm
            <dbl>         <dbl>
 1           51.3          14.2
 2           50.2          14.3
 3           50.1          15  
 4           50.7          15  
 5           50.4          15.3
 6           52.5          15.6
 7           54.3          15.7
 8           50.8          15.7
 9           50.4          15.7
10           53.4          15.8
# ℹ 11 more rows
# Bill and flipper length of male Adelie penguins whose bill is longer than
# 20 mm, ordered from the shortest to the longest flipper.
penguins %>%
  filter(species == "Adelie" & bill_length_mm > 20 & sex == "male") %>%
  select(bill_length_mm, flipper_length_mm) %>%
  arrange(flipper_length_mm)
# A tibble: 73 × 2
   bill_length_mm flipper_length_mm
            <dbl>             <int>
 1           37.2               178
 2           37.7               180
 3           38.8               180
 4           40.5               180
 5           39.1               181
 6           41.1               182
 7           40.6               183
 8           40.9               184
 9           39.8               184
10           37.2               184
# ℹ 63 more rows
# Reshape the two bill measurements into long format: one row per
# (observation, bill feature) pair, keeping the year alongside.
penguins %>%
  select(bill_length_mm,
         bill_depth_mm,
         year) %>%
  # Spell out `cols` in full instead of relying on partial argument matching.
  pivot_longer(cols = bill_length_mm:bill_depth_mm,
               names_to = "bill_feature", values_to = "value")
# A tibble: 688 × 3
    year bill_feature   value
   <int> <chr>          <dbl>
 1  2007 bill_length_mm  39.1
 2  2007 bill_depth_mm   18.7
 3  2007 bill_length_mm  39.5
 4  2007 bill_depth_mm   17.4
 5  2007 bill_length_mm  40.3
 6  2007 bill_depth_mm   18  
 7  2007 bill_length_mm  NA  
 8  2007 bill_depth_mm   NA  
 9  2007 bill_length_mm  36.7
10  2007 bill_depth_mm   19.3
# ℹ 678 more rows
# Spread body mass into one column per island. A row number is added first
# so that each row is identified as a separate case.
penguins %>%
  mutate(row = row_number()) %>%
  select(row, species, island, body_mass_g) %>%
  pivot_wider(
    names_from = island,
    values_from = body_mass_g
  )
# A tibble: 344 × 5
     row species Torgersen Biscoe Dream
   <int> <fct>       <int>  <int> <int>
 1     1 Adelie       3750     NA    NA
 2     2 Adelie       3800     NA    NA
 3     3 Adelie       3250     NA    NA
 4     4 Adelie         NA     NA    NA
 5     5 Adelie       3450     NA    NA
 6     6 Adelie       3650     NA    NA
 7     7 Adelie       3625     NA    NA
 8     8 Adelie       4675     NA    NA
 9     9 Adelie       3475     NA    NA
10    10 Adelie       4250     NA    NA
# ℹ 334 more rows

Week 4: Data Exploration & Scientific Hypotheses

Pre-Session Work

# Load the built-in mtcars data and show its first six rows.
data("mtcars")
head(mtcars, n = 6L)
                   mpg cyl disp  hp drat    wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
Scatter Plot
library(ggplot2)

# Scatter plot of fuel economy (mpg) against car weight (wt).
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) + geom_point()

Line Graph
library(ggplot2)

# Line graph of mpg against wt.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) + geom_line()

library(ggplot2)
# Line graph of mpg against wt with the individual observations drawn on top.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_line() + geom_point()

Bar Graph
library(ggplot2)


# Bar graph of mpg with one bar per distinct displacement value; disp is
# converted to a factor so that it is treated as discrete on the x axis.
ggplot(data = mtcars, mapping = aes(x = factor(disp), y = mpg)) + geom_col()

Histogram
library(ggplot2)
# Histogram of mpg using ggplot2's default binning.
ggplot(data = mtcars, mapping = aes(x = mpg)) + geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of mpg with an explicit bin width of 4.
ggplot(data = mtcars, mapping = aes(x = mpg)) + geom_histogram(binwidth = 4)

Box Plot
library(ggplot2)
# Box plot of displacement for each cylinder count; interaction() turns the
# numeric cyl column into a factor for the x axis.
ggplot(data = mtcars, mapping = aes(x = interaction(cyl), y = disp)) +
  geom_boxplot()

Week 4 Session