Katie’s Workbook

Author

Katie Prange

Published

November 5, 2024

library(tidyverse)
library(readr)
library(knitr)
library(palmerpenguins)
library(modeldata)
library(gplots)
library(graphics)
library(vcd)
library(corrplot)
library(ggplot2)
library(dplyr)
library(tidyr)
library(gmodels)

Chapter 1 - Introduction

Learning outcomes

Install R and RStudio
Upload data files
Create a workbook using Quarto

🐧 Exploring palmer penguins dataset 🐧

The penguins data from the palmerpenguins package contains size measurements for 344 penguins from three species observed on three islands in the Palmer Archipelago, Antarctica.

Tip

Include cute photos of penguins to make the reader happy :)

The plot below shows the relationship between flipper and bill lengths of these penguins.

Chapter 2

2.1 Data Wrangling

Learning outcomes

Understand the importance of data wrangling
Learn basic skills

Post-session exercises from 6.6.1

Aim: follow the instructions and provide a comment for each line of code

Problem A

midwest %>%                                         # Utilises the midwest dataset
  group_by(state) %>%                               # Groups data by state variable
  summarize(poptotalmean = mean(poptotal),          # Summarises the grouped data and calculates average of poptotal 
            poptotalmed = median(poptotal),         # Median of poptotal
            popmax = max(poptotal),                 # Maximum value of poptotal
            popmin = min(poptotal),                 # Minimum value of poptotal
            popdistinct = n_distinct(poptotal),     # Counts the number of distinct values of poptotal
            popfirst = first(poptotal),             # Retrieves the first value of poptotal
            popany = any(poptotal < 5000),          # Checks if there are any values of poptotal less than 5000
            popany2 = any(poptotal > 2000000)) %>%  # Checks if there are any values of poptotal greater than 2000000
  ungroup()                                         # Final ungrouping of data

# A tibble: 5 × 9
  state poptotalmean poptotalmed  popmax popmin popdistinct popfirst popany
  <chr>        <dbl>       <dbl>   <int>  <int>       <int>    <int> <lgl> 
1 IL         112065.      24486. 5105067   4373         101    66090 TRUE  
2 IN          60263.      30362.  797159   5315          92    31095 FALSE 
3 MI         111992.      37308  2111687   1701          83    10145 TRUE  
4 OH         123263.      54930. 1412140  11098          88    25371 FALSE 
5 WI          67941.      33528   959275   3890          72    15682 TRUE  
# ℹ 1 more variable: popany2 <lgl>

Problem B

## Problem B
midwest %>%                                     # Utilises the midwest dataset
  group_by(state) %>%                           # Groups data by state variable
  summarize(num5k = sum(poptotal < 5000),       # Summarises the grouped data and calculates the number of instances where poptotal in less than 5000
            num2mil = sum(poptotal > 2000000),  # Calculates the number of instances where poptotal in greater than 2000000
            numrows = n()) %>%                  # Counts the number of rows in each state group
  ungroup()                                     # Final ungrouping of data

# A tibble: 5 × 4
  state num5k num2mil numrows
  <chr> <int>   <int>   <int>
1 IL        1       1     102
2 IN        0       0      92
3 MI        1       1      83
4 OH        0       0      88
5 WI        2       0      72

Problem C

# part I
midwest %>%                              # Utilises the midwest dataset
  group_by(county) %>%                   # Groups data by county variable
  summarize(x = n_distinct(state)) %>%   # Summarises the grouped data by calculating the number of distinct states associated with each county
  arrange(desc(x)) %>%                   # Visualise data in descending order based on values in column x
  ungroup()                              # Final ungrouping of data

# A tibble: 320 × 2
   county         x
   <chr>      <int>
 1 CRAWFORD       5
 2 JACKSON        5
 3 MONROE         5
 4 ADAMS          4
 5 BROWN          4
 6 CLARK          4
 7 CLINTON        4
 8 JEFFERSON      4
 9 LAKE           4
10 WASHINGTON     4
# ℹ 310 more rows

# part II
# Question: How does n() differ from n_distinct()? 
# Answer: n() counts the total number of rows in the current group whereas n_distinct() counts the number of unique or distinct values in a specified column

midwest %>%                              # Utilises the midwest dataset
  group_by(county) %>%                   # Groups data by county variable
  summarize(x = n()) %>%                 # Summarises the grouped data by counting the total number of rows in each county
  ungroup()                              # Final ungrouping of data

# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         4
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         2
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       2
# ℹ 310 more rows

# part III
midwest %>%                              # Utilises the midwest dataset
  group_by(county) %>%                   # Groups data by county variable
  summarize(x = n_distinct(county)) %>%  # Summarises the grouped data by calculating the number of distinct counties within each group
  ungroup()                              # Final ungrouping of data

# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         1
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         1
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       1
# ℹ 310 more rows

Problem D

diamonds %>%                        # Utilises the diamonds dataset
  group_by(clarity) %>%             # Groups data by clarity variable
  summarize(a = n_distinct(color),  # Summarises the grouped data and calculates the number of distinct colors for diamonds within each clarity group
            b = n_distinct(price),  # Calculates the number of distinct price values for diamonds within each clarity group
            c = n()) %>%            # Counts the total number of rows for each clarity group
  ungroup()                         # Final ungrouping of data

# A tibble: 8 × 4
  clarity     a     b     c
  <ord>   <int> <int> <int>
1 I1          7   632   741
2 SI2         7  4904  9194
3 SI1         7  5380 13065
4 VS2         7  5051 12258
5 VS1         7  3926  8171
6 VVS2        7  2409  5066
7 VVS1        7  1623  3655
8 IF          7   902  1790

Problem E

# part I
diamonds %>%                         # Utilises the diamonds dataset
  group_by(color, cut) %>%           # Groups data by color and cut variables
  summarize(m = mean(price),         # Summarises the grouped data and calculates the average price of diamonds for each combination of color and cut
            s = sd(price)) %>%       # Calculates the standard deviation of the price for each combination of color and cut
  ungroup()                          # Final ungrouping of data

`summarise()` has grouped output by 'color'. You can override using the
`.groups` argument.

# A tibble: 35 × 4
   color cut           m     s
   <ord> <ord>     <dbl> <dbl>
 1 D     Fair      4291. 3286.
 2 D     Good      3405. 3175.
 3 D     Very Good 3470. 3524.
 4 D     Premium   3631. 3712.
 5 D     Ideal     2629. 3001.
 6 E     Fair      3682. 2977.
 7 E     Good      3424. 3331.
 8 E     Very Good 3215. 3408.
 9 E     Premium   3539. 3795.
10 E     Ideal     2598. 2956.
# ℹ 25 more rows

# part II
diamonds %>%                         # Utilises the diamonds dataset
  group_by(cut, color) %>%           # Groups data by cut and color variables
  summarize(m = mean(price),         # Summarises the grouped data and calculates the average price of diamonds for each combination of color and cut
            s = sd(price)) %>%       # Calculates the standard deviation of the price for each combination of color and cut
  ungroup()                          # Final ungrouping of data

`summarise()` has grouped output by 'cut'. You can override using the `.groups`
argument.

# A tibble: 35 × 4
   cut   color     m     s
   <ord> <ord> <dbl> <dbl>
 1 Fair  D     4291. 3286.
 2 Fair  E     3682. 2977.
 3 Fair  F     3827. 3223.
 4 Fair  G     4239. 3610.
 5 Fair  H     5136. 3886.
 6 Fair  I     4685. 3730.
 7 Fair  J     4976. 4050.
 8 Good  D     3405. 3175.
 9 Good  E     3424. 3331.
10 Good  F     3496. 3202.
# ℹ 25 more rows

# part III
diamonds %>%                         # Utilises the diamonds dataset
  group_by(cut, color, clarity) %>%  # Groups data by cut, color and clarity variables
  summarize(m = mean(price),         # Summarises the grouped data and calculates the average price of diamonds for each group
            s = sd(price),           # Calculates the standard deviation of the price within each group 
            msale = m * 0.80) %>%    # Creates a new column msale, which represents 80% of the mean price calculated earlier
  ungroup()                          # Final ungrouping of data

`summarise()` has grouped output by 'cut', 'color'. You can override using the
`.groups` argument.

# A tibble: 276 × 6
   cut   color clarity     m     s msale
   <ord> <ord> <ord>   <dbl> <dbl> <dbl>
 1 Fair  D     I1      7383  5899. 5906.
 2 Fair  D     SI2     4355. 3260. 3484.
 3 Fair  D     SI1     4273. 3019. 3419.
 4 Fair  D     VS2     4513. 3383. 3610.
 5 Fair  D     VS1     2921. 2550. 2337.
 6 Fair  D     VVS2    3607  3629. 2886.
 7 Fair  D     VVS1    4473  5457. 3578.
 8 Fair  D     IF      1620.  525. 1296.
 9 Fair  E     I1      2095.  824. 1676.
10 Fair  E     SI2     4172. 3055. 3338.
# ℹ 266 more rows

Problem F

diamonds %>%                             # Utilises the diamonds dataset
  group_by(cut) %>%                      # Groups data by cut variable
  summarize(potato = mean(depth),        # Summarises the grouped data and calculates the depth variable for each group
            pizza = mean(price),         # Calculates the mean of the price variable for each group 
            popcorn = median(y),         # Calculates the median of the variable y for each group 
            pineapple = potato - pizza,  # Calculates the difference between the mean depth (potato) and mean price (pizza)
            papaya = pineapple ^ 2,      # Calculates the square of the value in pineapple
            peach = n()) %>%             # Counts the number of rows in each group
  ungroup()                              # Final ungrouping of data

# A tibble: 5 × 7
  cut       potato pizza popcorn pineapple    papaya peach
  <ord>      <dbl> <dbl>   <dbl>     <dbl>     <dbl> <int>
1 Fair        64.0 4359.    6.1     -4295. 18444586.  1610
2 Good        62.4 3929.    5.99    -3866. 14949811.  4906
3 Very Good   61.8 3982.    5.77    -3920. 15365942. 12082
4 Premium     61.3 4584.    6.06    -4523. 20457466. 13791
5 Ideal       61.7 3458.    5.26    -3396. 11531679. 21551

Problem G

# part I
diamonds %>%                                   # Utilises the diamonds dataset
  group_by(color) %>%                          # Groups data by color variable
  summarize(m = mean(price)) %>%               # Calculates the average price of diamonds within each color group 
  mutate(x1 = str_c("Diamond color ", color),  # Creates new column named x1
         x2 = 5) %>%                           # Creates new column named x2, which contains the constant value 5 for all rows
  ungroup()                                    # Final ungrouping of data

# A tibble: 7 × 4
  color     m x1                 x2
  <ord> <dbl> <chr>           <dbl>
1 D     3170. Diamond color D     5
2 E     3077. Diamond color E     5
3 F     3725. Diamond color F     5
4 G     3999. Diamond color G     5
5 H     4487. Diamond color H     5
6 I     5092. Diamond color I     5
7 J     5324. Diamond color J     5

# part II
diamonds %>%                                   # Utilises the diamonds dataset
  group_by(color) %>%                          # Groups data by color variable
  summarize(m = mean(price)) %>%               # Calculates the average price of diamonds within each color group 
  ungroup() %>%                                # Data no longer grouped by color
  mutate(x1 = str_c("Diamond color ", color),  # Creates new column named x1
         x2 = 5)                               # Creates new column named x2, which contains the constant value 5 for all rows

# A tibble: 7 × 4
  color     m x1                 x2
  <ord> <dbl> <chr>           <dbl>
1 D     3170. Diamond color D     5
2 E     3077. Diamond color E     5
3 F     3725. Diamond color F     5
4 G     3999. Diamond color G     5
5 H     4487. Diamond color H     5
6 I     5092. Diamond color I     5
7 J     5324. Diamond color J     5

Problem H

# part I
diamonds %>%                     # Utilises the diamonds dataset
  group_by(color) %>%            # Groups data by color variable
  mutate(x1 = price * 0.5) %>%   # Creates new column named x1, which is calculated by taking half of the price for each diamond
  ungroup()                      # Final ungrouping of data

# A tibble: 53,940 × 11
   carat cut       color clarity depth table price     x     y     z    x1
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43  163 
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31  163 
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31  164.
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63  167 
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75  168.
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48  168 
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47  168 
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53  168.
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49  168.
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39  169 
# ℹ 53,930 more rows

# part II
diamonds %>%                     # Utilises the diamonds dataset
  group_by(color) %>%            # Groups data by color variable
  mutate(x1 = price * 0.5) %>%   # Creates new column named x1, which is calculated by taking half of the price for each diamond
  ungroup() %>%                  # Data no longer grouped by color
  summarize(m = mean(x1))        # Calculates the average of the x1 column

# A tibble: 1 × 1
      m
  <dbl>
1 1966.

Tip

In the words of Rihanna: shine bright like a diamond 💎

Questions

Why is grouping data necessary?

It allows you to calculate summary statistics for specific data subsets and simplifies data analysis
Why is ungrouping data necessary?

After performing an operation using grouped data, it allows you to restore the original data frame structure
When should you ungroup data?

After you have performed a grouped calculation on the grouped data
If the code does not contain group_by(), do you still need ungroup() at the end? For example, does data() %>% mutate(newVar = 1 + 2) require ungroup()?

No, if the code doesn’t contain a group_by() statement, you don’t need to use ungroup() at the end

Post-session extra practice

Task 1

names(diamonds)  # View all the variable names in diamonds

 [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  
 [8] "x"       "y"       "z"

Task 2

diamonds %>% arrange(price)                   # Arrange diamonds by lowest to highest price

# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
# ℹ 53,930 more rows

diamonds %>% arrange(desc(price))             # Arrange diamonds by highest to lowest price

# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  2.29 Premium   I     VS2      60.8    60 18823  8.5   8.47  5.16
 2  2    Very Good G     SI1      63.5    56 18818  7.9   7.97  5.04
 3  1.51 Ideal     G     IF       61.7    55 18806  7.37  7.41  4.56
 4  2.07 Ideal     G     SI2      62.5    55 18804  8.2   8.13  5.11
 5  2    Very Good H     SI1      62.8    57 18803  7.95  8     5.01
 6  2.29 Premium   I     SI1      61.8    59 18797  8.52  8.45  5.24
 7  2.04 Premium   H     SI1      58.1    60 18795  8.37  8.28  4.84
 8  2    Premium   I     VS1      60.8    59 18795  8.13  8.02  4.91
 9  1.71 Premium   F     VS2      62.3    59 18791  7.57  7.53  4.7 
10  2.15 Ideal     G     SI2      62.6    54 18791  8.29  8.35  5.21
# ℹ 53,930 more rows

diamonds %>% arrange(price,cut)               # Arrange diamonds by lowest price and cut

# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
 2  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
 8  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
 9  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
# ℹ 53,930 more rows

diamonds %>% arrange(desc(price), desc(cut))  # Arrange diamonds by highest price and cut

# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  2.29 Premium   I     VS2      60.8    60 18823  8.5   8.47  5.16
 2  2    Very Good G     SI1      63.5    56 18818  7.9   7.97  5.04
 3  1.51 Ideal     G     IF       61.7    55 18806  7.37  7.41  4.56
 4  2.07 Ideal     G     SI2      62.5    55 18804  8.2   8.13  5.11
 5  2    Very Good H     SI1      62.8    57 18803  7.95  8     5.01
 6  2.29 Premium   I     SI1      61.8    59 18797  8.52  8.45  5.24
 7  2.04 Premium   H     SI1      58.1    60 18795  8.37  8.28  4.84
 8  2    Premium   I     VS1      60.8    59 18795  8.13  8.02  4.91
 9  2.15 Ideal     G     SI2      62.6    54 18791  8.29  8.35  5.21
10  1.71 Premium   F     VS2      62.3    59 18791  7.57  7.53  4.7 
# ℹ 53,930 more rows

Task 3

diamonds %>% arrange(price, desc(clarity))  # Arrange diamonds by ascending price and descending clarity

# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
 2  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
 6  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
 7  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
 8  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
 9  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
# ℹ 53,930 more rows

Task 4

diamonds %>%                       # Utilises the diamonds dataset
  mutate(salePrice = price - 250)  # New variable named salePrice which represents a discount of $250 off of the original cost of each diamond

# A tibble: 53,940 × 11
   carat cut       color clarity depth table price     x     y     z salePrice
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>     <dbl>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43        76
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31        76
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31        77
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63        84
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75        85
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48        86
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47        86
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53        87
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49        87
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39        88
# ℹ 53,930 more rows

Task 5

diamonds %>% select(-x,-y,-z)  # Remove the x, y and z variables from the diamonds dataset

# A tibble: 53,940 × 7
   carat cut       color clarity depth table price
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int>
 1  0.23 Ideal     E     SI2      61.5    55   326
 2  0.21 Premium   E     SI1      59.8    61   326
 3  0.23 Good      E     VS1      56.9    65   327
 4  0.29 Premium   I     VS2      62.4    58   334
 5  0.31 Good      J     SI2      63.3    58   335
 6  0.24 Very Good J     VVS2     62.8    57   336
 7  0.24 Very Good I     VVS1     62.3    57   336
 8  0.26 Very Good H     SI1      61.9    55   337
 9  0.22 Fair      E     VS2      65.1    61   337
10  0.23 Very Good H     VS1      59.4    61   338
# ℹ 53,930 more rows

Task 6

diamonds %>%              # Utilises the diamonds dataset
  group_by(cut) %>%       # Groups the dataset by the cut variable
  summarize(count = n())  # Summarises the number of diamonds there are for each cut value

# A tibble: 5 × 2
  cut       count
  <ord>     <int>
1 Fair       1610
2 Good       4906
3 Very Good 12082
4 Premium   13791
5 Ideal     21551

Task 7

diamonds %>%              # Utilises the diamonds dataset
  mutate(totalNum = n())  # Creates a new column named totalNum that calculates the total number of diamonds

# A tibble: 53,940 × 11
   carat cut       color clarity depth table price     x     y     z totalNum
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>    <int>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43    53940
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31    53940
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31    53940
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63    53940
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75    53940
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48    53940
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47    53940
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53    53940
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49    53940
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39    53940
# ℹ 53,930 more rows

2.2 Research questions

Good question

How do the proportions of diamond cuts vary with different colours?

Examines the relationship between two categorical variables

Bad question

What can we learn from the diamonds dataset?

Too broad and lacking specificity

Chapter 3

3.1 Data Exploration

Learning outcomes

Make questions about your data
Explore the basic features of your data
Make simple exploratory graphics

Post-session exercises from Andrew’s video

Aim: follow the video tutorial and reproduce the graphs

Basic scatter plot

ggplot(crickets, aes(x = temp, 
                     y = rate)) + 
  geom_point() +
  labs(x = "Temperature",
       y = "Chirp rate",
       title = "Cricket chirps",
       caption = "Source: McDonald (2009)")

ggplot(crickets, aes(x = temp, 
                     y = rate,
                     color = species)) + 
  geom_point() +
  labs(x = "Temperature",
       y = "Chirp rate",
       color = "Species",
       title = "Cricket chirps",
       caption = "Source: McDonald (2009)") +
  scale_color_brewer(palette = "Dark2")

Modifying basic properties of scatter plot

ggplot(crickets, aes(x = temp, 
                     y = rate)) + 
  geom_point(color = "red",
             size = 2,
             alpha = .3,
             shape = "square") +
  labs(x = "Temperature",
       y = "Chirp rate",
       title = "Cricket chirps",
       caption = "Source: McDonald (2009)")

Adding another layer to scatter plot

ggplot(crickets, aes(x = temp, 
                     y = rate)) + 
  geom_point() +
  geom_smooth(method = "lm",
              se = FALSE) +
  labs(x = "Temperature",
       y = "Chirp rate",
       title = "Cricket chirps",
       caption = "Source: McDonald (2009)")

`geom_smooth()` using formula = 'y ~ x'

ggplot(crickets, aes(x = temp, 
                     y = rate,
                     color = species)) + 
  geom_point() +
  geom_smooth(method = "lm",
              se = FALSE) +
  labs(x = "Temperature",
       y = "Chirp rate",
       color = "Species",
       title = "Cricket chirps",
       caption = "Source: McDonald (2009)") +
  scale_color_brewer(palette = "Dark2")

`geom_smooth()` using formula = 'y ~ x'

Histogram

ggplot(crickets, aes(x = rate)) + 
  geom_histogram(bins = 15)

Frequency polygon

ggplot(crickets, aes(x = rate)) + 
  geom_freqpoly(bins = 15)

Bar graph

ggplot(crickets, aes(x = species)) + 
  geom_bar(color = "black",
           fill = "lightblue")

ggplot(crickets, aes(x = species, 
                     fill = species)) + 
  geom_bar(show.legend = FALSE) +
  scale_fill_brewer(palette = "Dark2")

Box plot

ggplot(crickets, aes(x = species, 
                     y = rate,
                     color = species)) + 
  geom_boxplot(show.legend = FALSE) +
  scale_color_brewer(palette = "Dark2") +
  theme_minimal()

Histogram - faceting

ggplot(crickets, aes(x = rate,
                     fill = species)) + 
  geom_histogram(bins = 15,
                 show.legend = FALSE) + 
  facet_wrap(~species) +
  scale_fill_brewer(palette = "Dark2")

ggplot(crickets, aes(x = rate,
                     fill = species)) + 
  geom_histogram(bins = 15,
                 show.legend = FALSE) + 
  facet_wrap(~species,
             ncol = 1) +
  scale_fill_brewer(palette = "Dark2") + 
  theme_minimal()

3.2 Scientific Hypotheses

Learning outcomes

How to elaborate a sound scientific hypothesis
How to differentiate scientific and statistical hypotheses
Connecting hypotheses and questions
How to write hypotheses in scientific texts

Components of a good research hypothesis

Testability: possible to test the hypothesis through experiments, observations or data analysis
Falsifiable: it must be possible to disprove the hypothesis
Clarity: easily understandable and unambiguous
Brevity: presented in a concise statement
Relevant: addresses a significant knowledge gap of importance
Focused: targets a specific issue without being too broad or vague
Researchable: there must be sufficient data available or methods to investigate the hypothesis
Originality: contributes new perspectives and knowledge by building on previous theory

Chapter 4 - Choosing correct analyses

Learning outcomes

Identify the nature of variables
Foresee the types of analyses
Draw graphics of predicted results

Post-session exercises

Aim: reproduce graphs using the iris dataset and identify which tests can be associated with them

Box plot

Mean tests - testing differences in means

T-tests
Anovas
Non-parametric equivalents
Nested and two way
Post-hoc tests

iris %>% 
  na.omit() %>% 
  ggplot(aes(
    x= Species, 
    y= Sepal.Length,
    color=Species))+
  geom_boxplot(alpha=0.7)-> cat_x_num

cat_x_num

Density plot

Density tests - examines distribution of a continuous variable

Mann-Whitney
Anovas

iris %>%
  na.omit() %>% 
  ggplot(aes(x = Petal.Length, color = Species, fill = Species)) +
  geom_density(alpha = 0.5)

Scatter plot

Correlations - testing associations between continuous variables

Correlations (many variations)
Linear models

iris %>%
  na.omit() %>%
  ggplot(aes(
    x = Petal.Length, 
    y = Petal.Width)) +
  geom_point(aes(
    color = Species, 
    shape = Species)) +  
  geom_smooth(method = "lm", se = TRUE) -> num_x_num  

num_x_num

`geom_smooth()` using formula = 'y ~ x'

Bar plot

Frequency tests - testing associations between categorical variables

Chi-square
G-tests
Contingency tables
Log-linear models

iris <- iris %>%
  mutate(size = ifelse(Sepal.Length < median(Sepal.Length), "small", "big"))

iris %>% 
  na.omit() %>% 
  ggplot(aes(
    x= Species, 
    color=size, 
    fill=size))+
  geom_bar(position = "dodge")-> cat_x_cat
cat_x_cat

Chapter 5 - Frequency Tests

Learning outcomes

Know when to use frequency tests
Work with tables and graphs

Post-session exercises

Aim: run examples to produce contingency tables and visualisations

Comparing proportions

1) One-proportion z-test

Used to compare an observed proportion to a theoretical one, when there are only two categories

res <- prop.test(x = 95, n = 160, p = 0.5, 
                 correct = FALSE)
# Printing the results
res


    1-sample proportions test without continuity correction

data:  95 out of 160, null probability 0.5
X-squared = 5.625, df = 1, p-value = 0.01771
alternative hypothesis: true p is not equal to 0.5
95 percent confidence interval:
 0.5163169 0.6667870
sample estimates:
      p 
0.59375

The p-value of the test is 0.01771, which is less than the significance level so we can conclude that the proportion of male with cancer is significantly different from 0.5 with a p-value = 0.01771

2) Two-proportions z-test

Used to compare two observed proportions

res <- prop.test(x = c(490, 400), n = c(500, 500))
# Printing the results
res


    2-sample test for equality of proportions with continuity correction

data:  c(490, 400) out of c(500, 500)
X-squared = 80.909, df = 1, p-value < 2.2e-16
alternative hypothesis: two.sided
95 percent confidence interval:
 0.1408536 0.2191464
sample estimates:
prop 1 prop 2 
  0.98   0.80

The p-value of the test is 2.36310^{-19}, which is less than the significance level so we can conclude that the proportion of smokers is significantly different in the two groups with a p-value = 2.36310^{-19}

3) Chi-square goodness of fit test

Used to compare the observed distribution to an expected distribution, in a situation where we have two or more categories in a discrete data

tulip <- c(81, 50, 27)
res <- chisq.test(tulip, p = c(1/3, 1/3, 1/3))
res


    Chi-squared test for given probabilities

data:  tulip
X-squared = 27.886, df = 2, p-value = 8.803e-07

The p-value of the test is 8.80310^{-7}, which is less than the significance level so we can conclude that the colors are significantly not commonly distributed with a p-value = 8.80310^{-7}

tulip <- c(81, 50, 27)
res <- chisq.test(tulip, p = c(1/2, 1/3, 1/6))
res


    Chi-squared test for given probabilities

data:  tulip
X-squared = 0.20253, df = 2, p-value = 0.9037

The p-value of the test is 0.9037, which is greater than the significance level so we can conclude that the observed proportions are not significantly different from the expected proportions

3) Chi-square test of independence

Used to analyse the frequency table (contingency table) formed by two categorical variables

# Import the data
file_path <- "http://www.sthda.com/sthda/RDoc/data/housetasks.txt"
housetasks <- read.delim(file_path, row.names = 1)
# head(housetasks)

# 1. convert the data as a table
dt <- as.table(as.matrix(housetasks))
# 2. Graph
balloonplot(t(dt), main ="housetasks", xlab ="", ylab="",
            label = FALSE, show.margins = FALSE)

mosaicplot(dt, shade = TRUE, las=2,
           main = "housetasks")

# plot just a subset of the table
assoc(head(dt, 5), shade = TRUE, las=3)

chisq <- chisq.test(housetasks)
chisq


    Pearson's Chi-squared test

data:  housetasks
X-squared = 1944.5, df = 36, p-value < 2.2e-16

# Observed counts
chisq$observed

           Wife Alternating Husband Jointly
Laundry     156          14       2       4
Main_meal   124          20       5       4
Dinner       77          11       7      13
Breakfeast   82          36      15       7
Tidying      53          11       1      57
Dishes       32          24       4      53
Shopping     33          23       9      55
Official     12          46      23      15
Driving      10          51      75       3
Finances     13          13      21      66
Insurance     8           1      53      77
Repairs       0           3     160       2
Holidays      0           1       6     153

# Expected counts
round(chisq$expected,2)

            Wife Alternating Husband Jointly
Laundry    60.55       25.63   38.45   51.37
Main_meal  52.64       22.28   33.42   44.65
Dinner     37.16       15.73   23.59   31.52
Breakfeast 48.17       20.39   30.58   40.86
Tidying    41.97       17.77   26.65   35.61
Dishes     38.88       16.46   24.69   32.98
Shopping   41.28       17.48   26.22   35.02
Official   33.03       13.98   20.97   28.02
Driving    47.82       20.24   30.37   40.57
Finances   38.88       16.46   24.69   32.98
Insurance  47.82       20.24   30.37   40.57
Repairs    56.77       24.03   36.05   48.16
Holidays   55.05       23.30   34.95   46.70

corrplot(chisq$residuals, is.cor = FALSE)

# Contibution in percentage (%)
contrib <- 100*chisq$residuals^2/chisq$statistic
round(contrib, 3)

            Wife Alternating Husband Jointly
Laundry    7.738       0.272   1.777   2.246
Main_meal  4.976       0.012   1.243   1.903
Dinner     2.197       0.073   0.600   0.560
Breakfeast 1.222       0.615   0.408   1.443
Tidying    0.149       0.133   1.270   0.661
Dishes     0.063       0.178   0.891   0.625
Shopping   0.085       0.090   0.581   0.586
Official   0.688       3.771   0.010   0.311
Driving    1.538       2.403   3.374   1.789
Finances   0.886       0.037   0.028   1.700
Insurance  1.705       0.941   0.868   1.683
Repairs    2.919       0.947  21.921   2.275
Holidays   2.831       1.098   1.233  12.445

# Visualise the contribution
corrplot(contrib, is.cor = FALSE)

Contingency tables

1) Summary tables using dplyr and tidyr

Summarise the total number of cars with each class and cylinder combination

mpg%>%
  group_by(class, cyl)%>%
  summarise(n=n())%>%
  spread(cyl, n)%>%
  kable()

`summarise()` has grouped output by 'class'. You can override using the
`.groups` argument.

class	4	5	6	8
2seater	NA	NA	NA	5
compact	32	2	13	NA
midsize	16	NA	23	2
minivan	1	NA	10	NA
pickup	3	NA	10	20
subcompact	21	2	7	5
suv	8	NA	16	38

Summarise the average number of city miles by class and cylinders

mpg%>%
  group_by(class, cyl)%>%
  summarise(mean_cty=mean(cty))%>%
  spread(cyl, mean_cty)%>%
  kable()

`summarise()` has grouped output by 'class'. You can override using the
`.groups` argument.

class	4	5	6	8
2seater	NA	NA	NA	15.40000
compact	21.37500	21	16.92308	NA
midsize	20.50000	NA	17.78261	16.00000
minivan	18.00000	NA	15.60000	NA
pickup	16.00000	NA	14.50000	11.80000
subcompact	22.85714	20	17.00000	14.80000
suv	18.00000	NA	14.50000	12.13158

Summarise the maximum number of city miles by class and cylinders

mpg%>%
  group_by(class, cyl)%>%
  summarise(max_cty=max(cty))%>%
  spread(cyl, max_cty)%>%
  kable()

`summarise()` has grouped output by 'class'. You can override using the
`.groups` argument.

class	4	5	6	8
2seater	NA	NA	NA	16
compact	33	21	18	NA
midsize	23	NA	19	16
minivan	18	NA	17	NA
pickup	17	NA	16	14
subcompact	35	20	18	15
suv	20	NA	17	14

Contingency table of proportion values

mpg%>%
  group_by(class, cyl)%>%
  summarize(n=n())%>%
  mutate(prop=n/sum(n))%>%
  subset(select=c("class","cyl","prop"))%>%   #drop the frequency value
  spread(class, prop)%>%
  kable()

`summarise()` has grouped output by 'class'. You can override using the
`.groups` argument.

cyl	2seater	compact	midsize	minivan	pickup	subcompact	suv
4	NA	0.6808511	0.3902439	0.0909091	0.0909091	0.6000000	0.1290323
5	NA	0.0425532	NA	NA	NA	0.0571429	NA
6	NA	0.2765957	0.5609756	0.9090909	0.3030303	0.2000000	0.2580645
8	1	NA	0.0487805	NA	0.6060606	0.1428571	0.6129032

2) Frequency and proportion tables using table()

Contingency table of vehicle class by number of cylinders

table(mpg$class, mpg$cyl)

            
              4  5  6  8
  2seater     0  0  0  5
  compact    32  2 13  0
  midsize    16  0 23  2
  minivan     1  0 10  0
  pickup      3  0 10 20
  subcompact 21  2  7  5
  suv         8  0 16 38

Table frequency

mpg_table<- table(mpg$class, mpg$cyl) #define object w/table parameters for simple calling
ftable(mpg_table)

             4  5  6  8
                       
2seater      0  0  0  5
compact     32  2 13  0
midsize     16  0 23  2
minivan      1  0 10  0
pickup       3  0 10 20
subcompact  21  2  7  5
suv          8  0 16 38

Row frequencies

margin.table(mpg_table, 1)


   2seater    compact    midsize    minivan     pickup subcompact        suv 
         5         47         41         11         33         35         62

Column frequencies

margin.table(mpg_table, 2)


 4  5  6  8 
81  4 79 70

Proportion of the entire table

prop.table(mpg_table)

            
                       4           5           6           8
  2seater    0.000000000 0.000000000 0.000000000 0.021367521
  compact    0.136752137 0.008547009 0.055555556 0.000000000
  midsize    0.068376068 0.000000000 0.098290598 0.008547009
  minivan    0.004273504 0.000000000 0.042735043 0.000000000
  pickup     0.012820513 0.000000000 0.042735043 0.085470085
  subcompact 0.089743590 0.008547009 0.029914530 0.021367521
  suv        0.034188034 0.000000000 0.068376068 0.162393162

Row proportions

prop.table(mpg_table, 1)

            
                      4          5          6          8
  2seater    0.00000000 0.00000000 0.00000000 1.00000000
  compact    0.68085106 0.04255319 0.27659574 0.00000000
  midsize    0.39024390 0.00000000 0.56097561 0.04878049
  minivan    0.09090909 0.00000000 0.90909091 0.00000000
  pickup     0.09090909 0.00000000 0.30303030 0.60606061
  subcompact 0.60000000 0.05714286 0.20000000 0.14285714
  suv        0.12903226 0.00000000 0.25806452 0.61290323

Column proportions

prop.table(mpg_table, 2)

            
                      4          5          6          8
  2seater    0.00000000 0.00000000 0.00000000 0.07142857
  compact    0.39506173 0.50000000 0.16455696 0.00000000
  midsize    0.19753086 0.00000000 0.29113924 0.02857143
  minivan    0.01234568 0.00000000 0.12658228 0.00000000
  pickup     0.03703704 0.00000000 0.12658228 0.28571429
  subcompact 0.25925926 0.50000000 0.08860759 0.07142857
  suv        0.09876543 0.00000000 0.20253165 0.54285714

3) Frequency, proportion and chi-square values using CrossTable()

CrossTable(mpg$class, mpg$cyl)


 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  234 

 
             | mpg$cyl 
   mpg$class |         4 |         5 |         6 |         8 | Row Total | 
-------------|-----------|-----------|-----------|-----------|-----------|
     2seater |         0 |         0 |         0 |         5 |         5 | 
             |     1.731 |     0.085 |     1.688 |     8.210 |           | 
             |     0.000 |     0.000 |     0.000 |     1.000 |     0.021 | 
             |     0.000 |     0.000 |     0.000 |     0.071 |           | 
             |     0.000 |     0.000 |     0.000 |     0.021 |           | 
-------------|-----------|-----------|-----------|-----------|-----------|
     compact |        32 |         2 |        13 |         0 |        47 | 
             |    15.210 |     1.782 |     0.518 |    14.060 |           | 
             |     0.681 |     0.043 |     0.277 |     0.000 |     0.201 | 
             |     0.395 |     0.500 |     0.165 |     0.000 |           | 
             |     0.137 |     0.009 |     0.056 |     0.000 |           | 
-------------|-----------|-----------|-----------|-----------|-----------|
     midsize |        16 |         0 |        23 |         2 |        41 | 
             |     0.230 |     0.701 |     6.059 |     8.591 |           | 
             |     0.390 |     0.000 |     0.561 |     0.049 |     0.175 | 
             |     0.198 |     0.000 |     0.291 |     0.029 |           | 
             |     0.068 |     0.000 |     0.098 |     0.009 |           | 
-------------|-----------|-----------|-----------|-----------|-----------|
     minivan |         1 |         0 |        10 |         0 |        11 | 
             |     2.070 |     0.188 |    10.641 |     3.291 |           | 
             |     0.091 |     0.000 |     0.909 |     0.000 |     0.047 | 
             |     0.012 |     0.000 |     0.127 |     0.000 |           | 
             |     0.004 |     0.000 |     0.043 |     0.000 |           | 
-------------|-----------|-----------|-----------|-----------|-----------|
      pickup |         3 |         0 |        10 |        20 |        33 | 
             |     6.211 |     0.564 |     0.117 |    10.391 |           | 
             |     0.091 |     0.000 |     0.303 |     0.606 |     0.141 | 
             |     0.037 |     0.000 |     0.127 |     0.286 |           | 
             |     0.013 |     0.000 |     0.043 |     0.085 |           | 
-------------|-----------|-----------|-----------|-----------|-----------|
  subcompact |        21 |         2 |         7 |         5 |        35 | 
             |     6.515 |     3.284 |     1.963 |     2.858 |           | 
             |     0.600 |     0.057 |     0.200 |     0.143 |     0.150 | 
             |     0.259 |     0.500 |     0.089 |     0.071 |           | 
             |     0.090 |     0.009 |     0.030 |     0.021 |           | 
-------------|-----------|-----------|-----------|-----------|-----------|
         suv |         8 |         0 |        16 |        38 |        62 | 
             |     8.444 |     1.060 |     1.162 |    20.403 |           | 
             |     0.129 |     0.000 |     0.258 |     0.613 |     0.265 | 
             |     0.099 |     0.000 |     0.203 |     0.543 |           | 
             |     0.034 |     0.000 |     0.068 |     0.162 |           | 
-------------|-----------|-----------|-----------|-----------|-----------|
Column Total |        81 |         4 |        79 |        70 |       234 | 
             |     0.346 |     0.017 |     0.338 |     0.299 |           | 
-------------|-----------|-----------|-----------|-----------|-----------|

4) Exploratory

chisq2 <- chisq.test(mpg_table)

Warning in chisq.test(mpg_table): Chi-squared approximation may be incorrect

chisq2


    Pearson's Chi-squared test

data:  mpg_table
X-squared = 138.03, df = 18, p-value < 2.2e-16

# Observed counts
chisq2$observed

            
              4  5  6  8
  2seater     0  0  0  5
  compact    32  2 13  0
  midsize    16  0 23  2
  minivan     1  0 10  0
  pickup      3  0 10 20
  subcompact 21  2  7  5
  suv         8  0 16 38

# Expected counts
round(chisq2$expected,2)

            
                 4    5     6     8
  2seater     1.73 0.09  1.69  1.50
  compact    16.27 0.80 15.87 14.06
  midsize    14.19 0.70 13.84 12.26
  minivan     3.81 0.19  3.71  3.29
  pickup     11.42 0.56 11.14  9.87
  subcompact 12.12 0.60 11.82 10.47
  suv        21.46 1.06 20.93 18.55

round(chisq2$residuals, 3)

            
                  4      5      6      8
  2seater    -1.316 -0.292 -1.299  2.865
  compact     3.900  1.335 -0.720 -3.750
  midsize     0.480 -0.837  2.462 -2.931
  minivan    -1.439 -0.434  3.262 -1.814
  pickup     -2.492 -0.751 -0.342  3.224
  subcompact  2.553  1.812 -1.401 -1.691
  suv        -2.906 -1.029 -1.078  4.517

corrplot(chisq2$residuals, is.cor = FALSE)

Contibution in percentage (%)

contrib2 <- 100*chisq2$residuals^2/chisq2$statistic
round(contrib2, 3)

            
                  4      5      6      8
  2seater     1.254  0.062  1.223  5.948
  compact    11.020  1.291  0.375 10.186
  midsize     0.167  0.508  4.390  6.224
  minivan     1.500  0.136  7.709  2.384
  pickup      4.500  0.409  0.085  7.528
  subcompact  4.720  2.379  1.422  2.070
  suv         6.117  0.768  0.842 14.782

Visualise the contribution

corrplot(contrib2, is.cor = FALSE)