library(tidyverse)
library(readr)
library(knitr)
library(palmerpenguins)
library(modeldata)
library(gplots)
library(graphics)
library(vcd)
library(corrplot)
library(ggplot2)
library(dplyr)
library(tidyr)
library(gmodels)Katie’s Workbook
Chapter 1 - Introduction
Learning outcomes
🐧 Exploring palmer penguins dataset 🐧
The penguins data from the palmerpenguins package contains size measurements for 344 penguins from three species observed on three islands in the Palmer Archipelago, Antarctica.
Include cute photos of penguins to make the reader happy :)
The plot below shows the relationship between flipper and bill lengths of these penguins.
Chapter 2
2.1 Data Wrangling
Learning outcomes
Understand the importance of data wrangling
Learn basic skills
Post-session exercises from 6.6.1
Aim: follow the instructions and provide a comment for each line of code
Problem A
midwest %>% # Utilises the midwest dataset
group_by(state) %>% # Groups data by state variable
summarize(poptotalmean = mean(poptotal), # Summarises the grouped data and calculates average of poptotal
poptotalmed = median(poptotal), # Median of poptotal
popmax = max(poptotal), # Maximum value of poptotal
popmin = min(poptotal), # Minimum value of poptotal
popdistinct = n_distinct(poptotal), # Counts the number of distinct values of poptotal
popfirst = first(poptotal), # Retrieves the first value of poptotal
popany = any(poptotal < 5000), # Checks if there are any values of poptotal less than 5000
popany2 = any(poptotal > 2000000)) %>% # Checks if there are any values of poptotal greater than 2000000
ungroup() # Final ungrouping of data# A tibble: 5 × 9
state poptotalmean poptotalmed popmax popmin popdistinct popfirst popany
<chr> <dbl> <dbl> <int> <int> <int> <int> <lgl>
1 IL 112065. 24486. 5105067 4373 101 66090 TRUE
2 IN 60263. 30362. 797159 5315 92 31095 FALSE
3 MI 111992. 37308 2111687 1701 83 10145 TRUE
4 OH 123263. 54930. 1412140 11098 88 25371 FALSE
5 WI 67941. 33528 959275 3890 72 15682 TRUE
# ℹ 1 more variable: popany2 <lgl>
Problem B
## Problem B
midwest %>% # Utilises the midwest dataset
group_by(state) %>% # Groups data by state variable
summarize(num5k = sum(poptotal < 5000), # Summarises the grouped data and calculates the number of instances where poptotal in less than 5000
num2mil = sum(poptotal > 2000000), # Calculates the number of instances where poptotal in greater than 2000000
numrows = n()) %>% # Counts the number of rows in each state group
ungroup() # Final ungrouping of data# A tibble: 5 × 4
state num5k num2mil numrows
<chr> <int> <int> <int>
1 IL 1 1 102
2 IN 0 0 92
3 MI 1 1 83
4 OH 0 0 88
5 WI 2 0 72
Problem C
# part I
midwest %>% # Utilises the midwest dataset
group_by(county) %>% # Groups data by county variable
summarize(x = n_distinct(state)) %>% # Summarises the grouped data by calculating the number of distinct states associated with each county
arrange(desc(x)) %>% # Visualise data in descending order based on values in column x
ungroup() # Final ungrouping of data# A tibble: 320 × 2
county x
<chr> <int>
1 CRAWFORD 5
2 JACKSON 5
3 MONROE 5
4 ADAMS 4
5 BROWN 4
6 CLARK 4
7 CLINTON 4
8 JEFFERSON 4
9 LAKE 4
10 WASHINGTON 4
# ℹ 310 more rows
# part II
# Question: How does n() differ from n_distinct()?
# Answer: n() counts the total number of rows in the current group whereas n_distinct() counts the number of unique or distinct values in a specified column
midwest %>% # Utilises the midwest dataset
group_by(county) %>% # Groups data by county variable
summarize(x = n()) %>% # Summarises the grouped data by counting the total number of rows in each county
ungroup() # Final ungrouping of data# A tibble: 320 × 2
county x
<chr> <int>
1 ADAMS 4
2 ALCONA 1
3 ALEXANDER 1
4 ALGER 1
5 ALLEGAN 1
6 ALLEN 2
7 ALPENA 1
8 ANTRIM 1
9 ARENAC 1
10 ASHLAND 2
# ℹ 310 more rows
# part III
midwest %>% # Utilises the midwest dataset
group_by(county) %>% # Groups data by county variable
summarize(x = n_distinct(county)) %>% # Summarises the grouped data by calculating the number of distinct counties within each group
ungroup() # Final ungrouping of data# A tibble: 320 × 2
county x
<chr> <int>
1 ADAMS 1
2 ALCONA 1
3 ALEXANDER 1
4 ALGER 1
5 ALLEGAN 1
6 ALLEN 1
7 ALPENA 1
8 ANTRIM 1
9 ARENAC 1
10 ASHLAND 1
# ℹ 310 more rows
Problem D
diamonds %>% # Utilises the diamonds dataset
group_by(clarity) %>% # Groups data by clarity variable
summarize(a = n_distinct(color), # Summarises the grouped data and calculates the number of distinct colors for diamonds within each clarity group
b = n_distinct(price), # Calculates the number of distinct price values for diamonds within each clarity group
c = n()) %>% # Counts the total number of rows for each clarity group
ungroup() # Final ungrouping of data# A tibble: 8 × 4
clarity a b c
<ord> <int> <int> <int>
1 I1 7 632 741
2 SI2 7 4904 9194
3 SI1 7 5380 13065
4 VS2 7 5051 12258
5 VS1 7 3926 8171
6 VVS2 7 2409 5066
7 VVS1 7 1623 3655
8 IF 7 902 1790
Problem E
# part I
diamonds %>% # Utilises the diamonds dataset
group_by(color, cut) %>% # Groups data by color and cut variables
summarize(m = mean(price), # Summarises the grouped data and calculates the average price of diamonds for each combination of color and cut
s = sd(price)) %>% # Calculates the standard deviation of the price for each combination of color and cut
ungroup() # Final ungrouping of data`summarise()` has grouped output by 'color'. You can override using the
`.groups` argument.
# A tibble: 35 × 4
color cut m s
<ord> <ord> <dbl> <dbl>
1 D Fair 4291. 3286.
2 D Good 3405. 3175.
3 D Very Good 3470. 3524.
4 D Premium 3631. 3712.
5 D Ideal 2629. 3001.
6 E Fair 3682. 2977.
7 E Good 3424. 3331.
8 E Very Good 3215. 3408.
9 E Premium 3539. 3795.
10 E Ideal 2598. 2956.
# ℹ 25 more rows
# part II
diamonds %>% # Utilises the diamonds dataset
group_by(cut, color) %>% # Groups data by cut and color variables
summarize(m = mean(price), # Summarises the grouped data and calculates the average price of diamonds for each combination of color and cut
s = sd(price)) %>% # Calculates the standard deviation of the price for each combination of color and cut
ungroup() # Final ungrouping of data`summarise()` has grouped output by 'cut'. You can override using the `.groups`
argument.
# A tibble: 35 × 4
cut color m s
<ord> <ord> <dbl> <dbl>
1 Fair D 4291. 3286.
2 Fair E 3682. 2977.
3 Fair F 3827. 3223.
4 Fair G 4239. 3610.
5 Fair H 5136. 3886.
6 Fair I 4685. 3730.
7 Fair J 4976. 4050.
8 Good D 3405. 3175.
9 Good E 3424. 3331.
10 Good F 3496. 3202.
# ℹ 25 more rows
# part III
diamonds %>% # Utilises the diamonds dataset
group_by(cut, color, clarity) %>% # Groups data by cut, color and clarity variables
summarize(m = mean(price), # Summarises the grouped data and calculates the average price of diamonds for each group
s = sd(price), # Calculates the standard deviation of the price within each group
msale = m * 0.80) %>% # Creates a new column msale, which represents 80% of the mean price calculated earlier
ungroup() # Final ungrouping of data`summarise()` has grouped output by 'cut', 'color'. You can override using the
`.groups` argument.
# A tibble: 276 × 6
cut color clarity m s msale
<ord> <ord> <ord> <dbl> <dbl> <dbl>
1 Fair D I1 7383 5899. 5906.
2 Fair D SI2 4355. 3260. 3484.
3 Fair D SI1 4273. 3019. 3419.
4 Fair D VS2 4513. 3383. 3610.
5 Fair D VS1 2921. 2550. 2337.
6 Fair D VVS2 3607 3629. 2886.
7 Fair D VVS1 4473 5457. 3578.
8 Fair D IF 1620. 525. 1296.
9 Fair E I1 2095. 824. 1676.
10 Fair E SI2 4172. 3055. 3338.
# ℹ 266 more rows
Problem F
diamonds %>% # Utilises the diamonds dataset
group_by(cut) %>% # Groups data by cut variable
summarize(potato = mean(depth), # Summarises the grouped data and calculates the depth variable for each group
pizza = mean(price), # Calculates the mean of the price variable for each group
popcorn = median(y), # Calculates the median of the variable y for each group
pineapple = potato - pizza, # Calculates the difference between the mean depth (potato) and mean price (pizza)
papaya = pineapple ^ 2, # Calculates the square of the value in pineapple
peach = n()) %>% # Counts the number of rows in each group
ungroup() # Final ungrouping of data# A tibble: 5 × 7
cut potato pizza popcorn pineapple papaya peach
<ord> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 Fair 64.0 4359. 6.1 -4295. 18444586. 1610
2 Good 62.4 3929. 5.99 -3866. 14949811. 4906
3 Very Good 61.8 3982. 5.77 -3920. 15365942. 12082
4 Premium 61.3 4584. 6.06 -4523. 20457466. 13791
5 Ideal 61.7 3458. 5.26 -3396. 11531679. 21551
Problem G
# part I
diamonds %>% # Utilises the diamonds dataset
group_by(color) %>% # Groups data by color variable
summarize(m = mean(price)) %>% # Calculates the average price of diamonds within each color group
mutate(x1 = str_c("Diamond color ", color), # Creates new column named x1
x2 = 5) %>% # Creates new column named x2, which contains the constant value 5 for all rows
ungroup() # Final ungrouping of data# A tibble: 7 × 4
color m x1 x2
<ord> <dbl> <chr> <dbl>
1 D 3170. Diamond color D 5
2 E 3077. Diamond color E 5
3 F 3725. Diamond color F 5
4 G 3999. Diamond color G 5
5 H 4487. Diamond color H 5
6 I 5092. Diamond color I 5
7 J 5324. Diamond color J 5
# part II
diamonds %>% # Utilises the diamonds dataset
group_by(color) %>% # Groups data by color variable
summarize(m = mean(price)) %>% # Calculates the average price of diamonds within each color group
ungroup() %>% # Data no longer grouped by color
mutate(x1 = str_c("Diamond color ", color), # Creates new column named x1
x2 = 5) # Creates new column named x2, which contains the constant value 5 for all rows# A tibble: 7 × 4
color m x1 x2
<ord> <dbl> <chr> <dbl>
1 D 3170. Diamond color D 5
2 E 3077. Diamond color E 5
3 F 3725. Diamond color F 5
4 G 3999. Diamond color G 5
5 H 4487. Diamond color H 5
6 I 5092. Diamond color I 5
7 J 5324. Diamond color J 5
Problem H
# part I
diamonds %>% # Utilises the diamonds dataset
group_by(color) %>% # Groups data by color variable
mutate(x1 = price * 0.5) %>% # Creates new column named x1, which is calculated by taking half of the price for each diamond
ungroup() # Final ungrouping of data# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z x1
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 163
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 163
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 164.
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 167
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 168.
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 168
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 168
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 168.
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 168.
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 169
# ℹ 53,930 more rows
# part II
diamonds %>% # Utilises the diamonds dataset
group_by(color) %>% # Groups data by color variable
mutate(x1 = price * 0.5) %>% # Creates new column named x1, which is calculated by taking half of the price for each diamond
ungroup() %>% # Data no longer grouped by color
summarize(m = mean(x1)) # Calculates the average of the x1 column# A tibble: 1 × 1
m
<dbl>
1 1966.
In the words of Rihanna: shine bright like a diamond 💎
Questions
Why is grouping data necessary?
It allows you to calculate summary statistics for specific data subsets and simplifies data analysis
Why is ungrouping data necessary?
After performing an operation using grouped data, it allows you to restore the original data frame structure
When should you ungroup data?
After you have performed a grouped calculation on the grouped data
If the code does not contain group_by(), do you still need ungroup() at the end? For example, does data() %>% mutate(newVar = 1 + 2) require ungroup()?
No, if the code doesn’t contain a group_by() statement, you don’t need to use ungroup() at the end
Post-session extra practice
Task 1
names(diamonds) # View all the variable names in diamonds [1] "carat" "cut" "color" "clarity" "depth" "table" "price"
[8] "x" "y" "z"
Task 2
diamonds %>% arrange(price) # Arrange diamonds by lowest to highest price# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
# ℹ 53,930 more rows
diamonds %>% arrange(desc(price)) # Arrange diamonds by highest to lowest price# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 2.29 Premium I VS2 60.8 60 18823 8.5 8.47 5.16
2 2 Very Good G SI1 63.5 56 18818 7.9 7.97 5.04
3 1.51 Ideal G IF 61.7 55 18806 7.37 7.41 4.56
4 2.07 Ideal G SI2 62.5 55 18804 8.2 8.13 5.11
5 2 Very Good H SI1 62.8 57 18803 7.95 8 5.01
6 2.29 Premium I SI1 61.8 59 18797 8.52 8.45 5.24
7 2.04 Premium H SI1 58.1 60 18795 8.37 8.28 4.84
8 2 Premium I VS1 60.8 59 18795 8.13 8.02 4.91
9 1.71 Premium F VS2 62.3 59 18791 7.57 7.53 4.7
10 2.15 Ideal G SI2 62.6 54 18791 8.29 8.35 5.21
# ℹ 53,930 more rows
diamonds %>% arrange(price,cut) # Arrange diamonds by lowest price and cut# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
2 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
8 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
9 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
# ℹ 53,930 more rows
diamonds %>% arrange(desc(price), desc(cut)) # Arrange diamonds by highest price and cut# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 2.29 Premium I VS2 60.8 60 18823 8.5 8.47 5.16
2 2 Very Good G SI1 63.5 56 18818 7.9 7.97 5.04
3 1.51 Ideal G IF 61.7 55 18806 7.37 7.41 4.56
4 2.07 Ideal G SI2 62.5 55 18804 8.2 8.13 5.11
5 2 Very Good H SI1 62.8 57 18803 7.95 8 5.01
6 2.29 Premium I SI1 61.8 59 18797 8.52 8.45 5.24
7 2.04 Premium H SI1 58.1 60 18795 8.37 8.28 4.84
8 2 Premium I VS1 60.8 59 18795 8.13 8.02 4.91
9 2.15 Ideal G SI2 62.6 54 18791 8.29 8.35 5.21
10 1.71 Premium F VS2 62.3 59 18791 7.57 7.53 4.7
# ℹ 53,930 more rows
Task 3
diamonds %>% arrange(price, desc(clarity)) # Arrange diamonds by ascending price and descending clarity# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
2 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
7 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
8 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
9 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
# ℹ 53,930 more rows
Task 4
diamonds %>% # Utilises the diamonds dataset
mutate(salePrice = price - 250) # New variable named salePrice which represents a discount of $250 off of the original cost of each diamond# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z salePrice
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 76
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 76
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 77
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 84
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 85
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 86
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 86
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 87
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 87
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 88
# ℹ 53,930 more rows
Task 5
diamonds %>% select(-x,-y,-z) # Remove the x, y and z variables from the diamonds dataset# A tibble: 53,940 × 7
carat cut color clarity depth table price
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int>
1 0.23 Ideal E SI2 61.5 55 326
2 0.21 Premium E SI1 59.8 61 326
3 0.23 Good E VS1 56.9 65 327
4 0.29 Premium I VS2 62.4 58 334
5 0.31 Good J SI2 63.3 58 335
6 0.24 Very Good J VVS2 62.8 57 336
7 0.24 Very Good I VVS1 62.3 57 336
8 0.26 Very Good H SI1 61.9 55 337
9 0.22 Fair E VS2 65.1 61 337
10 0.23 Very Good H VS1 59.4 61 338
# ℹ 53,930 more rows
Task 6
diamonds %>% # Utilises the diamonds dataset
group_by(cut) %>% # Groups the dataset by the cut variable
summarize(count = n()) # Summarises the number of diamonds there are for each cut value# A tibble: 5 × 2
cut count
<ord> <int>
1 Fair 1610
2 Good 4906
3 Very Good 12082
4 Premium 13791
5 Ideal 21551
Task 7
diamonds %>% # Utilises the diamonds dataset
mutate(totalNum = n()) # Creates a new column named totalNum that calculates the total number of diamonds# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z totalNum
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 53940
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 53940
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 53940
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 53940
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 53940
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 53940
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 53940
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 53940
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 53940
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 53940
# ℹ 53,930 more rows
2.2 Research questions
Good question
How do the proportions of diamond cuts vary with different colours?
Examines the relationship between two categorical variables
Bad question
What can we learn from the diamonds dataset?
Too broad and lacking specificity
Chapter 3
3.1 Data Exploration
Learning outcomes
Make questions about your data
Explore the basic features of your data
Make simple exploratory graphics
Post-session exercises from Andrew’s video
Aim: follow the video tutorial and reproduce the graphs
Basic scatter plot
ggplot(crickets, aes(x = temp,
y = rate)) +
geom_point() +
labs(x = "Temperature",
y = "Chirp rate",
title = "Cricket chirps",
caption = "Source: McDonald (2009)")ggplot(crickets, aes(x = temp,
y = rate,
color = species)) +
geom_point() +
labs(x = "Temperature",
y = "Chirp rate",
color = "Species",
title = "Cricket chirps",
caption = "Source: McDonald (2009)") +
scale_color_brewer(palette = "Dark2")Modifying basic properties of scatter plot
ggplot(crickets, aes(x = temp,
y = rate)) +
geom_point(color = "red",
size = 2,
alpha = .3,
shape = "square") +
labs(x = "Temperature",
y = "Chirp rate",
title = "Cricket chirps",
caption = "Source: McDonald (2009)")Adding another layer to scatter plot
ggplot(crickets, aes(x = temp,
y = rate)) +
geom_point() +
geom_smooth(method = "lm",
se = FALSE) +
labs(x = "Temperature",
y = "Chirp rate",
title = "Cricket chirps",
caption = "Source: McDonald (2009)")`geom_smooth()` using formula = 'y ~ x'
ggplot(crickets, aes(x = temp,
y = rate,
color = species)) +
geom_point() +
geom_smooth(method = "lm",
se = FALSE) +
labs(x = "Temperature",
y = "Chirp rate",
color = "Species",
title = "Cricket chirps",
caption = "Source: McDonald (2009)") +
scale_color_brewer(palette = "Dark2") `geom_smooth()` using formula = 'y ~ x'
Histogram
ggplot(crickets, aes(x = rate)) +
geom_histogram(bins = 15) Frequency polygon
ggplot(crickets, aes(x = rate)) +
geom_freqpoly(bins = 15)Bar graph
ggplot(crickets, aes(x = species)) +
geom_bar(color = "black",
fill = "lightblue")ggplot(crickets, aes(x = species,
fill = species)) +
geom_bar(show.legend = FALSE) +
scale_fill_brewer(palette = "Dark2")Box plot
ggplot(crickets, aes(x = species,
y = rate,
color = species)) +
geom_boxplot(show.legend = FALSE) +
scale_color_brewer(palette = "Dark2") +
theme_minimal()Histogram - faceting
ggplot(crickets, aes(x = rate,
fill = species)) +
geom_histogram(bins = 15,
show.legend = FALSE) +
facet_wrap(~species) +
scale_fill_brewer(palette = "Dark2")ggplot(crickets, aes(x = rate,
fill = species)) +
geom_histogram(bins = 15,
show.legend = FALSE) +
facet_wrap(~species,
ncol = 1) +
scale_fill_brewer(palette = "Dark2") +
theme_minimal()3.2 Scientific Hypotheses
Learning outcomes
How to elaborate a sound scientific hypothesis
How to differentiate scientific and statistical hypotheses
Connecting hypotheses and questions
How to write hypotheses in scientific texts
Components of a good research hypothesis
Testability: possible to test the hypothesis through experiments, observations or data analysis
Falsifiable: it must be possible to disprove the hypothesis
Clarity: easily understandable and unambiguous
Brevity: presented in a concise statement
Relevant: addresses a significant knowledge gap of importance
Focused: targets a specific issue without being too broad or vague
Researchable: there must be sufficient data available or methods to investigate the hypothesis
Originality: contributes new perspectives and knowledge by building on previous theory
Chapter 4 - Choosing correct analyses
Learning outcomes
Identify the nature of variables
Foresee the types of analyses
Draw graphics of predicted results
Post-session exercises
Aim: reproduce graphs using the iris dataset and identify which tests can be associated with them
Box plot
Mean tests - testing differences in means
T-tests
Anovas
Non-parametric equivalents
Nested and two way
Post-hoc tests
iris %>%
na.omit() %>%
ggplot(aes(
x= Species,
y= Sepal.Length,
color=Species))+
geom_boxplot(alpha=0.7)-> cat_x_num
cat_x_numDensity plot
Density tests - examines distribution of a continuous variable
Mann-Whitney
Anovas
iris %>%
na.omit() %>%
ggplot(aes(x = Petal.Length, color = Species, fill = Species)) +
geom_density(alpha = 0.5) Scatter plot
Correlations - testing associations between continuous variables
Correlations (many variations)
Linear models
iris %>%
na.omit() %>%
ggplot(aes(
x = Petal.Length,
y = Petal.Width)) +
geom_point(aes(
color = Species,
shape = Species)) +
geom_smooth(method = "lm", se = TRUE) -> num_x_num
num_x_num`geom_smooth()` using formula = 'y ~ x'
Bar plot
Frequency tests - testing associations between categorical variables
Chi-square
G-tests
Contingency tables
Log-linear models
iris <- iris %>%
mutate(size = ifelse(Sepal.Length < median(Sepal.Length), "small", "big"))
iris %>%
na.omit() %>%
ggplot(aes(
x= Species,
color=size,
fill=size))+
geom_bar(position = "dodge")-> cat_x_cat
cat_x_catChapter 5 - Frequency Tests
Learning outcomes
Know when to use frequency tests
Work with tables and graphs
Post-session exercises
Aim: run examples to produce contingency tables and visualisations
Comparing proportions
1) One-proportion z-test
Used to compare an observed proportion to a theoretical one, when there are only two categories
res <- prop.test(x = 95, n = 160, p = 0.5,
correct = FALSE)
# Printing the results
res
1-sample proportions test without continuity correction
data: 95 out of 160, null probability 0.5
X-squared = 5.625, df = 1, p-value = 0.01771
alternative hypothesis: true p is not equal to 0.5
95 percent confidence interval:
0.5163169 0.6667870
sample estimates:
p
0.59375
The p-value of the test is 0.01771, which is less than the significance level so we can conclude that the proportion of male with cancer is significantly different from 0.5 with a p-value = 0.01771
2) Two-proportions z-test
Used to compare two observed proportions
res <- prop.test(x = c(490, 400), n = c(500, 500))
# Printing the results
res
2-sample test for equality of proportions with continuity correction
data: c(490, 400) out of c(500, 500)
X-squared = 80.909, df = 1, p-value < 2.2e-16
alternative hypothesis: two.sided
95 percent confidence interval:
0.1408536 0.2191464
sample estimates:
prop 1 prop 2
0.98 0.80
The p-value of the test is 2.36310^{-19}, which is less than the significance level so we can conclude that the proportion of smokers is significantly different in the two groups with a p-value = 2.36310^{-19}
3) Chi-square goodness of fit test
Used to compare the observed distribution to an expected distribution, in a situation where we have two or more categories in a discrete data
tulip <- c(81, 50, 27)
res <- chisq.test(tulip, p = c(1/3, 1/3, 1/3))
res
Chi-squared test for given probabilities
data: tulip
X-squared = 27.886, df = 2, p-value = 8.803e-07
The p-value of the test is 8.80310^{-7}, which is less than the significance level so we can conclude that the colors are significantly not commonly distributed with a p-value = 8.80310^{-7}
tulip <- c(81, 50, 27)
res <- chisq.test(tulip, p = c(1/2, 1/3, 1/6))
res
Chi-squared test for given probabilities
data: tulip
X-squared = 0.20253, df = 2, p-value = 0.9037
The p-value of the test is 0.9037, which is greater than the significance level so we can conclude that the observed proportions are not significantly different from the expected proportions
3) Chi-square test of independence
Used to analyse the frequency table (contingency table) formed by two categorical variables
# Import the data
file_path <- "http://www.sthda.com/sthda/RDoc/data/housetasks.txt"
housetasks <- read.delim(file_path, row.names = 1)
# head(housetasks)
# 1. convert the data as a table
dt <- as.table(as.matrix(housetasks))
# 2. Graph
balloonplot(t(dt), main ="housetasks", xlab ="", ylab="",
label = FALSE, show.margins = FALSE)mosaicplot(dt, shade = TRUE, las=2,
main = "housetasks")# plot just a subset of the table
assoc(head(dt, 5), shade = TRUE, las=3)chisq <- chisq.test(housetasks)
chisq
Pearson's Chi-squared test
data: housetasks
X-squared = 1944.5, df = 36, p-value < 2.2e-16
# Observed counts
chisq$observed Wife Alternating Husband Jointly
Laundry 156 14 2 4
Main_meal 124 20 5 4
Dinner 77 11 7 13
Breakfeast 82 36 15 7
Tidying 53 11 1 57
Dishes 32 24 4 53
Shopping 33 23 9 55
Official 12 46 23 15
Driving 10 51 75 3
Finances 13 13 21 66
Insurance 8 1 53 77
Repairs 0 3 160 2
Holidays 0 1 6 153
# Expected counts
round(chisq$expected,2) Wife Alternating Husband Jointly
Laundry 60.55 25.63 38.45 51.37
Main_meal 52.64 22.28 33.42 44.65
Dinner 37.16 15.73 23.59 31.52
Breakfeast 48.17 20.39 30.58 40.86
Tidying 41.97 17.77 26.65 35.61
Dishes 38.88 16.46 24.69 32.98
Shopping 41.28 17.48 26.22 35.02
Official 33.03 13.98 20.97 28.02
Driving 47.82 20.24 30.37 40.57
Finances 38.88 16.46 24.69 32.98
Insurance 47.82 20.24 30.37 40.57
Repairs 56.77 24.03 36.05 48.16
Holidays 55.05 23.30 34.95 46.70
corrplot(chisq$residuals, is.cor = FALSE)# Contibution in percentage (%)
contrib <- 100*chisq$residuals^2/chisq$statistic
round(contrib, 3) Wife Alternating Husband Jointly
Laundry 7.738 0.272 1.777 2.246
Main_meal 4.976 0.012 1.243 1.903
Dinner 2.197 0.073 0.600 0.560
Breakfeast 1.222 0.615 0.408 1.443
Tidying 0.149 0.133 1.270 0.661
Dishes 0.063 0.178 0.891 0.625
Shopping 0.085 0.090 0.581 0.586
Official 0.688 3.771 0.010 0.311
Driving 1.538 2.403 3.374 1.789
Finances 0.886 0.037 0.028 1.700
Insurance 1.705 0.941 0.868 1.683
Repairs 2.919 0.947 21.921 2.275
Holidays 2.831 1.098 1.233 12.445
# Visualise the contribution
corrplot(contrib, is.cor = FALSE)Contingency tables
1) Summary tables using dplyr and tidyr
Summarise the total number of cars with each class and cylinder combination
mpg%>%
group_by(class, cyl)%>%
summarise(n=n())%>%
spread(cyl, n)%>%
kable()`summarise()` has grouped output by 'class'. You can override using the
`.groups` argument.
| class | 4 | 5 | 6 | 8 |
|---|---|---|---|---|
| 2seater | NA | NA | NA | 5 |
| compact | 32 | 2 | 13 | NA |
| midsize | 16 | NA | 23 | 2 |
| minivan | 1 | NA | 10 | NA |
| pickup | 3 | NA | 10 | 20 |
| subcompact | 21 | 2 | 7 | 5 |
| suv | 8 | NA | 16 | 38 |
Summarise the average number of city miles by class and cylinders
mpg%>%
group_by(class, cyl)%>%
summarise(mean_cty=mean(cty))%>%
spread(cyl, mean_cty)%>%
kable()`summarise()` has grouped output by 'class'. You can override using the
`.groups` argument.
| class | 4 | 5 | 6 | 8 |
|---|---|---|---|---|
| 2seater | NA | NA | NA | 15.40000 |
| compact | 21.37500 | 21 | 16.92308 | NA |
| midsize | 20.50000 | NA | 17.78261 | 16.00000 |
| minivan | 18.00000 | NA | 15.60000 | NA |
| pickup | 16.00000 | NA | 14.50000 | 11.80000 |
| subcompact | 22.85714 | 20 | 17.00000 | 14.80000 |
| suv | 18.00000 | NA | 14.50000 | 12.13158 |
Summarise the maximum number of city miles by class and cylinders
mpg%>%
group_by(class, cyl)%>%
summarise(max_cty=max(cty))%>%
spread(cyl, max_cty)%>%
kable()`summarise()` has grouped output by 'class'. You can override using the
`.groups` argument.
| class | 4 | 5 | 6 | 8 |
|---|---|---|---|---|
| 2seater | NA | NA | NA | 16 |
| compact | 33 | 21 | 18 | NA |
| midsize | 23 | NA | 19 | 16 |
| minivan | 18 | NA | 17 | NA |
| pickup | 17 | NA | 16 | 14 |
| subcompact | 35 | 20 | 18 | 15 |
| suv | 20 | NA | 17 | 14 |
Contingency table of proportion values
mpg%>%
group_by(class, cyl)%>%
summarize(n=n())%>%
mutate(prop=n/sum(n))%>%
subset(select=c("class","cyl","prop"))%>% #drop the frequency value
spread(class, prop)%>%
kable()`summarise()` has grouped output by 'class'. You can override using the
`.groups` argument.
| cyl | 2seater | compact | midsize | minivan | pickup | subcompact | suv |
|---|---|---|---|---|---|---|---|
| 4 | NA | 0.6808511 | 0.3902439 | 0.0909091 | 0.0909091 | 0.6000000 | 0.1290323 |
| 5 | NA | 0.0425532 | NA | NA | NA | 0.0571429 | NA |
| 6 | NA | 0.2765957 | 0.5609756 | 0.9090909 | 0.3030303 | 0.2000000 | 0.2580645 |
| 8 | 1 | NA | 0.0487805 | NA | 0.6060606 | 0.1428571 | 0.6129032 |
2) Frequency and proportion tables using table()
Contingency table of vehicle class by number of cylinders
table(mpg$class, mpg$cyl)
4 5 6 8
2seater 0 0 0 5
compact 32 2 13 0
midsize 16 0 23 2
minivan 1 0 10 0
pickup 3 0 10 20
subcompact 21 2 7 5
suv 8 0 16 38
Table frequency
mpg_table<- table(mpg$class, mpg$cyl) #define object w/table parameters for simple calling
ftable(mpg_table) 4 5 6 8
2seater 0 0 0 5
compact 32 2 13 0
midsize 16 0 23 2
minivan 1 0 10 0
pickup 3 0 10 20
subcompact 21 2 7 5
suv 8 0 16 38
Row frequencies
margin.table(mpg_table, 1)
2seater compact midsize minivan pickup subcompact suv
5 47 41 11 33 35 62
Column frequencies
margin.table(mpg_table, 2)
4 5 6 8
81 4 79 70
Proportion of the entire table
prop.table(mpg_table)
4 5 6 8
2seater 0.000000000 0.000000000 0.000000000 0.021367521
compact 0.136752137 0.008547009 0.055555556 0.000000000
midsize 0.068376068 0.000000000 0.098290598 0.008547009
minivan 0.004273504 0.000000000 0.042735043 0.000000000
pickup 0.012820513 0.000000000 0.042735043 0.085470085
subcompact 0.089743590 0.008547009 0.029914530 0.021367521
suv 0.034188034 0.000000000 0.068376068 0.162393162
Row proportions
prop.table(mpg_table, 1)
4 5 6 8
2seater 0.00000000 0.00000000 0.00000000 1.00000000
compact 0.68085106 0.04255319 0.27659574 0.00000000
midsize 0.39024390 0.00000000 0.56097561 0.04878049
minivan 0.09090909 0.00000000 0.90909091 0.00000000
pickup 0.09090909 0.00000000 0.30303030 0.60606061
subcompact 0.60000000 0.05714286 0.20000000 0.14285714
suv 0.12903226 0.00000000 0.25806452 0.61290323
Column proportions
prop.table(mpg_table, 2)
4 5 6 8
2seater 0.00000000 0.00000000 0.00000000 0.07142857
compact 0.39506173 0.50000000 0.16455696 0.00000000
midsize 0.19753086 0.00000000 0.29113924 0.02857143
minivan 0.01234568 0.00000000 0.12658228 0.00000000
pickup 0.03703704 0.00000000 0.12658228 0.28571429
subcompact 0.25925926 0.50000000 0.08860759 0.07142857
suv 0.09876543 0.00000000 0.20253165 0.54285714
3) Frequency, proportion and chi-square values using CrossTable()
CrossTable(mpg$class, mpg$cyl)
Cell Contents
|-------------------------|
| N |
| Chi-square contribution |
| N / Row Total |
| N / Col Total |
| N / Table Total |
|-------------------------|
Total Observations in Table: 234
| mpg$cyl
mpg$class | 4 | 5 | 6 | 8 | Row Total |
-------------|-----------|-----------|-----------|-----------|-----------|
2seater | 0 | 0 | 0 | 5 | 5 |
| 1.731 | 0.085 | 1.688 | 8.210 | |
| 0.000 | 0.000 | 0.000 | 1.000 | 0.021 |
| 0.000 | 0.000 | 0.000 | 0.071 | |
| 0.000 | 0.000 | 0.000 | 0.021 | |
-------------|-----------|-----------|-----------|-----------|-----------|
compact | 32 | 2 | 13 | 0 | 47 |
| 15.210 | 1.782 | 0.518 | 14.060 | |
| 0.681 | 0.043 | 0.277 | 0.000 | 0.201 |
| 0.395 | 0.500 | 0.165 | 0.000 | |
| 0.137 | 0.009 | 0.056 | 0.000 | |
-------------|-----------|-----------|-----------|-----------|-----------|
midsize | 16 | 0 | 23 | 2 | 41 |
| 0.230 | 0.701 | 6.059 | 8.591 | |
| 0.390 | 0.000 | 0.561 | 0.049 | 0.175 |
| 0.198 | 0.000 | 0.291 | 0.029 | |
| 0.068 | 0.000 | 0.098 | 0.009 | |
-------------|-----------|-----------|-----------|-----------|-----------|
minivan | 1 | 0 | 10 | 0 | 11 |
| 2.070 | 0.188 | 10.641 | 3.291 | |
| 0.091 | 0.000 | 0.909 | 0.000 | 0.047 |
| 0.012 | 0.000 | 0.127 | 0.000 | |
| 0.004 | 0.000 | 0.043 | 0.000 | |
-------------|-----------|-----------|-----------|-----------|-----------|
pickup | 3 | 0 | 10 | 20 | 33 |
| 6.211 | 0.564 | 0.117 | 10.391 | |
| 0.091 | 0.000 | 0.303 | 0.606 | 0.141 |
| 0.037 | 0.000 | 0.127 | 0.286 | |
| 0.013 | 0.000 | 0.043 | 0.085 | |
-------------|-----------|-----------|-----------|-----------|-----------|
subcompact | 21 | 2 | 7 | 5 | 35 |
| 6.515 | 3.284 | 1.963 | 2.858 | |
| 0.600 | 0.057 | 0.200 | 0.143 | 0.150 |
| 0.259 | 0.500 | 0.089 | 0.071 | |
| 0.090 | 0.009 | 0.030 | 0.021 | |
-------------|-----------|-----------|-----------|-----------|-----------|
suv | 8 | 0 | 16 | 38 | 62 |
| 8.444 | 1.060 | 1.162 | 20.403 | |
| 0.129 | 0.000 | 0.258 | 0.613 | 0.265 |
| 0.099 | 0.000 | 0.203 | 0.543 | |
| 0.034 | 0.000 | 0.068 | 0.162 | |
-------------|-----------|-----------|-----------|-----------|-----------|
Column Total | 81 | 4 | 79 | 70 | 234 |
| 0.346 | 0.017 | 0.338 | 0.299 | |
-------------|-----------|-----------|-----------|-----------|-----------|
4) Exploratory
chisq2 <- chisq.test(mpg_table)Warning in chisq.test(mpg_table): Chi-squared approximation may be incorrect
chisq2
Pearson's Chi-squared test
data: mpg_table
X-squared = 138.03, df = 18, p-value < 2.2e-16
# Observed counts
chisq2$observed
4 5 6 8
2seater 0 0 0 5
compact 32 2 13 0
midsize 16 0 23 2
minivan 1 0 10 0
pickup 3 0 10 20
subcompact 21 2 7 5
suv 8 0 16 38
# Expected counts
round(chisq2$expected,2)
4 5 6 8
2seater 1.73 0.09 1.69 1.50
compact 16.27 0.80 15.87 14.06
midsize 14.19 0.70 13.84 12.26
minivan 3.81 0.19 3.71 3.29
pickup 11.42 0.56 11.14 9.87
subcompact 12.12 0.60 11.82 10.47
suv 21.46 1.06 20.93 18.55
round(chisq2$residuals, 3)
4 5 6 8
2seater -1.316 -0.292 -1.299 2.865
compact 3.900 1.335 -0.720 -3.750
midsize 0.480 -0.837 2.462 -2.931
minivan -1.439 -0.434 3.262 -1.814
pickup -2.492 -0.751 -0.342 3.224
subcompact 2.553 1.812 -1.401 -1.691
suv -2.906 -1.029 -1.078 4.517
corrplot(chisq2$residuals, is.cor = FALSE)Contibution in percentage (%)
contrib2 <- 100*chisq2$residuals^2/chisq2$statistic
round(contrib2, 3)
4 5 6 8
2seater 1.254 0.062 1.223 5.948
compact 11.020 1.291 0.375 10.186
midsize 0.167 0.508 4.390 6.224
minivan 1.500 0.136 7.709 2.384
pickup 4.500 0.409 0.085 7.528
subcompact 4.720 2.379 1.422 2.070
suv 6.117 0.768 0.842 14.782
Visualise the contribution
corrplot(contrib2, is.cor = FALSE)