library(tidyverse)RM+DA Quarto Demo
Quarto
Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto see https://quarto.org.
We can write in italics, and also bullet points:
one
two.
Running Code
When you click the Render button a document will be generated that includes both content and the output of embedded code. You can embed code like this:
Include a plot
Code
# Bar chart of the diamonds data: geom_bar() counts the rows in each cut category.
ggplot(diamonds, aes(x = cut)) +
geom_bar()This is a reference: Figure 1.
Exercises from R Data book for Graduates
library(tidyverse)
##Problem A
# Summarise poptotal within each state; the result has one row per state.
midwest %>%
group_by(state) %>%
# Centre and range of poptotal within the state.
summarise(poptotalmean = mean(poptotal),
poptotalmed = median(poptotal),
popmax = max(poptotal),
popmin = min(poptotal),
# Number of unique poptotal values in the group.
popdistinct = n_distinct(poptotal),
# poptotal of the first row of the group, in data order.
popfirst = first(poptotal),
# any() is TRUE when at least one row in the group meets the condition.
popany = any(poptotal < 5000),
popany2 = any(poptotal > 2000000)) %>%
ungroup()# A tibble: 5 × 9
state poptotalmean poptotalmed popmax popmin popdistinct popfirst popany
<chr> <dbl> <dbl> <int> <int> <int> <int> <lgl>
1 IL 112065. 24486. 5105067 4373 101 66090 TRUE
2 IN 60263. 30362. 797159 5315 92 31095 FALSE
3 MI 111992. 37308 2111687 1701 83 10145 TRUE
4 OH 123263. 54930. 1412140 11098 88 25371 FALSE
5 WI 67941. 33528 959275 3890 72 15682 TRUE
# ℹ 1 more variable: popany2 <lgl>
##Problem B
# Count, per state, how many rows meet each condition: sum() over a logical
# vector counts its TRUE values. n() is the total number of rows in the group.
midwest %>%
group_by(state) %>%
summarise(num5k = sum(poptotal < 5000),
num2mil = sum(poptotal > 2000000),
numrows = n()) %>%
ungroup()# A tibble: 5 × 4
state num5k num2mil numrows
<chr> <int> <int> <int>
1 IL 1 1 102
2 IN 0 0 92
3 MI 1 1 83
4 OH 0 0 88
5 WI 2 0 72
##Problem C
# part I
# For each county name, count how many different states have a county with
# that name, then sort so the most widely shared names come first.
midwest %>%
group_by(county) %>%
summarise(x = n_distinct(state)) %>%
arrange(desc(x)) %>%
ungroup()# A tibble: 320 × 2
county x
<chr> <int>
1 CRAWFORD 5
2 JACKSON 5
3 MONROE 5
4 ADAMS 4
5 BROWN 4
6 CLARK 4
7 CLINTON 4
8 JEFFERSON 4
9 LAKE 4
10 WASHINGTON 4
# ℹ 310 more rows
# part II
# How does n() differ from n_distinct()?
# When would they be the same? different?
# n() counts every row in the group, whereas n_distinct(state) counts unique
# states. The two agree whenever no state repeats within a county name
# (e.g. ADAMS gives 4 in both printed results).
midwest %>%
group_by(county) %>%
summarise (x = n()) %>%
ungroup()# A tibble: 320 × 2
county x
<chr> <int>
1 ADAMS 4
2 ALCONA 1
3 ALEXANDER 1
4 ALGER 1
5 ALLEGAN 1
6 ALLEN 2
7 ALPENA 1
8 ANTRIM 1
9 ARENAC 1
10 ASHLAND 2
# ℹ 310 more rows
# part III
# hint:
# - How many distinctly different counties are there for each county?
# - Can there be more than 1 (distinct) county in each county?
# - What if we replace 'county' with 'state'?
# Counting distinct values of the grouping column itself always gives 1,
# because every row in a group shares the same county value.
midwest%>%
group_by(county) %>%
summarise(x = n_distinct(county)) %>%
ungroup()# A tibble: 320 × 2
county x
<chr> <int>
1 ADAMS 1
2 ALCONA 1
3 ALEXANDER 1
4 ALGER 1
5 ALLEGAN 1
6 ALLEN 1
7 ALPENA 1
8 ANTRIM 1
9 ARENAC 1
10 ASHLAND 1
# ℹ 310 more rows
## Problem D
# Per clarity level: a = number of distinct colors, b = number of distinct
# prices, c = number of rows in the group.
diamonds %>%
group_by(clarity) %>%
summarise(a = n_distinct(color),
b = n_distinct(price),
c = n()) %>%
ungroup()# A tibble: 8 × 4
clarity a b c
<ord> <int> <int> <int>
1 I1 7 632 741
2 SI2 7 4904 9194
3 SI1 7 5380 13065
4 VS2 7 5051 12258
5 VS1 7 3926 8171
6 VVS2 7 2409 5066
7 VVS1 7 1623 3655
8 IF 7 902 1790
## Problem E
# part I
# Mean (m) and standard deviation (s) of price for each color/cut combination.
# With two grouping variables summarise() drops only the last one (cut), so
# the result is still grouped by color (see the printed message) until
# ungroup() removes that remaining grouping.
diamonds %>%
group_by(color, cut) %>%
summarise(m = mean(price),
s = sd(price)) %>%
ungroup()`summarise()` has grouped output by 'color'. You can override using the
`.groups` argument.
# A tibble: 35 × 4
color cut m s
<ord> <ord> <dbl> <dbl>
1 D Fair 4291. 3286.
2 D Good 3405. 3175.
3 D Very Good 3470. 3524.
4 D Premium 3631. 3712.
5 D Ideal 2629. 3001.
6 E Fair 3682. 2977.
7 E Good 3424. 3331.
8 E Very Good 3215. 3408.
9 E Premium 3539. 3795.
10 E Ideal 2598. 2956.
# ℹ 25 more rows
# part II
# Same summary with the grouping variables swapped. The per-combination
# values are unchanged; only the row order and the grouping left behind by
# summarise() (now cut) differ.
diamonds %>%
group_by(cut, color) %>%
summarise(m = mean(price),
s = sd(price)) %>%
ungroup()`summarise()` has grouped output by 'cut'. You can override using the `.groups`
argument.
# A tibble: 35 × 4
cut color m s
<ord> <ord> <dbl> <dbl>
1 Fair D 4291. 3286.
2 Fair E 3682. 2977.
3 Fair F 3827. 3223.
4 Fair G 4239. 3610.
5 Fair H 5136. 3886.
6 Fair I 4685. 3730.
7 Fair J 4976. 4050.
8 Good D 3405. 3175.
9 Good E 3424. 3331.
10 Good F 3496. 3202.
# ℹ 25 more rows
# part III
# hint:
# - How good is the sale if the price of diamonds equaled msale?
# - e.g. The diamonds are x% off the original price in msale.
# summarise() can reuse a column created earlier in the same call: msale is
# computed from m and equals 80% of the group's mean price.
diamonds %>%
group_by(cut, color, clarity) %>%
summarise(m = mean(price),
s = sd(price),
msale = m * 0.80) %>%
ungroup()`summarise()` has grouped output by 'cut', 'color'. You can override using the
`.groups` argument.
# A tibble: 276 × 6
cut color clarity m s msale
<ord> <ord> <ord> <dbl> <dbl> <dbl>
1 Fair D I1 7383 5899. 5906.
2 Fair D SI2 4355. 3260. 3484.
3 Fair D SI1 4273. 3019. 3419.
4 Fair D VS2 4513. 3383. 3610.
5 Fair D VS1 2921. 2550. 2337.
6 Fair D VVS2 3607 3629. 2886.
7 Fair D VVS1 4473 5457. 3578.
8 Fair D IF 1620. 525. 1296.
9 Fair E I1 2095. 824. 1676.
10 Fair E SI2 4172. 3055. 3338.
# ℹ 266 more rows
##Problem F
# Per-cut summaries. Later columns build on earlier ones within the same
# summarise() call: pineapple uses potato and pizza, papaya uses pineapple.
# peach is the number of rows in each cut.
diamonds %>%
group_by(cut) %>%
summarise(potato = mean(depth),
pizza = mean(price),
popcorn = median(y),
pineapple = potato - pizza,
papaya = pineapple ^ 2,
peach = n()) %>%
ungroup()# A tibble: 5 × 7
cut potato pizza popcorn pineapple papaya peach
<ord> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 Fair 64.0 4359. 6.1 -4295. 18444586. 1610
2 Good 62.4 3929. 5.99 -3866. 14949811. 4906
3 Very Good 61.8 3982. 5.77 -3920. 15365942. 12082
4 Premium 61.3 4584. 6.06 -4523. 20457466. 13791
5 Ideal 61.7 3458. 5.26 -3396. 11531679. 21551
## Problem G
# part I
# Mean price per color, then two extra columns added with mutate().
# summarise() already drops the single grouping variable, so mutate() works
# on the whole 7-row result; x2 = 5 is recycled to every row.
diamonds %>%
group_by(color) %>%
summarise(m = mean(price)) %>%
mutate(x1 = str_c("Diamond color ", color),
x2 = 5) %>%
ungroup()# A tibble: 7 × 4
color m x1 x2
<ord> <dbl> <chr> <dbl>
1 D 3170. Diamond color D 5
2 E 3077. Diamond color E 5
3 F 3725. Diamond color F 5
4 G 3999. Diamond color G 5
5 H 4487. Diamond color H 5
6 I 5092. Diamond color I 5
7 J 5324. Diamond color J 5
# part II
# What does the first ungroup () do? Is it useful here? Why/why not?
# Why isn't there a closing ungroup() after the mutate()?
# Here ungroup() runs before mutate(), so mutate() is guaranteed to see
# ungrouped data and nothing is left to ungroup afterwards. The printed
# result is identical to part I.
diamonds %>%
group_by(color) %>%
summarise(m = mean(price)) %>%
ungroup() %>%
mutate(x1 = str_c("Diamond color ", color),
x2 = 5)# A tibble: 7 × 4
color m x1 x2
<ord> <dbl> <chr> <dbl>
1 D 3170. Diamond color D 5
2 E 3077. Diamond color E 5
3 F 3725. Diamond color F 5
4 G 3999. Diamond color G 5
5 H 4487. Diamond color H 5
6 I 5092. Diamond color I 5
7 J 5324. Diamond color J 5
## Problem H
# part I
# mutate() keeps the grouping by color, so the following summarise() returns
# one mean of the halved price (x1) per color.
diamonds %>%
group_by(color) %>%
mutate(x1 = price * 0.5) %>%
summarise(m = mean(x1)) %>%
ungroup()# A tibble: 7 × 2
color m
<ord> <dbl>
1 D 1585.
2 E 1538.
3 F 1862.
4 G 2000.
5 H 2243.
6 I 2546.
7 J 2662.
# part II
# What's the difference between part I and part II?
# ungroup() now comes before summarise(), so the grouping is gone when the
# mean is taken and the result is a single overall mean (one row).
diamonds%>%
group_by(color) %>%
mutate(x1 = price * 0.5) %>%
ungroup() %>%
summarise(m = mean(x1))# A tibble: 1 × 1
m
<dbl>
1 1966.
library(tidyverse)
# view() opens the data in the viewer when run interactively.
view(diamonds)
# Grouping and immediately ungrouping leaves the data as it was: group_by()
# neither sorts nor changes any values, it only affects verbs that run while
# the data is grouped.
diamonds %>%
group_by(price)%>%
ungroup()# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
# ℹ 53,930 more rows
# group_by() accepts expressions: desc(price) is evaluated and stored as a new
# column named `desc(price)` (the negated price in the printed output), and
# that column stays after ungroup(). The rows are not reordered.
diamonds %>%
group_by(desc(price))%>%
ungroup()# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z `desc(price)`
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 -326
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 -326
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 -327
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 -334
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 -335
6 0.24 Very G… J VVS2 62.8 57 336 3.94 3.96 2.48 -336
7 0.24 Very G… I VVS1 62.3 57 336 3.95 3.98 2.47 -336
8 0.26 Very G… H SI1 61.9 55 337 4.07 4.11 2.53 -337
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 -337
10 0.23 Very G… H VS1 59.4 61 338 4 4.05 2.39 -338
# ℹ 53,930 more rows
# Grouping by two existing columns and ungrouping again also returns the data
# unchanged (still 10 columns, same row order).
diamonds %>%
group_by(price, cut)%>%
ungroup()# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
# ℹ 53,930 more rows
# -price is an expression, not a column selection, so group_by() adds a new
# `-price` column holding the negated price, which is kept after ungroup().
diamonds %>%
group_by(-price, cut)%>%
ungroup()# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z `-price`
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 -326
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 -326
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 -327
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 -334
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 -335
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 -336
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 -336
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 -337
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 -337
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 -338
# ℹ 53,930 more rows
# clarity is an ordered factor, for which unary minus is not meaningful: R
# warns and the new `-clarity` column is NA on every row.
diamonds %>%
group_by(price, -clarity)%>%
ungroup()Warning: There was 1 warning in `group_by()`.
ℹ In argument: `-clarity`.
Caused by warning in `Ops.ordered()`:
! '-' is not meaningful for ordered factors
# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z `-clarity`
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <lgl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 NA
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 NA
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 NA
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 NA
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 NA
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 NA
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 NA
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 NA
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 NA
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 NA
# ℹ 53,930 more rows
library(tidyverse)
# Add a salePrice column that is 250 below price on every row. No group_by()
# is involved, so there is nothing to ungroup.
diamonds %>%
mutate(salePrice = price - 250)# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z salePrice
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 76
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 76
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 77
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 84
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 85
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 86
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 86
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 87
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 87
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 88
# ℹ 53,930 more rows
library(tidyverse)
# Drop the x, y and z columns; a minus sign inside select() means "exclude".
diamonds %>%
select(-x, -y, -z) # A tibble: 53,940 × 7
carat cut color clarity depth table price
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int>
1 0.23 Ideal E SI2 61.5 55 326
2 0.21 Premium E SI1 59.8 61 326
3 0.23 Good E VS1 56.9 65 327
4 0.29 Premium I VS2 62.4 58 334
5 0.31 Good J SI2 63.3 58 335
6 0.24 Very Good J VVS2 62.8 57 336
7 0.24 Very Good I VVS1 62.3 57 336
8 0.26 Very Good H SI1 61.9 55 337
9 0.22 Fair E VS2 65.1 61 337
10 0.23 Very Good H VS1 59.4 61 338
# ℹ 53,930 more rows
library(tidyverse)
# Number of diamonds per cut. summarise() drops the single grouping variable,
# so the result is already ungrouped.
diamonds %>%
group_by(cut) %>%
summarise(count = n())# A tibble: 5 × 2
cut count
<ord> <int>
1 Fair 1610
2 Good 4906
3 Very Good 12082
4 Premium 13791
5 Ideal 21551
library(tidyverse)
# Without grouping, n() is the total number of rows in the data, so totalNum
# repeats the same value (53940) on every row.
diamonds %>%
mutate(totalNum = n())# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z totalNum
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 53940
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 53940
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 53940
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 53940
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 53940
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 53940
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 53940
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 53940
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 53940
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 53940
# ℹ 53,930 more rows
Why is grouping data necessary?
It allows you to summarise data for specific categories
It helps to organise data in a way that is meaningful, easy to understand and analyse.
It allows for complex analyses of data by providing a framework for comparing groups side by side. This is essential for exploratory data analysis.
Grouping also makes operations more efficient by allowing R to process subsets of data at a time instead of the entire dataset. This is beneficial when working with a large dataset.
It facilitates the visualisation of data. For example, if you wanted a bar chart showing the count of diamonds for each cut, the data would first have to be grouped and summarised.
In short, grouping data is a crucial step in data manipulation and analysis that enhances clarity, organisation, and the ability to perform specific calculations efficiently.
Why is ungrouping data important?
Ungrouping data is important for a number of reasons:
Returning to the original structure - this helps when you want to perform subsequent analyses or manipulations without the grouping affecting the results.
Avoiding unintended consequences - if you forget to ungroup data after a grouped operation, subsequent calculations or transformations may be inadvertently applied to the group data, leading to unexpected results.
Facilitating other operations - certain functions and operations expect a regular data frame, not a grouped one. Ungrouping allows you to apply functions that may not behave correctly on grouped data.
Simplifying further analysis - once data has been summarised, you may want to perform further analysis on the resulting data frame. Ungrouping ensures that you’re working with the data in a straightforward manner.
Clarity in code - ungrouping can improve code readability. It makes it clear to anyone reading the code that the intention is to work with a regular data frame, rather than one that retains grouping attributes.
In short, ungrouping is a necessary step to ensure that subsequent operations are performed correctly, and to maintain clarity and structure in the data analysis workflow.
When should you ungroup data?
Data should be ungrouped:
After summarising - once data has been summarised, it should be ungrouped so that further operations that should not be grouped can be performed.
Before additional transformations - if transformations are going to be applied (e.g., mutate) that will apply to the entire dataset rather than within groups, it would be best to ungroup data first.
When you no longer need grouping - if your analysis requires a flat data structure (e.g., when plotting or exporting data), ungrouping helps achieve that.
To avoid errors - if you notice that subsequent calculations are behaving unexpectedly or producing errors due to residual grouping, ungrouping can resolve these issues.
To improve code clarity - if your code is becoming complex, ungrouping can make it clearer where the data is being grouped and when it's being treated as a standard data frame.
In short, ungrouping is good practice after you have completed the intended grouped operations, ensuring that workflow remains clear and that results are as expected.
If the code does not contain group_by, do you still need to ungroup at the end?
No, because the data has not been grouped. Ungroup should only be used when the group_by function has been performed.
Andrew’s data
library(tidyverse)
library(modeldata)
View(crickets)
# Plots of the crickets data: chirp rate (rate) against temperature (temp),
# overall and by species. Each plot is its own top-level statement so the
# script parses; console messages from the original run are kept as `#>`
# comments.

# The basics
# Scatter plot of chirp rate against temperature.
ggplot(crickets, aes(x = temp,
                     y = rate)) +
  geom_point() +
  labs(x = "Temperature",
       y = "Chirp rate",
       title = "Cricket chirps",
       caption = "Source: McDonald (2009)")

# Same scatter plot with points coloured by species.
ggplot(crickets, aes(x = temp,
                     y = rate,
                     color = species)) +
  geom_point() +
  labs(x = "Temperature",
       y = "Chirp rate",
       color = "Species",
       title = "Cricket chirps",
       caption = "Source: McDonald (2009)") +
  scale_color_brewer(palette = "Dark2")

# Modifying basic properties of the plot
# Fixed aesthetics go outside aes(): they apply to every point.
ggplot(crickets, aes(x = temp,
                     y = rate)) +
  geom_point(color = "red",
             size = 2,
             alpha = .4,
             shape = "square") +
  labs(x = "Temperature",
       y = "Chirp rate",
       title = "Cricket chirps",
       caption = "Source: McDonald (2009)")

# Add a single linear regression line without its confidence band.
ggplot(crickets, aes(x = temp,
                     y = rate)) +
  geom_point() +
  geom_smooth(method = "lm",
              se = FALSE) +
  labs(x = "Temperature",
       y = "Chirp rate",
       title = "Cricket chirps",
       caption = "Source: McDonald (2009)")
#> `geom_smooth()` using formula = 'y ~ x'

# Mapping color to species in ggplot() gives one regression line per species.
ggplot(crickets, aes(x = temp,
                     y = rate,
                     color = species)) +
  geom_point() +
  geom_smooth(method = "lm",
              se = FALSE) +
  labs(x = "Temperature",
       y = "Chirp rate",
       color = "Species",
       title = "Cricket chirps",
       caption = "Source: McDonald (2009)") +
  scale_color_brewer(palette = "Dark2")
#> `geom_smooth()` using formula = 'y ~ x'

# Other plots by Andrew
ggplot(crickets, aes(x = rate)) +
  geom_histogram(bins = 15) # one quantitative variable

ggplot(crickets, aes(x = rate)) +
  geom_freqpoly(bins = 15)

# Bar chart of the number of observations per species.
ggplot(crickets, aes(x = species)) +
  geom_bar(color = "black",
           fill = "lightblue")

# Bars filled by species; the legend would duplicate the x axis, so hide it.
ggplot(crickets, aes(x = species,
                     fill = species)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_brewer(palette = "Dark2")

# Box plots of chirp rate for each species.
ggplot(crickets, aes(x = species,
                     y = rate,
                     color = species)) +
  geom_boxplot(show.legend = FALSE) +
  scale_color_brewer(palette = "Dark2") +
  theme_minimal()

# Faceting
# Not great:
ggplot(crickets, aes(x = rate,
                     fill = species)) +
  geom_histogram(bins = 15) +
  scale_fill_brewer(palette = "Dark2")

# One histogram panel per species instead of a single combined histogram.
ggplot(crickets, aes(x = rate,
                     fill = species)) +
  geom_histogram(bins = 15,
                 show.legend = FALSE) +
  facet_wrap(~species) +
  scale_fill_brewer(palette = "Dark2")

# Panels stacked in one column so they share the x axis.
ggplot(crickets, aes(x = rate,
                     fill = species)) +
  geom_histogram(bins = 15,
                 show.legend = FALSE) +
  facet_wrap(~species,
             ncol = 1) +
  scale_fill_brewer(palette = "Dark2") +
  theme_minimal()