── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Problem A
# One output row per state: summary statistics of poptotal over that state's
# rows in the midwest dataset.
midwest %>%
  group_by(state) %>%
  summarize(
    # Centre of the distribution
    poptotalmean = mean(poptotal),
    poptotalmed = median(poptotal),
    # Extremes
    popmax = max(poptotal),
    popmin = min(poptotal),
    # Number of unique poptotal values within the state
    popdistinct = n_distinct(poptotal),
    # poptotal of the state's first row, in the data's existing row order
    popfirst = first(poptotal),
    # TRUE when at least one row is below 5,000 / above 2,000,000
    popany = any(poptotal < 5000),
    popany2 = any(poptotal > 2000000)
  ) %>%
  # Clear any grouping so it cannot affect later steps
  ungroup()
# Problem B
# Per state: how many rows have a very small or very large total population,
# and how many rows there are in total.
midwest %>%
  group_by(state) %>%
  summarize(
    # sum() over a logical vector counts the TRUE values
    num5k = sum(poptotal < 5000),
    num2mil = sum(poptotal > 2000000),
    # Total number of rows in the state
    numrows = n()
  ) %>%
  # Clear any grouping so it cannot affect later steps
  ungroup()
# A tibble: 5 × 4
state num5k num2mil numrows
<chr> <int> <int> <int>
1 IL 1 1 102
2 IN 0 0 92
3 MI 1 1 83
4 OH 0 0 88
5 WI 2 0 72
# Problem C
# Part 1
# For each county name, count how many different states contain a county with
# that name, then list the most widely shared names first.
midwest %>%
  group_by(county) %>%
  summarize(x = n_distinct(state)) %>%
  arrange(desc(x)) %>%
  # Clear any grouping so it cannot affect later steps
  ungroup()
# A tibble: 320 × 2
county x
<chr> <int>
1 CRAWFORD 5
2 JACKSON 5
3 MONROE 5
4 ADAMS 4
5 BROWN 4
6 CLARK 4
7 CLINTON 4
8 JEFFERSON 4
9 LAKE 4
10 WASHINGTON 4
# ℹ 310 more rows
# Part 2
# For each county name, count the number of rows in midwest carrying that name.
midwest %>%
  group_by(county) %>%
  summarize(x = n()) %>%
  # Clear any grouping so it cannot affect later steps
  ungroup()
# A tibble: 320 × 2
county x
<chr> <int>
1 ADAMS 4
2 ALCONA 1
3 ALEXANDER 1
4 ALGER 1
5 ALLEGAN 1
6 ALLEN 2
7 ALPENA 1
8 ANTRIM 1
9 ARENAC 1
10 ASHLAND 2
# ℹ 310 more rows
# Part 3
# Count the distinct county names inside each county group. Every group holds
# exactly one county value by construction, so x is 1 in every row.
midwest %>%
  group_by(county) %>%
  summarize(x = n_distinct(county)) %>%
  # Clear any grouping so it cannot affect later steps
  ungroup()
# A tibble: 320 × 2
county x
<chr> <int>
1 ADAMS 1
2 ALCONA 1
3 ALEXANDER 1
4 ALGER 1
5 ALLEGAN 1
6 ALLEN 1
7 ALPENA 1
8 ANTRIM 1
9 ARENAC 1
10 ASHLAND 1
# ℹ 310 more rows
# Problem D
# Per clarity level: number of distinct colors, number of distinct prices, and
# number of diamonds.
diamonds %>%
  group_by(clarity) %>%
  summarize(
    a = n_distinct(color),
    b = n_distinct(price),
    c = n()
  ) %>%
  # Clear any grouping so it cannot affect later steps
  ungroup()
# Problem E
# Part 1
# Mean and standard deviation of price for every color/cut combination.
diamonds %>%
  group_by(color, cut) %>%
  summarize(
    m = mean(price),
    s = sd(price)
  ) %>%
  # summarize() only drops the last grouping variable (cut), so the result is
  # still grouped by color; ungroup() removes that remaining grouping.
  ungroup()
`summarise()` has grouped output by 'color'. You can override using the
`.groups` argument.
# A tibble: 35 × 4
color cut m s
<ord> <ord> <dbl> <dbl>
1 D Fair 4291. 3286.
2 D Good 3405. 3175.
3 D Very Good 3470. 3524.
4 D Premium 3631. 3712.
5 D Ideal 2629. 3001.
6 E Fair 3682. 2977.
7 E Good 3424. 3331.
8 E Very Good 3215. 3408.
9 E Premium 3539. 3795.
10 E Ideal 2598. 2956.
# ℹ 25 more rows
# Part 2
# Mean and standard deviation of price for every cut/color combination. The
# only difference from Part 1 is the order of the grouping variables, which
# changes the column order and row order of the result.
diamonds %>%
  group_by(cut, color) %>%
  summarize(
    m = mean(price),
    s = sd(price)
  ) %>%
  # summarize() only drops the last grouping variable (color), so the result is
  # still grouped by cut; ungroup() removes that remaining grouping.
  ungroup()
`summarise()` has grouped output by 'cut'. You can override using the `.groups`
argument.
# A tibble: 35 × 4
cut color m s
<ord> <ord> <dbl> <dbl>
1 Fair D 4291. 3286.
2 Fair E 3682. 2977.
3 Fair F 3827. 3223.
4 Fair G 4239. 3610.
5 Fair H 5136. 3886.
6 Fair I 4685. 3730.
7 Fair J 4976. 4050.
8 Good D 3405. 3175.
9 Good E 3424. 3331.
10 Good F 3496. 3202.
# ℹ 25 more rows
# Part 3
# Mean and standard deviation of price for every cut/color/clarity combination
# (Parts 1 and 2 with clarity added), plus the mean price after a 20% discount.
diamonds %>%
  group_by(cut, color, clarity) %>%
  summarize(
    m = mean(price),
    s = sd(price),
    # Uses m, computed just above in the same summarize() call
    msale = m * 0.80
  ) %>%
  # The result is still grouped by cut and color; ungroup() removes that.
  ungroup()
`summarise()` has grouped output by 'cut', 'color'. You can override using the
`.groups` argument.
# A tibble: 276 × 6
cut color clarity m s msale
<ord> <ord> <ord> <dbl> <dbl> <dbl>
1 Fair D I1 7383 5899. 5906.
2 Fair D SI2 4355. 3260. 3484.
3 Fair D SI1 4273. 3019. 3419.
4 Fair D VS2 4513. 3383. 3610.
5 Fair D VS1 2921. 2550. 2337.
6 Fair D VVS2 3607 3629. 2886.
7 Fair D VVS1 4473 5457. 3578.
8 Fair D IF 1620. 525. 1296.
9 Fair E I1 2095. 824. 1676.
10 Fair E SI2 4172. 3055. 3338.
# ℹ 266 more rows
# Problem F
# Per cut: several summaries, some of which build on earlier columns created
# in the same summarize() call.
diamonds %>%
  group_by(cut) %>%
  summarize(
    # Mean depth and mean price within the cut
    potato = mean(depth),
    pizza = mean(price),
    # Median of the y variable within the cut
    popcorn = median(y),
    # Mean depth minus mean price, then that difference squared
    pineapple = potato - pizza,
    papaya = pineapple^2,
    # Number of diamonds with this cut
    peach = n()
  ) %>%
  # Clear any grouping so it cannot affect later steps
  ungroup()
# Problem G
# Part 1
# Mean price per color, then two extra columns; ungroup() comes last.
diamonds %>%
  group_by(color) %>%
  summarize(m = mean(price)) %>%
  mutate(
    # Label made of the text "Diamond color " followed by the color
    x1 = str_c("Diamond color ", color),
    # Constant column: every row gets 5
    x2 = 5
  ) %>%
  ungroup()
# A tibble: 7 × 4
color m x1 x2
<ord> <dbl> <chr> <dbl>
1 D 3170. Diamond color D 5
2 E 3077. Diamond color E 5
3 F 3725. Diamond color F 5
4 G 3999. Diamond color G 5
5 H 4487. Diamond color H 5
6 I 5092. Diamond color I 5
7 J 5324. Diamond color J 5
# Part 2
# Mean price per color, then ungroup() before the two extra columns are added.
diamonds %>%
  group_by(color) %>%
  summarize(m = mean(price)) %>%
  ungroup() %>%
  mutate(
    # Label made of the text "Diamond color " followed by the color
    x1 = str_c("Diamond color ", color),
    # Constant column: every row gets 5
    x2 = 5
  )
# A tibble: 7 × 4
color m x1 x2
<ord> <dbl> <chr> <dbl>
1 D 3170. Diamond color D 5
2 E 3077. Diamond color E 5
3 F 3725. Diamond color F 5
4 G 3999. Diamond color G 5
5 H 4487. Diamond color H 5
6 I 5092. Diamond color I 5
7 J 5324. Diamond color J 5
# Hint (Problem G): Parts 1 and 2 return the same tibble. ungroup() only
# removes grouping so that it cannot affect later steps, and here there is
# nothing left to remove: summarize() already dropped the single grouping
# variable (color). The mutate() expressions also work row by row (str_c() on
# color, a constant 5), so they would not depend on grouping either way.

# Problem H
# Part 1
# Halve every price, then average the halved prices within each color
# (one row per color).
diamonds %>%
  group_by(color) %>%
  mutate(x1 = price * 0.5) %>%
  summarize(m = mean(x1)) %>%
  # Clear any grouping so it cannot affect later steps
  ungroup()
# A tibble: 7 × 2
color m
<ord> <dbl>
1 D 1585.
2 E 1538.
3 F 1862.
4 G 2000.
5 H 2243.
6 I 2546.
7 J 2662.
# Part 2
# Halve every price, remove the grouping, then average the halved prices.
# Because ungroup() runs before summarize(), the mean is taken over the whole
# dataset and the result is a single row.
diamonds %>%
  group_by(color) %>%
  mutate(x1 = price * 0.5) %>%
  ungroup() %>%
  summarize(m = mean(x1))
# A tibble: 1 × 1
m
<dbl>
1 1966.
Why is grouping data necessary?
Grouping in R splits a large, mixed dataset into labelled subsets so that each subset can be analyzed on its own, rather than facing one huge body of data with no obvious place to start. For example, the diamonds dataset contains diamonds of different colors; by grouping by color, you can calculate the average price, the count, and so on separately for each color. In this way, you can see not only how the data behaves overall but also what is distinctive about each color. In addition, grouping lets users analyze several variables at the same time, such as grouping by both color and cut to see how each combination affects price. Finally, grouped summaries make it easier to create charts that show the differences between categories and help users make clearer judgments.
Why is ungrouping data necessary?
I think of ungrouping in terms of a large collection of books. Sorting the books by category onto different bookshelves is the equivalent of group_by(). You can then compute statistics for each shelf, such as the number of books in each category or the mean and standard deviation of their thickness. But once you’ve done that, if you want to know the total number of books or perform any other operation on the whole collection, you can’t keep the books on these separate shelves. In this case, ungroup() is like taking the books off their respective shelves and putting them back together, so that subsequent operations work on the entire stack rather than on a single shelf. So ungroup() lets you avoid errors in later analysis that would occur if the data were still grouped.
When should you ungroup data?
Using ungroup() is like tidying up after finishing a specific task with your data. Imagine you split your data into groups to calculate something, like averages or totals for each group. Once you’ve done this grouped work, you might want to perform some operations on the entire dataset, not just the groups. At this point, you don’t want the data to stay divided into groups. This is when ungroup() is useful—it clears the grouping, signaling to R that you’ve finished working on the individual groups and now want to treat the dataset as a whole again for the next steps. This ensures any further calculations apply to the entire dataset rather than being restricted to the groups.
If the code does not contain group_by(), do you still need ungroup() at the end? For example, does data() %>% mutate(newVar = 1 + 2) require ungroup()?
If group_by() is not used in the code, then ungroup() is not needed, because ungroup() is used to cancel the grouping operation. ungroup() is not required for the code “data() %>% mutate(newVar = 1 + 2)”.
For 6.7 Extra Practice
# 1. View all of the variable names in diamonds
# names() returns the column names directly. View() instead opens a
# spreadsheet-style viewer of the whole dataset and is meant for interactive
# sessions, so it is not the right tool for listing variable names in a
# rendered document.
names(diamonds)

# 2. Arrange the diamonds by:
# price, lowest to highest (arrange() sorts ascending by default)
diamonds %>%
  arrange(price)
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
# ℹ 53,930 more rows
# price, highest to lowest (desc() reverses the sort order)
diamonds %>%
  arrange(desc(price))
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 2.29 Premium I VS2 60.8 60 18823 8.5 8.47 5.16
2 2 Very Good G SI1 63.5 56 18818 7.9 7.97 5.04
3 1.51 Ideal G IF 61.7 55 18806 7.37 7.41 4.56
4 2.07 Ideal G SI2 62.5 55 18804 8.2 8.13 5.11
5 2 Very Good H SI1 62.8 57 18803 7.95 8 5.01
6 2.29 Premium I SI1 61.8 59 18797 8.52 8.45 5.24
7 2.04 Premium H SI1 58.1 60 18795 8.37 8.28 4.84
8 2 Premium I VS1 60.8 59 18795 8.13 8.02 4.91
9 1.71 Premium F VS2 62.3 59 18791 7.57 7.53 4.7
10 2.15 Ideal G SI2 62.6 54 18791 8.29 8.35 5.21
# ℹ 53,930 more rows
# price ascending, with ties in price broken by cut (in the order of cut's
# factor levels)
diamonds %>%
  arrange(price, cut)
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
2 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
8 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
9 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
# ℹ 53,930 more rows
# price descending, with ties in price broken by cut (in the order of cut's
# factor levels)
diamonds %>%
  arrange(desc(price), cut)
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 2.29 Premium I VS2 60.8 60 18823 8.5 8.47 5.16
2 2 Very Good G SI1 63.5 56 18818 7.9 7.97 5.04
3 1.51 Ideal G IF 61.7 55 18806 7.37 7.41 4.56
4 2.07 Ideal G SI2 62.5 55 18804 8.2 8.13 5.11
5 2 Very Good H SI1 62.8 57 18803 7.95 8 5.01
6 2.29 Premium I SI1 61.8 59 18797 8.52 8.45 5.24
7 2.04 Premium H SI1 58.1 60 18795 8.37 8.28 4.84
8 2 Premium I VS1 60.8 59 18795 8.13 8.02 4.91
9 1.71 Premium F VS2 62.3 59 18791 7.57 7.53 4.7
10 2.15 Ideal G SI2 62.6 54 18791 8.29 8.35 5.21
# ℹ 53,930 more rows
# 3. Arrange the diamonds by lowest to highest price and worst to best clarity.
# clarity is an ordered factor, so an ascending sort follows its level order.
diamonds %>%
  arrange(price, clarity)
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
# ℹ 53,930 more rows
# 4. Create a new variable named salePrice to reflect a discount of $250 off
# of the original cost of each diamond (hint: mutate()).
diamonds %>%
  mutate(salePrice = price - 250)
# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z salePrice
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 76
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 76
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 77
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 84
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 85
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 86
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 86
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 87
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 87
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 88
# ℹ 53,930 more rows
# 5. Remove the x, y, and z variables from the diamonds dataset
# (hint: select()).
# A minus sign in select() keeps every column except the ones listed.
diamonds %>%
  select(-c(x, y, z))
# A tibble: 53,940 × 7
carat cut color clarity depth table price
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int>
1 0.23 Ideal E SI2 61.5 55 326
2 0.21 Premium E SI1 59.8 61 326
3 0.23 Good E VS1 56.9 65 327
4 0.29 Premium I VS2 62.4 58 334
5 0.31 Good J SI2 63.3 58 335
6 0.24 Very Good J VVS2 62.8 57 336
7 0.24 Very Good I VVS1 62.3 57 336
8 0.26 Very Good H SI1 61.9 55 337
9 0.22 Fair E VS2 65.1 61 337
10 0.23 Very Good H VS1 59.4 61 338
# ℹ 53,930 more rows
# 6. Determine the number of diamonds there are for each cut value
# (hint: group_by(), summarize()).
diamonds %>%
  group_by(cut) %>%
  # n() counts the rows in the current group
  summarize(count = n())
# A tibble: 5 × 2
cut count
<ord> <int>
1 Fair 1610
2 Good 4906
3 Very Good 12082
4 Premium 13791
5 Ideal 21551
# 7. Create a new column named totalNum that calculates the total number of
# diamonds.
# The data is not grouped, so n() is the row count of the whole dataset and
# the same value is repeated in every row.
diamonds %>%
  mutate(totalNum = n())
# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z totalNum
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 53940
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 53940
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 53940
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 53940
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 53940
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 53940
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 53940
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 53940
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 53940
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 53940
# ℹ 53,930 more rows