week 3

Author

Qiuyang Zhang

Published

October 11, 2024

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Problem A
# Problem A: per-state summary statistics of poptotal in the midwest data.
midwest %>%
  group_by(state) %>%
  summarise(
    # Mean and median of poptotal within each state
    poptotalmean = mean(poptotal),
    poptotalmed = median(poptotal),
    # Largest and smallest poptotal within each state
    popmax = max(poptotal),
    popmin = min(poptotal),
    # Number of distinct poptotal values within each state
    popdistinct = n_distinct(poptotal),
    # poptotal of the first row listed for each state
    popfirst = first(poptotal),
    # TRUE when at least one row in the state has poptotal below 5,000
    popany = any(poptotal < 5000),
    # TRUE when at least one row in the state has poptotal above 2,000,000
    popany2 = any(poptotal > 2000000)
  ) %>%
  # Clear any remaining grouping so later steps act on the whole table
  ungroup()
# A tibble: 5 × 9
  state poptotalmean poptotalmed  popmax popmin popdistinct popfirst popany
  <chr>        <dbl>       <dbl>   <int>  <int>       <int>    <int> <lgl> 
1 IL         112065.      24486. 5105067   4373         101    66090 TRUE  
2 IN          60263.      30362.  797159   5315          92    31095 FALSE 
3 MI         111992.      37308  2111687   1701          83    10145 TRUE  
4 OH         123263.      54930. 1412140  11098          88    25371 FALSE 
5 WI          67941.      33528   959275   3890          72    15682 TRUE  
# ℹ 1 more variable: popany2 <lgl>
# Problem B
# Problem B: per-state row counts by population size.
# Summing a logical vector counts how many of its values are TRUE.
midwest %>%
  group_by(state) %>%
  summarise(
    num5k = sum(poptotal < 5000),       # rows with poptotal below 5,000
    num2mil = sum(poptotal > 2000000),  # rows with poptotal above 2,000,000
    numrows = n()                       # total number of rows in the state
  ) %>%
  # Clear any remaining grouping
  ungroup()
# A tibble: 5 × 4
  state num5k num2mil numrows
  <chr> <int>   <int>   <int>
1 IL        1       1     102
2 IN        0       0      92
3 MI        1       1      83
4 OH        0       0      88
5 WI        2       0      72
#Problem C
#Part 1
# For each county name, count how many different states contain a county
# with that name, then list the most widely shared names first.
midwest %>%
  group_by(county) %>%
  summarise(x = n_distinct(state)) %>%
  arrange(desc(x)) %>%
  # Clear any remaining grouping
  ungroup()
# A tibble: 320 × 2
   county         x
   <chr>      <int>
 1 CRAWFORD       5
 2 JACKSON        5
 3 MONROE         5
 4 ADAMS          4
 5 BROWN          4
 6 CLARK          4
 7 CLINTON        4
 8 JEFFERSON      4
 9 LAKE           4
10 WASHINGTON     4
# ℹ 310 more rows
#Part 2
# Count the number of rows recorded under each county name.
midwest %>%
  group_by(county) %>%
  summarise(x = n()) %>%
  # Clear any remaining grouping
  ungroup()
# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         4
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         2
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       2
# ℹ 310 more rows
#Part 3
# Count the distinct county names inside each county group. Every group
# holds exactly one county name, so x is 1 on every row.
midwest %>%
  group_by(county) %>%
  summarise(x = n_distinct(county)) %>%
  # Clear any remaining grouping
  ungroup()
# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         1
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         1
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       1
# ℹ 310 more rows
#Problem D
# Problem D: per-clarity counts in the diamonds data.
diamonds %>%
  group_by(clarity) %>%
  summarise(
    a = n_distinct(color),  # distinct colors seen at this clarity
    b = n_distinct(price),  # distinct prices seen at this clarity
    c = n()                 # total diamonds at this clarity
  ) %>%
  # Clear any remaining grouping
  ungroup()
# A tibble: 8 × 4
  clarity     a     b     c
  <ord>   <int> <int> <int>
1 I1          7   632   741
2 SI2         7  4904  9194
3 SI1         7  5380 13065
4 VS2         7  5051 12258
5 VS1         7  3926  8171
6 VVS2        7  2409  5066
7 VVS1        7  1623  3655
8 IF          7   902  1790
#Problem E
#Part 1
# Mean and standard deviation of price for every color/cut combination.
# Grouping by color first makes color the leading column of the result.
diamonds %>%
  group_by(color, cut) %>%
  summarise(
    m = mean(price),  # mean price within the color/cut group
    s = sd(price)     # standard deviation of price within the group
  ) %>%
  # summarise() leaves the result grouped by color (see its message);
  # ungroup() removes that remaining grouping.
  ungroup()
`summarise()` has grouped output by 'color'. You can override using the
`.groups` argument.
# A tibble: 35 × 4
   color cut           m     s
   <ord> <ord>     <dbl> <dbl>
 1 D     Fair      4291. 3286.
 2 D     Good      3405. 3175.
 3 D     Very Good 3470. 3524.
 4 D     Premium   3631. 3712.
 5 D     Ideal     2629. 3001.
 6 E     Fair      3682. 2977.
 7 E     Good      3424. 3331.
 8 E     Very Good 3215. 3408.
 9 E     Premium   3539. 3795.
10 E     Ideal     2598. 2956.
# ℹ 25 more rows
#Part 2
# Same statistics as Part 1 with the grouping order swapped (cut, then
# color): the values per combination are unchanged, only the column and
# row order differ.
diamonds %>%
  group_by(cut, color) %>%
  summarise(
    m = mean(price),  # mean price within the cut/color group
    s = sd(price)     # standard deviation of price within the group
  ) %>%
  # summarise() leaves the result grouped by cut (see its message);
  # ungroup() removes that remaining grouping.
  ungroup()
`summarise()` has grouped output by 'cut'. You can override using the `.groups`
argument.
# A tibble: 35 × 4
   cut   color     m     s
   <ord> <ord> <dbl> <dbl>
 1 Fair  D     4291. 3286.
 2 Fair  E     3682. 2977.
 3 Fair  F     3827. 3223.
 4 Fair  G     4239. 3610.
 5 Fair  H     5136. 3886.
 6 Fair  I     4685. 3730.
 7 Fair  J     4976. 4050.
 8 Good  D     3405. 3175.
 9 Good  E     3424. 3331.
10 Good  F     3496. 3202.
# ℹ 25 more rows
#Part 3
# Adds clarity as a third grouping variable and derives a discounted mean.
diamonds %>%
  group_by(cut, color, clarity) %>%
  summarise(
    m = mean(price),  # mean price within the cut/color/clarity group
    s = sd(price),    # standard deviation of price within the group
    # 80% of the group mean, i.e. the mean after a 20% discount; m can be
    # used here because it was created earlier in the same summarise() call
    msale = m * 0.80
  ) %>%
  # summarise() leaves the result grouped by cut and color (see its
  # message); ungroup() removes that remaining grouping.
  ungroup()
`summarise()` has grouped output by 'cut', 'color'. You can override using the
`.groups` argument.
# A tibble: 276 × 6
   cut   color clarity     m     s msale
   <ord> <ord> <ord>   <dbl> <dbl> <dbl>
 1 Fair  D     I1      7383  5899. 5906.
 2 Fair  D     SI2     4355. 3260. 3484.
 3 Fair  D     SI1     4273. 3019. 3419.
 4 Fair  D     VS2     4513. 3383. 3610.
 5 Fair  D     VS1     2921. 2550. 2337.
 6 Fair  D     VVS2    3607  3629. 2886.
 7 Fair  D     VVS1    4473  5457. 3578.
 8 Fair  D     IF      1620.  525. 1296.
 9 Fair  E     I1      2095.  824. 1676.
10 Fair  E     SI2     4172. 3055. 3338.
# ℹ 266 more rows
#Problem F
# Problem F: per-cut summaries, including columns derived from columns
# created earlier in the same summarise() call.
diamonds %>%
  group_by(cut) %>%
  summarise(
    potato = mean(depth),  # mean depth per cut
    pizza = mean(price),   # mean price per cut
    # Median of the y column per cut (the ggplot2 dataset documentation
    # describes y as the width in mm)
    popcorn = median(y),
    pineapple = potato - pizza,  # mean depth minus mean price
    papaya = pineapple^2,        # square of that difference
    peach = n()                  # number of diamonds per cut
  ) %>%
  # Clear any remaining grouping
  ungroup()
# A tibble: 5 × 7
  cut       potato pizza popcorn pineapple    papaya peach
  <ord>      <dbl> <dbl>   <dbl>     <dbl>     <dbl> <int>
1 Fair        64.0 4359.    6.1     -4295. 18444586.  1610
2 Good        62.4 3929.    5.99    -3866. 14949811.  4906
3 Very Good   61.8 3982.    5.77    -3920. 15365942. 12082
4 Premium     61.3 4584.    6.06    -4523. 20457466. 13791
5 Ideal       61.7 3458.    5.26    -3396. 11531679. 21551
#Problem G
#Part 1
# Mean price per color, then two extra columns; ungroup() comes last.
diamonds %>%
  group_by(color) %>%
  summarise(m = mean(price)) %>%
  mutate(
    # Text label: the prefix "Diamond color " followed by the color value
    x1 = str_c("Diamond color ", color),
    # Constant column: 5 on every row
    x2 = 5
  ) %>%
  ungroup()
# A tibble: 7 × 4
  color     m x1                 x2
  <ord> <dbl> <chr>           <dbl>
1 D     3170. Diamond color D     5
2 E     3077. Diamond color E     5
3 F     3725. Diamond color F     5
4 G     3999. Diamond color G     5
5 H     4487. Diamond color H     5
6 I     5092. Diamond color I     5
7 J     5324. Diamond color J     5
#Part 2
# Same as Part 1, except ungroup() is called before mutate().
diamonds %>%
  group_by(color) %>%
  summarise(m = mean(price)) %>%
  ungroup() %>%
  mutate(
    # Text label: the prefix "Diamond color " followed by the color value
    x1 = str_c("Diamond color ", color),
    # Constant column: 5 on every row
    x2 = 5
  )
# A tibble: 7 × 4
  color     m x1                 x2
  <ord> <dbl> <chr>           <dbl>
1 D     3170. Diamond color D     5
2 E     3077. Diamond color E     5
3 F     3725. Diamond color F     5
4 G     3999. Diamond color G     5
5 H     4487. Diamond color H     5
6 I     5092. Diamond color I     5
7 J     5324. Diamond color J     5
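# Hint answer: ungroup() removes grouping so that it cannot affect later operations. Here summarize() has already dropped the only grouping variable (color), so the data are ungrouped before mutate() runs, and neither new column depends on group membership. Placing ungroup() before or after mutate() therefore gives the same result in both parts.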

#Problem H
#Part 1
# Halve every price, then average the halved prices within each color.
# The grouping is still active when summarise() runs, so the result has
# one row per color.
diamonds %>%
  group_by(color) %>%
  mutate(x1 = price * 0.5) %>%
  summarise(m = mean(x1)) %>%
  ungroup()
# A tibble: 7 × 2
  color     m
  <ord> <dbl>
1 D     1585.
2 E     1538.
3 F     1862.
4 G     2000.
5 H     2243.
6 I     2546.
7 J     2662.
#Part 2
# Halve every price, remove the grouping, then average. Because
# ungroup() runs before summarise(), the mean is taken over the whole
# dataset and the result is a single row.
diamonds %>%
  group_by(color) %>%
  mutate(x1 = price * 0.5) %>%
  ungroup() %>%
  summarise(m = mean(x1))
# A tibble: 1 × 1
      m
  <dbl>
1 1966.
  1. Why is grouping data necessary?

    Grouping in R sorts a large, mixed dataset into labeled categories. The advantage is that you can run a specific analysis on each category of data, rather than facing one huge block of data that makes it difficult to know where to start. For example, the diamonds dataset contains diamonds of different colors; by grouping by color, you can separately calculate the average price, the count, and so on for each color. In this way, you can see not only how the data behave overall but also what is unique about each color. In addition, grouping allows users to analyze multiple variables at the same time, such as grouping by color and cut, to see how these combinations affect prices. Finally, grouping also makes it easier for users to create visual charts that show the differences between categories and help them make clearer judgments.

  2. Why is ungrouping data necessary?

    To understand ungrouping, imagine sorting a large collection of books by category and placing each category on its own bookshelf (equivalent to group_by()). You then compute statistics for each shelf, such as the number of books in each category or the average thickness of the books (like computing a count, mean, or standard deviation per group). But once you’ve done that, if you want to know the total number of books or do other global operations, you can’t keep the books on these separate shelves. In this case, ungroup() is like taking the books from their respective shelves and putting them back together, so that subsequent operations work on the entire stack rather than on a single shelf. So ungroup() in R lets you avoid errors in subsequent analysis caused by the data still being grouped.

  3. When should you ungroup data?

    Calling ungroup() is like tidying up after finishing a specific task with your data. Imagine you split your data into groups to calculate something, like averages or totals for each group. Once you’ve done this grouped work, you might want to perform some operations on the entire dataset, not just the groups. At this point, you don’t want the data to stay divided into groups. This is when ungroup() is useful—it clears the grouping, signaling to R that you’ve finished working on the individual groups and now want to treat the dataset as a whole again for the next steps. This ensures any further calculations apply to the entire dataset rather than being restricted to the groups.

  4. If the code does not contain group_by(), do you still need ungroup() at the end? For example, does data() %>% mutate(newVar = 1 + 2) require ungroup()?

    If group_by() is not used in the code, then ungroup() is not needed, because ungroup() is used to cancel the grouping operation. ungroup() is not required for the code “data() %>% mutate(newVar = 1 + 2)”.

For 6.7 Extra Practice

#1.View all of the variable names in diamonds
# names() returns the column (variable) names of diamonds so they are
# printed in the output. View() only opens the interactive data viewer
# and displays nothing in a rendered document.
names(diamonds)
#2.Arrange the diamonds by:
# Sort from lowest to highest price.
arrange(diamonds, price)
# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
# ℹ 53,930 more rows
# Sort from highest to lowest price.
arrange(diamonds, desc(price))
# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  2.29 Premium   I     VS2      60.8    60 18823  8.5   8.47  5.16
 2  2    Very Good G     SI1      63.5    56 18818  7.9   7.97  5.04
 3  1.51 Ideal     G     IF       61.7    55 18806  7.37  7.41  4.56
 4  2.07 Ideal     G     SI2      62.5    55 18804  8.2   8.13  5.11
 5  2    Very Good H     SI1      62.8    57 18803  7.95  8     5.01
 6  2.29 Premium   I     SI1      61.8    59 18797  8.52  8.45  5.24
 7  2.04 Premium   H     SI1      58.1    60 18795  8.37  8.28  4.84
 8  2    Premium   I     VS1      60.8    59 18795  8.13  8.02  4.91
 9  1.71 Premium   F     VS2      62.3    59 18791  7.57  7.53  4.7 
10  2.15 Ideal     G     SI2      62.6    54 18791  8.29  8.35  5.21
# ℹ 53,930 more rows
# Sort from lowest to highest price; rows with equal price are ordered
# by cut.
arrange(diamonds, price, cut)
# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
 2  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
 8  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
 9  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
# ℹ 53,930 more rows
# Sort from highest to lowest price; rows with equal price are ordered
# by cut.
arrange(diamonds, desc(price), cut)
# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  2.29 Premium   I     VS2      60.8    60 18823  8.5   8.47  5.16
 2  2    Very Good G     SI1      63.5    56 18818  7.9   7.97  5.04
 3  1.51 Ideal     G     IF       61.7    55 18806  7.37  7.41  4.56
 4  2.07 Ideal     G     SI2      62.5    55 18804  8.2   8.13  5.11
 5  2    Very Good H     SI1      62.8    57 18803  7.95  8     5.01
 6  2.29 Premium   I     SI1      61.8    59 18797  8.52  8.45  5.24
 7  2.04 Premium   H     SI1      58.1    60 18795  8.37  8.28  4.84
 8  2    Premium   I     VS1      60.8    59 18795  8.13  8.02  4.91
 9  1.71 Premium   F     VS2      62.3    59 18791  7.57  7.53  4.7 
10  2.15 Ideal     G     SI2      62.6    54 18791  8.29  8.35  5.21
# ℹ 53,930 more rows
#3.Arrange the diamonds by lowest to highest price and worst to best clarity.
# Sort from lowest to highest price; rows with equal price are ordered
# by clarity, which is an ordered factor and so sorts by its level order.
arrange(diamonds, price, clarity)
# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
# ℹ 53,930 more rows
#4.Create a new variable named salePrice to reflect a discount of $250 off of the original cost of each diamond (hint: mutate()).
# Add salePrice: the original price minus a flat $250 discount.
mutate(diamonds, salePrice = price - 250)
# A tibble: 53,940 × 11
   carat cut       color clarity depth table price     x     y     z salePrice
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>     <dbl>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43        76
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31        76
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31        77
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63        84
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75        85
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48        86
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47        86
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53        87
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49        87
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39        88
# ℹ 53,930 more rows
#5.Remove the x, y, and z variables from the diamonds dataset (hint: select()).
# Keep every column except x, y, and z.
select(diamonds, -c(x, y, z))
# A tibble: 53,940 × 7
   carat cut       color clarity depth table price
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int>
 1  0.23 Ideal     E     SI2      61.5    55   326
 2  0.21 Premium   E     SI1      59.8    61   326
 3  0.23 Good      E     VS1      56.9    65   327
 4  0.29 Premium   I     VS2      62.4    58   334
 5  0.31 Good      J     SI2      63.3    58   335
 6  0.24 Very Good J     VVS2     62.8    57   336
 7  0.24 Very Good I     VVS1     62.3    57   336
 8  0.26 Very Good H     SI1      61.9    55   337
 9  0.22 Fair      E     VS2      65.1    61   337
10  0.23 Very Good H     VS1      59.4    61   338
# ℹ 53,930 more rows
#6.Determine the number of diamonds there are for each cut value (hint: group_by(), summarize()).
# Tally the diamonds within each cut level. The single grouping variable
# is dropped by summarise(); .groups = "drop" states that explicitly.
diamonds %>%
  group_by(cut) %>%
  summarise(count = n(), .groups = "drop")
# A tibble: 5 × 2
  cut       count
  <ord>     <int>
1 Fair       1610
2 Good       4906
3 Very Good 12082
4 Premium   13791
5 Ideal     21551
#7.Create a new column named totalNum that calculates the total number of diamonds.
# Add totalNum: with no grouping in effect, n() is the row count of the
# whole dataset, repeated on every row.
mutate(diamonds, totalNum = n())
# A tibble: 53,940 × 11
   carat cut       color clarity depth table price     x     y     z totalNum
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>    <int>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43    53940
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31    53940
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31    53940
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63    53940
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75    53940
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48    53940
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47    53940
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53    53940
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49    53940
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39    53940
# ℹ 53,930 more rows