Week 3 - Post sessions

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

## Problem A
# Per-state summary of poptotal: mean, median, max, min, number of distinct
# values, the value in the state's first row, and two flags telling whether
# any row in the state is below 5,000 or above 2,000,000.
group_by(midwest, state) %>%
  summarise(
    poptotalmean = mean(poptotal),
    poptotalmed = median(poptotal),
    popmax = max(poptotal),
    popmin = min(poptotal),
    popdistinct = n_distinct(poptotal),
    popfirst = first(poptotal),
    popany = any(poptotal < 5000),
    popany2 = any(poptotal > 2000000)
  ) %>%
  ungroup()

# A tibble: 5 × 9
  state poptotalmean poptotalmed  popmax popmin popdistinct popfirst popany
  <chr>        <dbl>       <dbl>   <int>  <int>       <int>    <int> <lgl> 
1 IL         112065.      24486. 5105067   4373         101    66090 TRUE  
2 IN          60263.      30362.  797159   5315          92    31095 FALSE 
3 MI         111992.      37308  2111687   1701          83    10145 TRUE  
4 OH         123263.      54930. 1412140  11098          88    25371 FALSE 
5 WI          67941.      33528   959275   3890          72    15682 TRUE  
# ℹ 1 more variable: popany2 <lgl>

## Problem B

# For each state, count the rows with poptotal under 5,000 and the rows with
# poptotal over 2,000,000 (summing a logical vector counts its TRUE values),
# alongside the total number of rows in the state.
group_by(midwest, state) %>%
  summarise(
    num5k = sum(poptotal < 5000),
    num2mil = sum(poptotal > 2000000),
    numrows = n()
  ) %>%
  ungroup()

# A tibble: 5 × 4
  state num5k num2mil numrows
  <chr> <int>   <int>   <int>
1 IL        1       1     102
2 IN        0       0      92
3 MI        1       1      83
4 OH        0       0      88
5 WI        2       0      72

## Problem C
# part I
# For every county name, count how many different states it occurs in, and
# show the names shared by the most states first.
group_by(midwest, county) %>%
  summarise(x = n_distinct(state)) %>%
  arrange(desc(x)) %>%
  ungroup()

# A tibble: 320 × 2
   county         x
   <chr>      <int>
 1 CRAWFORD       5
 2 JACKSON        5
 3 MONROE         5
 4 ADAMS          4
 5 BROWN          4
 6 CLARK          4
 7 CLINTON        4
 8 JEFFERSON      4
 9 LAKE           4
10 WASHINGTON     4
# ℹ 310 more rows

# part II
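# How does n() differ from n_distinct()? n_distinct() counts the number of unique values of a variable within the current group, whereas n() counts the number of rows in the current group.
# When would they be the same? different? They are the same when every row in the group has a different value of that variable (e.g., ADAMS has 4 rows and appears in 4 distinct states), and different when a value repeats within the group, in which case n_distinct() is smaller than n().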
# Number of rows for each county name; n() takes no arguments and simply
# reports the size of the current group.
group_by(midwest, county) %>%
  summarise(x = n()) %>%
  ungroup()

# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         4
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         2
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       2
# ℹ 310 more rows

# part III
# hint: 
# - How many distinctly different counties are there for each county? 1
# - Can there be more than 1 (county) county in each county? no
# - What if we replace 'county' with 'state'? The same logic applies: each group defined by state contains exactly one distinct state, so x is 1 for every state.
# Counting the distinct values of the grouping variable itself: inside each
# state group only that one state is present, so x is 1 in every row.
group_by(midwest, state) %>%
  summarise(x = n_distinct(state)) %>%
  ungroup()

# A tibble: 5 × 2
  state     x
  <chr> <int>
1 IL        1
2 IN        1
3 MI        1
4 OH        1
5 WI        1

## Problem D
# For each clarity grade: a = number of distinct colors, b = number of
# distinct prices, c = total number of rows in the grade.
group_by(diamonds, clarity) %>%
  summarise(
    a = n_distinct(color),
    b = n_distinct(price),
    c = n()
  ) %>%
  ungroup()

# A tibble: 8 × 4
  clarity     a     b     c
  <ord>   <int> <int> <int>
1 I1          7   632   741
2 SI2         7  4904  9194
3 SI1         7  5380 13065
4 VS2         7  5051 12258
5 VS1         7  3926  8171
6 VVS2        7  2409  5066
7 VVS1        7  1623  3655
8 IF          7   902  1790

## Problem E
# part I
# Mean and standard deviation of price for every color/cut combination.
# With two grouping variables summarise() keeps the result grouped by the
# first one (color), so the closing ungroup() clears that leftover grouping.
group_by(diamonds, color, cut) %>%
  summarise(
    m = mean(price),
    s = sd(price)
  ) %>%
  ungroup()

`summarise()` has grouped output by 'color'. You can override using the
`.groups` argument.

# A tibble: 35 × 4
   color cut           m     s
   <ord> <ord>     <dbl> <dbl>
 1 D     Fair      4291. 3286.
 2 D     Good      3405. 3175.
 3 D     Very Good 3470. 3524.
 4 D     Premium   3631. 3712.
 5 D     Ideal     2629. 3001.
 6 E     Fair      3682. 2977.
 7 E     Good      3424. 3331.
 8 E     Very Good 3215. 3408.
 9 E     Premium   3539. 3795.
10 E     Ideal     2598. 2956.
# ℹ 25 more rows

# part II
# Same statistics as part I with the grouping variables swapped: the values
# are identical, but rows are ordered by cut first and the result is left
# grouped by cut until ungroup() is applied.
group_by(diamonds, cut, color) %>%
  summarise(
    m = mean(price),
    s = sd(price)
  ) %>%
  ungroup()

`summarise()` has grouped output by 'cut'. You can override using the `.groups`
argument.

# A tibble: 35 × 4
   cut   color     m     s
   <ord> <ord> <dbl> <dbl>
 1 Fair  D     4291. 3286.
 2 Fair  E     3682. 2977.
 3 Fair  F     3827. 3223.
 4 Fair  G     4239. 3610.
 5 Fair  H     5136. 3886.
 6 Fair  I     4685. 3730.
 7 Fair  J     4976. 4050.
 8 Good  D     3405. 3175.
 9 Good  E     3424. 3331.
10 Good  F     3496. 3202.
# ℹ 25 more rows

# part III
# hint: 
# - How good is the sale if the price of diamonds equaled msale? 
# - e.g., The diamonds are x% off the original price in msale.
# Mean and standard deviation of price for every cut/color/clarity
# combination. msale reuses the m column created earlier in the same call and
# is 80% of the mean price, i.e. the mean price after a 20% discount.
group_by(diamonds, cut, color, clarity) %>%
  summarise(
    m = mean(price),
    s = sd(price),
    msale = m * 0.80
  ) %>%
  ungroup()

`summarise()` has grouped output by 'cut', 'color'. You can override using the
`.groups` argument.

# A tibble: 276 × 6
   cut   color clarity     m     s msale
   <ord> <ord> <ord>   <dbl> <dbl> <dbl>
 1 Fair  D     I1      7383  5899. 5906.
 2 Fair  D     SI2     4355. 3260. 3484.
 3 Fair  D     SI1     4273. 3019. 3419.
 4 Fair  D     VS2     4513. 3383. 3610.
 5 Fair  D     VS1     2921. 2550. 2337.
 6 Fair  D     VVS2    3607  3629. 2886.
 7 Fair  D     VVS1    4473  5457. 3578.
 8 Fair  D     IF      1620.  525. 1296.
 9 Fair  E     I1      2095.  824. 1676.
10 Fair  E     SI2     4172. 3055. 3338.
# ℹ 266 more rows

## Problem F
# Per-cut summary. Later arguments of summarise() may refer to columns
# created earlier in the same call: pineapple is potato minus pizza, and
# papaya is pineapple squared. peach is the number of rows for the cut.
group_by(diamonds, cut) %>%
  summarise(
    potato = mean(depth),
    pizza = mean(price),
    popcorn = median(y),
    pineapple = potato - pizza,
    papaya = pineapple^2,
    peach = n()
  ) %>%
  ungroup()

# A tibble: 5 × 7
  cut       potato pizza popcorn pineapple    papaya peach
  <ord>      <dbl> <dbl>   <dbl>     <dbl>     <dbl> <int>
1 Fair        64.0 4359.    6.1     -4295. 18444586.  1610
2 Good        62.4 3929.    5.99    -3866. 14949811.  4906
3 Very Good   61.8 3982.    5.77    -3920. 15365942. 12082
4 Premium     61.3 4584.    6.06    -4523. 20457466. 13791
5 Ideal       61.7 3458.    5.26    -3396. 11531679. 21551

## Problem G
# part I
# Mean price per color, followed by two extra columns: a text label built
# from the color and a constant. ungroup() is the last step here.
diamonds %>% 
  group_by(color) %>% 
  # one row per color (7 rows in the printed result)
  summarize(m = mean(price)) %>% 
  # str_c() prepends the fixed text to each color; the single value 5 is
  # repeated in every row
  mutate(x1 = str_c("Diamond color ", color),
         x2 = 5) %>% 
  # clears any grouping that might still be attached to the result
  ungroup()

# A tibble: 7 × 4
  color     m x1                 x2
  <ord> <dbl> <chr>           <dbl>
1 D     3170. Diamond color D     5
2 E     3077. Diamond color E     5
3 F     3725. Diamond color F     5
4 G     3999. Diamond color G     5
5 H     4487. Diamond color H     5
6 I     5092. Diamond color I     5
7 J     5324. Diamond color J     5

# part II
# What does the first ungroup() do? Is it useful here? Why/why not? 
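# Why isn't there a closing ungroup() after the mutate()? The first ungroup() removes any grouping that is left after summarize(), so that mutate() works on the whole table instead of group by group; it does not remove a variable. It is not strictly needed here, because summarize() already drops the only grouping variable (color), which is why parts I and II give the same result, but it is a safe habit. There is no closing ungroup() after mutate() because the data are already ungrouped at that point and mutate() does not add any grouping.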
# Same calculation as part I, but ungroup() is applied right after
# summarize() and before mutate(); the printed result is identical.
diamonds %>% 
  group_by(color) %>% 
  # one row per color
  summarize(m = mean(price)) %>% 
  # grouping is cleared here, before any new columns are added
  ungroup() %>% 
  # works on the ungrouped table, so no further ungroup() follows
  mutate(x1 = str_c("Diamond color ", color),
         x2 = 5)

# A tibble: 7 × 4
  color     m x1                 x2
  <ord> <dbl> <chr>           <dbl>
1 D     3170. Diamond color D     5
2 E     3077. Diamond color E     5
3 F     3725. Diamond color F     5
4 G     3999. Diamond color G     5
5 H     4487. Diamond color H     5
6 I     5092. Diamond color I     5
7 J     5324. Diamond color J     5

## Problem H
# part I
# Halve every price, then average the halved prices within each color.
diamonds %>% 
  group_by(color) %>% 
  # x1 is computed row by row; the color grouping stays attached
  mutate(x1 = price * 0.5) %>% 
  # still grouped by color, so this returns one mean per color (7 rows)
  summarize(m = mean(x1)) %>% 
  ungroup()

# A tibble: 7 × 2
  color     m
  <ord> <dbl>
1 D     1585.
2 E     1538.
3 F     1862.
4 G     2000.
5 H     2243.
6 I     2546.
7 J     2662.

# part II
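# What's the difference between part I and II? Part I calls summarize() while the data are still grouped by color, so it returns one mean per color (7 rows). Part II calls ungroup() before summarize(), so the grouping is gone and the mean is taken over all diamonds at once, giving a single row (1966.).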
# Same steps as part I, but ungroup() comes before summarize().
diamonds %>% 
  group_by(color) %>% 
  # x1 is computed row by row, exactly as in part I
  mutate(x1 = price * 0.5) %>% 
  # the color grouping is removed here ...
  ungroup() %>%  
  # ... so the mean is taken over all rows and a single value is returned
  summarize(m = mean(x1))

# A tibble: 1 × 1
      m
  <dbl>
1 1966.

Why is grouping data necessary?

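Grouping is necessary when a calculation should be carried out separately for each category: after group_by(), functions such as summarize() and mutate() work within each group (e.g., one mean price per diamond color) instead of across the whole data set.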

Why is ungrouping data necessary?

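Ungrouping data is essential because it reduces the possibility of future errors: if the grouping is left in place, later operations are silently carried out group by group instead of on the whole data set.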

When should you ungroup data?

You should ungroup data as soon as the group-wise calculations are finished, i.e., when later steps should no longer be carried out separately for each group.

If the code does not contain group_by(), do you still need ungroup() at the end? For example, does data() %>% mutate(newVar = 1 + 2) require ungroup()?

If the code does not contain group_by(), you do not need ungroup() at the end, because the data were never grouped and there is nothing to undo.