Week 2 Workbook

Author

Kaenat Gul

Published

October 18, 2024

Data Analysis

Exercise 6.6.1

Problem A

# Problem A: one row per state summarising the poptotal column of midwest.
midwest %>%
  group_by(state) %>%
  summarise(
    poptotalmean = mean(poptotal),          # average population total
    poptotalmed  = median(poptotal),        # median population total
    popmax       = max(poptotal),           # largest population total
    popmin       = min(poptotal),           # smallest population total
    popdistinct  = n_distinct(poptotal),    # number of different poptotal values
    popfirst     = first(poptotal),         # poptotal of the first row in the state
    popany       = any(poptotal < 5000),    # TRUE if at least one row is below 5,000
    popany2      = any(poptotal > 2000000)  # TRUE if at least one row is above 2,000,000
  ) %>%
  ungroup()                                 # hand back a plain, ungrouped tibble

# A tibble: 5 × 9
  state poptotalmean poptotalmed  popmax popmin popdistinct popfirst popany
  <chr>        <dbl>       <dbl>   <int>  <int>       <int>    <int> <lgl> 
1 IL         112065.      24486. 5105067   4373         101    66090 TRUE  
2 IN          60263.      30362.  797159   5315          92    31095 FALSE 
3 MI         111992.      37308  2111687   1701          83    10145 TRUE  
4 OH         123263.      54930. 1412140  11098          88    25371 FALSE 
5 WI          67941.      33528   959275   3890          72    15682 TRUE  
# ℹ 1 more variable: popany2 <lgl>

Problem B

# Problem B: per state, tally rows under/over the population cut-offs and
# report how many rows the state has in total.
midwest %>%
  group_by(state) %>%
  summarise(
    num5k   = sum(poptotal < 5000),     # TRUE counts as 1, so this tallies rows below 5,000
    num2mil = sum(poptotal > 2000000),  # rows above 2,000,000
    numrows = n()                       # total rows (counties) for the state
  ) %>%
  ungroup()                             # hand back a plain, ungrouped tibble

# A tibble: 5 × 4
  state num5k num2mil numrows
  <chr> <int>   <int>   <int>
1 IL        1       1     102
2 IN        0       0      92
3 MI        1       1      83
4 OH        0       0      88
5 WI        2       0      72

Problem C - Part I

# Problem C, Part I: for each county name, count the different states it
# appears in, then list the names from most to fewest states.
midwest %>%
  group_by(county) %>%
  summarise(
    x = n_distinct(state)  # different states sharing this county name
  ) %>%
  arrange(desc(x)) %>%     # largest counts first
  ungroup()                # hand back a plain, ungrouped tibble

# A tibble: 320 × 2
   county         x
   <chr>      <int>
 1 CRAWFORD       5
 2 JACKSON        5
 3 MONROE         5
 4 ADAMS          4
 5 BROWN          4
 6 CLARK          4
 7 CLINTON        4
 8 JEFFERSON      4
 9 LAKE           4
10 WASHINGTON     4
# ℹ 310 more rows

Problem C - Part II

# Problem C, Part II: how many rows of midwest carry each county name.
midwest %>%
  group_by(county) %>%
  summarise(
    x = n()  # rows in the current county-name group
  ) %>%
  ungroup()  # hand back a plain, ungrouped tibble

# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         4
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         2
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       2
# ℹ 310 more rows

Problem C - Part III

# Problem C, Part III: distinct county names inside each county group.
# The grouping variable is constant within its own group, so x is always 1.
midwest %>%
  group_by(county) %>%
  summarise(
    x = n_distinct(county)
  ) %>%
  ungroup()  # hand back a plain, ungrouped tibble

# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         1
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         1
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       1
# ℹ 310 more rows

Problem D

# Problem D: per clarity grade, count distinct colors, distinct prices,
# and the number of diamonds.
diamonds %>%
  group_by(clarity) %>%
  summarise(
    a = n_distinct(color),  # different color grades seen at this clarity
    b = n_distinct(price),  # different price values seen at this clarity
    c = n()                 # diamonds (rows) at this clarity
  ) %>%
  ungroup()                 # hand back a plain, ungrouped tibble

# A tibble: 8 × 4
  clarity     a     b     c
  <ord>   <int> <int> <int>
1 I1          7   632   741
2 SI2         7  4904  9194
3 SI1         7  5380 13065
4 VS2         7  5051 12258
5 VS1         7  3926  8171
6 VVS2        7  2409  5066
7 VVS1        7  1623  3655
8 IF          7   902  1790

Problem E Part-I

# Problem E, Part I: mean and standard deviation of price for every
# color/cut combination (color is the outer key, cut the inner key).
diamonds %>%
  group_by(color, cut) %>%
  summarize(
    m = mean(price),   # average price in the color/cut group
    s = sd(price),     # standard deviation of price in the group
    .groups = "drop"   # result is already ungrouped, so no trailing ungroup() is needed
  )

# A tibble: 35 × 4
   color cut           m     s
   <ord> <ord>     <dbl> <dbl>
 1 D     Fair      4291. 3286.
 2 D     Good      3405. 3175.
 3 D     Very Good 3470. 3524.
 4 D     Premium   3631. 3712.
 5 D     Ideal     2629. 3001.
 6 E     Fair      3682. 2977.
 7 E     Good      3424. 3331.
 8 E     Very Good 3215. 3408.
 9 E     Premium   3539. 3795.
10 E     Ideal     2598. 2956.
# ℹ 25 more rows

Problem E Part-II

# Problem E, Part II: same statistics as Part I, but grouped by cut first
# and color second, which changes the column and row order of the result.
diamonds %>%
  group_by(cut, color) %>%
  summarize(
    m = mean(price),   # average price in the cut/color group
    s = sd(price),     # standard deviation of price in the group
    .groups = "drop"   # result is already ungrouped, so no trailing ungroup() is needed
  )

# A tibble: 35 × 4
   cut   color     m     s
   <ord> <ord> <dbl> <dbl>
 1 Fair  D     4291. 3286.
 2 Fair  E     3682. 2977.
 3 Fair  F     3827. 3223.
 4 Fair  G     4239. 3610.
 5 Fair  H     5136. 3886.
 6 Fair  I     4685. 3730.
 7 Fair  J     4976. 4050.
 8 Good  D     3405. 3175.
 9 Good  E     3424. 3331.
10 Good  F     3496. 3202.
# ℹ 25 more rows

Problem E Part-III

# Problem E, Part III: price statistics for every cut/color/clarity
# combination, plus a sale price derived from the group mean.
diamonds %>%
  group_by(cut, color, clarity) %>%
  summarize(
    m = mean(price),    # average price in the group
    s = sd(price),      # standard deviation of price in the group
    msale = m * 0.80,   # 80% of the mean price computed just above
    .groups = "drop"    # result is already ungrouped, so no trailing ungroup() is needed
  )

# A tibble: 276 × 6
   cut   color clarity     m     s msale
   <ord> <ord> <ord>   <dbl> <dbl> <dbl>
 1 Fair  D     I1      7383  5899. 5906.
 2 Fair  D     SI2     4355. 3260. 3484.
 3 Fair  D     SI1     4273. 3019. 3419.
 4 Fair  D     VS2     4513. 3383. 3610.
 5 Fair  D     VS1     2921. 2550. 2337.
 6 Fair  D     VVS2    3607  3629. 2886.
 7 Fair  D     VVS1    4473  5457. 3578.
 8 Fair  D     IF      1620.  525. 1296.
 9 Fair  E     I1      2095.  824. 1676.
10 Fair  E     SI2     4172. 3055. 3338.
# ℹ 266 more rows

Problem F

# Problem F: per-cut summaries. Later columns reuse columns created earlier
# in the same summarise() call.
diamonds %>%
  group_by(cut) %>%
  summarise(
    potato    = mean(depth),     # average depth
    pizza     = mean(price),     # average price
    popcorn   = median(y),       # median of the y variable
    pineapple = potato - pizza,  # mean depth minus mean price
    papaya    = pineapple^2,     # that difference, squared
    peach     = n()              # diamonds (rows) in the cut group
  ) %>%
  ungroup()                      # hand back a plain, ungrouped tibble

# A tibble: 5 × 7
  cut       potato pizza popcorn pineapple    papaya peach
  <ord>      <dbl> <dbl>   <dbl>     <dbl>     <dbl> <int>
1 Fair        64.0 4359.    6.1     -4295. 18444586.  1610
2 Good        62.4 3929.    5.99    -3866. 14949811.  4906
3 Very Good   61.8 3982.    5.77    -3920. 15365942. 12082
4 Premium     61.3 4584.    6.06    -4523. 20457466. 13791
5 Ideal       61.7 3458.    5.26    -3396. 11531679. 21551

Problem G Part I

# Problem G, Part I: average price per color, then add a label column and a
# constant column. ungroup() comes last, after mutate().
diamonds %>%
  group_by(color) %>%
  summarise(
    m = mean(price)                       # average price for the color
  ) %>%
  mutate(
    x1 = str_c("Diamond color ", color),  # text label built from the color value
    x2 = 5                                # the constant 5 on every row
  ) %>%
  ungroup()

# A tibble: 7 × 4
  color     m x1                 x2
  <ord> <dbl> <chr>           <dbl>
1 D     3170. Diamond color D     5
2 E     3077. Diamond color E     5
3 F     3725. Diamond color F     5
4 G     3999. Diamond color G     5
5 H     4487. Diamond color H     5
6 I     5092. Diamond color I     5
7 J     5324. Diamond color J     5

Problem G Part II

# Problem G, Part II: same columns as Part I, but ungroup() is applied
# before mutate() instead of after it.
diamonds %>%
  group_by(color) %>%
  summarise(
    m = mean(price)                       # average price for the color
  ) %>%
  ungroup() %>%
  mutate(
    x1 = str_c("Diamond color ", color),  # text label built from the color value
    x2 = 5                                # the constant 5 on every row
  )

# A tibble: 7 × 4
  color     m x1                 x2
  <ord> <dbl> <chr>           <dbl>
1 D     3170. Diamond color D     5
2 E     3077. Diamond color E     5
3 F     3725. Diamond color F     5
4 G     3999. Diamond color G     5
5 H     4487. Diamond color H     5
6 I     5092. Diamond color I     5
7 J     5324. Diamond color J     5

Problem H Part I

# Problem H, Part I: halve every price, then average the halved price
# separately for each color (summarise() runs while still grouped).
diamonds %>%
  group_by(color) %>%
  mutate(x1 = 0.5 * price) %>%  # half of each diamond's price
  summarise(
    m = mean(x1)                # per-color mean of the halved price
  ) %>%
  ungroup()                     # hand back a plain, ungrouped tibble

# A tibble: 7 × 2
  color     m
  <ord> <dbl>
1 D     1585.
2 E     1538.
3 F     1862.
4 G     2000.
5 H     2243.
6 I     2546.
7 J     2662.

Problem H Part II

# Problem H, Part II: halve every price, drop the grouping, then average.
# Because ungroup() precedes summarise(), the result is one overall mean.
diamonds %>%
  group_by(color) %>%
  mutate(x1 = 0.5 * price) %>%  # half of each diamond's price
  ungroup() %>%                 # remove the color grouping before summarising
  summarise(
    m = mean(x1)                # mean of the halved price over all rows
  )

# A tibble: 1 × 1
      m
  <dbl>
1 1966.

6.7 Extra Practice

1. View all of the variable names in diamonds (hint: View()).

# Open diamonds in the data viewer, then print its column (variable) names.
view(diamonds)
colnames(diamonds)

 [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  
 [8] "x"       "y"       "z"

2. Arrange the diamonds by:

Lowest to highest price (hint: arrange())

# Sort the diamonds from the cheapest to the most expensive.
arrange(diamonds, price)

# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
# ℹ 53,930 more rows

Highest to lowest price (hint: arrange(), desc())

# Sort the diamonds from the most expensive to the cheapest.
arrange(diamonds, desc(price))

# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  2.29 Premium   I     VS2      60.8    60 18823  8.5   8.47  5.16
 2  2    Very Good G     SI1      63.5    56 18818  7.9   7.97  5.04
 3  1.51 Ideal     G     IF       61.7    55 18806  7.37  7.41  4.56
 4  2.07 Ideal     G     SI2      62.5    55 18804  8.2   8.13  5.11
 5  2    Very Good H     SI1      62.8    57 18803  7.95  8     5.01
 6  2.29 Premium   I     SI1      61.8    59 18797  8.52  8.45  5.24
 7  2.04 Premium   H     SI1      58.1    60 18795  8.37  8.28  4.84
 8  2    Premium   I     VS1      60.8    59 18795  8.13  8.02  4.91
 9  1.71 Premium   F     VS2      62.3    59 18791  7.57  7.53  4.7 
10  2.15 Ideal     G     SI2      62.6    54 18791  8.29  8.35  5.21
# ℹ 53,930 more rows

Lowest price and cut

# Sort by cut (lowest level first) and, within each cut, by price from
# lowest to highest. A single arrange() with both keys states the ordering
# explicitly instead of relying on a second arrange() preserving the order
# produced by the first.
diamonds %>%
  arrange(cut, price)

# A tibble: 53,940 × 10
   carat cut   color clarity depth table price     x     y     z
   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  0.22 Fair  E     VS2      65.1    61   337  3.87  3.78  2.49
 2  0.25 Fair  E     VS1      55.2    64   361  4.21  4.23  2.33
 3  0.23 Fair  G     VVS2     61.4    66   369  3.87  3.91  2.39
 4  0.27 Fair  E     VS1      66.4    58   371  3.99  4.02  2.66
 5  0.3  Fair  J     VS2      64.8    58   416  4.24  4.16  2.72
 6  0.3  Fair  F     SI1      63.1    58   496  4.3   4.22  2.69
 7  0.34 Fair  J     SI1      64.5    57   497  4.38  4.36  2.82
 8  0.37 Fair  F     SI1      65.3    56   527  4.53  4.47  2.94
 9  0.3  Fair  D     SI2      64.6    54   536  4.29  4.25  2.76
10  0.25 Fair  D     VS1      61.2    55   563  4.09  4.11  2.51
# ℹ 53,930 more rows

Highest price and cut

# Sort by cut (highest level first) and, within each cut, by price from
# highest to lowest. A single arrange() with both keys states the ordering
# explicitly instead of relying on a second arrange() preserving the order
# produced by the first.
diamonds %>%
  arrange(desc(cut), desc(price))

# A tibble: 53,940 × 10
   carat cut   color clarity depth table price     x     y     z
   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  1.51 Ideal G     IF       61.7  55   18806  7.37  7.41  4.56
 2  2.07 Ideal G     SI2      62.5  55   18804  8.2   8.13  5.11
 3  2.15 Ideal G     SI2      62.6  54   18791  8.29  8.35  5.21
 4  2.05 Ideal G     SI1      61.9  57   18787  8.1   8.16  5.03
 5  1.6  Ideal F     VS1      62    56   18780  7.47  7.52  4.65
 6  2.06 Ideal I     VS2      62.2  55   18779  8.15  8.19  5.08
 7  1.71 Ideal G     VVS2     62.1  55   18768  7.66  7.63  4.75
 8  2.08 Ideal H     SI1      58.7  60   18760  8.36  8.4   4.92
 9  2.03 Ideal G     SI1      60    55.8 18757  8.17  8.3   4.95
10  2.61 Ideal I     SI2      62.1  56   18756  8.85  8.73  5.46
# ℹ 53,930 more rows

3. Arrange the diamonds by lowest to highest price and worst to best clarity.

# Sort by price (lowest first); rows with equal price are ordered by
# clarity, lowest level first.
arrange(diamonds, price, clarity)

# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
# ℹ 53,930 more rows

4. Create a new variable named salePrice to reflect a discount of $250 off the original cost of each diamond (hint: mutate()).

# Add salePrice: the original price reduced by a flat 250.
mutate(diamonds, salePrice = price - 250)

# A tibble: 53,940 × 11
   carat cut       color clarity depth table price     x     y     z salePrice
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>     <dbl>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43        76
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31        76
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31        77
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63        84
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75        85
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48        86
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47        86
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53        87
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49        87
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39        88
# ℹ 53,930 more rows

5. Remove the x, y, and z variables from the diamonds dataset (hint: select()).

# Keep every column except x, y, and z.
select(diamonds, -c(x, y, z))

# A tibble: 53,940 × 7
   carat cut       color clarity depth table price
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int>
 1  0.23 Ideal     E     SI2      61.5    55   326
 2  0.21 Premium   E     SI1      59.8    61   326
 3  0.23 Good      E     VS1      56.9    65   327
 4  0.29 Premium   I     VS2      62.4    58   334
 5  0.31 Good      J     SI2      63.3    58   335
 6  0.24 Very Good J     VVS2     62.8    57   336
 7  0.24 Very Good I     VVS1     62.3    57   336
 8  0.26 Very Good H     SI1      61.9    55   337
 9  0.22 Fair      E     VS2      65.1    61   337
10  0.23 Very Good H     VS1      59.4    61   338
# ℹ 53,930 more rows

6. Determine the number of diamonds there are for each cut value (hint: group_by(), summarize()).

# Number of diamonds recorded for each cut value.
diamonds %>%
  group_by(cut) %>%
  summarize(
    total = n()  # rows in the current cut group
  ) %>%
  ungroup()      # hand back a plain, ungrouped tibble

# A tibble: 5 × 2
  cut       total
  <ord>     <int>
1 Fair       1610
2 Good       4906
3 Very Good 12082
4 Premium   13791
5 Ideal     21551

7. Create a new column named totalNum that calculates the total number of diamonds.

# Add totalNum: the data is ungrouped, so n() is the row count of the whole
# dataset and the same value is repeated on every row.
mutate(diamonds, totalNum = n())

# A tibble: 53,940 × 11
   carat cut       color clarity depth table price     x     y     z totalNum
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>    <int>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43    53940
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31    53940
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31    53940
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63    53940
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75    53940
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48    53940
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47    53940
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53    53940
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49    53940
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39    53940
# ℹ 53,930 more rows

Research Method

Generate a good question and a bad question about the diamonds dataset.

Good Question

What is the average price of diamonds in the dataset, and how does it vary across different cut qualities?

Bad Question

What is the most interesting thing about the diamond dataset?