# Per-state descriptive statistics of the poptotal column in midwest.
midwest %>%
  group_by(state) %>%
  summarize(
    # Central tendency of poptotal within the state
    poptotalmean = mean(poptotal),
    poptotalmed = median(poptotal),
    # Largest and smallest poptotal within the state
    popmax = max(poptotal),
    popmin = min(poptotal),
    # Number of different poptotal values within the state
    popdistinct = n_distinct(poptotal),
    # poptotal of the first row in the state group
    popfirst = first(poptotal),
    # TRUE when at least one row in the state is below 5000, else FALSE
    popany = any(poptotal < 5000),
    # TRUE when at least one row in the state is above 2000000, else FALSE
    popany2 = any(poptotal > 2000000)
  ) %>%
  # Return a plain (ungrouped) tibble
  ungroup()
# For every state: how many rows fall below / above the poptotal thresholds,
# and how many rows the state has in total.
midwest %>%
  group_by(state) %>%
  summarize(
    # sum() of a logical vector counts its TRUE entries
    num5k = sum(poptotal < 5000),
    num2mil = sum(poptotal > 2000000),
    # n() is the number of rows in the current state group
    numrows = n()
  ) %>%
  # Return a plain (ungrouped) tibble
  ungroup()
# A tibble: 5 × 4
state num5k num2mil numrows
<chr> <int> <int> <int>
1 IL 1 1 102
2 IN 0 0 92
3 MI 1 1 83
4 OH 0 0 88
5 WI 2 0 72
Problem C - Part I
# For each county name, count the distinct states it appears in,
# listing the most widely shared names first.
midwest %>%
  group_by(county) %>%
  summarize(x = n_distinct(state)) %>%
  # Largest number of distinct states at the top
  arrange(desc(x)) %>%
  # Return a plain (ungrouped) tibble
  ungroup()
# A tibble: 320 × 2
county x
<chr> <int>
1 CRAWFORD 5
2 JACKSON 5
3 MONROE 5
4 ADAMS 4
5 BROWN 4
6 CLARK 4
7 CLINTON 4
8 JEFFERSON 4
9 LAKE 4
10 WASHINGTON 4
# ℹ 310 more rows
Problem C - Part II
# Count how many rows of midwest carry each county name.
midwest %>%
  group_by(county) %>%
  # n() is the number of rows in the current county group
  summarize(x = n()) %>%
  # Return a plain (ungrouped) tibble
  ungroup()
# A tibble: 320 × 2
county x
<chr> <int>
1 ADAMS 4
2 ALCONA 1
3 ALEXANDER 1
4 ALGER 1
5 ALLEGAN 1
6 ALLEN 2
7 ALPENA 1
8 ANTRIM 1
9 ARENAC 1
10 ASHLAND 2
# ℹ 310 more rows
Problem C - Part III
# Count the distinct county values inside each county group.
# Each group holds a single county value, so x is 1 on every row.
midwest %>%
  group_by(county) %>%
  summarize(x = n_distinct(county)) %>%
  # Return a plain (ungrouped) tibble
  ungroup()
# A tibble: 320 × 2
county x
<chr> <int>
1 ADAMS 1
2 ALCONA 1
3 ALEXANDER 1
4 ALGER 1
5 ALLEGAN 1
6 ALLEN 1
7 ALPENA 1
8 ANTRIM 1
9 ARENAC 1
10 ASHLAND 1
# ℹ 310 more rows
Problem D
# Per-clarity summary of diamonds: distinct colors, distinct prices, row count.
diamonds %>%
  group_by(clarity) %>%
  summarize(
    # Number of different color values seen for this clarity
    a = n_distinct(color),
    # Number of different price values seen for this clarity
    b = n_distinct(price),
    # Number of rows (diamonds) with this clarity
    c = n()
  ) %>%
  # Return a plain (ungrouped) tibble
  ungroup()
# Mean and standard deviation of price for every color/cut combination.
diamonds %>%
  # Group by color first, then by cut within each color
  group_by(color, cut) %>%
  summarize(
    m = mean(price), # mean price in the group
    s = sd(price),   # standard deviation of price in the group
    # "drop" removes all grouping from the result, so no trailing
    # ungroup() is needed
    .groups = "drop"
  )
# A tibble: 35 × 4
color cut m s
<ord> <ord> <dbl> <dbl>
1 D Fair 4291. 3286.
2 D Good 3405. 3175.
3 D Very Good 3470. 3524.
4 D Premium 3631. 3712.
5 D Ideal 2629. 3001.
6 E Fair 3682. 2977.
7 E Good 3424. 3331.
8 E Very Good 3215. 3408.
9 E Premium 3539. 3795.
10 E Ideal 2598. 2956.
# ℹ 25 more rows
Problem E Part-II
# Mean and standard deviation of price for every cut/color combination.
diamonds %>%
  # Group by cut first, then by color within each cut
  group_by(cut, color) %>%
  summarize(
    m = mean(price), # mean price in the group
    s = sd(price),   # standard deviation of price in the group
    # "drop" removes all grouping from the result, so no trailing
    # ungroup() is needed
    .groups = "drop"
  )
# A tibble: 35 × 4
cut color m s
<ord> <ord> <dbl> <dbl>
1 Fair D 4291. 3286.
2 Fair E 3682. 2977.
3 Fair F 3827. 3223.
4 Fair G 4239. 3610.
5 Fair H 5136. 3886.
6 Fair I 4685. 3730.
7 Fair J 4976. 4050.
8 Good D 3405. 3175.
9 Good E 3424. 3331.
10 Good F 3496. 3202.
# ℹ 25 more rows
Problem E Part-III
# Price statistics for every cut/color/clarity combination, plus a
# discounted mean price.
diamonds %>%
  # Group by cut, then color, then clarity
  group_by(cut, color, clarity) %>%
  summarize(
    m = mean(price),  # mean price in the group
    s = sd(price),    # standard deviation of price in the group
    msale = m * 0.80, # 80% of the group's mean price (uses m from above)
    # "drop" removes all grouping from the result, so no trailing
    # ungroup() is needed
    .groups = "drop"
  )
# A tibble: 276 × 6
cut color clarity m s msale
<ord> <ord> <ord> <dbl> <dbl> <dbl>
1 Fair D I1 7383 5899. 5906.
2 Fair D SI2 4355. 3260. 3484.
3 Fair D SI1 4273. 3019. 3419.
4 Fair D VS2 4513. 3383. 3610.
5 Fair D VS1 2921. 2550. 2337.
6 Fair D VVS2 3607 3629. 2886.
7 Fair D VVS1 4473 5457. 3578.
8 Fair D IF 1620. 525. 1296.
9 Fair E I1 2095. 824. 1676.
10 Fair E SI2 4172. 3055. 3338.
# ℹ 266 more rows
Problem F
# Per-cut summary of diamonds; later columns reuse earlier ones, so the
# order of the arguments inside summarize() matters.
diamonds %>%
  group_by(cut) %>%
  summarize(
    potato = mean(depth),        # mean depth for the cut
    pizza = mean(price),         # mean price for the cut
    popcorn = median(y),         # median of y for the cut
    pineapple = potato - pizza,  # mean depth minus mean price
    papaya = pineapple^2,        # square of pineapple
    peach = n()                  # number of rows (diamonds) with this cut
  ) %>%
  # Return a plain (ungrouped) tibble
  ungroup()
# Mean price per color, then two extra columns, ungrouping as the last step.
diamonds %>%
  group_by(color) %>%
  summarize(m = mean(price)) %>%
  mutate(
    # Label text: "Diamond color " followed by the color value
    x1 = str_c("Diamond color ", color),
    # Constant column
    x2 = 5
  ) %>%
  # Return a plain (ungrouped) tibble
  ungroup()
# A tibble: 7 × 4
color m x1 x2
<ord> <dbl> <chr> <dbl>
1 D 3170. Diamond color D 5
2 E 3077. Diamond color E 5
3 F 3725. Diamond color F 5
4 G 3999. Diamond color G 5
5 H 4487. Diamond color H 5
6 I 5092. Diamond color I 5
7 J 5324. Diamond color J 5
Problem G Part II
# Mean price per color; here the ungrouping happens before the new columns
# are added.
diamonds %>%
  group_by(color) %>%
  summarize(m = mean(price)) %>%
  ungroup() %>%
  mutate(
    # Label text: "Diamond color " followed by the color value
    x1 = str_c("Diamond color ", color),
    # Constant column
    x2 = 5
  )
# A tibble: 7 × 4
color m x1 x2
<ord> <dbl> <chr> <dbl>
1 D 3170. Diamond color D 5
2 E 3077. Diamond color E 5
3 F 3725. Diamond color F 5
4 G 3999. Diamond color G 5
5 H 4487. Diamond color H 5
6 I 5092. Diamond color I 5
7 J 5324. Diamond color J 5
Problem H Part I
# Mean of the halved price within each color.
diamonds %>%
  group_by(color) %>%
  # x1 is half of each diamond's price
  mutate(x1 = price * 0.5) %>%
  # Still grouped by color here, so one mean per color
  summarize(m = mean(x1)) %>%
  # Return a plain (ungrouped) tibble
  ungroup()
# A tibble: 7 × 2
color m
<ord> <dbl>
1 D 1585.
2 E 1538.
3 F 1862.
4 G 2000.
5 H 2243.
6 I 2546.
7 J 2662.
Problem H Part II
# Mean of the halved price over the whole dataset.
diamonds %>%
  group_by(color) %>%
  # x1 is half of each diamond's price
  mutate(x1 = price * 0.5) %>%
  # Grouping is removed before summarizing ...
  ungroup() %>%
  # ... so this yields a single overall mean rather than one per color
  summarize(m = mean(x1))
# A tibble: 1 × 1
m
<dbl>
1 1966.
6.7 Extra Practice
1. View all of the variable names in diamonds (hint: View()).
# Open the diamonds dataset in the interactive data viewer
view(diamonds)
# Print the names of all variables (columns) in diamonds
names(diamonds)
# Sort diamonds from lowest to highest price.
diamonds %>%
  arrange(price)
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
# ℹ 53,930 more rows
Highest to lowest price (hint: arrange(), desc())
# Sort diamonds from highest to lowest price.
diamonds %>%
  arrange(desc(price))
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 2.29 Premium I VS2 60.8 60 18823 8.5 8.47 5.16
2 2 Very Good G SI1 63.5 56 18818 7.9 7.97 5.04
3 1.51 Ideal G IF 61.7 55 18806 7.37 7.41 4.56
4 2.07 Ideal G SI2 62.5 55 18804 8.2 8.13 5.11
5 2 Very Good H SI1 62.8 57 18803 7.95 8 5.01
6 2.29 Premium I SI1 61.8 59 18797 8.52 8.45 5.24
7 2.04 Premium H SI1 58.1 60 18795 8.37 8.28 4.84
8 2 Premium I VS1 60.8 59 18795 8.13 8.02 4.91
9 1.71 Premium F VS2 62.3 59 18791 7.57 7.53 4.7
10 2.15 Ideal G SI2 62.6 54 18791 8.29 8.35 5.21
# ℹ 53,930 more rows
Lowest price and cut
# Sort diamonds by cut (ascending factor order) and, within each cut,
# from lowest to highest price.
# A single arrange() call states the key priority explicitly instead of
# relying on a second sort preserving the order produced by a first one.
diamonds %>%
  arrange(cut, price)
# Sort diamonds by cut (descending factor order) and, within each cut,
# from highest to lowest price.
# A single arrange() call states the key priority explicitly instead of
# relying on a second sort preserving the order produced by a first one.
diamonds %>%
  arrange(desc(cut), desc(price))
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 1.51 Ideal G IF 61.7 55 18806 7.37 7.41 4.56
2 2.07 Ideal G SI2 62.5 55 18804 8.2 8.13 5.11
3 2.15 Ideal G SI2 62.6 54 18791 8.29 8.35 5.21
4 2.05 Ideal G SI1 61.9 57 18787 8.1 8.16 5.03
5 1.6 Ideal F VS1 62 56 18780 7.47 7.52 4.65
6 2.06 Ideal I VS2 62.2 55 18779 8.15 8.19 5.08
7 1.71 Ideal G VVS2 62.1 55 18768 7.66 7.63 4.75
8 2.08 Ideal H SI1 58.7 60 18760 8.36 8.4 4.92
9 2.03 Ideal G SI1 60 55.8 18757 8.17 8.3 4.95
10 2.61 Ideal I SI2 62.1 56 18756 8.85 8.73 5.46
# ℹ 53,930 more rows
3. Arrange the diamonds by lowest to highest price and worst to best clarity.
# Sort diamonds by ascending price; rows with equal price are ordered by
# ascending clarity.
diamonds %>%
  arrange(price, clarity)
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
# ℹ 53,930 more rows
4. Create a new variable named salePrice to reflect a discount of $250 off of the original cost of each diamond (hint: mutate()).
# Add salePrice: the original price with a flat $250 discount applied.
diamonds %>%
  mutate(salePrice = price - 250)
# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z salePrice
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 76
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 76
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 77
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 84
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 85
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 86
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 86
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 87
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 87
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 88
# ℹ 53,930 more rows
5. Remove the x, y, and z variables from the diamonds dataset (hint: select()).
# Keep every column of diamonds except x, y and z.
diamonds %>%
  select(-c(x, y, z))
# A tibble: 53,940 × 7
carat cut color clarity depth table price
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int>
1 0.23 Ideal E SI2 61.5 55 326
2 0.21 Premium E SI1 59.8 61 326
3 0.23 Good E VS1 56.9 65 327
4 0.29 Premium I VS2 62.4 58 334
5 0.31 Good J SI2 63.3 58 335
6 0.24 Very Good J VVS2 62.8 57 336
7 0.24 Very Good I VVS1 62.3 57 336
8 0.26 Very Good H SI1 61.9 55 337
9 0.22 Fair E VS2 65.1 61 337
10 0.23 Very Good H VS1 59.4 61 338
# ℹ 53,930 more rows
6. Determine the number of diamonds there are for each cut value (hint: group_by(), summarize()).
# Number of diamonds for each cut value.
diamonds %>%
  group_by(cut) %>%
  # n() is the number of rows in the current cut group
  summarize(total = n()) %>%
  # Return a plain (ungrouped) tibble
  ungroup()
# A tibble: 5 × 2
cut total
<ord> <int>
1 Fair 1610
2 Good 4906
3 Very Good 12082
4 Premium 13791
5 Ideal 21551
7. Create a new column named totalNum that calculates the total number of diamonds.
# Add totalNum: the total number of rows in diamonds, repeated on every row
# (the data is not grouped, so n() counts the whole dataset).
diamonds %>%
  mutate(totalNum = n())
# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z totalNum
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 53940
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 53940
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 53940
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 53940
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 53940
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 53940
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 53940
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 53940
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 53940
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 53940
# ℹ 53,930 more rows
Research Method
Generate a good question and a bad question about the diamonds dataset.
Good Question
What is the average price of diamonds in the dataset, and how does it vary across different cut qualities?
Bad Question
What is the most interesting thing about the diamond dataset?