suppressPackageStartupMessages(library(tidyverse))
View(diamonds)
Research Methods & Data Analysis Workbook
Introduction
This workbook allows me to keep my weekly notes and activities in an organised format which is accessible and hopefully reproducible.
Formatting Basics (Week 1-2)
Points can be emphasized by italics and bold
- and formatted in bullet points too!
Pictures can also be inserted…
For important notes these different boxes can be used (these are easy to find in the contents list)
E.g. It’s always better to be in the ‘source window’. But when inserting pictures I find this much simpler to do from the ‘visual window’.
This type of box could be used to show how to draw attention to something I’m struggling with etc
Week 3
Key Notes
- Conflicts occur when different packages use the same labeling/names BUT function varies
- By default, R uses the definition from the package that was loaded most recently
Pay attention to which packages are loaded in the session (including any that are attached by the packages you load yourself), as these will override/mask functions of the pre-loaded base packages if conflicts occur
Can check functions using ? function_name
install.packages("tidyverse")
I find the simplest way is to go to the packages tab and tick the left side box (loading needs to take place each time you reopen Rstudio)
Or can do library(package_name)
If you want a particular function only from a package you can do package_name::function_name()
Tidyverse Intro
https://bookdown.org/yih_huynh/Guide-to-R-Book/
- Includes dplyr and ggplot2 packages
- In tidyverse dplyr and stats (preloaded) packages have conflicts for filter()
Diamonds dataset
- Loaded with ggplot2 (which is loaded in tidyverse)
str(diamonds)
# ?diamonds
names(diamonds)
Data Management
# adding new columns/altering current variables
Week 3 Post Session Exercise
6.6.1 Basic Data Management Functions
In this exercise I am practicing basic data management functions and labeling their usage
library(tidyverse)
## Problem A
midwest %>%
  # Work state by state: every summary below is computed within one state.
  group_by(state) %>%
  # Reduce each state's counties to a single row of population statistics.
  summarize(
    # Mean of the county total populations.
    poptotalmean = mean(poptotal),
    # Median of the county total populations.
    poptotalmed = median(poptotal),
    # Largest county total population.
    popmax = max(poptotal),
    # Smallest county total population.
    popmin = min(poptotal),
    # Number of distinct total-population values; this equals the number of
    # counties only when no two counties share the same total.
    popdistinct = n_distinct(poptotal),
    # Total population of the first county listed, in dataset order.
    popfirst = first(poptotal),
    # TRUE when at least one county has fewer than 5,000 people.
    popany = any(poptotal < 5000),
    # TRUE when at least one county has more than 2,000,000 people.
    popany2 = any(poptotal > 2000000)
  ) %>%
  # Remove any grouping still attached to the summary table.
  ungroup()
# A tibble: 5 × 9
state poptotalmean poptotalmed popmax popmin popdistinct popfirst popany
<chr> <dbl> <dbl> <int> <int> <int> <int> <lgl>
1 IL 112065. 24486. 5105067 4373 101 66090 TRUE
2 IN 60263. 30362. 797159 5315 92 31095 FALSE
3 MI 111992. 37308 2111687 1701 83 10145 TRUE
4 OH 123263. 54930. 1412140 11098 88 25371 FALSE
5 WI 67941. 33528 959275 3890 72 15682 TRUE
# ℹ 1 more variable: popany2 <lgl>
## Problem B
midwest %>%
  # One group per state.
  group_by(state) %>%
  # One row per state. sum() over a logical condition counts how many
  # counties satisfy it, because TRUE counts as 1 and FALSE as 0.
  summarize(
    # Counties with a total population below 5,000.
    num5k = sum(poptotal < 5000),
    # Counties with a total population above 2 million.
    num2mil = sum(poptotal > 2000000),
    # Number of rows (counties) in the state.
    numrows = n()
  ) %>%
  # Remove any grouping still attached to the summary table.
  ungroup()
# A tibble: 5 × 4
state num5k num2mil numrows
<chr> <int> <int> <int>
1 IL 1 1 102
2 IN 0 0 92
3 MI 1 1 83
4 OH 0 0 88
5 WI 2 0 72
## Problem C
# Part 1
# It seems unusual for a county to have more than one associated state, but this is most likely because county names are not unique: the same name is reused by counties in different states (a data error is also possible)
midwest %>%
  # One group per county name.
  group_by(county) %>%
  # For each county name, count how many different states it appears in.
  summarize(x = n_distinct(state)) %>%
  # Put the county names shared by the most states at the top.
  arrange(desc(x)) %>%
  # Remove any grouping still attached to the result.
  ungroup()
# A tibble: 320 × 2
county x
<chr> <int>
1 CRAWFORD 5
2 JACKSON 5
3 MONROE 5
4 ADAMS 4
5 BROWN 4
6 CLARK 4
7 CLINTON 4
8 JEFFERSON 4
9 LAKE 4
10 WASHINGTON 4
# ℹ 310 more rows
##Problem C
# Part 2
# n() function counts the number of rows whereas n_distinct() counts the number of *distinct* values, they would be the same if all rows had different values
midwest %>%
  # One group per county name.
  group_by(county) %>%
  # One row per county name, holding the number of rows that carry it.
  summarize(x = n()) %>%
  # Remove any grouping still attached to the result.
  ungroup()
# A tibble: 320 × 2
county x
<chr> <int>
1 ADAMS 4
2 ALCONA 1
3 ALEXANDER 1
4 ALGER 1
5 ALLEGAN 1
6 ALLEN 2
7 ALPENA 1
8 ANTRIM 1
9 ARENAC 1
10 ASHLAND 2
# ℹ 310 more rows
## Problem C
# Part 3
midwest %>%
  # One group per county name.
  group_by(county) %>%
  # Count the distinct county names inside each county group. Every group
  # holds exactly one name, so x is always 1.
  summarize(x = n_distinct(county)) %>%
  # Remove any grouping still attached to the result.
  ungroup()
# A tibble: 320 × 2
county x
<chr> <int>
1 ADAMS 1
2 ALCONA 1
3 ALEXANDER 1
4 ALGER 1
5 ALLEGAN 1
6 ALLEN 1
7 ALPENA 1
8 ANTRIM 1
9 ARENAC 1
10 ASHLAND 1
# ℹ 310 more rows
## Problem D
diamonds %>%
  # One group per clarity level.
  group_by(clarity) %>%
  # One row per clarity level.
  summarize(
    # Number of different colors found at this clarity level.
    a = n_distinct(color),
    # Number of different prices found at this clarity level.
    b = n_distinct(price),
    # Number of diamonds (rows) at this clarity level.
    c = n()
  ) %>%
  # Remove any grouping still attached to the result.
  ungroup()
# A tibble: 8 × 4
clarity a b c
<ord> <int> <int> <int>
1 I1 7 632 741
2 SI2 7 4904 9194
3 SI1 7 5380 13065
4 VS2 7 5051 12258
5 VS1 7 3926 8171
6 VVS2 7 2409 5066
7 VVS1 7 1623 3655
8 IF 7 902 1790
## Problem E
# Part 1
diamonds %>%
  # Group by color first, then by cut within each color.
  group_by(color, cut) %>%
  # One row per color/cut combination.
  summarize(
    # Mean price of the diamonds in the combination.
    m = mean(price),
    # Standard deviation of their prices.
    s = sd(price)
  ) %>%
  # The summary is still grouped by color (see the console message below),
  # so ungroup() is what returns a plain, ungrouped table.
  ungroup()
# Console message printed when this chunk runs:
# `summarise()` has grouped output by 'color'. You can override using the
# `.groups` argument.
# A tibble: 35 × 4
color cut m s
<ord> <ord> <dbl> <dbl>
1 D Fair 4291. 3286.
2 D Good 3405. 3175.
3 D Very Good 3470. 3524.
4 D Premium 3631. 3712.
5 D Ideal 2629. 3001.
6 E Fair 3682. 2977.
7 E Good 3424. 3331.
8 E Very Good 3215. 3408.
9 E Premium 3539. 3795.
10 E Ideal 2598. 2956.
# ℹ 25 more rows
## Problem E
# Part 2
# does the same thing but groups by cut first then by colour so table rearranged differently
diamonds %>%
  # Same summary as part 1, but grouped by cut first and then by color,
  # so the rows come out ordered by cut and then color.
  group_by(cut, color) %>%
  # One row per cut/color combination.
  summarize(
    # Mean price of the diamonds in the combination.
    m = mean(price),
    # Standard deviation of their prices.
    s = sd(price)
  ) %>%
  # The summary is still grouped by cut (see the console message below),
  # so ungroup() is what returns a plain, ungrouped table.
  ungroup()
# Console message printed when this chunk runs:
# `summarise()` has grouped output by 'cut'. You can override using the `.groups`
# argument.
# A tibble: 35 × 4
cut color m s
<ord> <ord> <dbl> <dbl>
1 Fair D 4291. 3286.
2 Fair E 3682. 2977.
3 Fair F 3827. 3223.
4 Fair G 4239. 3610.
5 Fair H 5136. 3886.
6 Fair I 4685. 3730.
7 Fair J 4976. 4050.
8 Good D 3405. 3175.
9 Good E 3424. 3331.
10 Good F 3496. 3202.
# ℹ 25 more rows
## Problem E
# Part 3
# msale is the sale price: 80% of the mean price, i.e. each combination is sold at 20% below its mean price
diamonds %>%
  # Group by cut, then color within cut, then clarity within color.
  group_by(cut, color, clarity) %>%
  # One row per cut/color/clarity combination.
  summarize(
    # Mean price of the diamonds in the combination.
    m = mean(price),
    # Standard deviation of their prices.
    s = sd(price),
    # 80% of the mean price computed just above (a 20% reduction).
    msale = m * 0.80
  ) %>%
  # The summary is still grouped by cut and color (see the console message
  # below), so ungroup() is what returns a plain, ungrouped table.
  ungroup()
# Console message printed when this chunk runs:
# `summarise()` has grouped output by 'cut', 'color'. You can override using the
# `.groups` argument.
# A tibble: 276 × 6
cut color clarity m s msale
<ord> <ord> <ord> <dbl> <dbl> <dbl>
1 Fair D I1 7383 5899. 5906.
2 Fair D SI2 4355. 3260. 3484.
3 Fair D SI1 4273. 3019. 3419.
4 Fair D VS2 4513. 3383. 3610.
5 Fair D VS1 2921. 2550. 2337.
6 Fair D VVS2 3607 3629. 2886.
7 Fair D VVS1 4473 5457. 3578.
8 Fair D IF 1620. 525. 1296.
9 Fair E I1 2095. 824. 1676.
10 Fair E SI2 4172. 3055. 3338.
# ℹ 266 more rows
## Problem F
diamonds %>%
  # One group per cut level.
  group_by(cut) %>%
  # One row per cut level. Later summaries reuse the ones defined before
  # them in the same call.
  summarize(
    # Mean depth of the diamonds with this cut.
    potato = mean(depth),
    # Mean price of the diamonds with this cut.
    pizza = mean(price),
    # Median of the y column (ggplot2 documents y as the width in mm).
    popcorn = median(y),
    # Mean depth minus mean price.
    pineapple = potato - pizza,
    # That difference squared.
    papaya = pineapple ^ 2,
    # Number of diamonds (rows) with this cut.
    peach = n()
  ) %>%
  # Remove any grouping still attached to the result.
  ungroup()
# A tibble: 5 × 7
cut potato pizza popcorn pineapple papaya peach
<ord> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 Fair 64.0 4359. 6.1 -4295. 18444586. 1610
2 Good 62.4 3929. 5.99 -3866. 14949811. 4906
3 Very Good 61.8 3982. 5.77 -3920. 15365942. 12082
4 Premium 61.3 4584. 6.06 -4523. 20457466. 13791
5 Ideal 61.7 3458. 5.26 -3396. 11531679. 21551
## Problem G
# Part 1
diamonds %>%
  # One group per color.
  group_by(color) %>%
  # One row per color, holding its mean price.
  summarize(m = mean(price)) %>%
  # Add two columns to the summary table.
  mutate(
    # A descriptive label: the text "Diamond color " followed by the color.
    x1 = str_c("Diamond color ", color),
    # A constant column holding 5 on every row.
    x2 = 5
  ) %>%
  # Remove any grouping still attached to the result.
  ungroup()
# A tibble: 7 × 4
color m x1 x2
<ord> <dbl> <chr> <dbl>
1 D 3170. Diamond color D 5
2 E 3077. Diamond color E 5
3 F 3725. Diamond color F 5
4 G 3999. Diamond color G 5
5 H 4487. Diamond color H 5
6 I 5092. Diamond color I 5
7 J 5324. Diamond color J 5
## Problem G
# Part 2
# No ungroup() is needed at the end because the data are not regrouped after the earlier ungroup(). Ungrouping before mutate() made no difference here: x2 is the same value (5) on every row and x1 is built row by row from color, so neither new column depends on the grouping
diamonds %>%
  # One group per color.
  group_by(color) %>%
  # One row per color, holding its mean price.
  summarize(m = mean(price)) %>%
  # Drop the grouping before the new columns are added.
  ungroup() %>%
  # Add two columns to the now-ungrouped summary table.
  mutate(
    # A descriptive label: the text "Diamond color " followed by the color.
    x1 = str_c("Diamond color ", color),
    # A constant column holding 5 on every row.
    x2 = 5
  )
# A tibble: 7 × 4
color m x1 x2
<ord> <dbl> <chr> <dbl>
1 D 3170. Diamond color D 5
2 E 3077. Diamond color E 5
3 F 3725. Diamond color F 5
4 G 3999. Diamond color G 5
5 H 4487. Diamond color H 5
6 I 5092. Diamond color I 5
7 J 5324. Diamond color J 5
## Problem H
## part 1
diamonds %>%
  # One group per color.
  group_by(color) %>%
  # Add a column holding half of each diamond's price.
  mutate(x1 = price * 0.5) %>%
  # One row per color: the mean of the halved prices.
  summarize(m = mean(x1)) %>%
  # Remove any grouping still attached to the result.
  ungroup()
# A tibble: 7 × 2
color m
<ord> <dbl>
1 D 1585.
2 E 1538.
3 F 1862.
4 G 2000.
5 H 2243.
6 I 2546.
7 J 2662.