penguins <- read.csv("~/Library/CloudStorage/OneDrive-NottinghamTrentUniversity/Quarto R/penguins.csv")Eleanor Salisbury Workbook
Showcasing my work from my Research Methods and Data Analysis module
Tutorial 1
Click to expand Tutorial 1
Uploaded penguins data onto R
Displaying data
head(penguins) X species island bill_length_mm bill_depth_mm flipper_length_mm
1 1 Adelie Torgersen 39.1 18.7 181
2 2 Adelie Torgersen 39.5 17.4 186
3 3 Adelie Torgersen 40.3 18.0 195
4 4 Adelie Torgersen NA NA NA
5 5 Adelie Torgersen 36.7 19.3 193
6 6 Adelie Torgersen 39.3 20.6 190
body_mass_g sex year
1 3750 male 2007
2 3800 female 2007
3 3250 female 2007
4 NA <NA> 2007
5 3450 female 2007
6 3650 male 2007
library(knitr)
kable(head(penguins))| X | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year |
|---|---|---|---|---|---|---|---|---|
| 1 | Adelie | Torgersen | 39.1 | 18.7 | 181 | 3750 | male | 2007 |
| 2 | Adelie | Torgersen | 39.5 | 17.4 | 186 | 3800 | female | 2007 |
| 3 | Adelie | Torgersen | 40.3 | 18.0 | 195 | 3250 | female | 2007 |
| 4 | Adelie | Torgersen | NA | NA | NA | NA | NA | 2007 |
| 5 | Adelie | Torgersen | 36.7 | 19.3 | 193 | 3450 | female | 2007 |
| 6 | Adelie | Torgersen | 39.3 | 20.6 | 190 | 3650 | male | 2007 |
Tutorial 2
Click to expand Tutorial 2
Basic Data Management
library(tidyverse)%>% allows manipulation of data (put at start and end)
mutate() adds new columns or modifies current variable in the dataset
recode() modifies the values within a variable, normally to fix inconsistent labelling (e.g. m, M and male change all to Male)
#data %>% mutate(Variable = recode(Variable, "old value" = "new value"))summarize() collapses all rows and returns a one-row summary
group_by() and ungroup() takes existing data and groups specific variables together for future operations (e.g. group by age and sex to compare females and males of a certain age)
filter() only retain specific rows of data that meet the specified requirements
#filter(variable == "1" | same variable == "2") ~ shows variables with 1 or 2
#filter(variable %in% c("1", "2")) ~ alternative method where c() combines values that want to be shown
#filter(variable <= number) ~ <= means at or under e.g. price
#filter(variable != "value") ~ != means show data without that valueselect() select only the columns (variables) that you want to see, gets rid of all other columns
can also retain all except for something (select(-(1:5)) or select(-x, -y, -z)) - take away columns 1 to 5 or take away x, y, z
can also rearrange columns (select(value.at.the.back, everything()))
arrange() arranges values within a variable in ascending or descending (arrange(desc())) order (numerical or alphabetical)
ifelse() turns numbers into categories
Click to expand ifelse() example
#ifelse(variable<numerical value, "category name for previous definition", "category name for everything else")
diamonds %>%
select(carat) %>%
mutate(carat_categ=ifelse(carat<2, "small", "big")) # A tibble: 53,940 × 2
carat carat_categ
<dbl> <chr>
1 0.23 small
2 0.21 small
3 0.23 small
4 0.29 small
5 0.31 small
6 0.24 small
7 0.24 small
8 0.26 small
9 0.22 small
10 0.23 small
# ℹ 53,930 more rows
pivot_longer() collapses wide format data with multiple columns into long format data with multiple rows (organize data better for graphs)
Click to expand pivot_longer() example
midwest %>%
select(county, state, poptotal, popwhite:popother) %>%
head(10) %>%
kable()| county | state | poptotal | popwhite | popblack | popamerindian | popasian | popother |
|---|---|---|---|---|---|---|---|
| ADAMS | IL | 66090 | 63917 | 1702 | 98 | 249 | 124 |
| ALEXANDER | IL | 10626 | 7054 | 3496 | 19 | 48 | 9 |
| BOND | IL | 14991 | 14477 | 429 | 35 | 16 | 34 |
| BOONE | IL | 30806 | 29344 | 127 | 46 | 150 | 1139 |
| BROWN | IL | 5836 | 5264 | 547 | 14 | 5 | 6 |
| BUREAU | IL | 35688 | 35157 | 50 | 65 | 195 | 221 |
| CALHOUN | IL | 5322 | 5298 | 1 | 8 | 15 | 0 |
| CARROLL | IL | 16805 | 16519 | 111 | 30 | 61 | 84 |
| CASS | IL | 13437 | 13384 | 16 | 8 | 23 | 6 |
| CHAMPAIGN | IL | 173025 | 146506 | 16559 | 331 | 8033 | 1596 |
midwest %>%
select(county, state, poptotal, popwhite:popother) %>%
pivot_longer(cols = popwhite:popother,
names_to = "Ethnicity",
values_to = "Population") %>%
head(10) %>%
kable()| county | state | poptotal | Ethnicity | Population |
|---|---|---|---|---|
| ADAMS | IL | 66090 | popwhite | 63917 |
| ADAMS | IL | 66090 | popblack | 1702 |
| ADAMS | IL | 66090 | popamerindian | 98 |
| ADAMS | IL | 66090 | popasian | 249 |
| ADAMS | IL | 66090 | popother | 124 |
| ALEXANDER | IL | 10626 | popwhite | 7054 |
| ALEXANDER | IL | 10626 | popblack | 3496 |
| ALEXANDER | IL | 10626 | popamerindian | 19 |
| ALEXANDER | IL | 10626 | popasian | 48 |
| ALEXANDER | IL | 10626 | popother | 9 |
pivot_wider() collapses long format data with multiple rows into wide format data with multiple columns (organize data better for graphs)
Click to expand pivot_wider() example
midwest %>%
select(county, state, poptotal) %>%
head(10) %>%
kable()| county | state | poptotal |
|---|---|---|
| ADAMS | IL | 66090 |
| ALEXANDER | IL | 10626 |
| BOND | IL | 14991 |
| BOONE | IL | 30806 |
| BROWN | IL | 5836 |
| BUREAU | IL | 35688 |
| CALHOUN | IL | 5322 |
| CARROLL | IL | 16805 |
| CASS | IL | 13437 |
| CHAMPAIGN | IL | 173025 |
# Wide format: one row per county name, one column per state holding poptotal.
# A county name that does not occur in a given state is filled with NA.
# (The former mutate(row = row_number()) step was removed: the select() that
# followed it dropped the `row` column again, so it had no effect.)
midwest %>%
  select(county, state, poptotal) %>%
  pivot_wider(names_from = state,
              values_from = poptotal) %>%
  head(10) %>%
  kable()
| county | IL | IN | MI | OH | WI |
|---|---|---|---|---|---|
| ADAMS | 66090 | 31095 | NA | 25371 | 15682 |
| ALEXANDER | 10626 | NA | NA | NA | NA |
| BOND | 14991 | NA | NA | NA | NA |
| BOONE | 30806 | 38147 | NA | NA | NA |
| BROWN | 5836 | 14080 | NA | 34966 | 194594 |
| BUREAU | 35688 | NA | NA | NA | NA |
| CALHOUN | 5322 | NA | 135982 | NA | NA |
| CARROLL | 16805 | 18809 | NA | 26521 | NA |
| CASS | 13437 | 38413 | 49477 | NA | NA |
| CHAMPAIGN | 173025 | NA | NA | 36019 | NA |
na.omit() removes rows that contain missing values (NA)
Data Analyses Exercise
6.6.1 Exercises
Click to expand 6.6.1 Exercises
#Problem A
midwest %>% # utilizes the midwest dataset
group_by(state) %>% # groups data by state
summarize(poptotalmean = mean(poptotal), # summarizes data (average population of the state)
poptotalmed = median(poptotal), # middle population value of the state
popmax = max(poptotal), # biggest population of the state
popmin = min(poptotal), # smallest population of the state
popdistinct = n_distinct(poptotal), # how many unique population values are present for each state
popfirst = first(poptotal), # first population value of the state
popany = any(poptotal < 5000), # is any of the population below 5000 in the state
popany2 = any(poptotal > 2000000)) %>% # is any of the population above 2000000 in the state
ungroup() %>% # final ungrouping of data
kable() # creates table for html| state | poptotalmean | poptotalmed | popmax | popmin | popdistinct | popfirst | popany | popany2 |
|---|---|---|---|---|---|---|---|---|
| IL | 112064.73 | 24486.5 | 5105067 | 4373 | 101 | 66090 | TRUE | TRUE |
| IN | 60262.60 | 30362.5 | 797159 | 5315 | 92 | 31095 | FALSE | FALSE |
| MI | 111991.53 | 37308.0 | 2111687 | 1701 | 83 | 10145 | TRUE | TRUE |
| OH | 123262.67 | 54929.5 | 1412140 | 11098 | 88 | 25371 | FALSE | FALSE |
| WI | 67941.24 | 33528.0 | 959275 | 3890 | 72 | 15682 | TRUE | FALSE |
#Problem B
midwest %>%
group_by(state) %>%
summarize(num5k = sum(poptotal < 5000), # number of rows in each state with a population below 5000 (sum() of a logical counts the TRUE values)
num2mil = sum(poptotal > 2000000), # number of rows in each state with a population above 2000000
numrows = n()) %>% # total number of rows (one per county entry) per state
ungroup () %>%
kable()| state | num5k | num2mil | numrows |
|---|---|---|---|
| IL | 1 | 1 | 102 |
| IN | 0 | 0 | 92 |
| MI | 1 | 1 | 83 |
| OH | 0 | 0 | 88 |
| WI | 2 | 0 | 72 |
#Problem C ~ Part I
midwest %>%
group_by(county) %>% # group data by county
summarize(x = n_distinct(state)) %>% # summarise how many state values exist for each county
arrange(desc(x)) %>% # arrange x data in descending order
ungroup()# A tibble: 320 × 2
county x
<chr> <int>
1 CRAWFORD 5
2 JACKSON 5
3 MONROE 5
4 ADAMS 4
5 BROWN 4
6 CLARK 4
7 CLINTON 4
8 JEFFERSON 4
9 LAKE 4
10 WASHINGTON 4
# ℹ 310 more rows
#Problem C ~ Part II
midwest %>%
group_by(county) %>%
summarize(x = n()) %>% # the number of values that exist per county
ungroup()# A tibble: 320 × 2
county x
<chr> <int>
1 ADAMS 4
2 ALCONA 1
3 ALEXANDER 1
4 ALGER 1
5 ALLEGAN 1
6 ALLEN 2
7 ALPENA 1
8 ANTRIM 1
9 ARENAC 1
10 ASHLAND 2
# ℹ 310 more rows
#Problem C ~ Part III
midwest %>%
group_by(county) %>%
summarize(x = n_distinct(county)) %>% # the number of unique county values that exist per county (which will always be 1 if grouped by county)
ungroup()# A tibble: 320 × 2
county x
<chr> <int>
1 ADAMS 1
2 ALCONA 1
3 ALEXANDER 1
4 ALGER 1
5 ALLEGAN 1
6 ALLEN 1
7 ALPENA 1
8 ANTRIM 1
9 ARENAC 1
10 ASHLAND 1
# ℹ 310 more rows
#Problem D
diamonds %>% # utilizes the diamonds dataset
group_by(clarity) %>% # groups diamonds data by clarity
summarize(a = n_distinct(color), # the number of color values per clarity
b = n_distinct(price), # the number of unique prices per clarity
c = n()) %>% # the number of values for clarity
ungroup() %>%
kable()| clarity | a | b | c |
|---|---|---|---|
| I1 | 7 | 632 | 741 |
| SI2 | 7 | 4904 | 9194 |
| SI1 | 7 | 5380 | 13065 |
| VS2 | 7 | 5051 | 12258 |
| VS1 | 7 | 3926 | 8171 |
| VVS2 | 7 | 2409 | 5066 |
| VVS1 | 7 | 1623 | 3655 |
| IF | 7 | 902 | 1790 |
#Problem E ~ Part I
diamonds %>%
group_by(color, cut) %>% # groups data by color and cut
summarize(m = mean(price), # summarises data for average price per color and cut
s = sd(price)) %>% # standard deviation for price per color and cut
ungroup() %>%
head() %>%
kable()| color | cut | m | s |
|---|---|---|---|
| D | Fair | 4291.061 | 3286.114 |
| D | Good | 3405.382 | 3175.149 |
| D | Very Good | 3470.467 | 3523.753 |
| D | Premium | 3631.293 | 3711.634 |
| D | Ideal | 2629.095 | 3001.070 |
| E | Fair | 3682.312 | 2976.652 |
#Problem E ~ Part II
diamonds %>%
group_by(cut, color) %>% # groups data by cut and color
summarize(m = mean(price), # summarises data for average price per cut and color
s = sd(price)) %>% # standard deviation for price per cut and color
ungroup() %>%
head() %>%
kable()| cut | color | m | s |
|---|---|---|---|
| Fair | D | 4291.061 | 3286.114 |
| Fair | E | 3682.312 | 2976.652 |
| Fair | F | 3827.003 | 3223.303 |
| Fair | G | 4239.255 | 3609.644 |
| Fair | H | 5135.683 | 3886.482 |
| Fair | I | 4685.446 | 3730.271 |
#Problem E ~ Part III
diamonds %>%
group_by(cut, color, clarity) %>% # groups data by cut, color and clarity
summarize(m = mean(price), # summarises data for average price per cut, color and clarity
s = sd(price), # standard deviation for price per cut, color and clarity
msale = m * 0.80) %>% # 20% off average price per cut, color and clarity
ungroup() %>%
head() %>%
kable()| cut | color | clarity | m | s | msale |
|---|---|---|---|---|---|
| Fair | D | I1 | 7383.000 | 5898.641 | 5906.400 |
| Fair | D | SI2 | 4355.143 | 3260.153 | 3484.114 |
| Fair | D | SI1 | 4273.345 | 3018.899 | 3418.676 |
| Fair | D | VS2 | 4512.880 | 3382.871 | 3610.304 |
| Fair | D | VS1 | 2921.200 | 2549.931 | 2336.960 |
| Fair | D | VVS2 | 3607.000 | 3628.604 | 2885.600 |
6.7 Extra Practice
Click to expand 6.7 Extra Practice
#Practice 2A
diamonds %>% # utilises the diamonds dataset
arrange(price) %>% # arranges price values in ascending order
select(price) # selects only the price column to view# A tibble: 53,940 × 1
price
<int>
1 326
2 326
3 327
4 334
5 335
6 336
7 336
8 337
9 337
10 338
# ℹ 53,930 more rows
#Practice 2B
diamonds %>%
arrange(desc(price)) %>% # arranges price values in descending order
select(price) # A tibble: 53,940 × 1
price
<int>
1 18823
2 18818
3 18806
4 18804
5 18803
6 18797
7 18795
8 18795
9 18791
10 18791
# ℹ 53,930 more rows
#Practice 2C
diamonds %>%
arrange(price) %>% # arranges price values in ascending order
arrange(cut) %>% # arranges cut values in ascending order
select(price, cut) # selects price and cut columns to view # A tibble: 53,940 × 2
price cut
<int> <ord>
1 337 Fair
2 361 Fair
3 369 Fair
4 371 Fair
5 416 Fair
6 496 Fair
7 497 Fair
8 527 Fair
9 536 Fair
10 563 Fair
# ℹ 53,930 more rows
#Practice 2D
diamonds %>%
arrange(desc(price)) %>% # arranges price values in descending order
arrange(desc(cut)) %>% # arranges cut values in descending order
select(price, cut) # selects price and cut columns to view# A tibble: 53,940 × 2
price cut
<int> <ord>
1 18806 Ideal
2 18804 Ideal
3 18791 Ideal
4 18787 Ideal
5 18780 Ideal
6 18779 Ideal
7 18768 Ideal
8 18760 Ideal
9 18757 Ideal
10 18756 Ideal
# ℹ 53,930 more rows
#Practice 3
diamonds %>%
arrange(price) %>% # arranges price in ascending order (lowest to highest)
arrange(clarity) %>% # arranges clarity from worst to best
select(price, clarity) # selects price and clarity columns to view# A tibble: 53,940 × 2
price clarity
<int> <ord>
1 345 I1
2 361 I1
3 394 I1
4 413 I1
5 413 I1
6 444 I1
7 452 I1
8 467 I1
9 468 I1
10 490 I1
# ℹ 53,930 more rows
#Practice 4
diamonds %>%
mutate(salePrice = price-250) %>% # creates new variable of salePrice where the price has $250 discount off
select(price, salePrice) # select price and salePrice columns to view# A tibble: 53,940 × 2
price salePrice
<int> <dbl>
1 326 76
2 326 76
3 327 77
4 334 84
5 335 85
6 336 86
7 336 86
8 337 87
9 337 87
10 338 88
# ℹ 53,930 more rows
#Practice 5
diamonds %>%
select(-x, -y, -z) %>% # removes x, y and z from the dataset to view
head() %>%
kable()| carat | cut | color | clarity | depth | table | price |
|---|---|---|---|---|---|---|
| 0.23 | Ideal | E | SI2 | 61.5 | 55 | 326 |
| 0.21 | Premium | E | SI1 | 59.8 | 61 | 326 |
| 0.23 | Good | E | VS1 | 56.9 | 65 | 327 |
| 0.29 | Premium | I | VS2 | 62.4 | 58 | 334 |
| 0.31 | Good | J | SI2 | 63.3 | 58 | 335 |
| 0.24 | Very Good | J | VVS2 | 62.8 | 57 | 336 |
#Practice 6
diamonds %>%
group_by(cut) %>% # groups data by cut
summarise(n()) # summarises number of values per cut# A tibble: 5 × 2
cut `n()`
<ord> <int>
1 Fair 1610
2 Good 4906
3 Very Good 12082
4 Premium 13791
5 Ideal 21551
#Practice 7
diamonds %>%
mutate(totalNum = n()) %>% # adds a new column showing total number of values / diamonds
select(totalNum) # selects only the totalNum column to be shown# A tibble: 53,940 × 1
totalNum
<int>
1 53940
2 53940
3 53940
4 53940
5 53940
6 53940
7 53940
8 53940
9 53940
10 53940
# ℹ 53,930 more rows
Research Method Exercise
Bad Research Question: Is the price of diamonds affected by its carat and cut?
Good Research Question: How does the carat weight and cut quality of diamonds influence their average price?
Out of curiosity, I chose to answer / visualise this question
Click to expand code and visualised tables
diamonds %>%
mutate(carat_categ=ifelse(carat<2, "small",
ifelse(carat>=2 & carat<3.5, "medium", "big"))) %>%
group_by(carat_categ, cut) %>%
summarise(avg_price = mean(price)) %>%
arrange(desc(carat_categ), cut) %>%
ungroup() %>%
kable(caption = "Average Price of Diamonds by Carat and Cut")| carat_categ | cut | avg_price |
|---|---|---|
| small | Fair | 3546.575 |
| small | Good | 3439.851 |
| small | Very Good | 3551.484 |
| small | Premium | 3931.582 |
| small | Ideal | 3163.888 |
| medium | Fair | 11757.386 |
| medium | Good | 14598.451 |
| medium | Very Good | 15101.960 |
| medium | Premium | 14901.474 |
| medium | Ideal | 15530.072 |
| big | Fair | 16386.500 |
| big | Very Good | 15984.000 |
| big | Premium | 16335.000 |
| big | Ideal | 12587.000 |
diamonds %>%
group_by(cut) %>%
summarise(avg_price = mean(price)) %>%
arrange(cut) %>%
ungroup() %>%
kable(caption = "Average Price of Diamonds by Cut")| cut | avg_price |
|---|---|
| Fair | 4358.758 |
| Good | 3928.864 |
| Very Good | 3981.760 |
| Premium | 4584.258 |
| Ideal | 3457.542 |
diamonds %>%
mutate(carat_categ=ifelse(carat<2, "small",
ifelse(carat>=2 & carat<3.5, "medium", "big"))) %>%
group_by(carat_categ) %>%
summarise(avg_price = mean(price)) %>%
arrange(desc(carat_categ)) %>%
ungroup() %>%
kable(caption = "Average Price of Diamonds by Carat")| carat_categ | avg_price |
|---|---|
| small | 3478.97 |
| medium | 14838.52 |
| big | 15945.70 |
Tutorial 3
Click to expand Tutorial 3
Graphics Data Exploration
library(ggplot2)2.1 Scatter Plots
Click to expand scatter plots
#plot(x values, y values)
plot(mtcars$wt, mtcars$mpg) ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() #geom identifies the graph type2.2 Line Graphs
Click to expand line graphs
# Base R: line graph of pressure against temperature
plot(pressure$temperature, pressure$pressure, type = "l")

# Base R: the same line graph with points, plus a second (halved) series in red
plot(pressure$temperature, pressure$pressure, type = "l")
points(pressure$temperature, pressure$pressure) # adds points to the line
lines(pressure$temperature, pressure$pressure/2, col = "red") # adds a new line on the graph and gives it red colour
points(pressure$temperature, pressure$pressure/2, col = "red") # puts points on this red line

# ggplot2: line graph
ggplot(pressure, aes(x = temperature, y = pressure)) +
  geom_line()

# ggplot2: line graph with points
ggplot(pressure, aes(x = temperature, y = pressure)) +
  geom_line() +
  geom_point()

# ggplot2: original series plus the halved series drawn in red
ggplot(pressure, aes(x = temperature)) +
  geom_line(aes(y = pressure)) +
  geom_point(aes(y = pressure)) +
  geom_line(aes(y = pressure / 2), color = "red") +
  # A fixed colour is set outside aes(). Inside aes() the string "red" is
  # treated as a data value, so the points get a default palette colour and a
  # legend instead of being drawn red.
  geom_point(aes(y = pressure / 2), color = "red")
2.3 Bar Graphs
Click to expand bar graphs
# barplot(y/height value, names.arg = x/label value)
barplot(BOD$demand, names.arg = BOD$Time) # table() counts how many times each unique value occurs e.g. 11 cases of value 4, 7 cases of value 6, 14 cases of value 8
barplot(table(mtcars$cyl)) # factor() turns Time into a categorical (discrete) label rather than a numeric (continuous) axis (which would include 6)
ggplot(BOD, aes(x = factor(Time), y = demand)) +
geom_col()# Bar graph of counts, with x = "cyl" and y = number of rows for each value of "cyl"
# factor() turns "cyl" into 4, 6 and 8 categories whereas without it 1-9 would show on the x value labels
ggplot(mtcars, aes(x = factor(cyl))) +
geom_bar()2.4 Histograms
Click to expand histograms
hist(mtcars$mpg)# breaks specifies the number of bins(intervals)
hist(mtcars$mpg, breaks = 10) # 30 bins by default so may need to change with 'binwidth'
ggplot(mtcars, aes(x=mpg)) +
geom_histogram() # bin width set to 4
ggplot(mtcars, aes(x=mpg)) +
geom_histogram(binwidth = 4) 2.5 Box Plots
Click to expand box plots
plot(ToothGrowth$supp, ToothGrowth$len)# for multiple x variables, boxplot(y ~ x+x2, data="data")
boxplot(len ~ supp+dose, data=ToothGrowth) ggplot(ToothGrowth, aes(x=supp,y=len))+
geom_boxplot()# interaction() combines variables for multiple x variables
ggplot(ToothGrowth, aes(x=interaction(supp,dose),y=len))+
geom_boxplot()2.6 Function Curves
Click to expand function curves
curve(x^3 - 5*x, from = -4, to = 4)# Plot a user-defined function
# Logistic (S-shaped) curve in xvar: equals 0.5 at xvar = 10 and approaches
# 0 for small xvar and 1 for large xvar. Vectorised over xvar.
myfun <- function(xvar) {
  1 / (1 + exp(10 - xvar))
}
curve(myfun(x), from = 0, to = 20)
# Add a line
curve(1 - myfun(x), add = TRUE, col = "red")# data.frame(x=c(0,20)) creates a data frame specifying 0 to 20 as the range for the x variable
# aes(x=x) means the plot uses the x variable from the data frame
ggplot(data.frame(x = c(0, 20)), aes(x = x)) +
# stat_function() plots a user-defined function
# fun = specifies the function to be plotted, which is 'myfun' from above
# geom = "line" indicates that the function should be displayed as a line
stat_function(fun = myfun, geom = "line")Graphing Lines of Dispersion
Click to expand lines of dispersion
Using a histogram (histogram/density/freqpoly)
mean_circumference <- mean(Orange$circumference) # calculate mean as a term
sd_circumference <- sd(Orange$circumference) # calculate standard deviation as a term
Orange %>%
ggplot(aes(x=circumference)) +
geom_histogram(fill="lightblue",
color="lightblue") +
geom_vline(xintercept=c(mean_circumference,
mean_circumference - sd_circumference,
mean_circumference + sd_circumference), # creates vertical lines on plot displaying mean and dispersion
linetype = c("solid", "dashed", "dashed")) + # line type for vertical lines
labs(title = "Histogram of Orange Tree Circumference", # labels for plot
x = "Circumference (mm)",
y = "Count") +
scale_x_continuous(breaks = seq(0, max(220), by=20))+ # customises the x axis, breaks = tick marks, seq(value range), by = increments
theme_minimal() # removes grey background themeUsing a boxplot
Orange %>%
ggplot(aes(circumference)) + # aes(value)
geom_boxplot(fill="lightblue", # boxplots show the median and quartiles (not the mean)
alpha=0.7) +
scale_x_continuous(breaks = seq(0, max(220), by=20)) +
theme_minimal()Graphing my previous Research Question
Click to expand research question graph
How does the carat weight and cut quality of diamonds influence their average price?
diamonds %>%
mutate(carat_categ = factor( # factor() shows this variable is categorical
ifelse(carat < 2, "small",
ifelse(carat >= 2 & carat < 3.5, "medium", "big")),
levels = c("small", "medium", "big") # levels = c() shows the order the categories should appear (particularly useful when plotting)
)) %>%
group_by(carat_categ, cut) %>%
summarise(avg_price = mean(price)) %>%
ungroup() %>%
ggplot(aes(x=interaction(cut,carat_categ),
y=avg_price,
color=carat_categ, # color = outline of bar variable
fill=cut))+ # fill = color fill of bar variable
geom_bar(stat="identity", # geom_bar defaults to "count" which counts the frequency of a value, "identity" means we want to plot the actual value
alpha=0.8)+ # alpha = sets transparency (80%)
labs(title = "Average Price of Diamonds by Carat Category and Cut", # labs() adds labels
x = "Carat Category and Cut",
y = "Average Price") +
scale_y_continuous(breaks = seq(0, max(diamonds$price), by = 2000))+ # customises the y axis, breaks = tick marks, seq(value range), by = increments
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) # customises the appearance of x axis text by rotating and aligning to tickData Analyses Exercises
library(modeldata)Modifying basic properties of the plot
ggplot(crickets, aes(x = temp,
y = rate)) +
geom_point(color = "red", # learn more options with ?geom_point
size = 2,
alpha = .3,
shape = "square") +
labs(x = "Temperature",
y = "Chirp rate",
title = "Cricket chirps",
caption = "Source: McDonald (2009)")Adding a line of best fit
ggplot(crickets, aes(x = temp,
y = rate,
color = species)) +
geom_point() +
geom_smooth( # adds the line of best fit
method = "lm", # default line curves so "lm" makes it a linear model
se = FALSE) + # standard error buffers next to line is removed
labs(x = "Temperature",
y = "Chirp rate",
color = "Species",
title = "Cricket chirps",
caption = "Source: McDonald (2009)") +
scale_color_brewer(palette = "Dark2")Other Plots
# Frequency polygon of chirp rate
ggplot(crickets, aes(x = rate)) +
  geom_freqpoly(bins = 15) # counts of one quantitative variable (same as histogram)

# Jitter plot of bill length by sex (rows with NA removed first)
penguins %>%
  na.omit() %>%
  ggplot(aes(sex, bill_length_mm)) +
  geom_jitter() # reduces overlapping points, jitters data to be seen (categories)

# Box plot of chirp rate per species
ggplot(crickets, aes(x = species,
                     y = rate,
                     color = species)) +
  geom_boxplot(show.legend = FALSE) + # removes legend (repeated species from x axis)
  scale_color_brewer(palette = "Dark2") +
  theme_minimal() # removes background grey, ?theme_minimal for more theme details and options
Faceting
Allows data to be split into multiple plots based on the values of one (facet_wrap) or more (facet_grid) categorical variables
ggplot(crickets, aes(x = rate,
fill = species)) +
geom_histogram(bins = 15, # bins = how many bars to be shown
show.legend = FALSE) +
facet_wrap(~species, # splits data by species into two plots
ncol = 1) + # ncol = specifies the number of columns
scale_fill_brewer(palette = "Dark2") +
theme_minimal()penguins %>%
na.omit() %>% # removes NA data
ggplot(aes(x=flipper_length_mm,
group=species, # plot density/count per species
fill=species,
color=species))+
geom_density(alpha=0.7)+ # density is like a histogram visual
facet_grid(sex~island, # creates a matrix of plots
scales = "free_x")+ # x axis can differ per plot
theme_minimal()Faceting my Research Question Graph
diamonds %>%
mutate(carat_categ = factor(
ifelse(carat < 2, "small",
ifelse(carat >= 2 & carat < 3.5, "medium", "big")),
levels = c("small", "medium", "big")
)) %>%
group_by(carat_categ, cut) %>%
summarise(avg_price = mean(price)) %>%
arrange(carat_categ, desc(cut)) %>%
ungroup() %>%
ggplot(aes(x = cut, # x axis only needs cut as carat_categ is separated by plot
y = avg_price,
color = carat_categ,
fill = cut)) +
geom_bar(stat = "identity",
alpha = 0.8) +
facet_wrap(~carat_categ) + # creates distinct plots for carat_categ
# scales="free_x" would manipulate each plot's x axis so that "big" doesn't have the "good" variable
labs(title = "Average Price of Diamonds by Carat Category and Cut",
x = "Cut",
y = "Average Price") +
scale_y_continuous(breaks = seq(0, max(diamonds$price), by = 2000)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))Research Method Exercises
A good research hypothesis is a concise and testable statement predicting the expected outcome of a study. It should emerge from a thorough literature review that identifies a knowledge gap and proposes a plausible explanation or answer to a research question. A strong hypothesis is characterized by testability, enabling verification through experimentation or observation, while being brief and objective. It should clearly reflect the relationship between variables, such as by using statement formats like If x, then y, or When x, then y.
Tutorial 4
Click to expand Tutorial 4
Choosing the Right Test
Category x Category = Frequency tests (Chi-square)
Click to expand frequency tests
# Grouped bar chart of penguin counts per species, split by sex
# (two categorical variables). The plot is stored with a leading `<-` so the
# target name is visible at the start of the pipeline, then printed.
cat_x_cat <- penguins %>%
  na.omit() %>%
  ggplot(aes(
    x = species,
    color = sex,
    fill = sex
  )) +
  geom_bar(position = "dodge") # "dodge" makes bars side by side
cat_x_cat
Create a contingency table
penguins %>%
na.omit() %>%
count(species, sex) # Count the number of occurrences for each combination of 'species' and 'sex' species sex n
1 Adelie female 73
2 Adelie male 73
3 Chinstrap female 34
4 Chinstrap male 34
5 Gentoo female 58
6 Gentoo male 61
penguins %>%
na.omit() %>%
count(species, sex) %>%
pivot_wider(names_from = sex, values_from = n, values_fill = 0) # Reshape the data from a long to a wide format# A tibble: 3 × 3
species female male
<chr> <int> <int>
1 Adelie 73 73
2 Chinstrap 34 34
3 Gentoo 58 61
# 'values_fill = 0' ensures that if there are any missing values, they are replaced with 0Run the Chi-square test
penguins %>%
na.omit() %>%
count(species, sex) %>%
pivot_wider(names_from = sex, values_from = n, values_fill = 0) %>%
select(-species) %>% # Remove the 'species' column to run test with only numbers
chisq.test() # Perform the Chi-square test
Pearson's Chi-squared test
data: .
X-squared = 0.048607, df = 2, p-value = 0.976
Category x Number = Mean tests (T-tests, Anovas, Non-parametric equivalents)
Click to expand mean tests
# Box plot of bill length per species (categorical x, numeric y).
# The plot is stored with a leading `<-` so the target name is visible at the
# start of the pipeline, then printed.
cat_x_num <- penguins %>%
  na.omit() %>%
  ggplot(aes(
    x = species,
    y = bill_length_mm,
    color = species,
    fill = species
  )) +
  geom_boxplot(alpha = 0.7)
cat_x_num
T-test
summary(lm(bill_length_mm~species, data=penguins))
Call:
lm(formula = bill_length_mm ~ species, data = penguins)
Residuals:
Min 1Q Median 3Q Max
-7.9338 -2.2049 0.0086 2.0662 12.0951
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 38.7914 0.2409 161.05 <2e-16 ***
speciesChinstrap 10.0424 0.4323 23.23 <2e-16 ***
speciesGentoo 8.7135 0.3595 24.24 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.96 on 339 degrees of freedom
(2 observations deleted due to missingness)
Multiple R-squared: 0.7078, Adjusted R-squared: 0.7061
F-statistic: 410.6 on 2 and 339 DF, p-value: < 2.2e-16
Number x Number = Correlation
Click to expand correlation
# Scatter plot of flipper length against bill length with a linear fit
# (two numeric variables); points are coloured by species.
# The plot is stored with a leading `<-` so the target name is visible at the
# start of the pipeline, then printed.
num_x_num <- penguins %>%
  na.omit() %>%
  ggplot(aes(
    x = bill_length_mm,
    y = flipper_length_mm
  )) +
  geom_smooth(method = "lm") +
  geom_point(aes(color = species))
num_x_num
`geom_smooth()` using formula = 'y ~ x'
Data Analyses Exercises
Mean test = Category x Number
iris %>%
ggplot(aes(x=Species,
y=Sepal.Length,
color=Species))+
geom_boxplot()Chi-square test = Category x Category
iris %>%
ggplot(aes(x=Petal.Length,
fill=Species))+
geom_density(alpha=0.4)Correlation = Number x Number
iris %>%
ggplot(aes(x=Petal.Length,
y=Petal.Width))+
geom_point(aes(color=Species,
shape=Species))+
geom_smooth(method = "lm")`geom_smooth()` using formula = 'y ~ x'
Chi-square test = Category x Category
iris %>%
mutate(size=ifelse(Sepal.Length<median(Sepal.Length),
"small","big")) %>%
ggplot(aes(x=Species,
color=size,
fill=size))+
geom_bar(position="dodge")Tutorial 5
Click to expand Tutorial 5
Notes
Click to expand Notes
Shortcuts
R chunk = Option + Command + I
%>% = Control + Shift + M
Quarto
To insert images in Quarto
# - but delete # and write outside of R chunkTo insert a dropdown text
#delete the # for the below when entering for Quarto
#<details> ~ to start the dropdown section
#<summary>text header</summary> ~ text header for the dropdown
#</details> ~ to close the dropdown text section To get rid of warnings showing up on Quarto
##| warning: false ~ delete the first hashtag and place at top of R chunkBasic Data Management
To check the documentation built in for the data set
(?diamonds)To get a summary of the dataset
diamonds %>%
summary() carat cut color clarity depth
Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065 Min. :43.00
1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258 1st Qu.:61.00
Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194 Median :61.80
Mean :0.7979 Premium :13791 G:11292 VS1 : 8171 Mean :61.75
3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066 3rd Qu.:62.50
Max. :5.0100 I: 5422 VVS1 : 3655 Max. :79.00
J: 2808 (Other): 2531
table price x y
Min. :43.00 Min. : 326 Min. : 0.000 Min. : 0.000
1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710 1st Qu.: 4.720
Median :57.00 Median : 2401 Median : 5.700 Median : 5.710
Mean :57.46 Mean : 3933 Mean : 5.731 Mean : 5.735
3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540 3rd Qu.: 6.540
Max. :95.00 Max. :18823 Max. :10.740 Max. :58.900
z
Min. : 0.000
1st Qu.: 2.910
Median : 3.530
Mean : 3.539
3rd Qu.: 4.040
Max. :31.800
To find variable names and structure (number, ordinal integer)
str(diamonds)tibble [53,940 × 10] (S3: tbl_df/tbl/data.frame)
$ carat : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
$ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
$ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
$ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
$ depth : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
$ table : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
$ price : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
$ x : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
$ y : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
$ z : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
To see a table of the summarised dataset
#diamonds %>%
#vtable(., lush = TRUE) Save changes to data on a new document
#data-file-name.new (without the #)To save as .csv
#diamonds %>%
#write.csv(., "diamonds.csv") Modify original dataset
#data-file-name <- data-file-name %>%