Eleanor Salisbury Workbook

Showcasing my work from my Research Methods and Data Analysis module

Tutorial 1

Click to expand Tutorial 1

Uploaded penguins data onto R

# Read the penguins data from a CSV file into a data frame called `penguins`.
# NOTE(review): this is an absolute, user-specific OneDrive path, so it presumably only resolves on the author's machine - confirm before reusing.
# The head() output shows an extra `X` column holding row numbers that came in with the file.
penguins <- read.csv("~/Library/CloudStorage/OneDrive-NottinghamTrentUniversity/Quarto R/penguins.csv")

Displaying data

# Print the first six rows (six is head()'s default, spelled out here)
head(penguins, n = 6)

  X species    island bill_length_mm bill_depth_mm flipper_length_mm
1 1  Adelie Torgersen           39.1          18.7               181
2 2  Adelie Torgersen           39.5          17.4               186
3 3  Adelie Torgersen           40.3          18.0               195
4 4  Adelie Torgersen             NA            NA                NA
5 5  Adelie Torgersen           36.7          19.3               193
6 6  Adelie Torgersen           39.3          20.6               190
  body_mass_g    sex year
1        3750   male 2007
2        3800 female 2007
3        3250 female 2007
4          NA   <NA> 2007
5        3450 female 2007
6        3650   male 2007

library(knitr)
# Same six rows, rendered by knitr as a formatted table
kable(head(penguins, n = 6))

X	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex	year
1	Adelie	Torgersen	39.1	18.7	181	3750	male	2007
2	Adelie	Torgersen	39.5	17.4	186	3800	female	2007
3	Adelie	Torgersen	40.3	18.0	195	3250	female	2007
4	Adelie	Torgersen	NA	NA	NA	NA	NA	2007
5	Adelie	Torgersen	36.7	19.3	193	3450	female	2007
6	Adelie	Torgersen	39.3	20.6	190	3650	male	2007

Tutorial 2

Click to expand Tutorial 2

Basic Data Management

library(tidyverse)

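%>% (the pipe) passes the result on its left into the function on its right, so data can be manipulated step by step (put it at the end of every line that is followed by another step)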

mutate() adds new columns or modifies current variable in the dataset

recode() modifies the values within a variable, normally to fix inconsistent labelling (e.g. m, M and male change all to Male)

#data %>% mutate(Variable = recode(Variable, "old value" = "new value"))

summarize() collapses all rows and returns a one-row summary

group_by() groups the data by specific variables for future operations (e.g. group by age and sex to compare females and males of a certain age); ungroup() removes that grouping again

filter() retains only the rows of data that meet the specified requirements

#filter(variable == "1" | same variable == "2")  ~ shows variables with 1 or 2
#filter(variable %in% c("1", "2"))               ~ alternative method where c() combines values that want to be shown
#filter(variable <= number)                      ~ <= means at or under e.g. price
#filter(variable != "value")                     ~ != means show data without that value

select() select only the columns (variables) that you want to see, gets rid of all other columns

can also retain all except for something (select(-(1:5)) or select(-x, -y, -z)) - take away columns 1 to 5 or take away x, y, z
can also rearrange columns (select(value.at.the.back, everything()))

arrange() arranges the values within a variable in ascending or descending (arrange(desc())) order (numerical or alphabetical)

ifelse() turns numbers into categories

Click to expand ifelse() example

#ifelse(variable<numerical value, "category name for previous definition", "category name for everything else")

diamonds %>%
  mutate(carat_categ = ifelse(carat < 2, "small", "big")) %>%   # under 2 carats is "small", everything else is "big"
  select(carat, carat_categ)                                    # show only the original value and its new category

# A tibble: 53,940 × 2
   carat carat_categ
   <dbl> <chr>      
 1  0.23 small      
 2  0.21 small      
 3  0.23 small      
 4  0.29 small      
 5  0.31 small      
 6  0.24 small      
 7  0.24 small      
 8  0.26 small      
 9  0.22 small      
10  0.23 small      
# ℹ 53,930 more rows

pivot_longer() collapses wide format data with multiple columns into long format data with multiple rows (organize data better for graphs)

Click to expand pivot_longer() example

midwest %>%
  head(10) %>%                                               # first ten counties
  select(county, state, poptotal, popwhite:popother) %>%     # identifiers, total, and one population column per ethnicity
  kable()                                                    # render as a formatted table

county	state	poptotal	popwhite	popblack	popamerindian	popasian	popother
ADAMS	IL	66090	63917	1702	98	249	124
ALEXANDER	IL	10626	7054	3496	19	48	9
BOND	IL	14991	14477	429	35	16	34
BOONE	IL	30806	29344	127	46	150	1139
BROWN	IL	5836	5264	547	14	5	6
BUREAU	IL	35688	35157	50	65	195	221
CALHOUN	IL	5322	5298	1	8	15	0
CARROLL	IL	16805	16519	111	30	61	84
CASS	IL	13437	13384	16	8	23	6
CHAMPAIGN	IL	173025	146506	16559	331	8033	1596

midwest %>%
  select(county, state, poptotal, popwhite:popother) %>%
  pivot_longer(
    cols = popwhite:popother,    # the five per-ethnicity columns to stack into rows
    names_to = "Ethnicity",      # new column holding the old column names
    values_to = "Population"     # new column holding the old cell values
  ) %>%
  head(10) %>%                   # first ten rows of the long table
  kable()

county	state	poptotal	Ethnicity	Population
ADAMS	IL	66090	popwhite	63917
ADAMS	IL	66090	popblack	1702
ADAMS	IL	66090	popamerindian	98
ADAMS	IL	66090	popasian	249
ADAMS	IL	66090	popother	124
ALEXANDER	IL	10626	popwhite	7054
ALEXANDER	IL	10626	popblack	3496
ALEXANDER	IL	10626	popamerindian	19
ALEXANDER	IL	10626	popasian	48
ALEXANDER	IL	10626	popother	9

pivot_wider() spreads long format data with multiple rows into wide format data with multiple columns (organize data better for graphs)

Click to expand pivot_wider() example

midwest %>%
  head(10) %>%                          # first ten counties
  select(county, state, poptotal) %>%   # long layout: one row per county with its state and total population
  kable()

county	state	poptotal
ADAMS	IL	66090
ALEXANDER	IL	10626
BOND	IL	14991
BOONE	IL	30806
BROWN	IL	5836
BUREAU	IL	35688
CALHOUN	IL	5322
CARROLL	IL	16805
CASS	IL	13437
CHAMPAIGN	IL	173025

# Spread the long county/state/poptotal data into one column per state.
# County names are reused across states, so each name becomes a single row and
# any state without a county of that name shows NA.
# (The earlier row_number() helper column was removed: select() dropped it
# straight away, so it never reached pivot_wider().)
midwest %>%
  select(county, state, poptotal) %>%
  pivot_wider(names_from = state,            # one new column per state
              values_from = poptotal) %>%    # cells hold that county's total population
  head(10) %>%
  kable()

county	IL	IN	MI	OH	WI
ADAMS	66090	31095	NA	25371	15682
ALEXANDER	10626	NA	NA	NA	NA
BOND	14991	NA	NA	NA	NA
BOONE	30806	38147	NA	NA	NA
BROWN	5836	14080	NA	34966	194594
BUREAU	35688	NA	NA	NA	NA
CALHOUN	5322	NA	135982	NA	NA
CARROLL	16805	18809	NA	26521	NA
CASS	13437	38413	49477	NA	NA
CHAMPAIGN	173025	NA	NA	36019	NA

na.omit() removes rows that contain missing values (NA)

Data Analyses Exercise

6.6.1 Exercises

Click to expand 6.6.1 Exercises

#Problem A

midwest %>%
  group_by(state) %>%                        # one summary row per state
  summarise(
    poptotalmean = mean(poptotal),           # mean county population within the state
    poptotalmed = median(poptotal),          # median county population
    popmax = max(poptotal),                  # largest county population
    popmin = min(poptotal),                  # smallest county population
    popdistinct = n_distinct(poptotal),      # number of distinct population values
    popfirst = first(poptotal),              # population of the first county listed for the state
    popany = any(poptotal < 5000),           # TRUE if at least one county is below 5,000
    popany2 = any(poptotal > 2000000)        # TRUE if at least one county is above 2,000,000
  ) %>%
  ungroup() %>%                              # make sure no grouping is carried forward
  kable()                                    # render as a formatted table

state	poptotalmean	poptotalmed	popmax	popmin	popdistinct	popfirst	popany	popany2
IL	112064.73	24486.5	5105067	4373	101	66090	TRUE	TRUE
IN	60262.60	30362.5	797159	5315	92	31095	FALSE	FALSE
MI	111991.53	37308.0	2111687	1701	83	10145	TRUE	TRUE
OH	123262.67	54929.5	1412140	11098	88	25371	FALSE	FALSE
WI	67941.24	33528.0	959275	3890	72	15682	TRUE	FALSE

#Problem B

midwest %>%
  group_by(state) %>%
  summarise(num5k = sum(poptotal < 5000),       # number of counties below 5,000 (sum() of a logical counts the TRUEs)
            num2mil = sum(poptotal > 2000000),  # number of counties above 2,000,000
            numrows = n()) %>%                  # number of rows (counties) per state
  ungroup() %>%
  kable()

state	num5k	num2mil	numrows
IL	1	1	102
IN	0	0	92
MI	1	1	83
OH	0	0	88
WI	2	0	72

#Problem C ~ Part I

midwest %>%
  group_by(county) %>%
  summarise(x = n_distinct(state)) %>%   # number of different states that contain a county with this name
  ungroup() %>%
  arrange(desc(x))                       # most widely shared county names first

# A tibble: 320 × 2
   county         x
   <chr>      <int>
 1 CRAWFORD       5
 2 JACKSON        5
 3 MONROE         5
 4 ADAMS          4
 5 BROWN          4
 6 CLARK          4
 7 CLINTON        4
 8 JEFFERSON      4
 9 LAKE           4
10 WASHINGTON     4
# ℹ 310 more rows

#Problem C ~ Part II

midwest %>%
  count(county, name = "x")   # rows per county name; shorthand for group_by() + summarise(x = n()) + ungroup()

# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         4
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         2
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       2
# ℹ 310 more rows

#Problem C ~ Part III

midwest %>%
  group_by(county) %>%
  # Always 1: inside a county group there is only one county value to count
  summarise(x = n_distinct(county)) %>%
  ungroup()

# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         1
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         1
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       1
# ℹ 310 more rows

#Problem D

diamonds %>%
  group_by(clarity) %>%        # one summary row per clarity grade
  summarise(
    a = n_distinct(color),     # distinct colours found within the clarity grade
    b = n_distinct(price),     # distinct prices within the clarity grade
    c = n()                    # number of diamonds in the clarity grade
  ) %>%
  ungroup() %>%
  kable()

clarity	a	b	c
I1	7	632	741
SI2	7	4904	9194
SI1	7	5380	13065
VS2	7	5051	12258
VS1	7	3926	8171
VVS2	7	2409	5066
VVS1	7	1623	3655
IF	7	902	1790

#Problem E ~ Part I

diamonds %>%
  group_by(color, cut) %>%   # one group per colour/cut combination
  summarise(
    m = mean(price),         # mean price of the combination
    s = sd(price)            # standard deviation of price for the same combination
  ) %>%
  ungroup() %>%              # drop any grouping left over after summarise()
  head() %>%                 # first six rows only
  kable()

color	cut	m	s
D	Fair	4291.061	3286.114
D	Good	3405.382	3175.149
D	Very Good	3470.467	3523.753
D	Premium	3631.293	3711.634
D	Ideal	2629.095	3001.070
E	Fair	3682.312	2976.652

#Problem E ~ Part II

diamonds %>%
  group_by(cut, color) %>%   # same groups as Part I, but cut now comes first in the output
  summarise(
    m = mean(price),         # mean price per cut/colour combination
    s = sd(price)            # standard deviation of price per cut/colour combination
  ) %>%
  ungroup() %>%
  head() %>%
  kable()

cut	color	m	s
Fair	D	4291.061	3286.114
Fair	E	3682.312	2976.652
Fair	F	3827.003	3223.303
Fair	G	4239.255	3609.644
Fair	H	5135.683	3886.482
Fair	I	4685.446	3730.271

#Problem E ~ Part III

diamonds %>%
  group_by(cut, color, clarity) %>%   # one group per cut/colour/clarity combination
  summarise(
    m = mean(price),                  # mean price of the combination
    s = sd(price),                    # standard deviation of price
    msale = m * 0.80                  # mean price after a 20% discount (reuses m computed just above)
  ) %>%
  ungroup() %>%
  head() %>%
  kable()

cut	color	clarity	m	s	msale
Fair	D	I1	7383.000	5898.641	5906.400
Fair	D	SI2	4355.143	3260.153	3484.114
Fair	D	SI1	4273.345	3018.899	3418.676
Fair	D	VS2	4512.880	3382.871	3610.304
Fair	D	VS1	2921.200	2549.931	2336.960
Fair	D	VVS2	3607.000	3628.604	2885.600

6.7 Extra Practice

Click to expand 6.7 Extra Practice

#Practice 2A

diamonds %>%
  select(price) %>%   # keep only the price column
  arrange(price)      # cheapest first (ascending is the default)

# A tibble: 53,940 × 1
   price
   <int>
 1   326
 2   326
 3   327
 4   334
 5   335
 6   336
 7   336
 8   337
 9   337
10   338
# ℹ 53,930 more rows

#Practice 2B

diamonds %>%
  select(price) %>%        # keep only the price column
  arrange(desc(price))     # most expensive first

# A tibble: 53,940 × 1
   price
   <int>
 1 18823
 2 18818
 3 18806
 4 18804
 5 18803
 6 18797
 7 18795
 8 18795
 9 18791
10 18791
# ℹ 53,930 more rows

#Practice 2C

diamonds %>%
  arrange(cut, price) %>%   # sort by cut (Fair first), then by ascending price within each cut
  select(price, cut)        # selects price and cut columns to view

# A tibble: 53,940 × 2
   price cut  
   <int> <ord>
 1   337 Fair 
 2   361 Fair 
 3   369 Fair 
 4   371 Fair 
 5   416 Fair 
 6   496 Fair 
 7   497 Fair 
 8   527 Fair 
 9   536 Fair 
10   563 Fair 
# ℹ 53,930 more rows

#Practice 2D

diamonds %>%
  arrange(desc(cut), desc(price)) %>%   # sort by cut in descending order (Ideal first), then by descending price within each cut
  select(price, cut)                    # selects price and cut columns to view

# A tibble: 53,940 × 2
   price cut  
   <int> <ord>
 1 18806 Ideal
 2 18804 Ideal
 3 18791 Ideal
 4 18787 Ideal
 5 18780 Ideal
 6 18779 Ideal
 7 18768 Ideal
 8 18760 Ideal
 9 18757 Ideal
10 18756 Ideal
# ℹ 53,930 more rows

#Practice 3

diamonds %>%
  arrange(clarity, price) %>%   # sort by clarity from worst (I1) to best, then by ascending price within each clarity
  select(price, clarity)        # selects price and clarity columns to view

# A tibble: 53,940 × 2
   price clarity
   <int> <ord>  
 1   345 I1     
 2   361 I1     
 3   394 I1     
 4   413 I1     
 5   413 I1     
 6   444 I1     
 7   452 I1     
 8   467 I1     
 9   468 I1     
10   490 I1     
# ℹ 53,930 more rows

#Practice 4

diamonds %>%
  select(price) %>%                   # start from the price column only
  mutate(salePrice = price - 250)     # add salePrice: the price with a $250 discount taken off

# A tibble: 53,940 × 2
   price salePrice
   <int>     <dbl>
 1   326        76
 2   326        76
 3   327        77
 4   334        84
 5   335        85
 6   336        86
 7   336        86
 8   337        87
 9   337        87
10   338        88
# ℹ 53,930 more rows

#Practice 5

diamonds %>%
  head() %>%                 # first six rows
  select(-c(x, y, z)) %>%    # drop the x, y and z columns, keep everything else
  kable()

carat	cut	color	clarity	depth	table	price
0.23	Ideal	E	SI2	61.5	55	326
0.21	Premium	E	SI1	59.8	61	326
0.23	Good	E	VS1	56.9	65	327
0.29	Premium	I	VS2	62.4	58	334
0.31	Good	J	SI2	63.3	58	335
0.24	Very Good	J	VVS2	62.8	57	336

#Practice 6

diamonds %>%
  group_by(cut) %>%              # one row per cut
  summarise(`n()` = n())         # number of diamonds per cut; the column keeps the label `n()`

# A tibble: 5 × 2
  cut       `n()`
  <ord>     <int>
1 Fair       1610
2 Good       4906
3 Very Good 12082
4 Premium   13791
5 Ideal     21551

#Practice 7

diamonds %>%
  # n() is the total number of rows (diamonds), repeated on every row
  mutate(totalNum = n()) %>%
  # show only the new column
  select(totalNum)

# A tibble: 53,940 × 1
   totalNum
      <int>
 1    53940
 2    53940
 3    53940
 4    53940
 5    53940
 6    53940
 7    53940
 8    53940
 9    53940
10    53940
# ℹ 53,930 more rows

Research Method Exercise

Bad Research Question: Is the price of diamonds affected by its carat and cut?

Good Research Question: How does the carat weight and cut quality of diamonds influence their average price?

Out of curiosity, I chose to answer / visualise this question

Click to expand code and visualised tables

# Average price for every carat-category / cut combination.
# carat_categ is an ordered-by-level factor so that rows sort small -> medium -> big
# by design, rather than relying on reverse-alphabetical order of the labels.
diamonds %>%
  mutate(carat_categ = factor(
    ifelse(carat < 2, "small",
           ifelse(carat < 3.5, "medium", "big")),   # inner test is only reached when carat >= 2
    levels = c("small", "medium", "big")
  )) %>%
  group_by(carat_categ, cut) %>%
  summarise(avg_price = mean(price)) %>%            # mean price per combination
  arrange(carat_categ, cut) %>%                     # factor order for both columns
  ungroup() %>%
  kable(caption = "Average Price of Diamonds by Carat and Cut")

Average Price of Diamonds by Carat and Cut
carat_categ	cut	avg_price
small	Fair	3546.575
small	Good	3439.851
small	Very Good	3551.484
small	Premium	3931.582
small	Ideal	3163.888
medium	Fair	11757.386
medium	Good	14598.451
medium	Very Good	15101.960
medium	Premium	14901.474
medium	Ideal	15530.072
big	Fair	16386.500
big	Very Good	15984.000
big	Premium	16335.000
big	Ideal	12587.000

diamonds %>%
  group_by(cut) %>%
  summarise(avg_price = mean(price)) %>%   # mean price per cut
  ungroup() %>%
  arrange(cut) %>%                         # cut is an ordered factor, so this runs Fair -> Ideal
  kable(caption = "Average Price of Diamonds by Cut")

Average Price of Diamonds by Cut
cut	avg_price
Fair	4358.758
Good	3928.864
Very Good	3981.760
Premium	4584.258
Ideal	3457.542

# Average price per carat category.
# carat_categ is a factor with explicit levels so that rows sort small -> medium -> big
# by design, rather than relying on reverse-alphabetical order of the labels.
diamonds %>%
  mutate(carat_categ = factor(
    ifelse(carat < 2, "small",
           ifelse(carat < 3.5, "medium", "big")),   # inner test is only reached when carat >= 2
    levels = c("small", "medium", "big")
  )) %>%
  group_by(carat_categ) %>%
  summarise(avg_price = mean(price)) %>%            # mean price per carat category
  arrange(carat_categ) %>%
  ungroup() %>%
  kable(caption = "Average Price of Diamonds by Carat")

Average Price of Diamonds by Carat
carat_categ	avg_price
small	3478.97
medium	14838.52
big	15945.70

Tutorial 3

Click to expand Tutorial 3

Graphics Data Exploration

library(ggplot2)

2.1 Scatter Plots

Click to expand scatter plots

#plot(x values, y values)
plot(mtcars$wt, mtcars$mpg)   # base R scatter plot: x values first, then y values

ggplot(mtcars) +
  geom_point(aes(x = wt, y = mpg))   # the geom_*() layer decides the graph type (points here)

2.2 Line Graphs

Click to expand line graphs

# Base R line graph of pressure against temperature (type = "l" draws a line)
plot(pressure$temperature, pressure$pressure, type = "l")

# Draw the same line graph again, then layer extra elements on top of it
plot(pressure$temperature, pressure$pressure, type = "l")
points(pressure$temperature, pressure$pressure)                     # adds points to the line

lines(pressure$temperature, pressure$pressure/2, col = "red")       # adds a second line at half the pressure, drawn in red
points(pressure$temperature, pressure$pressure/2, col = "red")      # puts red points on this red line

# Line only
ggplot(data = pressure, mapping = aes(x = temperature, y = pressure)) +
  geom_line()

# Line with a point at every observation
ggplot(data = pressure, mapping = aes(x = temperature, y = pressure)) +
  geom_line() +
  geom_point()

# Two series on one plot: pressure (default colour) and half the pressure (red).
# A fixed colour is set outside aes(); inside aes() the string "red" would be
# treated as data, giving a default palette colour and an unwanted legend.
ggplot(pressure, aes(x = temperature)) +
  geom_line(aes(y = pressure)) +
  geom_point(aes(y = pressure)) +
  geom_line(aes(y = pressure / 2), color = "red") +
  geom_point(aes(y = pressure / 2), color = "red")

2.3 Bar Graphs

Click to expand bar graphs

# barplot(bar heights, names.arg = labels shown under the bars)
barplot(BOD$demand, names.arg = BOD$Time)

# table() counts how often each unique value occurs
# (here: 11 cases of value 4, 7 cases of value 6, 14 cases of value 8)
barplot(table(mtcars$cyl))

# factor() turns Time into a categorical (discrete) label rather than a numeric one,
# so the axis does not reserve a slot for the absent value 6
ggplot(BOD) +
  geom_col(aes(x = factor(Time), y = demand))

# Bar graph of counts: x = cyl, y = number of rows for each value of cyl
# factor() makes cyl categorical, so only its actual values (4, 6 and 8) label the x axis
ggplot(mtcars) +
  geom_bar(aes(x = factor(cyl)))

2.4 Histograms

Click to expand histograms

# Base R histogram with default bins
hist(mtcars$mpg)

# breaks sets the (approximate) number of bins, i.e. intervals
hist(mtcars$mpg, breaks = 10)

# ggplot2 uses 30 bins unless told otherwise, so 'binwidth' usually needs setting
ggplot(mtcars) +
  geom_histogram(aes(x = mpg))

# Each bin is 4 mpg wide
ggplot(mtcars) +
  geom_histogram(aes(x = mpg), binwidth = 4)

2.5 Box Plots

Click to expand box plots

# Passing a factor as x makes plot() draw box plots
plot(ToothGrowth$supp, ToothGrowth$len)

# Formula interface: boxplot(y ~ x1 + x2, data = ...) gives one box per x1/x2 combination
boxplot(len ~ supp + dose, data = ToothGrowth)

# ggplot2 version with one grouping variable
ggplot(ToothGrowth) +
  geom_boxplot(aes(x = supp, y = len))

# interaction() combines supp and dose into a single x variable
ggplot(ToothGrowth) +
  geom_boxplot(aes(x = interaction(supp, dose), y = len))

2.6 Function Curves

Click to expand function curves

# Plot an expression in x directly
curve(x^3 - 5 * x, from = -4, to = 4)

# A user-defined S-shaped function: returns 1 / (1 + exp(10 - xvar)),
# which equals 0.5 at xvar = 10
myfun <- function(xvar) {
  1 / (1 + exp(10 - xvar))
}
curve(myfun(x), from = 0, to = 20)
# Overlay the mirrored curve on the same plot, in red
curve(1 - myfun(x), add = TRUE, col = "red")

# data.frame(x = c(0, 20)) supplies 0 and 20 as the limits of the x range
# aes(x = x) maps the plot's x axis to that x column
# stat_function() draws a user-defined function:
#   fun  = the function to draw ('myfun' from above)
#   geom = "line" draws it as a line
ggplot(data.frame(x = c(0, 20)), aes(x = x)) +
  stat_function(fun = myfun, geom = "line")

Graphing Lines of Dispersion

Click to expand lines of dispersion

Using a histogram (histogram/density/freqpoly)

mean_circumference <- mean(Orange$circumference)   # mean, stored for the reference lines
sd_circumference <- sd(Orange$circumference)       # standard deviation, stored for the reference lines

Orange %>%
  ggplot(aes(x = circumference)) +
  geom_histogram(fill = "lightblue", color = "lightblue") +
  geom_vline(
    # vertical lines at the mean, one SD below it and one SD above it
    xintercept = mean_circumference + c(0, -1, 1) * sd_circumference,
    linetype = c("solid", "dashed", "dashed")        # solid for the mean, dashed for the dispersion
  ) +
  labs(
    title = "Histogram of Orange Tree Circumference",
    x = "Circumference (mm)",
    y = "Count"
  ) +
  scale_x_continuous(breaks = seq(0, 220, by = 20)) +   # tick marks every 20 from 0 to 220
  theme_minimal()                                      # removes the grey background theme

Using a boxplot

# Horizontal box plot of circumference: the box marks the quartiles and the
# line inside it marks the median (not the mean).
Orange %>%
  ggplot(aes(x = circumference)) +
  geom_boxplot(fill = "lightblue",
               alpha = 0.7) +                           # 70% opacity
  scale_x_continuous(breaks = seq(0, 220, by = 20)) +   # tick marks every 20 from 0 to 220
  theme_minimal()

Graphing my previous Research Question

Click to expand research question graph

How does the carat weight and cut quality of diamonds influence their average price?

diamonds %>%
  mutate(
    # factor() marks the variable as categorical; levels fixes the order the categories are plotted in
    carat_categ = factor(
      ifelse(carat < 2, "small",
             ifelse(carat >= 2 & carat < 3.5, "medium", "big")),
      levels = c("small", "medium", "big")
    )
  ) %>%
  group_by(carat_categ, cut) %>%
  summarise(avg_price = mean(price)) %>%     # one average price per carat category and cut
  ungroup() %>%
  ggplot(aes(
    x = interaction(cut, carat_categ),       # one bar per cut/carat-category combination
    y = avg_price,
    color = carat_categ,                     # color = bar outline
    fill = cut                               # fill = bar interior
  )) +
  geom_col(alpha = 0.8) +                    # geom_col() plots the values as given (no counting); alpha = 80% opacity
  labs(
    title = "Average Price of Diamonds by Carat Category and Cut",
    x = "Carat Category and Cut",
    y = "Average Price"
  ) +
  # y-axis tick marks every 2000, from 0 up to the highest diamond price
  scale_y_continuous(breaks = seq(0, max(diamonds$price), by = 2000)) +
  # rotate the x-axis labels 45 degrees and align them to their ticks
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

Data Analyses Exercises

library(modeldata)

Modifying basic properties of the plot

ggplot(crickets, aes(x = temp, y = rate)) +
  geom_point(
    color = "red",       # fixed point colour (see ?geom_point for more options)
    size = 2,            # point size
    alpha = .3,          # mostly transparent, so overlapping points stay visible
    shape = "square"     # square markers
  ) +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    title = "Cricket chirps",
    caption = "Source: McDonald (2009)"
  )

Adding a line of best fit

ggplot(crickets, aes(x = temp, y = rate, color = species)) +
  geom_point() +
  # Line of best fit: method = "lm" fits a straight line instead of the default curve,
  # se = FALSE hides the standard-error band around it
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    color = "Species",                     # legend title
    title = "Cricket chirps",
    caption = "Source: McDonald (2009)"
  ) +
  scale_color_brewer(palette = "Dark2")    # ColorBrewer "Dark2" colours for the species

Other Plots

# Frequency polygon: counts of one quantitative variable, like a histogram drawn as a line
ggplot(crickets) +
  geom_freqpoly(aes(x = rate), bins = 15)

# Jitter plot: nudges overlapping points apart so every observation in each category is visible
ggplot(na.omit(penguins), aes(x = sex, y = bill_length_mm)) +
  geom_jitter()

# Box plot per species; the legend is hidden because it would only repeat the x axis
ggplot(crickets, aes(x = species, y = rate, color = species)) +
  geom_boxplot(show.legend = FALSE) +
  scale_color_brewer(palette = "Dark2") +
  theme_minimal()   # removes the grey background; see ?theme_minimal for details and other themes

Faceting

Allows data to be split into multiple plots based on the values of one (facet_wrap) or more (facet_grid) categorical variables

# facet_wrap(): one histogram per species, stacked in a single column
ggplot(crickets, aes(x = rate, fill = species)) +
  geom_histogram(bins = 15, show.legend = FALSE) +   # bins = number of bars
  facet_wrap(~species, ncol = 1) +                   # ncol = number of columns of panels
  scale_fill_brewer(palette = "Dark2") +
  theme_minimal()

# facet_grid(): a matrix of density plots, sex in rows and island in columns
penguins %>%
  na.omit() %>%                                      # drop rows containing NA
  ggplot(aes(
    x = flipper_length_mm,
    group = species,                                 # one density curve per species
    fill = species,
    color = species
  )) +
  geom_density(alpha = 0.7) +                        # smoothed, histogram-like view of the distribution
  facet_grid(sex ~ island, scales = "free_x") +      # "free_x" lets the x axis differ between panels
  theme_minimal()

Faceting my Research Question Graph

# Average price by cut, drawn as one panel per carat category.
# (The earlier arrange() step was removed: row order does not affect where
# the bars are drawn, because positions come from the cut factor.)
diamonds %>%
  mutate(carat_categ = factor(
    ifelse(carat < 2, "small",
           ifelse(carat >= 2 & carat < 3.5, "medium", "big")),
    levels = c("small", "medium", "big")   # panel order
  )) %>%
  group_by(carat_categ, cut) %>%
  summarise(avg_price = mean(price)) %>%
  ungroup() %>%
  ggplot(aes(x = cut,         # x axis only needs cut as carat_categ is separated by plot
             y = avg_price,
             color = carat_categ,
             fill = cut)) +
  geom_col(alpha = 0.8) +     # plots the values as given; same as geom_bar(stat = "identity")
  # Creates distinct plots for carat_categ. Adding scales = "free_x" would let each
  # panel drop unused x values, so that "big" does not show an empty "Good" slot.
  facet_wrap(~carat_categ) +
  labs(title = "Average Price of Diamonds by Carat Category and Cut",
       x = "Cut",
       y = "Average Price") +
  scale_y_continuous(breaks = seq(0, max(diamonds$price), by = 2000)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

Research Method Exercises

A good research hypothesis is a concise and testable statement predicting the expected outcome of a study. It should emerge from a thorough literature review that identifies a knowledge gap and proposes a plausible explanation or answer to a research question. A strong hypothesis is characterized by testability, enabling verification through experimentation or observation, while being brief and objective. It should clearly reflect the relationship between variables, such as by using statement formats like If x, then y, or When x, then y.

Tutorial 4

Click to expand Tutorial 4

Choosing the Right Test

Category x Category = Frequency tests (Chi-square)

Click to expand frequency tests

# Grouped bar chart of penguin counts by species and sex, stored as cat_x_cat.
# The assignment leads the pipeline so the target name is visible up front.
cat_x_cat <- penguins %>%
  na.omit() %>%
  ggplot(aes(
    x = species,
    color = sex,
    fill = sex)) +
  geom_bar(position = "dodge")   # "dodge" makes bars side by side

cat_x_cat   # print the stored plot

Create a contingency table

# Count how many penguins fall into each species/sex combination (rows with NA removed first)
na.omit(penguins) %>%
  count(species, sex)

    species    sex  n
1    Adelie female 73
2    Adelie   male 73
3 Chinstrap female 34
4 Chinstrap   male 34
5    Gentoo female 58
6    Gentoo   male 61

na.omit(penguins) %>%
  count(species, sex) %>%
  pivot_wider(
    names_from = sex,     # one column per sex
    values_from = n,      # cells hold the counts
    values_fill = 0       # a species/sex combination with no rows becomes 0 instead of NA
  )

# A tibble: 3 × 3
  species   female  male
  <chr>      <int> <int>
1 Adelie        73    73
2 Chinstrap     34    34
3 Gentoo        58    61

  # 'values_fill = 0' ensures that if there are any missing values, they are replaced with 0

Run the Chi-square test

na.omit(penguins) %>%
  count(species, sex) %>%
  pivot_wider(names_from = sex, values_from = n, values_fill = 0) %>%
  select(-species) %>%   # keep only the numeric count columns for the test
  chisq.test()           # Chi-square test on the species x sex table of counts


    Pearson's Chi-squared test

data:  .
X-squared = 0.048607, df = 2, p-value = 0.976

Category x Number = Mean tests (T-tests, Anovas, Non-parametric equivalents)

Click to expand mean tests

# Box plot of bill length per species, stored as cat_x_num.
# The assignment leads the pipeline so the target name is visible up front.
cat_x_num <- penguins %>%
  na.omit() %>%
  ggplot(aes(
    x = species,
    y = bill_length_mm,
    color = species,
    fill = species)) +
  geom_boxplot(alpha = 0.7)   # 70% opacity so the outline stays visible

cat_x_num   # print the stored plot

T-test

# Linear model of bill length by species; the summary reports each species' difference from the baseline (intercept) species
summary(lm(bill_length_mm ~ species, data = penguins))


Call:
lm(formula = bill_length_mm ~ species, data = penguins)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.9338 -2.2049  0.0086  2.0662 12.0951 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)       38.7914     0.2409  161.05   <2e-16 ***
speciesChinstrap  10.0424     0.4323   23.23   <2e-16 ***
speciesGentoo      8.7135     0.3595   24.24   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.96 on 339 degrees of freedom
  (2 observations deleted due to missingness)
Multiple R-squared:  0.7078,    Adjusted R-squared:  0.7061 
F-statistic: 410.6 on 2 and 339 DF,  p-value: < 2.2e-16

Number x Number = Correlation

Click to expand correlation

# Scatter plot of flipper length against bill length with a straight line of
# best fit, stored as num_x_num.
# The assignment leads the pipeline so the target name is visible up front.
num_x_num <- penguins %>%
  na.omit() %>%
  ggplot(aes(
    x = bill_length_mm,
    y = flipper_length_mm)) +
  geom_smooth(method = "lm") +          # one linear fit across all penguins
  geom_point(aes(color = species))      # points coloured by species

num_x_num   # print the stored plot

`geom_smooth()` using formula = 'y ~ x'

Data Analyses Exercises

Mean test = Category x Number

# Category (Species) against number (Sepal.Length): one box per species
ggplot(iris, aes(x = Species, y = Sepal.Length, color = Species)) +
  geom_boxplot()

Mean test = Category x Number

# Distribution of Petal.Length, with one semi-transparent density curve per species
ggplot(iris, aes(x = Petal.Length, fill = Species)) +
  geom_density(alpha = 0.4)

Correlation = Number x Number

# Number against number: petal width by petal length
ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point(aes(color = Species, shape = Species)) +   # species shown by both colour and marker shape
  geom_smooth(method = "lm")                            # one straight line of best fit across all species

`geom_smooth()` using formula = 'y ~ x'

Chi-square test = Category x Category

iris %>%
  # Split the flowers into two size categories at the median sepal length
  mutate(size = ifelse(Sepal.Length < median(Sepal.Length), "small", "big")) %>%
  ggplot(aes(x = Species, color = size, fill = size)) +
  geom_bar(position = "dodge")   # counts per species, with small and big side by side

Tutorial 5

Click to expand Tutorial 5

Notes

Click to expand Notes

Shortcuts

R chunk = Option + Command + I

%>% = Control + Shift + M

Quarto

To insert images in Quarto

#![](file name) - but delete # and write outside of R chunk

To insert a dropdown text

#delete the # for the below when entering for Quarto

#<details>                       ~ to start the dropdown section
#<summary>text header</summary>  ~ text header for the dropdown
#</details>                      ~ to close the dropdown text section

To get rid of warnings showing up on Quarto

##| warning: false  ~ delete the first hashtag and place at top of R chunk

Basic Data Management

To check the documentation built in for the data set

# Look up the built-in help page for the diamonds dataset
(?diamonds)

To get a summary of the dataset

# Per-column summary: quartiles and mean for numeric columns, counts per level for factors
summary(diamonds)

     carat               cut        color        clarity          depth      
 Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065   Min.   :43.00  
 1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258   1st Qu.:61.00  
 Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194   Median :61.80  
 Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171   Mean   :61.75  
 3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066   3rd Qu.:62.50  
 Max.   :5.0100                     I: 5422   VVS1   : 3655   Max.   :79.00  
                                    J: 2808   (Other): 2531                  
     table           price             x                y         
 Min.   :43.00   Min.   :  326   Min.   : 0.000   Min.   : 0.000  
 1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710   1st Qu.: 4.720  
 Median :57.00   Median : 2401   Median : 5.700   Median : 5.710  
 Mean   :57.46   Mean   : 3933   Mean   : 5.731   Mean   : 5.735  
 3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540   3rd Qu.: 6.540  
 Max.   :95.00   Max.   :18823   Max.   :10.740   Max.   :58.900  
                                                                  
       z         
 Min.   : 0.000  
 1st Qu.: 2.910  
 Median : 3.530  
 Mean   : 3.539  
 3rd Qu.: 4.040  
 Max.   :31.800

To find variable names and structure (e.g. numeric, ordered factor, integer)

# Compact overview: dimensions, then each column's name, type and first few values
str(diamonds)

tibble [53,940 × 10] (S3: tbl_df/tbl/data.frame)
 $ carat  : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
 $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
 $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
 $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
 $ depth  : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
 $ table  : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
 $ price  : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
 $ x      : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
 $ y      : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
 $ z      : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

To see a table of the summarised dataset

#diamonds %>%          
  #vtable(., lush = TRUE)

Save changes to data on a new document

#data-file-name.new (without the #)

To save as .csv

#diamonds %>%               
  #write.csv(., "diamonds.csv")

Modify original dataset

#data-file-name <- data-file-name %>%