library(tidyverse)
library(readr)
library(knitr)
library(dplyr)
library(palmerpenguins)
library(tibble)
library(janitor)
library(gmodels)
library(vcd)
library(effsize)
Josh Shaw - R Workbook
This workbook is where I am storing code from tutorials and notes about additional code.
1 - Beginning with R
Importing and viewing a dataset in R
- Displaying the dataset in Quarto
Click to expand Tutorial 1
Importing a dataset
This function imports a .csv file; similar functions exist for other file types (see the sketch after the code below).
penguins <- read.csv("E:/UNI/MASTERS/R/1/penguins.csv")
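For other file types, a minimal sketch — the file names here are hypothetical, and readxl is assumed to be installed (it is not loaded above):
read_csv("penguins.csv") # readr's faster CSV reader; returns a tibble (hypothetical path)
readxl::read_excel("penguins.xlsx") # reads an Excel sheet (hypothetical file, assumes readxl)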
Viewing the dataset
This function is to view the data within RStudio
View(penguins)
Displaying the data
Using this function, I am displaying the first 10 rows of data in my Quarto document
library(readr)
head(penguins, 10)
   X species    island bill_length_mm bill_depth_mm flipper_length_mm
1 1 Adelie Torgersen 39.1 18.7 181
2 2 Adelie Torgersen 39.5 17.4 186
3 3 Adelie Torgersen 40.3 18.0 195
4 4 Adelie Torgersen NA NA NA
5 5 Adelie Torgersen 36.7 19.3 193
6 6 Adelie Torgersen 39.3 20.6 190
7 7 Adelie Torgersen 38.9 17.8 181
8 8 Adelie Torgersen 39.2 19.6 195
9 9 Adelie Torgersen 34.1 18.1 193
10 10 Adelie Torgersen 42.0 20.2 190
body_mass_g sex year
1 3750 male 2007
2 3800 female 2007
3 3250 female 2007
4 NA <NA> 2007
5 3450 female 2007
6 3650 male 2007
7 3625 female 2007
8 4675 male 2007
9 3475 <NA> 2007
10 4250 <NA> 2007
However, the above output splits and rearranges the columns awkwardly, so with the help of ChatGPT I learned to use the following function to display my data correctly:
library(knitr)
kable(head(penguins, 10))
| X | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year |
|---|---|---|---|---|---|---|---|---|
| 1 | Adelie | Torgersen | 39.1 | 18.7 | 181 | 3750 | male | 2007 |
| 2 | Adelie | Torgersen | 39.5 | 17.4 | 186 | 3800 | female | 2007 |
| 3 | Adelie | Torgersen | 40.3 | 18.0 | 195 | 3250 | female | 2007 |
| 4 | Adelie | Torgersen | NA | NA | NA | NA | NA | 2007 |
| 5 | Adelie | Torgersen | 36.7 | 19.3 | 193 | 3450 | female | 2007 |
| 6 | Adelie | Torgersen | 39.3 | 20.6 | 190 | 3650 | male | 2007 |
| 7 | Adelie | Torgersen | 38.9 | 17.8 | 181 | 3625 | female | 2007 |
| 8 | Adelie | Torgersen | 39.2 | 19.6 | 195 | 4675 | male | 2007 |
| 9 | Adelie | Torgersen | 34.1 | 18.1 | 193 | 3475 | NA | 2007 |
| 10 | Adelie | Torgersen | 42.0 | 20.2 | 190 | 4250 | NA | 2007 |
2 - Data Wrangling
Click to expand Tutorial 2
Exercises
Diamond Dataset
diamonds %>%
group_by(color, clarity) %>% # groups diamonds by colour and clarity
mutate(price200 = mean(price)) %>% # adds a column showing the mean price per colour + clarity group
ungroup() %>% # ungroups the data
mutate(random10 = 10 + price) %>% # adds column with price+10
select(cut, color, clarity, price, price200, random10) %>% # selects just specified columns
arrange(color) %>% # arranging the data by colour
group_by(cut) %>% # then grouping the data by cut
mutate(dis = n_distinct(price), # counts the number of unique price values per cut
rowID = row_number()) %>%
ungroup() # Finally, ungroup the data
diamonds %>%
group_by(clarity) %>% # grouping by column clarity
summarize(a = n_distinct(color), # new column with number of distinct colours
b = n_distinct(price), # new column with number distinct prices
c = n()) %>% # new column with count of how many diamonds in each clarity group
ungroup()
diamonds %>%
group_by(color, cut) %>% # group by colour and cut
summarize(m = mean(price), # summarise the mean price
s = sd(price)) %>% # summarise the standard deviation of price
ungroup()
diamonds %>%
group_by(cut, color) %>% # this organises cut column before colour
summarize(m = mean(price),
s = sd(price)) %>%
ungroup()
diamonds %>%
group_by(color) %>%
summarize(m = mean(price)) %>%
mutate(x1 = str_c("Diamond color ", color), # this combines the phrase Diamond Colour with the results of 'colour'
x2 = 5) %>%
ungroup()
diamonds %>%
group_by(color) %>%
summarize(m = mean(price)) %>%
ungroup() %>%
mutate(x1 = str_c("Diamond color ", color), # here, mutate is applied to the entire dataframe not just the colour summary
x2 = 5)
Midwest Dataset
midwest %>%
group_by(state) %>%
summarise(poptotalmean = mean(poptotal),
poptotalmed = median(poptotal),
popmax = max(poptotal),
popmin = min(poptotal),
popdistinct = n_distinct(poptotal),
popfirst = first(poptotal),
popany = any(poptotal < 5000),
popany2 = any(poptotal > 2000000)) %>% # this is summarising the data instead of 'mutating and adding the data in columns'
ungroup()
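To make the summarise-vs-mutate distinction above concrete, a minimal side-by-side sketch on the same grouping:
midwest %>%
group_by(state) %>%
summarise(m = mean(poptotal)) # collapses each state to a single row
midwest %>%
group_by(state) %>%
mutate(m = mean(poptotal)) %>% # keeps every county row and repeats the state mean
ungroup()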
midwest %>%
group_by(state) %>%
summarize(num5k = sum(poptotal < 5000), # counts counties with a population under 5,000
num2mil = sum(poptotal > 2000000), # counts counties with a population over 2 million
numrows = n()) %>% # counts the number of rows
ungroup()
midwest %>%
group_by(county) %>% # groups dataset by county
summarize(x = n_distinct(state)) %>% # counts how many distinct states each county name appears in
arrange(desc(x)) %>% # arrange in descending order
ungroup()
midwest %>%
group_by(county) %>%
summarize(x = n()) %>% # this counts the number of rows in county column
ungroup()
midwest %>%
group_by(state) %>% # grouped by state
summarize(x = n_distinct(county)) %>% # counting how many counties in each state
ungroup()
Extra Practice
data(diamonds)
view(diamonds)
diamonds %>%
arrange(price) # arranges diamonds by lowest to highest price
diamonds %>%
arrange(desc(price)) # arranges diamonds highest to lowest price
diamondshigh <- diamonds %>%
arrange(desc(price)) # creates new dataset with diamonds arranged highest to lowest
diamonds %>%
arrange(price, cut) # arranges by price and cut
diamonds %>%
arrange(desc(price), clarity) # arranges by price descending and clarity
diamonds %>%
mutate(salePrice = price - 250) # creates a new column with sale price
diamonds %>%
select(1:7) # removes x y z columns
diamonds %>%
group_by(cut) %>% # groups by cut
summarise(count = n()) %>% # counts number in column cut
ungroup()
diamonds %>%
mutate(totalnum = n()) # creates new column with total count
Research Methods Questions
Good: What factors affect the price of a diamond, and which has the most significant effect?
Bad: Does colour affect the price of diamonds?
3 - Data Exploration
Click to expand Tutorial 3
Plotting different graphs
data(penguins)
penguins
# A tibble: 344 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>
penguins %>%
na.omit() %>%
group_by(species) %>% # group the data by species
ggplot(aes(x=bill_length_mm, color=species, fill=species)) + # plot bill length vs species with colour and fill = species
geom_histogram() # make a histogram
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Bill Length per species
penguins %>%
group_by(species) %>% # group the data by species
ggplot(aes(x = species, y = bill_length_mm, colour = species, fill = species)) + # plot bill length vs species with col and fill = species
geom_boxplot(alpha = 0.5) + # create a boxplot
theme(axis.title = element_text(size = 16), # make text size 16
axis.text = element_text(size = 16))
Warning: Removed 2 rows containing non-finite values (`stat_boxplot()`).
# Count of species
penguins %>%
ggplot(aes(x = species, color = species, fill = species)) +
geom_bar(alpha= 0.5) + # plot bar chart of species
theme(axis.title = element_text(size = 16), # make text size 16
axis.text = element_text(size = 16))
# Observations per year
penguins %>%
ggplot(aes(x= year, colour = species, fill = species)) + # plots year along x axis with species filled in the bars
geom_bar() +
theme(axis.title = element_text(size = 16), # make text size 16
axis.text = element_text(size = 16))
# Species per Island
penguins %>%
ggplot(aes(x = island, colour = species, fill = species)) +
geom_bar() +
theme(axis.title = element_text(size = 16), # make text size 16
axis.text = element_text(size = 16))
# Correlations
penguins %>%
ggplot(aes(x = bill_length_mm, y = bill_depth_mm)) + # plots length vs depth
geom_point() + # plots the points
theme(axis.title = element_text(size = 16), # make text size 16
axis.text = element_text(size = 16))
Warning: Removed 2 rows containing missing values (`geom_point()`).
penguins %>%
ggplot(aes(x = bill_length_mm, y = bill_depth_mm, colour = species, fill = species)) +
geom_point() +
geom_smooth(method = "lm") + # adds trend line with confidence interval
theme(axis.title = element_text(size = 16), # make text size 16
axis.text = element_text(size = 16))
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 2 rows containing missing values (`geom_point()`).
# Body mass per sex
# This one is easiest to compare the body mass per male and female between each species
penguins %>%
na.omit() %>%
ggplot(aes(x = sex, y = body_mass_g, colour = species, fill = species)) +
geom_boxplot(alpha = 0.7) +
theme(axis.title = element_text(size = 16), # make text size 16
axis.text = element_text(size = 16))
# Inverted
# This one is easiest to compare body mass of sex within each species
penguins %>%
na.omit() %>%
ggplot(aes(x = species, y = body_mass_g, colour = sex, fill = sex)) +
geom_boxplot(alpha = 0.7) +
theme(axis.title = element_text(size = 16), # make text size 16
axis.text = element_text(size = 16))
# Can body mass predict bill length?
penguins %>%
ggplot(aes(x= body_mass_g, y = bill_length_mm, colour = species, fill = species)) +
geom_point() +
geom_smooth(method = "lm") +
theme(axis.title = element_text(size = 16), # make text size 16
axis.text = element_text(size = 16))
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
Removed 2 rows containing missing values (`geom_point()`).
# Does sex explain flipper length?
penguins %>%
na.omit() %>%
ggplot(aes(x = species, y = flipper_length_mm, colour = sex, fill = sex)) +
geom_boxplot(alpha = 0.5) +
theme(axis.title = element_text(size = 16), # make text size 16
axis.text = element_text(size = 16))
# Checking Distribution of Data
penguins1 <- penguins %>%
pivot_longer(bill_length_mm:body_mass_g, names_to = "trait") # this is just to show what it is doing to the data
penguins %>%
na.omit() %>% # this removes any values with NA
pivot_longer(bill_length_mm:body_mass_g, names_to = "trait") %>%
ggplot(aes(x=value,
group=species,
fill=species,
color=species))+
geom_density(alpha=0.7)+
facet_grid(~trait, scales = "free_x" )+ # this splits the plot into separate panels based on the column trait and allows each one to have its own x scale
theme(axis.text=element_text(size=16),
axis.title=element_text(size=16))+
theme_minimal()
4 - Choosing Analyses
Click to expand Tutorial 4
X axis of data = Predictor Variable / Independent Variable
Y axis of data = Outcome Variable / Dependent Variable
Check your variables: - Are they numerical or categorical?
penguins %>%
glimpse()
Rows: 344
Columns: 8
$ species <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
$ island <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
$ bill_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
$ bill_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
$ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
$ body_mass_g <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
$ sex <fct> male, female, female, NA, female, male, female, male…
$ year <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
Choosing the Right Test
Category x Category = Frequency tests (Chi-square)
Category x Number = Mean tests (T-tests, ANOVAs, Non-parametric equivalents)
Number x Number = Correlation
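A quick way to check which type each column is before choosing a test — a minimal base-R sketch:
sapply(penguins, is.numeric) # TRUE = numerical column, FALSE = categorical (factor/character)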
5 - Frequency Tests
Frequency Test Types:
Chi-square test
Z-test
G test
Binomial tests
Loglinear models
Click to expand Contingency Tables
Frequency Table - lists a set of values and how often each one appears - helpful to understand which data values are common or rare
Contingency Table - two-way or two-variable frequency table
head(Arthritis) # displays the first few rows of the 'Arthritis' data from the vcd package
  ID Treatment  Sex Age Improved
1 57 Treated Male 27 Some
2 46 Treated Male 29 None
3 77 Treated Male 30 None
4 17 Treated Male 32 Marked
5 36 Treated Male 46 Marked
6 23 Treated Male 58 Marked
# one way table
mytable <- table(Arthritis$Improved)
mytable # counts of each improved variable
None Some Marked
42 14 28
prop.table(mytable) # proportion of each variable
None Some Marked
0.5000000 0.1666667 0.3333333
prop.table(mytable)*100 # percentage of each variable
None Some Marked
50.00000 16.66667 33.33333
# two way table
mytable1 <- xtabs(~ Treatment+Improved, data=Arthritis)
mytable1 # counts of Improved per Treatment
         Improved
Treatment None Some Marked
Placebo 29 7 7
Treated 13 7 21
margin.table(mytable1, 1) # total counts for Treatment
Treatment
Placebo Treated
43 41
prop.table(mytable1, 1) # row proportions (each row sums to 1)
         Improved
Treatment None Some Marked
Placebo 0.6744186 0.1627907 0.1627907
Treated 0.3170732 0.1707317 0.5121951
margin.table(mytable1, 2) # total counts for Improved
Improved
None Some Marked
42 14 28
prop.table(mytable1, 2) # column proportions (each column sums to 1)
         Improved
Treatment None Some Marked
Placebo 0.6904762 0.5000000 0.2500000
Treated 0.3095238 0.5000000 0.7500000
prop.table(mytable1) # proportions of all cells (everything sums to 1)
         Improved
Treatment None Some Marked
Placebo 0.34523810 0.08333333 0.08333333
Treated 0.15476190 0.08333333 0.25000000
addmargins(mytable1) # adds sums to columns and rows
         Improved
Treatment None Some Marked Sum
Placebo 29 7 7 43
Treated 13 7 21 41
Sum 42 14 28 84
addmargins(prop.table(mytable1)) # cell proportions with proportion sums
         Improved
Treatment None Some Marked Sum
Placebo 0.34523810 0.08333333 0.08333333 0.51190476
Treated 0.15476190 0.08333333 0.25000000 0.48809524
Sum 0.50000000 0.16666667 0.33333333 1.00000000
addmargins(prop.table(mytable1, 1), 2) # row proportions with row sums
         Improved
Treatment None Some Marked Sum
Placebo 0.6744186 0.1627907 0.1627907 1.0000000
Treated 0.3170732 0.1707317 0.5121951 1.0000000
addmargins(prop.table(mytable1, 2), 1) # column proportions with column sums
         Improved
Treatment None Some Marked
Placebo 0.6904762 0.5000000 0.2500000
Treated 0.3095238 0.5000000 0.7500000
Sum 1.0000000 1.0000000 1.0000000
CrossTable(Arthritis$Treatment, Arthritis$Improved)
Cell Contents
|-------------------------|
| N |
| Chi-square contribution |
| N / Row Total |
| N / Col Total |
| N / Table Total |
|-------------------------|
Total Observations in Table: 84
| Arthritis$Improved
Arthritis$Treatment | None | Some | Marked | Row Total |
--------------------|-----------|-----------|-----------|-----------|
Placebo | 29 | 7 | 7 | 43 |
| 2.616 | 0.004 | 3.752 | |
| 0.674 | 0.163 | 0.163 | 0.512 |
| 0.690 | 0.500 | 0.250 | |
| 0.345 | 0.083 | 0.083 | |
--------------------|-----------|-----------|-----------|-----------|
Treated | 13 | 7 | 21 | 41 |
| 2.744 | 0.004 | 3.935 | |
| 0.317 | 0.171 | 0.512 | 0.488 |
| 0.310 | 0.500 | 0.750 | |
| 0.155 | 0.083 | 0.250 | |
--------------------|-----------|-----------|-----------|-----------|
Column Total | 42 | 14 | 28 | 84 |
| 0.500 | 0.167 | 0.333 | |
--------------------|-----------|-----------|-----------|-----------|
# Three-way Contingency table
newtable <- xtabs(~ Treatment+Sex+Improved, data=Arthritis) # creates a two-way Treatment x Sex table for each level of Improved
newtable
, , Improved = None
Sex
Treatment Female Male
Placebo 19 10
Treated 6 7
, , Improved = Some
Sex
Treatment Female Male
Placebo 7 0
Treated 5 2
, , Improved = Marked
Sex
Treatment Female Male
Placebo 6 1
Treated 16 5
margin.table(newtable, 1) # totals for Treatment
Treatment
Placebo Treated
43 41
margin.table(newtable, 2) # totals for Sex
Sex
Female Male
59 25
margin.table(newtable, 3) # totals for Improved
Improved
None Some Marked
42 14 28
margin.table(newtable, c(1,2)) # totals for Treatment by Sex
          Sex
Treatment Female Male
Placebo 32 11
Treated 27 14
ftable(newtable)
                 Improved None Some Marked
Treatment Sex
Placebo Female 19 7 6
Male 10 0 1
Treated Female 6 5 16
Male 7 2 5
ftable(prop.table(newtable, c(1,2)))
                 Improved       None       Some     Marked
Treatment Sex
Placebo Female 0.59375000 0.21875000 0.18750000
Male 0.90909091 0.00000000 0.09090909
Treated Female 0.22222222 0.18518519 0.59259259
Male 0.50000000 0.14285714 0.35714286
ftable(addmargins(prop.table(newtable, c(1,2)), 3))
                 Improved       None       Some     Marked        Sum
Treatment Sex
Placebo Female 0.59375000 0.21875000 0.18750000 1.00000000
Male 0.90909091 0.00000000 0.09090909 1.00000000
Treated Female 0.22222222 0.18518519 0.59259259 1.00000000
Male 0.50000000 0.14285714 0.35714286 1.00000000
ftable(addmargins(prop.table(newtable, c(1,2)), 3))*100
                 Improved      None      Some    Marked       Sum
Treatment Sex
Placebo Female 59.375000 21.875000 18.750000 100.000000
Male 90.909091 0.000000 9.090909 100.000000
Treated Female 22.222222 18.518519 59.259259 100.000000
Male 50.000000 14.285714 35.714286 100.000000
Click to expand Chi Square Test
volunteers <- matrix(c(111, 96, 48, 96, 133, 61, 91, 150, 53), # creating a 'matrix' of numbers
byrow = T, nrow = 3)
volunteers
     [,1] [,2] [,3]
[1,] 111 96 48
[2,] 96 133 61
[3,] 91 150 53
rownames(volunteers) <- c("Community College", "Four Year", "Non Students") # assigning row names
colnames(volunteers) <- c("1-3 hours", "4-6 hours", "7-9 hours") # assigning column names
volunteers
                  1-3 hours 4-6 hours 7-9 hours
Community College 111 96 48
Four Year 96 133 61
Non Students 91 150 53
model <- chisq.test(volunteers) # runs the test and stores the result
model # prints the result
Pearson's Chi-squared test
data: volunteers
X-squared = 12.991, df = 4, p-value = 0.01132
model$expected # the expected cell counts
                  1-3 hours 4-6 hours 7-9 hours
Community College 90.57211 115.1907 49.23719
Four Year 103.00358 131.0012 55.99523
Non Students 104.42431 132.8081 56.76758
model$observed # the observed counts
                  1-3 hours 4-6 hours 7-9 hours
Community College 111 96 48
Four Year 96 133 61
Non Students 91 150 53
model$residuals # Pearson residuals: (observed - expected) / sqrt(expected)
                   1-3 hours  4-6 hours  7-9 hours
Community College 2.1464772 -1.7880604 -0.1763148
Four Year -0.6900708 0.1746359 0.6688187
Non Students -1.3136852 1.4918030 -0.5000487
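The test object also stores standardized residuals; cells beyond roughly ±2 are the ones driving the result. A minimal sketch using another component chisq.test() returns:
model$stdres # standardized residuals, directly comparable across cells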
# Alternate way of ChiSquare test
vol_tab <- as.table(volunteers) # creates new table (vol_tab) from matrix
summary(vol_tab) # provides summary including chi-square
Number of cases in table: 839
Number of factors: 2
Test for independence of all factors:
Chisq = 12.991, df = 4, p-value = 0.01132
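For an effect size to accompany the chi-square, the vcd package (already loaded above) offers assocstats(); a minimal sketch:
assocstats(vol_tab) # reports chi-square plus phi, contingency coefficient and Cramer's V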
Click to expand Test for Independence
Example 1
naruto<-matrix(c(35,205,8,48), nrow=2, byrow=TRUE) # example table of naruto and ghibli fans
rownames(naruto) <- c("Naruto Fan NO", "Naruto fan YES")
colnames(naruto) <- c("Watched Ghibli", "Never watched Ghibli")
naruto
               Watched Ghibli Never watched Ghibli
Naruto Fan NO 35 205
Naruto fan YES 8 48
chisq.test(naruto)$expected # gives expected values
               Watched Ghibli Never watched Ghibli
Naruto Fan NO 34.864865 205.13514
Naruto fan YES 8.135135 47.86486
proportions(naruto[1,]) # gives proportions of row 1
      Watched Ghibli Never watched Ghibli
0.1458333 0.8541667
proportions(naruto[2,]) # gives proportions of row 2
      Watched Ghibli Never watched Ghibli
0.1428571 0.8571429
chisq.test(naruto) # chi square test
Pearson's Chi-squared test with Yates' continuity correction
data: naruto
X-squared = 5.9938e-30, df = 1, p-value = 1
X-squared is essentially zero and the p-value is 1, so we FAIL to reject the null hypothesis (H0 = "the variables are independent"). The variables are independent: knowing whether someone is a Naruto fan tells you nothing about whether they have watched Ghibli. The values are not related.
Example 2
placebo<-matrix(c(43,31,27,12,24,28), nrow=2, byrow=TRUE)
rownames(placebo)<-c("cure-yes", "cure-no")
colnames(placebo)<-c("treatment","placebo","no treatment")
exp.placebo<-chisq.test(placebo)$expected # shows expected values if no effect from treatments
exp.placebo treatment placebo no treatment
cure-yes 33.66667 33.66667 33.66667
cure-no 21.33333 21.33333 21.33333
chisq.test(placebo)
Pearson's Chi-squared test
data: placebo
X-squared = 10.619, df = 2, p-value = 0.004945
The p-value is less than 0.05, so there IS an association between being cured and the treatment received.
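A mosaic plot from the vcd package (already loaded) makes this association visible; a minimal sketch, where shading flags cells that deviate from independence:
mosaic(placebo, shade = TRUE) # tile area = cell count; shaded tiles have large residuals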
Importance of Proportions
Example of Importance of Proportions
# creating dataset for ladybirds
ladybirds <- tribble(
~Habitat, ~Site, ~Colour, ~Number,
"Rural", "R1", "Black", 10,
"Rural", "R2", "Black", 3,
"Rural", "R3", "Black", 4,
"Rural", "R4", "Black", 7,
"Rural", "R5", "Black", 6,
"Rural", "R1", "Red", 15,
"Rural", "R2", "Red", 18,
"Rural", "R3", "Red", 9,
"Rural", "R4", "Red", 12,
"Rural", "R5", "Red", 16,
"Industrial", "U1", "Black", 32,
"Industrial", "U2", "Black", 25,
"Industrial", "U3", "Black", 25,
"Industrial", "U4", "Black", 17,
"Industrial", "U5", "Black", 16,
"Industrial", "U1", "Red", 17,
"Industrial", "U2", "Red", 23,
"Industrial", "U3", "Red", 21,
"Industrial", "U4", "Red", 9,
"Industrial", "U5",
"Red", 15
)Is there an association between habitat type and morphotype?
ladybirds_wide <- ladybirds %>% # changes colour column into two columns 'Black' and 'Red'
pivot_wider(names_from = Colour, values_from = Number)
# Show totals
ladybirds %>% # creates wide table
group_by(Habitat, Colour) %>%
summarize(count = sum(Number)) %>% # creates count column with sum of number column
spread(Colour, count, fill = 0) |> # spreads Colour into columns containing the counts
kable()
| Habitat | Black | Red |
|---|---|---|
| Industrial | 115 | 85 |
| Rural | 30 | 70 |
# Show proportions
ladybirds |>
group_by(Habitat, Colour) %>%
summarize(count = sum(Number)) %>%
spread(Colour, count, fill = 0) |>
column_to_rownames("Habitat") |>
proportions() |>
kable()
| | Black | Red |
|---|---|---|
| Industrial | 0.3833333 | 0.2833333 |
| Rural | 0.1000000 | 0.2333333 |
# Show proportions by ROW
ladybirds |>
group_by(Habitat, Colour) %>%
summarise(count = sum(Number)) %>%
spread(Colour, count, fill = 0) |>
column_to_rownames("Habitat") |>
as.matrix()->t
proportions(t,1) |>
kable()
| | Black | Red |
|---|---|---|
| Industrial | 0.575 | 0.425 |
| Rural | 0.300 | 0.700 |
# Show proportions by COLUMN
ladybirds |>
group_by(Habitat, Colour) %>%
summarise(count = sum(Number)) %>%
spread(Colour, count, fill = 0) |>
column_to_rownames("Habitat") |>
as.matrix()->t
proportions(t,2) |>
kable()
| | Black | Red |
|---|---|---|
| Industrial | 0.7931034 | 0.5483871 |
| Rural | 0.2068966 | 0.4516129 |
# Instead of doing it all separately, you can use the following code to display row and column proportions
CrossTable(t)
Cell Contents
|-------------------------|
| N |
| Chi-square contribution |
| N / Row Total |
| N / Col Total |
| N / Table Total |
|-------------------------|
Total Observations in Table: 300
|
| Black | Red | Row Total |
-------------|-----------|-----------|-----------|
Industrial | 115 | 85 | 200 |
| 3.477 | 3.253 | |
| 0.575 | 0.425 | 0.667 |
| 0.793 | 0.548 | |
| 0.383 | 0.283 | |
-------------|-----------|-----------|-----------|
Rural | 30 | 70 | 100 |
| 6.954 | 6.505 | |
| 0.300 | 0.700 | 0.333 |
| 0.207 | 0.452 | |
| 0.100 | 0.233 | |
-------------|-----------|-----------|-----------|
Column Total | 145 | 155 | 300 |
| 0.483 | 0.517 | |
-------------|-----------|-----------|-----------|
# Now test Chi Sq within each colour (column)
t[,1] # first column, the Black counts per habitat
Industrial      Rural
115 30
chisq.test(t[,1])$expected # shows expected values - equal counts are expected under the null
Industrial      Rural
72.5 72.5
chisq.test(t[,1]) # chi square of row 1
Chi-squared test for given probabilities
data: t[, 1]
X-squared = 49.828, df = 1, p-value = 1.679e-12
# there is a very small p-value so the proportions differ a lot - black ladybirds may be affected by habitat?
t[,2] # second column, the Red counts per habitat
Industrial      Rural
85 70
chisq.test(t[,2])$expected
Industrial      Rural
77.5 77.5
chisq.test(t[,2])
Chi-squared test for given probabilities
data: t[, 2]
X-squared = 1.4516, df = 1, p-value = 0.2283
# large p - red ladybirds don't respond to habitat
# so habitat doesn't affect both morphotypes equally: black ladybirds are affected but red ladybirds are not
# Test Chi Sq within each habitat (row)
t[1,] # first row, the Industrial counts per colour
Black   Red
115 85
chisq.test(t[1,])$expected # expected values are equal
Black   Red
100 100
chisq.test(t[1,])
Chi-squared test for given probabilities
data: t[1, ]
X-squared = 4.5, df = 1, p-value = 0.03389
# small p value, may be some difference
t[2,] # second row, the Rural counts per colour
Black   Red
30 70
chisq.test(t[2,])$expected # expected values are equal
Black   Red
50 50
chisq.test(t[2,])
Chi-squared test for given probabilities
data: t[2, ]
X-squared = 16, df = 1, p-value = 6.334e-05
# smaller p-value - stronger difference
So what does this mean??
Need to ask this… Which is correct, should you test them all? What is the final answer?
Test for Homogeneity
Homogeneity in statistics refers to the idea that different samples or groups share the same distribution of a particular characteristic or variable.
# checking for homogeneity - do these samples have the same distribution?
pop<-tribble( # created a dataset of age distribution in Brazil and UK
~age, ~country, ~pop,
"0-10","UK", 50,
"0-10","BRA", 120,
"11-20","UK", 70,
"11-20","BRA", 140,
"21-30","UK", 110,
"21-30","BRA", 150,
"31-40","UK", 150,
"31-40","BRA", 180,
"40-50","UK", 180,
"40-50","BRA", 180,
"50-60","UK", 150,
"50-60","BRA", 90,
"60-70","UK", 130,
"60-70","BRA", 50)
pop |> # plotted a distribution graph
ggplot(aes(x=age, y= pop, color=country, fill=country))+
geom_col(position="dodge")+
theme(axis.text=element_text(size = 18),
axis.title = element_text(size=18))
pop |>
group_by(country, age) %>%
#summarise(count = sum(Number)) %>%
spread(country, pop, fill = 0) |>
column_to_rownames("age") |>
as.matrix() -> t2 # converted to matrix to perform chi square
chisq.test(t2)
Pearson's Chi-squared test
data: t2
X-squared = 108.97, df = 6, p-value < 2.2e-16
chisq.test(t2)$observed - chisq.test(t2)$expected # observed minus expected
        BRA    UK
0-10 31.6 -31.6
11-20 30.8 -30.8
21-30 14.8 -14.8
31-40 8.4 -8.4
40-50 -7.2 7.2
50-60 -34.8 34.8
60-70 -43.6 43.6
Goodness of Fit
Goodness of fit refers to how well a statistical model fits a set of observations, measured as the discrepancy between the observed data and the model's predicted values (e.g. the chi-square goodness-of-fit test). The test produces 'residuals'; a smaller residual means a better fit.
pop %>%
spread(country, pop, fill=0) %>%
mutate(world=(BRA+UK)/2) %>% # comparing against world distribution
pivot_longer(cols = BRA:world, names_to = "place")->pop2
pop2
# A tibble: 21 × 3
age place value
<chr> <chr> <dbl>
1 0-10 BRA 120
2 0-10 UK 50
3 0-10 world 85
4 11-20 BRA 140
5 11-20 UK 70
6 11-20 world 105
7 21-30 BRA 150
8 21-30 UK 110
9 21-30 world 130
10 31-40 BRA 180
# ℹ 11 more rows
pop2 %>% # plotting distribution
ggplot(aes(x=age,y=value, colour=place, fill=place))+
geom_col(position = "dodge")pop2 %>% # preparing data
spread(place, value, fill = 0) %>%
column_to_rownames("age")-> pop3
chisq.test(pop3$BRA, pop3$world) # chi square of Brazil vs world
Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
incorrect
Pearson's Chi-squared test
data: pop3$BRA and pop3$world
X-squared = 35, df = 30, p-value = 0.2426
chisq.test(pop3$UK, pop3$world) # chi square of UK vs world
Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
incorrect
Pearson's Chi-squared test
data: pop3$UK and pop3$world
X-squared = 35, df = 30, p-value = 0.2426
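Note that the calls above cross-tabulate the two numeric columns against each other, which is why R warns the approximation may be incorrect. The conventional goodness-of-fit form passes one vector of observed counts plus expected proportions via the p argument; a sketch that treats the 'world' column as the expected distribution:
chisq.test(x = pop3$BRA, p = pop3$world / sum(pop3$world)) # observed BRA counts vs 'world' proportions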
Loglinear models
Loglinear models are a type of statistical model used to analyze the relationships between categorical variables.
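A minimal loglinear sketch, assuming the MASS package is installed, fitted to the two-way mytable1 built in the contingency tables section:
MASS::loglm(~ Treatment + Improved, data = mytable1) # independence model; a large LR chi-square means a poor fit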
If BOTH the predictor and the outcome variable are CATEGORICAL, then you need a frequency test.
6 - Mean Tests
A t-test is a statistical analysis that simply tells us whether the difference between two means is significant.
Click to expand Mean Tests
Mean Tests
Only family of tests that have parametric and non-parametric tests
If we have a categorical Y value and Numerical X value - we use means tests
Comparing Means:
- One sample: P = one-sample t-test, NP = one-sample Wilcoxon
- Two samples: P = unpaired t-test, NP = unpaired Wilcoxon (a sketch follows this list)
- More than two samples: P = one/two-way ANOVA or MANOVA, NP = Kruskal-Wallis
- Paired samples: P = paired t-test, NP = paired Wilcoxon
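None of the non-parametric options appear in the code below, so here is a minimal sketch of the unpaired Wilcoxon (Mann-Whitney) test on simulated skewed data:
set.seed(123)
g1 <- rexp(40, rate = 0.10) # skewed, non-normal sample
g2 <- rexp(40, rate = 0.20)
wilcox.test(g1, g2, paired = FALSE) # non-parametric alternative to the unpaired t-test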
Mean Test Assumptions: - Continuous Data - Normally Distributed - Random Sample - Enough Data
Mu (μ) = Greek letter representing the population mean
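The one-sample t-test compares a sample mean against a fixed mu; a minimal sketch with simulated data:
set.seed(1)
x <- rnorm(30, mean = 10, sd = 2) # simulated sample
t.test(x, mu = 10) # H0: the true mean equals 10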
# Unpaired T-Test
set.seed(123456)
body_mass_g1 = rnorm(100,60,12) # 100 values with mean of 60 and SD of 12
body_mass_g2 = rnorm(100,45,12)
t.test(body_mass_g1, body_mass_g2, paired = FALSE) # this is testing the means of each group against one another
Welch Two Sample t-test
data: body_mass_g1 and body_mass_g2
t = 9.3705, df = 197.98, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
12.41382 19.03149
sample estimates:
mean of x mean of y
60.20184 44.47918
# Null hypothesis is that the difference between the means is 0
# to go further you can measure the effect size
effsize::cohen.d(body_mass_g1, body_mass_g2)
Cohen's d
d estimate: 1.325185 (large)
95 percent confidence interval:
lower upper
1.017207 1.633163
# there is a large effect size indicating a substantial difference between the two groups
# Mean mass of 60g for G1 and 44g for G2: a mean difference of ~16g with a 95% confidence interval of 12-19g.
# Paired T-test
set.seed(123456)
foraging_time_preburn = rnorm(100,10,2)
foraging_time_postburn = rnorm(100,11,2)
t.test(foraging_time_preburn, foraging_time_postburn, paired = TRUE)
Paired t-test
data: foraging_time_preburn and foraging_time_postburn
t = -2.8538, df = 99, p-value = 0.005261
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-1.4910978 -0.2680168
sample estimates:
mean difference
-0.8795573
# measuring effect size
effsize::cohen.d(foraging_time_postburn, foraging_time_preburn)
Cohen's d
d estimate: 0.4448012 (small)
95 percent confidence interval:
lower upper
0.1624883 0.7271141
# small p value but small effect size
# there is roughly a 52 second or 9% difference in foraging time
# with a result like this, need to be careful and look into data.
# significant difference but small effect - will this result make a massive difference to the organism?
7 - Correlations
Click to expand Correlations
Correlation measures the strength and direction of a linear relationship between two quantitative variables.
set.seed(123456)
riverspeed <- rnorm(100,7,1)
turtle_length <- riverspeed + rnorm(100,30,1) # creating dataset
cor.test(riverspeed,turtle_length) # correlation test
Pearson's product-moment correlation
data: riverspeed and turtle_length
t = 8.0591, df = 98, p-value = 1.897e-12
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.4964851 0.7364323
sample estimates:
cor
0.6313361
# There is a correlation between the two variables: cor = 0.63, p < 0.05
riverspeeddf <- data.frame(riverspeed) # converting to dataframe
turtle_length_df <- data.frame(turtle_length)
binded <- cbind(riverspeeddf, turtle_length_df) # binding two dataframes together
ggplot(binded, aes(x = riverspeed, y = turtle_length))+ # plotting dataframe
geom_point()+
geom_smooth(method = "lm")+
theme(axis.text=element_text(size=16),
axis.title=element_text(size=16))+
theme_minimal()
`geom_smooth()` using formula = 'y ~ x'
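Pearson's test above assumes a linear relationship and roughly normal data; when that is in doubt, the rank-based Spearman correlation is the usual fallback. A minimal sketch on the same data:
cor.test(riverspeed, turtle_length, method = "spearman") # rank-based, robust to outliers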
Exercises
8 - Linear Models
Click to expand Linear Models
Linear models are a statistical method used to approximate the relationship between variables. Under a regression model, we calculate an equation of a line that best describes the data. In fact, that’s what regression means! Regressing the points into a line.
Linear Models = Regression
Numerical vs Numerical = scatterplot
Why use linear models?
When you do expect effects
When you are able to explain effects
You want predictive values
Understanding Linear Models
The slope is a measure of strength
R2 is a measure of how well the model fits the data
lm(formula = qsec ~ hp, data = mtcars) # lm(y~x)
Call:
lm(formula = qsec ~ hp, data = mtcars)
Coefficients:
(Intercept) hp
20.55635 -0.01846
anova(lm(formula = qsec ~ hp, data = mtcars)) # Shows ANOVA table
Analysis of Variance Table
Response: qsec
Df Sum Sq Mean Sq F value Pr(>F)
hp 1 49.651 49.651 30.19 5.766e-06 ***
Residuals 30 49.338 1.645
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(formula = qsec ~ hp, data = mtcars)) # Shows summary
Call:
lm(formula = qsec ~ hp, data = mtcars)
Residuals:
Min 1Q Median 3Q Max
-2.1766 -0.6975 0.0348 0.6520 4.0972
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 20.556354 0.542424 37.897 < 2e-16 ***
hp -0.018458 0.003359 -5.495 5.77e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.282 on 30 degrees of freedom
Multiple R-squared: 0.5016, Adjusted R-squared: 0.485
F-statistic: 30.19 on 1 and 30 DF, p-value: 5.766e-06
# Sum sq is 50% hp and 50% residuals
# residuals are the 'errors' or distances from points to the correlation line
# ANOVA and summary of the LM show different features - use both to investigate results and choose the better model
Part Two Regression
Simple Linear Regression
mtcars
## Model Assumptions
shapiro.test(mtcars$qsec) #### Test normality
Shapiro-Wilk normality test
data: mtcars$qsec
W = 0.97325, p-value = 0.5935
#### p > 0.05, so qsec is normally distributed
m1 <- lm(qsec~hp, data=mtcars) # assigns model 1
m1 # check coefficients
Call:
lm(formula = qsec ~ hp, data = mtcars)
Coefficients:
(Intercept) hp
20.55635 -0.01846
m1$residuals # residuals
        Mazda RX4     Mazda RX4 Wag        Datsun 710    Hornet 4 Drive
-2.06593942 -1.50593942 -0.22973076 0.91406058
Hornet Sportabout Valiant Duster 360 Merc 240D
-0.30614897 1.60176901 -0.19406695 0.58806148
Merc 230 Merc 280 Merc 280C Merc 450SE
4.09718587 0.01401867 0.61401867 0.16614260
Merc 450SL Merc 450SLC Cadillac Fleetwood Lincoln Continental
0.36614260 0.76614260 1.20760047 1.23218361
Chrysler Imperial Fiat 128 Honda Civic Toyota Corolla
1.10905833 0.13189474 -1.07652166 0.54343643
Toyota Corona Dodge Challenger AMC Javelin Camaro Z28
1.24410249 -0.91760683 -0.48760683 -0.62406695
Pontiac Firebird Fiat X1-9 Porsche 914-2 Lotus Europa
-0.27614897 -0.43810526 -2.17664739 -1.57056447
Ford Pantera L Ferrari Dino Maserati Bora Volvo 142E
-1.18335897 -1.82614897 0.22718136 0.05560227
m1$fitted.values # the values that make up the fitted line
        Mazda RX4     Mazda RX4 Wag        Datsun 710    Hornet 4 Drive
18.52594 18.52594 18.83973 18.52594
Hornet Sportabout Valiant Duster 360 Merc 240D
17.32615 18.61823 16.03407 19.41194
Merc 230 Merc 280 Merc 280C Merc 450SE
18.80281 18.28598 18.28598 17.23386
Merc 450SL Merc 450SLC Cadillac Fleetwood Lincoln Continental
17.23386 17.23386 16.77240 16.58782
Chrysler Imperial Fiat 128 Honda Civic Toyota Corolla
16.31094 19.33811 19.59652 19.35656
Toyota Corona Dodge Challenger AMC Javelin Camaro Z28
18.76590 17.78761 17.78761 16.03407
Pontiac Firebird Fiat X1-9 Porsche 914-2 Lotus Europa
17.32615 19.33811 18.87665 18.47056
Ford Pantera L Ferrari Dino Maserati Bora Volvo 142E
15.68336 17.32615 14.37282 18.54440
ggplot(mtcars, aes(hp, qsec))+
geom_point()
ggplot(mtcars, aes(hp, qsec))+
geom_point()+
geom_smooth(method = lm, se=FALSE) # plotting points with linear model fit
`geom_smooth()` using formula = 'y ~ x'
mtcars %>%
mutate(fitted=m1$fitted.values) %>% # plotting fitted values
ggplot(aes(hp, fitted))+
geom_point()+
geom_smooth(method = lm, se=FALSE)
`geom_smooth()` using formula = 'y ~ x'
Multiple Linear Models
penguins
# A tibble: 344 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>
m2 <- lm(flipper_length_mm~bill_length_mm, data=penguins) # simple model with a single predictor
summary(m2)
Call:
lm(formula = flipper_length_mm ~ bill_length_mm, data = penguins)
Residuals:
Min 1Q Median 3Q Max
-43.708 -7.896 0.664 8.650 21.179
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 126.6844 4.6651 27.16 <2e-16 ***
bill_length_mm 1.6901 0.1054 16.03 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 10.63 on 340 degrees of freedom
(2 observations deleted due to missingness)
Multiple R-squared: 0.4306, Adjusted R-squared: 0.4289
F-statistic: 257.1 on 1 and 340 DF, p-value: < 2.2e-16
m3 <- lm(flipper_length_mm~bill_length_mm+
bill_depth_mm +
body_mass_g, data=penguins) # adding multiple components
m3
Call:
lm(formula = flipper_length_mm ~ bill_length_mm + bill_depth_mm +
body_mass_g, data = penguins)
Coefficients:
(Intercept) bill_length_mm bill_depth_mm body_mass_g
157.78594 0.59227 -1.67868 0.01093
shapiro.test(m3$residuals) # test multiple normality
Shapiro-Wilk normality test
data: m3$residuals
W = 0.99081, p-value = 0.03121
# residuals are not normal
plot(m3) # visually testing residuals
m3log <- lm(flipper_length_mm~log(bill_length_mm)+
log(bill_depth_mm) +
log(body_mass_g), data=penguins) # converting data to log versions
m3log
Call:
lm(formula = flipper_length_mm ~ log(bill_length_mm) + log(bill_depth_mm) +
log(body_mass_g), data = penguins)
Coefficients:
(Intercept) log(bill_length_mm) log(bill_depth_mm)
-188.59 25.91 -30.10
log(body_mass_g)
45.29
shapiro.test(m3log$residuals)
Shapiro-Wilk normality test
data: m3log$residuals
W = 0.99271, p-value = 0.09402
# now the residuals are normal (p > 0.05)
summary(m3log)
Call:
lm(formula = flipper_length_mm ~ log(bill_length_mm) + log(bill_depth_mm) +
log(body_mass_g), data = penguins)
Residuals:
Min 1Q Median 3Q Max
-21.046 -3.883 -0.125 4.108 14.740
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -188.589 21.645 -8.713 < 2e-16 ***
log(bill_length_mm) 25.914 3.209 8.075 1.2e-14 ***
log(bill_depth_mm) -30.098 3.098 -9.717 < 2e-16 ***
log(body_mass_g) 45.285 2.343 19.330 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.934 on 338 degrees of freedom
(2 observations deleted due to missingness)
Multiple R-squared: 0.8235, Adjusted R-squared: 0.8219
F-statistic: 525.7 on 3 and 338 DF, p-value: < 2.2e-16
# Colinearity
## when two or more predictor variables in a regression model are correlated
cor(mtcars)
            mpg        cyl       disp         hp        drat         wt
mpg 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.68117191 -0.8676594
cyl -0.8521620 1.0000000 0.9020329 0.8324475 -0.69993811 0.7824958
disp -0.8475514 0.9020329 1.0000000 0.7909486 -0.71021393 0.8879799
hp -0.7761684 0.8324475 0.7909486 1.0000000 -0.44875912 0.6587479
drat 0.6811719 -0.6999381 -0.7102139 -0.4487591 1.00000000 -0.7124406
wt -0.8676594 0.7824958 0.8879799 0.6587479 -0.71244065 1.0000000
qsec 0.4186840 -0.5912421 -0.4336979 -0.7082234 0.09120476 -0.1747159
vs 0.6640389 -0.8108118 -0.7104159 -0.7230967 0.44027846 -0.5549157
am 0.5998324 -0.5226070 -0.5912270 -0.2432043 0.71271113 -0.6924953
gear 0.4802848 -0.4926866 -0.5555692 -0.1257043 0.69961013 -0.5832870
carb -0.5509251 0.5269883 0.3949769 0.7498125 -0.09078980 0.4276059
qsec vs am gear carb
mpg 0.41868403 0.6640389 0.59983243 0.4802848 -0.55092507
cyl -0.59124207 -0.8108118 -0.52260705 -0.4926866 0.52698829
disp -0.43369788 -0.7104159 -0.59122704 -0.5555692 0.39497686
hp -0.70822339 -0.7230967 -0.24320426 -0.1257043 0.74981247
drat 0.09120476 0.4402785 0.71271113 0.6996101 -0.09078980
wt -0.17471588 -0.5549157 -0.69249526 -0.5832870 0.42760594
qsec 1.00000000 0.7445354 -0.22986086 -0.2126822 -0.65624923
vs 0.74453544 1.0000000 0.16834512 0.2060233 -0.56960714
am -0.22986086 0.1683451 1.00000000 0.7940588 0.05753435
gear -0.21268223 0.2060233 0.79405876 1.0000000 0.27407284
carb -0.65624923 -0.5696071 0.05753435 0.2740728 1.00000000
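A common numeric check for collinearity is the variance inflation factor. A minimal sketch, assuming the car package is installed (it is not loaded above):
car::vif(lm(mpg ~ cyl + disp + hp, data = mtcars)) # VIF above ~5-10 flags problematic collinearity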
lm1 <- lm(mpg~cyl, data = mtcars)
lm2 <- lm(mpg~cyl+disp, data = mtcars)
lm3 <- lm(mpg~cyl+disp+hp, data = mtcars)
summary(lm1)
Call:
lm(formula = mpg ~ cyl, data = mtcars)
Residuals:
Min 1Q Median 3Q Max
-4.9814 -2.1185 0.2217 1.0717 7.5186
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 37.8846 2.0738 18.27 < 2e-16 ***
cyl -2.8758 0.3224 -8.92 6.11e-10 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.206 on 30 degrees of freedom
Multiple R-squared: 0.7262, Adjusted R-squared: 0.7171
F-statistic: 79.56 on 1 and 30 DF, p-value: 6.113e-10
anova(lm1,lm2,lm3) # comparing models with anova
Analysis of Variance Table
Model 1: mpg ~ cyl
Model 2: mpg ~ cyl + disp
Model 3: mpg ~ cyl + disp + hp
Res.Df RSS Df Sum of Sq F Pr(>F)
1 30 308.33
2 29 270.74 1 37.594 4.0274 0.05451 .
3 28 261.37 1 9.371 1.0039 0.32495
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# testing whether complexity or different combinations improves model
# a significant p shows that the model is different from the others and significantly improved
# more parameters you add, more degrees of freedom but increased chances of error
# simplest model is often best
# even if you lose a little bit of explanation, the model may work better
9 - Logistic Models
Click to expand Logistic Models
Reminder (x by y):
Num x Num = Correlation
Num x Num = Linear Model
Num x Cat = Logistic Model
Better than Linear model for predicting probability
Detects at which point the probability shifts
Flat s-curve shows the variable is not a good predictor
Strong s-curve shows good predictor
Log model gives you:
- coefficients: show the effect of a one-unit change in the predictor variable
- deviance: a measure of how well the model fits the data
inputting families (?family) for models:
binomial = If your outcome is binary (zeros and ones), proportions of “successes” and “failures” (values between 0 and 1), or their counts
gaussian = If your outcome is continuous and unbounded (aka normal)
Gamma = If you are dealing with continuous non-negative outcome
poisson = If your outcome is discrete, or more precisely, you are dealing with counts
Other more complicated types:
quasi
quasibinomial
quasipoisson
inverse.gaussian
Example
m1<-glm(sex~body_mass_g,family="binomial", data=penguins)
m1
Call: glm(formula = sex ~ body_mass_g, family = "binomial", data = penguins)
Coefficients:
(Intercept) body_mass_g
-5.16254 0.00124
Degrees of Freedom: 332 Total (i.e. Null); 331 Residual
(11 observations deleted due to missingness)
Null Deviance: 461.6
Residual Deviance: 396.6 AIC: 400.6
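To see the s-curve described above, the fitted probabilities can be plotted. A minimal sketch that recodes sex to 0/1 (assuming 'male' as 1, matching the second factor level that glm models):
penguins %>%
na.omit() %>%
mutate(is_male = as.numeric(sex == "male")) %>% # 0 = female, 1 = male (assumed coding)
ggplot(aes(x = body_mass_g, y = is_male)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "glm", method.args = list(family = "binomial")) # draws the logistic s-curve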
Logistic models don’t produce R value You can produce a ‘pseudo R2’ which shows the effect size on the model
null<-m1$null.deviance/-2
model<-m1$deviance/-2
(null-model)/null
[1] 0.1407346
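The fitted model can also be turned into predicted probabilities with predict(). A sketch for a hypothetical 4000 g penguin (glm predicts the second factor level, here 'male'):
predict(m1, newdata = data.frame(body_mass_g = 4000), type = "response") # P(male) for a 4000 g bird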
You can use logistic models to test all variables against one outcome by using '.' in the formula:
m2<-glm(formula = sex ~ ., family = "binomial", data = penguins)
summary(m2)
Call:
glm(formula = sex ~ ., family = "binomial", data = penguins)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 310.553424 632.100857 0.491 0.62321
speciesChinstrap -7.624848 1.714564 -4.447 8.70e-06 ***
speciesGentoo -8.766260 2.679151 -3.272 0.00107 **
islandDream 0.394305 0.817853 0.482 0.62972
islandTorgersen -0.513102 0.849231 -0.604 0.54571
bill_length_mm 0.627014 0.134709 4.655 3.25e-06 ***
bill_depth_mm 1.613122 0.338498 4.766 1.88e-06 ***
flipper_length_mm 0.037682 0.051547 0.731 0.46476
body_mass_g 0.005717 0.001090 5.244 1.58e-07 ***
year -0.195493 0.316257 -0.618 0.53648
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 461.61 on 332 degrees of freedom
Residual deviance: 125.66 on 323 degrees of freedom
(11 observations deleted due to missingness)
AIC: 145.66
Number of Fisher Scoring iterations: 7
Final Model
m3<-glm(sex~bill_length_mm+
bill_depth_mm+
body_mass_g,
family = "binomial",
data=penguins)
summary(m3)
Call:
glm(formula = sex ~ bill_length_mm + bill_depth_mm + body_mass_g,
family = "binomial", data = penguins)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -6.056e+01 7.081e+00 -8.552 < 2e-16 ***
bill_length_mm 9.151e-02 4.416e-02 2.072 0.0382 *
bill_depth_mm 2.063e+00 2.469e-01 8.356 < 2e-16 ***
body_mass_g 5.061e-03 6.348e-04 7.971 1.57e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 461.61 on 332 degrees of freedom
Residual deviance: 159.89 on 329 degrees of freedom
(11 observations deleted due to missingness)
AIC: 167.89
Number of Fisher Scoring iterations: 7
This produces an effect size of 65%
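That figure comes from the same pseudo-R2 formula used earlier; a quick check:
(m3$null.deviance - m3$deviance) / m3$null.deviance # (461.61 - 159.89) / 461.61 ≈ 0.65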
Types of test based on x and y variables:
99% of all tests can be done with these…
Notes
Basic Functions
Basic Functions
library(package) #loads particular package
data1 <- c(1,2,3,4) # assigns a name to data (c() builds a vector)
ls() # lists the objects in the environment
nameofdata #to list items within that dataset
rm() # removes data items from the environment
Basic Data Management
Basic Data Management
c() # combines values into a vector
# e.g. data1 <- c(1,2,3)
mutate() # can be used to add columns to a dataset
summarise() # summarises the data
str() #shows the structure of the data
group_by() # groups data by specified value
ungroup() #always ungroup data afterwards
filter() # can be used to show specified values
select() # selects specific columns
Basic Plots
Scattergraph
data(mtcars)
data(pressure)
data(BOD)
data(ToothGrowth)
# Base R Scattergraph
plot(mtcars$wt, mtcars$mpg)
library(ggplot2)
# ggplot Scattergraph
ggplot(mtcars, aes(x = wt, y = mpg)) + #this tells it to create a plot object
geom_point() # this just tells it to add a layer of points to the plot
Line Graph
# Line Graph
#Base R function
plot(pressure$temperature, pressure$pressure, type = "l", xlab = "Temperature", ylab = "pressure")
plot(pressure$temperature, pressure$pressure, type = "l")
points(pressure$temperature, pressure$pressure, col = "red")
lines(pressure$temperature, pressure$pressure/2, col = "red")
# ggplot Line Graph
ggplot(pressure, aes(x = temperature, y = pressure)) +
geom_line(col = "blue")Bar Graph
# Bar Graph
#Base R function
barplot(BOD$demand, names.arg = BOD$Time)
barplot(table(mtcars$cyl))
# ggplot Bar Graph
ggplot(BOD, aes(x = Time, y = demand)) +
geom_col()
ggplot(BOD, aes(x = factor(Time), y = demand)) + # converts x variable to a factor so that it is discrete
geom_col()
ggplot(mtcars, aes(x = cyl)) +
geom_bar()
ggplot(mtcars, aes(x = factor(cyl))) + # discrete not continuous
geom_bar()
Histogram
# Histogram
hist(mtcars$mpg)
hist(mtcars$mpg, breaks = 10) # specifies no. of breaks
# ggplot Histogram
ggplot(mtcars, aes(x = mpg)) +
geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(mtcars, aes(x = mpg)) +
geom_histogram(binwidth = 4)
Box Plot
# Box Plot
plot(ToothGrowth$supp, ToothGrowth$len) # when x is a factor (not numerical) it will automatically create a boxplot
boxplot(len ~ supp, data = ToothGrowth)
boxplot(len ~ supp + dose, data = ToothGrowth)
# ggplot Box Plot
ggplot(ToothGrowth, aes(x = supp, y = len)) +
geom_boxplot()
ggplot(ToothGrowth, aes(x = interaction(supp, dose), y = len)) +
geom_boxplot()
Function Curve
# Function Curve
curve(x^3 - 5*x, from = -4, to = 4)
myfun <- function(xvar) {
1 / (1 + exp(-xvar + 10))
}
curve(myfun(x), from = 0, to = 20)
# Add a line:
curve(1 - myfun(x), add = TRUE, col = "red")
# ggplot Function Curve
ggplot(data.frame(x = c(0,20)), aes(x = x)) +
stat_function(fun = myfun, geom = "line")