Josh Shaw - R Workbook

library(tidyverse)
library(readr)
library(knitr)
library(dplyr)
library(palmerpenguins)
library(tibble)
library(janitor)
library(gmodels)
library(vcd)
library(effsize)

This workbook is where I store code from the tutorials, along with notes on additional functions.

1 - Beginning with R

Importing and viewing a dataset in R

  • Displaying the dataset in Quarto
Tutorial 1

Importing a dataset

This function imports a .csv file; there are similar functions for other file types.

 penguins <- read.csv("E:/UNI/MASTERS/R/1/penguins.csv")
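
The readr package loaded above provides a similar reader, read_csv(), which returns a tibble; a minimal sketch (the Excel line is an illustration, and assumes the readxl package is installed):

penguins <- read_csv("E:/UNI/MASTERS/R/1/penguins.csv") # tibble instead of a data.frame
# readxl::read_excel("E:/UNI/MASTERS/R/1/penguins.xlsx") # hypothetical Excel equivalent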

Viewing the dataset

This function opens the data viewer within RStudio

  View(penguins)

Displaying the data

Using this function, I am displaying the first 10 rows of data in my Quarto document

library(readr)  
head(penguins, 10)
    X species    island bill_length_mm bill_depth_mm flipper_length_mm
1   1  Adelie Torgersen           39.1          18.7               181
2   2  Adelie Torgersen           39.5          17.4               186
3   3  Adelie Torgersen           40.3          18.0               195
4   4  Adelie Torgersen             NA            NA                NA
5   5  Adelie Torgersen           36.7          19.3               193
6   6  Adelie Torgersen           39.3          20.6               190
7   7  Adelie Torgersen           38.9          17.8               181
8   8  Adelie Torgersen           39.2          19.6               195
9   9  Adelie Torgersen           34.1          18.1               193
10 10  Adelie Torgersen           42.0          20.2               190
   body_mass_g    sex year
1         3750   male 2007
2         3800 female 2007
3         3250 female 2007
4           NA   <NA> 2007
5         3450 female 2007
6         3650   male 2007
7         3625 female 2007
8         4675   male 2007
9         3475   <NA> 2007
10        4250   <NA> 2007

However, the above code was wrapping the columns across several lines, so with the help of ChatGPT I learned to use the following function to display my data correctly:

library(knitr)
kable(head(penguins, 10))
 X  species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year
 1  Adelie   Torgersen            39.1           18.7                181         3750  male    2007
 2  Adelie   Torgersen            39.5           17.4                186         3800  female  2007
 3  Adelie   Torgersen            40.3           18.0                195         3250  female  2007
 4  Adelie   Torgersen              NA             NA                 NA           NA  NA      2007
 5  Adelie   Torgersen            36.7           19.3                193         3450  female  2007
 6  Adelie   Torgersen            39.3           20.6                190         3650  male    2007
 7  Adelie   Torgersen            38.9           17.8                181         3625  female  2007
 8  Adelie   Torgersen            39.2           19.6                195         4675  male    2007
 9  Adelie   Torgersen            34.1           18.1                193         3475  NA      2007
10  Adelie   Torgersen            42.0           20.2                190         4250  NA      2007

2 - Data Wrangling

Tutorial 2

Exercises

Diamond Dataset

diamonds %>%
  group_by(color, clarity) %>% # groups diamonds by colour and clarity
  mutate(price200 = mean(price)) %>% # adds a column with the mean price per colour + clarity group
  ungroup() %>% # ungroups the data
  mutate(random10 = 10 + price) %>% # adds column with price+10
  select(cut, color, clarity, price, price200, random10) %>% # selects just specified columns
  arrange(color) %>% # arranging the data by colour
  group_by(cut) %>% # then grouping the data by cut
  mutate(dis = n_distinct(price), # counts the number of unique price values per cut
         rowID = row_number()) %>%
  ungroup() # Finally, ungroup the data

diamonds %>% 
  group_by(clarity) %>%  # grouping by column clarity
  summarize(a = n_distinct(color), # new column with number of distinct colours
            b = n_distinct(price), # new column with number of distinct prices
            c = n()) %>% # new column with count of how many diamonds in each clarity group
  ungroup()

diamonds %>% 
  group_by(color, cut) %>%  # group by colour and cut
  summarize(m = mean(price), # summarise the mean price
            s = sd(price)) %>% # summarise the standard deviation of price
  ungroup()

diamonds %>% 
  group_by(cut, color) %>% # this organises cut column before colour
  summarize(m = mean(price),
            s = sd(price)) %>% 
  ungroup()

diamonds %>% 
  group_by(color) %>% 
  summarize(m = mean(price)) %>% 
  mutate(x1 = str_c("Diamond color ", color), # this combines the phrase Diamond Colour with the results of 'colour'
         x2 = 5) %>% 
  ungroup()

diamonds %>% 
  group_by(color) %>% 
  summarize(m = mean(price)) %>% 
  ungroup() %>% 
  mutate(x1 = str_c("Diamond color ", color), # here, mutate is applied to the entire dataframe not just the colour summary
         x2 = 5) 

Midwest Dataset

midwest %>% 
  group_by(state) %>% 
  summarise(poptotalmean = mean(poptotal),
            poptotalmed = median(poptotal),
            popmax = max(poptotal),
            popmin = min(poptotal),
            popdistinct = n_distinct(poptotal),
            popfirst = first(poptotal),
            popany = any(poptotal < 5000),
            popany2 = any(poptotal > 2000000)) %>% # this is summarising the data instead of 'mutating and adding the data in columns'
  ungroup()


midwest %>% 
  group_by(state) %>% 
  summarize(num5k = sum(poptotal < 5000), # sums total number with less than 5000 pop
            num2mil = sum(poptotal > 2000000), # sums total number with more than 2m pop
            numrows = n()) %>% # counts no. rows
  ungroup()


midwest %>% 
  group_by(county) %>%  # groups dataset by county
  summarize(x = n_distinct(state)) %>% # counts how many distinct states each county name appears in
  arrange(desc(x)) %>% # arrange in descending order
  ungroup()

midwest %>% 
  group_by(county) %>% 
  summarize(x = n()) %>% # this counts the number of rows per county
  ungroup()

midwest %>% 
  group_by(state) %>% # grouped by state
  summarize(x = n_distinct(county)) %>% # counting how many counties in each state
  ungroup()

Extra Practice

data(diamonds)
view(diamonds)

diamonds %>%
  arrange(price) # arranges diamonds by lowest to highest price

diamonds %>%
  arrange(desc(price)) # arranges diamonds highest to lowest price

diamondshigh <- diamonds %>%
  arrange(desc(price)) # creates new dataset with diamonds arranged highest to lowest

diamonds %>%
  arrange(price, cut) # arranges by price and cut

diamonds %>%
  arrange(desc(price), clarity) # arranges by price descending and clarity 

diamonds %>%
  mutate(salePrice = price - 250) # creates a new column with sale price

diamonds %>%
  select(1:7) # removes x y z columns

diamonds %>%
  group_by(cut) %>% # groups by cut
  summarise(count = n()) %>% # counts the number of diamonds per cut
  ungroup()
  
diamonds %>%
  mutate(totalnum = n()) # creates new column with total count

Research Methods Questions

Good = What factors affect the price of a diamond, and which has the most significant effect?

Bad = Does colour affect the price of diamonds?

3 - Data Exploration

Tutorial 3

Plotting different graphs

data(penguins)
penguins
# A tibble: 344 × 8
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           39.1          18.7               181        3750
 2 Adelie  Torgersen           39.5          17.4               186        3800
 3 Adelie  Torgersen           40.3          18                 195        3250
 4 Adelie  Torgersen           NA            NA                  NA          NA
 5 Adelie  Torgersen           36.7          19.3               193        3450
 6 Adelie  Torgersen           39.3          20.6               190        3650
 7 Adelie  Torgersen           38.9          17.8               181        3625
 8 Adelie  Torgersen           39.2          19.6               195        4675
 9 Adelie  Torgersen           34.1          18.1               193        3475
10 Adelie  Torgersen           42            20.2               190        4250
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>
penguins %>%
  na.omit() %>%
  group_by(species) %>% # group the data by species (grouping doesn't change the plot here)
  ggplot(aes(x=bill_length_mm, color=species, fill=species)) + # plot bill length vs species with colour and fill = species
  geom_histogram() # make a histogram
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Bill Length per species

penguins %>%
  group_by(species) %>% # group the data by species
  ggplot(aes(x = species, y = bill_length_mm, colour = species, fill = species)) + # plot bill length vs species with col and fill = species
  geom_boxplot(alpha = 0.5) + # create a boxplot
  theme(axis.title = element_text(size = 16), # make text size 16
        axis.text = element_text(size = 16))
Warning: Removed 2 rows containing non-finite values (`stat_boxplot()`).

# Count of species

penguins %>%
  ggplot(aes(x = species, color = species, fill = species)) +
  geom_bar(alpha= 0.5) + # plot bar chart of species
  theme(axis.title = element_text(size = 16), # make text size 16
      axis.text = element_text(size = 16))

# Observations per year

penguins %>%
  ggplot(aes(x= year, colour = species, fill = species)) + # plots year along x axis with species filled in the bars
  geom_bar() + 
  theme(axis.title = element_text(size = 16), # make text size 16
        axis.text = element_text(size = 16))

# Species per Island

penguins %>%
  ggplot(aes(x = island, colour = species, fill = species)) +
  geom_bar() +
  theme(axis.title = element_text(size = 16), # make text size 16
        axis.text = element_text(size = 16))

# Correlations

penguins %>%
  ggplot(aes(x = bill_length_mm, y = bill_depth_mm)) + # plots length vs depth
  geom_point() + # plots the points
  theme(axis.title = element_text(size = 16), # make text size 16
        axis.text = element_text(size = 16))
Warning: Removed 2 rows containing missing values (`geom_point()`).

penguins %>%
  ggplot(aes(x = bill_length_mm, y = bill_depth_mm, colour = species, fill = species)) + 
  geom_point() +
  geom_smooth(method = "lm") + # adds trend line with confidence interval
  theme(axis.title = element_text(size = 16), # make text size 16
        axis.text = element_text(size = 16))
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 2 rows containing missing values (`geom_point()`).

# Body mass per sex

# This one makes it easiest to compare male and female body mass across the species

penguins %>%
  na.omit() %>%
  ggplot(aes(x = sex, y = body_mass_g, colour = species, fill = species)) +
  geom_boxplot(alpha = 0.7) +
  theme(axis.title = element_text(size = 16), # make text size 16
        axis.text = element_text(size = 16))

# Inverted

# This one makes it easiest to compare body mass between the sexes within each species

penguins %>%
  na.omit() %>%
  ggplot(aes(x = species, y = body_mass_g, colour = sex, fill = sex)) +
  geom_boxplot(alpha = 0.7) +
  theme(axis.title = element_text(size = 16), # make text size 16
        axis.text = element_text(size = 16))

# Can body mass predict bill length?

penguins %>%
  ggplot(aes(x= body_mass_g, y = bill_length_mm, colour = species, fill = species)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme(axis.title = element_text(size = 16), # make text size 16
        axis.text = element_text(size = 16))
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
Removed 2 rows containing missing values (`geom_point()`).

# Does sex explain flipper length?

penguins %>%
  na.omit() %>%
  ggplot(aes(x = species, y = flipper_length_mm, colour = sex, fill = sex)) +
  geom_boxplot(alpha = 0.5) +
  theme(axis.title = element_text(size = 16), # make text size 16
        axis.text = element_text(size = 16))

# Checking Distribution of Data

penguins1 <- penguins %>%
  pivot_longer(bill_length_mm:body_mass_g, names_to = "trait") # this is just to show what it is doing to the data

penguins %>% 
  na.omit() %>% # this removes any rows containing NA
  pivot_longer(bill_length_mm:body_mass_g, names_to = "trait") %>% 
  ggplot(aes(x=value,
             group=species,
             fill=species,
             color=species))+
  geom_density(alpha=0.7)+
  facet_grid(~trait, scales = "free_x")+ # this splits the plot into separate panels based on the column trait and allows each one its own x scale
  theme_minimal()+ # apply the complete theme first so it doesn't override the text sizes below
  theme(axis.text=element_text(size=16),
        axis.title=element_text(size=16))

4 - Choosing Analyses

Tutorial 4

X axis of data = Predictor Variable (Independent Variable)

Y axis of data = Outcome Variable (Dependent Variable)

Check your variables: are they numerical or categorical?

penguins %>%
  glimpse()
Rows: 344
Columns: 8
$ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
$ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
$ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
$ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
$ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
$ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
$ sex               <fct> male, female, female, NA, female, male, female, male…
$ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…

Choosing the Right Test

Category x Category = Frequency tests (Chi-square)

Category x Number = Mean tests (T-tests, ANOVAs, Non-parametric equivalents)

Number x Number = Correlation

5 - Frequency Tests

Frequency Test Types:

  • Chi-square test

  • Z-test

  • G test

  • Binomial tests

  • Loglinear models
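
The simplest of these is the binomial test, which compares an observed count of "successes" against an expected proportion; a minimal sketch with made-up counts:

binom.test(38, 100, p = 0.5) # are 38 successes in 100 trials consistent with a 50% rate?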

Contingency Tables

Frequency Table - lists a set of values and how often each one appears - helpful to understand which data values are common or rare

Contingency Table - two-way or two-variable frequency table

head(Arthritis) # this displays the first few rows of data of 'arthritis' from vcd package
  ID Treatment  Sex Age Improved
1 57   Treated Male  27     Some
2 46   Treated Male  29     None
3 77   Treated Male  30     None
4 17   Treated Male  32   Marked
5 36   Treated Male  46   Marked
6 23   Treated Male  58   Marked
# one way table
mytable <- table(Arthritis$Improved)
mytable # counts of each improved variable

  None   Some Marked 
    42     14     28 
prop.table(mytable) # proportion of each variable

     None      Some    Marked 
0.5000000 0.1666667 0.3333333 
prop.table(mytable)*100 # percentage of each variable

    None     Some   Marked 
50.00000 16.66667 33.33333 
# two way table 
mytable1 <- xtabs(~ Treatment+Improved, data=Arthritis)
mytable1 # counts improved per treatment
         Improved
Treatment None Some Marked
  Placebo   29    7      7
  Treated   13    7     21
margin.table(mytable1, 1) # Total counts for treatment
Treatment
Placebo Treated 
     43      41 
prop.table(mytable1, 1) # row proportions (rows equal 1)
         Improved
Treatment      None      Some    Marked
  Placebo 0.6744186 0.1627907 0.1627907
  Treated 0.3170732 0.1707317 0.5121951
margin.table(mytable1, 2) # total counts for improved
Improved
  None   Some Marked 
    42     14     28 
prop.table(mytable1,2) # column proportions (columns equal 1)
         Improved
Treatment      None      Some    Marked
  Placebo 0.6904762 0.5000000 0.2500000
  Treated 0.3095238 0.5000000 0.7500000
prop.table(mytable1) # proportions of all (all equal 1)
         Improved
Treatment       None       Some     Marked
  Placebo 0.34523810 0.08333333 0.08333333
  Treated 0.15476190 0.08333333 0.25000000
addmargins(mytable1) # adds sums to columns and rows
         Improved
Treatment None Some Marked Sum
  Placebo   29    7      7  43
  Treated   13    7     21  41
  Sum       42   14     28  84
addmargins(prop.table(mytable1)) # cell proportions with proportion sums
         Improved
Treatment       None       Some     Marked        Sum
  Placebo 0.34523810 0.08333333 0.08333333 0.51190476
  Treated 0.15476190 0.08333333 0.25000000 0.48809524
  Sum     0.50000000 0.16666667 0.33333333 1.00000000
addmargins(prop.table(mytable1, 1), 2) # row proportions with row sums
         Improved
Treatment      None      Some    Marked       Sum
  Placebo 0.6744186 0.1627907 0.1627907 1.0000000
  Treated 0.3170732 0.1707317 0.5121951 1.0000000
addmargins(prop.table(mytable1, 2), 1) # column proportions with column sums
         Improved
Treatment      None      Some    Marked
  Placebo 0.6904762 0.5000000 0.2500000
  Treated 0.3095238 0.5000000 0.7500000
  Sum     1.0000000 1.0000000 1.0000000
CrossTable(Arthritis$Treatment, Arthritis$Improved)

 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  84 

 
                    | Arthritis$Improved 
Arthritis$Treatment |      None |      Some |    Marked | Row Total | 
--------------------|-----------|-----------|-----------|-----------|
            Placebo |        29 |         7 |         7 |        43 | 
                    |     2.616 |     0.004 |     3.752 |           | 
                    |     0.674 |     0.163 |     0.163 |     0.512 | 
                    |     0.690 |     0.500 |     0.250 |           | 
                    |     0.345 |     0.083 |     0.083 |           | 
--------------------|-----------|-----------|-----------|-----------|
            Treated |        13 |         7 |        21 |        41 | 
                    |     2.744 |     0.004 |     3.935 |           | 
                    |     0.317 |     0.171 |     0.512 |     0.488 | 
                    |     0.310 |     0.500 |     0.750 |           | 
                    |     0.155 |     0.083 |     0.250 |           | 
--------------------|-----------|-----------|-----------|-----------|
       Column Total |        42 |        14 |        28 |        84 | 
                    |     0.500 |     0.167 |     0.333 |           | 
--------------------|-----------|-----------|-----------|-----------|

 
# Three-way Contingency table
newtable <- xtabs(~ Treatment+Sex+Improved, data=Arthritis) # this creates a three-way table, shown as a two-way table for each level of Improved
newtable
, , Improved = None

         Sex
Treatment Female Male
  Placebo     19   10
  Treated      6    7

, , Improved = Some

         Sex
Treatment Female Male
  Placebo      7    0
  Treated      5    2

, , Improved = Marked

         Sex
Treatment Female Male
  Placebo      6    1
  Treated     16    5
margin.table(newtable, 1) # totals for treatment
Treatment
Placebo Treated 
     43      41 
margin.table(newtable, 2) # totals for sex
Sex
Female   Male 
    59     25 
margin.table(newtable, 3) # totals for improved
Improved
  None   Some Marked 
    42     14     28 
margin.table(newtable, c(1,2)) # totals for treatment by sex
         Sex
Treatment Female Male
  Placebo     32   11
  Treated     27   14
ftable(newtable)
                 Improved None Some Marked
Treatment Sex                             
Placebo   Female            19    7      6
          Male              10    0      1
Treated   Female             6    5     16
          Male               7    2      5
ftable(prop.table(newtable, c(1,2)))
                 Improved       None       Some     Marked
Treatment Sex                                             
Placebo   Female          0.59375000 0.21875000 0.18750000
          Male            0.90909091 0.00000000 0.09090909
Treated   Female          0.22222222 0.18518519 0.59259259
          Male            0.50000000 0.14285714 0.35714286
ftable(addmargins(prop.table(newtable, c(1,2)), 3))
                 Improved       None       Some     Marked        Sum
Treatment Sex                                                        
Placebo   Female          0.59375000 0.21875000 0.18750000 1.00000000
          Male            0.90909091 0.00000000 0.09090909 1.00000000
Treated   Female          0.22222222 0.18518519 0.59259259 1.00000000
          Male            0.50000000 0.14285714 0.35714286 1.00000000
ftable(addmargins(prop.table(newtable, c(1,2)), 3))*100
                 Improved       None       Some     Marked        Sum
Treatment Sex                                                        
Placebo   Female           59.375000  21.875000  18.750000 100.000000
          Male             90.909091   0.000000   9.090909 100.000000
Treated   Female           22.222222  18.518519  59.259259 100.000000
          Male             50.000000  14.285714  35.714286 100.000000
Chi Square Test

volunteers <- matrix(c(111, 96, 48, 96 , 133, 61, 91, 150, 53), # creating a 'matrix' of numbers
                     byrow = T, nrow = 3)
volunteers
     [,1] [,2] [,3]
[1,]  111   96   48
[2,]   96  133   61
[3,]   91  150   53
rownames(volunteers) <- c("Community College", "Four Year", "Non Students") # assigning row names
colnames(volunteers) <- c("1-3 hours", "4-6 hours", "7-9 hours") # assigning column names
volunteers
                  1-3 hours 4-6 hours 7-9 hours
Community College       111        96        48
Four Year                96       133        61
Non Students             91       150        53
model <- chisq.test(volunteers) # creates a model

model # prints the test result

    Pearson's Chi-squared test

data:  volunteers
X-squared = 12.991, df = 4, p-value = 0.01132
model$expected # the expected cell counts
                  1-3 hours 4-6 hours 7-9 hours
Community College  90.57211  115.1907  49.23719
Four Year         103.00358  131.0012  55.99523
Non Students      104.42431  132.8081  56.76758
model$observed # the observed counts
                  1-3 hours 4-6 hours 7-9 hours
Community College       111        96        48
Four Year                96       133        61
Non Students             91       150        53
model$residuals # the difference between observed and expected
                   1-3 hours  4-6 hours  7-9 hours
Community College  2.1464772 -1.7880604 -0.1763148
Four Year         -0.6900708  0.1746359  0.6688187
Non Students      -1.3136852  1.4918030 -0.5000487
# Alternate way of ChiSquare test

vol_tab <- as.table(volunteers) # creates new table (vol_tab) from matrix
summary(vol_tab) # provides summary including chi square
Number of cases in table: 839 
Number of factors: 2 
Test for independence of all factors:
    Chisq = 12.991, df = 4, p-value = 0.01132
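
To quantify the strength of the association rather than just its significance, the vcd package loaded at the top can report effect sizes such as Cramér's V; a quick sketch:

assocstats(vol_tab) # chi-square tests plus Phi, the contingency coefficient and Cramér's V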
Test for Independence

Example 1

naruto<-matrix(c(35,205,8,48), nrow=2, byrow=TRUE) # example table of naruto and ghibli fans
rownames(naruto) <- c("Naruto Fan NO", "Naruto fan YES")
colnames(naruto) <- c("Watched Ghibli", "Never watched Ghibli")
naruto
               Watched Ghibli Never watched Ghibli
Naruto Fan NO              35                  205
Naruto fan YES              8                   48
chisq.test(naruto)$expected # gives expected values
               Watched Ghibli Never watched Ghibli
Naruto Fan NO       34.864865            205.13514
Naruto fan YES       8.135135             47.86486
proportions(naruto[1,]) # gives proportions of row 1
      Watched Ghibli Never watched Ghibli 
           0.1458333            0.8541667 
proportions(naruto[2,]) # gives proportions of row 2
      Watched Ghibli Never watched Ghibli 
           0.1428571            0.8571429 
chisq.test(naruto) # chi square test

    Pearson's Chi-squared test with Yates' continuity correction

data:  naruto
X-squared = 5.9938e-30, df = 1, p-value = 1

X-squared is very low (basically 0) and the p-value is 1, so we FAIL to reject the null hypothesis - H0 = “the variables are independent”. There is no evidence of an association: the values are not related.

Example 2

placebo<-matrix(c(43,31,27,12,24,28), nrow=2, byrow=TRUE)
rownames(placebo)<-c("cure-yes", "cure-no")
colnames(placebo)<-c("treatment","placebo","no treatment")
exp.placebo<-chisq.test(placebo)$expected # shows expected values if no effect from treatments
exp.placebo
         treatment  placebo no treatment
cure-yes  33.66667 33.66667     33.66667
cure-no   21.33333 21.33333     21.33333
chisq.test(placebo)

    Pearson's Chi-squared test

data:  placebo
X-squared = 10.619, df = 2, p-value = 0.004945

The p-value is less than 0.05, so there IS an association between being cured and which treatment was received

Importance of Proportions

Example of Importance of Proportions

# creating dataset for ladybirds

ladybirds <- tribble(
  ~Habitat, ~Site, ~Colour, ~Number,
  "Rural", "R1", "Black", 10,
  "Rural", "R2", "Black", 3,
  "Rural", "R3", "Black", 4,
  "Rural", "R4", "Black", 7,
  "Rural", "R5", "Black", 6,
  "Rural", "R1", "Red", 15,
  "Rural", "R2", "Red", 18,
  "Rural", "R3", "Red", 9,
  "Rural", "R4", "Red", 12,
  "Rural", "R5", "Red", 16,
  "Industrial", "U1", "Black", 32,
  "Industrial", "U2", "Black", 25,
  "Industrial", "U3", "Black", 25,
  "Industrial", "U4", "Black", 17,
  "Industrial", "U5", "Black", 16,
  "Industrial", "U1", "Red", 17,
  "Industrial", "U2", "Red", 23,
  "Industrial", "U3", "Red", 21,
  "Industrial", "U4", "Red", 9,
  "Industrial", "U5",  
 "Red", 15
)

Is there an association between habitat type and morphotype?

ladybirds_wide <- ladybirds %>% # changes colour column into two columns 'Black' and 'Red'
  pivot_wider(names_from = Colour, values_from = Number)
# Show totals

ladybirds %>% # creates wide table 
  group_by(Habitat, Colour) %>% 
  summarize(count = sum(Number)) %>% # creates count column with sum of number column
  spread(Colour, count, fill = 0) |> # spreads the data with colour and count combined
  kable()
Habitat      Black  Red
Industrial     115   85
Rural           30   70
# Show proportions

ladybirds |> 
  group_by(Habitat, Colour) %>% 
  summarize(count = sum(Number)) %>% 
  spread(Colour, count, fill = 0) |> 
  column_to_rownames("Habitat") |> 
  proportions() |> 
  kable()
                Black        Red
Industrial  0.3833333  0.2833333
Rural       0.1000000  0.2333333
# Show proportions by ROW

ladybirds |> 
  group_by(Habitat, Colour) %>% 
  summarise(count = sum(Number)) %>% 
  spread(Colour, count, fill = 0) |> 
  column_to_rownames("Habitat") |> 
  as.matrix()->t
  proportions(t,1) |> 
    kable()
             Black    Red
Industrial   0.575  0.425
Rural        0.300  0.700
# Show proportions by COLUMN
  
ladybirds |> 
  group_by(Habitat, Colour) %>% 
  summarise(count = sum(Number)) %>% 
  spread(Colour, count, fill = 0) |> 
  column_to_rownames("Habitat") |> 
  as.matrix()->t
  proportions(t,2) |> 
    kable()
                Black        Red
Industrial  0.7931034  0.5483871
Rural       0.2068966  0.4516129
# Instead of doing it all separately, you can use the following code to display row and column proportions
  
CrossTable(t)  

 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  300 

 
             |  
             |     Black |       Red | Row Total | 
-------------|-----------|-----------|-----------|
  Industrial |       115 |        85 |       200 | 
             |     3.477 |     3.253 |           | 
             |     0.575 |     0.425 |     0.667 | 
             |     0.793 |     0.548 |           | 
             |     0.383 |     0.283 |           | 
-------------|-----------|-----------|-----------|
       Rural |        30 |        70 |       100 | 
             |     6.954 |     6.505 |           | 
             |     0.300 |     0.700 |     0.333 | 
             |     0.207 |     0.452 |           | 
             |     0.100 |     0.233 |           | 
-------------|-----------|-----------|-----------|
Column Total |       145 |       155 |       300 | 
             |     0.483 |     0.517 |           | 
-------------|-----------|-----------|-----------|

 
# Now test Chi Sq by colour (the columns of the matrix)

t[,1] # first column, Blacks
Industrial      Rural 
       115         30 
chisq.test(t[,1])$expected # shows expected values - it is expected to have equal contribution
Industrial      Rural 
      72.5       72.5 
chisq.test(t[,1]) # chi square of the Black column

    Chi-squared test for given probabilities

data:  t[, 1]
X-squared = 49.828, df = 1, p-value = 1.679e-12
# there is a very small p value so proportions differ a lot - black lbs may be affected?



t[,2] # second column, Reds
Industrial      Rural 
        85         70 
chisq.test(t[,2])$expected
Industrial      Rural 
      77.5       77.5 
chisq.test(t[,2])

    Chi-squared test for given probabilities

data:  t[, 2]
X-squared = 1.4516, df = 1, p-value = 0.2283
# large p - red lbs don't respond to habitat

# so it can be concluded that habitat doesn't necessarily affect morphotype, since black lbs are affected but red lbs are not


# Test Chi Sq by habitat (the rows of the matrix)

t[1,] # first row, Industrial
Black   Red 
  115    85 
chisq.test(t[1,])$expected # expected values equal
Black   Red 
  100   100 
chisq.test(t[1,]) 

    Chi-squared test for given probabilities

data:  t[1, ]
X-squared = 4.5, df = 1, p-value = 0.03389
# small p value, may be some difference


t[2,] # second row, Rural
Black   Red 
   30    70 
chisq.test(t[2,])$expected # expected values equal
Black   Red 
   50    50 
chisq.test(t[2,]) 

    Chi-squared test for given probabilities

data:  t[2, ]
X-squared = 16, df = 1, p-value = 6.334e-05
# smaller p value - stronger difference

So what does this mean??

Need to ask this… Which is correct, should you test them all? What is the final answer?

Test for Homogeneity

Homogeneity in statistics refers to the idea that different samples or groups share the same distribution of a particular characteristic or variable.

# checking for homogeneity - do these samples have the same distribution?

pop<-tribble( # created a dataset of age distribution in Brazil and UK
  ~age, ~country, ~pop,
  "0-10","UK", 50,
  "0-10","BRA", 120,
  "11-20","UK", 70,
  "11-20","BRA", 140,
  "21-30","UK", 110,
  "21-30","BRA", 150,
  "31-40","UK", 150,
  "31-40","BRA", 180,
  "40-50","UK", 180,
  "40-50","BRA", 180,
  "50-60","UK", 150,
  "50-60","BRA", 90,
  "60-70","UK", 130,
  "60-70","BRA", 50)

pop |> # plotted a distribution graph
  ggplot(aes(x=age, y= pop, color=country, fill=country))+
  geom_col(position="dodge")+
  theme(axis.text=element_text(size = 18),
        axis.title = element_text(size=18))

pop |> 
  group_by(country, age) %>% 
  #summarise(count = sum(Number)) %>% 
  spread(country, pop, fill = 0) |> 
  column_to_rownames("age") |> 
  as.matrix() -> t2       # converted to matrix to perform chi square
chisq.test(t2)

    Pearson's Chi-squared test

data:  t2
X-squared = 108.97, df = 6, p-value < 2.2e-16
chisq.test(t2)$observed-chisq.test(t2)$expected # observed minus expected
        BRA    UK
0-10   31.6 -31.6
11-20  30.8 -30.8
21-30  14.8 -14.8
31-40   8.4  -8.4
40-50  -7.2   7.2
50-60 -34.8  34.8
60-70 -43.6  43.6

Goodness of Fit

Goodness of fit refers to how well a statistical model fits a set of observations, measured by the discrepancy between the observed data and the model's predicted values (e.g. the chi-square goodness-of-fit test). The test produces 'residuals'; a smaller residual means a better fit.

pop %>% 
  spread(country, pop, fill=0) %>% 
  mutate(world=(BRA+UK)/2) %>% # comparing against world distribution
  pivot_longer(cols = BRA:world, names_to = "place")->pop2
pop2  
# A tibble: 21 × 3
   age   place value
   <chr> <chr> <dbl>
 1 0-10  BRA     120
 2 0-10  UK       50
 3 0-10  world    85
 4 11-20 BRA     140
 5 11-20 UK       70
 6 11-20 world   105
 7 21-30 BRA     150
 8 21-30 UK      110
 9 21-30 world   130
10 31-40 BRA     180
# ℹ 11 more rows
pop2 %>% # plotting distribution
  ggplot(aes(x=age,y=value, colour=place, fill=place))+
  geom_col(position = "dodge")

pop2 %>% # preparing data
  spread(place, value, fill = 0) %>% 
  column_to_rownames("age")-> pop3
chisq.test(pop3$BRA, pop3$world) # chi square of brazil vs world
Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
incorrect

    Pearson's Chi-squared test

data:  pop3$BRA and pop3$world
X-squared = 35, df = 30, p-value = 0.2426
chisq.test(pop3$UK, pop3$world) # chi square of uk vs world
Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
incorrect

    Pearson's Chi-squared test

data:  pop3$UK and pop3$world
X-squared = 35, df = 30, p-value = 0.2426
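
The warning appears because chisq.test(x, y) with two numeric vectors cross-tabulates their values instead of comparing distributions. A goodness-of-fit test of each country against the 'world' distribution can instead pass expected proportions via the p argument; a sketch:

chisq.test(pop3$BRA, p = pop3$world / sum(pop3$world)) # Brazil's age counts vs the world proportions
chisq.test(pop3$UK, p = pop3$world / sum(pop3$world)) # UK's age counts vs the world proportions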

Loglinear models

Loglinear models are a type of statistical model used to analyze the relationships between categorical variables.


If BOTH the predictor and response variables are CATEGORICAL, then you need a frequency test.
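
As a minimal sketch of fitting one, MASS::loglm() (MASS ships with R) can test independence on the Arthritis table built earlier; the likelihood-ratio statistic it reports is the G test:

MASS::loglm(~ Treatment + Improved, data = mytable1) # independence model; compare with the saturated model ~ Treatment * Improved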

6 - Mean Tests

A t-test is a statistical analysis that simply tells us whether the difference between two means is significant or not.


Mean Tests

This is the only family of tests that has both parametric (P) and non-parametric (NP) versions.

If we have a numerical Y (outcome) and a categorical X (predictor), we use mean tests.

Comparing means:

  • One sample: P = one-sample t-test; NP = one-sample Wilcoxon

  • Two samples: P = unpaired t-test; NP = unpaired Wilcoxon

  • More than two samples: P = one/two-way ANOVA or MANOVA; NP = Kruskal-Wallis

  • Paired samples: P = paired t-test; NP = paired Wilcoxon

Mean test assumptions: continuous data, normally distributed, random sample, enough data.

Mu (μ) = the Greek letter representing the mean
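
The tutorial code below covers the two-sample and paired cases; a minimal sketch of the one-sample versions, on simulated data, testing against mu = 60:

set.seed(123456)
mass <- rnorm(30, 62, 12) # simulated sample
t.test(mass, mu = 60) # one-sample t-test (parametric)
wilcox.test(mass, mu = 60) # one-sample Wilcoxon (non-parametric)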

# Unpaired T-Test

set.seed(123456)

body_mass_g1 = rnorm(100,60,12) # 100 values with mean of 60 and SD of 12
body_mass_g2 = rnorm(100,45,12)

t.test(body_mass_g1, body_mass_g2, paired = FALSE) # this is testing the means of each group against one another

    Welch Two Sample t-test

data:  body_mass_g1 and body_mass_g2
t = 9.3705, df = 197.98, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 12.41382 19.03149
sample estimates:
mean of x mean of y 
 60.20184  44.47918 
# Null hypothesis is that the difference between the means is 0

# to go further you can measure the effect size

effsize::cohen.d(body_mass_g1, body_mass_g2)

Cohen's d

d estimate: 1.325185 (large)
95 percent confidence interval:
   lower    upper 
1.017207 1.633163 
# there is a large effect size indicating a substantial difference between the two groups
# Mean mass of 60g for G1 and mean mass of 44g for G2, a mean difference of 16g with a 95% confidence interval of 12-19g.
# Paired T-test

set.seed(123456) 

foraging_time_preburn = rnorm(100,10,2) 
foraging_time_postburn = rnorm(100,11,2) 

t.test(foraging_time_preburn, foraging_time_postburn, paired = TRUE)

    Paired t-test

data:  foraging_time_preburn and foraging_time_postburn
t = -2.8538, df = 99, p-value = 0.005261
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.4910978 -0.2680168
sample estimates:
mean difference 
     -0.8795573 
# measuring effect size

effsize::cohen.d(foraging_time_postburn, foraging_time_preburn)

Cohen's d

d estimate: 0.4448012 (small)
95 percent confidence interval:
    lower     upper 
0.1624883 0.7271141 
# small p value but small effect size
# there is roughly a 52 second or 9% difference in foraging time

# with a result like this, need to be careful and look into data. 
# significant difference but small effect - will this result make a massive difference to the organism?
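
For more than two samples, the same logic extends to ANOVA (parametric) and Kruskal-Wallis (non-parametric); a sketch using the penguins data:

summary(aov(body_mass_g ~ species, data = penguins)) # one-way ANOVA: does mean body mass differ between species?
kruskal.test(body_mass_g ~ species, data = penguins) # non-parametric equivalent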

7 - Correlations


Correlation measures the strength and direction of a linear relationship between two quantitative variables.

set.seed(123456) 

riverspeed <- rnorm(100,7,1) 
turtle_length <- riverspeed + rnorm(100,30,1) # creating dataset

cor.test(riverspeed,turtle_length) # correlation test

    Pearson's product-moment correlation

data:  riverspeed and turtle_length
t = 8.0591, df = 98, p-value = 1.897e-12
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.4964851 0.7364323
sample estimates:
      cor 
0.6313361 
# There is a correlation between the variables: cor = 0.63 and p < 0.05

riverspeeddf <- data.frame(riverspeed) # converting to dataframe
turtle_length_df <- data.frame(turtle_length)

binded <- cbind(riverspeeddf, turtle_length_df) # binding two dataframes together

ggplot(binded, aes(x = riverspeed, y = turtle_length))+ # plotting dataframe
  geom_point()+
  geom_smooth(method = "lm")+
  theme_minimal()+ # apply the complete theme first so it doesn't override the text sizes below
  theme(axis.text=element_text(size=16),
        axis.title=element_text(size=16))
`geom_smooth()` using formula = 'y ~ x'
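
cor.test() defaults to Pearson's correlation, which assumes normality; for non-normal data the non-parametric Spearman version is a one-argument change (a sketch):

cor.test(riverspeed, turtle_length, method = "spearman") # rank-based, no normality assumption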

Exercises

8 - Linear Models


Linear models are a statistical method used to approximate the relationship between variables. Under a regression model, we calculate an equation of a line that best describes the data. In fact, that’s what regression means! Regressing the points into a line.

Linear Models = Regression

Numerical vs Numerical = scatterplot

Why use linear models?

  • When you do expect effects

  • When you are able to explain effects

  • You want predictive values

Understanding Linear Models

  • Slope is a measure of strength

  • R2 is a measure of how well the model fits

lm(formula = qsec ~ hp, data = mtcars) # lm(y~x)

Call:
lm(formula = qsec ~ hp, data = mtcars)

Coefficients:
(Intercept)           hp  
   20.55635     -0.01846  
anova(lm(formula = qsec ~ hp, data = mtcars)) # Shows ANOVA table
Analysis of Variance Table

Response: qsec
          Df Sum Sq Mean Sq F value    Pr(>F)    
hp         1 49.651  49.651   30.19 5.766e-06 ***
Residuals 30 49.338   1.645                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(formula = qsec ~ hp, data = mtcars)) # Shows summary

Call:
lm(formula = qsec ~ hp, data = mtcars)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.1766 -0.6975  0.0348  0.6520  4.0972 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 20.556354   0.542424  37.897  < 2e-16 ***
hp          -0.018458   0.003359  -5.495 5.77e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.282 on 30 degrees of freedom
Multiple R-squared:  0.5016,    Adjusted R-squared:  0.485 
F-statistic: 30.19 on 1 and 30 DF,  p-value: 5.766e-06
# Sum sq is 50% hp and 50% residuals
# residuals are the 'errors' or distances from points to the correlation line

# ANOVA and summary of LM show different features - use both to investigate results and choose the better model

Part Two: Regression

Simple Linear Regression

mtcars

## Model Assumptions

shapiro.test(mtcars$qsec) #### Test normality

    Shapiro-Wilk normality test

data:  mtcars$qsec
W = 0.97325, p-value = 0.5935
#### it is normal

m1 <- lm(qsec~hp, data=mtcars) # assigns model 1

m1 # check coefficients

Call:
lm(formula = qsec ~ hp, data = mtcars)

Coefficients:
(Intercept)           hp  
   20.55635     -0.01846  
m1$residuals # residuals
          Mazda RX4       Mazda RX4 Wag          Datsun 710      Hornet 4 Drive 
        -2.06593942         -1.50593942         -0.22973076          0.91406058 
  Hornet Sportabout             Valiant          Duster 360           Merc 240D 
        -0.30614897          1.60176901         -0.19406695          0.58806148 
           Merc 230            Merc 280           Merc 280C          Merc 450SE 
         4.09718587          0.01401867          0.61401867          0.16614260 
         Merc 450SL         Merc 450SLC  Cadillac Fleetwood Lincoln Continental 
         0.36614260          0.76614260          1.20760047          1.23218361 
  Chrysler Imperial            Fiat 128         Honda Civic      Toyota Corolla 
         1.10905833          0.13189474         -1.07652166          0.54343643 
      Toyota Corona    Dodge Challenger         AMC Javelin          Camaro Z28 
         1.24410249         -0.91760683         -0.48760683         -0.62406695 
   Pontiac Firebird           Fiat X1-9       Porsche 914-2        Lotus Europa 
        -0.27614897         -0.43810526         -2.17664739         -1.57056447 
     Ford Pantera L        Ferrari Dino       Maserati Bora          Volvo 142E 
        -1.18335897         -1.82614897          0.22718136          0.05560227 
m1$fitted.values # values that make the line
          Mazda RX4       Mazda RX4 Wag          Datsun 710      Hornet 4 Drive 
           18.52594            18.52594            18.83973            18.52594 
  Hornet Sportabout             Valiant          Duster 360           Merc 240D 
           17.32615            18.61823            16.03407            19.41194 
           Merc 230            Merc 280           Merc 280C          Merc 450SE 
           18.80281            18.28598            18.28598            17.23386 
         Merc 450SL         Merc 450SLC  Cadillac Fleetwood Lincoln Continental 
           17.23386            17.23386            16.77240            16.58782 
  Chrysler Imperial            Fiat 128         Honda Civic      Toyota Corolla 
           16.31094            19.33811            19.59652            19.35656 
      Toyota Corona    Dodge Challenger         AMC Javelin          Camaro Z28 
           18.76590            17.78761            17.78761            16.03407 
   Pontiac Firebird           Fiat X1-9       Porsche 914-2        Lotus Europa 
           17.32615            19.33811            18.87665            18.47056 
     Ford Pantera L        Ferrari Dino       Maserati Bora          Volvo 142E 
           15.68336            17.32615            14.37282            18.54440 
ggplot(mtcars, aes(hp, qsec))+
  geom_point()

ggplot(mtcars, aes(hp, qsec))+
  geom_point()+
  geom_smooth(method = lm, se=FALSE) # plotting points with linear model fit
`geom_smooth()` using formula = 'y ~ x'

mtcars %>%
  mutate(fitted=m1$fitted.values) %>% # plotting fitted values
  ggplot(aes(hp, fitted))+
  geom_point()+
  geom_smooth(method = lm, se=FALSE)
`geom_smooth()` using formula = 'y ~ x'

Multiple Linear Models

penguins
# A tibble: 344 × 8
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           39.1          18.7               181        3750
 2 Adelie  Torgersen           39.5          17.4               186        3800
 3 Adelie  Torgersen           40.3          18                 195        3250
 4 Adelie  Torgersen           NA            NA                  NA          NA
 5 Adelie  Torgersen           36.7          19.3               193        3450
 6 Adelie  Torgersen           39.3          20.6               190        3650
 7 Adelie  Torgersen           38.9          17.8               181        3625
 8 Adelie  Torgersen           39.2          19.6               195        4675
 9 Adelie  Torgersen           34.1          18.1               193        3475
10 Adelie  Torgersen           42            20.2               190        4250
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>
m2 <- lm(flipper_length_mm~bill_length_mm, data=penguins) # simple model with one predictor
summary(m2)             

Call:
lm(formula = flipper_length_mm ~ bill_length_mm, data = penguins)

Residuals:
    Min      1Q  Median      3Q     Max 
-43.708  -7.896   0.664   8.650  21.179 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    126.6844     4.6651   27.16   <2e-16 ***
bill_length_mm   1.6901     0.1054   16.03   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.63 on 340 degrees of freedom
  (2 observations deleted due to missingness)
Multiple R-squared:  0.4306,    Adjusted R-squared:  0.4289 
F-statistic: 257.1 on 1 and 340 DF,  p-value: < 2.2e-16
m3 <- lm(flipper_length_mm~bill_length_mm+ 
           bill_depth_mm +
           body_mass_g, data=penguins) # adding multiple components
m3

Call:
lm(formula = flipper_length_mm ~ bill_length_mm + bill_depth_mm + 
    body_mass_g, data = penguins)

Coefficients:
   (Intercept)  bill_length_mm   bill_depth_mm     body_mass_g  
     157.78594         0.59227        -1.67868         0.01093  
shapiro.test(m3$residuals) # test normality of the residuals from the multiple model

    Shapiro-Wilk normality test

data:  m3$residuals
W = 0.99081, p-value = 0.03121
# residuals are not normal

plot(m3) # visually testing residuals

m3log <- lm(flipper_length_mm~log(bill_length_mm)+ 
           log(bill_depth_mm) +
           log(body_mass_g), data=penguins) # converting data to log versions
m3log

Call:
lm(formula = flipper_length_mm ~ log(bill_length_mm) + log(bill_depth_mm) + 
    log(body_mass_g), data = penguins)

Coefficients:
        (Intercept)  log(bill_length_mm)   log(bill_depth_mm)  
            -188.59                25.91               -30.10  
   log(body_mass_g)  
              45.29  
shapiro.test(m3log$residuals)

    Shapiro-Wilk normality test

data:  m3log$residuals
W = 0.99271, p-value = 0.09402
# now the residuals are normal

summary(m3log)

Call:
lm(formula = flipper_length_mm ~ log(bill_length_mm) + log(bill_depth_mm) + 
    log(body_mass_g), data = penguins)

Residuals:
    Min      1Q  Median      3Q     Max 
-21.046  -3.883  -0.125   4.108  14.740 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)         -188.589     21.645  -8.713  < 2e-16 ***
log(bill_length_mm)   25.914      3.209   8.075  1.2e-14 ***
log(bill_depth_mm)   -30.098      3.098  -9.717  < 2e-16 ***
log(body_mass_g)      45.285      2.343  19.330  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.934 on 338 degrees of freedom
  (2 observations deleted due to missingness)
Multiple R-squared:  0.8235,    Adjusted R-squared:  0.8219 
F-statistic: 525.7 on 3 and 338 DF,  p-value: < 2.2e-16
# Collinearity
## when two or more predictor variables in a regression model are correlated

cor(mtcars)
            mpg        cyl       disp         hp        drat         wt
mpg   1.0000000 -0.8521620 -0.8475514 -0.7761684  0.68117191 -0.8676594
cyl  -0.8521620  1.0000000  0.9020329  0.8324475 -0.69993811  0.7824958
disp -0.8475514  0.9020329  1.0000000  0.7909486 -0.71021393  0.8879799
hp   -0.7761684  0.8324475  0.7909486  1.0000000 -0.44875912  0.6587479
drat  0.6811719 -0.6999381 -0.7102139 -0.4487591  1.00000000 -0.7124406
wt   -0.8676594  0.7824958  0.8879799  0.6587479 -0.71244065  1.0000000
qsec  0.4186840 -0.5912421 -0.4336979 -0.7082234  0.09120476 -0.1747159
vs    0.6640389 -0.8108118 -0.7104159 -0.7230967  0.44027846 -0.5549157
am    0.5998324 -0.5226070 -0.5912270 -0.2432043  0.71271113 -0.6924953
gear  0.4802848 -0.4926866 -0.5555692 -0.1257043  0.69961013 -0.5832870
carb -0.5509251  0.5269883  0.3949769  0.7498125 -0.09078980  0.4276059
            qsec         vs          am       gear        carb
mpg   0.41868403  0.6640389  0.59983243  0.4802848 -0.55092507
cyl  -0.59124207 -0.8108118 -0.52260705 -0.4926866  0.52698829
disp -0.43369788 -0.7104159 -0.59122704 -0.5555692  0.39497686
hp   -0.70822339 -0.7230967 -0.24320426 -0.1257043  0.74981247
drat  0.09120476  0.4402785  0.71271113  0.6996101 -0.09078980
wt   -0.17471588 -0.5549157 -0.69249526 -0.5832870  0.42760594
qsec  1.00000000  0.7445354 -0.22986086 -0.2126822 -0.65624923
vs    0.74453544  1.0000000  0.16834512  0.2060233 -0.56960714
am   -0.22986086  0.1683451  1.00000000  0.7940588  0.05753435
gear -0.21268223  0.2060233  0.79405876  1.0000000  0.27407284
carb -0.65624923 -0.5696071  0.05753435  0.2740728  1.00000000
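
Beyond eyeballing the correlation matrix, collinearity can be quantified with variance inflation factors; a sketch, assuming the car package is installed (VIFs above roughly 5-10 are usually taken as problematic):

library(car)
vif(lm(mpg ~ cyl + disp + hp, data = mtcars)) # one VIF per predictor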
lm1 <- lm(mpg~cyl, data = mtcars)
lm2 <- lm(mpg~cyl+disp, data = mtcars)
lm3 <- lm(mpg~cyl+disp+hp, data = mtcars)

summary(lm1)

Call:
lm(formula = mpg ~ cyl, data = mtcars)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.9814 -2.1185  0.2217  1.0717  7.5186 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  37.8846     2.0738   18.27  < 2e-16 ***
cyl          -2.8758     0.3224   -8.92 6.11e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.206 on 30 degrees of freedom
Multiple R-squared:  0.7262,    Adjusted R-squared:  0.7171 
F-statistic: 79.56 on 1 and 30 DF,  p-value: 6.113e-10
anova(lm1,lm2,lm3) # comparing models with anova
Analysis of Variance Table

Model 1: mpg ~ cyl
Model 2: mpg ~ cyl + disp
Model 3: mpg ~ cyl + disp + hp
  Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
1     30 308.33                              
2     29 270.74  1    37.594 4.0274 0.05451 .
3     28 261.37  1     9.371 1.0039 0.32495  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# testing whether added complexity or different combinations improve the model
# if p is significant, that model is different from the others and a significant improvement
# the more parameters you add, the more degrees of freedom you use up and the greater the chance of error
# simplest model is often best
# even if you lose a little bit of explanation, the model may work better

9 - Logistic Models


Reminder: (x) (y)

  • Num x Num = Correlation

  • Num x Num = Linear Model

  • Num x Cat = Logistic Model

A logistic model is better than a linear model for predicting probability:

  • It detects at which point the probability shifts

  • A flat s-curve shows the variable is not a good predictor

  • A strong s-curve shows a good predictor
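
The s-curve can be visualised by plotting the binary outcome against the predictor with a binomial smoother; a sketch using penguins (is_male is a helper column made up for this plot):

penguins %>%
  na.omit() %>%
  mutate(is_male = as.numeric(sex == "male")) %>% # recode sex as 0/1
  ggplot(aes(x = body_mass_g, y = is_male)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "glm", method.args = list(family = "binomial")) # fitted logistic s-curve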

A logistic model gives you:

  • coefficients - show the effect of a one-unit change in the predictor variable

  • deviance - a measure of how well the model fits the data

inputting families (?family) for models:

binomial = If your outcome is binary (zeros and ones), proportions of “successes” and “failures” (values between 0 and 1), or their counts

gaussian = If your outcome is continuous and unbounded (aka normal)

Gamma = If you are dealing with a continuous non-negative outcome

poisson = If your outcome is discrete, or more precisely, you are dealing with counts

Other more complicated types:

quasi

quasibinomial

quasipoisson

inverse.gaussian
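
A minimal sketch of how a family is passed to glm(), on simulated data (all names here are illustrative):

set.seed(1)
x <- rnorm(50) # continuous predictor
counts <- rpois(50, lambda = 3) # simulated counts
glm(counts ~ x, family = "poisson") # count outcome
glm(x ~ counts, family = "gaussian") # continuous outcome (equivalent to lm)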

Example

m1<-glm(sex~body_mass_g,family="binomial", data=penguins)
m1

Call:  glm(formula = sex ~ body_mass_g, family = "binomial", data = penguins)

Coefficients:
(Intercept)  body_mass_g  
   -5.16254      0.00124  

Degrees of Freedom: 332 Total (i.e. Null);  331 Residual
  (11 observations deleted due to missingness)
Null Deviance:      461.6 
Residual Deviance: 396.6    AIC: 400.6
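
The coefficients are on the log-odds scale; exponentiating them gives odds ratios, which are easier to interpret (a sketch):

exp(coef(m1)) # multiplicative change in the odds of 'male' per extra gram of body mass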

Logistic models don’t produce R value You can produce a ‘pseudo R2’ which shows the effect size on the model

null<-m1$null.deviance/-2
model<-m1$deviance/-2
(null-model)/null
[1] 0.1407346

You can use logistic models to test one variable against all the others by using '.'

m2<-glm(formula = sex ~ ., family = "binomial", data = penguins)
summary(m2)

Call:
glm(formula = sex ~ ., family = "binomial", data = penguins)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)       310.553424 632.100857   0.491  0.62321    
speciesChinstrap   -7.624848   1.714564  -4.447 8.70e-06 ***
speciesGentoo      -8.766260   2.679151  -3.272  0.00107 ** 
islandDream         0.394305   0.817853   0.482  0.62972    
islandTorgersen    -0.513102   0.849231  -0.604  0.54571    
bill_length_mm      0.627014   0.134709   4.655 3.25e-06 ***
bill_depth_mm       1.613122   0.338498   4.766 1.88e-06 ***
flipper_length_mm   0.037682   0.051547   0.731  0.46476    
body_mass_g         0.005717   0.001090   5.244 1.58e-07 ***
year               -0.195493   0.316257  -0.618  0.53648    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 461.61  on 332  degrees of freedom
Residual deviance: 125.66  on 323  degrees of freedom
  (11 observations deleted due to missingness)
AIC: 145.66

Number of Fisher Scoring iterations: 7

Final Model

m3<-glm(sex~bill_length_mm+
          bill_depth_mm+
          body_mass_g, 
        family = "binomial", 
        data=penguins)
summary(m3)

Call:
glm(formula = sex ~ bill_length_mm + bill_depth_mm + body_mass_g, 
    family = "binomial", data = penguins)

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -6.056e+01  7.081e+00  -8.552  < 2e-16 ***
bill_length_mm  9.151e-02  4.416e-02   2.072   0.0382 *  
bill_depth_mm   2.063e+00  2.469e-01   8.356  < 2e-16 ***
body_mass_g     5.061e-03  6.348e-04   7.971 1.57e-15 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 461.61  on 332  degrees of freedom
Residual deviance: 159.89  on 329  degrees of freedom
  (11 observations deleted due to missingness)
AIC: 167.89

Number of Fisher Scoring iterations: 7

This produces an effect size of 65%
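
That figure comes from applying the same pseudo-R2 formula as above to m3:

(m3$null.deviance - m3$deviance) / m3$null.deviance # ~0.65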

Types of test based on x and y variables:

99% of all tests can be done with these…

Notes

Basic Functions

library(package) #loads particular package

data1 <- c(1,2,3,4) # assigns data to a name

ls() # lists objects in the environment

nameofdata # typing a dataset's name prints its contents

rm() #removes data items from environment

Basic Data Management

c() # combines values into a vector
   # e.g. data1 <- c(1,2,3)

mutate() # can be used to add columns to a dataset

summarise() # summarises the data

str() #shows the structure of the data

group_by() # groups data by specified value
ungroup() #always ungroup data afterwards

filter() # can be used to show specified values

select() # selects specific values, rows/columns
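
These verbs chain together with pipes; a quick sketch on the penguins data:

penguins %>%
  filter(species == "Adelie") %>% # keep only Adelie rows
  group_by(island) %>% # group by island
  summarise(mean_bill = mean(bill_length_mm, na.rm = TRUE)) %>% # mean bill length per island
  ungroup()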

Basic Plots

Scattergraph
data(mtcars)
data(pressure)
data(BOD)
data(ToothGrowth)

# Base R Scattergraph

plot(mtcars$wt, mtcars$mpg)

library(ggplot2)

# ggplot Scattergraph

ggplot(mtcars, aes(x = wt, y = mpg)) + #this tells it to create a plot object
  geom_point() # this just tells it to add a layer of points to the plot

Line Graph
# Line Graph

#Base R function
plot(pressure$temperature, pressure$pressure, type = "l", xlab = "Temperature", ylab = "pressure")

plot(pressure$temperature, pressure$pressure, type = "l")
points(pressure$temperature, pressure$pressure, col = "red")
lines(pressure$temperature, pressure$pressure/2, col = "red")

#ggplot Line Graph
ggplot(pressure, aes(x = temperature, y = pressure)) +
  geom_line(col = "blue")

Bar Graph
# Bar Graph

#Base R function
barplot(BOD$demand, names.arg = BOD$Time)

barplot(table(mtcars$cyl))

# ggplot Bar Graph
ggplot(BOD, aes(x = Time, y = demand)) +
  geom_col()

ggplot(BOD, aes(x = factor(Time), y = demand)) + # converts x variable to a factor so that it is discrete
  geom_col()

ggplot(mtcars, aes(x = cyl)) +
  geom_bar() 

ggplot(mtcars, aes(x = factor(cyl))) + # discrete not continuous
  geom_bar()

Histogram
# Histogram

hist(mtcars$mpg)

hist(mtcars$mpg, breaks = 10) # specifies no. breaks

# ggplot Histogram

ggplot(mtcars, aes(x = mpg)) +
  geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(mtcars, aes(x = mpg)) +
  geom_histogram(binwidth = 4)

Box Plot
# Box Plot

plot(ToothGrowth$supp, ToothGrowth$len) # when x is a factor (not numerical) it will automatically create a boxplot

boxplot(len ~ supp, data = ToothGrowth)

boxplot(len ~ supp + dose, data = ToothGrowth)

# ggplot Box Plot

ggplot(ToothGrowth, aes(x = supp, y = len)) +
  geom_boxplot()

ggplot(ToothGrowth, aes(x = interaction(supp, dose), y = len)) +
  geom_boxplot()

Function Curve
# Function Curve

curve(x^3 - 5*x, from = -4, to = 4)

myfun <- function(xvar) {
  1 / (1 + exp(-xvar + 10))
}
curve(myfun(x), from = 0, to = 20)
# Add a line:
curve(1 - myfun(x), add = TRUE, col = "red")

# ggplot Function Curve

ggplot(data.frame(x = c(0,20)), aes(x = x)) +
  stat_function(fun = myfun, geom = "line")