Coding and regression for Laura Richardson.

Laura has asked that I do some basic coding and logisitc regression for a chapter of hers.

It involves the dataset on pregnancies in Anglican churchs in Cape Town.

The instructions are included in a document called, “instructions.doc” in the data folder of this project.

library(pacman)
pacman::p_load(tidyverse, lubridate, glue, stringr, broom)

library(readxl)
lr <- read_excel("data/LR.xlsx")

## New names:
## * `` -> ...8

We begin by creating the date variables

# creating date of marriage
lr <- lr %>% 
  mutate(date_married = glue("{Day_married}-{Month_married}-{Year_married}"),
         date_married = lubridate::dmy(date_married))
# check for failures
lr %>% 
  filter(is.na(date_married))

## # A tibble: 0 x 35
## # ... with 35 variables: `Parish name` <chr>, Year_baptized <chr>,
## #   Month_baptized_written <chr>, Month_baptized <chr>, Day_baptized <chr>,
## #   Childs_name <chr>, Year_of_birth <chr>, ...8 <chr>, Month_of_birth <chr>,
## #   Day_of_birth <chr>, Male_name_parent <chr>, Female_name_parent <chr>,
## #   Surname_atbap <chr>, Year_married <dbl>, Month_married_written <chr>,
## #   Month_married <dbl>, Day_married <chr>, Male_name <chr>,
## #   Male_surname <chr>, Male_age <chr>, Male_race <chr>, Male_condition <chr>,
## #   Male_Profession <chr>, Prof_code <chr>, Female_name <chr>,
## #   Female_surname <chr>, Female_age <chr>, Female_race <chr>,
## #   Female_profession <chr>, Female_prof_rank <chr>, Male_signed <chr>,
## #   Female_signed <chr>, Race <chr>, Breadwinner_class <chr>,
## #   date_married <date>

# all good, no missing

# creating date of birth
lr <- lr %>% 
  mutate(date_birth = glue("{Day_of_birth}-{Month_of_birth}-{Year_of_birth}"),
         date_birth = lubridate::dmy(date_birth))

## Warning: 6 failed to parse.

# check for failures
lr %>% 
  filter(is.na(date_birth))

## # A tibble: 6 x 36
##   `Parish name` Year_baptized Month_baptized_~ Month_baptized Day_baptized
##   <chr>         <chr>         <chr>            <chr>          <chr>       
## 1 Cape Town, M~ 1907          December         12             17          
## 2 Cape Town, A~ 1952          Mar              3              14          
## 3 Cape Town, L~ 1943          October          10             20          
## 4 Cape Town,Be~ 1955          Sep              9              4           
## 5 Cape Town, A~ 1940          Jan              1              16          
## 6 Cape Town, M~ 1919          October          10             17          
## # ... with 31 more variables: Childs_name <chr>, Year_of_birth <chr>,
## #   ...8 <chr>, Month_of_birth <chr>, Day_of_birth <chr>,
## #   Male_name_parent <chr>, Female_name_parent <chr>, Surname_atbap <chr>,
## #   Year_married <dbl>, Month_married_written <chr>, Month_married <dbl>,
## #   Day_married <chr>, Male_name <chr>, Male_surname <chr>, Male_age <chr>,
## #   Male_race <chr>, Male_condition <chr>, Male_Profession <chr>,
## #   Prof_code <chr>, Female_name <chr>, Female_surname <chr>, Female_age <chr>,
## #   Female_race <chr>, Female_profession <chr>, Female_prof_rank <chr>,
## #   Male_signed <chr>, Female_signed <chr>, Race <chr>,
## #   Breadwinner_class <chr>, date_married <date>, date_birth <date>

# 6 failures. 3 missing days. 3 days which are incorrect, e.g. day 232 and day 31


# creating date of baptism
lr <- lr %>% 
  mutate(date_baptism = glue("{Day_baptized}-{Month_baptized}-{Year_baptized}"),
         date_baptism = lubridate::dmy(date_baptism))

## Warning: 11 failed to parse.

# 11 failed to parse. why?
lr %>% 
  filter(is.na(date_baptism))

## # A tibble: 11 x 37
##    `Parish name` Year_baptized Month_baptized_~ Month_baptized Day_baptized
##    <chr>         <chr>         <chr>            <chr>          <chr>       
##  1 Cape Town, A~ 1937          Jun              6              .           
##  2 Cape Town, A~ 1937          Sep              9              .           
##  3 Cape Town, M~ <NA>          September        9              30          
##  4 Cape Town, M~ 1920          October          <NA>           8           
##  5 Cape Town, M~ 1926          June             <NA>           6           
##  6 Cape Town, M~ 1925          December         <NA>           9           
##  7 Cape Town, M~ 1923          March            <NA>           4           
##  8 Cape Town, M~ 1924          October          <NA>           5           
##  9 Cape Town, M~ 1925          November         <NA>           1           
## 10 Cape Town, M~ 1921          October          <NA>           7           
## 11 Cape Town, M~ 1921          August           <NA>           7           
## # ... with 32 more variables: Childs_name <chr>, Year_of_birth <chr>,
## #   ...8 <chr>, Month_of_birth <chr>, Day_of_birth <chr>,
## #   Male_name_parent <chr>, Female_name_parent <chr>, Surname_atbap <chr>,
## #   Year_married <dbl>, Month_married_written <chr>, Month_married <dbl>,
## #   Day_married <chr>, Male_name <chr>, Male_surname <chr>, Male_age <chr>,
## #   Male_race <chr>, Male_condition <chr>, Male_Profession <chr>,
## #   Prof_code <chr>, Female_name <chr>, Female_surname <chr>, Female_age <chr>,
## #   Female_race <chr>, Female_profession <chr>, Female_prof_rank <chr>,
## #   Male_signed <chr>, Female_signed <chr>, Race <chr>,
## #   Breadwinner_class <chr>, date_married <date>, date_birth <date>,
## #   date_baptism <date>

# two have missing day, one has missing year, 8 have missing month.

# Month_baptized
lr <- lr %>% 
  mutate(Month_baptized = as.numeric(Month_baptized),
         Month_baptized= month(Month_baptized, label = T))
lr %>% 
  count(Month_baptized, sort = T)

## Warning: Factor `Month_baptized` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## # A tibble: 13 x 2
##    Month_baptized     n
##    <ord>          <int>
##  1 Nov              205
##  2 Feb              201
##  3 Dec              201
##  4 Mar              199
##  5 Sep              189
##  6 Aug              186
##  7 Oct              183
##  8 Jun              176
##  9 Apr              168
## 10 May              167
## 11 Jul              162
## 12 Jan              138
## 13 <NA>               8

# plot to check distribution of births and baptisims and marriages
lr %>% 
  ggplot(aes()) +
  geom_histogram(aes(date_birth), alpha = 0.3, fill = 'blue') +
  geom_histogram(aes(date_baptism), alpha = 0.3, fill = 'red') + 
   geom_histogram(aes(date_married), alpha = 0.3, fill = 'green')

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 6 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 11 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Next we create the variable for days between marriage and birth and visualise these

lr <- lr %>% 
  mutate(days_marriage_to_birth = as.numeric(as.duration(date_birth - date_married), "days"))
# NB there is no-one in the dataset with an illigitmate birth

# plots
# density plot shows proportions for each group
lr %>% 
  filter(days_marriage_to_birth > 0,
         days_marriage_to_birth < 700,
         Race != ".",
         Race != "NA") %>% 
  ggplot(aes(days_marriage_to_birth, fill = Female_race)) +
  geom_density(alpha = .3) +
  geom_vline(lty = 2, xintercept = 240)

# histogram shows distribtuion, not proportions
lr %>% 
  filter(days_marriage_to_birth > 0,
         days_marriage_to_birth < 700,
         Race != ".",
         Race != "NA") %>% 
  ggplot(aes(days_marriage_to_birth, fill = Female_race)) +
  geom_histogram(alpha = .3, position = "dodge") +
  geom_vline(lty = 2, xintercept = 240)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Days from birth to baptism visualisation and sanity check

lr <- lr %>% 
  mutate(days_birth_to_baptism = as.numeric(as.duration(date_baptism- date_birth), "days"))

# plots
# density plot shows proportions for each group
lr %>% 
  filter(days_birth_to_baptism > 0,
         days_birth_to_baptism < 700,
         Race != ".",
         Race != "NA") %>% 
  ggplot(aes(days_birth_to_baptism, fill = Female_race)) +
  geom_density(alpha = .3)

# histogram shows distribtuion, not proportions
lr %>% 
  filter(days_birth_to_baptism > 0,
         days_birth_to_baptism < 700,
         Race != ".",
         Race != "NA") %>% 
  ggplot(aes(days_birth_to_baptism, fill = Female_race)) +
  geom_histogram(alpha = .3, position = "dodge")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Classification of births

# here we don't worry about illigitamte births as there are none
lr <- lr %>% 
  filter(days_marriage_to_birth >= 0) %>% 
  mutate(birth_type = ifelse(days_marriage_to_birth >= 240, yes = 0, no = 1))

# plot to check for sensibility
lr %>% 
  ggplot(aes(birth_type)) +
  geom_bar() +
  facet_wrap(~ Female_race)

# THERE IS A LOT OF DATA THAT WE LOSE FROM MISSING RACE INFORMATION

Bridal pregnancy

lr <- lr %>% 
  mutate(bridal_pregnancy = birth_type)

Baptism day of week and visualisation

# day of week of baptism
lr <- lr %>% 
  mutate(baptism_dow = lubridate::wday(date_baptism),
         baptism_sunday = ifelse(baptism_dow == 1, 1, 0))

lr %>% 
  ggplot(aes(baptism_dow)) +
  geom_bar() +
  facet_wrap( ~ Female_race)

## Warning: Removed 9 rows containing non-finite values (stat_count).

# comment is that babies of coloured mothers are more likely to have baptism on Fridays

# what about the interaction between Sunday and bridal pregnancy?
lr %>% 
  filter(Female_race != ".") %>% 
  mutate(bridal_pregnancy = recode(bridal_pregnancy,
                                   `0` = "Conventional pregnancy",
                                   `1` = "Bridal pregnancy"),
         baptism_dow = wday(baptism_dow, label = T)) %>% 
  filter(baptism_dow != "NA") %>% 
  ggplot(aes(baptism_dow)) +
  geom_bar() +
  facet_grid(bridal_pregnancy  ~ Female_race) +
  labs(title = "Plot of day of week of baptism by mother's race and type of pregnancy",
       x = "Baptism day of the week",
       y = "Count")

Logistic regression

For bridal pregnancy

# getting rid of the dots
lr_clean <- lr %>% 
  filter(Race != ".",
         Breadwinner_class != ".",
         Male_age != ".",
         Female_age != ".",
         Male_signed != ".",
         Female_signed != ".",
         Month_baptized != ".")

# what is largest parish?
lr %>% 
  count(`Parish name`, sort = T)

## # A tibble: 6 x 2
##   `Parish name`                                n
##   <chr>                                    <int>
## 1 Cape Town, Athlone, St Mark                821
## 2 Cape Town, Mowbray, St Peter               422
## 3 Cape Town, Kenilworth, Christ Church       229
## 4 Cape Town, Sea Point, St James the Great   199
## 5 Cape Town, Maitland, St. Peters            136
## 6 Cape Town,Bellville,Parow,St Margaret       74

# Athlone, St Mark

# treat variables as numeric
lr_clean <- lr_clean %>% 
  mutate(Race = recode(Race, `0` = "White", `2` = "Mixed race", `1` = "Coloured"),
         Race = as.factor(Race),
         Breadwinner_class = as.numeric(Breadwinner_class),
         Male_age = as.numeric(Male_age),
         Female_age = as.numeric(Female_age),
         Male_signed = recode(Male_signed, "Yes" = 1, "No" = 0),
         Female_signed = recode(Female_signed, "Yes" = 1, "No" = 0),
         Year_baptized = as.numeric(Year_baptized),
         Parish = as.factor(`Parish name`),
         Parish = fct_relevel(Parish, "Cape Town, Athlone, St Mark"))

months <- lr %>% 
  count(Month_baptized, sort =T)

## Warning: Factor `Month_baptized` contains implicit NA, consider using
## `forcats::fct_explicit_na`

model_bridal_pregnancy <- lr_clean %>% 
  glm(bridal_pregnancy ~ Race + Breadwinner_class + Male_age + Female_age + Male_signed + Female_signed + Year_baptized + Parish, family = "binomial", data = .) %>% 
  tidy(conf.int = T)

# Here is the output
model_bridal_pregnancy

## # A tibble: 14 x 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)          -1.71     11.8        -0.145  8.85e-1 -24.8     21.3    
##  2 RaceMixed race       -0.635     0.523      -1.22   2.24e-1  -1.73     0.357  
##  3 RaceWhite            -1.37      0.242      -5.64   1.70e-8  -1.85    -0.900  
##  4 Breadwinner_class     0.684     0.205       3.33   8.61e-4   0.286    1.09   
##  5 Male_age             -0.0357    0.0167     -2.14   3.20e-2  -0.0690  -0.00355
##  6 Female_age           -0.0977    0.0205     -4.76   1.91e-6  -0.138   -0.0580 
##  7 Male_signed           0.342     0.224       1.52   1.27e-1  -0.0953   0.786  
##  8 Female_signed         0.476     0.239       1.99   4.63e-2   0.0128   0.951  
##  9 Year_baptized         0.00163   0.00607     0.268  7.89e-1  -0.0103   0.0136 
## 10 ParishCape Town, Ke~ -0.410     0.359      -1.14   2.53e-1  -1.14     0.280  
## 11 ParishCape Town, Ma~  0.321     0.291       1.10   2.70e-1  -0.250    0.892  
## 12 ParishCape Town, Mo~  0.344     0.161       2.14   3.22e-2   0.0303   0.661  
## 13 ParishCape Town, Se~  0.143     0.435       0.328  7.43e-1  -0.730    0.984  
## 14 ParishCape Town,Bel~ -0.499     0.360      -1.39   1.66e-1  -1.23     0.188

Visualisation of logistic regression for bridal pregnancy

# coefficient plot including the parish and year terms
model_bridal_pregnancy %>% 
  filter(term != "(Intercept)") %>% 
  arrange(desc(estimate)) %>% 
  mutate(term = fct_reorder(term, estimate)) %>% 
  ggplot(aes(estimate, term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  geom_vline(lty =2, xintercept = 0) +
  theme(legend.position = "bottom",
        legend.title = element_blank()) +
  labs(subtitle = "Coefficient plot for logistic regression of determinants of bridal pregnancy",
       x = "Estimate",
       y = "",
       caption = "Note: error bars show 95% confidence interval")

model_bridal_pregnancy_excl <- model_bridal_pregnancy %>% 
  filter(term != "Year_baptized",
         term != "ParishCape Town, Kenilworth, Christ Church",
         term != "ParishCape Town, Maitland, St. Peters",
         term != "ParishCape Town, Mowbray, St Peter",
         term != "ParishCape Town, Sea Point, St James the Great",
         term != "ParishCape Town,Bellville,Parow,St Margaret")

# coefficient plot excluding the parish and year terms
model_bridal_pregnancy_excl %>% 
  filter(term != "(Intercept)") %>% 
  arrange(desc(estimate)) %>% 
  mutate(term = fct_reorder(term, estimate)) %>% 
  ggplot(aes(estimate, term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  geom_vline(lty =2, xintercept = 0) +
  theme(legend.position = "bottom",
        legend.title = element_blank()) +
  labs(title = "Coefficient plot for logistic regression of determinants of bridal pregnancy",
       x = "Estimate",
       y = "",
       caption = "Note: error bars show 95% confidence interval")

Does this look correct, Laura?

Logistic regression for day of the week

model_sunday <- lr_clean %>% 
  mutate(Month_baptized = as.character(Month_baptized)) %>% 
  glm(baptism_sunday ~ Race + bridal_pregnancy + Race*bridal_pregnancy +
        Parish + Month_baptized + Year_baptized,
      family = "binomial", data = .) %>% 
  tidy(conf.int = T)
  
# here is the output
model_sunday

## # A tibble: 23 x 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)           43.2       13.1       3.29  9.94e- 4   17.7      69.2  
##  2 RaceMixed race        -0.492      1.09     -0.450 6.53e- 1   -2.28      2.46 
##  3 RaceWhite             -2.68       0.327    -8.22  2.02e-16   -3.34     -2.05 
##  4 bridal_pregnancy      -2.17       0.181   -12.0   2.91e-33   -2.54     -1.83 
##  5 ParishCape Town, Ke~   1.03       0.322     3.20  1.35e- 3    0.408     1.67 
##  6 ParishCape Town, Ma~  -1.16       0.323    -3.58  3.47e- 4   -1.79     -0.520
##  7 ParishCape Town, Mo~   1.68       0.247     6.79  1.12e-11    1.21      2.18 
##  8 ParishCape Town, Se~   1.09       0.432     2.53  1.13e- 2    0.261     1.96 
##  9 ParishCape Town,Bel~   2.37       0.449     5.26  1.41e- 7    1.53      3.30 
## 10 Month_baptizedAug      0.0677     0.334     0.203 8.39e- 1   -0.587     0.723
## # ... with 13 more rows

Visualisation of logistic regression for day of week of baptism

model_sunday %>% 
    filter(term != "(Intercept)") %>% 
  arrange(desc(estimate)) %>% 
  mutate(term = fct_reorder(term, estimate)) %>% 
  ggplot(aes(estimate, term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  geom_vline(lty =2, xintercept = 0) +
  theme(legend.position = "bottom",
        legend.title = element_blank()) +
  labs(title = "Coefficient plot for logistic regression of determinants baptism on Sunday",
       x = "Estimate",
       y = "",
       caption = "Note: error bars show 95% confidence interval")

model_sunday_excl <- model_sunday %>% 
   filter(term != "Year_baptized",
         term != "ParishCape Town, Kenilworth, Christ Church",
         term != "ParishCape Town, Maitland, St. Peters",
         term != "ParishCape Town, Mowbray, St Peter",
         term != "ParishCape Town, Sea Point, St James the Great",
         term != "ParishCape Town,Bellville,Parow,St Margaret",
         term!= "Month_baptizedMay",
         term!= "Month_baptizedAug",
         term!= "Month_baptizedJun",
         term!= "Month_baptizedFeb",
         term!= "Month_baptizedNov",
         term!= "Month_baptizedSep",
         term!= "Month_baptizedMar",
         term!= "Month_baptizedOct",
         term!= "Month_baptizedJul",
         term!= "Month_baptizedJan",
         term!= "Month_baptizedDec")

model_sunday_excl %>% 
      filter(term != "(Intercept)") %>% 
  arrange(desc(estimate)) %>% 
  mutate(term = fct_reorder(term, estimate)) %>% 
  ggplot(aes(estimate, term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  geom_vline(lty =2, xintercept = 0) +
  theme(legend.position = "bottom",
        legend.title = element_blank()) +
  labs(title = "Coefficient plot for logistic regression of determinants baptism on Sunday",
       x = "Estimate",
       y = "",
       caption = "Note: error bars show 95% confidence interval")

Here we exclude mixed race due to the small sample size

logistic regression for baptism sunday but race is binary

model_sunday_binary <- lr_clean %>% 
  filter(Race != "Mixed race") %>% 
  mutate(Race = recode(Race, "White" = 0, "Coloured" = 1)) %>% 
  mutate(Month_baptized = as.character(Month_baptized)) %>% 
  glm(baptism_sunday ~ Race + bridal_pregnancy + Race*bridal_pregnancy +
        Parish + Month_baptized + Year_baptized,
      family = "binomial", data = .) %>% 
  tidy(conf.int = T)
  
# here is the output
model_sunday_binary

## # A tibble: 21 x 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)           40.0       13.2       3.04  2.39e- 3   14.4      66.1  
##  2 Race                   2.71       0.330     8.21  2.17e-16    2.07      3.36 
##  3 bridal_pregnancy       0.413      0.383     1.08  2.81e- 1   -0.314     1.20 
##  4 ParishCape Town, Ke~   1.05       0.326     3.23  1.23e- 3    0.422     1.70 
##  5 ParishCape Town, Ma~  -1.15       0.323    -3.55  3.86e- 4   -1.78     -0.511
##  6 ParishCape Town, Mo~   1.71       0.251     6.82  9.21e-12    1.24      2.23 
##  7 ParishCape Town, Se~   1.12       0.434     2.59  9.62e- 3    0.287     1.99 
##  8 ParishCape Town,Bel~   2.38       0.452     5.27  1.34e- 7    1.54      3.32 
##  9 Month_baptizedAug      0.0601     0.334     0.180 8.57e- 1   -0.596     0.717
## 10 Month_baptizedDec     -0.436      0.306    -1.43  1.54e- 1   -1.04      0.159
## # ... with 11 more rows

visualisation of logistic regression for baptism sunday but race is binary

model_sunday_binary %>% 
    filter(term != "(Intercept)") %>% 
  arrange(desc(estimate)) %>% 
  mutate(term = fct_reorder(term, estimate)) %>% 
  ggplot(aes(estimate, term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  geom_vline(lty =2, xintercept = 0) +
  theme(legend.position = "bottom",
        legend.title = element_blank()) +
  labs(title = "Coefficient plot for logistic regression of determinants baptism on Sunday",
       x = "Estimate",
       y = "",
       caption = "Note: error bars show 95% confidence interval")

model_sunday_binary_excl <- model_sunday_binary %>% 
   filter(term != "Year_baptized",
         term != "ParishCape Town, Kenilworth, Christ Church",
         term != "ParishCape Town, Maitland, St. Peters",
         term != "ParishCape Town, Mowbray, St Peter",
         term != "ParishCape Town, Sea Point, St James the Great",
         term != "ParishCape Town,Bellville,Parow,St Margaret",
         term!= "Month_baptizedMay",
         term!= "Month_baptizedAug",
         term!= "Month_baptizedJun",
         term!= "Month_baptizedFeb",
         term!= "Month_baptizedNov",
         term!= "Month_baptizedSep",
         term!= "Month_baptizedMar",
         term!= "Month_baptizedOct",
         term!= "Month_baptizedJul",
         term!= "Month_baptizedJan",
         term!= "Month_baptizedDec")

model_sunday_binary_excl %>% 
      filter(term != "(Intercept)") %>% 
  arrange(desc(estimate)) %>% 
  mutate(term = fct_reorder(term, estimate)) %>% 
  ggplot(aes(estimate, term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  geom_vline(lty =2, xintercept = 0) +
  theme(legend.position = "bottom",
        legend.title = element_blank()) +
  labs(title = "Coefficient plot for logistic regression of determinants baptism on Sunday",
       x = "Estimate",
       y = "",
       caption = "Note: error bars show 95% confidence interval")

logistic regression for baptism sunday but race is binary and comparator level is white

model_sunday_binary_white <- lr_clean %>% 
  filter(Race != "Mixed race") %>% 
  mutate(Race = fct_relevel(Race, "White")) %>% 
  mutate(Month_baptized = as.character(Month_baptized)) %>% 
  glm(baptism_sunday ~ Race + bridal_pregnancy + Race*bridal_pregnancy +
        Parish + Month_baptized + Year_baptized,
      family = "binomial", data = .) %>% 
  tidy(conf.int = T)
  
# here is the output
model_sunday_binary_white

## # A tibble: 21 x 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)           40.0       13.2       3.04  2.39e- 3   14.4      66.1  
##  2 RaceColoured           2.71       0.330     8.21  2.17e-16    2.07      3.36 
##  3 bridal_pregnancy       0.413      0.383     1.08  2.81e- 1   -0.314     1.20 
##  4 ParishCape Town, Ke~   1.05       0.326     3.23  1.23e- 3    0.422     1.70 
##  5 ParishCape Town, Ma~  -1.15       0.323    -3.55  3.86e- 4   -1.78     -0.511
##  6 ParishCape Town, Mo~   1.71       0.251     6.82  9.21e-12    1.24      2.23 
##  7 ParishCape Town, Se~   1.12       0.434     2.59  9.62e- 3    0.287     1.99 
##  8 ParishCape Town,Bel~   2.38       0.452     5.27  1.34e- 7    1.54      3.32 
##  9 Month_baptizedAug      0.0601     0.334     0.180 8.57e- 1   -0.596     0.717
## 10 Month_baptizedDec     -0.436      0.306    -1.43  1.54e- 1   -1.04      0.159
## # ... with 11 more rows

visualisation for logistic regression for baptism sunday but race is binary and comparator level is white

model_sunday_binary_white %>% 
      filter(term != "(Intercept)") %>% 
  arrange(desc(estimate)) %>% 
  mutate(term = fct_reorder(term, estimate)) %>% 
  ggplot(aes(estimate, term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  geom_vline(lty =2, xintercept = 0) +
  theme(legend.position = "bottom",
        legend.title = element_blank()) +
  labs(title = "Coefficient plot for logistic regression of determinants baptism on Sunday",
       x = "Estimate",
       y = "",
       caption = "Note: error bars show 95% confidence interval")

model_sunday_binary_white_excl <- model_sunday_binary_white %>% 
   filter(term != "Year_baptized",
         term != "ParishCape Town, Kenilworth, Christ Church",
         term != "ParishCape Town, Maitland, St. Peters",
         term != "ParishCape Town, Mowbray, St Peter",
         term != "ParishCape Town, Sea Point, St James the Great",
         term != "ParishCape Town,Bellville,Parow,St Margaret",
         term!= "Month_baptizedMay",
         term!= "Month_baptizedAug",
         term!= "Month_baptizedJun",
         term!= "Month_baptizedFeb",
         term!= "Month_baptizedNov",
         term!= "Month_baptizedSep",
         term!= "Month_baptizedMar",
         term!= "Month_baptizedOct",
         term!= "Month_baptizedJul",
         term!= "Month_baptizedJan",
         term!= "Month_baptizedDec")

model_sunday_binary_white_excl %>% 
      filter(term != "(Intercept)") %>% 
  arrange(desc(estimate)) %>% 
  mutate(term = fct_reorder(term, estimate)) %>% 
  ggplot(aes(estimate, term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  geom_vline(lty =2, xintercept = 0) +
  theme(legend.position = "bottom",
        legend.title = element_blank()) +
  labs(title = "Coefficient plot for logistic regression of determinants baptism on Sunday",
       x = "Estimate",
       y = "",
       caption = "Note: error bars show 95% confidence interval")

Coding and regression for Laura Richardson

JJayes

26/04/2020