Setup

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.5     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

Get Data

This data is from CDC Wonder and covers the years 2007 through 2019. The race variable we are using was dropped for 2020.

Gender <- read_delim("Natality, 2007-2020.txt", 
delim = "\t", escape_double = FALSE, 
col_names = TRUE, trim_ws = TRUE) %>% 
select(`Census Region Code`, `Age of Mother 9`,Year,
       `Mother's Bridged Race`,Gender,Births) %>% 
filter(Year < 2020)  %>% 
drop_na()

## Rows: 3421 Columns: 12

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Notes, Census Region, Census Region Code, Mother's Bridged Race, Mo...
## dbl (3): Year, Year Code, Births

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Warning: One or more parsing issues, see `problems()` for details

#save(Gender,file = "Gender.Rdata")

Inspect

Do a summary(), glimpse(), and head() of the dataframe.

Solution

summary(Gender)

##  Census Region Code Age of Mother 9         Year      Mother's Bridged Race
##  Length:3322        Length:3322        Min.   :2007   Length:3322          
##  Class :character   Class :character   1st Qu.:2010   Class :character     
##  Mode  :character   Mode  :character   Median :2013   Mode  :character     
##                                        Mean   :2013                        
##                                        3rd Qu.:2016                        
##                                        Max.   :2019                        
##     Gender              Births      
##  Length:3322        Min.   :    10  
##  Class :character   1st Qu.:   257  
##  Mode  :character   Median :  2142  
##                     Mean   : 15605  
##                     3rd Qu.: 12607  
##                     Max.   :172645

glimpse(Gender)

## Rows: 3,322
## Columns: 6
## $ `Census Region Code`    <chr> "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1", "C…
## $ `Age of Mother 9`       <chr> "15-19 years", "15-19 years", "15-19 years", "…
## $ Year                    <dbl> 2007, 2007, 2008, 2008, 2009, 2009, 2010, 2010…
## $ `Mother's Bridged Race` <chr> "American Indian or Alaska Native", "American …
## $ Gender                  <chr> "Female", "Male", "Female", "Male", "Female", …
## $ Births                  <dbl> 150, 131, 135, 144, 131, 127, 123, 110, 134, 1…

head(Gender)

## # A tibble: 6 × 6
##   `Census Region Code` `Age of Mother 9`  Year `Mother's Bridged … Gender Births
##   <chr>                <chr>             <dbl> <chr>               <chr>   <dbl>
## 1 CENS-R1              15-19 years        2007 American Indian or… Female    150
## 2 CENS-R1              15-19 years        2007 American Indian or… Male      131
## 3 CENS-R1              15-19 years        2008 American Indian or… Female    135
## 4 CENS-R1              15-19 years        2008 American Indian or… Male      144
## 5 CENS-R1              15-19 years        2009 American Indian or… Female    131
## 6 CENS-R1              15-19 years        2009 American Indian or… Male      127

Pivot

We want to have two separate columns name Male and Female. These columns contain the numbers of births. Use pivot_wider() to accomplish this. Check the results with the three functions above.

Solution

Gender = Gender %>% 
pivot_wider(names_from = Gender, values_from = Births)  %>% 
drop_na() 

summary(Gender)

##  Census Region Code Age of Mother 9         Year      Mother's Bridged Race
##  Length:1638        Length:1638        Min.   :2007   Length:1638          
##  Class :character   Class :character   1st Qu.:2010   Class :character     
##  Mode  :character   Mode  :character   Median :2013   Mode  :character     
##                                        Mean   :2013                        
##                                        3rd Qu.:2016                        
##                                        Max.   :2019                        
##      Female              Male         
##  Min.   :    10.0   Min.   :    10.0  
##  1st Qu.:   266.2   1st Qu.:   278.5  
##  Median :  2180.5   Median :  2278.5  
##  Mean   : 15454.6   Mean   : 16192.4  
##  3rd Qu.: 12555.5   3rd Qu.: 13163.5  
##  Max.   :164768.0   Max.   :172645.0

glimpse(Gender)

## Rows: 1,638
## Columns: 6
## $ `Census Region Code`    <chr> "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1", "C…
## $ `Age of Mother 9`       <chr> "15-19 years", "15-19 years", "15-19 years", "…
## $ Year                    <dbl> 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014…
## $ `Mother's Bridged Race` <chr> "American Indian or Alaska Native", "American …
## $ Female                  <dbl> 150, 135, 131, 123, 134, 140, 106, 72, 63, 53,…
## $ Male                    <dbl> 131, 144, 127, 110, 135, 144, 106, 85, 100, 70…

head(Gender)

## # A tibble: 6 × 6
##   `Census Region Code` `Age of Mother 9`  Year `Mother's Bridged R… Female  Male
##   <chr>                <chr>             <dbl> <chr>                 <dbl> <dbl>
## 1 CENS-R1              15-19 years        2007 American Indian or …    150   131
## 2 CENS-R1              15-19 years        2008 American Indian or …    135   144
## 3 CENS-R1              15-19 years        2009 American Indian or …    131   127
## 4 CENS-R1              15-19 years        2010 American Indian or …    123   110
## 5 CENS-R1              15-19 years        2011 American Indian or …    134   135
## 6 CENS-R1              15-19 years        2012 American Indian or …    140   144

Clean Up

Use the code from Fertility 3 to fix the names and values in this dataframe.

Solution

Gender = Gender %>% 
rename(Region = `Census Region Code`,
         Race = `Mother's Bridged Race`,
         Age = `Age of Mother 9`) %>% 
mutate(Region = ifelse(Region == "CENS-R1","NE",Region),
         Region = ifelse(Region == "CENS-R2","MW",Region),
         Region = ifelse(Region == "CENS-R3","SO",Region),
         Region = ifelse(Region == "CENS-R4","WE",Region),
         Race = ifelse(Race == "American Indian or Alaska Native","AmInd",Race),
         Race = ifelse(Race == "Asian or Pacific Islander","API",Race),
         Race = ifelse(Race == "Black or African American","Black",Race))
# View(Gender)

The Ratio

Use all of the data to compute the proportion of males in live births.

Solution

Gender %>% 
  summarize(Male = sum(Male),
            Female = sum(Female)) %>% 
  mutate(p = Male/(Male + Female))

## # A tibble: 1 × 3
##       Male   Female     p
##      <dbl>    <dbl> <dbl>
## 1 26523138 25314578 0.512

Let’s take .5116 as the best estimate. Use the spreadsheet to look at the implications of this and the different probabilities of death for males and females at different ages.

For Wednesday

Use this data to ask if the Male ratio differs by Age, Race, or Region.

Solution for Age

Gender %>% 
  group_by(Age) %>% 
  summarize(n = sum(Male + Female),
             p = sum(Male)/(n)) %>% 
  mutate(se = sqrt(p*(1-p)/n),
         ub = p + 1.96 * se,
         lb = p - 1.96 * se) %>% 
ggplot(aes(y = Age ) )  +
      geom_point(aes(x = p), color = "black") +
      geom_point(aes(x = lb), color = "red") +
      geom_point(aes(x = ub), color = "blue") +
      ggtitle("95% Confidence Intervals for Male Ratio") +
      geom_vline(aes(xintercept =.5116))

Solution

Gender %>% 
  group_by(Race) %>% 
  summarize(n = sum(Male + Female),
             p = sum(Male)/(n)) %>% 
  mutate(se = sqrt(p*(1-p)/n),
         ub = p + 1.96 * se,
         lb = p - 1.96 * se) %>% 
ggplot(aes(y = Race ) )  +
      geom_point(aes(x = p), color = "black") +
      geom_point(aes(x = lb), color = "red") +
      geom_point(aes(x = ub), color = "blue") +
      ggtitle("95% Confidence Intervals for Male Ratio") +
      geom_vline(aes(xintercept =.5116))

Solution

Gender %>% 
  group_by(Year) %>% 
  summarize(n = sum(Male + Female),
             p = sum(Male)/(n)) %>% 
  mutate(se = sqrt(p*(1-p)/n),
         ub = p + 1.96 * se,
         lb = p - 1.96 * se) %>% 
ggplot(aes(y = Year ) )  +
      geom_point(aes(x = p), color = "black") +
      geom_point(aes(x = lb), color = "red") +
      geom_point(aes(x = ub), color = "blue") +
      ggtitle("95% Confidence Intervals for Male Ratio") +
      geom_vline(aes(xintercept =.5116))

Solution

Gender %>% 
  group_by(Region) %>% 
  summarize(n = sum(Male + Female),
             p = sum(Male)/(n)) %>% 
  mutate(se = sqrt(p*(1-p)/n),
         ub = p + 1.96 * se,
         lb = p - 1.96 * se) %>% 
ggplot(aes(y = Region ) )  +
      geom_point(aes(x = p), color = "black") +
      geom_point(aes(x = lb), color = "red") +
      geom_point(aes(x = ub), color = "blue") +
      ggtitle("95% Confidence Intervals for Male Ratio") +
      geom_vline(aes(xintercept =.5116))

Transition to Mortality

Use the SSA life table to compute the ratio of the male probability of death to the female probability of death at every age.

Solution

ProbDeath <- read_csv("ProbDeath.csv")

## Rows: 120 Columns: 3

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): Age, MProbDeath, FProbDeath

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

ProbDeath %>% 
  mutate(Ratio = MProbDeath/FProbDeath) %>% 
  ggplot(aes(x = Age, y = Ratio)) +
  geom_point() +
  ggtitle("Ratio of Male Porbability of Death to Female")

Gender Mix

Topic

Setup

Get Data

Inspect

Solution

Pivot

Solution

Clean Up

Solution

The Ratio

Solution

For Wednesday

Solution for Age

Solution for Race

Solution

Solution for Year

Solution

Solution for Region

Solution

Transition to Mortality

Solution