LAB 5 Dataset

Author

Eli Kramer

Part 1 - The Data

# Load packages. The workspace is deliberately not wiped with rm(list = ls()):
# that call only removes objects and leaves attached packages and options in
# place, so it does not give a clean session. Restart R (or knit) instead.
library(tidyverse)
library(dplyr)
library(socviz)

# Preview the first rows of the gss_sm data used throughout this lab.
head(gss_sm)
# A tibble: 6 × 32
   year    id ballot   age childs sibs  degree race  sex   region income16 relig
  <dbl> <dbl> <labe> <dbl>  <dbl> <lab> <fct>  <fct> <fct> <fct>  <fct>    <fct>
1  2016     1 1         47      3 2     Bache… White Male  New E… $170000… None 
2  2016     2 2         61      0 3     High … White Male  New E… $50000 … None 
3  2016     3 3         72      2 3     Bache… White Male  New E… $75000 … Cath…
4  2016     4 1         43      4 3     High … White Fema… New E… $170000… Cath…
5  2016     5 3         55      2 2     Gradu… White Fema… New E… $170000… None 
6  2016     6 2         53      2 2     Junio… White Fema… New E… $60000 … None 
# ℹ 20 more variables: marital <fct>, padeg <fct>, madeg <fct>, partyid <fct>,
#   polviews <fct>, happy <fct>, partners <fct>, grass <fct>, zodiac <fct>,
#   pres12 <labelled>, wtssall <dbl>, income_rc <fct>, agegrp <fct>,
#   ageq <fct>, siblings <fct>, kids <fct>, religion <fct>, bigregion <fct>,
#   partners_rc <fct>, obama <dbl>
# Count respondents in every marital-status x religion cell, then express each
# cell as a share (freq) and rounded percentage (pct) of its marital group.
pip1 <- gss_sm %>%
  group_by(marital, religion) %>%
  # "drop_last" keeps the result grouped by marital, so sum(N) below is the
  # marital-status total rather than the grand total.
  summarize(N = n(), .groups = "drop_last") %>%
  mutate(freq = N / sum(N),
         pct = round(freq * 100, 0))
pip1
# A tibble: 30 × 5
# Groups:   marital [6]
   marital religion       N    freq   pct
   <fct>   <fct>      <int>   <dbl> <dbl>
 1 Married Protestant   616 0.508      51
 2 Married Catholic     295 0.243      24
 3 Married Jewish        24 0.0198      2
 4 Married None         209 0.172      17
 5 Married Other         62 0.0512      5
 6 Married <NA>           6 0.00495     0
 7 Widowed Protestant   150 0.598      60
 8 Widowed Catholic      64 0.255      25
 9 Widowed Jewish         3 0.0120      1
10 Widowed None          28 0.112      11
# ℹ 20 more rows

Part 2 - Charts and Variables

# Stacked bar chart: one bar per marital status, split by religion.
# Rows with a missing marital status or religion are dropped before plotting.
# `data =` is passed as a named argument; the previous `(data = ...)` form was
# an assignment that created a stray global variable called `data`.
stacked <- ggplot(data = subset(pip1, !is.na(marital) & !is.na(religion)),
                  mapping = aes(x = marital, y = pct, fill = religion))

stacked + geom_col(position = "stack") +
  labs(x = "Marital", y = "Percent", fill = "Religion",
       title = "Marital and religion", caption = "gss_sm dataset",
       subtitle = "A stacked bar chart") +
  # Print each segment's percentage in the middle of that segment.
  geom_text(aes(label = pct), position = position_stack(vjust = .5))

# Dodged bar chart: religions shown side by side within each marital status.
# Rows with a missing marital status or religion are dropped before plotting.
# `data =` is passed as a named argument; the previous `(data = ...)` form was
# an assignment that created a stray global variable called `data`.
dodged <- ggplot(data = subset(pip1, !is.na(marital) & !is.na(religion)),
                 mapping = aes(x = marital, y = pct, fill = religion))

dodged + geom_col(position = "dodge") +
  labs(x = "Marital", y = "percent", fill = "Religion",
       title = "Marital and religion", caption = "gss_sm dataset",
       subtitle = "As a dodged bar chart") +
  # Dodge the labels by the same width as the bars so they line up.
  geom_text(aes(label = pct), position = position_dodge(width = .9))

# Faceted horizontal bar chart: one panel per marital status, one bar per
# religion. Religion goes on the (flipped) x axis because the fill legend is
# hidden below; with marital on both the axis and the facets, the bars could
# not be told apart and every panel carried empty slots for the other statuses.
# Rows with a missing marital status or religion are dropped before plotting.
horizontal <- ggplot(data = subset(pip1, !is.na(marital) & !is.na(religion)),
                     mapping = aes(x = religion, y = pct, fill = religion))

horizontal + geom_col(position = "dodge2") +
  labs(x = NULL, y = "percent", fill = "Religion",
       title = "Marital and religion", caption = "gss_sm dataset",
       subtitle = "As a faceted horizontal chart") +
  guides(fill = "none") +
  coord_flip() +
  facet_grid(~ marital) +
  geom_text(aes(label = pct), position = position_dodge2(width = .9))

Part 3 - dplyr and Summarization

# Per-religion summary: group size, mean number of children, mean number of
# siblings, and each group's share of all respondents (freq, pct).
categorical <- gss_sm %>%
  group_by(religion) %>%
  summarize(
    N = n(),
    childs_mean = mean(childs, na.rm = TRUE),
    siblings_mean = mean(sibs, na.rm = TRUE)
  ) %>%
  mutate(
    freq = N / sum(N),
    pct = round(100 * freq, 0)
  )

categorical
# A tibble: 6 × 6
  religion       N childs_mean siblings_mean    freq   pct
  <fct>      <int>       <dbl>         <dbl>   <dbl> <dbl>
1 Protestant  1371        2.02          3.77 0.478      48
2 Catholic     649        2.06          4.22 0.226      23
3 Jewish        51        1.71          1.84 0.0178      2
4 None         619        1.41          3.29 0.216      22
5 Other        159        1.32          3.57 0.0555      6
6 <NA>          18        2.25          2.82 0.00628     1
# Per-marital-status summary: group size, mean number of children, mean number
# of siblings, and each group's share of all respondents (freq, pct).
categorical2 <- gss_sm %>%
  group_by(marital) %>%
  summarize(
    N = n(),
    childs_mean = mean(childs, na.rm = TRUE),
    siblings_mean = mean(sibs, na.rm = TRUE)
  ) %>%
  mutate(
    freq = N / sum(N),
    pct = round(100 * freq, 0)
  )

categorical2
# A tibble: 6 × 6
  marital           N childs_mean siblings_mean     freq   pct
  <fct>         <int>       <dbl>         <dbl>    <dbl> <dbl>
1 Married        1212       2.17           3.63 0.423       42
2 Widowed         251       2.84           4.49 0.0875       9
3 Divorced        495       2.23           3.86 0.173       17
4 Separated       102       2.46           4.79 0.0356       4
5 Never Married   806       0.748          3.38 0.281       28
6 <NA>              1       6              7    0.000349     0

Part 4 - Scatterplot

# Scatterplot of mean siblings against mean children, one point per religion.
p <- ggplot(categorical,
            aes(x = siblings_mean, y = childs_mean, color = religion))
p + geom_point(size = 5) +
  # The empty label draws nothing visible; it appears to be here only so the
  # axes extend to x = 2, y = 3.5.
  annotate(geom = "text", x = 2, y = 3.5,
           label = "", hjust = 0) +
  # `caption` must be passed with `=`: `caption <- "..."` assigned a global
  # variable and handed labs() an unnamed string, so no caption was set.
  labs(y = "Average number children", x = "Average # siblings.",
       title = "Number of siblings and Number of Children by Religion",
       subtitle = "Catholics have the most average siblings",
       caption = "gss_sm dataset{socviz}")

Part 5 - Legends

# Add a plain-character copy of the religion factor, then build pip2 as the
# summary table sorted alphabetically on that copy (missing values go last).
categorical[["religion.c"]] <- as.character(categorical[["religion"]])
pip2 <- categorical[with(categorical, order(religion.c)), ]
pip2
# A tibble: 6 × 7
  religion       N childs_mean siblings_mean    freq   pct religion.c
  <fct>      <int>       <dbl>         <dbl>   <dbl> <dbl> <chr>     
1 Catholic     649        2.06          4.22 0.226      23 Catholic  
2 Jewish        51        1.71          1.84 0.0178      2 Jewish    
3 None         619        1.41          3.29 0.216      22 None      
4 Other        159        1.32          3.57 0.0555      6 Other     
5 Protestant  1371        2.02          3.77 0.478      48 Protestant
6 <NA>          18        2.25          2.82 0.00628     1 <NA>      
# Same scatterplot, coloured by the character version of religion, with a
# styled legend title and the legend placed inside the plotting area.
p <- ggplot(
  data = categorical,
  mapping = aes(x = siblings_mean, y = childs_mean, color = religion.c)
)
p +
  geom_point(size = 5) +
  # Empty label: nothing is drawn, but the coordinates still take part in
  # setting the axis ranges.
  annotate(geom = "text", x = 1.6, y = 3.5, label = "", hjust = 0) +
  labs(
    y = "Average number of children",
    x = "Average Number of siblings",
    color = "Religion"
  ) +
  theme(
    legend.title = element_text(color = "gray50", size = 14, face = "bold"),
    legend.position = c(x = 0.1, y = .7)
  )

Part 6 - Data Labels

# Scatterplot with each point labelled directly by religion; the legend is
# hidden because the text labels already identify the points.
p <- ggplot(pip2, aes(x = siblings_mean, y = childs_mean, color = religion.c))
p + geom_point(size = 5) +
  # hjust > 1 pushes each label to the left of its point.
  geom_text(mapping = aes(label = religion), hjust = 1.2, size = 3) +
  # The empty label draws nothing visible; it appears to be here only so the
  # axes extend to x = 1.8, y = 3.5.
  annotate(geom = "text", x = 1.8, y = 3.5,
           label = "", hjust = 0) +
  # The points are religion groups, so the title says "Religion", not "Region".
  labs(y = "Average number of children", x = "Average Number of siblings",
       title = "Number of siblings and Number of Children by Religion",
       color = "Religion") +
  theme(legend.position = "none")

Part 7 - Interpretation

# Add a plain-character copy of the marital factor, then overwrite pip2 with
# the marital summary sorted alphabetically on that copy (missing values last).
categorical2[["marital.c"]] <- as.character(categorical2[["marital"]])
pip2 <- categorical2[with(categorical2, order(marital.c)), ]
pip2
# A tibble: 6 × 7
  marital           N childs_mean siblings_mean     freq   pct marital.c    
  <fct>         <int>       <dbl>         <dbl>    <dbl> <dbl> <chr>        
1 Divorced        495       2.23           3.86 0.173       17 Divorced     
2 Married        1212       2.17           3.63 0.423       42 Married      
3 Never Married   806       0.748          3.38 0.281       28 Never Married
4 Separated       102       2.46           4.79 0.0356       4 Separated    
5 Widowed         251       2.84           4.49 0.0875       9 Widowed      
6 <NA>              1       6              7    0.000349     0 <NA>         
# Scatterplot of mean siblings against mean children, one point per marital
# status, with the legend placed inside the plotting area.
p <- ggplot(categorical2,
            aes(x = siblings_mean, y = childs_mean, color = marital.c))
p + geom_point(size = 5) +
  # The empty label draws nothing visible; it appears to be here only so the
  # axes extend to x = 1.6, y = 7.
  annotate(geom = "text", x = 1.6, y = 7,
           label = "", hjust = 0) +
  # The missing-marital-status group contains a single respondent (N = 1 in
  # categorical2), so the subtitle describes one person, not "many people".
  labs(y = "Average number of children", x = "Average Number of siblings",
       title = "Number of children and siblings by marital status",
       subtitle = "The one respondent with no reported marital status had 7 siblings and 6 children",
       color = "marital") +
  theme(legend.title = element_text(color = "gray50", size = 14, face = "bold"),
        legend.position = c(x = 0.1, y = .7))

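Catholics had the most siblings on average and Jewish respondents had the fewest. Every religion group averaged at least one child; Protestants, Catholics, and respondents with a missing religion averaged slightly more than two. The unknown marital status group had many siblings and many children, but it contains only a single respondent (7 siblings, 6 children), so it should not be read as a trend. The other statuses averaged roughly 3.4 to 4.8 siblings and about 2 to 3 children, except never-married respondents, who averaged fewer than one child.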