LAB 5 - Tables, Labels, & Notes

Author

dr-v-jbu

# Strongly penalise scientific notation so large values print in fixed form.
options(scipen = 999)

# Packages attached for this lab.
library(tidyverse)
library(socviz)
library(datasetsICR)

# Preview the first rows of the gss_sm survey data.
head(gss_sm)
# A tibble: 6 × 32
   year    id ballot    age childs sibs  degree race  sex   region incom…¹ relig
  <dbl> <dbl> <label> <dbl>  <dbl> <lab> <fct>  <fct> <fct> <fct>  <fct>   <fct>
1  2016     1 1          47      3 2     Bache… White Male  New E… $17000… None 
2  2016     2 2          61      0 3     High … White Male  New E… $50000… None 
3  2016     3 3          72      2 3     Bache… White Male  New E… $75000… Cath…
4  2016     4 1          43      4 3     High … White Fema… New E… $17000… Cath…
5  2016     5 3          55      2 2     Gradu… White Fema… New E… $17000… None 
6  2016     6 2          53      2 2     Junio… White Fema… New E… $60000… None 
# … with 20 more variables: marital <fct>, padeg <fct>, madeg <fct>,
#   partyid <fct>, polviews <fct>, happy <fct>, partners <fct>, grass <fct>,
#   zodiac <fct>, pres12 <labelled>, wtssall <dbl>, income_rc <fct>,
#   agegrp <fct>, ageq <fct>, siblings <fct>, kids <fct>, religion <fct>,
#   bigregion <fct>, partners_rc <fct>, obama <dbl>, and abbreviated variable
#   name ¹​income16

PART 1: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CATEGORICAL VARIABLES

library(socviz)

# Cross-tabulate partners_rc by happy.
# summarise() peels off only the last grouping variable (happy), so the result
# is still grouped by partners_rc and sum(N) below is a per-partners_rc total:
# freq and pct are therefore shares *within* each partners_rc category.
pip1 <- gss_sm %>%
  group_by(partners_rc, happy) %>%
  summarise(N = n()) %>%
  mutate(freq = N / sum(N)) %>%
  mutate(pct = round(freq * 100, 0))

pip1
# A tibble: 21 × 5
# Groups:   partners_rc [6]
   partners_rc happy             N    freq   pct
   <fct>       <fct>         <int>   <dbl> <dbl>
 1 0           Very Happy       80 0.196      20
 2 0           Pretty Happy    230 0.562      56
 3 0           Not Too Happy    98 0.240      24
 4 0           <NA>              1 0.00244     0
 5 1           Very Happy      354 0.326      33
 6 1           Pretty Happy    588 0.542      54
 7 1           Not Too Happy   141 0.130      13
 8 1           <NA>              2 0.00184     0
 9 2           Very Happy       19 0.153      15
10 2           Pretty Happy     80 0.645      65
# … with 11 more rows

PART 2: CREATE STACKED AND DODGED BAR CHARTS FROM 2 CATEGORICAL VARIABLES

# Title and caption shared by the bar charts in this part.
p_title <- "Happiness by Number of Partners"
p_caption <- "gss_sm dataset"

# AS STACKED BAR CHART
# Base plot: rows with a missing happy or partners_rc value are excluded, and
# the percentage of each happiness level is mapped to bar height.
p <- ggplot(
  data = filter(pip1, !is.na(happy), !is.na(partners_rc)),
  mapping = aes(x = partners_rc, y = pct, fill = happy)
)

p +
  geom_col(position = "stack") +
  # Print each percentage in the vertical middle of its bar segment.
  geom_text(aes(label = pct), position = position_stack(vjust = 0.5)) +
  labs(
    title = p_title,
    subtitle = "More than half of people regardless of the number of partners they have tend to be pretty happy.",
    caption = p_caption,
    x = "Number of Partners",
    y = "Percent",
    fill = "Happiness"
  ) +
  theme_minimal()

# AS DODGED BAR CHART
# Same base plot, with the happiness levels drawn side by side per category.
p +
  geom_col(position = "dodge2") +
  # Dodge the labels alongside the bars; the negative vjust lifts each label
  # just above the top of its bar (vjust suggested by ChatGPT).
  geom_text(
    aes(label = pct),
    position = position_dodge(width = 0.9),
    vjust = -0.3
  ) +
  labs(
    title = p_title,
    subtitle = "Those with no or one partner have a higher probability of being very happy.",
    caption = p_caption,
    x = "Number of Partners",
    y = "Percent",
    fill = "Happiness"
  ) +
  theme_minimal()

# AS FACETED HORIZONTAL BAR CHART
# One panel per happiness level, with the axes flipped so bars run
# horizontally. The fill legend is hidden because the facet strips already
# name each happiness level.
p +
  geom_col(position = "dodge2") +
  geom_text(aes(label = pct), position = position_dodge2(width = 1)) +
  facet_grid(~ happy) +
  coord_flip() +
  guides(fill = "none") +
  labs(
    title = p_title,
    subtitle = "Except for those who are single there is a negative correlation between number of partners and happiness.",
    caption = p_caption,
    x = NULL,
    y = "Percent",
    fill = "Happiness"
  ) +
  theme_light()

PART 3: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CONTINUOUS & ONE CATEGORICAL VARIABLE

# Derive numeric versions of two categorical columns by deleting every
# character that is not a digit or a dot and converting what is left.
# NOTE(review): any label that holds a range (e.g. "5-10 ...") would collapse
# to a single number (510) under this pattern -- confirm the levels of
# `partners` before relying on partners_no; it is not used in the summary below.
gss_sm <- gss_sm %>%
  mutate(
    partners_no = as.numeric(gsub("[^0-9.]", "", partners)),
    income_no = as.numeric(gsub("[^0-9.]", "", income_rc))
  )

# Per happiness level: respondent count, mean income and mean number of
# siblings (missing values ignored), then each level's share of all rows.
pip2 <- gss_sm %>%
  group_by(happy) %>%
  summarise(
    N = n(),
    income_mean = mean(income_no, na.rm = TRUE),
    sibs_mean = mean(sibs, na.rm = TRUE)
  ) %>%
  mutate(freq = N / sum(N)) %>%
  mutate(pct = round(freq * 100, 0))

pip2
# A tibble: 4 × 6
  happy             N income_mean sibs_mean    freq   pct
  <fct>         <int>       <dbl>     <dbl>   <dbl> <dbl>
1 Very Happy      806      70207.      3.76 0.281      28
2 Pretty Happy   1601      58177.      3.52 0.558      56
3 Not Too Happy   452      31730.      4.28 0.158      16
4 <NA>              8      32000       7.29 0.00279     0

PART 4: SCATTERPLOT WITH A THIRD CATEGORICAL VARIABLE

This code does not remove NA – make sure you deal with that.

# Scatterplot of mean income against mean number of siblings, one point per
# happiness level. The row where happy is NA is dropped before plotting.
p <- ggplot(
  data = subset(pip2, !is.na(happy)),
  mapping = aes(x = income_mean, y = sibs_mean, color = happy)
)

p +
  geom_point(size = 5) +
  # Free-text note anchored at fixed data coordinates; hjust = 0 left-aligns
  # the text so it extends to the right of the anchor.
  annotate(
    geom = "text", x = 31729, y = 4.23,
    label = "People with more siblings and less income tend to be not too happy.",
    hjust = 0
  ) +
  labs(
    y = "Average Number of Siblings",
    x = "Average Income",
    title = "Income and Number of Siblings by Happiness",
    subtitle = "People with higher income and fewer siblings are more likely to be very happy.",
    # Must be `caption =`, not `caption <-`: the arrow form creates a global
    # variable and hands labs() an unnamed value, so no caption is drawn.
    caption = "gss_sm dataset{socviz}",
    color = "Happiness"
  )

PART 5: LEGEND AND GUIDES

This is just an example – do not simply duplicate these options.

# Same scatterplot as before, now with dollar-formatted x-axis labels and a
# restyled legend placed inside the plotting area.
p <- ggplot(
  data = subset(pip2, !is.na(happy)),
  mapping = aes(x = income_mean, y = sibs_mean, color = happy)
)

p +
  geom_point(size = 5) +
  # Free-text note anchored at fixed data coordinates; hjust = 0 left-aligns
  # the text so it extends to the right of the anchor.
  annotate(
    geom = "text", x = 31729, y = 4.23,
    label = "People with more siblings and less income tend to be not too happy.",
    hjust = 0
  ) +
  labs(
    y = "Average Number of Siblings",
    x = "Average Income",
    title = "Income and Number of Siblings by Happiness",
    subtitle = "People with higher income and fewer siblings are more likely to be very happy.",
    # Must be `caption =`, not `caption <-`: the arrow form creates a global
    # variable and hands labs() an unnamed value, so no caption is drawn.
    caption = "gss_sm dataset{socviz}",
    color = "Happiness"
  ) +
  scale_x_continuous(labels = scales::dollar_format()) +
  # NOTE(review): a numeric legend.position is deprecated in newer ggplot2
  # releases in favour of legend.position = "inside" plus
  # legend.position.inside -- confirm against the installed version.
  theme(
    legend.title = element_text(color = "gray50", size = 14, face = "bold"),
    legend.position = c(x = 0.11, y = .2)
  )

PART 6: DATA LABELS VS LEGEND

# Same scatterplot, but each point is labelled directly with its happiness
# level and the legend is removed.
p <- ggplot(
  data = subset(pip2, !is.na(happy)),
  mapping = aes(x = income_mean, y = sibs_mean, color = happy)
)

p +
  geom_point(size = 5) +
  # hjust > 1 places each label to the left of its point.
  geom_text(mapping = aes(label = happy), hjust = 1.2, size = 3) +
  # Free-text note anchored at fixed data coordinates; hjust = 0 left-aligns
  # the text so it extends to the right of the anchor.
  annotate(
    geom = "text", x = 31729, y = 4.23,
    label = "People with more siblings and less income tend to be not too happy.",
    hjust = 0
  ) +
  labs(
    y = "Average Number of Siblings",
    x = "Average Income",
    title = "Income and Number of Siblings by Happiness",
    subtitle = "People with higher income and fewer siblings are more likely to be very happy.",
    # Must be `caption =`, not `caption <-`: the arrow form creates a global
    # variable and hands labs() an unnamed value, so no caption is drawn.
    caption = "gss_sm dataset{socviz}",
    color = "Happiness"
  ) +
  scale_x_continuous(labels = scales::dollar_format()) +
  # Fix the visible x-range (suggested by ChatGPT); presumably this leaves
  # room for the point labels to the left of the lowest-income point.
  coord_cartesian(xlim = c(25000, 72000)) +
  theme(legend.position = "none")

PART 7: INTERPRETATION

  • Happiness by Number of Partners
    • As observed in the stacked, dodged, and faceted horizontal bar charts, those with one partner had the highest probability of being very happy. Regardless of the number of partners, there was at least a 50% probability of being pretty happy. Except for those who are single, there appears to be a negative correlation between the number of partners and happiness. Perhaps the break in the trend among those who are single is caused by a stigma, since those who are single have the second-highest probability of being very happy.
  • Income and Number of Siblings by Happiness
    • Those with a lower income and more siblings tend not to be happy. The distinguishing trait between those who are pretty happy and those who are very happy is that those who are very happy tend to have slightly more siblings and a higher income. Income appears to be a significant factor in determining whether one is happy. Perhaps income contributes to the negative skew in the happiness of those who are single, since those who are married are known to earn more money than their single counterparts.

END