Lab 5

Author

Elise Bosma

options(scipen=999) 
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(socviz)
library(dplyr)

Part 1: Summarizing the Data

# ChatGPT helped me with this code.
# Count respondents for every marital-status / happiness combination, then
# express each count as a share of its marital-status group.
mydata <- gss_sm %>%
  group_by(marital, happy) %>%
  # summarise() drops only the last grouping level, so the result is still
  # grouped by marital and sum(N) below is the total per marital status.
  summarise(N = n()) %>%
  mutate(
    freq = N / sum(N),
    pct = round(100 * freq, 0)
  )

`summarise()` has grouped output by 'marital'. You can override using the
`.groups` argument.

mydata

# A tibble: 19 × 5
# Groups:   marital [6]
   marital       happy             N    freq   pct
   <fct>         <fct>         <int>   <dbl> <dbl>
 1 Married       Very Happy      465 0.384      38
 2 Married       Pretty Happy    657 0.542      54
 3 Married       Not Too Happy    86 0.0710      7
 4 Married       <NA>              4 0.00330     0
 5 Widowed       Very Happy       59 0.235      24
 6 Widowed       Pretty Happy    126 0.502      50
 7 Widowed       Not Too Happy    66 0.263      26
 8 Divorced      Very Happy      100 0.202      20
 9 Divorced      Pretty Happy    281 0.568      57
10 Divorced      Not Too Happy   114 0.230      23
11 Separated     Very Happy       26 0.255      25
12 Separated     Pretty Happy     46 0.451      45
13 Separated     Not Too Happy    29 0.284      28
14 Separated     <NA>              1 0.00980     1
15 Never Married Very Happy      156 0.194      19
16 Never Married Pretty Happy    490 0.608      61
17 Never Married Not Too Happy   157 0.195      19
18 Never Married <NA>              3 0.00372     0
19 <NA>          Pretty Happy      1 1         100

Interpretation:

The data are grouped by marital status and then by happiness level. ‘N’ is the number of observations in each marital-status/happiness combination, ‘freq’ is the proportion of respondents within each marital status who fall into that happiness category, and ‘pct’ is that same proportion expressed as a rounded percentage. For instance, among those who are married, 54% are ‘Pretty Happy,’ which is the largest percentage of the three ‘happy’ categories.

Part 2: Stacked and Dodged Bar Charts

# Title and caption reused by every bar chart in this part.
p_title <- "Marriage and Happiness"
p_caption <- "Dataset: gss_sm dataset"

# Stacked Bar Chart
# Base plot: rows with a missing marital status or happiness level are
# removed first so that no NA category is drawn.
p <- mydata %>%
  filter(!is.na(marital), !is.na(happy)) %>%
  ggplot(mapping = aes(x = marital, y = pct, fill = happy))

p +
  geom_col(position = "stack") +
  # vjust = 0.5 places each label in the middle of its stacked segment.
  geom_text(aes(label = pct), position = position_stack(vjust = 0.5)) +
  labs(
    title = p_title,
    subtitle = "As a stacked bar chart",
    caption = p_caption,
    x = "Marriage Status",
    y = "Percent",
    fill = "Happiness"
  )

# Dodged Bar Chart
# Same base plot as above, with one bar per happiness level placed side by
# side within each marital status and labelled with its percentage.
p +
  geom_col(position = "dodge2") +
  geom_text(aes(label = pct), position = position_dodge(width = 0.9)) +
  labs(
    title = p_title,
    subtitle = "As a dodged bar chart",
    caption = p_caption,
    x = "Marriage Status",
    y = "Percent",
    fill = "Happiness"
  )

# Faceted Horizontal Bar Chart
# One panel per happiness level with the bars flipped to run horizontally.
# The fill legend is hidden because the panel titles already name the levels.
p +
  geom_col(position = "dodge2") +
  geom_text(aes(label = pct), position = position_dodge2(width = 1)) +
  coord_flip() +
  facet_grid(. ~ happy) +
  guides(fill = "none") +
  labs(
    title = p_title,
    subtitle = "As a faceted horizontal bar chart",
    caption = p_caption,
    x = NULL,
    y = "Percent",
    fill = "Happiness"
  )

Part 3: Summarizing the Data with Two Continuous Variables and One Categorical

# One row per marital status: group size, mean age and mean number of
# siblings (missing values ignored), plus each group's share of all rows.
mydata2 <- gss_sm %>%
  group_by(marital) %>%
  summarise(
    N = n(),
    age_mean = mean(age, na.rm = TRUE),
    sibs_mean = mean(sibs, na.rm = TRUE)
  ) %>%
  mutate(
    freq = N / sum(N),
    pct = round(100 * freq, 0)
  )
mydata2

# A tibble: 6 × 6
  marital           N age_mean sibs_mean     freq   pct
  <fct>         <int>    <dbl>     <dbl>    <dbl> <dbl>
1 Married        1212     51.3      3.63 0.423       42
2 Widowed         251     72.3      4.49 0.0875       9
3 Divorced        495     54.5      3.86 0.173       17
4 Separated       102     47.8      4.79 0.0356       4
5 Never Married   806     35.6      3.38 0.281       28
6 <NA>              1     51        7    0.000349     0

Interpretation:

The results are structured in the same manner as in Part 1. However, this time the summary includes the means of two continuous variables, ‘age_mean’ and ‘sibs_mean.’ The average age of widowed respondents is about 72, and the average age of respondents who never married is about 36. This pattern makes sense: younger individuals are more likely to have never married, while widowed individuals tend to have lived longer lives. In contrast, the average number of siblings ranges from roughly 3 to 5 and does not show a strong relationship with marital status or age.

Part 4: Scatterplot with Two Continuous Variables and One Categorical

# Omits missing values
# na.omit() drops every row that contains an NA in any column.
mydata2 <- na.omit(mydata2)

# Scatterplot
# One point per marital status: mean number of siblings (x) against mean
# age (y), coloured by marital status.
p <- ggplot(mydata2, aes(x = sibs_mean, y = age_mean, color = marital))
p + geom_point(size = 5) +
    annotate(geom = "text", x = 1.6, y = 58,
             label = "The average number of siblings does not fall below 3.",
             hjust = 0) +
    # The caption must be passed with `=`. Writing `caption <- ...` here would
    # assign a variable named `caption` in the calling environment and hand
    # the text to labs() as an unnamed argument instead of as the caption.
    labs(y = "Average Age", x = "Average # of Siblings.",
         title = "Age and Number of Siblings by Marital Status",
         subtitle = "Comparing the average age of individuals and their average number of siblings by marital status",
         caption = "Dataset: gss_sm dataset{socviz}")

Part 5: Legends and Guides

# ChatGPT helped me write this code
# Reordering Variables in Legend
# The legend lists categories in factor-level order, so the levels of
# marital are rewritten in the order they should appear.
mydata2[["marital"]] <- factor(
  mydata2[["marital"]],
  levels = c("Married", "Divorced", "Widowed", "Separated", "Never Married")
)

# Scatterplot
# Same scatterplot as in Part 4, now with a titled legend placed above the
# plotting area.
p <- ggplot(
  data = mydata2,
  mapping = aes(x = sibs_mean, y = age_mean, color = marital)
) +
  geom_point(size = 5) +
  annotate(
    geom = "text", x = 1.6, y = 58, hjust = 0,
    label = "The average number of siblings does not fall below 3."
  ) +
  labs(
    title = "Age and Number of Siblings by Marital Status",
    subtitle = "Comparing the average age of individuals and their average number of siblings by marital status",
    caption = "Dataset: gss_sm dataset{socviz}",
    x = "Average # of Siblings.",
    y = "Average Age",
    color = "Marital Status"
  ) +
  guides(color = guide_legend(title = "Marital Status", order = 1)) +
  theme(legend.position = "top")
p

Part 6: Data Labels

# Creating the plot with data labels
# Each point is labelled directly with its marital status, so the colour
# legend is switched off.
p <- ggplot(
  data = mydata2,
  mapping = aes(
    x = sibs_mean, y = age_mean,
    color = marital, label = marital
  )
) +
  geom_point(size = 5) +
  # hjust = 1.25 pushes each label to the left of its point.
  geom_text(size = 3, hjust = 1.25) +
  annotate(
    geom = "text", x = 1.6, y = 58, hjust = 0,
    label = "The average number of siblings does not fall below 3."
  ) +
  labs(
    title = "Age and Number of Siblings by Marital Status",
    subtitle = "Comparing the average age of individuals and their average number of siblings by marital status",
    caption = "Dataset: gss_sm dataset{socviz}",
    x = "Average # of Siblings.",
    y = "Average Age",
    color = "Marital Status"
  ) +
  theme(legend.position = "none")
p

Part 7: Insights

# Labelled scatterplot of mean age against mean number of siblings by
# marital status, with the subtitle used to state the main takeaways.
p <- ggplot(
  data = mydata2,
  mapping = aes(
    x = sibs_mean, y = age_mean,
    color = marital, label = marital
  )
) +
  geom_point(size = 5) +
  # hjust = 1.25 pushes each label to the left of its point.
  geom_text(size = 3, hjust = 1.25) +
  annotate(
    geom = "text", x = 1.6, y = 58, hjust = 0,
    label = "The average number of siblings does not fall below 3."
  ) +
  labs(
    title = "Age and Number of Siblings by Marital Status",
    subtitle = "The widows have an older average age whereas the never married have a younger average age.\nThose who are not currently married have a larger # of siblings.\nMarried and divorced are close in age where the others are further apart.",
    caption = "Dataset: gss_sm dataset{socviz}",
    x = "Average # of Siblings.",
    y = "Average Age",
    color = "Marital Status"
  ) +
  # Points are labelled directly, so the colour legend is not needed.
  theme(legend.position = "none")
p