LAB 5- GSS

Author

Jose Almanza

options(scipen=999) 
library(tidyverse)
library(socviz)
library(datasetsICR)

PART 1: SUMMARIZE DATA: TWO CATEGORICAL VARIABLES

## Summarize data
library(socviz)
# Count respondents in each degree x sex cell, then express every cell as a
# share of its degree level, so the two sexes sum to 100% within a degree.
pip1 <- gss_sm %>%
  group_by(degree, sex) %>%
  # State explicitly that the result stays grouped by `degree` only: the
  # mutate() below relies on that grouping for its within-degree denominator,
  # and spelling it out avoids summarize()'s "has grouped output by" message.
  summarize(N = n(), .groups = "drop_last") %>%
  mutate(freq = N / sum(N),
         pct = round(freq * 100, 0))
pip1
# A tibble: 12 × 5
# Groups:   degree [6]
   degree         sex        N  freq   pct
   <fct>          <fct>  <int> <dbl> <dbl>
 1 Lt High School Male     147 0.448    45
 2 Lt High School Female   181 0.552    55
 3 High School    Male     662 0.453    45
 4 High School    Female   799 0.547    55
 5 Junior College Male      89 0.412    41
 6 Junior College Female   127 0.588    59
 7 Bachelor       Male     243 0.453    45
 8 Bachelor       Female   293 0.547    55
 9 Graduate       Male     132 0.415    42
10 Graduate       Female   186 0.585    58
11 <NA>           Male       3 0.375    38
12 <NA>           Female     5 0.625    62
## Removing N/A values 
# Keep only the complete rows of the summary table (no NA in any column)
mydata <- pip1 %>% na.omit()
mydata
# A tibble: 10 × 5
# Groups:   degree [5]
   degree         sex        N  freq   pct
   <fct>          <fct>  <int> <dbl> <dbl>
 1 Lt High School Male     147 0.448    45
 2 Lt High School Female   181 0.552    55
 3 High School    Male     662 0.453    45
 4 High School    Female   799 0.547    55
 5 Junior College Male      89 0.412    41
 6 Junior College Female   127 0.588    59
 7 Bachelor       Male     243 0.453    45
 8 Bachelor       Female   293 0.547    55
 9 Graduate       Male     132 0.415    42
10 Graduate       Female   186 0.585    58

The table breaks respondents down by educational attainment and sex. For each degree level (“Lt High School,” “High School,” “Junior College,” “Bachelor,” and “Graduate”) it reports the count (N), the share of that degree level made up by each sex (freq), and the same share as a rounded percentage (pct). Females are the majority in every category: they make up 55% of the “Lt High School,” “High School,” and “Bachelor” groups, 58% of the “Graduate” group, and 59% of the “Junior College” group. The gender gap therefore does not diminish at higher levels of attainment; it is the same for “Bachelor” as for “High School” (45% vs. 55%) and slightly wider for “Graduate” (42% vs. 58%). Because the percentages are computed within each degree level, they describe the sex composition of each group rather than how likely men or women are to reach that level.

PART 2: CREATE STACKED AND DODGED BAR CHARTS FROM 2 CATEGORICAL VARIABLES

# Title and caption text shared by the charts in this part
p_caption <- "gss_sm dataset"
p_title <- "Study Degree by Sex"

# AS STACKED BAR CHART
# Base plot object: degree on x, percent on y, bars filled by sex
p <- ggplot(
  data = subset(mydata, !(is.na(degree) | is.na(sex))),
  mapping = aes(x = degree, y = pct, fill = sex)
)

# Stack the two sexes on top of each other and print each segment's percent
# at the middle of that segment
p +
  geom_col(position = position_stack()) +
  geom_text(aes(label = pct), position = position_stack(vjust = 0.5)) +
  labs(
    title = p_title,
    subtitle = "As a stacked bar chart",
    caption = p_caption,
    x = "Degree",
    y = "Percent",
    fill = "Sex"
  )

# AS DODGED BAR CHART
# Side-by-side bars per sex within each degree; the text labels are dodged by
# the same total width so that each percent sits over its own bar
p +
  geom_col(position = position_dodge2()) +
  geom_text(aes(label = pct), position = position_dodge(width = 0.9)) +
  labs(
    title = p_title,
    subtitle = "As a dodged bar chart",
    caption = p_caption,
    x = "Degree",
    y = "Percent",
    fill = "Sex"
  )

# AS FACETED HORIZONTAL BAR CHART
# One panel per degree, one horizontal bar per sex.
# Sex is mapped to the axis for this chart because the fill legend is hidden:
# with degree on the axis as well as in the facets, nothing would identify
# which bar belongs to which sex and most of every panel would be empty.
p + aes(x = sex) +
    geom_col(position = "dodge2") +
    labs(x = NULL, y = "Percent",
         title = p_title, caption = p_caption,
         subtitle = "As a faceted horizontal bar chart") +
    # The axis already names each sex, so the fill legend is redundant
    guides(fill = "none") +
    coord_flip() +
    facet_grid(~ degree) +
    # hjust > 1 pulls the label just inside the end of its bar so it is not
    # clipped at the panel edge
    geom_text(aes(label = pct), hjust = 1.2)

PART 3: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CONTINUOUS & ONE CATEGORICAL VARIABLE

# Inspect the partners variable (category counts and NA's) before filtering
summary(gss_sm[["partners"]])
         No Partners            1 Partner           2 Partners 
                 409                 1085                  124 
          3 Partners           4 Partners        5-10 Partners 
                  50                   34                   38 
      11-20 Partners      21-100 Partners 1 or More, # Unknown 
                  11                    1                    9 
                NA's 
                1106 
# Inspect the age variable (quartiles, mean and NA's) before filtering
summary(gss_sm[["age"]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  18.00   34.00   49.00   49.16   62.00   89.00      10 
# For every sex x partners cell: number of respondents and their mean age,
# plus the cell's share of its sex (freq) and that share as a rounded percent.
# Only the eight countable partners categories are kept, which also excludes
# "1 or More, # Unknown" and missing partners values; rows with a missing age
# are dropped as well.
mydata2 <- gss_sm %>%
  filter(
    !is.na(age),
    partners %in% c(
      "No Partners", "1 Partner", "2 Partners", "3 Partners",
      "4 Partners", "5-10 Partners", "11-20 Partners", "21-100 Partners"
    )
  ) %>%
  group_by(sex, partners) %>%
  summarize(
    N = n(),
    age_mean = mean(age, na.rm = TRUE)
  ) %>%
  # Percentages are computed within each sex
  group_by(sex) %>%
  mutate(
    freq = N / sum(N),
    pct = round(freq * 100, 0)
  )

mydata2
# A tibble: 15 × 6
# Groups:   sex [2]
   sex    partners            N age_mean    freq   pct
   <fct>  <fct>           <int>    <dbl>   <dbl> <dbl>
 1 Male   No Partners       165     53.5 0.210      21
 2 Male   1 Partner         470     47.5 0.597      60
 3 Male   2 Partners         74     39.4 0.0940      9
 4 Male   3 Partners         23     43.9 0.0292      3
 5 Male   4 Partners         20     35.8 0.0254      3
 6 Male   5-10 Partners      25     32.3 0.0318      3
 7 Male   11-20 Partners      9     35.1 0.0114      1
 8 Male   21-100 Partners     1     72   0.00127     0
 9 Female No Partners       243     59.9 0.253      25
10 Female 1 Partner         611     45.7 0.636      64
11 Female 2 Partners         50     38.0 0.0521      5
12 Female 3 Partners         27     32.5 0.0281      3
13 Female 4 Partners         14     32.5 0.0146      1
14 Female 5-10 Partners      13     31.8 0.0135      1
15 Female 11-20 Partners      2     55   0.00208     0

PART 4: SCATTERPLOT WITH A THIRD CATEGORICAL VARIABLE

This code does not remove NA – make sure you deal with that.

# Scatterplot of mean age (x) against partners category (y), coloured by sex,
# with point size showing how many respondents fall in each cell.
p <- ggplot(mydata2, aes(x = age_mean, y = partners, color = sex)) +
  geom_point(aes(size = N)) +
  # The note is anchored at x = 40, inside the range of the plotted mean ages
  # (roughly 32 to 72). Anchoring it at 1.6 stretched the x axis far to the
  # left of the data and squeezed all points into the right of the panel.
  annotate(geom = "text", x = 40, y = "11-20 Partners",
           label = "There are more females with no partners than males",
           hjust = 0, vjust = 2) +
  labs(y = "Number of Partners", x = "Average Age",
       title = "Number of Partners by Sex and Age",
       subtitle = "Majority of both men and women tend to have only one partner in their lives",
       caption = "gss_sm dataset{socviz}")

print(p)

PART 5: LEGEND AND GUIDES

This is just an example – do not simply duplicate these options.

# Turn sex into plain text and sort the rows alphabetically by it
mydata2 <- mydata2 %>%
  mutate(sex = as.character(sex)) %>%
  arrange(sex)
mydata2  # check that the rows are in alphabetical order, starting with Female
# A tibble: 15 × 6
# Groups:   sex [2]
   sex    partners            N age_mean    freq   pct
   <chr>  <fct>           <int>    <dbl>   <dbl> <dbl>
 1 Female No Partners       243     59.9 0.253      25
 2 Female 1 Partner         611     45.7 0.636      64
 3 Female 2 Partners         50     38.0 0.0521      5
 4 Female 3 Partners         27     32.5 0.0281      3
 5 Female 4 Partners         14     32.5 0.0146      1
 6 Female 5-10 Partners      13     31.8 0.0135      1
 7 Female 11-20 Partners      2     55   0.00208     0
 8 Male   No Partners       165     53.5 0.210      21
 9 Male   1 Partner         470     47.5 0.597      60
10 Male   2 Partners         74     39.4 0.0940      9
11 Male   3 Partners         23     43.9 0.0292      3
12 Male   4 Partners         20     35.8 0.0254      3
13 Male   5-10 Partners      25     32.3 0.0318      3
14 Male   11-20 Partners      9     35.1 0.0114      1
15 Male   21-100 Partners     1     72   0.00127     0
# Same scatterplot as before, now with a styled legend title and an explicit
# legend position.
p <- ggplot(mydata2, aes(x = age_mean, y = partners, color = sex)) +
  geom_point(aes(size = N)) +
  # The note is anchored at x = 40, inside the range of the plotted mean ages
  # (roughly 32 to 72), so it no longer stretches the x axis down to 1.6.
  annotate(geom = "text", x = 40, y = "11-20 Partners",
           label = "There are more females with no partners than males",
           hjust = 0, vjust = 2) +
  labs(y = "Number of Partners", x = "Average Age",
       title = "Number of Partners by Sex and Age",
       subtitle = "Majority of both men and women tend to have only one partner in their lives",
       caption = "gss_sm dataset{socviz}") +
  # Text justification is a theme setting; passed to labs() it would only be
  # treated as a label for an aesthetic named "hjust".
  theme(legend.title = element_text(color = "gray50", size = 14, face = "bold"),
        legend.position = "right",
        plot.subtitle = element_text(hjust = 0))
p

PART 6: DATA LABELS VS LEGEND

# Scatterplot with each point labelled directly by sex instead of a legend.
p <- ggplot(mydata2, aes(x = age_mean, y = partners, color = sex))
p + geom_point(aes(size = N)) +
    # hjust > 1 places each label just to the left of its point
    geom_text(mapping = aes(label = sex), hjust = 1.2, size = 3) +
    # The note is anchored at x = 40, inside the range of the plotted mean
    # ages (roughly 32 to 72), so it no longer stretches the x axis to 1.6.
    annotate(geom = "text", x = 40, y = "11-20 Partners",
             label = "There are more females with no partners than males",
             hjust = 0, vjust = 2) +
    # y shows the partners category itself (not an average); color maps to sex
    labs(y = "Number of Partners", x = "Average Age",
         title = "Number of Partners by Sex and Age",
         color = "Sex") +
    # The direct labels replace the legend
    theme(legend.position = "none")

PART 7: INTERPRETATION

Create insights from the visualization.
Summarize those insights as the subtitle.

# Final, fully styled scatterplot: mean age (x) against partners category (y),
# coloured by sex, with point size showing the number of respondents.
p <- ggplot(mydata2, aes(x = age_mean, y = partners, color = sex)) +
  # Semi-transparent points keep overlapping cells visible
  geom_point(aes(size = N), alpha = 0.7) +
  # The note is anchored at x = 40, inside the range of the plotted mean ages
  # (roughly 32 to 72), so it no longer stretches the x axis down to 1.6.
  annotate(geom = "text", x = 40, y = "11-20 Partners",
           label = "Females are more likely than males to have no partners",
           hjust = 0, vjust = 2, size = 4, color = "black") +
  # y shows the partners category itself, not an average
  labs(y = "Number of Partners", x = "Average Age",
       title = "Number of Partners by Sex and Age",
       subtitle = "Majority of both men and women tend to have only one partner in their lives",
       caption = "Data Source: gss_sm dataset{socviz}") +
  # Apply the complete theme first so the tweaks below are not overwritten
  theme_minimal() +
  theme(legend.title = element_text(color = "gray50", size = 14, face = "bold"),
        legend.position = "right",
        plot.title = element_text(size = 16, face = "bold"),
        plot.subtitle = element_text(size = 12, color = "darkgreen"),
        axis.title = element_text(size = 14),
        axis.text = element_text(size = 12),
        legend.text = element_text(size = 10))

print(p)

END