LAB 5

Author

Katlyn Collins

library(ggplot2)
library(socviz)
# Open the spreadsheet-style viewer for gss_sm only in an interactive session.
# View() needs a GUI data viewer and can fail when this file is rendered or
# run non-interactively (e.g. with Rscript).
if (interactive()) {
  View(gss_sm)
}

PART 1: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CATEGORICAL VARIABLES

library(socviz)
library(dplyr)

# Cross-tabulate marital status by sex with dplyr.
# Rows missing either variable are dropped first. After summarise() the data
# is still grouped by marital, so freq and pct are the share of each sex
# WITHIN a marital-status category.
pip1 <- gss_sm %>%
  filter(!is.na(marital), !is.na(sex)) %>%
  group_by(marital, sex) %>%
  summarise(N = n()) %>%
  mutate(
    freq = N / sum(N),
    pct = round(100 * freq, 0)
  )

# Show the resulting summary table
print(pip1)
# A tibble: 10 × 5
# Groups:   marital [5]
   marital       sex        N  freq   pct
   <fct>         <fct>  <int> <dbl> <dbl>
 1 Married       Male     565 0.466    47
 2 Married       Female   647 0.534    53
 3 Widowed       Male      62 0.247    25
 4 Widowed       Female   189 0.753    75
 5 Divorced      Male     209 0.422    42
 6 Divorced      Female   286 0.578    58
 7 Separated     Male      34 0.333    33
 8 Separated     Female    68 0.667    67
 9 Never Married Male     405 0.502    50
10 Never Married Female   401 0.498    50

Interpretation:

This table shows the gender distribution within each marital status category. In the married category, 47% are males and 53% are females. In the widowed category, 25% are males and 75% are females. In the Divorced category, 42% are males and 58% are females. In the Separated category, 33% are males and 67% are females. In the never-married category, 50% are males and 50% are females.


PART 2: CREATE STACKED AND DODGED BAR CHARTS FROM 2 CATEGORICAL VARIABLES

# Title and caption shared by every chart in this part
p_title <- "Marital Status by Sex"
p_caption <- "gss_sm dataset"

# AS STACKED BAR CHART
# Base plot reused by the charts below: marital status on x, percent on y,
# bars filled by sex. Rows with a missing marital status or sex are excluded.
p <- ggplot(
  data = subset(pip1, !(is.na(marital) | is.na(sex))),
  mapping = aes(x = marital, y = pct, fill = sex)
)

# Stack the sexes on top of each other and print each percentage in the
# vertical middle of its own bar segment.
p +
  geom_col(position = position_stack()) +
  geom_text(
    mapping = aes(label = pct),
    position = position_stack(vjust = 0.5)
  ) +
  labs(
    title = p_title,
    subtitle = "As a stacked bar chart",
    caption = p_caption,
    x = "Marital Status",
    y = "Percent",
    fill = "Sex"
  )

# AS DODGED BAR CHART
# Bars for the two sexes sit side by side; the text layer is dodged by 0.9
# so each label lines up with its own bar.
p +
  geom_col(position = position_dodge2()) +
  geom_text(
    mapping = aes(label = pct),
    position = position_dodge(width = 0.9)
  ) +
  labs(
    title = p_title,
    subtitle = "As a dodged bar chart",
    caption = p_caption,
    x = "Marital Status",
    y = "Percent",
    fill = "Sex"
  )

# AS FACETED HORIZONTAL BAR CHART
# One panel per marital status, one horizontal bar per sex.
# Sex is mapped to x (shown on the vertical axis after coord_flip()) so that
# each bar is labelled on the axis; this matters because the fill legend is
# switched off. Mapping marital to x while also faceting on marital would
# repeat all five categories in every panel with only one of them populated.
ggplot(data = pip1, mapping = aes(x = sex, y = pct, fill = sex)) +
  geom_col() +
  # Print the percentage in the middle of each bar
  geom_text(
    mapping = aes(label = pct),
    position = position_stack(vjust = 0.5)
  ) +
  coord_flip() +
  facet_grid(~ marital) +
  guides(fill = "none") +
  labs(
    x = NULL, y = "Percent", fill = "Sex",
    title = p_title, caption = p_caption,
    subtitle = "As a faceted horizontal bar chart"
  )

PART 3: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CONTINUOUS & ONE CATEGORICAL VARIABLE

# Count respondents in each category of the partners variable (NA's included)
summary(gss_sm$partners)
         No Partners            1 Partner           2 Partners 
                 409                 1085                  124 
          3 Partners           4 Partners        5-10 Partners 
                  50                   34                   38 
      11-20 Partners      21-100 Partners 1 or More, # Unknown 
                  11                    1                    9 
                NA's 
                1106 
# Count respondents in each income16 bracket (NA's included)
summary(gss_sm$income16)
      under $1 000    $1 000 to 2 999    $3 000 to 3 999    $4 000 to 4 999 
                36                 35                 24                 13 
   $5 000 to 5 999    $6 000 to 6 999    $7 000 to 7 999    $8 000 to 9 999 
                19                 15                 23                 48 
   $10000 to 12499    $12500 to 14999    $15000 to 17499    $17500 to 19999 
                88                 71                 65                 65 
   $20000 to 22499    $22500 to 24999    $25000 to 29999    $30000 to 34999 
                95                101                115                131 
   $35000 to 39999    $40000 to 49999    $50000 to 59999    $60000 to 74999 
               126                218                207                258 
  $75000 to $89999  $90000 to $109999 $110000 to $129999 $130000 to $149999 
               214                179                126                 99 
$150000 to $169999    $170000 or over               NA's 
                61                164                271 
# Convert income-bracket labels (e.g. "$10000 to 12499") to one dollar value
# per respondent: the midpoint of the bracket.
#
# Stripping every non-digit from the whole label would glue the two bounds
# together ("$10000 to 12499" -> 1000012499), so each bound is parsed
# separately instead.
#   - "$1 000 to 2 999"  -> mean(1000, 2999)
#   - "under $1 000"     -> mean(0, 1000)
#   - "$170000 or over"  -> 170000 (open-ended bracket: lower bound is used)
#
# income: factor or character vector of bracket labels.
# Returns a numeric vector of the same length; NA labels stay NA.
income_midpoint <- function(income) {
  bounds <- strsplit(as.character(income), " to ", fixed = TRUE)
  vapply(
    bounds,
    function(b) {
      if (length(b) == 0L || anyNA(b)) {
        return(NA_real_)
      }
      values <- as.numeric(gsub("[^0-9]", "", b))
      if (length(values) == 1L && grepl("under", b, ignore.case = TRUE)) {
        values <- c(0, values)
      }
      mean(values)
    },
    numeric(1)
  )
}

# Mean income and share of respondents by sex and number of partners.
# Only the numeric partner categories are kept ("1 or More, # Unknown" and NA
# are excluded), and rows with a missing income bracket are dropped. "NA's" is
# only a label printed by summary(), not a level of income16, so is.na() is
# the only missing-value check needed.
pip2 <- gss_sm %>%
  filter(
    partners %in% c(
      "No Partners", "1 Partner", "2 Partners", "3 Partners",
      "4 Partners", "5-10 Partners", "11-20 Partners", "21-100 Partners"
    ),
    !is.na(income16)
  ) %>%
  group_by(sex, partners) %>%
  summarize(
    N = n(),
    income_mean = mean(income_midpoint(income16)),
    .groups = "drop"
  ) %>%
  # Regroup by sex so that freq and pct are shares within each sex
  group_by(sex) %>%
  mutate(
    freq = N / sum(N),
    pct = round(freq * 100, 0)
  )
pip2
# A tibble: 15 × 6
# Groups:   sex [2]
   sex    partners            N  income_mean    freq   pct
   <fct>  <fct>           <int>        <dbl>   <dbl> <dbl>
 1 Male   No Partners       151 13815477691. 0.203      20
 2 Male   1 Partner         450 29537659837. 0.606      61
 3 Male   2 Partners         70 23092349006. 0.0942      9
 4 Male   3 Partners         21  9706257094. 0.0283      3
 5 Male   4 Partners         18 34909528332. 0.0242      2
 6 Male   5-10 Partners      23 26947899064. 0.0310      3
 7 Male   11-20 Partners      9 16767843055. 0.0121      1
 8 Male   21-100 Partners     1  6000074999  0.00135     0
 9 Female No Partners       219 10249177483. 0.25       25
10 Female 1 Partner         560 26225200230. 0.639      64
11 Female 2 Partners         46 11880920466. 0.0525      5
12 Female 3 Partners         22 16058229772. 0.0251      3
13 Female 4 Partners         14  3429327356. 0.0160      2
14 Female 5-10 Partners      13 21100822461. 0.0148      1
15 Female 11-20 Partners      2 66125087499  0.00228     0

Interpretation

The table presents a detailed overview of the distribution of partners and the corresponding mean income for both males and females. For males, those with 1 partner have the highest mean income, closely followed by those with 2 partners. Interestingly, males with no partners also exhibit a substantial mean income. The trend of higher mean income for various partner counts continues for males. On the female side, a similar pattern emerges, with females having 1 partner displaying the highest mean income. Additionally, females with 2 partners also show a relatively high mean income. This table does make sense.

PART 4: SCATTERPLOT WITH A THIRD CATEGORICAL VARIABLE

This code does not remove NA – make sure you deal with that.
::: {.cell}

# Scatterplot of mean income (x) against partner category (y), colored by sex,
# with a text annotation placed on the "11-20 Partners" row.
# The annotation uses an explicit "\n" for its line break: breaking the string
# literal across source lines would embed the code indentation in the label.
scatterplot <- ggplot(pip2, aes(x = income_mean, y = partners, color = sex)) +
  geom_point(size = 5) +
  annotate(
    geom = "text", x = 1.6, y = "11-20 Partners",
    label = "Males show to have a higher number\nof partners compared to women",
    hjust = -0.5
  ) +
  labs(
    y = "Number of Partners", x = "Mean Income",
    title = "Relationship Between Mean Income and Number of Partners",
    subtitle = "Exploring the correlation and patterns",
    caption = "gss_sm dataset{socviz}"
  )

scatterplot

:::

PART 5: LEGEND AND GUIDES

This is just an example – do not simply duplicate these options.
::: {.cell}

# Add a character copy of sex and sort the rows by it (alphabetical order).
pip2$sex.c <- as.character(pip2$sex)
pip2 <- pip2[order(pip2$sex.c),]

# Same scatterplot as before, now with a styled legend title and the legend
# placed inside the plotting area.
# The annotation uses an explicit "\n" for its line break: breaking the string
# literal across source lines would embed the code indentation in the label.
scatterplot <- ggplot(pip2, aes(x = income_mean, y = partners, color = sex)) +
  geom_point(size = 5) +
  annotate(
    geom = "text", x = 1.9, y = "11-20 Partners",
    label = "Males show to have a higher number\nof partners compared to women",
    hjust = -0.4
  ) +
  labs(
    y = "Number of Partners", x = "Mean Income",
    title = "Relationship Between Mean Income and Number of Partners",
    subtitle = "Exploring the correlation and patterns",
    caption = "gss_sm dataset{socviz}"
  ) +
  # NOTE(review): a numeric legend.position is deprecated as of ggplot2 3.5.0
  # in favour of legend.position = "inside" plus legend.position.inside;
  # confirm the installed ggplot2 version before switching.
  theme(
    legend.title = element_text(color = "grey50", size = 12, face = "bold"),
    legend.position = c(x = 0.90, y = 0.3)
  )

scatterplot

:::

PART 6: DATA LABELS VS LEGEND

# Scatterplot with direct data labels instead of a legend: each point is
# labelled with its sex and the color legend is hidden.
# The annotation uses an explicit "\n" for its line break: breaking the string
# literal across source lines would embed a trailing space and the code
# indentation in the label.
scatterplot <- ggplot(pip2, aes(x = income_mean, y = partners, color = sex)) +
  geom_point(size = 5) +
  geom_text(mapping = aes(label = sex), hjust = 1.2, vjust = 1.5, size = 3) +
  annotate(
    geom = "text", x = 1.6, y = "11-20 Partners",
    label = "Males show to have a higher number of\npartners compared to women",
    hjust = -0.5
  ) +
  labs(
    y = "Number of Partners", x = "Mean Income",
    title = "Relationship Between Mean Income and Number of Partners",
    subtitle = "Exploring the correlation and patterns",
    caption = "gss_sm dataset{socviz}"
  ) +
  theme(legend.position = "none")

scatterplot

PART 7: INTERPRETATION

Create insights from the visualization.
Summarize those insights as the subtitle.

The slope between mean income and number of partners tends to increase for both sexes. This shows a positive correlation between the two variables: as average income increases, the number of partners increases as well.

# Final scatterplot: direct data labels, no legend, and the main insight
# stated in the subtitle.
# The annotation uses an explicit "\n" for its line break: breaking the string
# literal across source lines would embed a trailing space and the code
# indentation in the label.
scatterplot <- ggplot(pip2, aes(x = income_mean, y = partners, color = sex)) +
  geom_point(size = 5) +
  geom_text(mapping = aes(label = sex), hjust = 1.2, vjust = 1.5, size = 3) +
  annotate(
    geom = "text", x = 1.6, y = "11-20 Partners",
    label = "Males show to have a higher number of\npartners compared to women",
    hjust = -0.5
  ) +
  labs(
    y = "Number of Partners", x = "Mean Income",
    title = "Relationship Between Mean Income and Number of Partners",
    subtitle = "Positive correlation between average income and number of partners for both sexes",
    caption = "gss_sm dataset{socviz}"
  ) +
  theme(legend.position = "none")

scatterplot

END