# Script setup: display options, package loading, and a first look at the data.

# Strongly prefer fixed notation over scientific notation when printing numbers.
options(scipen = 999)
library(tidyverse)
library(socviz)
library(datasetsICR)

# LAB 5 - Tables, Labels, & Notes

# Preview the first rows of gss_sm.
head(gss_sm) # A tibble: 6 × 32
year id ballot age childs sibs degree race sex region incom…¹ relig
<dbl> <dbl> <label> <dbl> <dbl> <lab> <fct> <fct> <fct> <fct> <fct> <fct>
1 2016 1 1 47 3 2 Bache… White Male New E… $17000… None
2 2016 2 2 61 0 3 High … White Male New E… $50000… None
3 2016 3 3 72 2 3 Bache… White Male New E… $75000… Cath…
4 2016 4 1 43 4 3 High … White Fema… New E… $17000… Cath…
5 2016 5 3 55 2 2 Gradu… White Fema… New E… $17000… None
6 2016 6 2 53 2 2 Junio… White Fema… New E… $60000… None
# … with 20 more variables: marital <fct>, padeg <fct>, madeg <fct>,
# partyid <fct>, polviews <fct>, happy <fct>, partners <fct>, grass <fct>,
# zodiac <fct>, pres12 <labelled>, wtssall <dbl>, income_rc <fct>,
# agegrp <fct>, ageq <fct>, siblings <fct>, kids <fct>, religion <fct>,
# bigregion <fct>, partners_rc <fct>, obama <dbl>, and abbreviated variable
# name ¹income16
PART 1: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CATEGORICAL VARIABLES
library(socviz)

# Count respondents for every partners_rc x happy combination, then express
# each count as a share (freq) and rounded percentage (pct) of its partners_rc
# group.
pip1 <- gss_sm %>%
  group_by(partners_rc, happy) %>%
  # Keep the result grouped by partners_rc so that sum(N) below is a
  # within-group total; stating .groups explicitly also avoids the regrouping
  # message summarize() prints when it has to pick a default.
  summarize(N = n(), .groups = "drop_last") %>%
  mutate(
    freq = N / sum(N),
    pct = round(freq * 100, 0)
  )
pip1 # A tibble: 21 × 5
# Groups: partners_rc [6]
partners_rc happy N freq pct
<fct> <fct> <int> <dbl> <dbl>
1 0 Very Happy 80 0.196 20
2 0 Pretty Happy 230 0.562 56
3 0 Not Too Happy 98 0.240 24
4 0 <NA> 1 0.00244 0
5 1 Very Happy 354 0.326 33
6 1 Pretty Happy 588 0.542 54
7 1 Not Too Happy 141 0.130 13
8 1 <NA> 2 0.00184 0
9 2 Very Happy 19 0.153 15
10 2 Pretty Happy 80 0.645 65
# … with 11 more rows
PART 2: CREATE STACKED AND DODGED BAR CHARTS FROM 2 CATEGORICAL VARIABLES
# Title and caption text shared by the bar charts in this part.
p_title <- "Happiness by Number of Partners"
p_caption <- "gss_sm dataset"

# AS STACKED BAR CHART
# Base plot object: rows missing either variable are dropped, partner category
# goes on the x axis, percent on the y axis, and happiness level sets the fill.
p <- pip1 %>%
  filter(!is.na(happy), !is.na(partners_rc)) %>%
  ggplot(mapping = aes(x = partners_rc, y = pct, fill = happy))

# Stack the happiness levels within each bar and print each percentage in the
# middle of its segment.
p +
  geom_col(position = "stack") +
  geom_text(
    aes(label = pct),
    position = position_stack(vjust = 0.5)
  ) +
  labs(
    title = p_title,
    subtitle = "More than half of people regardless of the number of partners they have tend to be pretty happy.",
    caption = p_caption,
    x = "Number of Partners",
    y = "Percent",
    fill = "Happiness"
  ) +
  theme_minimal()

# AS DODGED BAR CHART
# Side-by-side bars per happiness level, with each percentage printed just
# above its bar.
p +
  geom_col(position = "dodge2") +
  geom_text(
    aes(label = pct),
    position = position_dodge(width = 0.9),
    vjust = -0.3 # vjust - ChatGPT
  ) +
  labs(
    title = p_title,
    subtitle = "Those with no or one partner have a higher probability of being very happy.",
    caption = p_caption,
    x = "Number of Partners",
    y = "Percent",
    fill = "Happiness"
  ) +
  theme_minimal()

# AS FACETED HORIZONTAL BAR CHART
# One panel per happiness level with horizontal bars. The fill legend is
# switched off with guides(fill = "none"); the facets created by
# facet_grid(~ happy) already separate the happiness levels.
p + geom_col(position = "dodge2") +
  labs(
    x = NULL, y = "Percent", fill = "Happiness",
    title = p_title, caption = p_caption,
    subtitle = "Except for those who are single there is a negative correlation between number of partners and happiness."
  ) +
  guides(fill = "none") +
  coord_flip() +
  facet_grid(~ happy) +
  # The stray trailing comma that left an empty argument in this call is gone.
  geom_text(aes(label = pct), position = position_dodge2(width = 1)) +
  theme_light()

# PART 3: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CONTINUOUS & ONE CATEGORICAL VARIABLE
# Derive numeric columns by stripping every character except digits and "."
# from the factor labels and converting what remains to a number.
# NOTE(review): partners_no is not used anywhere later in this file, and a
# range-style label (e.g. "5-10") would collapse into one number — confirm the
# partners levels before relying on it.
gss_sm <- gss_sm %>%
  mutate(
    partners_no = as.numeric(gsub("[^0-9.]", "", partners)),
    income_no = as.numeric(gsub("[^0-9.]", "", income_rc))
  )

# Per happiness level: respondent count, mean income, mean number of siblings,
# and the level's share of all respondents (freq, and pct rounded to a whole
# number).
pip2 <- gss_sm %>%
  group_by(happy) %>%
  summarise(
    N = n(),
    income_mean = mean(income_no, na.rm = TRUE),
    sibs_mean = mean(sibs, na.rm = TRUE)
  ) %>%
  mutate(
    freq = N / sum(N),
    pct = round(100 * freq, 0)
  )
pip2 # A tibble: 4 × 6
happy N income_mean sibs_mean freq pct
<fct> <int> <dbl> <dbl> <dbl> <dbl>
1 Very Happy 806 70207. 3.76 0.281 28
2 Pretty Happy 1601 58177. 3.52 0.558 56
3 Not Too Happy 452 31730. 4.28 0.158 16
4 <NA> 8 32000 7.29 0.00279 0
PART 4: SCATTERPLOT WITH A THIRD CATEGORICAL VARIABLE
This code does not remove NA – make sure you deal with that.
# Scatterplot of the per-happiness-level means from pip2 (NA level removed):
# mean income on x, mean number of siblings on y, colored by happiness.
p <- ggplot(
  data = subset(pip2, !is.na(happy)),
  aes(x = income_mean, y = sibs_mean, color = happy)
)
p + geom_point(size = 5) +
  # Left-aligned note anchored next to the "Not Too Happy" point.
  annotate(
    geom = "text", x = 31729, y = 4.23,
    label = "People with more siblings and less income tend to be not too happy.",
    hjust = 0
  ) +
  labs(
    y = "Average Number of Siblings", x = "Average Income",
    title = "Income and Number of Siblings by Happiness",
    subtitle = "People with higher income and fewer siblings are more likely to be very happy.",
    # Must be `=`: `caption <- ...` would create a global variable and pass an
    # unnamed argument, so no caption would be drawn.
    caption = "gss_sm dataset{socviz}",
    color = "Happiness"
  )

# PART 5: LEGEND AND GUIDES
This is just an example – do not simply duplicate these options.
# Same scatterplot of pip2 means (NA level removed), now with dollar-formatted
# x-axis labels and a restyled, repositioned legend.
p <- ggplot(
  data = subset(pip2, !is.na(happy)),
  aes(x = income_mean, y = sibs_mean, color = happy)
)
p + geom_point(size = 5) +
  annotate(
    geom = "text", x = 31729, y = 4.23,
    label = "People with more siblings and less income tend to be not too happy.",
    hjust = 0
  ) +
  labs(
    y = "Average Number of Siblings", x = "Average Income",
    title = "Income and Number of Siblings by Happiness",
    subtitle = "People with higher income and fewer siblings are more likely to be very happy.",
    # Must be `=`: `caption <- ...` would create a global variable and pass an
    # unnamed argument, so no caption would be drawn.
    caption = "gss_sm dataset{socviz}",
    color = "Happiness"
  ) +
  scale_x_continuous(labels = scales::dollar_format()) +
  theme(
    legend.title = element_text(color = "gray50", size = 14, face = "bold"),
    # Numeric position: legend is placed at these relative coordinates
    # instead of beside the panel.
    legend.position = c(x = 0.11, y = .2)
  )

# PART 6: DATA LABELS VS LEGEND
# Same scatterplot of pip2 means (NA level removed), but each point is labeled
# directly with its happiness level and the legend is removed.
p <- ggplot(
  data = subset(pip2, !is.na(happy)),
  aes(x = income_mean, y = sibs_mean, color = happy)
)
p + geom_point(size = 5) +
  # hjust > 1 pushes each label to the left of its point.
  geom_text(mapping = aes(label = happy), hjust = 1.2, size = 3) +
  annotate(
    geom = "text", x = 31729, y = 4.23,
    label = "People with more siblings and less income tend to be not too happy.",
    hjust = 0
  ) +
  labs(
    y = "Average Number of Siblings", x = "Average Income",
    title = "Income and Number of Siblings by Happiness",
    subtitle = "People with higher income and fewer siblings are more likely to be very happy.",
    # Must be `=`: `caption <- ...` would create a global variable and pass an
    # unnamed argument, so no caption would be drawn.
    caption = "gss_sm dataset{socviz}",
    color = "Happiness"
  ) +
  scale_x_continuous(labels = scales::dollar_format()) +
  coord_cartesian(xlim = c(25000, 72000)) + # ChatGPT
  theme(legend.position = "none")

# PART 7: INTERPRETATION
- Happiness by Number of Partners
- As observed in the stacked, dodged, and faceted horizontal bar charts, those with one partner had the highest probability of being very happy. Regardless of the number of partners, there was at least a 50% probability of being pretty happy. Except for those who are single, there appears to be a negative correlation between the number of partners and happiness. The break in the trend among those who are single may be caused by a stigma, since those who are single have the second-highest probability of being very happy.
- Income and Number of Siblings by Happiness
- Those with a lower income and more siblings tend not to be happy. The distinguishing trait between those who are pretty happy and those who are very happy is that the very happy tend to have slightly more siblings and a higher income. Income appears to be a significant factor in determining whether one is happy. Perhaps income contributes to the negative skew in the happiness of those who are single, since those who are married are known to earn more money than their single counterparts.