options(scipen=999)
library(tidyverse)
library(socviz)
library(datasetsICR)
# LAB 5 - GSS
PART 1: SUMMARIZE DATA: TWO CATEGORICAL VARIABLES
## Summarize data
library(socviz)
# Count respondents in every degree-by-sex cell, then express each cell as a
# share of its degree level.
pip1 <- gss_sm %>%
  group_by(degree, sex) %>%
  # Drop the grouping explicitly rather than relying on summarize()'s default
  # of peeling off the last grouping variable.
  summarize(N = n(), .groups = "drop") %>%
  # Regroup by degree so that sum(N) below is the degree total, not the grand
  # total; the result stays grouped by degree.
  group_by(degree) %>%
  mutate(
    freq = N / sum(N),
    pct = round(freq * 100, 0)
  )
pip1
# A tibble: 12 × 5
# Groups: degree [6]
degree sex N freq pct
<fct> <fct> <int> <dbl> <dbl>
1 Lt High School Male 147 0.448 45
2 Lt High School Female 181 0.552 55
3 High School Male 662 0.453 45
4 High School Female 799 0.547 55
5 Junior College Male 89 0.412 41
6 Junior College Female 127 0.588 59
7 Bachelor Male 243 0.453 45
8 Bachelor Female 293 0.547 55
9 Graduate Male 132 0.415 42
10 Graduate Female 186 0.585 58
11 <NA> Male 3 0.375 38
12 <NA> Female 5 0.625 62
## Removing N/A values
# Keep only the rows of pip1 that have no missing value in any column.
mydata <- pip1 %>% na.omit()
mydata
# A tibble: 10 × 5
# Groups: degree [5]
degree sex N freq pct
<fct> <fct> <int> <dbl> <dbl>
1 Lt High School Male 147 0.448 45
2 Lt High School Female 181 0.552 55
3 High School Male 662 0.453 45
4 High School Female 799 0.547 55
5 Junior College Male 89 0.412 41
6 Junior College Female 127 0.588 59
7 Bachelor Male 243 0.453 45
8 Bachelor Female 293 0.547 55
9 Graduate Male 132 0.415 42
10 Graduate Female 186 0.585 58
The table breaks respondents down by educational attainment and sex. For each degree category (“Lt High School,” “High School,” “Junior College,” “Bachelor,” and “Graduate”) it reports the count (N), the within-degree proportion (freq), and the rounded percentage (pct). Women make up the majority of respondents at every level of attainment: 55% in “Lt High School,” “High School,” and “Bachelor,” 59% in “Junior College,” and 58% in “Graduate.” The gap therefore does not narrow at higher degrees; the female share among graduate-degree holders is slightly larger than among those with a high-school education. Because the percentages are computed within each degree, they describe the sex composition of each category rather than how likely men or women are to reach it.
PART 2: CREATE STACKED AND DODGED BAR CHARTS FROM 2 CATEGORICAL VARIABLES
# Title and caption shared by the charts in this part.
p_caption <- "gss_sm dataset"
p_title <- "Study Degree by Sex"

# AS STACKED BAR CHART
# Base plot: percent by degree, filled by sex. Rows with a missing degree or
# sex are excluded. The object is reused with `p + ...` further down.
p <- ggplot(
  data = subset(mydata, !(is.na(degree) | is.na(sex))),
  mapping = aes(x = degree, y = pct, fill = sex)
)
p +
  geom_col(position = position_stack()) +
  # Centre each percentage label inside its bar segment.
  geom_text(aes(label = pct), position = position_stack(vjust = .5)) +
  labs(
    title = p_title,
    subtitle = "As a stacked bar chart",
    caption = p_caption,
    x = "Degree",
    y = "Percent",
    fill = "Sex"
  )
# AS DODGED BAR CHART
# The two sexes drawn side by side within each degree.
p +
  geom_col(position = position_dodge2()) +
  # Dodge the labels as well so each one is offset along with its bar.
  geom_text(aes(label = pct), position = position_dodge(width = .9)) +
  labs(
    title = p_title,
    subtitle = "As a dodged bar chart",
    caption = p_caption,
    x = "Degree",
    y = "Percent",
    fill = "Sex"
  )
# AS FACETED HORIZONTAL BAR CHART
# One panel per degree, one horizontal bar per sex.
# Sex is mapped to x as well as to fill: the fill legend is switched off
# below, so the axis labels are what identify each bar. Mapping degree to x
# while also faceting by degree would leave every panel with a single occupied
# position and no way to tell the sexes apart.
ggplot(data = mydata, mapping = aes(x = sex, y = pct, fill = sex)) +
  geom_col() +
  # After coord_flip() the bars run horizontally; hjust > 1 places each label
  # just inside the end of its bar.
  geom_text(aes(label = pct), hjust = 1.2) +
  labs(
    x = NULL, y = "Percent", fill = "Sex",
    title = p_title, caption = p_caption,
    subtitle = "As a faceted horizontal bar chart"
  ) +
  guides(fill = "none") +
  coord_flip() +
  facet_grid(~ degree)
# PART 3: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CONTINUOUS & ONE CATEGORICAL VARIABLE
# Level counts for the `partners` factor. The printed output is kept below as
# comments so the call itself remains valid R.
summary(gss_sm$partners)
#> No Partners 1 Partner 2 Partners
#> 409 1085 124
#> 3 Partners 4 Partners 5-10 Partners
#> 50 34 38
#> 11-20 Partners 21-100 Partners 1 or More, # Unknown
#> 11 1 9
#> NA's
#> 1106
# Five-number summary, mean and NA count for respondent age.
summary(gss_sm$age)
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> 18.00 34.00 49.00 49.16 62.00 89.00 10
# For every sex-by-partners cell: number of respondents, their mean age, and
# the cell's share of all retained respondents of that sex.
mydata2 <- gss_sm %>%
  filter(
    # Keep only levels that state a definite count or range. %in% is never
    # TRUE for NA, so missing `partners` values are dropped here as well.
    partners %in% c(
      "No Partners", "1 Partner", "2 Partners", "3 Partners",
      "4 Partners", "5-10 Partners", "11-20 Partners", "21-100 Partners"
    ),
    !is.na(age)
  ) %>%
  group_by(sex, partners) %>%
  # age has no NA left after the filter, so mean() needs no na.rm.
  summarize(N = n(), age_mean = mean(age), .groups = "drop") %>%
  # Shares are computed within sex; the result stays grouped by sex.
  group_by(sex) %>%
  mutate(
    freq = N / sum(N),
    pct = round(freq * 100, 0)
  )
mydata2
# A tibble: 15 × 6
# Groups: sex [2]
sex partners N age_mean freq pct
<fct> <fct> <int> <dbl> <dbl> <dbl>
1 Male No Partners 165 53.5 0.210 21
2 Male 1 Partner 470 47.5 0.597 60
3 Male 2 Partners 74 39.4 0.0940 9
4 Male 3 Partners 23 43.9 0.0292 3
5 Male 4 Partners 20 35.8 0.0254 3
6 Male 5-10 Partners 25 32.3 0.0318 3
7 Male 11-20 Partners 9 35.1 0.0114 1
8 Male 21-100 Partners 1 72 0.00127 0
9 Female No Partners 243 59.9 0.253 25
10 Female 1 Partner 611 45.7 0.636 64
11 Female 2 Partners 50 38.0 0.0521 5
12 Female 3 Partners 27 32.5 0.0281 3
13 Female 4 Partners 14 32.5 0.0146 1
14 Female 5-10 Partners 13 31.8 0.0135 1
15 Female 11-20 Partners 2 55 0.00208 0
PART 4: SCATTERPLOT WITH A THIRD CATEGORICAL VARIABLE
This code does not remove NA – make sure you deal with that.
# Mean age (x) for each partner-count category (y), coloured by sex, with
# point size showing how many respondents fall in each cell.
p <- ggplot(mydata2, aes(x = age_mean, y = partners, color = sex)) +
  geom_point(aes(size = N)) +
  # Anchor the note at the smallest plotted mean age. A fixed x far below the
  # data would be included in the x scale and squeeze every point against the
  # right-hand edge of the panel.
  annotate(
    geom = "text", x = min(mydata2$age_mean), y = "11-20 Partners",
    label = "There are more females with no partners than males",
    hjust = 0, vjust = 2
  ) +
  # NOTE(review): the subtitle says "in their lives"; confirm the time frame
  # of the `partners` variable in the gss_sm documentation.
  labs(
    y = "Number of Partners", x = "Average Age",
    title = "Number of Partners by Sex and Age",
    subtitle = "Majority of men and women tend to have only one partner in their lives",
    caption = "gss_sm dataset{socviz}"
  )
print(p)
# PART 5: LEGEND AND GUIDES
This is just an example – do not simply duplicate these options.
# Convert sex from a factor to character and sort the rows by it, so the
# ordering is alphabetical instead of following the factor levels.
mydata2$sex <- as.character(mydata2$sex)
mydata2 <- mydata2[order(mydata2$sex),]
mydata2 # check that the dataset is sorted in alphabetical order, starting with Female
# A tibble: 15 × 6
# Groups: sex [2]
sex partners N age_mean freq pct
<chr> <fct> <int> <dbl> <dbl> <dbl>
1 Female No Partners 243 59.9 0.253 25
2 Female 1 Partner 611 45.7 0.636 64
3 Female 2 Partners 50 38.0 0.0521 5
4 Female 3 Partners 27 32.5 0.0281 3
5 Female 4 Partners 14 32.5 0.0146 1
6 Female 5-10 Partners 13 31.8 0.0135 1
7 Female 11-20 Partners 2 55 0.00208 0
8 Male No Partners 165 53.5 0.210 21
9 Male 1 Partner 470 47.5 0.597 60
10 Male 2 Partners 74 39.4 0.0940 9
11 Male 3 Partners 23 43.9 0.0292 3
12 Male 4 Partners 20 35.8 0.0254 3
13 Male 5-10 Partners 25 32.3 0.0318 3
14 Male 11-20 Partners 9 35.1 0.0114 1
15 Male 21-100 Partners 1 72 0.00127 0
# Scatterplot of mean age by partner-count category and sex, with a styled
# legend title and the legend placed to the right of the panel.
p <- ggplot(mydata2, aes(x = age_mean, y = partners, color = sex)) +
  geom_point(aes(size = N)) +
  # Anchor the note at the smallest plotted mean age so that it does not
  # stretch the x scale far below the data.
  annotate(
    geom = "text", x = min(mydata2$age_mean), y = "11-20 Partners",
    label = "There are more females with no partners than males",
    hjust = 0, vjust = 2
  ) +
  # labs() only sets label text; it takes no justification argument, so the
  # stray `hjust` has been removed.
  # NOTE(review): the subtitle says "in their lives"; confirm the time frame
  # of the `partners` variable in the gss_sm documentation.
  labs(
    y = "Number of Partners", x = "Average Age",
    title = "Number of Partners by Sex and Age",
    subtitle = "Majority of men and women tend to have only one partner in their lives",
    caption = "gss_sm dataset{socviz}"
  ) +
  theme(
    legend.title = element_text(color = "gray50", size = 14, face = "bold"),
    legend.position = "right"
  )
p
# PART 6: DATA LABELS VS LEGEND
# Scatterplot in which each point is labelled with its sex directly, so the
# legend is switched off.
p <- ggplot(mydata2, aes(x = age_mean, y = partners, color = sex))
p +
  geom_point(aes(size = N)) +
  # hjust > 1 places each label to the left of its point.
  geom_text(mapping = aes(label = sex), hjust = 1.2, size = 3) +
  # Anchor the note at the smallest plotted mean age so that it does not
  # stretch the x scale far below the data.
  annotate(
    geom = "text", x = min(mydata2$age_mean), y = "11-20 Partners",
    label = "There are more females with no partners than males",
    hjust = 0, vjust = 2
  ) +
  # The y axis shows partner-count categories, not an average, and colour
  # encodes sex.
  labs(
    y = "Number of Partners", x = "Average Age",
    title = "Number of Partners by Sex and Age",
    color = "Sex"
  ) +
  theme(legend.position = "none")
# PART 7: INTERPRETATION
Create insights from the visualization.
Summarize those insights as the subtitle.
# Final version of the scatterplot: mean age by partner-count category and
# sex, with the main insight as the subtitle and a minimal theme.
p <- ggplot(mydata2, aes(x = age_mean, y = partners, color = sex)) +
  # Slight transparency keeps overlapping points visible.
  geom_point(aes(size = N), alpha = 0.7) +
  # Anchor the note at the smallest plotted mean age so that it does not
  # stretch the x scale far below the data.
  annotate(
    geom = "text", x = min(mydata2$age_mean), y = "11-20 Partners",
    label = "Females are more likely than males to have no partner",
    hjust = 0, vjust = 2, size = 4, color = "black"
  ) +
  # The y axis shows partner-count categories, not an average.
  # NOTE(review): the subtitle says "in their lives"; confirm the time frame
  # of the `partners` variable in the gss_sm documentation.
  labs(
    y = "Number of Partners", x = "Average Age",
    title = "Number of Partners by Sex and Age",
    subtitle = "Majority of both men and women tend to have only one partner in their lives",
    caption = "Data Source: gss_sm dataset{socviz}"
  ) +
  theme_minimal() +
  # theme() comes after theme_minimal() so these settings are not overwritten.
  theme(
    legend.title = element_text(color = "gray50", size = 14, face = "bold"),
    legend.position = "right",
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "darkgreen"),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 10)
  )
print(p)