library(ggplot2)
library(socviz)
# Open the data viewer only in an interactive session: View() needs a GUI and
# is of no use when the script is sourced or rendered non-interactively.
if (interactive()) {
  View(gss_sm)
}
LAB 5
PART 1: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CATEGORICAL VARIABLES
library(socviz)
library(dplyr)
# Summarize data using dplyr: count respondents in every marital x sex cell,
# then express each cell as a share of its marital-status group.
pip1 <- gss_sm %>%
  filter(!is.na(marital) & !is.na(sex)) %>% # Remove missing values
  group_by(marital, sex) %>%
  # Drop only the innermost grouping (sex) so the result stays grouped by
  # marital; that is what makes sum(N) below a per-marital-status total.
  # Stating it explicitly also silences the summarise() regrouping message.
  summarize(N = n(), .groups = "drop_last") %>%
  mutate(
    freq = N / sum(N),
    pct = round(freq * 100, 0)
  )
# Display the summary table
print(pip1)
# A tibble: 10 × 5
# Groups: marital [5]
marital sex N freq pct
<fct> <fct> <int> <dbl> <dbl>
1 Married Male 565 0.466 47
2 Married Female 647 0.534 53
3 Widowed Male 62 0.247 25
4 Widowed Female 189 0.753 75
5 Divorced Male 209 0.422 42
6 Divorced Female 286 0.578 58
7 Separated Male 34 0.333 33
8 Separated Female 68 0.667 67
9 Never Married Male 405 0.502 50
10 Never Married Female 401 0.498 50
Interpretation:
This table shows the gender distribution within each marital status category. In the Married category, 47% are males and 53% are females. In the Widowed category, 25% are males and 75% are females. In the Divorced category, 42% are males and 58% are females. In the Separated category, 33% are males and 67% are females. In the Never Married category, 50% are males and 50% are females.
PART 2: CREATE STACKED AND DODGED BAR CHARTS FROM 2 CATEGORICAL VARIABLES
# Title and caption text used by the bar charts.
p_title <- "Marital Status by Sex"
p_caption <- "gss_sm dataset"
# AS STACKED BAR CHART
# Base plot: percent on the y axis, marital status on the x axis, bars filled
# by sex. Rows with a missing marital status or sex are excluded.
p <- ggplot(
  data = subset(pip1, !(is.na(marital) | is.na(sex))),
  mapping = aes(x = marital, y = pct, fill = sex)
)
# Stack the two sexes in one bar per marital status and print each percent in
# the middle of its segment.
p +
  geom_col(position = "stack") +
  geom_text(aes(label = pct), position = position_stack(vjust = .5)) +
  labs(
    title = p_title,
    subtitle = "As a stacked bar chart",
    caption = p_caption,
    x = "Marital Status",
    y = "Percent",
    fill = "Sex"
  )
# AS DODGED BAR CHART
# Place the bars for the two sexes side by side within each marital status and
# label every bar with its percent.
p +
  geom_col(position = "dodge2") +
  geom_text(aes(label = pct), position = position_dodge(width = .9)) +
  labs(
    title = p_title,
    subtitle = "As a dodged bar chart",
    caption = p_caption,
    x = "Marital Status",
    y = "Percent",
    fill = "Sex"
  )
# AS FACETED HORIZONTAL BAR CHART
# One panel per marital status, with a horizontal bar for each sex.
# The base plot maps x to marital, which is also the facet variable; combined
# with the hidden fill legend that left the two sexes unidentified and every
# panel with four empty axis positions. Mapping x to sex here puts the
# Male/Female labels on the axis, so the legend can stay hidden.
p + aes(x = sex) +
  geom_col(position = "dodge2") +
  labs(x = NULL, y = "Percent", fill = "Sex",
       title = p_title, caption = p_caption,
       subtitle = "As a faceted horizontal bar chart") +
  guides(fill = "none") +
  coord_flip() +
  facet_grid(~ marital) +
  geom_text(aes(label = pct), position = position_dodge2(width = 1))
PART 3: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CONTINUOUS & ONE CATEGORICAL VARIABLE
# Tabulate the partners variable; the counts it printed are listed below.
summary(gss_sm$partners)
No Partners 1 Partner 2 Partners
409 1085 124
3 Partners 4 Partners 5-10 Partners
50 34 38
11-20 Partners 21-100 Partners 1 or More, # Unknown
11 1 9
NA's
1106
# Tabulate the income16 brackets; the counts it printed are listed below.
summary(gss_sm$income16)
under $1 000 $1 000 to 2 999 $3 000 to 3 999 $4 000 to 4 999
36 35 24 13
$5 000 to 5 999 $6 000 to 6 999 $7 000 to 7 999 $8 000 to 9 999
19 15 23 48
$10000 to 12499 $12500 to 14999 $15000 to 17499 $17500 to 19999
88 71 65 65
$20000 to 22499 $22500 to 24999 $25000 to 29999 $30000 to 34999
95 101 115 131
$35000 to 39999 $40000 to 49999 $50000 to 59999 $60000 to 74999
126 218 207 258
$75000 to $89999 $90000 to $109999 $110000 to $129999 $130000 to $149999
214 179 126 99
$150000 to $169999 $170000 or over NA's
61 164 271
# Convert income bracket labels such as "$1 000 to 2 999" to a dollar amount.
#   - two bounds ("X to Y")  -> midpoint of the bracket
#   - "under X"              -> half of the upper bound
#   - any other single bound -> that bound (e.g. the open-ended "X or over")
#   - no digits / NA         -> NA
# Stripping every non-digit and converting the remainder would instead glue
# both bounds into one number ("$1 000 to 2 999" -> 10002999).
income_midpoint <- function(labels) {
  # Remove the space used as a thousands separator ("1 000" -> "1000").
  cleaned <- gsub("(?<=[0-9]) (?=[0-9]{3}\\b)", "", as.character(labels),
                  perl = TRUE)
  bounds <- regmatches(cleaned, gregexpr("[0-9]+", cleaned))
  vapply(seq_along(bounds), function(i) {
    b <- as.numeric(bounds[[i]])
    if (length(b) == 2L) {
      mean(b)
    } else if (length(b) == 1L && grepl("under", cleaned[i], fixed = TRUE)) {
      b / 2
    } else if (length(b) == 1L) {
      b
    } else {
      NA_real_
    }
  }, numeric(1))
}

# Partner categories with a known count; "1 or More, # Unknown" and NA are
# left out.
counted_partners <- c("No Partners", "1 Partner", "2 Partners", "3 Partners",
                      "4 Partners", "5-10 Partners", "11-20 Partners",
                      "21-100 Partners")

# Per sex x partners cell: number of respondents and mean bracket-midpoint
# income, plus each cell's share of its sex.
pip2 <- gss_sm %>%
  filter(partners %in% counted_partners) %>%
  # "NA's" is only the label summary() prints for missing values, not a level
  # of income16, so is.na() is the only missing-value check required.
  filter(!is.na(income16)) %>%
  group_by(sex, partners) %>%
  # Keep the result grouped by sex so sum(N) below is a per-sex total.
  summarize(N = n(),
            income_mean = mean(income_midpoint(income16)),
            .groups = "drop_last") %>%
  mutate(freq = N / sum(N),
         pct = round((freq * 100), 0))
# NOTE: the table printed below was produced by the earlier digit-concatenating
# income parse (hence the ~1e10 means); re-run to refresh income_mean.
pip2
# A tibble: 15 × 6
# Groups: sex [2]
sex partners N income_mean freq pct
<fct> <fct> <int> <dbl> <dbl> <dbl>
1 Male No Partners 151 13815477691. 0.203 20
2 Male 1 Partner 450 29537659837. 0.606 61
3 Male 2 Partners 70 23092349006. 0.0942 9
4 Male 3 Partners 21 9706257094. 0.0283 3
5 Male 4 Partners 18 34909528332. 0.0242 2
6 Male 5-10 Partners 23 26947899064. 0.0310 3
7 Male 11-20 Partners 9 16767843055. 0.0121 1
8 Male 21-100 Partners 1 6000074999 0.00135 0
9 Female No Partners 219 10249177483. 0.25 25
10 Female 1 Partner 560 26225200230. 0.639 64
11 Female 2 Partners 46 11880920466. 0.0525 5
12 Female 3 Partners 22 16058229772. 0.0251 3
13 Female 4 Partners 14 3429327356. 0.0160 2
14 Female 5-10 Partners 13 21100822461. 0.0148 1
15 Female 11-20 Partners 2 66125087499 0.00228 0
Interpretation
The table presents a detailed overview of the distribution of partners and the corresponding mean income for both males and females. For males, those with 1 partner have the highest mean income, closely followed by those with 2 partners. Interestingly, males with no partners also exhibit a substantial mean income. The trend of higher mean income for various partner counts continues for males. On the female side, a similar pattern emerges, with females having 1 partner displaying the highest mean income. Additionally, females with 2 partners also show a relatively high mean income. This table does make sense.
PART 4: SCATTERPLOT WITH A THIRD CATEGORICAL VARIABLE
This code does not remove NA – make sure you deal with that.
::: {.cell}
# Scatterplot of pip2: mean income on x, partner category on y, points
# coloured by sex, with a text annotation anchored at the "11-20 Partners" row.
scatterplot <- ggplot(pip2, aes(x = income_mean, y = partners, color = sex)) +
  geom_point(size = 5) +
  annotate(geom = "text", x = 1.6, y = "11-20 Partners",
           label = "Males show to have a higher number
of partners compared to women", hjust = -0.5) +
  labs(y = "Number of Partners", x = "Mean Income",
       title = "Relationship Between Mean Income and Number of Partners",
       subtitle = "Exploring the correlation and patterns",
       caption = "gss_sm dataset{socviz}")
# Print the plot. The closing ::: fence goes on its own line; fused onto the
# object name it is a parse error.
scatterplot
:::
PART 5: LEGEND AND GUIDES
This is just an example – do not simply duplicate these options.
::: {.cell}
# Add a character copy of sex and sort the rows of pip2 by it.
pip2$sex.c <- as.character(pip2$sex)
pip2 <- pip2[order(pip2$sex.c), ]
# Same scatterplot as the basic version, now with a styled legend title and
# the legend positioned by coordinates.
scatterplot <- ggplot(pip2, aes(x = income_mean, y = partners, color = sex)) +
  geom_point(size = 5) +
  annotate(geom = "text", x = 1.9, y = "11-20 Partners",
           label = "Males show to have a higher number
of partners compared to women", hjust = -0.4) +
  labs(y = "Number of Partners", x = "Mean Income",
       title = "Relationship Between Mean Income and Number of Partners",
       subtitle = "Exploring the correlation and patterns",
       caption = "gss_sm dataset{socviz}") +
  # NOTE(review): a numeric legend.position is deprecated from ggplot2 3.5.0
  # in favour of legend.position = "inside" + legend.position.inside; switch
  # once the installed ggplot2 version is confirmed.
  theme(legend.title = element_text(color = "grey50", size = 12, face = "bold"),
        legend.position = c(x = 0.90, y = 0.3))
# Print the plot. The closing ::: fence goes on its own line; fused onto the
# object name it is a parse error.
scatterplot
:::
PART 6: DATA LABELS VS LEGEND
# Scatterplot with the sex written next to every point; the colour legend is
# hidden because these data labels carry the same information.
scatterplot <- ggplot(pip2, aes(x = income_mean, y = partners, color = sex)) +
  geom_point(size = 5) +
  geom_text(mapping = aes(label = sex), hjust = 1.2, vjust = 1.5, size = 3) +
  annotate(geom = "text", x = 1.6, y = "11-20 Partners",
           label = "Males show to have a higher number of
partners compared to women", hjust = -0.5) +
  labs(y = "Number of Partners", x = "Mean Income",
       title = "Relationship Between Mean Income and Number of Partners",
       subtitle = "Exploring the correlation and patterns",
       caption = "gss_sm dataset{socviz}") +
  theme(legend.position = "none")
# Print the plot. The next section heading goes on its own line; fused onto
# the object name it is a parse error.
scatterplot
PART 7: INTERPRETATION
Create insights from the visualization.
Summarize those insights as the subtitle.
The relationship between mean income and number of partners tends to slope upward for both sexes. This suggests a positive correlation between the two variables: as average income increases, the number of partners increases as well.
# Labelled scatterplot whose subtitle states the main takeaway. Points are
# drawn first, then the per-point sex labels, then the free-text annotation.
scatterplot <- ggplot(
  data = pip2,
  mapping = aes(x = income_mean, y = partners, color = sex)
) +
  labs(
    title = "Relationship Between Mean Income and Number of Partners",
    subtitle = "Positive correlation between average income and number of partners for both sexes",
    caption = "gss_sm dataset{socviz}",
    x = "Mean Income",
    y = "Number of Partners"
  ) +
  theme(legend.position = "none") +
  geom_point(size = 5) +
  geom_text(aes(label = sex), size = 3, hjust = 1.2, vjust = 1.5) +
  annotate(
    "text",
    x = 1.6, y = "11-20 Partners", hjust = -0.5,
    label = "Males show to have a higher number of
partners compared to women"
  )
scatterplot