library(tidyverse)
library(ggplot2)
library(readxl)
library(here)
library(dplyr)
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)Challenge 8 Instructions
Challenge Overview
Today’s challenge is to:
- read in multiple data sets, and describe the data set using both words and any supporting information (e.g., tables, etc)
- tidy data as needed.
- mutate variables as needed.
- join two or more data sets and analyze some aspect of the joined data.
Read in data
Read in one (or more) of the following datasets, using the correct R package and command.
- military marriages ⭐
- faostat ⭐⭐
- snl ⭐⭐⭐⭐
Solutions
Reading the Data
The working directory for RStudio has been set such that “ActiveDuty_MaritalStatus.xls” can be found at the root of the working directory using the setwd() method. The file contains 5 sheets.
total_orig <- read_excel(here("ActiveDuty_MaritalStatus.xls"),
sheet = 1,
range = "B10:N35",
col_names = c("Pay Grade", "Single Without Children - Male", "Single Without Children - Female",
"Single Without Children - Total", "Single With Children - Male",
"Single With Children - Female", "Single With Children - Total",
"Joint Service Marriage - Male", "Joint Service Marriage - Female",
"Joint Service Marriage - Total","Civilian Marriage - Male", "Civilian Marriage - Female",
"Civilian Marriage - Total")) %>%
filter(!str_detect(`Pay Grade`, "TOTAL"))
total_orig# A tibble: 24 × 13
`Pay Grade` `Single Without Children - Male` Single Without Children - Fema…¹
<chr> <dbl> <dbl>
1 E-1 31229 5717
2 E-2 53094 8388
3 E-3 131091 21019
4 E-4 112710 16381
5 E-5 57989 11021
6 E-6 19125 4654
7 E-7 5446 1913
8 E-8 1009 438
9 E-9 381 202
10 O-1 13495 3081
# ℹ 14 more rows
# ℹ abbreviated name: ¹`Single Without Children - Female`
# ℹ 10 more variables: `Single Without Children - Total` <dbl>,
# `Single With Children - Male` <dbl>, `Single With Children - Female` <dbl>,
# `Single With Children - Total` <dbl>,
# `Joint Service Marriage - Male` <dbl>,
# `Joint Service Marriage - Female` <dbl>, …
airforce_orig <- read_excel(here("ActiveDuty_MaritalStatus.xls"),
sheet = 2,
range = "B10:N29",
col_names = c("Pay Grade", "Single Without Children - Male", "Single Without Children - Female",
"Single Without Children - Total", "Single With Children - Male",
"Single With Children - Female", "Single With Children - Total",
"Joint Service Marriage - Male", "Joint Service Marriage - Female",
"Joint Service Marriage - Total","Civilian Marriage - Male", "Civilian Marriage - Female",
"Civilian Marriage - Total")) %>%
filter(!str_detect(`Pay Grade`, "TOTAL"))
airforce_orig# A tibble: 19 × 13
`Pay Grade` `Single Without Children - Male` Single Without Children - Fema…¹
<chr> <dbl> <dbl>
1 E-1 7721 1550
2 E-2 4380 1010
3 E-3 29725 7108
4 E-4 20805 4756
5 E-5 14623 4104
6 E-6 3660 1377
7 E-7 1441 617
8 E-8 182 139
9 E-9 83 48
10 O-1 3831 1068
11 O-2 2710 819
12 O-3 4035 1642
13 O-4 1077 597
14 O-5 425 300
15 O-6 101 107
16 O-7 0 2
17 O-8 1 3
18 O-9 0 0
19 O-10 0 0
# ℹ abbreviated name: ¹`Single Without Children - Female`
# ℹ 10 more variables: `Single Without Children - Total` <dbl>,
# `Single With Children - Male` <dbl>, `Single With Children - Female` <dbl>,
# `Single With Children - Total` <dbl>,
# `Joint Service Marriage - Male` <dbl>,
# `Joint Service Marriage - Female` <dbl>,
# `Joint Service Marriage - Total` <dbl>, `Civilian Marriage - Male` <dbl>, …
marine_orig <- read_excel(here("ActiveDuty_MaritalStatus.xls"),
sheet = 3,
range = "B10:N35",
col_names = c("Pay Grade", "Single Without Children - Male", "Single Without Children - Female",
"Single Without Children - Total", "Single With Children - Male",
"Single With Children - Female", "Single With Children - Total",
"Joint Service Marriage - Male", "Joint Service Marriage - Female",
"Joint Service Marriage - Total","Civilian Marriage - Male", "Civilian Marriage - Female",
"Civilian Marriage - Total")) %>%
filter(!str_detect(`Pay Grade`, "TOTAL"))
marine_orig# A tibble: 24 × 13
`Pay Grade` `Single Without Children - Male` Single Without Children - Fema…¹
<chr> <dbl> <dbl>
1 E-1 6232 583
2 E-2 15916 1336
3 E-3 34868 1864
4 E-4 17862 1026
5 E-5 7490 590
6 E-6 2051 208
7 E-7 565 72
8 E-8 148 21
9 E-9 68 10
10 O-1 2227 204
# ℹ 14 more rows
# ℹ abbreviated name: ¹`Single Without Children - Female`
# ℹ 10 more variables: `Single Without Children - Total` <dbl>,
# `Single With Children - Male` <dbl>, `Single With Children - Female` <dbl>,
# `Single With Children - Total` <dbl>,
# `Joint Service Marriage - Male` <dbl>,
# `Joint Service Marriage - Female` <dbl>, …
navy_orig <- read_excel(here("ActiveDuty_MaritalStatus.xls"),
sheet = 4,
range = "B10:N34",
col_names = c("Pay Grade", "Single Without Children - Male", "Single Without Children - Female",
"Single Without Children - Total", "Single With Children - Male",
"Single With Children - Female", "Single With Children - Total",
"Joint Service Marriage - Male", "Joint Service Marriage - Female",
"Joint Service Marriage - Total","Civilian Marriage - Male", "Civilian Marriage - Female",
"Civilian Marriage - Total")) %>%
filter(!str_detect(`Pay Grade`, "TOTAL"))
navy_orig# A tibble: 23 × 13
`Pay Grade` `Single Without Children - Male` Single Without Children - Fema…¹
<chr> <dbl> <dbl>
1 E-1 7820 2275
2 E-2 11198 2718
3 E-3 28163 6396
4 E-4 23285 4266
5 E-5 18856 3649
6 E-6 5917 1429
7 E-7 1294 395
8 E-8 219 84
9 E-9 96 56
10 O-1 3529 911
# ℹ 13 more rows
# ℹ abbreviated name: ¹`Single Without Children - Female`
# ℹ 10 more variables: `Single Without Children - Total` <dbl>,
# `Single With Children - Male` <dbl>, `Single With Children - Female` <dbl>,
# `Single With Children - Total` <dbl>,
# `Joint Service Marriage - Male` <dbl>,
# `Joint Service Marriage - Female` <dbl>, …
army_orig <- read_excel(here("ActiveDuty_MaritalStatus.xls"),
sheet = 5,
range = "B10:N35",
col_names = c("Pay Grade", "Single Without Children - Male", "Single Without Children - Female",
"Single Without Children - Total", "Single With Children - Male",
"Single With Children - Female", "Single With Children - Total",
"Joint Service Marriage - Male", "Joint Service Marriage - Female",
"Joint Service Marriage - Total","Civilian Marriage - Male", "Civilian Marriage - Female",
"Civilian Marriage - Total")) %>%
filter(!str_detect(`Pay Grade`, "TOTAL"))
army_orig# A tibble: 24 × 13
`Pay Grade` `Single Without Children - Male` Single Without Children - Fema…¹
<chr> <dbl> <dbl>
1 E-1 9456 1309
2 E-2 21600 3324
3 E-3 38335 5651
4 E-4 50758 6333
5 E-5 17020 2678
6 E-6 7497 1640
7 E-7 2146 829
8 E-8 460 194
9 E-9 134 88
10 O-1 3908 898
# ℹ 14 more rows
# ℹ abbreviated name: ¹`Single Without Children - Female`
# ℹ 10 more variables: `Single Without Children - Total` <dbl>,
# `Single With Children - Male` <dbl>, `Single With Children - Female` <dbl>,
# `Single With Children - Total` <dbl>,
# `Joint Service Marriage - Male` <dbl>,
# `Joint Service Marriage - Female` <dbl>, …
Data Description
High Level Description
An observation within each of the dataframes extracted from the sheets reports the count of active duty personnel within a specific pay grade based on their family type. Each of the dataframes has 13 variables representing the family type further segregated based on gender and a total for that family type. In addition, the Pay Grade variable represents the corresponding pay grade under observation.
How was the Data likely collected?
The data is likely to have been collected through Department of Defence (DoD) polls conducted on families with active duty military personnel across all military branches. Depending on the serving member from a family it has further been segregated based on their gender.
Tidying the Data
The current data is wide and difficult to work with. Each of the dataframes needs to be converted into a long and narrow form for easier data analysis. The pivot_longer method is used to do so below. Additionally, since we’ll be joining dataframes across different branches a “Branch” column is also added through a mutation.
airforce <- airforce_orig %>%
pivot_longer(cols=`Single Without Children - Male`:`Civilian Marriage - Total`, names_to = "Family Type", values_to = "Count") %>%
mutate(Branch="airforce")
airforce# A tibble: 228 × 4
`Pay Grade` `Family Type` Count Branch
<chr> <chr> <dbl> <chr>
1 E-1 Single Without Children - Male 7721 airforce
2 E-1 Single Without Children - Female 1550 airforce
3 E-1 Single Without Children - Total 9271 airforce
4 E-1 Single With Children - Male 27 airforce
5 E-1 Single With Children - Female 5 airforce
6 E-1 Single With Children - Total 32 airforce
7 E-1 Joint Service Marriage - Male 49 airforce
8 E-1 Joint Service Marriage - Female 27 airforce
9 E-1 Joint Service Marriage - Total 76 airforce
10 E-1 Civilian Marriage - Male 1064 airforce
# ℹ 218 more rows
marine <- marine_orig %>%
pivot_longer(cols=`Single Without Children - Male`:`Civilian Marriage - Total`, names_to = "Family Type", values_to = "Count") %>%
mutate(Branch="marine")
marine# A tibble: 288 × 4
`Pay Grade` `Family Type` Count Branch
<chr> <chr> <dbl> <chr>
1 E-1 Single Without Children - Male 6232 marine
2 E-1 Single Without Children - Female 583 marine
3 E-1 Single Without Children - Total 6815 marine
4 E-1 Single With Children - Male 54 marine
5 E-1 Single With Children - Female 3 marine
6 E-1 Single With Children - Total 57 marine
7 E-1 Joint Service Marriage - Male 20 marine
8 E-1 Joint Service Marriage - Female 19 marine
9 E-1 Joint Service Marriage - Total 39 marine
10 E-1 Civilian Marriage - Male 611 marine
# ℹ 278 more rows
navy <- navy_orig %>%
pivot_longer(cols=`Single Without Children - Male`:`Civilian Marriage - Total`, names_to = "Family Type", values_to = "Count") %>%
mutate(Branch="navy")
navy# A tibble: 276 × 4
`Pay Grade` `Family Type` Count Branch
<chr> <chr> <dbl> <chr>
1 E-1 Single Without Children - Male 7820 navy
2 E-1 Single Without Children - Female 2275 navy
3 E-1 Single Without Children - Total 10095 navy
4 E-1 Single With Children - Male 117 navy
5 E-1 Single With Children - Female 34 navy
6 E-1 Single With Children - Total 151 navy
7 E-1 Joint Service Marriage - Male 30 navy
8 E-1 Joint Service Marriage - Female 57 navy
9 E-1 Joint Service Marriage - Total 87 navy
10 E-1 Civilian Marriage - Male 806 navy
# ℹ 266 more rows
army <- army_orig %>%
pivot_longer(cols=`Single Without Children - Male`:`Civilian Marriage - Total`, names_to = "Family Type", values_to = "Count") %>%
mutate(Branch="army")
army# A tibble: 288 × 4
`Pay Grade` `Family Type` Count Branch
<chr> <chr> <dbl> <chr>
1 E-1 Single Without Children - Male 9456 army
2 E-1 Single Without Children - Female 1309 army
3 E-1 Single Without Children - Total 10765 army
4 E-1 Single With Children - Male 365 army
5 E-1 Single With Children - Female 80 army
6 E-1 Single With Children - Total 445 army
7 E-1 Joint Service Marriage - Male 40 army
8 E-1 Joint Service Marriage - Female 38 army
9 E-1 Joint Service Marriage - Total 78 army
10 E-1 Civilian Marriage - Male 2579 army
# ℹ 278 more rows
Joining Dataframes
We now join all our tidied dataframes into a single dataframe and perform analysis on the data.
military <- full_join(army, marine) %>%
full_join(navy) %>%
full_join(airforce)
military# A tibble: 1,080 × 4
`Pay Grade` `Family Type` Count Branch
<chr> <chr> <dbl> <chr>
1 E-1 Single Without Children - Male 9456 army
2 E-1 Single Without Children - Female 1309 army
3 E-1 Single Without Children - Total 10765 army
4 E-1 Single With Children - Male 365 army
5 E-1 Single With Children - Female 80 army
6 E-1 Single With Children - Total 445 army
7 E-1 Joint Service Marriage - Male 40 army
8 E-1 Joint Service Marriage - Female 38 army
9 E-1 Joint Service Marriage - Total 78 army
10 E-1 Civilian Marriage - Male 2579 army
# ℹ 1,070 more rows
The following query first filters only the total counts for each family type to improve visibility of the plot. The facet_wrap function used allows visualizing grouped bar plots across all military branches for all family types in our data.
military %>%
filter(str_ends(`Family Type`, "Total")) %>%
ggplot(aes(fill=Branch, y=Count, x=`Pay Grade`)) +
geom_bar(position="dodge", stat="identity") +
facet_wrap(~ `Family Type`, scales = "free_y") +
labs(title = "Grouped Bar Plot \nPay Grades for Different Family Types \nacross Military Branches", x = "Pay Grades", y = "Personnel Count", fill = "Military Branch") +
theme_minimal() +
theme(plot.title = element_text(hjust=0.5),
axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))From the above plots, we observe that for “Single Without Children”, “Single With Children” and “Civilian” type families “army” has the highest number of personnel at the “E-4” pay grade. However, for “Join Service Marriage” families “airforce” tops the number of personnel list at the “E-5” pay grade. The following query can be used to get exact numbers for each “Family Type”:
military %>%
filter(str_ends(`Family Type`, "Total")) %>%
group_by(`Family Type`) %>%
top_n(1, Count) %>%
ungroup()# A tibble: 4 × 4
`Pay Grade` `Family Type` Count Branch
<chr> <chr> <dbl> <chr>
1 E-4 Single Without Children - Total 57091 army
2 E-4 Single With Children - Total 9609 army
3 E-4 Civilian Marriage - Total 64327 army
4 E-5 Joint Service Marriage - Total 11673 airforce
Now, let’s assume we want to find out the distribution of genders across all military branches irrespective of the family type.
The following query creates a new dataframe with a new Gender column extracted based on the Family Type of that observation.
military_gender <- military %>%
filter(!str_detect(`Family Type`, "Total")) %>%
mutate(Gender=ifelse(grepl("Male", `Family Type`), "Male", "Female"))
military_gender# A tibble: 720 × 5
`Pay Grade` `Family Type` Count Branch Gender
<chr> <chr> <dbl> <chr> <chr>
1 E-1 Single Without Children - Male 9456 army Male
2 E-1 Single Without Children - Female 1309 army Female
3 E-1 Single With Children - Male 365 army Male
4 E-1 Single With Children - Female 80 army Female
5 E-1 Joint Service Marriage - Male 40 army Male
6 E-1 Joint Service Marriage - Female 38 army Female
7 E-1 Civilian Marriage - Male 2579 army Male
8 E-1 Civilian Marriage - Female 358 army Female
9 E-2 Single Without Children - Male 21600 army Male
10 E-2 Single Without Children - Female 3324 army Female
# ℹ 710 more rows
We can now get the distributions of genders across all branches:
gender_distribution <- military_gender %>%
group_by(Branch, Gender) %>%
summarize(Count = sum(Count)) %>%
ungroup() %>%
group_by(Branch) %>%
mutate(Percentage = Count / sum(Count)) %>%
ungroup()
gender_distribution# A tibble: 8 × 4
Branch Gender Count Percentage
<chr> <chr> <dbl> <dbl>
1 airforce Female 64116 0.193
2 airforce Male 267677 0.807
3 army Female 74849 0.134
4 army Male 482095 0.866
5 marine Female 13135 0.0651
6 marine Male 188708 0.935
7 navy Female 50623 0.156
8 navy Male 273748 0.844
These can also be plotted on a pie-chart across all military branches:
gender_distribution %>%
ggplot(aes(x = "", y = Percentage, fill = Gender)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y", start = 0) +
theme_void() +
facet_wrap(~Branch) +
labs(title = "Gender Distribution by Branch",
fill = "Gender",
x = NULL,
y = NULL) +
scale_fill_manual(values = c("Male" = "blue", "Female" = "pink")) +
theme(plot.title = element_text(hjust=0.5))From the above plots, we observe that “airforce” has the highest distribution of female personnel across all branches while “army” has the least gender distribution.