Read in and Tidy Data
I chose to analyze the military marriages dataset. Specifically, I chose to analyze the Army and the Navy sheets of the dataset.
First, we can read in the data, give custom more descriptive column names, and remove the rows which don’t correspond to an individual pay grade.
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.4.4 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.0
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library (readxl)
# More descriptive custom column names
column_names <- c (
"pay_grade" , "single_no_child_male" , "single_no_child_female" , "single_no_child_total" ,
"single_with_child_male" , "single_with_child_female" , "single_with_child_total" ,
"joint_male" , "joint_female" , "joint_total" , "civilian_male" , "civilian_female" ,
"civilian_total" , "total_male" , "total_female" , "total_total"
)
army <- read_excel ("../challenge_datasets/ActiveDuty_MaritalStatus.xls" , sheet= "Army" , range= "B10:Q35" , col_names= column_names) %>% filter (pay_grade != "TOTAL ENLISTED" & pay_grade != "TOTAL OFFICER" )
army
# A tibble: 24 × 16
pay_grade single_no_child_male single_no_child_female single_no_child_total
<chr> <dbl> <dbl> <dbl>
1 E-1 9456 1309 10765
2 E-2 21600 3324 24924
3 E-3 38335 5651 43986
4 E-4 50758 6333 57091
5 E-5 17020 2678 19698
6 E-6 7497 1640 9137
7 E-7 2146 829 2975
8 E-8 460 194 654
9 E-9 134 88 222
10 O-1 3908 898 4806
# ℹ 14 more rows
# ℹ 12 more variables: single_with_child_male <dbl>,
# single_with_child_female <dbl>, single_with_child_total <dbl>,
# joint_male <dbl>, joint_female <dbl>, joint_total <dbl>,
# civilian_male <dbl>, civilian_female <dbl>, civilian_total <dbl>,
# total_male <dbl>, total_female <dbl>, total_total <dbl>
navy <- read_excel ("../challenge_datasets/ActiveDuty_MaritalStatus.xls" , sheet= "Navy" , range= "B10:Q34" , col_names= column_names) %>% filter (pay_grade != "TOTAL ENLISTED" & pay_grade != "TOTAL OFFICER" )
navy
# A tibble: 23 × 16
pay_grade single_no_child_male single_no_child_female single_no_child_total
<chr> <dbl> <dbl> <dbl>
1 E-1 7820 2275 10095
2 E-2 11198 2718 13916
3 E-3 28163 6396 34559
4 E-4 23285 4266 27551
5 E-5 18856 3649 22505
6 E-6 5917 1429 7346
7 E-7 1294 395 1689
8 E-8 219 84 303
9 E-9 96 56 152
10 O-1 3529 911 4440
# ℹ 13 more rows
# ℹ 12 more variables: single_with_child_male <dbl>,
# single_with_child_female <dbl>, single_with_child_total <dbl>,
# joint_male <dbl>, joint_female <dbl>, joint_total <dbl>,
# civilian_male <dbl>, civilian_female <dbl>, civilian_total <dbl>,
# total_male <dbl>, total_female <dbl>, total_total <dbl>
Then, we can mutate pay_grade to be a factor and all other variables to be ints.
army$ pay_grade <- as.factor (army$ pay_grade)
army <- army %>% mutate_if (is.double, as.integer)
navy$ pay_grade <- as.factor (navy$ pay_grade)
navy <- navy %>% mutate_if (is.double, as.integer)
army
# A tibble: 24 × 16
pay_grade single_no_child_male single_no_child_female single_no_child_total
<fct> <int> <int> <int>
1 E-1 9456 1309 10765
2 E-2 21600 3324 24924
3 E-3 38335 5651 43986
4 E-4 50758 6333 57091
5 E-5 17020 2678 19698
6 E-6 7497 1640 9137
7 E-7 2146 829 2975
8 E-8 460 194 654
9 E-9 134 88 222
10 O-1 3908 898 4806
# ℹ 14 more rows
# ℹ 12 more variables: single_with_child_male <int>,
# single_with_child_female <int>, single_with_child_total <int>,
# joint_male <int>, joint_female <int>, joint_total <int>,
# civilian_male <int>, civilian_female <int>, civilian_total <int>,
# total_male <int>, total_female <int>, total_total <int>
# A tibble: 23 × 16
pay_grade single_no_child_male single_no_child_female single_no_child_total
<fct> <int> <int> <int>
1 E-1 7820 2275 10095
2 E-2 11198 2718 13916
3 E-3 28163 6396 34559
4 E-4 23285 4266 27551
5 E-5 18856 3649 22505
6 E-6 5917 1429 7346
7 E-7 1294 395 1689
8 E-8 219 84 303
9 E-9 96 56 152
10 O-1 3529 911 4440
# ℹ 13 more rows
# ℹ 12 more variables: single_with_child_male <int>,
# single_with_child_female <int>, single_with_child_total <int>,
# joint_male <int>, joint_female <int>, joint_total <int>,
# civilian_male <int>, civilian_female <int>, civilian_total <int>,
# total_male <int>, total_female <int>, total_total <int>
Join
Because there are a lot of columns, joining the entire two dataframes could be messy. Instead, let’s look at joining just the total counts for the army and the navy.
First, we can get dataframes which just correspond to the total counts.
tot_army <- army %>% select (pay_grade, total_male, total_female, total_total) %>%
rename (
army_male = total_male,
army_female = total_female,
army_total = total_total
)
tot_army
# A tibble: 24 × 4
pay_grade army_male army_female army_total
<fct> <int> <int> <int>
1 E-1 12440 1785 14225
2 E-2 29535 4727 34262
3 E-3 60047 10224 70271
4 E-4 119114 18878 137992
5 E-5 74170 10814 84984
6 E-6 58278 7558 65836
7 E-7 36316 4731 41047
8 E-8 11249 1136 12385
9 E-9 3270 308 3578
10 O-1 6653 1615 8268
# ℹ 14 more rows
tot_navy <- navy %>% select (pay_grade, total_male, total_female, total_total) %>%
rename (
navy_male = total_male,
navy_female = total_female,
navy_total = total_total
)
tot_navy
# A tibble: 23 × 4
pay_grade navy_male navy_female navy_total
<fct> <int> <int> <int>
1 E-1 8773 2528 11301
2 E-2 14022 3357 17379
3 E-3 41002 10263 51265
4 E-4 42055 8674 50729
5 E-5 55375 9807 65182
6 E-6 40700 5735 46435
7 E-7 19873 1810 21683
8 E-8 5669 366 6035
9 E-9 2439 150 2589
10 O-1 5584 1320 6904
# ℹ 13 more rows
Then, we can left join the navy with the army. Since the army has one more pay-grade “W1”, this will eliminate the pay grade so we are only comparing information that we actually have for both datasets:
army_navy <- left_join (tot_navy, tot_army, by= "pay_grade" )
army_navy
# A tibble: 23 × 7
pay_grade navy_male navy_female navy_total army_male army_female army_total
<fct> <int> <int> <int> <int> <int> <int>
1 E-1 8773 2528 11301 12440 1785 14225
2 E-2 14022 3357 17379 29535 4727 34262
3 E-3 41002 10263 51265 60047 10224 70271
4 E-4 42055 8674 50729 119114 18878 137992
5 E-5 55375 9807 65182 74170 10814 84984
6 E-6 40700 5735 46435 58278 7558 65836
7 E-7 19873 1810 21683 36316 4731 41047
8 E-8 5669 366 6035 11249 1136 12385
9 E-9 2439 150 2589 3270 308 3578
10 O-1 5584 1320 6904 6653 1615 8268
# ℹ 13 more rows
We can then graph the membership by paygrade for the army for both men and women:
library (ggplot2)
# Male grpah
ggplot (army_navy, aes (x = pay_grade)) +
geom_bar (aes (y = navy_male, fill = "Navy" ), stat = "identity" , position = "dodge" , alpha = 0.7 ) +
geom_bar (aes (y = army_male, fill = "Army" ), stat = "identity" , position = "dodge" , alpha = 0.5 ) +
labs (title = "Male Membership by Paygrade in the Army vs Navy" ,
x = "Pay Grade" ,
y = "Number of Male Members" ) +
scale_fill_manual (name = "Military Wing" , values = c ("Navy" = "green" , "Army" = "black" )) +
theme_minimal ()
# Female graph
ggplot (army_navy, aes (x = pay_grade)) +
geom_bar (aes (y = navy_female, fill = "Navy" ), stat = "identity" , position = "dodge" , alpha = 0.7 ) +
geom_bar (aes (y = army_female, fill = "Army" ), stat = "identity" , position = "dodge" , alpha = 0.5 ) +
labs (title = "Female Membership by Paygrade in the Army vs Navy" ,
x = "Pay Grade" ,
y = "Number of Male Members" ) +
scale_fill_manual (name = "Military Wing" , values = c ("Navy" = "green" , "Army" = "black" )) +
theme_minimal ()
We see that at every position except for female E-1’s, the army is more populous than the navy. We also see that the most common pay grade for army males is E-4 while it is E-5 for navy males, and for females it is E-4 and E-2 respectively.