Challenge 8

Read in and Tidy Data

I chose to analyze the military marriages dataset. Specifically, I chose to analyze the Army and the Navy sheets of the dataset.

First, we can read in the data, give custom more descriptive column names, and remove the rows which don’t correspond to an individual pay grade.

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readxl)


# More descriptive custom column names
column_names <- c(
  "pay_grade", "single_no_child_male", "single_no_child_female", "single_no_child_total",
  "single_with_child_male", "single_with_child_female", "single_with_child_total",
  "joint_male", "joint_female", "joint_total", "civilian_male", "civilian_female",
  "civilian_total", "total_male", "total_female", "total_total"
)

army <- read_excel("../challenge_datasets/ActiveDuty_MaritalStatus.xls", sheet="Army", range="B10:Q35", col_names=column_names) %>% filter(pay_grade != "TOTAL ENLISTED" & pay_grade != "TOTAL OFFICER")
army

# A tibble: 24 × 16
   pay_grade single_no_child_male single_no_child_female single_no_child_total
   <chr>                    <dbl>                  <dbl>                 <dbl>
 1 E-1                       9456                   1309                 10765
 2 E-2                      21600                   3324                 24924
 3 E-3                      38335                   5651                 43986
 4 E-4                      50758                   6333                 57091
 5 E-5                      17020                   2678                 19698
 6 E-6                       7497                   1640                  9137
 7 E-7                       2146                    829                  2975
 8 E-8                        460                    194                   654
 9 E-9                        134                     88                   222
10 O-1                       3908                    898                  4806
# ℹ 14 more rows
# ℹ 12 more variables: single_with_child_male <dbl>,
#   single_with_child_female <dbl>, single_with_child_total <dbl>,
#   joint_male <dbl>, joint_female <dbl>, joint_total <dbl>,
#   civilian_male <dbl>, civilian_female <dbl>, civilian_total <dbl>,
#   total_male <dbl>, total_female <dbl>, total_total <dbl>

navy <- read_excel("../challenge_datasets/ActiveDuty_MaritalStatus.xls", sheet="Navy", range="B10:Q34", col_names=column_names) %>% filter(pay_grade != "TOTAL ENLISTED" & pay_grade != "TOTAL OFFICER")
navy

# A tibble: 23 × 16
   pay_grade single_no_child_male single_no_child_female single_no_child_total
   <chr>                    <dbl>                  <dbl>                 <dbl>
 1 E-1                       7820                   2275                 10095
 2 E-2                      11198                   2718                 13916
 3 E-3                      28163                   6396                 34559
 4 E-4                      23285                   4266                 27551
 5 E-5                      18856                   3649                 22505
 6 E-6                       5917                   1429                  7346
 7 E-7                       1294                    395                  1689
 8 E-8                        219                     84                   303
 9 E-9                         96                     56                   152
10 O-1                       3529                    911                  4440
# ℹ 13 more rows
# ℹ 12 more variables: single_with_child_male <dbl>,
#   single_with_child_female <dbl>, single_with_child_total <dbl>,
#   joint_male <dbl>, joint_female <dbl>, joint_total <dbl>,
#   civilian_male <dbl>, civilian_female <dbl>, civilian_total <dbl>,
#   total_male <dbl>, total_female <dbl>, total_total <dbl>

Then, we can mutate pay_grade to be a factor and all other variables to be ints.

army$pay_grade <- as.factor(army$pay_grade)
army <- army %>% mutate_if(is.double, as.integer)
navy$pay_grade <- as.factor(navy$pay_grade)
navy <- navy %>% mutate_if(is.double, as.integer)
army

# A tibble: 24 × 16
   pay_grade single_no_child_male single_no_child_female single_no_child_total
   <fct>                    <int>                  <int>                 <int>
 1 E-1                       9456                   1309                 10765
 2 E-2                      21600                   3324                 24924
 3 E-3                      38335                   5651                 43986
 4 E-4                      50758                   6333                 57091
 5 E-5                      17020                   2678                 19698
 6 E-6                       7497                   1640                  9137
 7 E-7                       2146                    829                  2975
 8 E-8                        460                    194                   654
 9 E-9                        134                     88                   222
10 O-1                       3908                    898                  4806
# ℹ 14 more rows
# ℹ 12 more variables: single_with_child_male <int>,
#   single_with_child_female <int>, single_with_child_total <int>,
#   joint_male <int>, joint_female <int>, joint_total <int>,
#   civilian_male <int>, civilian_female <int>, civilian_total <int>,
#   total_male <int>, total_female <int>, total_total <int>

navy

# A tibble: 23 × 16
   pay_grade single_no_child_male single_no_child_female single_no_child_total
   <fct>                    <int>                  <int>                 <int>
 1 E-1                       7820                   2275                 10095
 2 E-2                      11198                   2718                 13916
 3 E-3                      28163                   6396                 34559
 4 E-4                      23285                   4266                 27551
 5 E-5                      18856                   3649                 22505
 6 E-6                       5917                   1429                  7346
 7 E-7                       1294                    395                  1689
 8 E-8                        219                     84                   303
 9 E-9                         96                     56                   152
10 O-1                       3529                    911                  4440
# ℹ 13 more rows
# ℹ 12 more variables: single_with_child_male <int>,
#   single_with_child_female <int>, single_with_child_total <int>,
#   joint_male <int>, joint_female <int>, joint_total <int>,
#   civilian_male <int>, civilian_female <int>, civilian_total <int>,
#   total_male <int>, total_female <int>, total_total <int>

Join

Because there are a lot of columns, joining the entire two dataframes could be messy. Instead, let’s look at joining just the total counts for the army and the navy.

First, we can get dataframes which just correspond to the total counts.

tot_army <- army %>% select(pay_grade, total_male, total_female, total_total) %>%
  rename(
    army_male = total_male,
    army_female = total_female,
    army_total = total_total
  )
tot_army

# A tibble: 24 × 4
   pay_grade army_male army_female army_total
   <fct>         <int>       <int>      <int>
 1 E-1           12440        1785      14225
 2 E-2           29535        4727      34262
 3 E-3           60047       10224      70271
 4 E-4          119114       18878     137992
 5 E-5           74170       10814      84984
 6 E-6           58278        7558      65836
 7 E-7           36316        4731      41047
 8 E-8           11249        1136      12385
 9 E-9            3270         308       3578
10 O-1            6653        1615       8268
# ℹ 14 more rows

tot_navy <- navy %>% select(pay_grade, total_male, total_female, total_total) %>%
  rename(
    navy_male = total_male,
    navy_female = total_female,
    navy_total = total_total
  )
tot_navy

# A tibble: 23 × 4
   pay_grade navy_male navy_female navy_total
   <fct>         <int>       <int>      <int>
 1 E-1            8773        2528      11301
 2 E-2           14022        3357      17379
 3 E-3           41002       10263      51265
 4 E-4           42055        8674      50729
 5 E-5           55375        9807      65182
 6 E-6           40700        5735      46435
 7 E-7           19873        1810      21683
 8 E-8            5669         366       6035
 9 E-9            2439         150       2589
10 O-1            5584        1320       6904
# ℹ 13 more rows

Then, we can left join the navy with the army. Since the army has one more pay-grade “W1”, this will eliminate the pay grade so we are only comparing information that we actually have for both datasets:

army_navy <- left_join(tot_navy, tot_army, by="pay_grade")
army_navy

# A tibble: 23 × 7
   pay_grade navy_male navy_female navy_total army_male army_female army_total
   <fct>         <int>       <int>      <int>     <int>       <int>      <int>
 1 E-1            8773        2528      11301     12440        1785      14225
 2 E-2           14022        3357      17379     29535        4727      34262
 3 E-3           41002       10263      51265     60047       10224      70271
 4 E-4           42055        8674      50729    119114       18878     137992
 5 E-5           55375        9807      65182     74170       10814      84984
 6 E-6           40700        5735      46435     58278        7558      65836
 7 E-7           19873        1810      21683     36316        4731      41047
 8 E-8            5669         366       6035     11249        1136      12385
 9 E-9            2439         150       2589      3270         308       3578
10 O-1            5584        1320       6904      6653        1615       8268
# ℹ 13 more rows

We can then graph the membership by paygrade for the army for both men and women:

library(ggplot2)

# Male grpah
ggplot(army_navy, aes(x = pay_grade)) +
  geom_bar(aes(y = navy_male, fill = "Navy"), stat = "identity", position = "dodge", alpha = 0.7) +
  geom_bar(aes(y = army_male, fill = "Army"), stat = "identity", position = "dodge", alpha = 0.5) +
  labs(title = "Male Membership by Paygrade in the Army vs Navy",
       x = "Pay Grade",
       y = "Number of Male Members") +
  scale_fill_manual(name = "Military Wing", values = c("Navy" = "green", "Army" = "black")) +
  theme_minimal()

# Female graph
ggplot(army_navy, aes(x = pay_grade)) +
  geom_bar(aes(y = navy_female, fill = "Navy"), stat = "identity", position = "dodge", alpha = 0.7) +
  geom_bar(aes(y = army_female, fill = "Army"), stat = "identity", position = "dodge", alpha = 0.5) +
  labs(title = "Female Membership by Paygrade in the Army vs Navy",
       x = "Pay Grade",
       y = "Number of Male Members") +
  scale_fill_manual(name = "Military Wing", values = c("Navy" = "green", "Army" = "black")) +
  theme_minimal()

We see that at every position except for female E-1’s, the army is more populous than the navy. We also see that the most common pay grade for army males is E-4 while it is E-5 for navy males, and for females it is E-4 and E-2 respectively.