Synopsis

This R Markdown Notebook is my report for the Data Wrangling with R class assignment for Week 5.
The following report uses tidyverse and ggplot2 to potray transformation of messy data into ‘tidy’ data and visualizion for exploratory data analysis.

Data Source

For this report we are looking at the Bomber data sets, which provides information about some of the United States Air Force Bombers.

Packages required

library(tidyverse)
library(dplyr)
library(ggplot2)
library(readr)
library(magrittr)
library(lubridate)
library(scales)
library(stringr)

About the data

Complete description on the variables in the data set can be found here.


Questions

Problem 1

Import the bomber_wide.rds file, which lists the flying hours for each aircraft by year. Convert this data to a tibble and tidy it by changing it from a wide format to a long format so that you have the following columns: Type, MD, Year, & FH.

read_rds('../data/week5/bomber_wide.rds') %>%
  as_tibble() %>%
  gather(`1996`:`2014`, key = 'year', value = "FH")

Problem 2

Import the bomber_long.rds data, which provides the value for three different outputs for each aircraft by year. The output measures include cost, flying hours, and gallons of gas consumed but these variables are “stacked” in the Output variable. Change this data to a tibble and convert to a wider format so that you have the following columns: Type, MD, FY, Cost, FH, & Gallons.

read_rds('../data/week5/bomber_long.rds') %>%
  as_tibble() %>%
  spread(key = 'Output', value = "Value")

Problem 3

Import the bomber_combined.rds file. Note that the first variable in this data (AC) combines the aircraft type (Bomber) and aircraft designator (i.e. B-1). This variable should be split into two. Take this data and convert it to a tibble and separate the AC variable into “Type” and “MD”

read_rds('../data/week5/bomber_combined.rds') %>%
  as_tibble() %>%
  separate(col = AC,
           into = c("Type", "MD"),
           sep = " ")

Problem 4

Import the bomber_prefix.rds data. Take this data and convert it to a tibble and unite the prefix and number variables into an “MD” variable so that the data matches the tidy data sets you produced in problems #2 and #3

read_rds('../data/week5/bomber_prefix.rds') %>%
  as_tibble() %>%
  unite('MD', prefix, number, sep = '-') %>%
  spread(key = 'Output', value = 'Value')

Problem 5

Import the bomber_mess.rds file so that it is a tibble. Clean this data up by making it contain the following variables:

  • Type
  • MD which combines the prefix and number variable (i.e. “B-1”)
  • FY which is the left part of the Metric variable
  • Cost which is captured in the right part of the Metric variable
  • FH which is captured in the right part of the Metric variable
  • Gallons which is captured in the right part of the Metric variable
    Perform the required actions by stringing together the necessary functions with the pipe operator (%>%)
read_rds('../data/week5/bomber_mess.rds') %>%
  as_tibble() %>%
  unite('MD', prefix, number, sep = '-') %>%
  separate(col = Metric,
           into = c('FY', 'Metric'),
           sep = '_') %T>%
  {
   print(
     ggplot(
       data = .,
       mapping = aes(
         x = as.Date(paste0(FY, '-01-01')), # Make every year as the first date of the year
         y = Value,
         group = MD,
         color = MD
       )
     ) +
       geom_line() +
       scale_x_date(date_breaks = "3 year", date_labels = '%Y') +
       facet_wrap( ~ Metric, scale = "free", ncol = 1) +
       labs(x = "Year", y = "Value") +
       ggtitle("Historical Trend of Metrics")
   )
  } %>%
  spread(key = 'Metric', value = "Value")

Problem 6

Import the ws_programmatics.rds & ws_categorization.rds data so that they are tibbles and perform the following steps in sequence using the pipe operator (%>%)

  • Join the ws_categorization data to the ws_programmatics data
  • Filter for only FY 2014 data at the following Base: Minot AFB (ND)
  • Filter for only Systems classified as “AIRCRAFT” or “MISSILES”
  • Group the data by System level
  • Calculate the total sum of the Total_O.S and End_Strength variables
ws_programmatics <- read_rds('../data/week5/ws_programmatics.rds')
ws_categorizations <- read_rds('../data/week5/ws_categorizations.rds')

ws_programmatics %>%
  full_join(
    ws_categorizations,
    by = c("Base", "MD"),
    all.x = T,
    all.y = T
  ) %>%
  filter(FY == '2014',
         Base == "MINOT AFB (ND)",
         System %in% c("AIRCRAFT", "MISSILES")) %>%
  group_by(System) %>%
  summarize(
    sum_total_os = sum(Total_O.S, na.rm = T),
    sum_end_strength = sum(End_Strength, na.rm = T)
  )

Problem 7

Once again, join the ws_programmatics.rds & ws_categorization.rds data; however, this time identify which Base had the largest cost per flying hour (defined as \(CPFH = \frac{Total\_O.S}{FH}\) which requires you to create a new variable) in 2014. Using a bar chart in ggplot2, plot these values for the top 10 bases with the largest cost per flying hour.

ws_programmatics %>%
  full_join(
    ws_categorizations,
    by = c("Base", "MD"),
    all.x = T,
    all.y = T
  ) %>%
  # The first thing we need to do is filter out observations 
  # where Total_O.S is unknown (zero or negative) 
  filter(Total_O.S > 0) %>%
  group_by(MD) %>%  # Trying imputing the mean for the particular MD
  mutate(FH = ifelse(is.na(FH), mean(FH, na.rm = T), FH)) %>%
  ungroup(MD) %>%
  group_by(System) %>% # Trying imputing the mean for the particular System
  mutate(FH = ifelse(is.na(FH), mean(FH, na.rm = T), FH)) %>%
  ungroup() %>%
  group_by(Base) %>% # Trying imputing the mean for the particular base
  mutate(FH = ifelse(is.na(FH), mean(FH, na.rm = T), FH)) %>%
  ungroup() %>%
  filter(FY == '2014') %>%
  group_by(Base) %>%
  mutate(CPFH = sum(Total_O.S)  / sum(FH)) %>%
  summarize(avg_CPFH = mean(CPFH, na.rm = T)) %>%
  filter(rank(desc(avg_CPFH)) <= 10) %>%
  arrange(desc(avg_CPFH)) %>%
  ggplot(data = .) +
  geom_bar(
    mapping = aes(x = reorder(Base, avg_CPFH),
                  y = avg_CPFH),
    stat = "identity",
    fill = "#9ad3de",
    color = "#006BA6",
    size = 0.5,
    width = 0.5
  ) +
  geom_text(
    aes(
      x = reorder(Base, avg_CPFH),
      y = avg_CPFH,
      label = round(avg_CPFH)
    ),
    position = position_dodge(width = 0.9),
    hjust = -0.25
  ) +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  labs(x = "Base", y = "CPFH ($/Hour)") +
  ggtitle("Highest CPFH Bases") +
  coord_flip(ylim = c(0, 9000))

Problem 8

Using scatter plots in ggplot2, assess the relationship between the end strength (End_Strength) variable and total costs (Total_O.S). Provide three scatter plots that visually assesses this replationship from different angles (by FY, System, etc).

ws_prog_cat_joint <- ws_programmatics %>%
  full_join(
    ws_categorizations,
    by = c("Base", "MD"),
    all.x = T,
    all.y = T
  ) %>%
  select(Base, MD, FY, Total_O.S, End_Strength, System) %>%
  filter(complete.cases(.), Total_O.S > 0)

ws_prog_cat_joint %>%
  ggplot(data = .) +
  geom_point(
    mapping = aes(
      y = Total_O.S,
      x = End_Strength,
      color = System
    ),
    size = 1,
    alpha = 0.7
  ) +
  geom_smooth(mapping = aes(y = Total_O.S,
                            x = End_Strength,
                            color = System),
              se = F) +
  scale_y_continuous(labels = scales::dollar) +
  # facet_wrap( ~ System, ncol = 3) +
  labs(x = "Total personnel", y = "Total_O.S") +
  ggtitle("Total personnel vs Total_OS by System")

ws_prog_cat_joint %>%
  ggplot(data = .) +
  geom_point(
    mapping = aes(y = Total_O.S,
                  x = End_Strength),
    size = 0.5,
    alpha = 0.7
  ) +
  
  geom_smooth(mapping = aes(y = Total_O.S,
                            x = End_Strength),
              se = F) +
  facet_wrap( ~ FY, ncol = 4) +
  scale_y_continuous(labels = scales::dollar) +
  labs(x = "Total personnel", y = "Total_O.S") +
  ggtitle("Total personnel vs Total_OS by Year")

ws_prog_cat_joint %>%
  separate(col = Base,
           into = c("Station", "State"),
           sep = "\\(") %>%
  mutate(State = str_replace(State, "\\)", '')) %>%
  ggplot(data = .) +
  geom_point(
    mapping = aes(y = Total_O.S,
                  x = End_Strength),
    size = 1,
    alpha = 0.7
  ) +
  
  geom_smooth(
    mapping = aes(y = Total_O.S,
                  x = End_Strength),
    se = F
  ) +
  scale_y_continuous(labels = scales::dollar) +
  facet_wrap( ~ State, ncol = 4)  +
  labs(x = "Total personnel", y = "Total_O.S") +
  ggtitle("Total personnel vs Total_OS by State")