Packages used:

library(tidyverse) 
library(tidyr) # Using functions for reshaping data
library(dplyr) # Using functions for EDA
library(ggplot2) # Visualization functions

Assignment questions

Q1

Wide to long transformation

bomber_wide <- as_tibble(readRDS("bomber_data/bomber_wide.rds"))
wide_to_long <- bomber_wide %>% 
                gather(`1996`:`2014`,key="year",value="FH")
head(wide_to_long)
## # A tibble: 6 × 4
##     Type    MD  year    FH
##    <chr> <chr> <chr> <int>
## 1 Bomber   B-1  1996 26914
## 2 Bomber   B-2  1996  2364
## 3 Bomber  B-52  1996 28511
## 4 Bomber   B-1  1997 25219
## 5 Bomber   B-2  1997  2776
## 6 Bomber  B-52  1997 26034

Q2

Long to wide transformation

bomber_long <- as_tibble(readRDS("bomber_data/bomber_long.rds"))
wide_to_long <- bomber_long %>% 
                spread(value="Value",key="Output")
head(wide_to_long)
## # A tibble: 6 × 6
##     Type    MD    FY     Cost    FH  Gallons
##    <chr> <chr> <int>    <int> <int>    <int>
## 1 Bomber   B-1  1996 72753781 26914 88594449
## 2 Bomber   B-1  1997 71297263 25219 85484074
## 3 Bomber   B-1  1998 84026805 24205 85259038
## 4 Bomber   B-1  1999 71848336 23306 79323816
## 5 Bomber   B-1  2000 58439777 25013 86230284
## 6 Bomber   B-1  2001 94946077 25059 86892432

Q3

Separate a column

bomber_combined <- as_tibble(readRDS("bomber_data/bomber_combined.rds"))
bomber_separate <- bomber_combined %>% 
                   separate(AC,into = c("Type","MD"),sep=" ")
head(bomber_separate)
## # A tibble: 6 × 6
##     Type    MD    FY     Cost    FH  Gallons
##    <chr> <chr> <int>    <int> <int>    <int>
## 1 Bomber   B-1  1996 72753781 26914 88594449
## 2 Bomber   B-1  1997 71297263 25219 85484074
## 3 Bomber   B-1  1998 84026805 24205 85259038
## 4 Bomber   B-1  1999 71848336 23306 79323816
## 5 Bomber   B-1  2000 58439777 25013 86230284
## 6 Bomber   B-1  2001 94946077 25059 86892432

Q4

Combine two columns

bomber_prefix <- as_tibble(readRDS("bomber_data/bomber_prefix.rds"))
bomber_unite <- bomber_prefix %>% 
                unite(MD,prefix,number,sep="-")
head(bomber_unite)
## # A tibble: 6 × 5
##     Type    MD    FY Output Value
##    <chr> <chr> <int>  <chr> <int>
## 1 Bomber   B-1  1996     FH 26914
## 2 Bomber   B-1  1997     FH 25219
## 3 Bomber   B-1  1998     FH 24205
## 4 Bomber   B-1  1999     FH 23306
## 5 Bomber   B-1  2000     FH 25013
## 6 Bomber   B-1  2001     FH 25059

Q5

Clean and plot trend

bomber_mess <- as_tibble(readRDS("bomber_data/bomber_mess.rds"))
bomber_mess_clean <- bomber_mess %>% 
              unite(MD,prefix,number,sep="-") %>% 
              separate(Metric,into = c("year","categories")) %>% 
              spread(key="categories",value="Value")
head(bomber_mess_clean)
## # A tibble: 6 × 6
##     Type    MD  year     Cost    FH  Gallons
##    <chr> <chr> <chr>    <int> <int>    <int>
## 1 Bomber   B-1  1996 72753781 26914 88594449
## 2 Bomber   B-1  1997 71297263 25219 85484074
## 3 Bomber   B-1  1998 84026805 24205 85259038
## 4 Bomber   B-1  1999 71848336 23306 79323816
## 5 Bomber   B-1  2000 58439777 25013 86230284
## 6 Bomber   B-1  2001 94946077 25059 86892432
bomber_mess %>% 
  unite(MD,prefix,number,sep="-") %>% 
  separate(Metric,into = c("year","categories")) %>% 
  ggplot()+
  geom_point(aes(x=year,y=Value,colour=MD,group=MD))+
  geom_line(mapping= aes(x=year,y=Value,colour=MD,group=MD)) +
  #geom_smooth()+
  facet_wrap(~categories,ncol=1,scales = "free")

Q6

Join two relational tables and summarize

ws_categorization <- as_tibble(readRDS("bomber_data/ws_categorizations.rds"))
ws_programmatics <- as_tibble(readRDS("bomber_data/ws_programmatics.rds"))

question_6 <- ws_programmatics %>% 
  left_join(ws_categorization) %>% 
  filter(FY==2014,Base=="MINOT AFB (ND)",System %in% c("AIRCRAFT", "MISSILES")) %>% 
  group_by(System) %>% 
  summarize(sum_os=sum(Total_O.S,na.rm=TRUE),sum_end_strength=sum(End_Strength,na.rm=TRUE))
 head(question_6)
## # A tibble: 2 × 3
##     System    sum_os sum_end_strength
##      <chr>     <dbl>            <dbl>
## 1 AIRCRAFT 297398235             2023
## 2 MISSILES 112224909             1951

Q7

Join two relational tables, summarize and visualize results

 ws_programmatics %>% 
  left_join(ws_categorization) %>% 
  filter(FY==2014) %>% 
  group_by(Base) %>% 
  summarize(CPFH=(sum(Total_O.S,na.rm=TRUE)/sum(FH,na.rm=TRUE))) %>% 
  filter(CPFH != Inf) %>% 
  arrange(desc(CPFH)) %>% 
  top_n(1,CPFH)
## # A tibble: 1 × 2
##               Base   CPFH
##              <chr>  <dbl>
## 1 PATRICK AFB (FL) 178960
# Top 10 Base stations with highest Cost per Flying hours
ws_programmatics %>% 
  left_join(ws_categorization) %>% 
  filter(FY==2014) %>% 
  group_by(Base) %>% 
  summarize(CPFH=(sum(Total_O.S,na.rm=TRUE)/sum(FH,na.rm=TRUE))) %>% 
  filter(CPFH != Inf) %>% 
  arrange(desc(CPFH)) %>% 
  top_n(10,CPFH) %>% 
  ggplot()+
  geom_bar(mapping=aes(x=reorder(Base,-CPFH), y=CPFH),stat="identity",col="yellow")

Q8

Correlation between Total_O.S and End Strength for every year

 ws_programmatics %>% 
  left_join(ws_categorization) %>% 
  ggplot()+
  geom_point(mapping=aes(x=Total_O.S,y=End_Strength,colour=FY))+
  #coord_cartesian(xlim=1000000000,expand=TRUE)+
  facet_wrap(~FY)

Correlation between Total_O.S and End Strength for each system type

 ws_programmatics %>% 
   left_join(ws_categorization) %>% 
   ggplot()+
   geom_point(mapping=aes(x=Total_O.S,y=End_Strength,colour=System))+
   facet_wrap(~System)

Correlation between Total_O.S and End Strength for top 12 base stations by cost

 Top_base_cost <- ws_programmatics %>% 
   group_by(Base) %>% 
   summarize(Base_cost=sum(Total_O.S,na.rm=TRUE)) %>% 
   mutate(base_cost_rank= dense_rank(desc(Base_cost))) %>% 
   arrange(base_cost_rank)

ws_programmatics %>% 
  left_join(Top_base_cost,by="Base") %>% 
  filter(base_cost_rank<=12) %>% 
  ggplot()+
  geom_point(mapping=aes(x=Total_O.S,y=End_Strength,colour=Base))+
  facet_wrap(~Base)

Correlation between Total_O.S and End Strength for top 12 MD’s by cost

 Top_MD_cost <- ws_programmatics %>% 
   group_by(MD) %>% 
   summarize(MD_cost=sum(Total_O.S,na.rm=TRUE)) %>% 
   mutate(MD_cost_rank= dense_rank(desc(MD_cost))) %>% 
   arrange(MD_cost_rank)
 
ws_programmatics %>% 
   left_join(Top_MD_cost,by="MD") %>% 
   filter(MD_cost_rank<=12) %>% 
   ggplot()+
   geom_point(mapping=aes(x=Total_O.S,y=End_Strength,colour=MD))+
   facet_wrap(~MD)