Synopsis

This is my homework report for week 5. In this week, our main focus is to tidy the datasets given and preppare them for further data analysis.

Packages Used

library(knitr)       #knitr is the package used to create R Markdown pages
library(tidyverse)   #tidyverse loads the library dplyr which we use to persform data tranformations and 
library(ggplot2)     #ggplot2 is used for charting and graphing

1. Import the bomber_wide.rds file, which lists the flying hours for each aircraft by year. Convert this data to a tibble and tidy it by changing it from a wide format to a long format -

bomber_wide_data_raw <- readRDS(file = "data\\bomber_wide.rds")
str(bomber_wide_data_raw)
## 'data.frame':    3 obs. of  21 variables:
##  $ Type: chr  "Bomber" "Bomber" "Bomber"
##  $ MD  : chr  "B-1" "B-2" "B-52"
##  $ 1996: int  26914 2364 28511
##  $ 1997: int  25219 2776 26034
##  $ 1998: int  24205 2166 25639
##  $ 1999: int  23306 3672 24500
##  $ 2000: int  25013 4543 24387
##  $ 2001: int  25059 4754 24813
##  $ 2002: int  26581 5969 36687
##  $ 2003: int  21491 6801 30230
##  $ 2004: int  28118 5677 25493
##  $ 2005: int  21859 5228 27838
##  $ 2006: int  20163 6100 28206
##  $ 2007: int  24629 6266 22484
##  $ 2008: int  23024 4074 20284
##  $ 2009: int  23065 4567 18765
##  $ 2010: int  23398 4638 18250
##  $ 2011: int  24166 4873 21727
##  $ 2012: int  29292 4955 21748
##  $ 2013: int  25253 4076 19309
##  $ 2014: int  22204 5070 21297
head(bomber_wide_data_raw)
##     Type   MD  1996  1997  1998  1999  2000  2001  2002  2003  2004  2005
## 1 Bomber  B-1 26914 25219 24205 23306 25013 25059 26581 21491 28118 21859
## 2 Bomber  B-2  2364  2776  2166  3672  4543  4754  5969  6801  5677  5228
## 3 Bomber B-52 28511 26034 25639 24500 24387 24813 36687 30230 25493 27838
##    2006  2007  2008  2009  2010  2011  2012  2013  2014
## 1 20163 24629 23024 23065 23398 24166 29292 25253 22204
## 2  6100  6266  4074  4567  4638  4873  4955  4076  5070
## 3 28206 22484 20284 18765 18250 21727 21748 19309 21297
sum(is.na(bomber_wide_data_raw))
## [1] 0
bomber_wide_data <- bomber_wide_data_raw %>% gather(`1996`:`2014`, key="Year", value="FH")
as_tibble(bomber_wide_data)
## # A tibble: 57 x 4
##      Type    MD  Year    FH
##     <chr> <chr> <chr> <int>
## 1  Bomber   B-1  1996 26914
## 2  Bomber   B-2  1996  2364
## 3  Bomber  B-52  1996 28511
## 4  Bomber   B-1  1997 25219
## 5  Bomber   B-2  1997  2776
## 6  Bomber  B-52  1997 26034
## 7  Bomber   B-1  1998 24205
## 8  Bomber   B-2  1998  2166
## 9  Bomber  B-52  1998 25639
## 10 Bomber   B-1  1999 23306
## # ... with 47 more rows

2. Import the bomber_long.rds data, Change this data to a tibble and convert to a wider format so that you have the following columns: Type, MD, FY, Cost, FH, & Gallons. -

bomber_long_data_raw <- readRDS(file = "data\\bomber_long.rds")
str(bomber_long_data_raw)
## 'data.frame':    171 obs. of  5 variables:
##  $ Type  : chr  "Bomber" "Bomber" "Bomber" "Bomber" ...
##  $ MD    : chr  "B-1" "B-1" "B-1" "B-1" ...
##  $ FY    : int  1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 ...
##  $ Output: chr  "FH" "FH" "FH" "FH" ...
##  $ Value : int  26914 25219 24205 23306 25013 25059 26581 21491 28118 21859 ...
head(bomber_long_data_raw)
##     Type  MD   FY Output Value
## 1 Bomber B-1 1996     FH 26914
## 2 Bomber B-1 1997     FH 25219
## 3 Bomber B-1 1998     FH 24205
## 4 Bomber B-1 1999     FH 23306
## 5 Bomber B-1 2000     FH 25013
## 6 Bomber B-1 2001     FH 25059
sum(is.na(bomber_long_data_raw))
## [1] 0
bomber_long_data <- bomber_long_data_raw %>% spread(key=Output, value=Value)
as_tibble(bomber_long_data)
## # A tibble: 57 x 6
##      Type    MD    FY      Cost    FH   Gallons
## *   <chr> <chr> <int>     <int> <int>     <int>
## 1  Bomber   B-1  1996  72753781 26914  88594449
## 2  Bomber   B-1  1997  71297263 25219  85484074
## 3  Bomber   B-1  1998  84026805 24205  85259038
## 4  Bomber   B-1  1999  71848336 23306  79323816
## 5  Bomber   B-1  2000  58439777 25013  86230284
## 6  Bomber   B-1  2001  94946077 25059  86892432
## 7  Bomber   B-1  2002  96458536 26581  89198262
## 8  Bomber   B-1  2003  68650070 21491  74485788
## 9  Bomber   B-1  2004 101895634 28118 101397707
## 10 Bomber   B-1  2005 124816690 21859  78410415
## # ... with 47 more rows

3. Import the bomber_combined.rds file. Take this data and convert it to a tibble and separate the AC variable into “Type” and “MD” -

bomber_combined_data_raw <- readRDS(file="data\\bomber_combined.rds")
str(bomber_combined_data_raw)
## 'data.frame':    57 obs. of  5 variables:
##  $ AC     : chr  "Bomber B-1" "Bomber B-1" "Bomber B-1" "Bomber B-1" ...
##  $ FY     : int  1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 ...
##  $ Cost   : int  72753781 71297263 84026805 71848336 58439777 94946077 96458536 68650070 101895634 124816690 ...
##  $ FH     : int  26914 25219 24205 23306 25013 25059 26581 21491 28118 21859 ...
##  $ Gallons: int  88594449 85484074 85259038 79323816 86230284 86892432 89198262 74485788 101397707 78410415 ...
head(bomber_combined_data_raw)
##           AC   FY     Cost    FH  Gallons
## 1 Bomber B-1 1996 72753781 26914 88594449
## 2 Bomber B-1 1997 71297263 25219 85484074
## 3 Bomber B-1 1998 84026805 24205 85259038
## 4 Bomber B-1 1999 71848336 23306 79323816
## 5 Bomber B-1 2000 58439777 25013 86230284
## 6 Bomber B-1 2001 94946077 25059 86892432
sum(is.na(bomber_combined_data_raw))
## [1] 0
bomber_combined_data <- bomber_combined_data_raw %>% separate(AC,  c("Type","MD"), sep=" ")
as_tibble(bomber_combined_data) 
## # A tibble: 57 x 6
##      Type    MD    FY      Cost    FH   Gallons
## *   <chr> <chr> <int>     <int> <int>     <int>
## 1  Bomber   B-1  1996  72753781 26914  88594449
## 2  Bomber   B-1  1997  71297263 25219  85484074
## 3  Bomber   B-1  1998  84026805 24205  85259038
## 4  Bomber   B-1  1999  71848336 23306  79323816
## 5  Bomber   B-1  2000  58439777 25013  86230284
## 6  Bomber   B-1  2001  94946077 25059  86892432
## 7  Bomber   B-1  2002  96458536 26581  89198262
## 8  Bomber   B-1  2003  68650070 21491  74485788
## 9  Bomber   B-1  2004 101895634 28118 101397707
## 10 Bomber   B-1  2005 124816690 21859  78410415
## # ... with 47 more rows

4. Import the bomber_prefix.rds data. Take this data and convert it to a tibble and unite the prefix and number variables into an “MD” variable so that the data matches the tidy data sets you produced in problems #2 and #3.-

bomber_prefix_data_raw <- readRDS("data\\bomber_prefix.rds")
str(bomber_prefix_data_raw)
## 'data.frame':    171 obs. of  6 variables:
##  $ Type  : chr  "Bomber" "Bomber" "Bomber" "Bomber" ...
##  $ prefix: chr  "B" "B" "B" "B" ...
##  $ number: chr  "1" "1" "1" "1" ...
##  $ FY    : int  1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 ...
##  $ Output: chr  "FH" "FH" "FH" "FH" ...
##  $ Value : int  26914 25219 24205 23306 25013 25059 26581 21491 28118 21859 ...
head(bomber_prefix_data_raw)
##     Type prefix number   FY Output Value
## 1 Bomber      B      1 1996     FH 26914
## 2 Bomber      B      1 1997     FH 25219
## 3 Bomber      B      1 1998     FH 24205
## 4 Bomber      B      1 1999     FH 23306
## 5 Bomber      B      1 2000     FH 25013
## 6 Bomber      B      1 2001     FH 25059
sum(is.na(bomber_prefix_data_raw))
## [1] 0
bomber_prefix_data <- bomber_prefix_data_raw %>% unite(MD, c(prefix,number), sep="-") %>%
                                                 spread(key=Output, value=Value)
as_tibble(bomber_prefix_data)
## # A tibble: 57 x 6
##      Type    MD    FY      Cost    FH   Gallons
## *   <chr> <chr> <int>     <int> <int>     <int>
## 1  Bomber   B-1  1996  72753781 26914  88594449
## 2  Bomber   B-1  1997  71297263 25219  85484074
## 3  Bomber   B-1  1998  84026805 24205  85259038
## 4  Bomber   B-1  1999  71848336 23306  79323816
## 5  Bomber   B-1  2000  58439777 25013  86230284
## 6  Bomber   B-1  2001  94946077 25059  86892432
## 7  Bomber   B-1  2002  96458536 26581  89198262
## 8  Bomber   B-1  2003  68650070 21491  74485788
## 9  Bomber   B-1  2004 101895634 28118 101397707
## 10 Bomber   B-1  2005 124816690 21859  78410415
## # ... with 47 more rows

6. Import the ws_programmatics.rds & ws_categorization.rds data so that they are tibbles and perform the following steps in sequence using the pipe operator -

ws_programmatics_data <- readRDS("data//ws_programmatics.rds")
ws_categorization_data <- readRDS("data//ws_categorizations.rds")
as_tibble(ws_programmatics_data)
## # A tibble: 36,328 x 18
##              Base    MD    FY Manpower_Ops Manpower_Mx
##             <chr> <chr> <int>        <dbl>       <dbl>
## 1  ALTUS AFB (OK)  A-10  2007           NA          NA
## 2  ALTUS AFB (OK)  A-10  2008           NA          NA
## 3  ALTUS AFB (OK) AT-38  1997           NA          NA
## 4  ALTUS AFB (OK) AT-38  1998           NA          NA
## 5  ALTUS AFB (OK) C-130  1998           NA          NA
## 6  ALTUS AFB (OK) C-130  1999           NA          NA
## 7  ALTUS AFB (OK) C-130  2000           NA          NA
## 8  ALTUS AFB (OK) C-130  2002           NA          NA
## 9  ALTUS AFB (OK) C-130  2003           NA          NA
## 10 ALTUS AFB (OK) C-130  2004           NA       19912
## # ... with 36,318 more rows, and 13 more variables:
## #   Manpower_Support_Staff <dbl>, Operating_Material <dbl>,
## #   Mx_Consumables <dbl>, Mx_DLR <dbl>, Mx_Depot_AC <dbl>,
## #   Mx_Depot_Missile <dbl>, Mx_Depot_Engine <dbl>, CLS <dbl>,
## #   Total_O.S <dbl>, Avg_Inv <dbl>, TAI <dbl>, End_Strength <dbl>,
## #   FH <dbl>
as_tibble(ws_categorization_data)
## # A tibble: 4,336 x 3
##              Base   System     MD
##             <chr>    <chr>  <chr>
## 1  ALTUS AFB (OK) AIRCRAFT   A-10
## 2  ALTUS AFB (OK) AIRCRAFT  AT-38
## 3  ALTUS AFB (OK) AIRCRAFT  C-130
## 4  ALTUS AFB (OK) AIRCRAFT  C-135
## 5  ALTUS AFB (OK) AIRCRAFT  C-141
## 6  ALTUS AFB (OK) AIRCRAFT   C-17
## 7  ALTUS AFB (OK) AIRCRAFT    C-5
## 8  ALTUS AFB (OK) AIRCRAFT    E-3
## 9  ALTUS AFB (OK) AIRCRAFT EC-130
## 10 ALTUS AFB (OK) AIRCRAFT   F-15
## # ... with 4,326 more rows
str(ws_programmatics_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    36328 obs. of  18 variables:
##  $ Base                  : chr  "ALTUS AFB (OK)" "ALTUS AFB (OK)" "ALTUS AFB (OK)" "ALTUS AFB (OK)" ...
##  $ MD                    : chr  "A-10" "A-10" "AT-38" "AT-38" ...
##  $ FY                    : int  2007 2008 1997 1998 1998 1999 2000 2002 2003 2004 ...
##  $ Manpower_Ops          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Manpower_Mx           : num  NA NA NA NA NA ...
##  $ Manpower_Support_Staff: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Operating_Material    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Mx_Consumables        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Mx_DLR                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Mx_Depot_AC           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Mx_Depot_Missile      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Mx_Depot_Engine       : num  NA NA NA NA 158852 ...
##  $ CLS                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Total_O.S             : num  0 0 0 0 158852 ...
##  $ Avg_Inv               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ TAI                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ End_Strength          : num  73 21 NA NA NA NA NA 12 NA 1 ...
##  $ FH                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 18
##   .. ..$ Base                  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ MD                    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ FY                    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Manpower_Ops          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_number" "collector"
##   .. ..$ Manpower_Mx           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_number" "collector"
##   .. ..$ Manpower_Support_Staff: list()
##   .. .. ..- attr(*, "class")= chr  "collector_number" "collector"
##   .. ..$ Operating_Material    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_number" "collector"
##   .. ..$ Mx_Consumables        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_number" "collector"
##   .. ..$ Mx_DLR                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_number" "collector"
##   .. ..$ Mx_Depot_AC           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_number" "collector"
##   .. ..$ Mx_Depot_Missile      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_number" "collector"
##   .. ..$ Mx_Depot_Engine       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_number" "collector"
##   .. ..$ CLS                   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_number" "collector"
##   .. ..$ Total_O.S             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_number" "collector"
##   .. ..$ Avg_Inv               : list()
##   .. .. ..- attr(*, "class")= chr  "collector_number" "collector"
##   .. ..$ TAI                   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_number" "collector"
##   .. ..$ End_Strength          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_number" "collector"
##   .. ..$ FH                    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_number" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
str(ws_categorization_data)
## 'data.frame':    4336 obs. of  3 variables:
##  $ Base  : chr  "ALTUS AFB (OK)" "ALTUS AFB (OK)" "ALTUS AFB (OK)" "ALTUS AFB (OK)" ...
##  $ System: chr  "AIRCRAFT" "AIRCRAFT" "AIRCRAFT" "AIRCRAFT" ...
##  $ MD    : chr  "A-10" "AT-38" "C-130" "C-135" ...
head(ws_categorization_data)
##             Base   System    MD
## 1 ALTUS AFB (OK) AIRCRAFT  A-10
## 2 ALTUS AFB (OK) AIRCRAFT AT-38
## 3 ALTUS AFB (OK) AIRCRAFT C-130
## 4 ALTUS AFB (OK) AIRCRAFT C-135
## 5 ALTUS AFB (OK) AIRCRAFT C-141
## 6 ALTUS AFB (OK) AIRCRAFT  C-17
head(ws_programmatics_data)
## # A tibble: 6 x 18
##             Base    MD    FY Manpower_Ops Manpower_Mx
##            <chr> <chr> <int>        <dbl>       <dbl>
## 1 ALTUS AFB (OK)  A-10  2007           NA          NA
## 2 ALTUS AFB (OK)  A-10  2008           NA          NA
## 3 ALTUS AFB (OK) AT-38  1997           NA          NA
## 4 ALTUS AFB (OK) AT-38  1998           NA          NA
## 5 ALTUS AFB (OK) C-130  1998           NA          NA
## 6 ALTUS AFB (OK) C-130  1999           NA          NA
## # ... with 13 more variables: Manpower_Support_Staff <dbl>,
## #   Operating_Material <dbl>, Mx_Consumables <dbl>, Mx_DLR <dbl>,
## #   Mx_Depot_AC <dbl>, Mx_Depot_Missile <dbl>, Mx_Depot_Engine <dbl>,
## #   CLS <dbl>, Total_O.S <dbl>, Avg_Inv <dbl>, TAI <dbl>,
## #   End_Strength <dbl>, FH <dbl>

Trying to find missing values amd performing missing value imputation

cor(x=ws_programmatics_data$FY, y=ws_programmatics_data$End_Strength, use="complete.obs")
## [1] 0.03158332
summary(ws_programmatics_data$End_Strength)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     1.0     3.0    12.0   149.4    62.0  4785.0   15487
ws_programmatics_data$End_Strength[is.na(ws_programmatics_data$End_Strength)]=mean(ws_programmatics_data$End_Strength,na.rm=TRUE)
ws_programmatics_data_6 <- ws_programmatics_data %>% 
                          left_join(ws_categorization_data, by=c("Base","MD")) %>%
                          filter(FY==2014 & Base=="MINOT AFB (ND)") %>%
                          filter(System %in% c("AIRCRAFT","MISSILES")) %>%
                          group_by(System) %>%
                          summarize(Sum_O.S = sum(Total_O.S,na.rm=TRUE),Sum_EndStrength=sum(End_Strength,na.rm=TRUE))

7. Once again, join the ws_programmatics.rds & ws_categorization.rds data; however, this time identify which Base had the largest cost per flying hour (defined as CPFH=Total_O.SFH which requires you to create a new variable) in 2014. Using a bar chart in ggplot2, plot these values for the top 10 bases with the largest cost per flying hour -

ws_programmatics_data_7 <- ws_programmatics_data %>% 
                          left_join(ws_categorization_data, by=c("Base","MD")) %>%
                          filter(FY==2014) %>%
                          group_by(Base) %>%
                          mutate(CPFH=Total_O.S/FH) %>%
                          aggregate(CPFH ~ Base, data= ., mean) %>%
                          arrange(desc(CPFH)) %>%
                          top_n(10)
## Selecting by CPFH
ggplot(ws_programmatics_data_7, aes(x=Base, y=CPFH)) + geom_bar(aes(x= reorder(Base, CPFH)),stat="identity", fill="firebrick") + theme(axis.text.x=element_text(angle=90, hjust=1))

8.Using scatter plots in ggplot2, assess the relationship between the end strength (End_Strength) variable and total costs (Total_O.S). Provide three scatter plots that visually assesses this replationship from different angles (by FY, System, etc) -

ws_programmatics_data %>% left_join(ws_categorization_data, by=c("Base","MD")) %>% 
                          ggplot(aes(End_Strength, Total_O.S)) + geom_point(color="green")

ws_programmatics_data %>% left_join(ws_categorization_data, by=c("Base","MD")) %>% 
                          ggplot(aes(End_Strength, Total_O.S)) + geom_point(color="blue") + facet_wrap(~FY, nrow=5)

ws_programmatics_data %>% left_join(ws_categorization_data, by=c("Base","MD")) %>% 
                           ggplot(aes(End_Strength, Total_O.S)) + geom_point(aes(color=System))