This is my homework report for week 5. In this week, our main focus is to tidy the datasets given and preppare them for further data analysis.
library(knitr) #knitr is the package used to create R Markdown pages
library(tidyverse) #tidyverse loads the library dplyr which we use to persform data tranformations and
library(ggplot2) #ggplot2 is used for charting and graphing
bomber_wide_data_raw <- readRDS(file = "data\\bomber_wide.rds")
str(bomber_wide_data_raw)
## 'data.frame': 3 obs. of 21 variables:
## $ Type: chr "Bomber" "Bomber" "Bomber"
## $ MD : chr "B-1" "B-2" "B-52"
## $ 1996: int 26914 2364 28511
## $ 1997: int 25219 2776 26034
## $ 1998: int 24205 2166 25639
## $ 1999: int 23306 3672 24500
## $ 2000: int 25013 4543 24387
## $ 2001: int 25059 4754 24813
## $ 2002: int 26581 5969 36687
## $ 2003: int 21491 6801 30230
## $ 2004: int 28118 5677 25493
## $ 2005: int 21859 5228 27838
## $ 2006: int 20163 6100 28206
## $ 2007: int 24629 6266 22484
## $ 2008: int 23024 4074 20284
## $ 2009: int 23065 4567 18765
## $ 2010: int 23398 4638 18250
## $ 2011: int 24166 4873 21727
## $ 2012: int 29292 4955 21748
## $ 2013: int 25253 4076 19309
## $ 2014: int 22204 5070 21297
head(bomber_wide_data_raw)
## Type MD 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005
## 1 Bomber B-1 26914 25219 24205 23306 25013 25059 26581 21491 28118 21859
## 2 Bomber B-2 2364 2776 2166 3672 4543 4754 5969 6801 5677 5228
## 3 Bomber B-52 28511 26034 25639 24500 24387 24813 36687 30230 25493 27838
## 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 1 20163 24629 23024 23065 23398 24166 29292 25253 22204
## 2 6100 6266 4074 4567 4638 4873 4955 4076 5070
## 3 28206 22484 20284 18765 18250 21727 21748 19309 21297
sum(is.na(bomber_wide_data_raw))
## [1] 0
bomber_wide_data <- bomber_wide_data_raw %>% gather(`1996`:`2014`, key="Year", value="FH")
as_tibble(bomber_wide_data)
## # A tibble: 57 x 4
## Type MD Year FH
## <chr> <chr> <chr> <int>
## 1 Bomber B-1 1996 26914
## 2 Bomber B-2 1996 2364
## 3 Bomber B-52 1996 28511
## 4 Bomber B-1 1997 25219
## 5 Bomber B-2 1997 2776
## 6 Bomber B-52 1997 26034
## 7 Bomber B-1 1998 24205
## 8 Bomber B-2 1998 2166
## 9 Bomber B-52 1998 25639
## 10 Bomber B-1 1999 23306
## # ... with 47 more rows
bomber_long_data_raw <- readRDS(file = "data\\bomber_long.rds")
str(bomber_long_data_raw)
## 'data.frame': 171 obs. of 5 variables:
## $ Type : chr "Bomber" "Bomber" "Bomber" "Bomber" ...
## $ MD : chr "B-1" "B-1" "B-1" "B-1" ...
## $ FY : int 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 ...
## $ Output: chr "FH" "FH" "FH" "FH" ...
## $ Value : int 26914 25219 24205 23306 25013 25059 26581 21491 28118 21859 ...
head(bomber_long_data_raw)
## Type MD FY Output Value
## 1 Bomber B-1 1996 FH 26914
## 2 Bomber B-1 1997 FH 25219
## 3 Bomber B-1 1998 FH 24205
## 4 Bomber B-1 1999 FH 23306
## 5 Bomber B-1 2000 FH 25013
## 6 Bomber B-1 2001 FH 25059
sum(is.na(bomber_long_data_raw))
## [1] 0
bomber_long_data <- bomber_long_data_raw %>% spread(key=Output, value=Value)
as_tibble(bomber_long_data)
## # A tibble: 57 x 6
## Type MD FY Cost FH Gallons
## * <chr> <chr> <int> <int> <int> <int>
## 1 Bomber B-1 1996 72753781 26914 88594449
## 2 Bomber B-1 1997 71297263 25219 85484074
## 3 Bomber B-1 1998 84026805 24205 85259038
## 4 Bomber B-1 1999 71848336 23306 79323816
## 5 Bomber B-1 2000 58439777 25013 86230284
## 6 Bomber B-1 2001 94946077 25059 86892432
## 7 Bomber B-1 2002 96458536 26581 89198262
## 8 Bomber B-1 2003 68650070 21491 74485788
## 9 Bomber B-1 2004 101895634 28118 101397707
## 10 Bomber B-1 2005 124816690 21859 78410415
## # ... with 47 more rows
bomber_combined_data_raw <- readRDS(file="data\\bomber_combined.rds")
str(bomber_combined_data_raw)
## 'data.frame': 57 obs. of 5 variables:
## $ AC : chr "Bomber B-1" "Bomber B-1" "Bomber B-1" "Bomber B-1" ...
## $ FY : int 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 ...
## $ Cost : int 72753781 71297263 84026805 71848336 58439777 94946077 96458536 68650070 101895634 124816690 ...
## $ FH : int 26914 25219 24205 23306 25013 25059 26581 21491 28118 21859 ...
## $ Gallons: int 88594449 85484074 85259038 79323816 86230284 86892432 89198262 74485788 101397707 78410415 ...
head(bomber_combined_data_raw)
## AC FY Cost FH Gallons
## 1 Bomber B-1 1996 72753781 26914 88594449
## 2 Bomber B-1 1997 71297263 25219 85484074
## 3 Bomber B-1 1998 84026805 24205 85259038
## 4 Bomber B-1 1999 71848336 23306 79323816
## 5 Bomber B-1 2000 58439777 25013 86230284
## 6 Bomber B-1 2001 94946077 25059 86892432
sum(is.na(bomber_combined_data_raw))
## [1] 0
bomber_combined_data <- bomber_combined_data_raw %>% separate(AC, c("Type","MD"), sep=" ")
as_tibble(bomber_combined_data)
## # A tibble: 57 x 6
## Type MD FY Cost FH Gallons
## * <chr> <chr> <int> <int> <int> <int>
## 1 Bomber B-1 1996 72753781 26914 88594449
## 2 Bomber B-1 1997 71297263 25219 85484074
## 3 Bomber B-1 1998 84026805 24205 85259038
## 4 Bomber B-1 1999 71848336 23306 79323816
## 5 Bomber B-1 2000 58439777 25013 86230284
## 6 Bomber B-1 2001 94946077 25059 86892432
## 7 Bomber B-1 2002 96458536 26581 89198262
## 8 Bomber B-1 2003 68650070 21491 74485788
## 9 Bomber B-1 2004 101895634 28118 101397707
## 10 Bomber B-1 2005 124816690 21859 78410415
## # ... with 47 more rows
bomber_prefix_data_raw <- readRDS("data\\bomber_prefix.rds")
str(bomber_prefix_data_raw)
## 'data.frame': 171 obs. of 6 variables:
## $ Type : chr "Bomber" "Bomber" "Bomber" "Bomber" ...
## $ prefix: chr "B" "B" "B" "B" ...
## $ number: chr "1" "1" "1" "1" ...
## $ FY : int 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 ...
## $ Output: chr "FH" "FH" "FH" "FH" ...
## $ Value : int 26914 25219 24205 23306 25013 25059 26581 21491 28118 21859 ...
head(bomber_prefix_data_raw)
## Type prefix number FY Output Value
## 1 Bomber B 1 1996 FH 26914
## 2 Bomber B 1 1997 FH 25219
## 3 Bomber B 1 1998 FH 24205
## 4 Bomber B 1 1999 FH 23306
## 5 Bomber B 1 2000 FH 25013
## 6 Bomber B 1 2001 FH 25059
sum(is.na(bomber_prefix_data_raw))
## [1] 0
bomber_prefix_data <- bomber_prefix_data_raw %>% unite(MD, c(prefix,number), sep="-") %>%
spread(key=Output, value=Value)
as_tibble(bomber_prefix_data)
## # A tibble: 57 x 6
## Type MD FY Cost FH Gallons
## * <chr> <chr> <int> <int> <int> <int>
## 1 Bomber B-1 1996 72753781 26914 88594449
## 2 Bomber B-1 1997 71297263 25219 85484074
## 3 Bomber B-1 1998 84026805 24205 85259038
## 4 Bomber B-1 1999 71848336 23306 79323816
## 5 Bomber B-1 2000 58439777 25013 86230284
## 6 Bomber B-1 2001 94946077 25059 86892432
## 7 Bomber B-1 2002 96458536 26581 89198262
## 8 Bomber B-1 2003 68650070 21491 74485788
## 9 Bomber B-1 2004 101895634 28118 101397707
## 10 Bomber B-1 2005 124816690 21859 78410415
## # ... with 47 more rows
bomber_mess_data_raw <- readRDS("data\\bomber_mess.RDS")
head(bomber_mess_data_raw)
## Type prefix number Metric Value
## 1 Bomber B 1 1996_FH 26914
## 2 Bomber B 1 1997_FH 25219
## 3 Bomber B 1 1998_FH 24205
## 4 Bomber B 1 1999_FH 23306
## 5 Bomber B 1 2000_FH 25013
## 6 Bomber B 1 2001_FH 25059
str(bomber_mess_data_raw)
## 'data.frame': 171 obs. of 5 variables:
## $ Type : chr "Bomber" "Bomber" "Bomber" "Bomber" ...
## $ prefix: chr "B" "B" "B" "B" ...
## $ number: chr "1" "1" "1" "1" ...
## $ Metric: chr "1996_FH" "1997_FH" "1998_FH" "1999_FH" ...
## $ Value : int 26914 25219 24205 23306 25013 25059 26581 21491 28118 21859 ...
sum(is.na(bomber_mess_data_raw))
## [1] 0
bomber_mess_data <- bomber_mess_data_raw %>% unite(MD, c(prefix,number), sep="-") %>%
separate(Metric, c("FY", "abc"), sep="_") %>%
spread(key=abc, value=Value)
as_tibble(bomber_mess_data)
## # A tibble: 57 x 6
## Type MD FY Cost FH Gallons
## * <chr> <chr> <chr> <int> <int> <int>
## 1 Bomber B-1 1996 72753781 26914 88594449
## 2 Bomber B-1 1997 71297263 25219 85484074
## 3 Bomber B-1 1998 84026805 24205 85259038
## 4 Bomber B-1 1999 71848336 23306 79323816
## 5 Bomber B-1 2000 58439777 25013 86230284
## 6 Bomber B-1 2001 94946077 25059 86892432
## 7 Bomber B-1 2002 96458536 26581 89198262
## 8 Bomber B-1 2003 68650070 21491 74485788
## 9 Bomber B-1 2004 101895634 28118 101397707
## 10 Bomber B-1 2005 124816690 21859 78410415
## # ... with 47 more rows
as.factor(bomber_mess_data$FY)
## [1] 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
## [15] 2010 2011 2012 2013 2014 1996 1997 1998 1999 2000 2001 2002 2003 2004
## [29] 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 1996 1997 1998 1999
## [43] 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
## [57] 2014
## 19 Levels: 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 ... 2014
bomber_mess_data_plot <- bomber_mess_data %>% gather(`Cost`:`Gallons`, key="Class", value="Value")
ggplot(bomber_mess_data_plot, aes(x=FY, y=Value, group=MD)) + geom_line(aes(color=MD)) + facet_wrap( ~ Class,scales="free", nrow=3)
ws_programmatics_data <- readRDS("data//ws_programmatics.rds")
ws_categorization_data <- readRDS("data//ws_categorizations.rds")
as_tibble(ws_programmatics_data)
## # A tibble: 36,328 x 18
## Base MD FY Manpower_Ops Manpower_Mx
## <chr> <chr> <int> <dbl> <dbl>
## 1 ALTUS AFB (OK) A-10 2007 NA NA
## 2 ALTUS AFB (OK) A-10 2008 NA NA
## 3 ALTUS AFB (OK) AT-38 1997 NA NA
## 4 ALTUS AFB (OK) AT-38 1998 NA NA
## 5 ALTUS AFB (OK) C-130 1998 NA NA
## 6 ALTUS AFB (OK) C-130 1999 NA NA
## 7 ALTUS AFB (OK) C-130 2000 NA NA
## 8 ALTUS AFB (OK) C-130 2002 NA NA
## 9 ALTUS AFB (OK) C-130 2003 NA NA
## 10 ALTUS AFB (OK) C-130 2004 NA 19912
## # ... with 36,318 more rows, and 13 more variables:
## # Manpower_Support_Staff <dbl>, Operating_Material <dbl>,
## # Mx_Consumables <dbl>, Mx_DLR <dbl>, Mx_Depot_AC <dbl>,
## # Mx_Depot_Missile <dbl>, Mx_Depot_Engine <dbl>, CLS <dbl>,
## # Total_O.S <dbl>, Avg_Inv <dbl>, TAI <dbl>, End_Strength <dbl>,
## # FH <dbl>
as_tibble(ws_categorization_data)
## # A tibble: 4,336 x 3
## Base System MD
## <chr> <chr> <chr>
## 1 ALTUS AFB (OK) AIRCRAFT A-10
## 2 ALTUS AFB (OK) AIRCRAFT AT-38
## 3 ALTUS AFB (OK) AIRCRAFT C-130
## 4 ALTUS AFB (OK) AIRCRAFT C-135
## 5 ALTUS AFB (OK) AIRCRAFT C-141
## 6 ALTUS AFB (OK) AIRCRAFT C-17
## 7 ALTUS AFB (OK) AIRCRAFT C-5
## 8 ALTUS AFB (OK) AIRCRAFT E-3
## 9 ALTUS AFB (OK) AIRCRAFT EC-130
## 10 ALTUS AFB (OK) AIRCRAFT F-15
## # ... with 4,326 more rows
str(ws_programmatics_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 36328 obs. of 18 variables:
## $ Base : chr "ALTUS AFB (OK)" "ALTUS AFB (OK)" "ALTUS AFB (OK)" "ALTUS AFB (OK)" ...
## $ MD : chr "A-10" "A-10" "AT-38" "AT-38" ...
## $ FY : int 2007 2008 1997 1998 1998 1999 2000 2002 2003 2004 ...
## $ Manpower_Ops : num NA NA NA NA NA NA NA NA NA NA ...
## $ Manpower_Mx : num NA NA NA NA NA ...
## $ Manpower_Support_Staff: num NA NA NA NA NA NA NA NA NA NA ...
## $ Operating_Material : num NA NA NA NA NA NA NA NA NA NA ...
## $ Mx_Consumables : num NA NA NA NA NA NA NA NA NA NA ...
## $ Mx_DLR : num NA NA NA NA NA NA NA NA NA NA ...
## $ Mx_Depot_AC : num NA NA NA NA NA NA NA NA NA NA ...
## $ Mx_Depot_Missile : num NA NA NA NA NA NA NA NA NA NA ...
## $ Mx_Depot_Engine : num NA NA NA NA 158852 ...
## $ CLS : num NA NA NA NA NA NA NA NA NA NA ...
## $ Total_O.S : num 0 0 0 0 158852 ...
## $ Avg_Inv : num NA NA NA NA NA NA NA NA NA NA ...
## $ TAI : num NA NA NA NA NA NA NA NA NA NA ...
## $ End_Strength : num 73 21 NA NA NA NA NA 12 NA 1 ...
## $ FH : num NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 18
## .. ..$ Base : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ MD : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ FY : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Manpower_Ops : list()
## .. .. ..- attr(*, "class")= chr "collector_number" "collector"
## .. ..$ Manpower_Mx : list()
## .. .. ..- attr(*, "class")= chr "collector_number" "collector"
## .. ..$ Manpower_Support_Staff: list()
## .. .. ..- attr(*, "class")= chr "collector_number" "collector"
## .. ..$ Operating_Material : list()
## .. .. ..- attr(*, "class")= chr "collector_number" "collector"
## .. ..$ Mx_Consumables : list()
## .. .. ..- attr(*, "class")= chr "collector_number" "collector"
## .. ..$ Mx_DLR : list()
## .. .. ..- attr(*, "class")= chr "collector_number" "collector"
## .. ..$ Mx_Depot_AC : list()
## .. .. ..- attr(*, "class")= chr "collector_number" "collector"
## .. ..$ Mx_Depot_Missile : list()
## .. .. ..- attr(*, "class")= chr "collector_number" "collector"
## .. ..$ Mx_Depot_Engine : list()
## .. .. ..- attr(*, "class")= chr "collector_number" "collector"
## .. ..$ CLS : list()
## .. .. ..- attr(*, "class")= chr "collector_number" "collector"
## .. ..$ Total_O.S : list()
## .. .. ..- attr(*, "class")= chr "collector_number" "collector"
## .. ..$ Avg_Inv : list()
## .. .. ..- attr(*, "class")= chr "collector_number" "collector"
## .. ..$ TAI : list()
## .. .. ..- attr(*, "class")= chr "collector_number" "collector"
## .. ..$ End_Strength : list()
## .. .. ..- attr(*, "class")= chr "collector_number" "collector"
## .. ..$ FH : list()
## .. .. ..- attr(*, "class")= chr "collector_number" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
str(ws_categorization_data)
## 'data.frame': 4336 obs. of 3 variables:
## $ Base : chr "ALTUS AFB (OK)" "ALTUS AFB (OK)" "ALTUS AFB (OK)" "ALTUS AFB (OK)" ...
## $ System: chr "AIRCRAFT" "AIRCRAFT" "AIRCRAFT" "AIRCRAFT" ...
## $ MD : chr "A-10" "AT-38" "C-130" "C-135" ...
head(ws_categorization_data)
## Base System MD
## 1 ALTUS AFB (OK) AIRCRAFT A-10
## 2 ALTUS AFB (OK) AIRCRAFT AT-38
## 3 ALTUS AFB (OK) AIRCRAFT C-130
## 4 ALTUS AFB (OK) AIRCRAFT C-135
## 5 ALTUS AFB (OK) AIRCRAFT C-141
## 6 ALTUS AFB (OK) AIRCRAFT C-17
head(ws_programmatics_data)
## # A tibble: 6 x 18
## Base MD FY Manpower_Ops Manpower_Mx
## <chr> <chr> <int> <dbl> <dbl>
## 1 ALTUS AFB (OK) A-10 2007 NA NA
## 2 ALTUS AFB (OK) A-10 2008 NA NA
## 3 ALTUS AFB (OK) AT-38 1997 NA NA
## 4 ALTUS AFB (OK) AT-38 1998 NA NA
## 5 ALTUS AFB (OK) C-130 1998 NA NA
## 6 ALTUS AFB (OK) C-130 1999 NA NA
## # ... with 13 more variables: Manpower_Support_Staff <dbl>,
## # Operating_Material <dbl>, Mx_Consumables <dbl>, Mx_DLR <dbl>,
## # Mx_Depot_AC <dbl>, Mx_Depot_Missile <dbl>, Mx_Depot_Engine <dbl>,
## # CLS <dbl>, Total_O.S <dbl>, Avg_Inv <dbl>, TAI <dbl>,
## # End_Strength <dbl>, FH <dbl>
cor(x=ws_programmatics_data$FY, y=ws_programmatics_data$End_Strength, use="complete.obs")
## [1] 0.03158332
summary(ws_programmatics_data$End_Strength)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.0 3.0 12.0 149.4 62.0 4785.0 15487
ws_programmatics_data$End_Strength[is.na(ws_programmatics_data$End_Strength)]=mean(ws_programmatics_data$End_Strength,na.rm=TRUE)
ws_programmatics_data_6 <- ws_programmatics_data %>%
left_join(ws_categorization_data, by=c("Base","MD")) %>%
filter(FY==2014 & Base=="MINOT AFB (ND)") %>%
filter(System %in% c("AIRCRAFT","MISSILES")) %>%
group_by(System) %>%
summarize(Sum_O.S = sum(Total_O.S,na.rm=TRUE),Sum_EndStrength=sum(End_Strength,na.rm=TRUE))
ws_programmatics_data_7 <- ws_programmatics_data %>%
left_join(ws_categorization_data, by=c("Base","MD")) %>%
filter(FY==2014) %>%
group_by(Base) %>%
mutate(CPFH=Total_O.S/FH) %>%
aggregate(CPFH ~ Base, data= ., mean) %>%
arrange(desc(CPFH)) %>%
top_n(10)
## Selecting by CPFH
ggplot(ws_programmatics_data_7, aes(x=Base, y=CPFH)) + geom_bar(aes(x= reorder(Base, CPFH)),stat="identity", fill="firebrick") + theme(axis.text.x=element_text(angle=90, hjust=1))
ws_programmatics_data %>% left_join(ws_categorization_data, by=c("Base","MD")) %>%
ggplot(aes(End_Strength, Total_O.S)) + geom_point(color="green")
ws_programmatics_data %>% left_join(ws_categorization_data, by=c("Base","MD")) %>%
ggplot(aes(End_Strength, Total_O.S)) + geom_point(color="blue") + facet_wrap(~FY, nrow=5)
ws_programmatics_data %>% left_join(ws_categorization_data, by=c("Base","MD")) %>%
ggplot(aes(End_Strength, Total_O.S)) + geom_point(aes(color=System))