The goal is to read the multi sheet Excel File (FY2018_Q3_Tables_Final_D corrected_0.xlsx) from the following suggestion on discussion board (Forum: Discussion 5: Untidy Data). https://bbhosted.cuny.edu/webapps/discussionboard/do/message?action=list_messages&course_id=_1705328_1&nav=discussion_board&conf_id=_1845527_1&forum_id=_1908779_1&message_id=_31371839_1
This project is gonna focus on two sheets (Table 1A and Table 1B) to load into two data frames and clean up the data. Using tidyr and dplyr functions the data representation would be manipulated into different formats and make the data more presentable and readable.
excel_sheets(path = mySourceFile)## [1] "Table 1A" "Table 1B" "Table 2" "Table 3" "Table 4A" "Table 4B"
df_Tab1A_description <- df_Tab1A[1,1]
df_Tab1A_description## # A tibble: 1 x 1
## `Table 1A.`
## <chr>
## 1 PERSONS OBTAINING LAWFUL PERMANENT RESIDENT STATUS BY TYPE OF ADMISSION ~
df_Tab1B_description <- df_Tab1B[1,1]
df_Tab1B_description## # A tibble: 1 x 1
## `Table 1B.`
## <chr>
## 1 PERSONS OBTAINING LAWFUL PERMANENT RESIDENT STATUS BY TYPE AND MAJOR CLA~
colnames(df_Tab1A) <- c("Category","Total","Total Quarter 1","Total Quarter 2","Total Quarter 3","Total Quarter 4","AOS Total","AOS Quarter 1","AOS Quarter 2","AOS Quarter 3","AOS Quarter 4","New Arrival Total","New Arrival Quarter 1","New Arrival Quarter 2","New Arrival Quarter 3","New Arrival Quarter 4")
colnames(df_Tab1A)## [1] "Category" "Total"
## [3] "Total Quarter 1" "Total Quarter 2"
## [5] "Total Quarter 3" "Total Quarter 4"
## [7] "AOS Total" "AOS Quarter 1"
## [9] "AOS Quarter 2" "AOS Quarter 3"
## [11] "AOS Quarter 4" "New Arrival Total"
## [13] "New Arrival Quarter 1" "New Arrival Quarter 2"
## [15] "New Arrival Quarter 3" "New Arrival Quarter 4"
colnames(df_Tab1B) <- c("Category","Total","Total Quarter 1","Total Quarter 2","Total Quarter 3","Total Quarter 4","AOS Total","AOS Quarter 1","AOS Quarter 2","AOS Quarter 3","AOS Quarter 4","New Arrival Total","New Arrival Quarter 1","New Arrival Quarter 2","New Arrival Quarter 3","New Arrival Quarter 4")
colnames(df_Tab1B)## [1] "Category" "Total"
## [3] "Total Quarter 1" "Total Quarter 2"
## [5] "Total Quarter 3" "Total Quarter 4"
## [7] "AOS Total" "AOS Quarter 1"
## [9] "AOS Quarter 2" "AOS Quarter 3"
## [11] "AOS Quarter 4" "New Arrival Total"
## [13] "New Arrival Quarter 1" "New Arrival Quarter 2"
## [15] "New Arrival Quarter 3" "New Arrival Quarter 4"
df_Tab1A <- df_Tab1A[-c(1:4,212:217),!names(df_Tab1A) %in% c("Total Quarter 4","AOS Quarter 4","New Arrival Quarter 4")]
df_Tab1B <- df_Tab1B[-c(1:4,31:36),!names(df_Tab1B) %in% c("Total Quarter 4","AOS Quarter 4","New Arrival Quarter 4")]df_Tab1A_Region <- df_Tab1A[c(2:9),]
df_Tab1A_Region <- df_Tab1A_Region %>% filter(Category != c('Total')) %>% select(-c(Total, `AOS Total`, `New Arrival Total`))
df_Tab1A_Country <- df_Tab1A[c(11:207),]
df_Tab1A_Country <- df_Tab1A_Country %>% filter(Category != c('Total')) %>% select(-c(Total, `AOS Total`, `New Arrival Total`))DT::datatable(df_Tab1A_Region, options = list(pagelength=5))DT::datatable(df_Tab1A_Country, options = list(pagelength=5))DT::datatable(df_Tab1B, options = list(pagelength=5))df_Tab1A_Region_Pivot <- gather(df_Tab1A_Region,
key = "Quarter",
value = "Quarterly Total",
-Category)
df_Tab1A_Region_Pivot$`Quarterly Total` <- as.numeric(df_Tab1A_Region_Pivot$`Quarterly Total`)
DT::datatable(df_Tab1A_Region_Pivot, options = list(pagelength=5))df_Tab1A_Region_UnPivot <- spread(df_Tab1A_Region_Pivot,
key = "Quarter",
value = "Quarterly Total")
DT::datatable(df_Tab1A_Region_UnPivot, options = list(pagelength=5))#df_Tab1A_Region_Combine <- unite(df_Tab1A_Region,
# col = c("Quarter Total-Q1-Q2-Q3","AOS Total-Q1-Q2-Q3","New Arrival Quarter Total-Q1-Q2-Q3"),
# from = c("Total","Total Quarter 1","Total Quarter 2","Total Quarter 3"),c("AOS Total","AOS Quarter 1","AOS Quarter 2","AOS Quarter 3"),c("New Arrival Total","New Arrival Quarter 1","New Arrival Quarter 2","New Arrival Quarter 3"),
# sep = "-")
df_Tab1A_Region_Combine <- unite(df_Tab1A_Region,
col = "Quarter Q1-Q2-Q3",
from = c("Total Quarter 1","Total Quarter 2","Total Quarter 3"),
sep = "-") %>%
unite(col = "AOS Q1-Q2-Q3",
from = c("AOS Quarter 1","AOS Quarter 2","AOS Quarter 3"),
sep = "-") %>%
unite(col = "New Arrival Q1-Q2-Q3",
from = c("New Arrival Quarter 1","New Arrival Quarter 2","New Arrival Quarter 3"),
sep = "-")
DT::datatable(df_Tab1A_Region_Combine, options = list(pagelength=5))df_Tab1A_Region_Separate <- separate(df_Tab1A_Region_Combine,
col = "Quarter Q1-Q2-Q3",
into = c("Total Quarter 1", "Total Quarter 2", "Total Quarter 3"),
sep = "-") %>%
separate(col = "AOS Q1-Q2-Q3",
into = c("AOS Quarter 1", "AOS Quarter 2", "AOS Quarter 3"),
sep = "-") %>%
separate(col = "New Arrival Q1-Q2-Q3",
into = c("New Arrival Quarter 1", "New Arrival Quarter 2", "New Arrival Quarter 3"),
sep = "-")
DT::datatable(df_Tab1A_Region_Separate, options = list(pagelength=5))ggplot(df_Tab1A_Region_Pivot %>% filter(Quarter %in% c("Total Quarter 1", "Total Quarter 2", "Total Quarter 3")), aes(x = Category, y = `Quarterly Total`, fill = Category)) +
geom_bar(position = "stack", stat = "identity") + facet_grid(~ Quarter) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_discrete(limits = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown"), labels = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
scale_x_discrete(breaks = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
ylab("Count")ggplot(df_Tab1A_Region_Pivot %>% filter(Quarter %in% c("AOS Quarter 1", "AOS Quarter 2", "AOS Quarter 3")), aes(x = Category, y = `Quarterly Total`, fill = Category)) +
geom_bar(position = "stack", stat = "identity") + facet_grid(~ Quarter) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_discrete(limits = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown"), labels = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
scale_x_discrete(breaks = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
ylab("Count")ggplot(df_Tab1A_Region_Pivot %>% filter(Quarter %in% c("New Arrival Quarter 1", "New Arrival Quarter 2", "New Arrival Quarter 3")), aes(x = Category, y = `Quarterly Total`, fill = Category)) +
geom_bar(position = "stack", stat = "identity") + facet_grid(~ Quarter) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_discrete(limits = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown"), labels = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
scale_x_discrete(breaks = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
ylab("Count")