1 Introduction

The goal is to read the multi sheet Excel File (FY2018_Q3_Tables_Final_D corrected_0.xlsx) from the following suggestion on discussion board (Forum: Discussion 5: Untidy Data). https://bbhosted.cuny.edu/webapps/discussionboard/do/message?action=list_messages&course_id=_1705328_1&nav=discussion_board&conf_id=_1845527_1&forum_id=_1908779_1&message_id=_31371839_1

This project is gonna focus on two sheets (Table 1A and Table 1B) to load into two data frames and clean up the data. Using tidyr and dplyr functions the data representation would be manipulated into different formats and make the data more presentable and readable.

2 Challenges

3 Pre-Requistes : Available Libraries

  • readxl
  • DT
  • data.table
  • tidyr
  • dplyr

4 Read File into R from local working directory

5 Read XLSX into R

excel_sheets(path = mySourceFile)
## [1] "Table 1A" "Table 1B" "Table 2"  "Table 3"  "Table 4A" "Table 4B"

5.1 Untidy Data Table 1A

5.2 Untidy Data Table 1B

6 Data Scrubbing

6.1 Grab the data description

df_Tab1A_description <- df_Tab1A[1,1]
df_Tab1A_description
## # A tibble: 1 x 1
##   `Table 1A.`                                                              
##   <chr>                                                                    
## 1 PERSONS OBTAINING LAWFUL PERMANENT RESIDENT STATUS BY TYPE OF ADMISSION ~
df_Tab1B_description <- df_Tab1B[1,1]
df_Tab1B_description
## # A tibble: 1 x 1
##   `Table 1B.`                                                              
##   <chr>                                                                    
## 1 PERSONS OBTAINING LAWFUL PERMANENT RESIDENT STATUS BY TYPE AND MAJOR CLA~

6.2 Update the headers

colnames(df_Tab1A) <- c("Category","Total","Total Quarter 1","Total Quarter 2","Total Quarter 3","Total Quarter 4","AOS Total","AOS Quarter 1","AOS Quarter 2","AOS Quarter 3","AOS Quarter 4","New Arrival Total","New Arrival Quarter 1","New Arrival Quarter 2","New Arrival Quarter 3","New Arrival Quarter 4")
colnames(df_Tab1A)
##  [1] "Category"              "Total"                
##  [3] "Total Quarter 1"       "Total Quarter 2"      
##  [5] "Total Quarter 3"       "Total Quarter 4"      
##  [7] "AOS Total"             "AOS Quarter 1"        
##  [9] "AOS Quarter 2"         "AOS Quarter 3"        
## [11] "AOS Quarter 4"         "New Arrival Total"    
## [13] "New Arrival Quarter 1" "New Arrival Quarter 2"
## [15] "New Arrival Quarter 3" "New Arrival Quarter 4"
colnames(df_Tab1B) <- c("Category","Total","Total Quarter 1","Total Quarter 2","Total Quarter 3","Total Quarter 4","AOS Total","AOS Quarter 1","AOS Quarter 2","AOS Quarter 3","AOS Quarter 4","New Arrival Total","New Arrival Quarter 1","New Arrival Quarter 2","New Arrival Quarter 3","New Arrival Quarter 4")
colnames(df_Tab1B)
##  [1] "Category"              "Total"                
##  [3] "Total Quarter 1"       "Total Quarter 2"      
##  [5] "Total Quarter 3"       "Total Quarter 4"      
##  [7] "AOS Total"             "AOS Quarter 1"        
##  [9] "AOS Quarter 2"         "AOS Quarter 3"        
## [11] "AOS Quarter 4"         "New Arrival Total"    
## [13] "New Arrival Quarter 1" "New Arrival Quarter 2"
## [15] "New Arrival Quarter 3" "New Arrival Quarter 4"

6.3 Remove the redundant columns/rows

df_Tab1A <- df_Tab1A[-c(1:4,212:217),!names(df_Tab1A) %in% c("Total Quarter 4","AOS Quarter 4","New Arrival Quarter 4")]
df_Tab1B <- df_Tab1B[-c(1:4,31:36),!names(df_Tab1B) %in% c("Total Quarter 4","AOS Quarter 4","New Arrival Quarter 4")]

6.4 Split the data frames By Category

df_Tab1A_Region <- df_Tab1A[c(2:9),]
df_Tab1A_Region <- df_Tab1A_Region %>% filter(Category != c('Total')) %>% select(-c(Total, `AOS Total`, `New Arrival Total`))
df_Tab1A_Country <- df_Tab1A[c(11:207),]
df_Tab1A_Country <- df_Tab1A_Country %>% filter(Category != c('Total')) %>% select(-c(Total, `AOS Total`, `New Arrival Total`))

7 Show tidy data

7.1 Tidy Data Table 1A Region

DT::datatable(df_Tab1A_Region, options = list(pagelength=5))

7.2 Tidy Data Table 1A Country

DT::datatable(df_Tab1A_Country, options = list(pagelength=5))

7.3 Tidy Data Table 1B

DT::datatable(df_Tab1B, options = list(pagelength=5))

8 Different Variations of Data from Table1 A Region

8.1 Pivot Using Tidyr Gather

df_Tab1A_Region_Pivot <- gather(df_Tab1A_Region,
                                key = "Quarter",
                                value = "Quarterly Total",
                                -Category)
df_Tab1A_Region_Pivot$`Quarterly Total` <- as.numeric(df_Tab1A_Region_Pivot$`Quarterly Total`)
DT::datatable(df_Tab1A_Region_Pivot, options = list(pagelength=5))

8.2 UnPivot Using Tidyr Spread

df_Tab1A_Region_UnPivot <- spread(df_Tab1A_Region_Pivot,
                                  key = "Quarter",
                                  value = "Quarterly Total")
DT::datatable(df_Tab1A_Region_UnPivot, options = list(pagelength=5))

8.3 Combine Columns Using Tidyr Unite

#df_Tab1A_Region_Combine <- unite(df_Tab1A_Region,
#                                 col = c("Quarter Total-Q1-Q2-Q3","AOS Total-Q1-Q2-Q3","New Arrival Quarter Total-Q1-Q2-Q3"),
#                                 from = c("Total","Total Quarter 1","Total Quarter 2","Total Quarter 3"),c("AOS Total","AOS Quarter 1","AOS Quarter 2","AOS Quarter 3"),c("New Arrival Total","New Arrival Quarter 1","New Arrival Quarter 2","New Arrival Quarter 3"),
#                                 sep = "-")
df_Tab1A_Region_Combine <- unite(df_Tab1A_Region,
                                 col = "Quarter Q1-Q2-Q3",
                                 from = c("Total Quarter 1","Total Quarter 2","Total Quarter 3"),
                                 sep = "-") %>%
                           unite(col = "AOS Q1-Q2-Q3",
                                 from = c("AOS Quarter 1","AOS Quarter 2","AOS Quarter 3"),
                                 sep = "-") %>%
                           unite(col = "New Arrival Q1-Q2-Q3",
                                 from = c("New Arrival Quarter 1","New Arrival Quarter 2","New Arrival Quarter 3"),
                                 sep = "-")
                
DT::datatable(df_Tab1A_Region_Combine, options = list(pagelength=5))

8.4 Separate Columns Using Tidyr Separate

df_Tab1A_Region_Separate <- separate(df_Tab1A_Region_Combine,
                                     col = "Quarter Q1-Q2-Q3",
                                     into = c("Total Quarter 1", "Total Quarter 2", "Total Quarter 3"),
                                     sep = "-") %>%
                            separate(col = "AOS Q1-Q2-Q3",
                                     into = c("AOS Quarter 1", "AOS Quarter 2", "AOS Quarter 3"),
                                     sep = "-") %>%
                            separate(col = "New Arrival Q1-Q2-Q3",
                                     into = c("New Arrival Quarter 1", "New Arrival Quarter 2", "New Arrival Quarter 3"),
                                     sep = "-")

DT::datatable(df_Tab1A_Region_Separate, options = list(pagelength=5))

9 Visualizations

ggplot(df_Tab1A_Region_Pivot %>% filter(Quarter %in% c("Total Quarter 1", "Total Quarter 2", "Total Quarter 3")), aes(x = Category, y = `Quarterly Total`, fill = Category)) + 
  geom_bar(position = "stack", stat = "identity") + facet_grid(~ Quarter) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  scale_fill_discrete(limits = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown"), labels = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
  scale_x_discrete(breaks = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
  ylab("Count")

ggplot(df_Tab1A_Region_Pivot %>% filter(Quarter %in% c("AOS Quarter 1", "AOS Quarter 2", "AOS Quarter 3")), aes(x = Category, y = `Quarterly Total`, fill = Category)) + 
  geom_bar(position = "stack", stat = "identity") + facet_grid(~ Quarter) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  scale_fill_discrete(limits = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown"), labels = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
  scale_x_discrete(breaks = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
  ylab("Count")

ggplot(df_Tab1A_Region_Pivot %>% filter(Quarter %in% c("New Arrival Quarter 1", "New Arrival Quarter 2", "New Arrival Quarter 3")), aes(x = Category, y = `Quarterly Total`, fill = Category)) + 
  geom_bar(position = "stack", stat = "identity") + facet_grid(~ Quarter) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  scale_fill_discrete(limits = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown"), labels = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
  scale_x_discrete(breaks = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
  ylab("Count")

10 Conclusion

  • As per the data analysis we can conclude overall more population is getting permanent resident status in USA from North America, primarily due to Mexico
  • In Quarter 1, Asia contributed to getting more permanent resident status
  • In Quarter 3, Asia and North America has same population getting permannet resident status