1 Introduction

The goal is to read the multi sheet Excel File (FY2018_Q3_Tables_Final_D corrected_0.xlsx) from the following suggestion on discussion board (Forum: Discussion 5: Untidy Data). https://bbhosted.cuny.edu/webapps/discussionboard/do/message?action=list_messages&course_id=_1705328_1&nav=discussion_board&conf_id=_1845527_1&forum_id=_1908779_1&message_id=_31371839_1

This project is gonna focus on two sheets (Table 1A and Table 1B) to load into two data frames and clean up the data. Using tidyr and dplyr functions the data representation would be manipulated into different formats and make the data more presentable and readable.

2 Challenges

Need to load the xlsx file with muiltiple worksheets from source http://www.hassavocadoboard.com/retail/volume-and-price-data
Cleanup the data from two sheets to make it readable and presentable for analysis
Analyse a subset of the relevant data set for inference

3 Pre-Requistes : Available Libraries

readxl
DT
data.table
tidyr
dplyr

4 Read File into R from local working directory

5 Read XLSX into R

excel_sheets(path = mySourceFile)

## [1] "Table 1A" "Table 1B" "Table 2"  "Table 3"  "Table 4A" "Table 4B"

5.1 Untidy Data Table 1A

5.2 Untidy Data Table 1B

6 Data Scrubbing

6.1 Grab the data description

df_Tab1A_description <- df_Tab1A[1,1]
df_Tab1A_description

## # A tibble: 1 x 1
##   `Table 1A.`                                                              
##   <chr>                                                                    
## 1 PERSONS OBTAINING LAWFUL PERMANENT RESIDENT STATUS BY TYPE OF ADMISSION ~

df_Tab1B_description <- df_Tab1B[1,1]
df_Tab1B_description

## # A tibble: 1 x 1
##   `Table 1B.`                                                              
##   <chr>                                                                    
## 1 PERSONS OBTAINING LAWFUL PERMANENT RESIDENT STATUS BY TYPE AND MAJOR CLA~

6.2 Update the headers

colnames(df_Tab1A) <- c("Category","Total","Total Quarter 1","Total Quarter 2","Total Quarter 3","Total Quarter 4","AOS Total","AOS Quarter 1","AOS Quarter 2","AOS Quarter 3","AOS Quarter 4","New Arrival Total","New Arrival Quarter 1","New Arrival Quarter 2","New Arrival Quarter 3","New Arrival Quarter 4")
colnames(df_Tab1A)

##  [1] "Category"              "Total"                
##  [3] "Total Quarter 1"       "Total Quarter 2"      
##  [5] "Total Quarter 3"       "Total Quarter 4"      
##  [7] "AOS Total"             "AOS Quarter 1"        
##  [9] "AOS Quarter 2"         "AOS Quarter 3"        
## [11] "AOS Quarter 4"         "New Arrival Total"    
## [13] "New Arrival Quarter 1" "New Arrival Quarter 2"
## [15] "New Arrival Quarter 3" "New Arrival Quarter 4"

colnames(df_Tab1B) <- c("Category","Total","Total Quarter 1","Total Quarter 2","Total Quarter 3","Total Quarter 4","AOS Total","AOS Quarter 1","AOS Quarter 2","AOS Quarter 3","AOS Quarter 4","New Arrival Total","New Arrival Quarter 1","New Arrival Quarter 2","New Arrival Quarter 3","New Arrival Quarter 4")
colnames(df_Tab1B)

##  [1] "Category"              "Total"                
##  [3] "Total Quarter 1"       "Total Quarter 2"      
##  [5] "Total Quarter 3"       "Total Quarter 4"      
##  [7] "AOS Total"             "AOS Quarter 1"        
##  [9] "AOS Quarter 2"         "AOS Quarter 3"        
## [11] "AOS Quarter 4"         "New Arrival Total"    
## [13] "New Arrival Quarter 1" "New Arrival Quarter 2"
## [15] "New Arrival Quarter 3" "New Arrival Quarter 4"

6.3 Remove the redundant columns/rows

df_Tab1A <- df_Tab1A[-c(1:4,212:217),!names(df_Tab1A) %in% c("Total Quarter 4","AOS Quarter 4","New Arrival Quarter 4")]
df_Tab1B <- df_Tab1B[-c(1:4,31:36),!names(df_Tab1B) %in% c("Total Quarter 4","AOS Quarter 4","New Arrival Quarter 4")]

6.4 Split the data frames By Category

df_Tab1A_Region <- df_Tab1A[c(2:9),]
df_Tab1A_Region <- df_Tab1A_Region %>% filter(Category != c('Total')) %>% select(-c(Total, `AOS Total`, `New Arrival Total`))
df_Tab1A_Country <- df_Tab1A[c(11:207),]
df_Tab1A_Country <- df_Tab1A_Country %>% filter(Category != c('Total')) %>% select(-c(Total, `AOS Total`, `New Arrival Total`))

7 Show tidy data

7.1 Tidy Data Table 1A Region

DT::datatable(df_Tab1A_Region, options = list(pagelength=5))

7.2 Tidy Data Table 1A Country

DT::datatable(df_Tab1A_Country, options = list(pagelength=5))

7.3 Tidy Data Table 1B

DT::datatable(df_Tab1B, options = list(pagelength=5))

8 Different Variations of Data from Table1 A Region

8.1 Pivot Using Tidyr Gather

df_Tab1A_Region_Pivot <- gather(df_Tab1A_Region,
                                key = "Quarter",
                                value = "Quarterly Total",
                                -Category)
df_Tab1A_Region_Pivot$`Quarterly Total` <- as.numeric(df_Tab1A_Region_Pivot$`Quarterly Total`)
DT::datatable(df_Tab1A_Region_Pivot, options = list(pagelength=5))

8.2 UnPivot Using Tidyr Spread

df_Tab1A_Region_UnPivot <- spread(df_Tab1A_Region_Pivot,
                                  key = "Quarter",
                                  value = "Quarterly Total")
DT::datatable(df_Tab1A_Region_UnPivot, options = list(pagelength=5))

8.3 Combine Columns Using Tidyr Unite

#df_Tab1A_Region_Combine <- unite(df_Tab1A_Region,
#                                 col = c("Quarter Total-Q1-Q2-Q3","AOS Total-Q1-Q2-Q3","New Arrival Quarter Total-Q1-Q2-Q3"),
#                                 from = c("Total","Total Quarter 1","Total Quarter 2","Total Quarter 3"),c("AOS Total","AOS Quarter 1","AOS Quarter 2","AOS Quarter 3"),c("New Arrival Total","New Arrival Quarter 1","New Arrival Quarter 2","New Arrival Quarter 3"),
#                                 sep = "-")
df_Tab1A_Region_Combine <- unite(df_Tab1A_Region,
                                 col = "Quarter Q1-Q2-Q3",
                                 from = c("Total Quarter 1","Total Quarter 2","Total Quarter 3"),
                                 sep = "-") %>%
                           unite(col = "AOS Q1-Q2-Q3",
                                 from = c("AOS Quarter 1","AOS Quarter 2","AOS Quarter 3"),
                                 sep = "-") %>%
                           unite(col = "New Arrival Q1-Q2-Q3",
                                 from = c("New Arrival Quarter 1","New Arrival Quarter 2","New Arrival Quarter 3"),
                                 sep = "-")
                
DT::datatable(df_Tab1A_Region_Combine, options = list(pagelength=5))

8.4 Separate Columns Using Tidyr Separate

df_Tab1A_Region_Separate <- separate(df_Tab1A_Region_Combine,
                                     col = "Quarter Q1-Q2-Q3",
                                     into = c("Total Quarter 1", "Total Quarter 2", "Total Quarter 3"),
                                     sep = "-") %>%
                            separate(col = "AOS Q1-Q2-Q3",
                                     into = c("AOS Quarter 1", "AOS Quarter 2", "AOS Quarter 3"),
                                     sep = "-") %>%
                            separate(col = "New Arrival Q1-Q2-Q3",
                                     into = c("New Arrival Quarter 1", "New Arrival Quarter 2", "New Arrival Quarter 3"),
                                     sep = "-")

DT::datatable(df_Tab1A_Region_Separate, options = list(pagelength=5))

9 Visualizations

ggplot(df_Tab1A_Region_Pivot %>% filter(Quarter %in% c("Total Quarter 1", "Total Quarter 2", "Total Quarter 3")), aes(x = Category, y = `Quarterly Total`, fill = Category)) + 
  geom_bar(position = "stack", stat = "identity") + facet_grid(~ Quarter) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  scale_fill_discrete(limits = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown"), labels = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
  scale_x_discrete(breaks = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
  ylab("Count")

ggplot(df_Tab1A_Region_Pivot %>% filter(Quarter %in% c("AOS Quarter 1", "AOS Quarter 2", "AOS Quarter 3")), aes(x = Category, y = `Quarterly Total`, fill = Category)) + 
  geom_bar(position = "stack", stat = "identity") + facet_grid(~ Quarter) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  scale_fill_discrete(limits = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown"), labels = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
  scale_x_discrete(breaks = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
  ylab("Count")

ggplot(df_Tab1A_Region_Pivot %>% filter(Quarter %in% c("New Arrival Quarter 1", "New Arrival Quarter 2", "New Arrival Quarter 3")), aes(x = Category, y = `Quarterly Total`, fill = Category)) + 
  geom_bar(position = "stack", stat = "identity") + facet_grid(~ Quarter) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  scale_fill_discrete(limits = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown"), labels = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
  scale_x_discrete(breaks = c("Africa", "Asia", "Europe", "North America", "Oceania", "South America", "Unknown")) +
  ylab("Count")

10 Conclusion

As per the data analysis we can conclude overall more population is getting permanent resident status in USA from North America, primarily due to Mexico
In Quarter 1, Asia contributed to getting more permanent resident status
In Quarter 3, Asia and North America has same population getting permannet resident status

DATA 607 02 [15961] : Week 6 - Data Transformations [03/04 - 03/10] Project 2.1

IS 607 - Project 2.1 - Legal Immigration and Adjustment of Status Report FY 2018, Q3

Debabrata Kabiraj

March 09, 2019