Project 2 Dataset 2

url <- "https://raw.githubusercontent.com/geeman1209/MSDATA2020/master/DATA607/Project%202/Dataset2/UN_MigrantStockByOriginAndDestination_2015.xlsx"

library(httr)
temp_file <- tempfile(fileext = ".xlsx")
req <- GET(url, 
          # authenticate using GITHUB_PAT
           authenticate(Sys.getenv("GITHUB_PAT"), ""),
          # write result to disk
           write_disk(path = temp_file))
tab <- readxl::read_excel(temp_file, sheet = "Table 16")
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...6
## * ... and 234 more problems

Let’s get a record of all the column names and their data types.

names(tab)
##   [1] "...1"           "...2"           "...3"           "...4"          
##   [5] "United Nations" "...6"           "...7"           "...8"          
##   [9] "...9"           "...10"          "...11"          "...12"         
##  [13] "...13"          "...14"          "...15"          "...16"         
##  [17] "...17"          "...18"          "...19"          "...20"         
##  [21] "...21"          "...22"          "...23"          "...24"         
##  [25] "...25"          "...26"          "...27"          "...28"         
##  [29] "...29"          "...30"          "...31"          "...32"         
##  [33] "...33"          "...34"          "...35"          "...36"         
##  [37] "...37"          "...38"          "...39"          "...40"         
##  [41] "...41"          "...42"          "...43"          "...44"         
##  [45] "...45"          "...46"          "...47"          "...48"         
##  [49] "...49"          "...50"          "...51"          "...52"         
##  [53] "...53"          "...54"          "...55"          "...56"         
##  [57] "...57"          "...58"          "...59"          "...60"         
##  [61] "...61"          "...62"          "...63"          "...64"         
##  [65] "...65"          "...66"          "...67"          "...68"         
##  [69] "...69"          "...70"          "...71"          "...72"         
##  [73] "...73"          "...74"          "...75"          "...76"         
##  [77] "...77"          "...78"          "...79"          "...80"         
##  [81] "...81"          "...82"          "...83"          "...84"         
##  [85] "...85"          "...86"          "...87"          "...88"         
##  [89] "...89"          "...90"          "...91"          "...92"         
##  [93] "...93"          "...94"          "...95"          "...96"         
##  [97] "...97"          "...98"          "...99"          "...100"        
## [101] "...101"         "...102"         "...103"         "...104"        
## [105] "...105"         "...106"         "...107"         "...108"        
## [109] "...109"         "...110"         "...111"         "...112"        
## [113] "...113"         "...114"         "...115"         "...116"        
## [117] "...117"         "...118"         "...119"         "...120"        
## [121] "...121"         "...122"         "...123"         "...124"        
## [125] "...125"         "...126"         "...127"         "...128"        
## [129] "...129"         "...130"         "...131"         "...132"        
## [133] "...133"         "...134"         "...135"         "...136"        
## [137] "...137"         "...138"         "...139"         "...140"        
## [141] "...141"         "...142"         "...143"         "...144"        
## [145] "...145"         "...146"         "...147"         "...148"        
## [149] "...149"         "...150"         "...151"         "...152"        
## [153] "...153"         "...154"         "...155"         "...156"        
## [157] "...157"         "...158"         "...159"         "...160"        
## [161] "...161"         "...162"         "...163"         "...164"        
## [165] "...165"         "...166"         "...167"         "...168"        
## [169] "...169"         "...170"         "...171"         "...172"        
## [173] "...173"         "...174"         "...175"         "...176"        
## [177] "...177"         "...178"         "...179"         "...180"        
## [181] "...181"         "...182"         "...183"         "...184"        
## [185] "...185"         "...186"         "...187"         "...188"        
## [189] "...189"         "...190"         "...191"         "...192"        
## [193] "...193"         "...194"         "...195"         "...196"        
## [197] "...197"         "...198"         "...199"         "...200"        
## [201] "...201"         "...202"         "...203"         "...204"        
## [205] "...205"         "...206"         "...207"         "...208"        
## [209] "...209"         "...210"         "...211"         "...212"        
## [213] "...213"         "...214"         "...215"         "...216"        
## [217] "...217"         "...218"         "...219"         "...220"        
## [221] "...221"         "...222"         "...223"         "...224"        
## [225] "...225"         "...226"         "...227"         "...228"        
## [229] "...229"         "...230"         "...231"         "...232"        
## [233] "...233"         "...234"         "...235"         "...236"        
## [237] "...237"         "...238"         "...239"         "...240"
str(tab)
## Classes 'tbl_df', 'tbl' and 'data.frame':    276 obs. of  240 variables:
##  $ ...1          : chr  NA NA NA NA ...
##  $ ...2          : chr  NA NA NA NA ...
##  $ ...3          : chr  NA NA NA NA ...
##  $ ...4          : chr  NA NA NA NA ...
##  $ United Nations: chr  "Population Division" "Department of Economic and Social Affairs" NA "Trends in International Migrant Stock: Migrants by Destination and Origin" ...
##  $ ...6          : chr  NA NA NA NA ...
##  $ ...7          : chr  NA NA NA NA ...
##  $ ...8          : chr  NA NA NA NA ...
##  $ ...9          : chr  NA NA NA NA ...
##  $ ...10         : chr  NA NA NA NA ...
##  $ ...11         : chr  NA NA NA NA ...
##  $ ...12         : chr  NA NA NA NA ...
##  $ ...13         : chr  NA NA NA NA ...
##  $ ...14         : chr  NA NA NA NA ...
##  $ ...15         : chr  NA NA NA NA ...
##  $ ...16         : chr  NA NA NA NA ...
##  $ ...17         : chr  NA NA NA NA ...
##  $ ...18         : chr  NA NA NA NA ...
##  $ ...19         : chr  NA NA NA NA ...
##  $ ...20         : chr  NA NA NA NA ...
##  $ ...21         : chr  NA NA NA NA ...
##  $ ...22         : chr  NA NA NA NA ...
##  $ ...23         : chr  NA NA NA NA ...
##  $ ...24         : chr  NA NA NA NA ...
##  $ ...25         : chr  NA NA NA NA ...
##  $ ...26         : chr  NA NA NA NA ...
##  $ ...27         : chr  NA NA NA NA ...
##  $ ...28         : chr  NA NA NA NA ...
##  $ ...29         : chr  NA NA NA NA ...
##  $ ...30         : chr  NA NA NA NA ...
##  $ ...31         : chr  NA NA NA NA ...
##  $ ...32         : chr  NA NA NA NA ...
##  $ ...33         : chr  NA NA NA NA ...
##  $ ...34         : chr  NA NA NA NA ...
##  $ ...35         : chr  NA NA NA NA ...
##  $ ...36         : chr  NA NA NA NA ...
##  $ ...37         : chr  NA NA NA NA ...
##  $ ...38         : chr  NA NA NA NA ...
##  $ ...39         : chr  NA NA NA NA ...
##  $ ...40         : chr  NA NA NA NA ...
##  $ ...41         : chr  NA NA NA NA ...
##  $ ...42         : chr  NA NA NA NA ...
##  $ ...43         : chr  NA NA NA NA ...
##  $ ...44         : chr  NA NA NA NA ...
##  $ ...45         : chr  NA NA NA NA ...
##  $ ...46         : chr  NA NA NA NA ...
##  $ ...47         : chr  NA NA NA NA ...
##  $ ...48         : chr  NA NA NA NA ...
##  $ ...49         : chr  NA NA NA NA ...
##  $ ...50         : chr  NA NA NA NA ...
##  $ ...51         : chr  NA NA NA NA ...
##  $ ...52         : chr  NA NA NA NA ...
##  $ ...53         : chr  NA NA NA NA ...
##  $ ...54         : chr  NA NA NA NA ...
##  $ ...55         : chr  NA NA NA NA ...
##  $ ...56         : chr  NA NA NA NA ...
##  $ ...57         : chr  NA NA NA NA ...
##  $ ...58         : chr  NA NA NA NA ...
##  $ ...59         : chr  NA NA NA NA ...
##  $ ...60         : chr  NA NA NA NA ...
##  $ ...61         : chr  NA NA NA NA ...
##  $ ...62         : chr  NA NA NA NA ...
##  $ ...63         : chr  NA NA NA NA ...
##  $ ...64         : chr  NA NA NA NA ...
##  $ ...65         : chr  NA NA NA NA ...
##  $ ...66         : chr  NA NA NA NA ...
##  $ ...67         : chr  NA NA NA NA ...
##  $ ...68         : chr  NA NA NA NA ...
##  $ ...69         : chr  NA NA NA NA ...
##  $ ...70         : chr  NA NA NA NA ...
##  $ ...71         : chr  NA NA NA NA ...
##  $ ...72         : chr  NA NA NA NA ...
##  $ ...73         : chr  NA NA NA NA ...
##  $ ...74         : chr  NA NA NA NA ...
##  $ ...75         : chr  NA NA NA NA ...
##  $ ...76         : chr  NA NA NA NA ...
##  $ ...77         : chr  NA NA NA NA ...
##  $ ...78         : chr  NA NA NA NA ...
##  $ ...79         : chr  NA NA NA NA ...
##  $ ...80         : chr  NA NA NA NA ...
##  $ ...81         : chr  NA NA NA NA ...
##  $ ...82         : chr  NA NA NA NA ...
##  $ ...83         : chr  NA NA NA NA ...
##  $ ...84         : chr  NA NA NA NA ...
##  $ ...85         : chr  NA NA NA NA ...
##  $ ...86         : chr  NA NA NA NA ...
##  $ ...87         : chr  NA NA NA NA ...
##  $ ...88         : chr  NA NA NA NA ...
##  $ ...89         : chr  NA NA NA NA ...
##  $ ...90         : chr  NA NA NA NA ...
##  $ ...91         : chr  NA NA NA NA ...
##  $ ...92         : chr  NA NA NA NA ...
##  $ ...93         : chr  NA NA NA NA ...
##  $ ...94         : chr  NA NA NA NA ...
##  $ ...95         : chr  NA NA NA NA ...
##  $ ...96         : chr  NA NA NA NA ...
##  $ ...97         : chr  NA NA NA NA ...
##  $ ...98         : chr  NA NA NA NA ...
##  $ ...99         : chr  NA NA NA NA ...
##   [list output truncated]

All of the column names are messed up and need to be fixed.

First, let’s get rid of the first 10 rows, since they serve no purpose. Then let’s make the leftover first row into column names and then delete the first row.

Rename the first 5 columns

Filter out the regions and filter out data for just countries

cleanRow <- tab[-c(1:10),]

colnames(cleanRow) <- as.character(unlist(cleanRow[1,]))

cleanRow = cleanRow[-1,]

colnames(cleanRow)[1:5] <- c("Sort_Order", "Dest_Country", "Notes", "Country_Code", "Data_Type")

cleanRow2 <- cleanRow %>% filter(is.na(Data_Type))

cleanRow3 <- cleanRow %>% filter(!is.na(Data_Type))

Delete the other unncessary columns from the Data Frame

Then lets use na.rm = TRUE to handle all NA values and calculate total migrants in the fourth column called migrants.

CountryDF <- cleanRow3 %>% select(-Sort_Order, -Notes, -Data_Type, -Total, -starts_with("Other"))

CountryDF2 <- CountryDF %>% gather(origin_Country, migrants, Afghanistan:Zimbabwe, na.rm = TRUE)

CountryDF2$migrants <- as.numeric(CountryDF2$migrants)

glimpse(CountryDF2)
## Observations: 11,228
## Variables: 4
## $ Dest_Country   <chr> "Egypt", "Libya", "Namibia", "South Africa", "Tajiki...
## $ Country_Code   <chr> "818", "434", "516", "710", "762", "458", "608", "76...
## $ origin_Country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghan...
## $ migrants       <dbl> 235, 320, 39, 83, 7587, 498, 1220, 1, 8086, 2348369,...

Let’s know isolate the cleaned data frame further to gather information pertaining to the United States.

Dest_USA <- CountryDF2 %>% filter(Dest_Country == "United States of America")

Exit_USA <- CountryDF2 %>% filter(origin_Country == "United States of America")

Graphs

Had difficulty using ggplot to display all the country names. Used the plot_ly library to display the data.

Dest_USA <- Dest_USA %>% group_by(Dest_Country) %>% arrange(desc(migrants))
Dest_USA
## # A tibble: 150 x 4
## # Groups:   Dest_Country [1]
##    Dest_Country             Country_Code origin_Country     migrants
##    <chr>                    <chr>        <chr>                 <dbl>
##  1 United States of America 840          Mexico             12050031
##  2 United States of America 840          China               2103551
##  3 United States of America 840          India               1969286
##  4 United States of America 840          Philippines         1896031
##  5 United States of America 840          Puerto Rico         1744402
##  6 United States of America 840          Viet Nam            1302870
##  7 United States of America 840          El Salvador         1276489
##  8 United States of America 840          Cuba                1131284
##  9 United States of America 840          Republic of Korea   1119578
## 10 United States of America 840          Dominican Republic   940874
## # ... with 140 more rows
Exit_USA <- Exit_USA %>% group_by(Dest_Country) %>% arrange(desc(migrants))
Exit_USA
## # A tibble: 161 x 4
## # Groups:   Dest_Country [161]
##    Dest_Country                        Country_Code origin_Country      migrants
##    <chr>                               <chr>        <chr>                  <dbl>
##  1 Mexico                              484          United States of A~   876528
##  2 Canada                              124          United States of A~   343252
##  3 United Kingdom of Great Britain an~ 826          United States of A~   212150
##  4 Puerto Rico                         630          United States of A~   165147
##  5 Germany                             276          United States of A~   137575
##  6 Australia                           36           United States of A~   110643
##  7 Israel                              376          United States of A~    79082
##  8 Republic of Korea                   410          United States of A~    68784
##  9 Italy                               380          United States of A~    54226
## 10 France                              250          United States of A~    53906
## # ... with 151 more rows
ggplot(Dest_USA, aes(x = origin_Country, y = migrants)) + geom_point() + scale_y_log10()

ggplot(Dest_USA, aes(x=origin_Country, y=migrants)) + 
  geom_bar(stat='identity') + theme(axis.text.x=element_text(angle=90, hjust=1)) + scale_y_log10()

library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:httr':
## 
##     config
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
Dest <- plot_ly(x = Dest_USA$origin_Country, y = Dest_USA$migrants, width = 1000, height=300, type = 'bar', mode = 'markers')

Dest <- Dest %>% layout(autosize = TRUE)

Dest
## Warning: 'bar' objects don't have these attributes: 'mode'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'selectedpoints', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'texttemplate', 'hovertext', 'hovertemplate', 'textposition', 'insidetextanchor', 'textangle', 'textfont', 'insidetextfont', 'outsidetextfont', 'constraintext', 'cliponaxis', 'orientation', 'base', 'offset', 'width', 'marker', 'offsetgroup', 'alignmentgroup', 'selected', 'unselected', 'r', 't', '_deprecated', 'error_x', 'error_y', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'texttemplatesrc', 'hovertextsrc', 'hovertemplatesrc', 'textpositionsrc', 'basesrc', 'offsetsrc', 'widthsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
Exit <- plot_ly(data = Exit_USA, x = ~Dest_Country, y = ~migrants)
Exit
## No trace type specified:
##   Based on info supplied, a 'bar' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#bar

Conclusion

The country with the greatest number of immigrants to the United States is Mexico, while Americans are leaving to Mexico, Canada, the United Kingdom and Puerto Rico (even though Puerto Rico is a commonwealth of the United States). However, there is a signifcant greater number of immmigrants versus Americans leaving the country. The number of Mexican immigrants are 12million while the number of Americans leaving to Mexico is approximately 800,000.