url <- "https://raw.githubusercontent.com/geeman1209/MSDATA2020/master/DATA607/Project%202/Dataset2/UN_MigrantStockByOriginAndDestination_2015.xlsx"
library(httr)
temp_file <- tempfile(fileext = ".xlsx")
req <- GET(url,
# authenticate using GITHUB_PAT
authenticate(Sys.getenv("GITHUB_PAT"), ""),
# write result to disk
write_disk(path = temp_file))
tab <- readxl::read_excel(temp_file, sheet = "Table 16")
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...6
## * ... and 234 more problems
Let’s get a record of all the column names and their data types.
names(tab)
## [1] "...1" "...2" "...3" "...4"
## [5] "United Nations" "...6" "...7" "...8"
## [9] "...9" "...10" "...11" "...12"
## [13] "...13" "...14" "...15" "...16"
## [17] "...17" "...18" "...19" "...20"
## [21] "...21" "...22" "...23" "...24"
## [25] "...25" "...26" "...27" "...28"
## [29] "...29" "...30" "...31" "...32"
## [33] "...33" "...34" "...35" "...36"
## [37] "...37" "...38" "...39" "...40"
## [41] "...41" "...42" "...43" "...44"
## [45] "...45" "...46" "...47" "...48"
## [49] "...49" "...50" "...51" "...52"
## [53] "...53" "...54" "...55" "...56"
## [57] "...57" "...58" "...59" "...60"
## [61] "...61" "...62" "...63" "...64"
## [65] "...65" "...66" "...67" "...68"
## [69] "...69" "...70" "...71" "...72"
## [73] "...73" "...74" "...75" "...76"
## [77] "...77" "...78" "...79" "...80"
## [81] "...81" "...82" "...83" "...84"
## [85] "...85" "...86" "...87" "...88"
## [89] "...89" "...90" "...91" "...92"
## [93] "...93" "...94" "...95" "...96"
## [97] "...97" "...98" "...99" "...100"
## [101] "...101" "...102" "...103" "...104"
## [105] "...105" "...106" "...107" "...108"
## [109] "...109" "...110" "...111" "...112"
## [113] "...113" "...114" "...115" "...116"
## [117] "...117" "...118" "...119" "...120"
## [121] "...121" "...122" "...123" "...124"
## [125] "...125" "...126" "...127" "...128"
## [129] "...129" "...130" "...131" "...132"
## [133] "...133" "...134" "...135" "...136"
## [137] "...137" "...138" "...139" "...140"
## [141] "...141" "...142" "...143" "...144"
## [145] "...145" "...146" "...147" "...148"
## [149] "...149" "...150" "...151" "...152"
## [153] "...153" "...154" "...155" "...156"
## [157] "...157" "...158" "...159" "...160"
## [161] "...161" "...162" "...163" "...164"
## [165] "...165" "...166" "...167" "...168"
## [169] "...169" "...170" "...171" "...172"
## [173] "...173" "...174" "...175" "...176"
## [177] "...177" "...178" "...179" "...180"
## [181] "...181" "...182" "...183" "...184"
## [185] "...185" "...186" "...187" "...188"
## [189] "...189" "...190" "...191" "...192"
## [193] "...193" "...194" "...195" "...196"
## [197] "...197" "...198" "...199" "...200"
## [201] "...201" "...202" "...203" "...204"
## [205] "...205" "...206" "...207" "...208"
## [209] "...209" "...210" "...211" "...212"
## [213] "...213" "...214" "...215" "...216"
## [217] "...217" "...218" "...219" "...220"
## [221] "...221" "...222" "...223" "...224"
## [225] "...225" "...226" "...227" "...228"
## [229] "...229" "...230" "...231" "...232"
## [233] "...233" "...234" "...235" "...236"
## [237] "...237" "...238" "...239" "...240"
str(tab)
## Classes 'tbl_df', 'tbl' and 'data.frame': 276 obs. of 240 variables:
## $ ...1 : chr NA NA NA NA ...
## $ ...2 : chr NA NA NA NA ...
## $ ...3 : chr NA NA NA NA ...
## $ ...4 : chr NA NA NA NA ...
## $ United Nations: chr "Population Division" "Department of Economic and Social Affairs" NA "Trends in International Migrant Stock: Migrants by Destination and Origin" ...
## $ ...6 : chr NA NA NA NA ...
## $ ...7 : chr NA NA NA NA ...
## $ ...8 : chr NA NA NA NA ...
## $ ...9 : chr NA NA NA NA ...
## $ ...10 : chr NA NA NA NA ...
## $ ...11 : chr NA NA NA NA ...
## $ ...12 : chr NA NA NA NA ...
## $ ...13 : chr NA NA NA NA ...
## $ ...14 : chr NA NA NA NA ...
## $ ...15 : chr NA NA NA NA ...
## $ ...16 : chr NA NA NA NA ...
## $ ...17 : chr NA NA NA NA ...
## $ ...18 : chr NA NA NA NA ...
## $ ...19 : chr NA NA NA NA ...
## $ ...20 : chr NA NA NA NA ...
## $ ...21 : chr NA NA NA NA ...
## $ ...22 : chr NA NA NA NA ...
## $ ...23 : chr NA NA NA NA ...
## $ ...24 : chr NA NA NA NA ...
## $ ...25 : chr NA NA NA NA ...
## $ ...26 : chr NA NA NA NA ...
## $ ...27 : chr NA NA NA NA ...
## $ ...28 : chr NA NA NA NA ...
## $ ...29 : chr NA NA NA NA ...
## $ ...30 : chr NA NA NA NA ...
## $ ...31 : chr NA NA NA NA ...
## $ ...32 : chr NA NA NA NA ...
## $ ...33 : chr NA NA NA NA ...
## $ ...34 : chr NA NA NA NA ...
## $ ...35 : chr NA NA NA NA ...
## $ ...36 : chr NA NA NA NA ...
## $ ...37 : chr NA NA NA NA ...
## $ ...38 : chr NA NA NA NA ...
## $ ...39 : chr NA NA NA NA ...
## $ ...40 : chr NA NA NA NA ...
## $ ...41 : chr NA NA NA NA ...
## $ ...42 : chr NA NA NA NA ...
## $ ...43 : chr NA NA NA NA ...
## $ ...44 : chr NA NA NA NA ...
## $ ...45 : chr NA NA NA NA ...
## $ ...46 : chr NA NA NA NA ...
## $ ...47 : chr NA NA NA NA ...
## $ ...48 : chr NA NA NA NA ...
## $ ...49 : chr NA NA NA NA ...
## $ ...50 : chr NA NA NA NA ...
## $ ...51 : chr NA NA NA NA ...
## $ ...52 : chr NA NA NA NA ...
## $ ...53 : chr NA NA NA NA ...
## $ ...54 : chr NA NA NA NA ...
## $ ...55 : chr NA NA NA NA ...
## $ ...56 : chr NA NA NA NA ...
## $ ...57 : chr NA NA NA NA ...
## $ ...58 : chr NA NA NA NA ...
## $ ...59 : chr NA NA NA NA ...
## $ ...60 : chr NA NA NA NA ...
## $ ...61 : chr NA NA NA NA ...
## $ ...62 : chr NA NA NA NA ...
## $ ...63 : chr NA NA NA NA ...
## $ ...64 : chr NA NA NA NA ...
## $ ...65 : chr NA NA NA NA ...
## $ ...66 : chr NA NA NA NA ...
## $ ...67 : chr NA NA NA NA ...
## $ ...68 : chr NA NA NA NA ...
## $ ...69 : chr NA NA NA NA ...
## $ ...70 : chr NA NA NA NA ...
## $ ...71 : chr NA NA NA NA ...
## $ ...72 : chr NA NA NA NA ...
## $ ...73 : chr NA NA NA NA ...
## $ ...74 : chr NA NA NA NA ...
## $ ...75 : chr NA NA NA NA ...
## $ ...76 : chr NA NA NA NA ...
## $ ...77 : chr NA NA NA NA ...
## $ ...78 : chr NA NA NA NA ...
## $ ...79 : chr NA NA NA NA ...
## $ ...80 : chr NA NA NA NA ...
## $ ...81 : chr NA NA NA NA ...
## $ ...82 : chr NA NA NA NA ...
## $ ...83 : chr NA NA NA NA ...
## $ ...84 : chr NA NA NA NA ...
## $ ...85 : chr NA NA NA NA ...
## $ ...86 : chr NA NA NA NA ...
## $ ...87 : chr NA NA NA NA ...
## $ ...88 : chr NA NA NA NA ...
## $ ...89 : chr NA NA NA NA ...
## $ ...90 : chr NA NA NA NA ...
## $ ...91 : chr NA NA NA NA ...
## $ ...92 : chr NA NA NA NA ...
## $ ...93 : chr NA NA NA NA ...
## $ ...94 : chr NA NA NA NA ...
## $ ...95 : chr NA NA NA NA ...
## $ ...96 : chr NA NA NA NA ...
## $ ...97 : chr NA NA NA NA ...
## $ ...98 : chr NA NA NA NA ...
## $ ...99 : chr NA NA NA NA ...
## [list output truncated]
All of the column names are messed up and need to be fixed.
First, let’s get rid of the first 10 rows, since they serve no purpose. Then let’s make the leftover first row into column names and then delete the first row.
Rename the first 5 columns
Filter out the regions and filter out data for just countries
cleanRow <- tab[-c(1:10),]
colnames(cleanRow) <- as.character(unlist(cleanRow[1,]))
cleanRow = cleanRow[-1,]
colnames(cleanRow)[1:5] <- c("Sort_Order", "Dest_Country", "Notes", "Country_Code", "Data_Type")
cleanRow2 <- cleanRow %>% filter(is.na(Data_Type))
cleanRow3 <- cleanRow %>% filter(!is.na(Data_Type))
Delete the other unncessary columns from the Data Frame
Then lets use na.rm = TRUE to handle all NA values and calculate total migrants in the fourth column called migrants.
CountryDF <- cleanRow3 %>% select(-Sort_Order, -Notes, -Data_Type, -Total, -starts_with("Other"))
CountryDF2 <- CountryDF %>% gather(origin_Country, migrants, Afghanistan:Zimbabwe, na.rm = TRUE)
CountryDF2$migrants <- as.numeric(CountryDF2$migrants)
glimpse(CountryDF2)
## Observations: 11,228
## Variables: 4
## $ Dest_Country <chr> "Egypt", "Libya", "Namibia", "South Africa", "Tajiki...
## $ Country_Code <chr> "818", "434", "516", "710", "762", "458", "608", "76...
## $ origin_Country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghan...
## $ migrants <dbl> 235, 320, 39, 83, 7587, 498, 1220, 1, 8086, 2348369,...
Let’s know isolate the cleaned data frame further to gather information pertaining to the United States.
Dest_USA <- CountryDF2 %>% filter(Dest_Country == "United States of America")
Exit_USA <- CountryDF2 %>% filter(origin_Country == "United States of America")
Had difficulty using ggplot to display all the country names. Used the plot_ly library to display the data.
Dest_USA <- Dest_USA %>% group_by(Dest_Country) %>% arrange(desc(migrants))
Dest_USA
## # A tibble: 150 x 4
## # Groups: Dest_Country [1]
## Dest_Country Country_Code origin_Country migrants
## <chr> <chr> <chr> <dbl>
## 1 United States of America 840 Mexico 12050031
## 2 United States of America 840 China 2103551
## 3 United States of America 840 India 1969286
## 4 United States of America 840 Philippines 1896031
## 5 United States of America 840 Puerto Rico 1744402
## 6 United States of America 840 Viet Nam 1302870
## 7 United States of America 840 El Salvador 1276489
## 8 United States of America 840 Cuba 1131284
## 9 United States of America 840 Republic of Korea 1119578
## 10 United States of America 840 Dominican Republic 940874
## # ... with 140 more rows
Exit_USA <- Exit_USA %>% group_by(Dest_Country) %>% arrange(desc(migrants))
Exit_USA
## # A tibble: 161 x 4
## # Groups: Dest_Country [161]
## Dest_Country Country_Code origin_Country migrants
## <chr> <chr> <chr> <dbl>
## 1 Mexico 484 United States of A~ 876528
## 2 Canada 124 United States of A~ 343252
## 3 United Kingdom of Great Britain an~ 826 United States of A~ 212150
## 4 Puerto Rico 630 United States of A~ 165147
## 5 Germany 276 United States of A~ 137575
## 6 Australia 36 United States of A~ 110643
## 7 Israel 376 United States of A~ 79082
## 8 Republic of Korea 410 United States of A~ 68784
## 9 Italy 380 United States of A~ 54226
## 10 France 250 United States of A~ 53906
## # ... with 151 more rows
ggplot(Dest_USA, aes(x = origin_Country, y = migrants)) + geom_point() + scale_y_log10()
ggplot(Dest_USA, aes(x=origin_Country, y=migrants)) +
geom_bar(stat='identity') + theme(axis.text.x=element_text(angle=90, hjust=1)) + scale_y_log10()
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:httr':
##
## config
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Dest <- plot_ly(x = Dest_USA$origin_Country, y = Dest_USA$migrants, width = 1000, height=300, type = 'bar', mode = 'markers')
Dest <- Dest %>% layout(autosize = TRUE)
Dest
## Warning: 'bar' objects don't have these attributes: 'mode'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'selectedpoints', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'texttemplate', 'hovertext', 'hovertemplate', 'textposition', 'insidetextanchor', 'textangle', 'textfont', 'insidetextfont', 'outsidetextfont', 'constraintext', 'cliponaxis', 'orientation', 'base', 'offset', 'width', 'marker', 'offsetgroup', 'alignmentgroup', 'selected', 'unselected', 'r', 't', '_deprecated', 'error_x', 'error_y', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'texttemplatesrc', 'hovertextsrc', 'hovertemplatesrc', 'textpositionsrc', 'basesrc', 'offsetsrc', 'widthsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
Exit <- plot_ly(data = Exit_USA, x = ~Dest_Country, y = ~migrants)
Exit
## No trace type specified:
## Based on info supplied, a 'bar' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#bar
The country with the greatest number of immigrants to the United States is Mexico, while Americans are leaving to Mexico, Canada, the United Kingdom and Puerto Rico (even though Puerto Rico is a commonwealth of the United States). However, there is a signifcant greater number of immmigrants versus Americans leaving the country. The number of Mexican immigrants are 12million while the number of Americans leaving to Mexico is approximately 800,000.