#Introduction

Below is an exporatory analysis of human migration information gathered by the United Nations. The goal here is look at the growth in migration by region and by sex. Hopefully we can identify regions where the migration has been most spuratic. The data is provided in a wide format, which may be visually appealing, but we will need to elongate the data and mutate as nessesary to run analysis.

Clean Data

library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
immigration_data_raw <- read.csv("UN_MigrantStockTotal_2019.csv", na.strings=c("","NA"), encoding = "UTF-8")

Remove unnessesary headers.

immigration_data <- immigration_data_raw[-c(1:13),]

Relevant variable info is found in row 1 but we need to repeat in accross the relevant columns.

immigration_data[c(1), ] <- immigration_data[c(1), ] %>%
  t()%>%
 as.data.frame()%>%
 rename(temp = "14")%>%
 fill(colnames(.))%>%
  t()

Combine the variable information in the first two row and apply them to the column headers.

colnames(immigration_data) <- paste(immigration_data[c(1), ],immigration_data[c(2), ], sep = ";", collapse = NULL, recycle0 = TRUE)

Remove the first two rows that contain duplicate header information.

immigration_data <- immigration_data[-c(1:2),]

The data includes categorical variable information that breaks up the data horizanotaly, describing the category of population that follows below. Since these “sub-headers” are accompanied by a blank (“..”) population row, we can use this pattern to break these headers out into their own variable of “category.”

immigration_data <- immigration_data %>%
  mutate(category =
    ifelse(immigration_data$`International migrant stock at mid-year (both sexes);1990`=="..",`Major area, region, country or area of destination;NA`,
  NA))%>%
  fill(category)

Remove useless rows.

immigration_data <- immigration_data%>%
  filter(`International migrant stock at mid-year (both sexes);1990` != "..")

Remove preaggregated sub categories.

immigration_data <- filter(immigration_data, !is.na(`Type of data (a);NA`))

Transform

Now that we have appropriate column names, we can pivot longer to elongate the population values, then we can seperate the sex/year variable and pivot wider to bring the sex information to the top.

immigration_data_pivot <- immigration_data%>%
  pivot_longer(`International migrant stock at mid-year (both sexes);1990`:`International migrant stock at mid-year (female);2019`, names_to = "sex", values_to = "people")%>%
  separate(sex,into = c("sex","year"), sep = ";")%>%
  pivot_wider(names_from = sex , values_from = c(people))
names(immigration_data_pivot)
##  [1] "Sort\norder;NA"                                       
##  [2] "Major area, region, country or area of destination;NA"
##  [3] "Notes;NA"                                             
##  [4] "Code;NA"                                              
##  [5] "Type of data (a);NA"                                  
##  [6] "category"                                             
##  [7] "year"                                                 
##  [8] "International migrant stock at mid-year (both sexes)" 
##  [9] "International migrant stock at mid-year (male)"       
## [10] "International migrant stock at mid-year (female)"

Assign better naming conventions.

newnames <- c("id", "location", "notes","code", "type", "category", "year", "male_and_female", "male", "female")

Format data types, and remove unnessesary columns.

immigration_data_pivot<- immigration_data_pivot%>% 
  set_names(newnames)%>%
  select(-c(notes:type))%>%
  mutate("male_and_female" = as.numeric(gsub(",", "",male_and_female)),"male"= as.numeric(gsub(",", "",male)), "female" = as.numeric(gsub(",", "",female)))
head(immigration_data_pivot)
## # A tibble: 6 x 7
##   id    location category                    year  male_and_female   male female
##   <chr> <chr>    <chr>                       <chr>           <dbl>  <dbl>  <dbl>
## 1 24    Burundi  Sustainable Development Go… 1990           333110 163267 169843
## 2 24    Burundi  Sustainable Development Go… 1995           254853 124165 130688
## 3 24    Burundi  Sustainable Development Go… 2000           125628  61094  64534
## 4 24    Burundi  Sustainable Development Go… 2005           172874  84805  88069
## 5 24    Burundi  Sustainable Development Go… 2010           235259 115823 119436
## 6 24    Burundi  Sustainable Development Go… 2015           289810 142790 147020

Visualize

library(ggplot2)
immigration_data_pivot%>%
  group_by(category, year)%>%
  summarize(male = sum(male), female = sum(female))%>%
  ggplot()+
  geom_point( aes(x = year, y = male), color = "blue")+
  geom_point( aes(x = year, y = female), color = "pink")+
  facet_wrap(vars(category), scales = "free_y")+
  scale_x_discrete(labels = NULL) 
## `summarise()` regrouping output by 'category' (override with `.groups` argument)

library(ggplot2)
immigration_data_pivot%>%
  filter(category == "Sustainable Development Goal (SDG) regions")%>%
  ggplot()+
  geom_point( aes(x = year, y = male), color = "blue")+
  geom_point( aes(x = year, y = female), color = "pink")+
  facet_wrap(vars(location), scales = "free_y")+
  scale_x_discrete(labels = NULL) 

Cross reference

The trends in migrants is interesting, but we may be able to gain additional insights by combining the data set with information regarding food aide over time, by area.

food_aide <- read.csv("fao_out.csv", na.strings=c("","NA"), encoding = "UTF-8")
food_aide <- set_names(food_aide, c("location", "year", "food_amount"))
immigration_food <- immigration_data_pivot%>%
  mutate(year = as.numeric(year))%>%
  left_join(food_aide, by = NULL)
## Joining, by = c("location", "year")

Join

We can see that the data does not join completely, but we will move along with a smaller data set.

immigration_food <- filter(immigration_food , !is.na(food_amount))%>%
  mutate(food_amount = as.numeric(food_amount))
immigration_food
## # A tibble: 943 x 8
##    id    location category        year male_and_female   male female food_amount
##    <chr> <chr>    <chr>          <dbl>           <dbl>  <dbl>  <dbl>       <dbl>
##  1 26    Djibouti Sustainable D…  1990          122221  64242  57979        1499
##  2 26    Djibouti Sustainable D…  1995           99774  52476  47298        1673
##  3 26    Djibouti Sustainable D…  2000          100507  52920  47587        2023
##  4 26    Djibouti Sustainable D…  2005           92091  51315  40776        2540
##  5 26    Djibouti Sustainable D…  2010          101575  53295  48280        3178
##  6 26    Djibouti Sustainable D…  2015          112351  59081  53270        2160
##  7 28    Ethiopia Sustainable D…  2000          611384 322219 289165      162751
##  8 28    Ethiopia Sustainable D…  2005          514242 269725 244517      214541
##  9 28    Ethiopia Sustainable D…  2010          568748 298069 270679      265637
## 10 28    Ethiopia Sustainable D…  2015         1161642 591409 570233      194350
## # … with 933 more rows

Conclusion

It seems that by comparing the trends side by side, the is a correlation between the two variable. Especially in the developing countries category where the varibles follow a big jump in year 2000.

library(ggplot2)
immigration_food%>%
  group_by(category, year)%>%
  summarise(food_amount2 = sum(food_amount), male_and_female2 = sum(male_and_female))%>%
  ggplot()+
  geom_point(aes(x = year , y = food_amount2 ), color = "green")+ 
  facet_wrap(vars(category), scales = "free_y")+
  scale_x_discrete(labels = NULL) 
## `summarise()` regrouping output by 'category' (override with `.groups` argument)

library(ggplot2)
immigration_food%>%
  group_by(category, year)%>%
  summarise(food_amount2 = sum(food_amount), male_and_female2 = sum(male_and_female))%>%
  ggplot()+
  geom_point(aes(x = year , y = male_and_female2 ), color = "Purple")+ 
  facet_wrap(vars(category), scales = "free_y")+
  scale_x_discrete(labels = NULL) 
## `summarise()` regrouping output by 'category' (override with `.groups` argument)