#Introduction
Below is an exporatory analysis of human migration information gathered by the United Nations. The goal here is look at the growth in migration by region and by sex. Hopefully we can identify regions where the migration has been most spuratic. The data is provided in a wide format, which may be visually appealing, but we will need to elongate the data and mutate as nessesary to run analysis.
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
immigration_data_raw <- read.csv("UN_MigrantStockTotal_2019.csv", na.strings=c("","NA"), encoding = "UTF-8")
Remove unnessesary headers.
immigration_data <- immigration_data_raw[-c(1:13),]
Relevant variable info is found in row 1 but we need to repeat in accross the relevant columns.
immigration_data[c(1), ] <- immigration_data[c(1), ] %>%
t()%>%
as.data.frame()%>%
rename(temp = "14")%>%
fill(colnames(.))%>%
t()
Combine the variable information in the first two row and apply them to the column headers.
colnames(immigration_data) <- paste(immigration_data[c(1), ],immigration_data[c(2), ], sep = ";", collapse = NULL, recycle0 = TRUE)
Remove the first two rows that contain duplicate header information.
immigration_data <- immigration_data[-c(1:2),]
The data includes categorical variable information that breaks up the data horizanotaly, describing the category of population that follows below. Since these “sub-headers” are accompanied by a blank (“..”) population row, we can use this pattern to break these headers out into their own variable of “category.”
immigration_data <- immigration_data %>%
mutate(category =
ifelse(immigration_data$`International migrant stock at mid-year (both sexes);1990`=="..",`Major area, region, country or area of destination;NA`,
NA))%>%
fill(category)
Remove useless rows.
immigration_data <- immigration_data%>%
filter(`International migrant stock at mid-year (both sexes);1990` != "..")
Remove preaggregated sub categories.
immigration_data <- filter(immigration_data, !is.na(`Type of data (a);NA`))
Now that we have appropriate column names, we can pivot longer to elongate the population values, then we can seperate the sex/year variable and pivot wider to bring the sex information to the top.
immigration_data_pivot <- immigration_data%>%
pivot_longer(`International migrant stock at mid-year (both sexes);1990`:`International migrant stock at mid-year (female);2019`, names_to = "sex", values_to = "people")%>%
separate(sex,into = c("sex","year"), sep = ";")%>%
pivot_wider(names_from = sex , values_from = c(people))
names(immigration_data_pivot)
## [1] "Sort\norder;NA"
## [2] "Major area, region, country or area of destination;NA"
## [3] "Notes;NA"
## [4] "Code;NA"
## [5] "Type of data (a);NA"
## [6] "category"
## [7] "year"
## [8] "International migrant stock at mid-year (both sexes)"
## [9] "International migrant stock at mid-year (male)"
## [10] "International migrant stock at mid-year (female)"
Assign better naming conventions.
newnames <- c("id", "location", "notes","code", "type", "category", "year", "male_and_female", "male", "female")
Format data types, and remove unnessesary columns.
immigration_data_pivot<- immigration_data_pivot%>%
set_names(newnames)%>%
select(-c(notes:type))%>%
mutate("male_and_female" = as.numeric(gsub(",", "",male_and_female)),"male"= as.numeric(gsub(",", "",male)), "female" = as.numeric(gsub(",", "",female)))
head(immigration_data_pivot)
## # A tibble: 6 x 7
## id location category year male_and_female male female
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 24 Burundi Sustainable Development Go… 1990 333110 163267 169843
## 2 24 Burundi Sustainable Development Go… 1995 254853 124165 130688
## 3 24 Burundi Sustainable Development Go… 2000 125628 61094 64534
## 4 24 Burundi Sustainable Development Go… 2005 172874 84805 88069
## 5 24 Burundi Sustainable Development Go… 2010 235259 115823 119436
## 6 24 Burundi Sustainable Development Go… 2015 289810 142790 147020
library(ggplot2)
immigration_data_pivot%>%
group_by(category, year)%>%
summarize(male = sum(male), female = sum(female))%>%
ggplot()+
geom_point( aes(x = year, y = male), color = "blue")+
geom_point( aes(x = year, y = female), color = "pink")+
facet_wrap(vars(category), scales = "free_y")+
scale_x_discrete(labels = NULL)
## `summarise()` regrouping output by 'category' (override with `.groups` argument)
library(ggplot2)
immigration_data_pivot%>%
filter(category == "Sustainable Development Goal (SDG) regions")%>%
ggplot()+
geom_point( aes(x = year, y = male), color = "blue")+
geom_point( aes(x = year, y = female), color = "pink")+
facet_wrap(vars(location), scales = "free_y")+
scale_x_discrete(labels = NULL)
The trends in migrants is interesting, but we may be able to gain additional insights by combining the data set with information regarding food aide over time, by area.
food_aide <- read.csv("fao_out.csv", na.strings=c("","NA"), encoding = "UTF-8")
food_aide <- set_names(food_aide, c("location", "year", "food_amount"))
immigration_food <- immigration_data_pivot%>%
mutate(year = as.numeric(year))%>%
left_join(food_aide, by = NULL)
## Joining, by = c("location", "year")
We can see that the data does not join completely, but we will move along with a smaller data set.
immigration_food <- filter(immigration_food , !is.na(food_amount))%>%
mutate(food_amount = as.numeric(food_amount))
immigration_food
## # A tibble: 943 x 8
## id location category year male_and_female male female food_amount
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 26 Djibouti Sustainable D… 1990 122221 64242 57979 1499
## 2 26 Djibouti Sustainable D… 1995 99774 52476 47298 1673
## 3 26 Djibouti Sustainable D… 2000 100507 52920 47587 2023
## 4 26 Djibouti Sustainable D… 2005 92091 51315 40776 2540
## 5 26 Djibouti Sustainable D… 2010 101575 53295 48280 3178
## 6 26 Djibouti Sustainable D… 2015 112351 59081 53270 2160
## 7 28 Ethiopia Sustainable D… 2000 611384 322219 289165 162751
## 8 28 Ethiopia Sustainable D… 2005 514242 269725 244517 214541
## 9 28 Ethiopia Sustainable D… 2010 568748 298069 270679 265637
## 10 28 Ethiopia Sustainable D… 2015 1161642 591409 570233 194350
## # … with 933 more rows
It seems that by comparing the trends side by side, the is a correlation between the two variable. Especially in the developing countries category where the varibles follow a big jump in year 2000.
library(ggplot2)
immigration_food%>%
group_by(category, year)%>%
summarise(food_amount2 = sum(food_amount), male_and_female2 = sum(male_and_female))%>%
ggplot()+
geom_point(aes(x = year , y = food_amount2 ), color = "green")+
facet_wrap(vars(category), scales = "free_y")+
scale_x_discrete(labels = NULL)
## `summarise()` regrouping output by 'category' (override with `.groups` argument)
library(ggplot2)
immigration_food%>%
group_by(category, year)%>%
summarise(food_amount2 = sum(food_amount), male_and_female2 = sum(male_and_female))%>%
ggplot()+
geom_point(aes(x = year , y = male_and_female2 ), color = "Purple")+
facet_wrap(vars(category), scales = "free_y")+
scale_x_discrete(labels = NULL)
## `summarise()` regrouping output by 'category' (override with `.groups` argument)