library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(openxlsx)
library(ggplot2)

For my second dataset I will be using the International Migrant Stock dataset, discussed in Jai Jeffryes’s discussion post.

The data provided is a workbook with multiple tables. The discussion post asks to look at table one for analysis, so I will be using the openxlsx package so I can extract the first table.

rows <- c(16, 38:240, 242:299) # Extract necessary rows so that each row shows an individual country along with their data. 
columns <- c(2, 13:26) # Extract necessary columns to show countries and years for male and female stocks seperately.
Migrant_Table_1 <- readWorkbook(xlsxFile = "~/UN_MigrantStockTotal_2019.xlsx", sheet = 2,rows = rows, cols = columns, na.strings = "..")
head(Migrant_Table_1)

##                   X1    1990    1995    2000    2005    2010     2015
## 1 SUB-SAHARAN AFRICA 7048417 7514093 6939992 7621682 8388536 11232028
## 2     Eastern Africa 3069356 2563827 2483905 2535451 2413116  3386950
## 3            Burundi  163267  124165   61094   84805  115823   142790
## 4            Comoros    6717    6614    6511    6286    6060     6071
## 5           Djibouti   64242   52476   52920   51315   53295    59081
## 6            Eritrea    6228    6542    6856    7729    8603     8833
##       2019    1990    1995    2000    2005    2010     2015     2019
## 1 12386026 6237924 6761344 6211237 6600784 7467400 10105305 11187459
## 2  3965291 2889827 2413817 2363214 2227366 2344783  3329574  3942885
## 3   158163  169843  130688   64534   88069  119436   147020   162855
## 4     6047    7362    7325    7288    6923    6558     6484     6457
## 5    60654   57979   47298   47587   40776   48280    53270    54687
## 6     8922    5620    5858    6096    6585    7073     7108     7179

We can see from the table above that the each row is a different country showing the number of migrants by gender, and per year. The years range from 1990 to 2019 in intervals of five. The years appear twice due to showing the stocks for both male and female. So I will be spitting the data to show male and female in their own table.

Data Tidying and Transformations

# Split data for male
Migrant_male <- tbl_df(Migrant_Table_1[, 1:8])
Migrant_male <- cbind(Major_area = NA, Region = NA, Country = Migrant_male$X1, Gender = "male", Migrant_male[, 2:8], stringsAsFactors = FALSE)
Major_area_data <- c(1, 57, 85, 102, 123, 175, 203) # Extract al major areas.
Region_data <- c(2, 23, 33, 39, 58, 66, 86, 92, 103, 111, 124, 151, 160, 176, 185, 193, 204, 215, 229, 246, 256) # Extract all regions.
Migrant_male$Major_area[Major_area_data] <- Migrant_male$Country[Major_area_data]
Migrant_male$Region[Region_data] <- Migrant_male$Country[Region_data]
Migrant_male <- Migrant_male %>%
fill(1:2, .direction = c("down"))
Migrant_male <- Migrant_male[-c(Major_area_data, Region_data), ]
Migrant_male <- Migrant_male %>%
gather(Year, Number, 5:length(Migrant_male))
head(Migrant_male)

##           Major_area         Region  Country Gender Year Number
## 1 SUB-SAHARAN AFRICA Eastern Africa  Burundi   male 1990 163267
## 2 SUB-SAHARAN AFRICA Eastern Africa  Comoros   male 1990   6717
## 3 SUB-SAHARAN AFRICA Eastern Africa Djibouti   male 1990  64242
## 4 SUB-SAHARAN AFRICA Eastern Africa  Eritrea   male 1990   6228
## 5 SUB-SAHARAN AFRICA Eastern Africa Ethiopia   male 1990 607284
## 6 SUB-SAHARAN AFRICA Eastern Africa    Kenya   male 1990 161259

# Split data for female
Migrant_female <- tbl_df(Migrant_Table_1[, c(1, 9:15)])
Migrant_female <- cbind(Major_area = NA, Region = NA, Country = Migrant_female$X1, Gender = "female", Migrant_female[, 2:8], stringsAsFactors = FALSE)
Major_area_data <- c(1, 57, 85, 102, 123, 175, 203) # Extract all major areas
Region_data <- c(2, 23, 33, 39, 58, 66, 86, 92, 103, 111, 124, 151, 160, 176, 185, 193, 204, 215, 229, 246, 256) # Extract all regions.
Migrant_female$Major_area[Major_area_data] <- Migrant_female$Country[Major_area_data]
Migrant_female$Region[Region_data] <- Migrant_female$Country[Region_data]
Migrant_female <- Migrant_female %>%
fill(1:2, .direction = c("down"))
Migrant_female <- Migrant_female[-c(Major_area_data, Region_data), ]
Migrant_female <- Migrant_female %>%
gather(Year, Number, 5:length(Migrant_female))
head(Migrant_female)

##           Major_area         Region  Country Gender Year Number
## 1 SUB-SAHARAN AFRICA Eastern Africa  Burundi female 1990 169843
## 2 SUB-SAHARAN AFRICA Eastern Africa  Comoros female 1990   7362
## 3 SUB-SAHARAN AFRICA Eastern Africa Djibouti female 1990  57979
## 4 SUB-SAHARAN AFRICA Eastern Africa  Eritrea female 1990   5620
## 5 SUB-SAHARAN AFRICA Eastern Africa Ethiopia female 1990 548106
## 6 SUB-SAHARAN AFRICA Eastern Africa    Kenya female 1990 136830

Data Analysis

The analysis to be done for this datset is to compare the migrant stocks between genders over the year and by the regions around the world.

# Combine both tables
Migrant_gender <- tbl_df(rbind(Migrant_male, Migrant_female)) %>%
arrange(Major_area, Region, Country, Year, Gender)
head(Migrant_gender)

## # A tibble: 6 x 6
##   Major_area                Region       Country    Gender Year   Number
##   <chr>                     <chr>        <chr>      <chr>  <chr>   <dbl>
## 1 CENTRAL AND SOUTHERN ASIA Central Asia Kazakhstan female 1990  1953036
## 2 CENTRAL AND SOUTHERN ASIA Central Asia Kazakhstan male   1990  1666164
## 3 CENTRAL AND SOUTHERN ASIA Central Asia Kazakhstan female 1995  1754513
## 4 CENTRAL AND SOUTHERN ASIA Central Asia Kazakhstan male   1995  1497455
## 5 CENTRAL AND SOUTHERN ASIA Central Asia Kazakhstan female 2000  1550614
## 6 CENTRAL AND SOUTHERN ASIA Central Asia Kazakhstan male   2000  1323580

Compare genders by Year

# Find ratio of male and female by year
Migrant_gender_year <- Migrant_gender %>%
filter(!is.na(Number)) %>%
group_by(Gender, Year) %>%
summarise(Number = sum(Number)) %>%
spread(Gender, Number) %>%
mutate(Total = male + female, male = male / Total, female = female / Total) %>%
gather(Gender, Ratio, male:female)
Migrant_gender_year$Gender <- factor(Migrant_gender_year$Gender)
Migrant_gender_year

## # A tibble: 14 x 4
##    Year      Total Gender Ratio
##    <chr>     <dbl> <fct>  <dbl>
##  1 1990  153102673 male   0.508
##  2 1995  161413723 male   0.506
##  3 2000  173683722 male   0.507
##  4 2005  191719591 male   0.511
##  5 2010  220892594 male   0.517
##  6 2015  248980640 male   0.518
##  7 2019  271765669 male   0.521
##  8 1990  153102673 female 0.492
##  9 1995  161413723 female 0.494
## 10 2000  173683722 female 0.493
## 11 2005  191719591 female 0.489
## 12 2010  220892594 female 0.483
## 13 2015  248980640 female 0.482
## 14 2019  271765669 female 0.479

# Graph depicting male and female ratio by year
ggplot(data = Migrant_gender_year) + geom_col(aes(x = Year, y = Ratio, fill = Gender), position = "dodge") + labs(title = "Male and Female Migrants by Year", x = "Year", y = "Ratio")

Based on the graph, there are more male migrants than female migrants. For both male and female the number of migrants remain fairly the same each year. The highest migrant for males seems to be in the year 2019 with 52.09% of the migrants being male and the highest migrants for females seems to be in the year 1995 with 49.36% of the migrants being female.

Compare genders by Area

# Find ratio of male and female by area
Migrant_gender_area <- Migrant_gender %>%
filter(!is.na(Number)) %>%
group_by(Gender, Major_area) %>%
summarise(Number = sum(Number)) %>%
spread(Gender, Number) %>%
mutate(Total = male + female, male = male / Total, female = female / Total) %>%
gather(Gender, Ratio, male:female)
Migrant_gender_area

## # A tibble: 14 x 4
##    Major_area                           Total Gender Ratio
##    <chr>                                <dbl> <chr>  <dbl>
##  1 CENTRAL AND SOUTHERN ASIA        145488946 male   0.507
##  2 EASTERN AND SOUTH-EASTERN ASIA    90547893 male   0.502
##  3 EUROPE AND NORTHERN AMERICA      763460886 male   0.486
##  4 LATIN AMERICA AND THE CARIBBEAN   57023152 male   0.500
##  5 NORTHERN AFRICA AND WESTERN ASIA 203330262 male   0.625
##  6 OCEANIA                           46005246 male   0.500
##  7 SUB-SAHARAN AFRICA               115702227 male   0.528
##  8 CENTRAL AND SOUTHERN ASIA        145488946 female 0.493
##  9 EASTERN AND SOUTH-EASTERN ASIA    90547893 female 0.498
## 10 EUROPE AND NORTHERN AMERICA      763460886 female 0.514
## 11 LATIN AMERICA AND THE CARIBBEAN   57023152 female 0.500
## 12 NORTHERN AFRICA AND WESTERN ASIA 203330262 female 0.375
## 13 OCEANIA                           46005246 female 0.500
## 14 SUB-SAHARAN AFRICA               115702227 female 0.472

# Graph depicting male and female ratio by area
ggplot(data = Migrant_gender_area) + geom_col(aes(x = Major_area, y = Ratio, fill = Gender), position = "dodge") + labs(title = "Male and Female Migrants by Area", x = "Major Area", y = "Ratio")

Based on the graph, majority of the major areas have more male migrants, with the exception being Europe and Northern America, Latin America and the Carribean, and Oceania having more female migrants than male migrants. The highest migrants for males seems to be in Northern Africa and Western Asia with 62.54% of the migrants being male and the highest migrants for females seems to be in Europe and Northern Asia with 51.40% of the migrants being female.

Conclusion

This dataset shows us the number of migrants based on their age, gender, and origin of living. A migrant is someone who moves from one place to another. We can tidy the data to show us the male and female migrants for each major area for the year 1990, 1995, 2000, 2005, 2010, 2015, ans 2019. We see that for each year there are more male migrants than female migrants, but the number of migrants overall almost stays the same. For each area, most of them have more male than female with the exception of a few major areas having more female migrants than male migrants.

Data 607 Project 2 Dataset 2