library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(openxlsx)
library(ggplot2)
The data provided is a workbook with multiple tables. The discussion post asks to look at table one for analysis, so I will be using the openxlsx package so I can extract the first table.
rows <- c(16, 38:240, 242:299) # Extract necessary rows so that each row shows an individual country along with their data.
columns <- c(2, 13:26) # Extract necessary columns to show countries and years for male and female stocks seperately.
Migrant_Table_1 <- readWorkbook(xlsxFile = "~/UN_MigrantStockTotal_2019.xlsx", sheet = 2,rows = rows, cols = columns, na.strings = "..")
head(Migrant_Table_1)
## X1 1990 1995 2000 2005 2010 2015
## 1 SUB-SAHARAN AFRICA 7048417 7514093 6939992 7621682 8388536 11232028
## 2 Eastern Africa 3069356 2563827 2483905 2535451 2413116 3386950
## 3 Burundi 163267 124165 61094 84805 115823 142790
## 4 Comoros 6717 6614 6511 6286 6060 6071
## 5 Djibouti 64242 52476 52920 51315 53295 59081
## 6 Eritrea 6228 6542 6856 7729 8603 8833
## 2019 1990 1995 2000 2005 2010 2015 2019
## 1 12386026 6237924 6761344 6211237 6600784 7467400 10105305 11187459
## 2 3965291 2889827 2413817 2363214 2227366 2344783 3329574 3942885
## 3 158163 169843 130688 64534 88069 119436 147020 162855
## 4 6047 7362 7325 7288 6923 6558 6484 6457
## 5 60654 57979 47298 47587 40776 48280 53270 54687
## 6 8922 5620 5858 6096 6585 7073 7108 7179
We can see from the table above that the each row is a different country showing the number of migrants by gender, and per year. The years range from 1990 to 2019 in intervals of five. The years appear twice due to showing the stocks for both male and female. So I will be spitting the data to show male and female in their own table.
# Split data for male
Migrant_male <- tbl_df(Migrant_Table_1[, 1:8])
Migrant_male <- cbind(Major_area = NA, Region = NA, Country = Migrant_male$X1, Gender = "male", Migrant_male[, 2:8], stringsAsFactors = FALSE)
Major_area_data <- c(1, 57, 85, 102, 123, 175, 203) # Extract al major areas.
Region_data <- c(2, 23, 33, 39, 58, 66, 86, 92, 103, 111, 124, 151, 160, 176, 185, 193, 204, 215, 229, 246, 256) # Extract all regions.
Migrant_male$Major_area[Major_area_data] <- Migrant_male$Country[Major_area_data]
Migrant_male$Region[Region_data] <- Migrant_male$Country[Region_data]
Migrant_male <- Migrant_male %>%
fill(1:2, .direction = c("down"))
Migrant_male <- Migrant_male[-c(Major_area_data, Region_data), ]
Migrant_male <- Migrant_male %>%
gather(Year, Number, 5:length(Migrant_male))
head(Migrant_male)
## Major_area Region Country Gender Year Number
## 1 SUB-SAHARAN AFRICA Eastern Africa Burundi male 1990 163267
## 2 SUB-SAHARAN AFRICA Eastern Africa Comoros male 1990 6717
## 3 SUB-SAHARAN AFRICA Eastern Africa Djibouti male 1990 64242
## 4 SUB-SAHARAN AFRICA Eastern Africa Eritrea male 1990 6228
## 5 SUB-SAHARAN AFRICA Eastern Africa Ethiopia male 1990 607284
## 6 SUB-SAHARAN AFRICA Eastern Africa Kenya male 1990 161259
# Split data for female
Migrant_female <- tbl_df(Migrant_Table_1[, c(1, 9:15)])
Migrant_female <- cbind(Major_area = NA, Region = NA, Country = Migrant_female$X1, Gender = "female", Migrant_female[, 2:8], stringsAsFactors = FALSE)
Major_area_data <- c(1, 57, 85, 102, 123, 175, 203) # Extract all major areas
Region_data <- c(2, 23, 33, 39, 58, 66, 86, 92, 103, 111, 124, 151, 160, 176, 185, 193, 204, 215, 229, 246, 256) # Extract all regions.
Migrant_female$Major_area[Major_area_data] <- Migrant_female$Country[Major_area_data]
Migrant_female$Region[Region_data] <- Migrant_female$Country[Region_data]
Migrant_female <- Migrant_female %>%
fill(1:2, .direction = c("down"))
Migrant_female <- Migrant_female[-c(Major_area_data, Region_data), ]
Migrant_female <- Migrant_female %>%
gather(Year, Number, 5:length(Migrant_female))
head(Migrant_female)
## Major_area Region Country Gender Year Number
## 1 SUB-SAHARAN AFRICA Eastern Africa Burundi female 1990 169843
## 2 SUB-SAHARAN AFRICA Eastern Africa Comoros female 1990 7362
## 3 SUB-SAHARAN AFRICA Eastern Africa Djibouti female 1990 57979
## 4 SUB-SAHARAN AFRICA Eastern Africa Eritrea female 1990 5620
## 5 SUB-SAHARAN AFRICA Eastern Africa Ethiopia female 1990 548106
## 6 SUB-SAHARAN AFRICA Eastern Africa Kenya female 1990 136830
The analysis to be done for this datset is to compare the migrant stocks between genders over the year and by the regions around the world.
# Combine both tables
Migrant_gender <- tbl_df(rbind(Migrant_male, Migrant_female)) %>%
arrange(Major_area, Region, Country, Year, Gender)
head(Migrant_gender)
## # A tibble: 6 x 6
## Major_area Region Country Gender Year Number
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 CENTRAL AND SOUTHERN ASIA Central Asia Kazakhstan female 1990 1953036
## 2 CENTRAL AND SOUTHERN ASIA Central Asia Kazakhstan male 1990 1666164
## 3 CENTRAL AND SOUTHERN ASIA Central Asia Kazakhstan female 1995 1754513
## 4 CENTRAL AND SOUTHERN ASIA Central Asia Kazakhstan male 1995 1497455
## 5 CENTRAL AND SOUTHERN ASIA Central Asia Kazakhstan female 2000 1550614
## 6 CENTRAL AND SOUTHERN ASIA Central Asia Kazakhstan male 2000 1323580
# Find ratio of male and female by year
Migrant_gender_year <- Migrant_gender %>%
filter(!is.na(Number)) %>%
group_by(Gender, Year) %>%
summarise(Number = sum(Number)) %>%
spread(Gender, Number) %>%
mutate(Total = male + female, male = male / Total, female = female / Total) %>%
gather(Gender, Ratio, male:female)
Migrant_gender_year$Gender <- factor(Migrant_gender_year$Gender)
Migrant_gender_year
## # A tibble: 14 x 4
## Year Total Gender Ratio
## <chr> <dbl> <fct> <dbl>
## 1 1990 153102673 male 0.508
## 2 1995 161413723 male 0.506
## 3 2000 173683722 male 0.507
## 4 2005 191719591 male 0.511
## 5 2010 220892594 male 0.517
## 6 2015 248980640 male 0.518
## 7 2019 271765669 male 0.521
## 8 1990 153102673 female 0.492
## 9 1995 161413723 female 0.494
## 10 2000 173683722 female 0.493
## 11 2005 191719591 female 0.489
## 12 2010 220892594 female 0.483
## 13 2015 248980640 female 0.482
## 14 2019 271765669 female 0.479
# Graph depicting male and female ratio by year
ggplot(data = Migrant_gender_year) + geom_col(aes(x = Year, y = Ratio, fill = Gender), position = "dodge") + labs(title = "Male and Female Migrants by Year", x = "Year", y = "Ratio")
Based on the graph, there are more male migrants than female migrants. For both male and female the number of migrants remain fairly the same each year. The highest migrant for males seems to be in the year 2019 with 52.09% of the migrants being male and the highest migrants for females seems to be in the year 1995 with 49.36% of the migrants being female.
# Find ratio of male and female by area
Migrant_gender_area <- Migrant_gender %>%
filter(!is.na(Number)) %>%
group_by(Gender, Major_area) %>%
summarise(Number = sum(Number)) %>%
spread(Gender, Number) %>%
mutate(Total = male + female, male = male / Total, female = female / Total) %>%
gather(Gender, Ratio, male:female)
Migrant_gender_area
## # A tibble: 14 x 4
## Major_area Total Gender Ratio
## <chr> <dbl> <chr> <dbl>
## 1 CENTRAL AND SOUTHERN ASIA 145488946 male 0.507
## 2 EASTERN AND SOUTH-EASTERN ASIA 90547893 male 0.502
## 3 EUROPE AND NORTHERN AMERICA 763460886 male 0.486
## 4 LATIN AMERICA AND THE CARIBBEAN 57023152 male 0.500
## 5 NORTHERN AFRICA AND WESTERN ASIA 203330262 male 0.625
## 6 OCEANIA 46005246 male 0.500
## 7 SUB-SAHARAN AFRICA 115702227 male 0.528
## 8 CENTRAL AND SOUTHERN ASIA 145488946 female 0.493
## 9 EASTERN AND SOUTH-EASTERN ASIA 90547893 female 0.498
## 10 EUROPE AND NORTHERN AMERICA 763460886 female 0.514
## 11 LATIN AMERICA AND THE CARIBBEAN 57023152 female 0.500
## 12 NORTHERN AFRICA AND WESTERN ASIA 203330262 female 0.375
## 13 OCEANIA 46005246 female 0.500
## 14 SUB-SAHARAN AFRICA 115702227 female 0.472
# Graph depicting male and female ratio by area
ggplot(data = Migrant_gender_area) + geom_col(aes(x = Major_area, y = Ratio, fill = Gender), position = "dodge") + labs(title = "Male and Female Migrants by Area", x = "Major Area", y = "Ratio")
Based on the graph, majority of the major areas have more male migrants, with the exception being Europe and Northern America, Latin America and the Carribean, and Oceania having more female migrants than male migrants. The highest migrants for males seems to be in Northern Africa and Western Asia with 62.54% of the migrants being male and the highest migrants for females seems to be in Europe and Northern Asia with 51.40% of the migrants being female.
This dataset shows us the number of migrants based on their age, gender, and origin of living. A migrant is someone who moves from one place to another. We can tidy the data to show us the male and female migrants for each major area for the year 1990, 1995, 2000, 2005, 2010, 2015, ans 2019. We see that for each year there are more male migrants than female migrants, but the number of migrants overall almost stays the same. For each area, most of them have more male than female with the exception of a few major areas having more female migrants than male migrants.