Load packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(ggplot2)
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(RColorBrewer)

Import the dataset

setwd("~/School/MC/DATA 110/Datasets")
data <- read_csv("humantrafficking.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `By using this data you agree to the Terms of Use: https://www.ctdatacollaborative.org/terms-use` = col_logical(),
##   Datasource = col_character(),
##   gender = col_character(),
##   ageBroad = col_character(),
##   majorityStatus = col_character(),
##   citizenship = col_character(),
##   typeOfExploitConcatenated = col_character(),
##   CountryOfExploitation = col_character()
## )
## i Use `spec()` for the full column specifications.
## Warning: 61259 parsing failures.
##  row                      col expected              actual                   file
## 1608 typeOfLabourConcatenated a double Not specified       'humantrafficking.csv'
## 1609 typeOfLabourConcatenated a double Other;Not specified 'humantrafficking.csv'
## 1610 typeOfLabourConcatenated a double Not specified       'humantrafficking.csv'
## 1611 typeOfLabourConcatenated a double Not specified       'humantrafficking.csv'
## 1612 typeOfLabourConcatenated a double Not specified       'humantrafficking.csv'
## .... ........................ ........ ................... ......................
## See problems(...) for more details.

Introduction

This is a global human trafficking dataset downloaded from https://www.ctdatacollaborative.org/.

The Counter-Trafficking Data Collaborative is a global data hub on human trafficking, collecting data from counter-trafficking organization around the world. The dataset I’m using include information of 48,801 victims of human trafficking. The dataset is regularly updated. As a result of different data collection methods by different organizations, missing values can be found throughout the dataset. Also, because of the nature of this crime, many victims are hidden and cannot be identified. This dataset only include detected and registered victims. Therefore, the dataset cannot be considered a random and representative sample of global human trafficking.

In terms of the different types of trafficking, sexual exploitation and forced labour are the most prominent. However, there are other numerous forms of trafficking including begging, forced marriage, baby selling, organ removal, forced military, etc. Some cases are mixed forms.

I choose this topic because I watched a few movies about human trafficking and I feel like this crime is difficult to fight and the victims of this crime suffered so much and only small percentage could escape. So, I wanted to learn more about this topic, become more self-aware, and also remind people around me. I hope nobody will ever be the victim of this crime.

Clean the Data

Replaces the negative 99 with NA’s

# columns 5 to 63 contains -99 and therefore need to be converted
columns_to_convert <- c(5:63)  

# set a function called replace_with_NA to be used in the next step
replace_with_NA <- function(x) {
  ifelse(x <0, yes = NA, no = x)  # use ifelse function to convert all negative values to NA
}

# use mutate_at to generate data_withNA
data_withNA <- mutate_at(data, columns_to_convert, replace_with_NA)

Rename the ageBroad column

# Rename the ageBroad so the format is consistent and can be sorted in ascending order
data_withNA$ageBroad[data_withNA$ageBroad == "0--8"] <- "00-08" 
data_withNA$ageBroad[data_withNA$ageBroad == "9--17"] <- "09-17" 
data_withNA$ageBroad[data_withNA$ageBroad == "18--20"] <- "18-20" 
data_withNA$ageBroad[data_withNA$ageBroad == "21--23"] <- "21-23" 
data_withNA$ageBroad[data_withNA$ageBroad == "24--26"] <- "24-26"  
data_withNA$ageBroad[data_withNA$ageBroad == "27--29"] <- "27-29"
data_withNA$ageBroad[data_withNA$ageBroad == "30--38"] <- "30-38"
data_withNA$ageBroad[data_withNA$ageBroad == "39--47"] <- "39-47"

Change the 0s to No and 1s to Yes for the following columns:

# Replace 1s with No and 1s with Yes
data_withNA$isForcedLabour[data_withNA$isForcedLabour == "0"] <- "No"
data_withNA$isForcedLabour[data_withNA$isForcedLabour == "1"] <- "Yes"
data_withNA$isSexualExploit[data_withNA$isSexualExploit == "0"] <- "No"
data_withNA$isSexualExploit[data_withNA$isSexualExploit == "1"] <- "Yes"
data_withNA$isSexAndLabour[data_withNA$isSexAndLabour == "0"] <- "No"
data_withNA$isSexAndLabour[data_withNA$isSexAndLabour == "1"] <- "Yes"
data_withNA$isForcedMarriage[data_withNA$isForcedMarriage == "0"] <- "No"
data_withNA$isForcedMarriage[data_withNA$isForcedMarriage == "1"] <- "Yes"
data_withNA$isOrganRemoval[data_withNA$isOrganRemoval == "0"] <- "No"
data_withNA$isOrganRemoval[data_withNA$isOrganRemoval == "1"] <- "Yes"
data_withNA$isForcedMilitary[data_withNA$isForcedMilitary == "0"] <- "No"
data_withNA$isForcedMilitary[data_withNA$isForcedMilitary == "1"] <- "Yes"
data_withNA$isSlaveryAndPractices[data_withNA$isSlaveryAndPractices == "0"] <- "No"
data_withNA$isSlaveryAndPractices[data_withNA$isSlaveryAndPractices == "1"] <- "Yes"
data_withNA$isOtherExploit[data_withNA$isOtherExploit == "0"] <- "No"
data_withNA$isOtherExploit[data_withNA$isOtherExploit == "1"] <- "Yes"

Plot 1 Female vs. Male Victims Over Time

# group the data by year and victim's gender, and then count the cases for each gender group. use ggplot to create a proportional bar chart to compare male and female victims 
data_withNA %>%
  group_by(yearOfRegistration, gender) %>%
  summarise(cases = sum(!is.na(gender))) %>%
  ggplot(aes(x = yearOfRegistration, y = cases, fill = gender)) +
  geom_col(colour = "white", position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  # set the palette
  scale_fill_brewer(palette = "Pastel2") +
  # add labels
  labs(title = "Proportion of Female and Male Victims Over Time", 
       subtitle = "From 2002 to 2019", 
       x = "Year", 
       y = "Proportion") +
  # use theme grey and change the size of the plot text to 12
  theme_grey(base_size = 12)
## `summarise()` has grouped output by 'yearOfRegistration'. You can override using the `.groups` argument.

The majority of human trafficking victims are females, however, The proportion of male victims is increasing overtime, especially during 2010 and 2015. Human trafficking can happen to anyone and the ways of exploitation are various, including sexual coercion, forced labor, forced marriage, organ removal, etc.

Plot 2 Female vs Male Victims in Each Age Group

# groupo the data by age range and gender, and then count the cases in each age range. use ggplot to create a proportion bar chart to compare the female and male victims in each age group. 
data_withNA %>%
  group_by(ageBroad, gender) %>%
  summarise(cases = sum(!is.na(gender))) %>%
  ggplot(aes(x = ageBroad, y = cases, fill = gender)) +
  geom_col(colour = "white", position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  # set the palette
  scale_fill_brewer(palette = "Pastel1") +
  # add labels
  labs(title = "Proportion of Female and Male Victims in Each Age Group", 
       x = "Age Group", 
       y = "Proportion") +
  # use theme dark and change the size of the plot text to 12
  theme_dark(base_size = 12)
## `summarise()` has grouped output by 'ageBroad'. You can override using the `.groups` argument.

Majority of female victims are in lower age group. In age groups 18-20, 21-23, 24-26, more than 75% of victims are females.

Male victims have larger proportion in age groups 30+ and the lowest age group 0-8.

Plot 3 Area Chart of Cases Over Years

# group the data by year and gender and count the number of cases
data1 <- data_withNA %>%
  group_by(yearOfRegistration, gender) %>%
  summarise(numberOfCases = sum(!is.na(gender))) 
## `summarise()` has grouped output by 'yearOfRegistration'. You can override using the `.groups` argument.
# set the color palette
cols1 <- brewer.pal(2, "Dark2")
## Warning in brewer.pal(2, "Dark2"): minimal value for n is 3, returning requested palette with 3 different levels
# create the stacked area chart over years grouped by gender
highchart() %>%
  hc_add_series(data = data1, 
                type = "area", 
                hcaes(x = yearOfRegistration, 
                      y = numberOfCases, 
                      group = gender, 
                      na.omit = TRUE)) %>%
  hc_colors(cols1) %>%
  hc_plotOptions(series = list(stacking = "normal", 
                               marker = list(enabled = FALSE, 
                               states = list(hover = list(enabled = FALSE))),
                               lineWidth = 0.5, 
                               lineColor = "white")) %>%
  hc_xAxis(title = list(text = "Year")) %>%
  hc_yAxis(title = list(text = "Number of Cases")) %>%
  hc_title(text = "Number of Registered Cases Over Years", 
           margin = 20, 
           align = "center") %>%
  # customize the legend
  hc_legend(align = "right", verticalAlign = "top", layout = "vertical") %>%
  # customize the tooltip
  hc_tooltip(shared = TRUE, 
             borderColor = "black")

The number of registered cases for both male and female are increasing over years. There’s a peak point in 2016 where female victims reached 12,003 and male victims reached 4,396.

The total number of detected trafficking victims are increasing over the years, but I couldn’t find a solid reason why 2016 is a peak point in this dataset. One possibility is the limitation of the data source and samples. It could be that a lot more data was collected in 2016. According to 2016 Global Report on Trafficking in Persons published by United Nations Office on Drugs and Crime, many countries have criminalized most forms of trafficking as set out in the UN Trafficking in Persons Protocol. The number of countries doing that increased from 33 in 2003 to 158 in 2016. This increase has helped to assist the victims and collect related data.

Source: https://www.unodc.org/documents/data-and-analysis/glotip/2016_Global_Report_on_Trafficking_in_Persons.pdf

Plot 4 Area Chart - Number of Registered Cases in Each Age Group

# group the data by year, gender, and age group, and then count the cases, filter out NA in age group 
data2 <- data_withNA %>%
  group_by(yearOfRegistration, gender, ageBroad) %>%
  summarise(numberOfCases = sum(!is.na(gender))) %>%
  filter(ageBroad != "NA")
## `summarise()` has grouped output by 'yearOfRegistration', 'gender'. You can override using the `.groups` argument.
# set the color palette
cols2 <- brewer.pal(9, "Paired")

# create the stacked area chart over years grouped by ageBroad
highchart() %>%
  hc_add_series(data = data2, 
                type = "area", 
                hcaes(x = yearOfRegistration, 
                      y = numberOfCases, 
                      group = ageBroad, 
                      na.omit = TRUE)) %>%  #omit entries that are missing age data
  hc_colors(cols2) %>%
  hc_plotOptions(series = list(stacking = "normal", 
                               marker = list(enabled = FALSE, 
                               states = list(hover = list(enabled = FALSE))),
                               lineWidth = 0.5, 
                               lineColor = "white")) %>%
  hc_xAxis(title = list(text = "Year")) %>%
  hc_yAxis(title = list(text = "Number of Cases")) %>%
  hc_title(text = "Number of Registered Cases in Each Age Group", 
           margin = 20, 
           align = "center") %>%
  # customize the legend
  hc_legend(align = "right", verticalAlign = "top", layout = "vertical") %>%
  # customize the tooltip
  hc_tooltip(shared = TRUE, 
             borderColor = "black")

We can see the total number of victims in age group 09-17, 30-38 increased dramatically. Combining this plot with plot 2, we could speculate that the increasing in age group 09-17 comes from female victims because the percentage of female victims is more than 75%. We can also speculate that more male victims in age group 30-38 contribute to the increase because and the proportion of male victims is large in that group.

Part II. Final Project

Left join global trafficking dataset with country code dataset to bring full country names and region.

setwd("~/School/MC/DATA 110/Datasets")
countrycode <- read_csv("countrycode.csv")  #read countrycode dataset
## 
## -- Column specification --------------------------------------------------------
## cols(
##   citizenship = col_character(),
##   country = col_character(),
##   region = col_character(),
##   income_level = col_character()
## )
data_withCountryName <- data_withNA %>%
  left_join(countrycode, by = "citizenship")  #data_withNA left join with countrycode

What are regions with the most registered cases?

data3 <- data_withCountryName %>%
  select(yearOfRegistration, gender, ageBroad, majorityStatus, citizenship, country, region, income_level) 

head(data3)
## # A tibble: 6 x 8
##   yearOfRegistrati~ gender ageBroad majorityStatus citizenship country region   
##               <dbl> <chr>  <chr>    <chr>          <chr>       <chr>   <chr>    
## 1              2002 Female 18-20    Adult          CO          Colomb~ Latin Am~
## 2              2002 Female 18-20    Adult          CO          Colomb~ Latin Am~
## 3              2002 Female 18-20    Adult          CO          Colomb~ Latin Am~
## 4              2002 Female 18-20    Adult          CO          Colomb~ Latin Am~
## 5              2002 Female 18-20    Adult          CO          Colomb~ Latin Am~
## 6              2002 Female 18-20    Adult          CO          Colomb~ Latin Am~
## # ... with 1 more variable: income_level <chr>

Plot 5 Pie Chart - Case Distribution Among Regions

data3 %>%
  group_by(region) %>%
  count(region) %>%   # group by region and count number of cases in each region
  ggplot(aes(x="", y = n, fill = region)) +
  geom_bar(stat = "identity") +
  coord_polar("y", start = 0) +
  theme_void() +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Registered Cases in Each Region") 

East Asia & Pacific and Europe & Central Asia have the most registered cases. These two regions constitute more than 50% of total cases in this dataset. Countries in these two regions are also destinations for trafficking victims from a wide range of origins.

subset(data3, is.na(region)) 
## # A tibble: 8,858 x 8
##    yearOfRegistration gender ageBroad majorityStatus citizenship country region
##                 <dbl> <chr>  <chr>    <chr>          <chr>       <chr>   <chr> 
##  1               2015 Female 18-20    Adult          00          <NA>    <NA>  
##  2               2015 Female 18-20    Adult          00          <NA>    <NA>  
##  3               2015 Female 18-20    Adult          00          <NA>    <NA>  
##  4               2015 Female 18-20    Adult          00          <NA>    <NA>  
##  5               2015 Female 18-20    Adult          00          <NA>    <NA>  
##  6               2015 Female 18-20    Adult          00          <NA>    <NA>  
##  7               2015 Female 18-20    Adult          00          <NA>    <NA>  
##  8               2015 Female 18-20    Adult          00          <NA>    <NA>  
##  9               2015 Female 18-20    Adult          00          <NA>    <NA>  
## 10               2015 Female 18-20    Adult          00          <NA>    <NA>  
## # ... with 8,848 more rows, and 1 more variable: income_level <chr>

In this dataset, the citizenship of more than 8,800 victims remain unidentified. According to UNODC’s Global Report on Trafficking in Persons, traffickers target victims who are marginalized or in difficult situation. Some of them are refugees who are stateless due to arbitrary deprivation of their citizenship.

Plot 6 Bar Chart - Female and Male Victims in Each region

data3 %>%
  filter(region != "NA") %>%
  ggplot(mapping = aes(y = region, fill = gender)) +
  geom_bar(position = "dodge") +
  scale_fill_brewer(palette = "Pastel1") +
  theme_minimal() +
  labs(title = "Female vs. Male Victims in Each Region", 
       x = "Number of Cases", 
       y = "Region") 

Plot 7 Bar Chart - Adult and Minor Victims in Each Region

data3 %>%
  filter(region != "NA") %>%
  ggplot(mapping = aes(y = region, fill = majorityStatus)) +
  geom_bar(position = "dodge") +
  scale_fill_brewer(palette = "Pastel2") +
  theme_minimal() +
  labs(title = "Adult vs. Minor Victims in Each Region", 
       x = "Number of Cases", 
       y = "Region") 

Most reported victims are female adults. The percentage of female victims in North America is more than any other region’s.

Sub-Saharan Africa and Latin America & Caribbean are the two regions that have more minor victims than adult victims. In Latin America & Caribbean, children are trafficked for the purpose of exploitative begging, forced criminal activity, and illegal adoption. Countries in Sub-Saharan Africa region report children trafficking for forced criminal activity, forced marriage, and other mixed forms of exploitation (Global Report on Trafficking in Persons 2020).

Which countries have the most registered cases?

data3 %>%
  group_by(country) %>%
  count(country) %>%
  arrange(desc(n))
## # A tibble: 46 x 2
## # Groups:   country [46]
##    country           n
##    <chr>         <int>
##  1 Philippines   11365
##  2 <NA>           8858
##  3 Ukraine        7761
##  4 Moldova        5901
##  5 United States  3636
##  6 Cambodia       1979
##  7 Indonesia      1971
##  8 Belarus        1463
##  9 Myanmar        1250
## 10 Romania         655
## # ... with 36 more rows

There are eight countries that have total of more than 1000 cases.

Select eight countries with the highest case numbers and make a plot

Plot 8 Bar Chart - Adult Victims vs. Minor Victims in Each Country

data3 %>%
  filter(country == "Philippines" | country == "Ukraine" | country == "Moldova" | 
           country == "United States" | country == "Cambodia" | country == "Indonesia" |
           country == "Belarus" | country == "Myanmar" ) %>%
  group_by(country, majorityStatus) %>%
  count(majorityStatus) %>%
  #filter(majorityStatus != "NA") %>%
  ggplot(mapping = aes(x = country, y = n, fill = majorityStatus)) +
  geom_col(colour = "white", position = "fill") +
  #scale_fill_brewer(palette = "Pastel1") +
  theme_minimal() +
  labs(title = "Adult vs. Minor Victims in Each Country", 
       x = "Country", 
       y = "Proportion")  

Moldova and United States have higher shares of children among detected victims.

Number of Registered Cases for Top 8 Countries Over Years

data4 <- data3 %>%
  group_by(yearOfRegistration, country) %>%
  count(country) %>%
  filter(country == "Philippines" | country == "Ukraine" | country == "Moldova" | 
           country == "United States" | country == "Cambodia" | country == "Indonesia" |
           country == "Belarus" | country == "Myanmar" )
data4
## # A tibble: 71 x 3
## # Groups:   yearOfRegistration, country [71]
##    yearOfRegistration country       n
##                 <dbl> <chr>     <int>
##  1               2002 Moldova     597
##  2               2002 Ukraine      92
##  3               2003 Moldova     215
##  4               2003 Ukraine      13
##  5               2004 Belarus      54
##  6               2004 Moldova     169
##  7               2005 Belarus     263
##  8               2005 Indonesia    11
##  9               2005 Moldova     196
## 10               2005 Ukraine     119
## # ... with 61 more rows

Plot 9 Line Chart

highchart() %>%
  hc_add_series(data = data4, 
                type = "line", 
                hcaes(x = yearOfRegistration, 
                      y = n, 
                      group = country)) %>%
  hc_colors(cols2) %>%
  hc_xAxis(title = list(text = "Year")) %>%
  hc_yAxis(title = list(text = "Number of Cases")) %>%
  hc_title(text = "Number of Registered Cases Over Years", 
           margin = 20, 
           align = "center")  %>%
  hc_legend(align = "right", 
            verticalAlign = "top") %>%
  hc_tooltip(shared = TRUE,
             borderColor = "black"
             )
data_withCountryName %>%
  filter(country == "Philippines" & yearOfRegistration == "2016") %>%
  group_by(typeOfExploitConcatenated) %>%
  count(typeOfExploitConcatenated) 
## # A tibble: 5 x 2
## # Groups:   typeOfExploitConcatenated [5]
##   typeOfExploitConcatenated         n
##   <chr>                         <int>
## 1 Forced labour                  2336
## 2 Other                          6879
## 3 Sexual exploitation             367
## 4 Slavery and similar practices   359
## 5 <NA>                           1321

Cambodia and Ukraine have a increasing trend in registered cases. The trend for other countries has up and down. The total cases were higher during year 2014 and 2016. In 2016, there are 11,262 victims that are registered as Filipinos.

Plot 10 Treemap

data5 <- data_withCountryName %>%
  group_by(yearOfRegistration, typeOfExploitConcatenated) %>%
  count(typeOfExploitConcatenated) %>%
  filter(typeOfExploitConcatenated != "NA") 
data5
## # A tibble: 39 x 3
## # Groups:   yearOfRegistration, typeOfExploitConcatenated [39]
##    yearOfRegistration typeOfExploitConcatenated     n
##                 <dbl> <chr>                     <int>
##  1               2002 Sexual exploitation         867
##  2               2003 Sexual exploitation         357
##  3               2004 Sexual exploitation         160
##  4               2005 Forced labour               601
##  5               2005 Other                        11
##  6               2005 Sexual exploitation         492
##  7               2006 Forced labour               369
##  8               2006 Sexual exploitation         318
##  9               2007 Forced labour               219
## 10               2007 Sexual exploitation         344
## # ... with 29 more rows
treemap(data5, index="typeOfExploitConcatenated", vSize="n", palette="RdYlBu")

Sexual exploitation and forced labour are two major forms of exploitation.

Plot 11 Line Chart

highchart() %>%
  hc_add_series(data = data5, 
                type = "line", 
                hcaes(x = yearOfRegistration, 
                      y = n, 
                      group = typeOfExploitConcatenated)) %>%
  hc_colors(cols1) %>%
  hc_xAxis(title = list(text = "Year")) %>%
  hc_yAxis(title = list(text = "Number of Cases")) %>%
  hc_title(text = "Trends in Registered Cases for Each Form of Exploitation", 
           margin = 20, 
           align = "center")  %>%
  hc_legend(align = "right", 
            verticalAlign = "top") %>%
  hc_tooltip(shared = TRUE,
             borderColor = "black"
             )

Since 2010, victims of forced labour and sexual exploitation are increasing.

Plot 12 Bar Chart - Shares of Each Form of Exploitation 2010 - 2018

data_withCountryName$typeOfExploitConcatenated[data_withCountryName$typeOfExploitConcatenated == "Forced labour;Sexual exploitation;Combined sexual and labour exploitation"] <- "Forced labour+Sexual exploitation" 
# change to shorter name
data_withCountryName %>%
  select(yearOfRegistration, typeOfExploitConcatenated) %>%
  filter(yearOfRegistration == 2010:2018) %>%
  group_by(yearOfRegistration) %>%
  count(typeOfExploitConcatenated) %>%
  ggplot(mapping = aes(x = yearOfRegistration, y = n, fill = typeOfExploitConcatenated)) +
  geom_col(colour = "white", position = "fill") +
  labs(title = "Shares of Forms of Exploitation", 
       subtitle = "From 2010 to 2018", 
       x = "Year", 
       y = "Proportion", 
       fill = "Forms of Exploitation") +
  theme_minimal()
## Warning in yearOfRegistration == 2010:2018: longer object length is not a
## multiple of shorter object length

Many cases are missing exploitation type in early years. Shares of sexual exploitation are increasing since 2015. According to Global Report on Trafficking in Persons 2020, Overall, 50 per cent of detected victims were trafficked for sexual exploitation and 38 per cent for forced labour, while 6 per cent were subjected to forced criminal activity and more than one per cent to begging.

Reference

  1. The Counter Trafficking Data Collaborative https://www.ctdatacollaborative.org/

  2. Global Report on Trafficking in Persons 2020 https://www.unodc.org/documents/data-and-analysis/tip/2021/GLOTiP_2020_15jan_web.pdf

  3. Global Report on Trafficking in Persons 2016 https://www.unodc.org/documents/data-and-analysis/glotip/2016_Global_Report_on_Trafficking_in_Persons.pdf