library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(RColorBrewer)
setwd("~/School/MC/DATA 110/Datasets")
data <- read_csv("humantrafficking.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `By using this data you agree to the Terms of Use: https://www.ctdatacollaborative.org/terms-use` = col_logical(),
## Datasource = col_character(),
## gender = col_character(),
## ageBroad = col_character(),
## majorityStatus = col_character(),
## citizenship = col_character(),
## typeOfExploitConcatenated = col_character(),
## CountryOfExploitation = col_character()
## )
## i Use `spec()` for the full column specifications.
## Warning: 61259 parsing failures.
## row col expected actual file
## 1608 typeOfLabourConcatenated a double Not specified 'humantrafficking.csv'
## 1609 typeOfLabourConcatenated a double Other;Not specified 'humantrafficking.csv'
## 1610 typeOfLabourConcatenated a double Not specified 'humantrafficking.csv'
## 1611 typeOfLabourConcatenated a double Not specified 'humantrafficking.csv'
## 1612 typeOfLabourConcatenated a double Not specified 'humantrafficking.csv'
## .... ........................ ........ ................... ......................
## See problems(...) for more details.
This is a global human trafficking dataset downloaded from https://www.ctdatacollaborative.org/.
The Counter-Trafficking Data Collaborative is a global data hub on human trafficking, collecting data from counter-trafficking organization around the world. The dataset I’m using include information of 48,801 victims of human trafficking. The dataset is regularly updated. As a result of different data collection methods by different organizations, missing values can be found throughout the dataset. Also, because of the nature of this crime, many victims are hidden and cannot be identified. This dataset only include detected and registered victims. Therefore, the dataset cannot be considered a random and representative sample of global human trafficking.
In terms of the different types of trafficking, sexual exploitation and forced labour are the most prominent. However, there are other numerous forms of trafficking including begging, forced marriage, baby selling, organ removal, forced military, etc. Some cases are mixed forms.
I choose this topic because I watched a few movies about human trafficking and I feel like this crime is difficult to fight and the victims of this crime suffered so much and only small percentage could escape. So, I wanted to learn more about this topic, become more self-aware, and also remind people around me. I hope nobody will ever be the victim of this crime.
# columns 5 to 63 contains -99 and therefore need to be converted
columns_to_convert <- c(5:63)
# set a function called replace_with_NA to be used in the next step
replace_with_NA <- function(x) {
ifelse(x <0, yes = NA, no = x) # use ifelse function to convert all negative values to NA
}
# use mutate_at to generate data_withNA
data_withNA <- mutate_at(data, columns_to_convert, replace_with_NA)
# Rename the ageBroad so the format is consistent and can be sorted in ascending order
data_withNA$ageBroad[data_withNA$ageBroad == "0--8"] <- "00-08"
data_withNA$ageBroad[data_withNA$ageBroad == "9--17"] <- "09-17"
data_withNA$ageBroad[data_withNA$ageBroad == "18--20"] <- "18-20"
data_withNA$ageBroad[data_withNA$ageBroad == "21--23"] <- "21-23"
data_withNA$ageBroad[data_withNA$ageBroad == "24--26"] <- "24-26"
data_withNA$ageBroad[data_withNA$ageBroad == "27--29"] <- "27-29"
data_withNA$ageBroad[data_withNA$ageBroad == "30--38"] <- "30-38"
data_withNA$ageBroad[data_withNA$ageBroad == "39--47"] <- "39-47"
# Replace 1s with No and 1s with Yes
data_withNA$isForcedLabour[data_withNA$isForcedLabour == "0"] <- "No"
data_withNA$isForcedLabour[data_withNA$isForcedLabour == "1"] <- "Yes"
data_withNA$isSexualExploit[data_withNA$isSexualExploit == "0"] <- "No"
data_withNA$isSexualExploit[data_withNA$isSexualExploit == "1"] <- "Yes"
data_withNA$isSexAndLabour[data_withNA$isSexAndLabour == "0"] <- "No"
data_withNA$isSexAndLabour[data_withNA$isSexAndLabour == "1"] <- "Yes"
data_withNA$isForcedMarriage[data_withNA$isForcedMarriage == "0"] <- "No"
data_withNA$isForcedMarriage[data_withNA$isForcedMarriage == "1"] <- "Yes"
data_withNA$isOrganRemoval[data_withNA$isOrganRemoval == "0"] <- "No"
data_withNA$isOrganRemoval[data_withNA$isOrganRemoval == "1"] <- "Yes"
data_withNA$isForcedMilitary[data_withNA$isForcedMilitary == "0"] <- "No"
data_withNA$isForcedMilitary[data_withNA$isForcedMilitary == "1"] <- "Yes"
data_withNA$isSlaveryAndPractices[data_withNA$isSlaveryAndPractices == "0"] <- "No"
data_withNA$isSlaveryAndPractices[data_withNA$isSlaveryAndPractices == "1"] <- "Yes"
data_withNA$isOtherExploit[data_withNA$isOtherExploit == "0"] <- "No"
data_withNA$isOtherExploit[data_withNA$isOtherExploit == "1"] <- "Yes"
# group the data by year and victim's gender, and then count the cases for each gender group. use ggplot to create a proportional bar chart to compare male and female victims
data_withNA %>%
group_by(yearOfRegistration, gender) %>%
summarise(cases = sum(!is.na(gender))) %>%
ggplot(aes(x = yearOfRegistration, y = cases, fill = gender)) +
geom_col(colour = "white", position = "fill") +
scale_y_continuous(labels = scales::percent) +
# set the palette
scale_fill_brewer(palette = "Pastel2") +
# add labels
labs(title = "Proportion of Female and Male Victims Over Time",
subtitle = "From 2002 to 2019",
x = "Year",
y = "Proportion") +
# use theme grey and change the size of the plot text to 12
theme_grey(base_size = 12)
## `summarise()` has grouped output by 'yearOfRegistration'. You can override using the `.groups` argument.
The majority of human trafficking victims are females, however, The proportion of male victims is increasing overtime, especially during 2010 and 2015. Human trafficking can happen to anyone and the ways of exploitation are various, including sexual coercion, forced labor, forced marriage, organ removal, etc.
# groupo the data by age range and gender, and then count the cases in each age range. use ggplot to create a proportion bar chart to compare the female and male victims in each age group.
data_withNA %>%
group_by(ageBroad, gender) %>%
summarise(cases = sum(!is.na(gender))) %>%
ggplot(aes(x = ageBroad, y = cases, fill = gender)) +
geom_col(colour = "white", position = "fill") +
scale_y_continuous(labels = scales::percent) +
# set the palette
scale_fill_brewer(palette = "Pastel1") +
# add labels
labs(title = "Proportion of Female and Male Victims in Each Age Group",
x = "Age Group",
y = "Proportion") +
# use theme dark and change the size of the plot text to 12
theme_dark(base_size = 12)
## `summarise()` has grouped output by 'ageBroad'. You can override using the `.groups` argument.
Majority of female victims are in lower age group. In age groups 18-20, 21-23, 24-26, more than 75% of victims are females.
Male victims have larger proportion in age groups 30+ and the lowest age group 0-8.
# group the data by year and gender and count the number of cases
data1 <- data_withNA %>%
group_by(yearOfRegistration, gender) %>%
summarise(numberOfCases = sum(!is.na(gender)))
## `summarise()` has grouped output by 'yearOfRegistration'. You can override using the `.groups` argument.
# set the color palette
cols1 <- brewer.pal(2, "Dark2")
## Warning in brewer.pal(2, "Dark2"): minimal value for n is 3, returning requested palette with 3 different levels
# create the stacked area chart over years grouped by gender
highchart() %>%
hc_add_series(data = data1,
type = "area",
hcaes(x = yearOfRegistration,
y = numberOfCases,
group = gender,
na.omit = TRUE)) %>%
hc_colors(cols1) %>%
hc_plotOptions(series = list(stacking = "normal",
marker = list(enabled = FALSE,
states = list(hover = list(enabled = FALSE))),
lineWidth = 0.5,
lineColor = "white")) %>%
hc_xAxis(title = list(text = "Year")) %>%
hc_yAxis(title = list(text = "Number of Cases")) %>%
hc_title(text = "Number of Registered Cases Over Years",
margin = 20,
align = "center") %>%
# customize the legend
hc_legend(align = "right", verticalAlign = "top", layout = "vertical") %>%
# customize the tooltip
hc_tooltip(shared = TRUE,
borderColor = "black")
The number of registered cases for both male and female are increasing over years. There’s a peak point in 2016 where female victims reached 12,003 and male victims reached 4,396.
The total number of detected trafficking victims are increasing over the years, but I couldn’t find a solid reason why 2016 is a peak point in this dataset. One possibility is the limitation of the data source and samples. It could be that a lot more data was collected in 2016. According to 2016 Global Report on Trafficking in Persons published by United Nations Office on Drugs and Crime, many countries have criminalized most forms of trafficking as set out in the UN Trafficking in Persons Protocol. The number of countries doing that increased from 33 in 2003 to 158 in 2016. This increase has helped to assist the victims and collect related data.
# group the data by year, gender, and age group, and then count the cases, filter out NA in age group
data2 <- data_withNA %>%
group_by(yearOfRegistration, gender, ageBroad) %>%
summarise(numberOfCases = sum(!is.na(gender))) %>%
filter(ageBroad != "NA")
## `summarise()` has grouped output by 'yearOfRegistration', 'gender'. You can override using the `.groups` argument.
# set the color palette
cols2 <- brewer.pal(9, "Paired")
# create the stacked area chart over years grouped by ageBroad
highchart() %>%
hc_add_series(data = data2,
type = "area",
hcaes(x = yearOfRegistration,
y = numberOfCases,
group = ageBroad,
na.omit = TRUE)) %>% #omit entries that are missing age data
hc_colors(cols2) %>%
hc_plotOptions(series = list(stacking = "normal",
marker = list(enabled = FALSE,
states = list(hover = list(enabled = FALSE))),
lineWidth = 0.5,
lineColor = "white")) %>%
hc_xAxis(title = list(text = "Year")) %>%
hc_yAxis(title = list(text = "Number of Cases")) %>%
hc_title(text = "Number of Registered Cases in Each Age Group",
margin = 20,
align = "center") %>%
# customize the legend
hc_legend(align = "right", verticalAlign = "top", layout = "vertical") %>%
# customize the tooltip
hc_tooltip(shared = TRUE,
borderColor = "black")
We can see the total number of victims in age group 09-17, 30-38 increased dramatically. Combining this plot with plot 2, we could speculate that the increasing in age group 09-17 comes from female victims because the percentage of female victims is more than 75%. We can also speculate that more male victims in age group 30-38 contribute to the increase because and the proportion of male victims is large in that group.
setwd("~/School/MC/DATA 110/Datasets")
countrycode <- read_csv("countrycode.csv") #read countrycode dataset
##
## -- Column specification --------------------------------------------------------
## cols(
## citizenship = col_character(),
## country = col_character(),
## region = col_character(),
## income_level = col_character()
## )
data_withCountryName <- data_withNA %>%
left_join(countrycode, by = "citizenship") #data_withNA left join with countrycode
data3 <- data_withCountryName %>%
select(yearOfRegistration, gender, ageBroad, majorityStatus, citizenship, country, region, income_level)
head(data3)
## # A tibble: 6 x 8
## yearOfRegistrati~ gender ageBroad majorityStatus citizenship country region
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2002 Female 18-20 Adult CO Colomb~ Latin Am~
## 2 2002 Female 18-20 Adult CO Colomb~ Latin Am~
## 3 2002 Female 18-20 Adult CO Colomb~ Latin Am~
## 4 2002 Female 18-20 Adult CO Colomb~ Latin Am~
## 5 2002 Female 18-20 Adult CO Colomb~ Latin Am~
## 6 2002 Female 18-20 Adult CO Colomb~ Latin Am~
## # ... with 1 more variable: income_level <chr>
data3 %>%
group_by(region) %>%
count(region) %>% # group by region and count number of cases in each region
ggplot(aes(x="", y = n, fill = region)) +
geom_bar(stat = "identity") +
coord_polar("y", start = 0) +
theme_void() +
scale_fill_brewer(palette = "Set2") +
labs(title = "Registered Cases in Each Region")
East Asia & Pacific and Europe & Central Asia have the most registered cases. These two regions constitute more than 50% of total cases in this dataset. Countries in these two regions are also destinations for trafficking victims from a wide range of origins.
subset(data3, is.na(region))
## # A tibble: 8,858 x 8
## yearOfRegistration gender ageBroad majorityStatus citizenship country region
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2015 Female 18-20 Adult 00 <NA> <NA>
## 2 2015 Female 18-20 Adult 00 <NA> <NA>
## 3 2015 Female 18-20 Adult 00 <NA> <NA>
## 4 2015 Female 18-20 Adult 00 <NA> <NA>
## 5 2015 Female 18-20 Adult 00 <NA> <NA>
## 6 2015 Female 18-20 Adult 00 <NA> <NA>
## 7 2015 Female 18-20 Adult 00 <NA> <NA>
## 8 2015 Female 18-20 Adult 00 <NA> <NA>
## 9 2015 Female 18-20 Adult 00 <NA> <NA>
## 10 2015 Female 18-20 Adult 00 <NA> <NA>
## # ... with 8,848 more rows, and 1 more variable: income_level <chr>
In this dataset, the citizenship of more than 8,800 victims remain unidentified. According to UNODC’s Global Report on Trafficking in Persons, traffickers target victims who are marginalized or in difficult situation. Some of them are refugees who are stateless due to arbitrary deprivation of their citizenship.
data3 %>%
filter(region != "NA") %>%
ggplot(mapping = aes(y = region, fill = gender)) +
geom_bar(position = "dodge") +
scale_fill_brewer(palette = "Pastel1") +
theme_minimal() +
labs(title = "Female vs. Male Victims in Each Region",
x = "Number of Cases",
y = "Region")
data3 %>%
filter(region != "NA") %>%
ggplot(mapping = aes(y = region, fill = majorityStatus)) +
geom_bar(position = "dodge") +
scale_fill_brewer(palette = "Pastel2") +
theme_minimal() +
labs(title = "Adult vs. Minor Victims in Each Region",
x = "Number of Cases",
y = "Region")
Most reported victims are female adults. The percentage of female victims in North America is more than any other region’s.
Sub-Saharan Africa and Latin America & Caribbean are the two regions that have more minor victims than adult victims. In Latin America & Caribbean, children are trafficked for the purpose of exploitative begging, forced criminal activity, and illegal adoption. Countries in Sub-Saharan Africa region report children trafficking for forced criminal activity, forced marriage, and other mixed forms of exploitation (Global Report on Trafficking in Persons 2020).
data3 %>%
group_by(country) %>%
count(country) %>%
arrange(desc(n))
## # A tibble: 46 x 2
## # Groups: country [46]
## country n
## <chr> <int>
## 1 Philippines 11365
## 2 <NA> 8858
## 3 Ukraine 7761
## 4 Moldova 5901
## 5 United States 3636
## 6 Cambodia 1979
## 7 Indonesia 1971
## 8 Belarus 1463
## 9 Myanmar 1250
## 10 Romania 655
## # ... with 36 more rows
There are eight countries that have total of more than 1000 cases.
data3 %>%
filter(country == "Philippines" | country == "Ukraine" | country == "Moldova" |
country == "United States" | country == "Cambodia" | country == "Indonesia" |
country == "Belarus" | country == "Myanmar" ) %>%
group_by(country, majorityStatus) %>%
count(majorityStatus) %>%
#filter(majorityStatus != "NA") %>%
ggplot(mapping = aes(x = country, y = n, fill = majorityStatus)) +
geom_col(colour = "white", position = "fill") +
#scale_fill_brewer(palette = "Pastel1") +
theme_minimal() +
labs(title = "Adult vs. Minor Victims in Each Country",
x = "Country",
y = "Proportion")
Moldova and United States have higher shares of children among detected victims.
data4 <- data3 %>%
group_by(yearOfRegistration, country) %>%
count(country) %>%
filter(country == "Philippines" | country == "Ukraine" | country == "Moldova" |
country == "United States" | country == "Cambodia" | country == "Indonesia" |
country == "Belarus" | country == "Myanmar" )
data4
## # A tibble: 71 x 3
## # Groups: yearOfRegistration, country [71]
## yearOfRegistration country n
## <dbl> <chr> <int>
## 1 2002 Moldova 597
## 2 2002 Ukraine 92
## 3 2003 Moldova 215
## 4 2003 Ukraine 13
## 5 2004 Belarus 54
## 6 2004 Moldova 169
## 7 2005 Belarus 263
## 8 2005 Indonesia 11
## 9 2005 Moldova 196
## 10 2005 Ukraine 119
## # ... with 61 more rows
highchart() %>%
hc_add_series(data = data4,
type = "line",
hcaes(x = yearOfRegistration,
y = n,
group = country)) %>%
hc_colors(cols2) %>%
hc_xAxis(title = list(text = "Year")) %>%
hc_yAxis(title = list(text = "Number of Cases")) %>%
hc_title(text = "Number of Registered Cases Over Years",
margin = 20,
align = "center") %>%
hc_legend(align = "right",
verticalAlign = "top") %>%
hc_tooltip(shared = TRUE,
borderColor = "black"
)
data_withCountryName %>%
filter(country == "Philippines" & yearOfRegistration == "2016") %>%
group_by(typeOfExploitConcatenated) %>%
count(typeOfExploitConcatenated)
## # A tibble: 5 x 2
## # Groups: typeOfExploitConcatenated [5]
## typeOfExploitConcatenated n
## <chr> <int>
## 1 Forced labour 2336
## 2 Other 6879
## 3 Sexual exploitation 367
## 4 Slavery and similar practices 359
## 5 <NA> 1321
Cambodia and Ukraine have a increasing trend in registered cases. The trend for other countries has up and down. The total cases were higher during year 2014 and 2016. In 2016, there are 11,262 victims that are registered as Filipinos.
library(treemap)
data5 <- data_withCountryName %>%
group_by(yearOfRegistration, typeOfExploitConcatenated) %>%
count(typeOfExploitConcatenated) %>%
filter(typeOfExploitConcatenated != "NA")
data5
## # A tibble: 39 x 3
## # Groups: yearOfRegistration, typeOfExploitConcatenated [39]
## yearOfRegistration typeOfExploitConcatenated n
## <dbl> <chr> <int>
## 1 2002 Sexual exploitation 867
## 2 2003 Sexual exploitation 357
## 3 2004 Sexual exploitation 160
## 4 2005 Forced labour 601
## 5 2005 Other 11
## 6 2005 Sexual exploitation 492
## 7 2006 Forced labour 369
## 8 2006 Sexual exploitation 318
## 9 2007 Forced labour 219
## 10 2007 Sexual exploitation 344
## # ... with 29 more rows
treemap(data5, index="typeOfExploitConcatenated", vSize="n", palette="RdYlBu")
Sexual exploitation and forced labour are two major forms of exploitation.
highchart() %>%
hc_add_series(data = data5,
type = "line",
hcaes(x = yearOfRegistration,
y = n,
group = typeOfExploitConcatenated)) %>%
hc_colors(cols1) %>%
hc_xAxis(title = list(text = "Year")) %>%
hc_yAxis(title = list(text = "Number of Cases")) %>%
hc_title(text = "Trends in Registered Cases for Each Form of Exploitation",
margin = 20,
align = "center") %>%
hc_legend(align = "right",
verticalAlign = "top") %>%
hc_tooltip(shared = TRUE,
borderColor = "black"
)
Since 2010, victims of forced labour and sexual exploitation are increasing.
The Counter Trafficking Data Collaborative https://www.ctdatacollaborative.org/
Global Report on Trafficking in Persons 2020 https://www.unodc.org/documents/data-and-analysis/tip/2021/GLOTiP_2020_15jan_web.pdf
Global Report on Trafficking in Persons 2016 https://www.unodc.org/documents/data-and-analysis/glotip/2016_Global_Report_on_Trafficking_in_Persons.pdf