The aim of this exercise is to analyze different metrics at country level. We will import two data sets from Tidy Tuesday namely CIA Factbook and Freedom Index Data. We analyze different metrics by countries and region and how they correlate with each other.
We will import the CIA Factbook data set and the Freedom Index data set from Tidy Tuesday. We do data cleaning such as standardizing country names in both data sets, merging them together, removing unwanted columns, filtering for null values and aggregating columns. After completing all data wrangling steps, a final data table (final_dt) is created, containing all necessary features for our analysis.
#install.packages("tidytuesdayR")
#install.packages("ggplot2")
#install.packages("data.table")
#install.packages("stringdist")
#install.packages('rnaturalearthdata')
#install.packages('rnaturalearth')
library(ggplot2)
library(tidytuesdayR)
library(data.table)
library(stringdist)
library(rnaturalearth)
library(rnaturalearthdata)
cia_factbook <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-10-22/cia_factbook.csv', show_col_types = FALSE)
dt <- data.table(cia_factbook)
freedom <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-02-22/freedom.csv', show_col_types = FALSE)
dt2 <- data.table(freedom)
print(colSums(is.na(dt)))
dt$maternal_mortality_rate <- NULL
dt <- dt[!is.na(dt$internet_users)]
dt <- dt[!is.na(dt$net_migration_rate)]
print(colSums(is.na(dt)))
# checking if Nulls exist
print(colSums(is.na(dt2)))
# Data Cleaning : Getting the latest data for each country and creating a new data table
dt3 <- dt2[, .SD[year == max(year)], by = country]
# Standardizing country names to be used in joining
# Find country names in cia factbook table but not freedom index table and vice versa
countries_in_dt_not_dt3 <- setdiff(unique(dt$country), unique(dt3$country))
countries_in_dt3_not_dt <- setdiff(unique(dt3$country), unique(dt$country))
# Print the result
print(countries_in_dt_not_dt3)
print(countries_in_dt3_not_dt)
# Standardizing country names in Freedom Index Table to match CIA Factbook, for proper joining
dt3[, country := fcase(
country == "Bahamas", "Bahamas, The",
country == "Bolivia (Plurinational State of)", "Bolivia",
country == "Brunei Darussalam", "Brunei",
country == "Democratic Republic of the Congo", "Congo, Democratic Republic of the",
country == "Congo", "Congo, Republic of the",
country == "Côte d’Ivoire", "Cote d'Ivoire",
country == "Czechia", "Czech Republic",
country == "Gambia", "Gambia, The",
country == "Iran (Islamic Republic of)", "Iran",
country == "Republic of Korea", "Korea, South",
country == "Lao People's Democratic Republic", "Laos",
country == "North Macedonia", "Macedonia",
country == "Micronesia (Federated States of)", "Micronesia, Federated States of",
country == "Republic of Moldova", "Moldova",
country == "Russian Federation", "Russia",
country == "Syrian Arab Republic", "Syria",
country == "United Republic of Tanzania", "Tanzania",
country == "United Kingdom of Great Britain and Northern Ireland", "United Kingdom",
country == "United States of America", "United States",
country == "Venezuela (Bolivarian Republic of)", "Venezuela",
country == "Viet Nam", "Vietnam",
default = country # Keep original if no match
)]
print(dt3)
# Merging Freedom Index table with CIA Factbook to bring extra variables and performing creation/removal of some features to create Final Dataset for Analysis
final_dt <- merge(dt, dt3, by = "country")
final_dt$year <- NULL
final_dt[, internet_user_percentage := (internet_users/population)*100]
final_dt$internet_users <- NULL
final_dt[, birth_rate:= (birth_rate/10)]
final_dt[, death_rate := (death_rate/10)]
final_dt[, infant_mortality_rate := (infant_mortality_rate/10)]
final_dt[, net_migration_rate := (net_migration_rate/10)]
final_dt$Region_Code <- NULL
colnames(final_dt)[colnames(final_dt) == "CL"] <- "civil_liberty_ranking"
colnames(final_dt)[colnames(final_dt) == "PR"] <- "political_rights_ranking"
colnames(final_dt)[colnames(final_dt) == "is_ldc"] <- "is_least_developed"
final_dt$civil_liberty_ranking <- as.factor(final_dt$civil_liberty_ranking)
final_dt$political_rights_ranking <- as.factor(final_dt$political_rights_ranking)
final_dt$is_least_developed <- as.factor(final_dt$is_least_developed)
final_dt[, population_density := (population/area)]
summary_table <- as.data.frame(summary(final_dt))
print(summary_table)
Country: Name of the country
Area: Total area of the country (in square kilometers).
birth_rate: Birth rate (number of live births per 100 people).
death_rate: Death rate (number of deaths per 100 people). infant_mortality_rate: Infant mortality rate (number of deaths of infants under one year old per 100 live births).
internet_users_percentage: Number of internet users by population.
life_exp_at_birth: Life expectancy at birth (in years).
net_migration_rate: Net migration rate (number of migrants per 100 people).
population: Total population of the country.
population_growth_rate: Population growth rate (multiplier).
population_density: Population per square kilometers
civil_liberties_rankings: Categorical division of Civil Liberties where 1 means highest
political_rights_rankings: Categorical division of Political Rights where 1 means highest
Status: Status (Free F, Not Free NF, Partially Free PF)
Region_Code: UN Region code
Region_Name: UN Region Name
is_least_developed: Is a least developed country or not (categorical 0/1)
library(data.table)
library(ggplot2)
# Identify numeric columns and store in a list
numeric_vars <- names(final_dt)[sapply(final_dt, is.numeric)]
# Convert to long format using the selected numeric columns
dt_long <- melt(final_dt, measure.vars = numeric_vars,
variable.name = "Variable", value.name = "Value")
# Create distribution plots for numeric variables
ggplot(dt_long, aes(x = Value)) +
geom_histogram(bins = 30, fill = "green", alpha = 0.7) +
facet_wrap(~ Variable, scales = "free") +
labs(title = "Distribution of Numeric Variables") +
theme_light()
# Identify categorical columns
cat_vars <- names(final_dt)[sapply(final_dt, is.character) | sapply(final_dt, is.factor)]
# Loop through categorical variables and create bar plots
for (cat_var in cat_vars) {
if (cat_var == "country") next # Skip the loop iteration if the variable is "country"
p <- ggplot(final_dt, aes(x = get(cat_var))) +
geom_bar(fill = "orange", alpha = 0.8) +
labs(title = paste("Distribution of", cat_var), x = cat_var, y = "Count") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme_light()
print(p)
}
# Calculate the average net migration rate for each status category
ggplot(final_dt, aes(x = Status, y = net_migration_rate)) +
stat_summary(fun = "mean", # Calculate mean for each status category
geom = "bar", # Use bars to display the average
fill = "lightblue", # Bar color
alpha = 0.7) +
labs(title = "Average Net Migration Rate by Status", x = "Status",
y = "Average Net Migration Rate") +
scale_x_discrete(labels = c("Free", "Not Free", "Partially Free")) + # Map old categories to new labels
theme_light() +
theme(axis.text.x = element_text(vjust = 1.5, hjust = 0.5)) # Move x-axis labels closer to x-axis (y = 0)
Countries which are Partially Free have positive migration rate as people who are looking to move abroad are able to do so. Free countries have higher influx of migrants instead of out flux hence the average migration rate is negative. Countries that are not free have highly negative net migration rate as people in these countries probably face restrictions and are unable to migrate.
ggplot(final_dt, aes(x = population_growth_rate, y = population_density, color = is_least_developed)) +
geom_point(size = 3) +
labs(title = "Population Density vs Population Growth Rate",
x = "Population Growth Rate",
y = "Population Density") +
scale_color_manual(values = c("blue", "orange")) +
coord_cartesian(ylim = c(0, 2500)) + # Zoom in without removing data points
theme_light()
Countries which are least developed have higher population growth rate on average as compared to developed countries. However, population growth rate is not significantly related to population density.
# Plot for Top 10 Birth Rates
ggplot(head(final_dt[order(-final_dt$birth_rate), ], 10),
aes(x = reorder(country, -birth_rate),
y = birth_rate, fill = "Birth Rate")) + geom_bar(stat = "identity") +
labs(title = "Top 10 Countries by Birth Rate",
x = "Country",
y = "Birth Rate per 100 People") +
scale_fill_manual(values = c("Birth Rate" = "blue")) + theme_light() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill = "none")
# Plot for Top 10 Death Rates
ggplot(head(final_dt[order(-final_dt$death_rate), ], 10),
aes(x = reorder(country, -death_rate),
y = death_rate,fill = "Death Rate")) + geom_bar(stat = "identity") +
labs(title = "Top 10 Countries by Death Rate",
x = "Country",
y = "Death Rate per 100 People") +
scale_fill_manual(values = c("Death Rate" = "red")) + theme_light() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
guides(fill = "none")
African nations dominate the birth rate per 100 people. The top 3 countries include Niger, Mali and Uganda. In terms of deaths per 100 people, top 3 countries include South Africa, Ukraine and Lesotho.
ggplot(final_dt, aes(x = population_density, fill = Region_Name)) +
geom_density(alpha = 0.6) +
labs(title = "Density Plot of Average Population by Region",
x = "Population Density",
y = "Density", fill = "Region") +
theme_light() +
coord_cartesian(xlim = c(0, 800)) +
scale_fill_brewer(palette = "Set1") + # Change palette to "Set1", "Paired", etc.
theme(legend.position = "right")
Americas and Asia have higher population density countries as compared to Oceania, Africa and Europe.
ggplot(final_dt, aes(x = as.factor(is_least_developed),
y = life_exp_at_birth,
col = as.factor(is_least_developed))) +
geom_jitter(alpha = 0.7)+
geom_boxplot(alpha = 0.6, outlier.color = "black", outlier.shape = 16) +
labs(title = "Life Expectancy Distribution by Development Status",
x = "",
y = "Avg Life Expectancy (years)",
fill = "Least Developed") +
scale_x_discrete(labels = c("0" = "Developed", "1" = "Least Developed")) +
coord_cartesian(ylim = c(25, 100)) + # Zoom in without removing data points
theme_light() +
theme(legend.position = "none")
Developed countries have a median avg life expectancy of around 75 years as compared to 62 years for least developed countries. Although there are some outlying countries but developed countries have visibly higher avg life expectancy compared to least developed countries.
ggplot(final_dt, aes(x = as.factor(Status),
y = internet_user_percentage,
fill = as.factor(Status))) +
geom_violin(alpha = 0.6) + # Corrected function name
labs(title = "Internet Usage Distribution by Freedom Status",
x = "",
y = "Internet Usage Percentage",
fill = "Freedom Status") +
scale_x_discrete(labels = c("NF" = "Not Free", "PF" = "Partially Free", "F" = "Free")) +
coord_cartesian(ylim = c(0, 100)) + # Zoom in without removing data points
theme_light() +
theme(legend.position = "none")
In terms of freedom status, countries which are deemed free have higher internet usage percentage as shown by the violin plot. Whereas countries which are not free or partially free behave similar to each other with significantly lower internet usage percentage than free countries.
ggplot(final_dt, aes(x = life_exp_at_birth, y = infant_mortality_rate, color = Region_Name)) +
geom_point(size = 3) +
labs(title = "Life Expectancy at Birth vs Infant Mortality Rate",
x = "Life Expectancy (Years)",
y = "Infant Mortality Rate") +
#scale_color_manual(values = c("blue", "red")) +
coord_cartesian(ylim = c(0, 20)) + # Zoom in without removing data points
theme_light()
The graph shows an inverse relationship between Infant Mortality Rates and Life Expectancy at birth. Countries which have higher life expectancy at birth tend to have lower infant mortality rates on average.
Creating an additional table to be used in tandem with natural earth library for map creation
# Load world map data using the natural earth library
world_map <- ne_countries(scale = "medium", returnclass = "sf")
#creating a deep copy of our data set so original data is not changed
final_dt_map <- copy(final_dt)
#checking mismatch of country names present in our data set but not natural earth library data
countries_in_dt_not_worldmap <- setdiff(unique(final_dt$country), unique(world_map$name_long))
print(countries_in_dt_not_worldmap)
#Mapping country names in our dataset to standardize for country names present in natural earth library world map
final_dt_map[, country := fcase(
country == "Bahamas, The", "Bahamas",
country == "Brunei","Brunei Darussalam",
country == "Congo, Democratic Republic of the","Democratic Republic of the Congo",
country == "Congo, Republic of the","Republic of the Congo",
country == "Cote d'Ivoire","Côte d'Ivoire",
country == "Cabo Verde", "Republic of Cabo Verde",
country == "Gambia, The","The Gambia",
country == "Korea, South","Republic of Korea",
country == "Laos","Lao PDR",
country == "Macedonia","North Macedonia",
country == "Micronesia, Federated States of","Federated States of Micronesia",
country == "Republic of Moldova", "Moldova",
country == "Russia","Russian Federation",
country == "Sao Tome and Principe", "São Tomé and Principe",
default = country # Keep original if no match
)]
print(final_dt_map)
# joining data from world map to our dataset
final_dt_map2 <- merge(final_dt_map, world_map, by.x = "country", by.y = "name_long")
Plotting Civil Liberties and Political Rights on World Map - Using Color Brewer Library (colorbrewer2.org) for Palettes
# Load required library
library(RColorBrewer)
# Political Rights Ranking Map with ColorBrewer
ggplot(final_dt_map2) +
geom_sf(aes(fill = as.factor(political_rights_ranking), geometry = geometry),
color = "white",
size = 0.2) +
scale_fill_brewer(
palette = "PRGn", # Using the BrBG palette from ColorBrewer
direction = - 1,
name = "Political Rights"
) +
labs(title = "Political Rights Ranking by Country") +
theme_light()
# Civil Liberty Ranking Map with ColorBrewer
ggplot(final_dt_map2) +
geom_sf(aes(fill = as.factor(civil_liberty_ranking), geometry = geometry),
color = "white",
size = 0.2) +
scale_fill_brewer(
palette = "PRGn", # Using the same BrBG palette
direction = -1,
name = "Civil Liberties"
) +
labs(title = "Civil Liberty Ranking by Country") +
theme_light()
Trends for both Political Rights and Civil Liberty are similar for countries around the globe. Countries in Americas, Western Europe and Oceania are better (higher rankings) in terms of political rights and civil liberties as compared to Africa, Asia and Eastern Europe.
This project was done for the course Coding 3: Introduction to R under the guidance of Professor Gergely Daroczi. We extend our gratitude to him for his guidance. The project satisified the requirements present under Final Project at: https://github.com/daroczig/CEU-R-intro?tab=readme-ov-file#final-project. Some additional work on top of the requirements mentioned, is presented as well.