library(tidyr)
library(dplyr)
library(ggplot2)
library(stringr)
library(ggrepel)
# Load the World Bank net-migration extract. `skip = 3` discards the first
# three lines of the file, so the fourth line supplies the column names.
df1 <- read.csv(
  "/Users/williamberritt/Downloads/API_SM/API_SM.POP.NETM_DS2_en_csv_v2_105.csv",
  header = TRUE,
  skip = 3
)
# Discard the column named `X` (presumably an empty trailing column in the
# export -- TODO confirm against the raw file).
df1 <- subset(df1, select = -c(X))
# Wide -> long: the year columns X1960 ... X2022 collapse into a `year` column
# (holding the old column name, e.g. "X1960") and a `value` column (holding
# the cell content).
df1 <- pivot_longer(
  df1,
  cols = X1960:X2022,
  names_to = "year",
  values_to = "value"
)
# Preview the first rows of the long table
head(df1)
## # A tibble: 6 × 6
## Country.Name Country.Code Indicator.Name Indicator.Code year value
## <chr> <chr> <chr> <chr> <chr> <int>
## 1 Aruba ABW Net migration SM.POP.NETM X1960 0
## 2 Aruba ABW Net migration SM.POP.NETM X1961 -569
## 3 Aruba ABW Net migration SM.POP.NETM X1962 -609
## 4 Aruba ABW Net migration SM.POP.NETM X1963 -646
## 5 Aruba ABW Net migration SM.POP.NETM X1964 -684
## 6 Aruba ABW Net migration SM.POP.NETM X1965 -726
# Sum net migration over all years for each country. Despite the variable
# name, rows are keyed by Country.Name (not Country.Code); that key column is
# renamed to `country`, the name used as the join key later in this script.
# sum() is called without na.rm, so a country with an NA in any year gets an
# NA total.
migration_by_country_code <- df1 |>
  group_by(Country.Name) |>
  summarize(total_migration = sum(value)) |>
  rename(country = Country.Name)
# Hand-maintained list of sovereign-country names, used to filter out the
# regional / income-group aggregates in the source data.
# Each entry must be a single-line string: a literal broken across two source
# lines embeds a newline and can never match a name in a join (this previously
# happened to "South Sudan").
# NOTE(review): entries only match if they are spelled exactly like
# Country.Name in the migration file; names spelled differently there are
# silently dropped by the later left_join() + na.omit() steps -- verify the
# spellings against the data.
country_names <- c(
  "Afghanistan", "Albania", "Algeria", "Andorra", "Angola",
  "Antigua and Barbuda", "Argentina", "Armenia", "Australia", "Austria",
  "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus",
  "Belgium", "Belize", "Benin", "Bhutan", "Bolivia",
  "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria",
  "Burkina Faso", "Burundi", "Cabo Verde", "Cambodia", "Cameroon", "Canada",
  "Central African Republic", "Chad", "Chile", "China", "Colombia",
  "Comoros", "Congo", "Costa Rica", "Croatia", "Cuba", "Cyprus",
  "Czech Republic", "Denmark", "Djibouti", "Dominica", "Dominican Republic",
  "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea",
  "Estonia", "Eswatini", "Ethiopia", "Fiji", "Finland", "France", "Gabon",
  "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala",
  "Guinea", "Guinea-Bissau", "Guyana", "Haiti", "Honduras", "Hungary",
  "Iceland", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel",
  "Italy", "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati",
  "Korea, North", "Korea, South", "Kosovo", "Kuwait", "Kyrgyzstan", "Laos",
  "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein",
  "Lithuania", "Luxembourg", "Madagascar", "Malawi", "Malaysia", "Maldives",
  "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico",
  "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco",
  "Mozambique", "Myanmar", "Namibia", "Nauru", "Nepal", "Netherlands",
  "New Zealand", "Nicaragua", "Niger", "Nigeria", "North Macedonia",
  "Norway", "Oman", "Pakistan", "Palau", "Panama", "Papua New Guinea",
  "Paraguay", "Peru", "Philippines", "Poland", "Portugal", "Qatar",
  "Romania", "Russia", "Rwanda", "Saint Kitts and Nevis", "Saint Lucia",
  "Saint Vincent and the Grenadines", "Samoa", "San Marino",
  "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles",
  "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands",
  "Somalia", "South Africa", "South Sudan", "Spain", "Sri Lanka", "Sudan",
  "Suriname", "Sweden", "Switzerland", "Syria", "Taiwan", "Tajikistan",
  "Tanzania", "Thailand", "Timor-Leste", "Togo", "Tonga",
  "Trinidad and Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu",
  "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom",
  "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City",
  "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe"
)
# One-column data frame of the names; it is the left-hand side of the joins
# that restrict other tables to these countries.
countries_df <- data.frame(country = country_names)
# Restrict the migration totals to the listed countries: the left join
# attaches totals by `country`, and na.omit() then drops every name that had
# no match or whose total is NA.
countries_only <- na.omit(
  left_join(countries_df, migration_by_country_code, by = "country")
)
# Top 5 countries by net inflow (largest positive totals). head() returns 6
# rows by default, so n = 5 is passed explicitly to honour the "top 5" intent.
top_5_highest_influx <- head(
  countries_only[order(-countries_only$total_migration), ],
  n = 5
)
top_5_highest_influx
## country total_migration
## 185 United States 66105084
## 63 Germany 13151498
## 32 Canada 9937374
## 9 Australia 7678338
## 162 Spain 7118049
# Top 5 countries by net outflow (most negative totals), again with an
# explicit n = 5.
top_5_highest_exodus <- head(
  countries_only[order(countries_only$total_migration), ],
  n = 5
)
top_5_highest_exodus
## country total_migration
## 14 Bangladesh -20194739
## 131 Pakistan -18321926
## 36 China -16760013
## 111 Mexico -14101292
## 75 India -10760905
# Load the poverty-rate table and preview it
poverty <- read.csv("/Users/williamberritt/Downloads/poverty-rate-by-country-2024.csv")
head(poverty)
## country PovertyRateValue PovertyRateValueDataYear LessThan10USDPerDay
## 1 India 21.9 2011 92.1
## 2 China 0.0 2020 45.8
## 3 United States NA NA 2.2
## 4 Indonesia 9.5 2022 79.7
## 5 Pakistan 21.9 2018 94.5
## 6 Nigeria 40.1 2018 96.9
## LessThan365USDPerDay
## 1 44.8
## 2 3.0
## 3 1.2
## 4 22.4
## 5 39.8
## 6 63.5
# Attach poverty figures to the country list, keep just the name and the
# headline poverty rate, and drop countries where that rate is missing.
countries_poverty <- countries_df |>
  left_join(poverty, by = "country") |>
  subset(select = c("country", "PovertyRateValue")) |>
  na.omit()
# Add each country's total migration; countries without a usable total are
# dropped as well.
migration_vs_poverty <- countries_poverty |>
  left_join(migration_by_country_code, by = "country") |>
  na.omit()
# Scatter plot: poverty rate (x) against total net migration (y)
x <- migration_vs_poverty$PovertyRateValue
y <- migration_vs_poverty$total_migration
plot(x, y)
# Derive a per-year working table from the long data:
#   absolute_value - magnitude of net migration, ignoring direction
#   year           - "X1960"-style label with the X stripped, as a number
#   decade         - year rounded down to a multiple of 10 (1967 -> 1960)
# The four identifier columns are then given shorter names.
df_decade <- df1 |>
  mutate(
    absolute_value = abs(value),
    year = as.numeric(str_replace(year, "X", "")),
    decade = floor(year / 10) * 10
  ) |>
  rename(
    country = Country.Name,
    code = Country.Code,
    indicate = Indicator.Name,
    indic_code = Indicator.Code
  )
# Keep only rows belonging to the listed countries; na.omit() removes both
# unmatched names and rows with any missing field.
df_decade <- countries_df |>
  left_join(df_decade, by = "country") |>
  na.omit()
# Total absolute migration per decade. The input years stop at 2022, so the
# 2020 bucket holds 3 of 10 years; dividing it by .3 scales it to a
# full-decade equivalent. All other decades are left untouched.
migration_by_decade <- df_decade |>
  group_by(decade) |>
  summarize(absolute_migration = sum(absolute_value)) |>
  mutate(
    adj_value = ifelse(
      decade == 2020,
      absolute_migration / .3,
      absolute_migration
    )
  )
# Plot adjusted absolute migration against decade
plot(migration_by_decade$decade, migration_by_decade$adj_value)
# Load world population by year and keep the two columns used below. Each
# year is bucketed into its decade, and the population figures (read as text
# containing thousands separators) have their commas stripped before
# conversion to numbers.
pop_data <- read.csv("/Users/williamberritt/Documents/pop_data_proj2.csv") |>
  subset(select = c("Year", "World.Population")) |>
  mutate(
    decade = floor(Year / 10) * 10,
    World.Population = as.numeric(str_replace_all(World.Population, ",", ""))
  )
# Mean world population within each decade
avg_pop_decade <- pop_data |>
  group_by(decade) |>
  summarize(avg_pop = mean(World.Population))
# Attach the per-decade migration totals to the population averages and
# express adjusted absolute migration per 100 people of average population.
migration_with_pop <- avg_pop_decade |>
  left_join(migration_by_decade, by = "decade") |>
  mutate(adj_absolute_migration_rate_per_100 = (adj_value / avg_pop) * 100)
# Plot the migration rate against decade
plot(migration_with_pop$decade, migration_with_pop$adj_absolute_migration_rate_per_100)
# Total absolute migration for every (country, decade) pair.
# `.groups = "drop"` returns a plain ungrouped tibble: without it the result
# stays grouped by `country` (and summarize() prints a message saying so),
# which would silently change how later dplyr verbs on this table behave.
country_grouped_by_decade <- df_decade |>
  group_by(country, decade) |>
  summarize(absolute_migration = sum(absolute_value), .groups = "drop")
# Re-key the per-decade totals onto the country list and drop any listed
# country that has no totals.
ONLY_country_grouped_by_decade <- na.omit(
  left_join(countries_df, country_grouped_by_decade, by = "country")
)
# Return the `n` rows of `data` with the largest `absolute_migration` within
# one decade.
#   data          - data frame with `decade` and `absolute_migration` columns
#   target_decade - decade to keep, e.g. 1960
#   n             - number of rows to return; 5 by default because head()'s
#                   own default of 6 does not give a "top 5"
# which() skips rows whose decade is NA, matching what subset() does.
top_absolute_migration <- function(data, target_decade, n = 5) {
  in_decade <- data[which(data$decade == target_decade), , drop = FALSE]
  ranked <- in_decade[order(-in_decade$absolute_migration), , drop = FALSE]
  head(ranked, n = n)
}
# Top 5 countries by absolute migration for each decade
decade_analysis_1960 <-
  top_absolute_migration(ONLY_country_grouped_by_decade, 1960)
decade_analysis_1970 <-
  top_absolute_migration(ONLY_country_grouped_by_decade, 1970)
decade_analysis_1980 <-
  top_absolute_migration(ONLY_country_grouped_by_decade, 1980)
decade_analysis_1990 <-
  top_absolute_migration(ONLY_country_grouped_by_decade, 1990)
decade_analysis_2000 <-
  top_absolute_migration(ONLY_country_grouped_by_decade, 2000)
decade_analysis_2010 <-
  top_absolute_migration(ONLY_country_grouped_by_decade, 2010)
decade_analysis_2020 <-
  top_absolute_migration(ONLY_country_grouped_by_decade, 2020)
# Display results for 2020 and 1960
decade_analysis_2020
## country decade absolute_migration
## 1154 Ukraine 2020 6676584
## 894 Poland 2020 3370134
## 1175 United States 2020 2235680
## 845 Pakistan 2020 1226119
## 954 Saudi Arabia 2020 1208176
decade_analysis_1960
## country decade absolute_migration
## 1169 United States 1960 5148652
## 881 Philippines 1960 2305361
## 377 France 1960 1851980
## 399 Germany 1960 1554014
## 234 China 1960 1269564