library(tidyverse)
library(readr)
df<-read_csv("WDICSV.csv")
country_list<-read_csv("WDICountry.csv")
head(df)
colnames(df)
head(country_list)
colnames(country_list)
# filter real countries' code
valid_countries <- country_list %>%
filter(!is.na(Region) & Region != "",
`Income Group` != "Aggregates",
!grepl("Rep\\.", `Table Name`)) %>%
pull(`Country Code`)
head(valid_countries)
## [1] "ABW" "AFG" "AGO" "ALB" "AND" "ARE"
length(valid_countries)
## [1] 208
# Keep only real countries & 2014-2024 data in df
df_filtered <- df %>% filter(`Country Code` %in% valid_countries) %>% select(`Country Name`, `Country Code`, `Indicator Name`, `Indicator Code`, `2014`:`2024`)
head(df_filtered)
## # A tibble: 6 × 15
## `Country Name` `Country Code` `Indicator Name` `Indicator Code` `2014` `2015`
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 Afghanistan AFG Access to clean … EG.CFT.ACCS.ZS 26.1 27.6
## 2 Afghanistan AFG Access to clean … EG.CFT.ACCS.RU.… 10.2 11.4
## 3 Afghanistan AFG Access to clean … EG.CFT.ACCS.UR.… 78 79.5
## 4 Afghanistan AFG Access to electr… EG.ELC.ACCS.ZS 89.5 71.5
## 5 Afghanistan AFG Access to electr… EG.ELC.ACCS.RU.… 86.5 64.6
## 6 Afghanistan AFG Access to electr… EG.ELC.ACCS.UR.… 98.7 92.5
## # ℹ 9 more variables: `2016` <dbl>, `2017` <dbl>, `2018` <dbl>, `2019` <dbl>,
## # `2020` <dbl>, `2021` <dbl>, `2022` <dbl>, `2023` <dbl>, `2024` <dbl>
dim(df_filtered)
## [1] 315328 15
# Filter: GDP > 100B USD in 2024
valid_gdp_countries <- df_filtered %>%
filter(`Indicator Name` == "GDP (current US$)" & `2024` > 100000000000) %>%
pull(`Country Code`)
# Filter: Population > 10M in 2024
valid_pop_countries <- df_filtered %>%
filter(`Indicator Name` == "Population, total" & `2024` > 10000000) %>%
pull(`Country Code`)
# Final valid countries = intersection
valid_countries_final <- intersect(valid_gdp_countries, valid_pop_countries)
# Keep only those countries
df_filtered_final <- df_filtered %>%
filter(`Country Code` %in% valid_countries_final)
dim(df_filtered_final)
## [1] 72768 15
df_potential <- df_filtered_final %>%
filter(`Indicator Name` %in% c("GDP (current US$)",
"GDP growth (annual %)",
"Population, total"))
head(df_potential)
## # A tibble: 6 × 15
## `Country Name` `Country Code` `Indicator Name` `Indicator Code` `2014`
## <chr> <chr> <chr> <chr> <dbl>
## 1 Algeria DZA GDP (current US$) NY.GDP.MKTP.CD 2.39e11
## 2 Algeria DZA GDP growth (annual %) NY.GDP.MKTP.KD.ZG 4.10e 0
## 3 Algeria DZA Population, total SP.POP.TOTL 3.92e 7
## 4 Argentina ARG GDP (current US$) NY.GDP.MKTP.CD 5.26e11
## 5 Argentina ARG GDP growth (annual %) NY.GDP.MKTP.KD.ZG -2.51e 0
## 6 Argentina ARG Population, total SP.POP.TOTL 4.30e 7
## # ℹ 10 more variables: `2015` <dbl>, `2016` <dbl>, `2017` <dbl>, `2018` <dbl>,
## # `2019` <dbl>, `2020` <dbl>, `2021` <dbl>, `2022` <dbl>, `2023` <dbl>,
## # `2024` <dbl>
df_growth <- df_potential %>%
filter(`Indicator Name` == "GDP growth (annual %)") %>%
pivot_longer(cols = `2014`:`2024`, names_to = "Year", values_to = "GDP_Growth") %>%
group_by(`Country Name`) %>%
summarize(
Avg_GDP_Growth = mean(GDP_Growth, na.rm = TRUE)
) %>%
ungroup()
head(df_growth)
## # A tibble: 6 × 2
## `Country Name` Avg_GDP_Growth
## <chr> <dbl>
## 1 Algeria 2.25
## 2 Argentina -0.107
## 3 Australia 2.36
## 4 Bangladesh 6.27
## 5 Belgium 1.65
## 6 Brazil 0.832
df_2024 <- df_potential %>%
filter(`Indicator Name` %in% c("GDP (current US$)", "Population, total")) %>%
select(`Country Name`, `Indicator Name`, `2024`) %>%
pivot_wider(names_from = `Indicator Name`, values_from = `2024`)
df_final <- df_2024 %>%
left_join(df_growth, by = "Country Name") %>%
filter(`GDP (current US$)` > 100000000000,
`Population, total` > 10000000,
Avg_GDP_Growth > 3) %>%
arrange(desc(Avg_GDP_Growth))
df_final %>% count()
## # A tibble: 1 × 1
## n
## <int>
## 1 18
print(df_final)
## # A tibble: 18 × 4
## `Country Name` `GDP (current US$)` `Population, total` Avg_GDP_Growth
## <chr> <dbl> <dbl> <dbl>
## 1 Bangladesh 4.50e11 173562364 6.27
## 2 India 3.91e12 1450935791 6.18
## 3 Viet Nam 4.76e11 100987686 6.18
## 4 China 1.87e13 1408975000 5.94
## 5 Uzbekistan 1.15e11 36361859 5.92
## 6 Dominican Republic 1.24e11 11427557 5.02
## 7 Philippines 4.62e11 115843670 4.93
## 8 Turkiye 1.32e12 85518661 4.80
## 9 Kenya 1.24e11 56432944 4.64
## 10 Indonesia 1.40e12 283487931 4.29
## 11 Malaysia 4.22e11 35557673 4.18
## 12 Pakistan 3.73e11 251269164 3.75
## 13 Poland 9.15e11 36554707 3.70
## 14 Guatemala 1.13e11 18406359 3.58
## 15 Romania 3.83e11 19069340 3.40
## 16 Kazakhstan 2.88e11 20592571 3.10
## 17 United Arab Emirates 5.37e11 10876981 3.09
## 18 Saudi Arabia 1.24e12 35300280 3.05
Bangladesh,India,Vietnam,China,Uzbekistan,Dominican Republic,Philippines,Turkiye,Kenya,Indonesia, Malaysia,Pakistan,Poland,Guatemala, Romania,Kazakhstan,United Arab Emirates and Saudi Arabia.
ggplot(df_final, aes(x = `GDP (current US$)`/1000000000000,
y = Avg_GDP_Growth,
size = `Population, total`/1000000000,
label = `Country Name`)) +
geom_point(color = "steelblue", alpha = 0.7) +
geom_text(vjust = -0.5, size = 2.8) +
scale_x_continuous(labels = scales::number_format(accuracy = 0.1)) +
scale_size_continuous(labels = scales::number_format(accuracy = 0.1)) +
labs(title = "Potential Growth Markets (2024)",
x = "GDP (Trillion USD)",
y = "Average GDP Growth (2014-2024)",
size = "Population in Billion") +
theme_minimal()
·High-growth countries: Bangladesh, Vietnam, India, Uzbekistan etc., with growth rates > 5%, are considered future potential markets.
·Large-scale markets: China, India have large GDP sizes, suitable for scaled asset allocation.
·Medium-sized high-growth markets:Philippines, Turkye, Kenya etc., have high growth rates and medium population sizes, suitable for high-growth investment.
·Investment portfolio approach: By combining GDP size, growth rate, and population, one can focus on countries with high growth + high potential + reasonable scale for targeted allocation.