Step 1: Choosing our dataset and loading data
## Selecting data from https://data.worldbank.org/indicator/NY.GDP.PCAP.CD
# Directory holding the raw World Bank export. The full path is built with
# file.path() rather than calling setwd(), because changing the working
# directory is a session-wide side effect that silently alters how every
# later relative path is resolved.
data_dir <- "/Users/georgegatsios/Desktop/mydata"

# Skip the first 4 lines of the file so that the row holding the column
# headers is the first line read_csv() sees and is used for the column names.
gdp_pc <- read_csv(
  file.path(data_dir, "gdp_per_capita_historic.csv"),
  skip = 4
)

# Preview the raw table. It shows the problem to solve next: the World Bank
# layout is wide, with indicator metadata columns and one column per year.
head(gdp_pc)
## # A tibble: 6 × 70
## `Country Name` `Country Code` `Indicator Name` `Indicator Code` `1960` `1961`
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 Aruba ABW GDP per capita … NY.GDP.PCAP.CD NA NA
## 2 Africa Eastern… AFE GDP per capita … NY.GDP.PCAP.CD 186. 187.
## 3 Afghanistan AFG GDP per capita … NY.GDP.PCAP.CD NA NA
## 4 Africa Western… AFW GDP per capita … NY.GDP.PCAP.CD 122. 127.
## 5 Angola AGO GDP per capita … NY.GDP.PCAP.CD NA NA
## 6 Albania ALB GDP per capita … NY.GDP.PCAP.CD NA NA
## # ℹ 64 more variables: `1962` <dbl>, `1963` <dbl>, `1964` <dbl>, `1965` <dbl>,
## # `1966` <dbl>, `1967` <dbl>, `1968` <dbl>, `1969` <dbl>, `1970` <dbl>,
## # `1971` <dbl>, `1972` <dbl>, `1973` <dbl>, `1974` <dbl>, `1975` <dbl>,
## # `1976` <dbl>, `1977` <dbl>, `1978` <dbl>, `1979` <dbl>, `1980` <dbl>,
## # `1981` <dbl>, `1982` <dbl>, `1983` <dbl>, `1984` <dbl>, `1985` <dbl>,
## # `1986` <dbl>, `1987` <dbl>, `1988` <dbl>, `1989` <dbl>, `1990` <dbl>,
## # `1991` <dbl>, `1992` <dbl>, `1993` <dbl>, `1994` <dbl>, `1995` <dbl>, …
Step 2: Cleaning our dataset
# Reshape the wide World Bank table into long format:
#   1. keep the two country identifier columns plus every year column
#      (column names beginning with "19" or "20"), dropping the indicator
#      metadata columns;
#   2. stack the year columns into Year / GDP_per_capita pairs, converting
#      Year from column-name text to an integer on the way;
#   3. discard country-years that have no reported value (NA).
gdp_pc_clean <- gdp_pc %>%
  select(`Country Name`, `Country Code`, starts_with(c("19", "20"))) %>%
  pivot_longer(
    cols = starts_with(c("19", "20")),
    names_to = "Year",
    names_transform = list(Year = as.integer),
    values_to = "GDP_per_capita",
    values_drop_na = TRUE
  )

# Preview the first rows of the cleaned, long-format data frame.
head(gdp_pc_clean)
## # A tibble: 6 × 4
## `Country Name` `Country Code` Year GDP_per_capita
## <chr> <chr> <int> <dbl>
## 1 Aruba ABW 1986 6768.
## 2 Aruba ABW 1987 8244.
## 3 Aruba ABW 1988 10056.
## 4 Aruba ABW 1989 11507.
## 5 Aruba ABW 1990 12188.
## 6 Aruba ABW 1991 13234.
Step 3: Coming up with a guiding question for our analysis
Question: How has GDP per capita grown in high-income vs. low-income
countries since 2000?
Step 4: Producing some summary statistics for our dataset
# Summary statistics per `Country Name` for the years from 2000 onwards:
#   avg_gdp - mean GDP per capita over those years
#   max_gdp - highest GDP per capita reached in those years
# group_by() returns the groups sorted by name, so the result is in
# alphabetical order; .groups = "drop" hands back an ungrouped tibble.
# NOTE(review): the data also holds regional aggregates (for example
# "Africa Eastern and Southern"), so not every row here is a country.
summary_stats <- gdp_pc_clean %>%
  filter(Year >= 2000) %>%
  group_by(`Country Name`) %>%
  summarise(
    avg_gdp = mean(GDP_per_capita, na.rm = TRUE),
    max_gdp = max(GDP_per_capita, na.rm = TRUE),
    .groups = "drop"
  )

# Preview the first rows of the summary table.
head(summary_stats)
## # A tibble: 6 × 3
## `Country Name` avg_gdp max_gdp
## <chr> <dbl> <dbl>
## 1 Afghanistan 415. 651.
## 2 Africa Eastern and Southern 1349. 1736.
## 3 Africa Western and Central 1459. 2221.
## 4 Albania 4370. 10012.
## 5 Algeria 4268. 6095.
## 6 American Samoa 11575. 18017.
Step 5: Producing our first insight
# Number of rows for the year 2000 whose GDP per capita exceeds 20,000 USD.
# Rows are narrowed to the matching year and threshold first, then counted;
# the result is a 1 x 1 tibble with the integer column `count_high_income`.
# NOTE(review): the data also contains regional aggregates (for example
# "Africa Eastern and Southern"), so this may count more than just
# countries - confirm whether aggregates should be excluded.
high_income_count1 <- gdp_pc_clean %>%
  filter(Year == 2000, GDP_per_capita > 20000) %>%
  summarise(count_high_income = n())

# Show the 2000 count.
high_income_count1
## # A tibble: 1 × 1
## count_high_income
## <int>
## 1 40
# Number of rows for the year 2020 whose GDP per capita exceeds 20,000 USD.
# Rows are narrowed to the matching year and threshold first, then counted;
# the result is a 1 x 1 tibble with the integer column `count_high_income`.
# NOTE(review): the data also contains regional aggregates (for example
# "Africa Eastern and Southern"), so this may count more than just
# countries - confirm whether aggregates should be excluded.
high_income_count2 <- gdp_pc_clean %>%
  filter(Year == 2020, GDP_per_capita > 20000) %>%
  summarise(count_high_income = n())

# Show the 2020 count.
high_income_count2
## # A tibble: 1 × 1
## count_high_income
## <int>
## 1 64
# Percentage change in the number of entries above 20,000 USD between 2000
# and 2020. The counts are pulled out of their 1 x 1 tibbles as plain numbers
# first: doing the arithmetic on the tibbles themselves yields a data frame,
# which only ends up in the message through implicit coercion by paste().
count_2000 <- high_income_count1$count_high_income
count_2020 <- high_income_count2$count_high_income
high_income_change <- (count_2020 - count_2000) * 100 / count_2000

# Round for display so a non-terminating fraction does not print with a long
# tail of decimals; whole-number results such as 60 print unchanged.
print(paste("The number of countries with GDP per capita exceeding 20,000 USD has grown by", round(high_income_change, 1), "% during the time period from 2000 to 2020, indicating global economic growth"))
## [1] "The number of countries with GDP per capita exceeding 20,000 USD has grown by 60 % during the time period from 2000 to 2020, indicating global economic growth"
Step 6: Producing another insight (visually)
# Line chart comparing GDP per capita over time for a few selected economies.
countries <- c("United States", "China", "India", "Nigeria", "Brazil")

gdp_pc_clean %>%
  # Bound the window on both sides so the plotted data matches the
  # "(2000–2020)" title; a lower bound alone would also draw any later years
  # present in the data.
  filter(`Country Name` %in% countries, between(Year, 2000, 2020)) %>%
  ggplot(aes(x = Year, y = GDP_per_capita, color = `Country Name`)) +
  # `linewidth` sets line thickness; using `size` for lines is deprecated
  # as of ggplot2 3.4.0.
  geom_line(linewidth = 1) +
  labs(
    title = "GDP per Capita (2000–2020)",
    x = "Year",
    y = "GDP per capita (current US$)",
    # str_wrap() breaks the long note into lines of at most 100 characters so
    # it fits under the plot.
    caption = str_wrap("Note. From the graph above, we get another valuable insight. Specifically, high-income US's GDP per capita has grown significantly. Fast-developing countries like China and India show rapid upward trends in GDP per capita, whereas some economies (such as that of Nigeria) have relatively flat GDP per capita growth, pointing to persistent global inequalities. Source: The World Bank Group (2025)", width = 100)
  ) +
  theme_minimal() +
  theme(
    # Left-align the caption (hjust = 0) and loosen the line spacing slightly
    # so the multi-line note stays readable.
    plot.caption = element_text(hjust = 0, size = 10, lineheight = 1.1)
  )
