Step 1: Choosing our dataset and loading data

## Selecting data from https://data.worldbank.org/indicator/NY.GDP.PCAP.CD 
# Directory that holds the CSV downloaded from the World Bank indicator page.
data_dir <- "/Users/georgegatsios/Desktop/mydata"

# Read the file through its full path rather than calling setwd(), so the
# session's working directory is left untouched. skip = 4 drops the four lines
# that precede the header, so the column names come from the first line kept.
gdp_pc <- read_csv(file.path(data_dir, "gdp_per_capita_historic.csv"), skip = 4)

# Preview the raw data: it is wide, with metadata columns (indicator name and
# code) and one column per year, so it needs reshaping before analysis.
head(gdp_pc)
## # A tibble: 6 × 70
##   `Country Name`  `Country Code` `Indicator Name` `Indicator Code` `1960` `1961`
##   <chr>           <chr>          <chr>            <chr>             <dbl>  <dbl>
## 1 Aruba           ABW            GDP per capita … NY.GDP.PCAP.CD      NA     NA 
## 2 Africa Eastern… AFE            GDP per capita … NY.GDP.PCAP.CD     186.   187.
## 3 Afghanistan     AFG            GDP per capita … NY.GDP.PCAP.CD      NA     NA 
## 4 Africa Western… AFW            GDP per capita … NY.GDP.PCAP.CD     122.   127.
## 5 Angola          AGO            GDP per capita … NY.GDP.PCAP.CD      NA     NA 
## 6 Albania         ALB            GDP per capita … NY.GDP.PCAP.CD      NA     NA 
## # ℹ 64 more variables: `1962` <dbl>, `1963` <dbl>, `1964` <dbl>, `1965` <dbl>,
## #   `1966` <dbl>, `1967` <dbl>, `1968` <dbl>, `1969` <dbl>, `1970` <dbl>,
## #   `1971` <dbl>, `1972` <dbl>, `1973` <dbl>, `1974` <dbl>, `1975` <dbl>,
## #   `1976` <dbl>, `1977` <dbl>, `1978` <dbl>, `1979` <dbl>, `1980` <dbl>,
## #   `1981` <dbl>, `1982` <dbl>, `1983` <dbl>, `1984` <dbl>, `1985` <dbl>,
## #   `1986` <dbl>, `1987` <dbl>, `1988` <dbl>, `1989` <dbl>, `1990` <dbl>,
## #   `1991` <dbl>, `1992` <dbl>, `1993` <dbl>, `1994` <dbl>, `1995` <dbl>, …

Step 2: Cleaning our dataset

# Keep the country identifiers plus the year columns, then reshape to long
# format: one row per country-year. The year labels are converted to integer
# and rows with a missing GDP value are dropped as part of the pivot itself.
gdp_pc_clean <- gdp_pc %>%
  select(`Country Name`, `Country Code`, starts_with("19"), starts_with("20")) %>%
  pivot_longer(
    cols = -c(`Country Name`, `Country Code`),
    names_to = "Year",
    values_to = "GDP_per_capita",
    names_transform = list(Year = as.integer),
    values_drop_na = TRUE
  )

# Preview the tidy data frame (Country Name, Country Code, Year, GDP_per_capita)
head(gdp_pc_clean)
## # A tibble: 6 × 4
##   `Country Name` `Country Code`  Year GDP_per_capita
##   <chr>          <chr>          <int>          <dbl>
## 1 Aruba          ABW             1986          6768.
## 2 Aruba          ABW             1987          8244.
## 3 Aruba          ABW             1988         10056.
## 4 Aruba          ABW             1989         11507.
## 5 Aruba          ABW             1990         12188.
## 6 Aruba          ABW             1991         13234.

Step 3: Coming up with a guiding question for our analysis

Question: How has GDP per capita grown in high-income vs. low-income countries since 2000?

Step 4: Producing some summary statistics for our dataset

# Summary statistics per `Country Name`, restricted to the years from 2000
# onward: the mean and the maximum of GDP per capita over that period.
summary_stats <- gdp_pc_clean %>%
  filter(Year >= 2000) %>%
  group_by(`Country Name`) %>%
  summarise(
    avg_gdp = mean(GDP_per_capita, na.rm = TRUE),
    max_gdp = max(GDP_per_capita, na.rm = TRUE)
  ) %>%
  ungroup()

# Preview the first rows of the summary table
head(summary_stats)
## # A tibble: 6 × 3
##   `Country Name`              avg_gdp max_gdp
##   <chr>                         <dbl>   <dbl>
## 1 Afghanistan                    415.    651.
## 2 Africa Eastern and Southern   1349.   1736.
## 3 Africa Western and Central    1459.   2221.
## 4 Albania                       4370.  10012.
## 5 Algeria                       4268.   6095.
## 6 American Samoa               11575.  18017.

Step 5: Producing our first insight

# Number of rows for the year 2000 whose GDP per capita exceeds 20,000 USD.
# NOTE(review): the data also contains aggregate rows (e.g. "Africa Eastern
# and Southern"), so this may count more than individual countries - confirm.
high_income_count1 <- gdp_pc_clean %>%
  filter(Year == 2000, GDP_per_capita > 20000) %>%
  summarize(count_high_income = n())

# Show the 1 x 1 result: entries above 20,000 USD in 2000
high_income_count1
## # A tibble: 1 × 1
##   count_high_income
##               <int>
## 1                40
# Number of rows for the year 2020 whose GDP per capita exceeds 20,000 USD.
# NOTE(review): the data also contains aggregate rows (e.g. "Africa Eastern
# and Southern"), so this may count more than individual countries - confirm.
high_income_count2 <- gdp_pc_clean %>%
  filter(Year == 2020, GDP_per_capita > 20000) %>%
  summarize(count_high_income = n())

# Show the 1 x 1 result: entries above 20,000 USD in 2020
high_income_count2
## # A tibble: 1 × 1
##   count_high_income
##               <int>
## 1                64
# Extract the scalar counts from the 1 x 1 summary tables so the percentage
# change is computed on plain numbers rather than on whole data frames.
n_high_2000 <- high_income_count1[["count_high_income"]]
n_high_2020 <- high_income_count2[["count_high_income"]]

# Percentage change in the number of high-income entries from 2000 to 2020
high_income_change <- (n_high_2020 - n_high_2000) * 100 / n_high_2000

print(paste("The number of countries with GDP per capita exceeding 20,000 USD has grown by", high_income_change, "% during the time period from 2000 to 2020, indicating global economic growth"))
## [1] "The number of countries with GDP per capita exceeding 20,000 USD has grown by 60 % during the time period from 2000 to 2020, indicating global economic growth"

Step 6: Producing another insight (visually)

# Comparing GDP per capita over time for a handful of selected economies
countries <- c("United States", "China", "India", "Nigeria", "Brazil")

gdp_pc_clean %>%
  # Keep only the selected countries and the 2000-2020 window named in the
  # plot title (previously there was no upper bound on Year).
  filter(`Country Name` %in% countries, Year >= 2000, Year <= 2020) %>%
  ggplot(aes(x = Year, y = GDP_per_capita, color = `Country Name`)) +
  # linewidth is the line-thickness argument; size for lines is deprecated
  # as of ggplot2 3.4.0.
  geom_line(linewidth = 1) +
  labs(
    title = "GDP per Capita (2000–2020)",
    x = "Year",
    y = "GDP per capita (current US$)",
    # str_wrap() breaks the long note into lines of at most 100 characters
    caption = str_wrap("Note. From the graph above, we get another valuable insight. Specifically, high-income US's GDP per capita has grown significantly. Fast-developing countries like China and India show rapid upward trends in GDP per capita, whereas some economies (such as that of Nigeria) have relatively flat GDP per capita growth, pointing to persistent global inequalities. Source: The World Bank Group (2025)", width = 100)
  ) +
  theme_minimal() +
  theme(
    # Left-align the caption so the wrapped note reads like a paragraph
    plot.caption = element_text(hjust = 0, size = 10, lineheight = 1.1)
  )