##install.packages("tidyverse")
## Note: Professor, I commented out install.packages() because the package was already installed, so it only needed to be loaded using library().
library(tidyverse)
gapminder_wide <- read_csv("data/gapminder_wide(1).csv")
Ayberk_KOCAKIR_Assignment_1
The Economic Question
How have GDP per capita and life expectancy evolved across different continents since 1952? Which continents have seen the fastest growth, and which countries are outliers?
Part 1: Setup and Data Loading
In this section, I load the dataset and prepare the environment for the analysis.
Note: The dataset was automatically downloaded as gapminder_wide(1).csv, which is why the file name differs from data/gapminder_wide.csv.
Task 1.1:
glimpse(gapminder_wide)Rows: 142
Columns: 26
$ country <chr> "Afghanistan", "Albania", "Algeria", "Angola", "Argenti…
$ continent <chr> "Asia", "Europe", "Africa", "Africa", "Americas", "Ocea…
$ gdpPercap_1952 <dbl> 779.4453, 1601.0561, 2449.0082, 3520.6103, 5911.3151, 1…
$ gdpPercap_1957 <dbl> 820.8530, 1942.2842, 3013.9760, 3827.9405, 6856.8562, 1…
$ gdpPercap_1962 <dbl> 853.1007, 2312.8890, 2550.8169, 4269.2767, 7133.1660, 1…
$ gdpPercap_1967 <dbl> 836.1971, 2760.1969, 3246.9918, 5522.7764, 8052.9530, 1…
$ gdpPercap_1972 <dbl> 739.9811, 3313.4222, 4182.6638, 5473.2880, 9443.0385, 1…
$ gdpPercap_1977 <dbl> 786.1134, 3533.0039, 4910.4168, 3008.6474, 10079.0267, …
$ gdpPercap_1982 <dbl> 978.0114, 3630.8807, 5745.1602, 2756.9537, 8997.8974, 1…
$ gdpPercap_1987 <dbl> 852.3959, 3738.9327, 5681.3585, 2430.2083, 9139.6714, 2…
$ gdpPercap_1992 <dbl> 649.3414, 2497.4379, 5023.2166, 2627.8457, 9308.4187, 2…
$ gdpPercap_1997 <dbl> 635.3414, 3193.0546, 4797.2951, 2277.1409, 10967.2820, …
$ gdpPercap_2002 <dbl> 726.7341, 4604.2117, 5288.0404, 2773.2873, 8797.6407, 3…
$ gdpPercap_2007 <dbl> 974.5803, 5937.0295, 6223.3675, 4797.2313, 12779.3796, …
$ lifeExp_1952 <dbl> 28.801, 55.230, 43.077, 30.015, 62.485, 69.120, 66.800,…
$ lifeExp_1957 <dbl> 30.33200, 59.28000, 45.68500, 31.99900, 64.39900, 70.33…
$ lifeExp_1962 <dbl> 31.99700, 64.82000, 48.30300, 34.00000, 65.14200, 70.93…
$ lifeExp_1967 <dbl> 34.02000, 66.22000, 51.40700, 35.98500, 65.63400, 71.10…
$ lifeExp_1972 <dbl> 36.08800, 67.69000, 54.51800, 37.92800, 67.06500, 71.93…
$ lifeExp_1977 <dbl> 38.43800, 68.93000, 58.01400, 39.48300, 68.48100, 73.49…
$ lifeExp_1982 <dbl> 39.854, 70.420, 61.368, 39.942, 69.942, 74.740, 73.180,…
$ lifeExp_1987 <dbl> 40.822, 72.000, 65.799, 39.906, 70.774, 76.320, 74.940,…
$ lifeExp_1992 <dbl> 41.674, 71.581, 67.744, 40.647, 71.868, 77.560, 76.040,…
$ lifeExp_1997 <dbl> 41.763, 72.950, 69.152, 40.963, 73.275, 78.830, 77.510,…
$ lifeExp_2002 <dbl> 42.129, 75.651, 70.994, 41.003, 74.340, 80.370, 78.980,…
$ lifeExp_2007 <dbl> 43.828, 76.423, 72.301, 42.731, 75.320, 81.235, 79.829,…
gapminder_wide# A tibble: 142 × 26
country continent gdpPercap_1952 gdpPercap_1957 gdpPercap_1962 gdpPercap_1967
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Afghan… Asia 779. 821. 853. 836.
2 Albania Europe 1601. 1942. 2313. 2760.
3 Algeria Africa 2449. 3014. 2551. 3247.
4 Angola Africa 3521. 3828. 4269. 5523.
5 Argent… Americas 5911. 6857. 7133. 8053.
6 Austra… Oceania 10040. 10950. 12217. 14526.
7 Austria Europe 6137. 8843. 10751. 12835.
8 Bahrain Asia 9867. 11636. 12753. 14805.
9 Bangla… Asia 684. 662. 686. 721.
10 Belgium Europe 8343. 9715. 10991. 13149.
# ℹ 132 more rows
# ℹ 20 more variables: gdpPercap_1972 <dbl>, gdpPercap_1977 <dbl>,
# gdpPercap_1982 <dbl>, gdpPercap_1987 <dbl>, gdpPercap_1992 <dbl>,
# gdpPercap_1997 <dbl>, gdpPercap_2002 <dbl>, gdpPercap_2007 <dbl>,
# lifeExp_1952 <dbl>, lifeExp_1957 <dbl>, lifeExp_1962 <dbl>,
# lifeExp_1967 <dbl>, lifeExp_1972 <dbl>, lifeExp_1977 <dbl>,
# lifeExp_1982 <dbl>, lifeExp_1987 <dbl>, lifeExp_1992 <dbl>, …
My answer: The dataset contains 142 rows and 26 columns. Each row represents a country, while the columns contain different variables such as continent, GDP per capita, and life expectancy. The column names include both the variable name and the year (for example, lifeExp_1952 and gdpPercap_1952). This indicates that the dataset is stored in a wide format, where the same variable appears in multiple columns for different years.
Part 2: Data Tidying
Task 2.1:
Note: To convert the dataset into tidy format, I used the pivot_longer() function. This reshapes the dataset so that each variable becomes a column and each observation becomes a row. In our wide dataset, the variable and the year are combined in the same column name (for example, lifeExp_1952 and gdpPercap_1952). To make the data tidy and easier to analyze, these values needed to be separated into different columns.
# Reshape wide -> tidy: one row per country-year, one column per indicator.
# Wide column names look like "<indicator>_<year>"; ".value" turns the part
# before "_" into output columns (gdpPercap, lifeExp) and the part after "_"
# into `year`, which is converted from text to a number while pivoting.
gap_tidy <- gapminder_wide |>
  pivot_longer(
    cols = !c(country, continent),
    names_to = c(".value", "year"),
    names_sep = "_",
    names_transform = list(year = as.numeric)
  ) |>
  select(country, continent, year, gdpPercap, lifeExp)
head(gap_tidy, 10)# A tibble: 10 × 5
country continent year gdpPercap lifeExp
<chr> <chr> <dbl> <dbl> <dbl>
1 Afghanistan Asia 1952 779. 28.8
2 Afghanistan Asia 1957 821. 30.3
3 Afghanistan Asia 1962 853. 32.0
4 Afghanistan Asia 1967 836. 34.0
5 Afghanistan Asia 1972 740. 36.1
6 Afghanistan Asia 1977 786. 38.4
7 Afghanistan Asia 1982 978. 39.9
8 Afghanistan Asia 1987 852. 40.8
9 Afghanistan Asia 1992 649. 41.7
10 Afghanistan Asia 1997 635. 41.8
## Note: I did not assign the result back with gap_tidy <- because we only want to look at the first 10 rows.
Task 2.2:
The .value sentinel tells R to use part of the column name as the name of a new variable. In the wide dataset, the variable names and years were combined in the column names (for example, lifeExp_1952). Using .value separates them into proper variables and a year column, which makes the dataset tidy and easier to analyze. For example,
Before: lifeExp_1952
After: lifeExp | 1952
Task 2.3:
# Keep only the six focus economies, restricted to 1970 and later.
gap_filtered <- filter(
  gap_tidy,
  country %in% c(
    "Turkey", "Brazil", "Korea, Rep.", "Germany", "United States", "China"
  ) & year >= 1970
)
Part 3: Grouped Summaries
Task 3.1:
# Continent-level summary over every country-year row in gap_tidy:
# mean GDP per capita, mean life expectancy, and number of distinct countries.
continent_avg_gdp <- gap_tidy |>
  group_by(continent) |>
  summarise(
    avg_gdp     = mean(gdpPercap, na.rm = TRUE),
    avg_lifeExp = mean(lifeExp, na.rm = TRUE),
    n_countries = n_distinct(country),
    .groups     = "drop"
  )
continent_avg_gdp# A tibble: 5 × 4
continent avg_gdp avg_lifeExp n_countries
<chr> <dbl> <dbl> <int>
1 Africa 2194. 48.9 52
2 Americas 7136. 64.7 25
3 Asia 7902. 60.1 33
4 Europe 14469. 71.9 30
5 Oceania 18622. 74.3 2
Questions to answer: - Which continent has the highest average GDP per capita? - Which continent has the highest average life expectancy? - Are these the same continent? Why might that be?
My Answer: Oceania has the highest average GDP per capita (18621.609). Oceania has the highest average life expectancy (74.32621). Oceania has the highest average GDP per capita and the highest average life expectancy. This is likely because the continent mainly includes Australia and New Zealand, which are highly developed countries with strong economies, high living standards, and advanced healthcare systems.
Task 3.2:
# Top five countries by average GDP per capita across all years, together
# with their average life expectancy.
# slice_max() already returns its rows ordered from the largest to the
# smallest avg_gdp, so a separate arrange(desc(avg_gdp)) step is redundant.
# Note: with the default with_ties = TRUE, ties at the cut-off are kept, so
# more than five rows can be returned.
countries_avg_gdp <- gap_tidy |>
  group_by(country) |>
  summarize(
    avg_gdp = mean(gdpPercap, na.rm = TRUE),
    avg_lifeExp = mean(lifeExp, na.rm = TRUE),
    .groups = "drop"
  ) |>
  slice_max(order_by = avg_gdp, n = 5)
countries_avg_gdp# A tibble: 5 × 3
country avg_gdp avg_lifeExp
<chr> <dbl> <dbl>
1 Kuwait 65333. 68.9
2 Switzerland 27074. 75.6
3 Norway 26747. 75.8
4 United States 26261. 73.5
5 Canada 22411. 74.9
Question: Do any of these countries surprise you? Why might small, wealthy countries appear at the top?
My Answer: The results are not very surprising because these countries are known for having strong economies and high living standards. Small but wealthy countries like Kuwait and Switzerland appear at the top. Kuwait, for example, earns large revenues from oil exports, and this income is shared among a small population, which increases GDP per capita.
Task 3.3:
# Lookup table with one row per distinct country/continent pair.
continent_info <- distinct(gap_tidy, country, continent)

# Correlation (cor() default method) between GDP per capita and life
# expectancy within each continent; n_obs counts all country-year rows
# in the group.
cor_by_continent <- gap_tidy |>
  group_by(continent) |>
  summarise(
    correlation = cor(gdpPercap, lifeExp, use = "complete.obs"),
    n_obs       = n(),
    .groups     = "drop"
  )
cor_by_continent# A tibble: 5 × 3
continent correlation n_obs
<chr> <dbl> <int>
1 Africa 0.426 624
2 Americas 0.558 300
3 Asia 0.382 396
4 Europe 0.781 360
5 Oceania 0.956 24
Question: - In which continent is the relationship strongest (highest correlation)? - In which continent is it weakest? - What might explain the differences between continents?
My Answer:
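strongest relationship → Oceania (0.956)
weakest relationship → Asia (0.382), closely followed by Africa (0.426)
In Oceania, countries such as Australia and New Zealand have high incomes, strong healthcare systems, and high living standards, which creates a strong positive relationship between income and life expectancy (note that this correlation is based on only 2 countries and 24 observations). In contrast, Asia combines very different economies: for example, oil-rich Kuwait has by far the highest average GDP per capita in the dataset but a lower average life expectancy than Switzerland or Norway, so higher income does not translate as directly into longer lives. Many African countries also face economic inequality, limited healthcare access, and structural development challenges, which weakens the relationship between income and health outcomes.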
Part 4: Data Integration
Task 4.1:
# Load the two single-indicator tables used for the join in Task 4.2.
# Per the glimpse() output below, each has country, year and one indicator
# column (lifeExp or gdpPercap).
life_data <- read_csv("data/gap_life.csv")
gdp_data <- read_csv("data/gap_gdp.csv")
glimpse(life_data)Rows: 1,618
Columns: 3
$ country <chr> "Mali", "Malaysia", "Zambia", "Greece", "Swaziland", "Iran", "…
$ year <dbl> 1992, 1967, 1987, 2002, 1967, 1997, 2007, 2007, 1957, 2002, 19…
$ lifeExp <dbl> 48.388, 59.371, 50.821, 78.256, 46.633, 68.042, 73.747, 78.098…
glimpse(gdp_data)Rows: 1,618
Columns: 3
$ country <chr> "Bangladesh", "Mongolia", "Taiwan", "Burkina Faso", "Angola"…
$ year <dbl> 1987, 1997, 2002, 1962, 1962, 1977, 2007, 1962, 1992, 1972, …
$ gdpPercap <dbl> 751.9794, 1902.2521, 23235.4233, 722.5120, 4269.2767, 2785.4…
Task 4.2:
# Inner join: keep only the country-year pairs that appear in both tables.
gap_joined <- life_data |>
  inner_join(gdp_data, by = c("country", "year"))
nrow(gap_joined)[1] 1535
Task 4.3:
n_distinct(gap_joined$country)[1] 142
nrow(gap_joined)[1] 1535
How many rows are in gap_joined? = 1535
How many unique countries are in gap_joined? =142
The joined dataset has fewer rows (1,535) than the original gap_life.csv and gap_gdp.csv datasets (1,618 rows each) because inner_join() keeps only observations that exist in both datasets for the same country and year; country–year pairs that appear in only one of the two files are dropped.
Task 4.4:
# Show any joined rows in which at least one of the two indicators is missing.
gap_joined |>
  filter(!complete.cases(lifeExp, gdpPercap))
# A tibble: 0 × 4
# ℹ 4 variables: country <chr>, year <dbl>, lifeExp <dbl>, gdpPercap <dbl>
Task 4.5:
The analysis and predictions can be seriously biased if we do not account for missing values. For example, missing values in the dataset can bias the model, which diminishes the performance of the analysis. As economists, we can use an imputation method to handle missing values.
However, the trade-off is that the estimated values may not perfectly reflect the true data and could introduce estimation errors into the analysis.
Part 5: Economic Interpretation
- Which continent has seen the most dramatic economic growth since 1952? (Look at the numbers – don’t just guess.)
# Average GDP per capita per continent in the first (1952) and last (2007)
# survey years, as the basis for comparing growth across continents.
gdp_growth <- gap_tidy |>
  filter(year == 1952 | year == 2007) |>
  group_by(continent, year) |>
  summarise(
    avg_gdp = mean(gdpPercap, na.rm = TRUE),
    .groups = "drop"
  )
gdp_growth# A tibble: 10 × 3
continent year avg_gdp
<chr> <dbl> <dbl>
1 Africa 1952 1253.
2 Africa 2007 3089.
3 Americas 1952 4079.
4 Americas 2007 11003.
5 Asia 1952 5195.
6 Asia 2007 12473.
7 Europe 1952 5661.
8 Europe 2007 25054.
9 Oceania 1952 10298.
10 Oceania 2007 29810.
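In absolute terms, Oceania has experienced the most dramatic economic growth since 1952. The average GDP per capita in Oceania increased significantly from about 10298.086 in 1952 to around 29810.188 in 2007 (an increase of roughly 19,500). In relative terms, however, Europe grew the fastest: its average GDP per capita rose about 4.4-fold (from about 5,661 to about 25,054), compared with about 2.9-fold for Oceania.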
Is there a clear relationship between GDP per capita and life expectancy across continents? Refer to your correlation results.
continent_info <- gap_tidy |>
  select(country, continent) |>
  distinct()
cor_by_continent <- gap_tidy |>
  group_by(continent) |>
  summarize(
    correlation = cor(gdpPercap, lifeExp, use = "complete.obs"),
    n_obs = n(),
    .groups = "drop"
  )
cor_by_continent
# A tibble: 5 × 3
  continent correlation n_obs
  <chr>           <dbl> <int>
1 Africa          0.426   624
2 Americas        0.558   300
3 Asia            0.382   396
4 Europe          0.781   360
5 Oceania         0.956    24
The correlation results show a positive relationship between GDP per capita and life expectancy across continents: the correlation is positive in every continent, ranging from 0.382 in Asia to 0.956 in Oceania, so the strength of the relationship varies considerably.
What are the main limitations of this analysis? Consider data quality, missing values, time period, and what the data can’t tell us.
One limitation of this analysis relates to data structure and quality. In the original dataset, variables such as GDP per capita and life expectancy were combined with the year in the column names (for example, lifeExp_1952 and gdpPercap_1952). This required reshaping the data into a tidy format before analysis, which may introduce potential data handling errors if not done carefully.
sum(is.na(gapminder_wide))[1] 0
sum(is.na(gap_tidy))[1] 0
Although the dataset does not contain missing values, there are still several limitations. The data only covers the period from 1952 to 2007, so it does not reflect more recent economic and demographic changes.
AI Use Log:
| Tool Used | Prompt Given | How You Verified or Modified the Output |
|---|---|---|
| ChatGPT | How can I calculate economic growth between 1952 and 2007 using GDP per capita in R? | I tested the suggested code in RStudio and adapted it to my gap_tidy dataset to calculate the growth for each continent. |
| ChatGPT | How to check for missing values in an R dataset? | I ran the suggested commands in my dataset to confirm whether missing values existed and interpreted the results myself. |
| ChatGPT | Help rephrase some explanations in my assignment to make them clearer and more fluent | I rewrote and adjusted the explanations to ensure they reflected my own understanding of the analysis. |