library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(countrycode)
# Load the raw indicator tables (one row per country, one column per year).
# All files live in a single directory; change `data_dir` to point at your own
# copy of the CSVs instead of editing every read_csv() call.
data_dir <- "/Users/twinklerein/Downloads"

# Note: several files are parsed (partly) as character columns (see the
# column specifications below), so the values need converting to numeric
# before analysis.
population <- read_csv(file.path(data_dir, "pop.csv"))
## Rows: 197 Columns: 302
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (302): country, 1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 18...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
life_expectancy <- read_csv(file.path(data_dir, "lex.csv"))
## Rows: 196 Columns: 302
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (301): 1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, 1810,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
income <- read_csv(file.path(data_dir, "mincpcap_cppp.csv"))
## Rows: 195 Columns: 302
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (301): 1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, 1810,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
babies_per_woman <- read_csv(
  file.path(data_dir, "children_per_woman_total_fertility.csv")
)
## Rows: 197 Columns: 302
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (301): 1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, 1810,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
child_mortality <- read_csv(
  file.path(data_dir, "child_mortality_0_5_year_olds_dying_per_1000_born.csv")
)
## Rows: 197 Columns: 302
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (301): 1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, 1810,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
co2_emissions <- read_csv(file.path(data_dir, "co2_pcap_cons.csv"))
## Rows: 194 Columns: 224
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): country, 2003, 2004, 2005, 2006, 2011, 2012, 2013
## dbl (216): 1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, 1810,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gdp_per_capita <- read_csv(file.path(data_dir, "gdp_pcap.csv"))
## Rows: 195 Columns: 302
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (199): country, 1901, 1903, 1905, 1906, 1907, 1908, 1909, 1910, 1911, 19...
## dbl (103): 1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, 1810,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
health_spending <- read_csv(
  file.path(data_dir, "total_health_spending_per_person_us.csv")
)
## Rows: 190 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (16): 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
population_density <- read_csv(
  file.path(data_dir, "population_density_per_square_km.csv")
)
## Rows: 236 Columns: 152
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (152): country, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 19...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
basic_water <- read_csv(
  file.path(data_dir, "at_least_basic_water_source_overall_access_percent.csv")
)
## Rows: 215 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (23): 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
murders <- read_csv(file.path(data_dir, "murder_total_deaths.csv"))
## Rows: 204 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (31): country, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 199...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Reduce every indicator table to its 2010 snapshot. Naming the column inside
# select() keeps `country` and renames the `2010` column in a single step.
population <- select(population, country, population = `2010`)
life_expectancy <- select(life_expectancy, country, lifeexp = `2010`)
income <- select(income, country, income = `2010`)
babies_per_woman <- select(babies_per_woman, country, chdperwoman = `2010`)
child_mortality <- select(child_mortality, country, childmort = `2010`)
co2_emissions <- select(co2_emissions, country, co2 = `2010`)
gdp_per_capita <- select(gdp_per_capita, country, gdpcapita = `2010`)
health_spending <- select(health_spending, country, healthspend = `2010`)
population_density <- select(population_density, country, popdensity = `2010`)
basic_water <- select(basic_water, country, water = `2010`)
murders <- select(murders, country, murder = `2010`)
# Convert human-readable magnitude strings to plain numbers.
#
# Values such as "80.7k", "2.93M" or "1.34B" are expanded to 80700, 2930000
# and 1340000000. Strings without a magnitude letter are handed to
# as.numeric() unchanged. Letter matching is case-insensitive.
#
# x: a vector of values; it is coerced with as.character(), so numeric input
#    passes through unchanged.
# Returns a numeric vector with one element per element of x. NA inputs stay
# NA; strings that still cannot be parsed become NA and trigger the usual
# as.numeric() coercion warning. Empty input returns numeric(0).
text.to.num <- function(x) {
  xx <- as.character(x)
  res <- rep(NA_real_, length(xx))

  # Tried in this order: an element is scaled by the first letter it contains.
  multipliers <- c(M = 1e6, k = 1e3, B = 1e9)

  # Elements still waiting to be converted (NA inputs are never converted).
  todo <- !is.na(xx)
  for (suffix in names(multipliers)) {
    hit <- todo & grepl(suffix, xx, ignore.case = TRUE)
    res[hit] <- as.numeric(gsub(suffix, "", xx[hit], ignore.case = TRUE)) *
      multipliers[[suffix]]
    todo <- todo & !hit
  }

  # Whatever is left carries no magnitude letter: parse it directly.
  res[todo] <- as.numeric(xx[todo])
  res
}
# Convert each table's indicator column to numeric with text.to.num(), which
# expands "k"/"M" magnitude suffixes and leaves plain numbers unchanged.
population$population <- text.to.num(population$population)
## Warning in text.to.num(population$population): NAs introduced by coercion
## Warning in text.to.num(population$population): NAs introduced by coercion
# NOTE(review): the warnings above show that some population values could not
# be parsed and became NA. Presumably they carry a suffix other than k/M
# (e.g. "B" for billions) -- confirm against pop.csv.
life_expectancy$lifeexp <- text.to.num(life_expectancy$lifeexp)
income$income <- text.to.num(income$income)
babies_per_woman$chdperwoman <- text.to.num(babies_per_woman$chdperwoman)
child_mortality$childmort <- text.to.num(child_mortality$childmort)
co2_emissions$co2 <- text.to.num(co2_emissions$co2)
gdp_per_capita$gdpcapita <- text.to.num(gdp_per_capita$gdpcapita)
health_spending$healthspend <- text.to.num(health_spending$healthspend)
population_density$popdensity <- text.to.num(population_density$popdensity)
basic_water$water <- text.to.num(basic_water$water)
murders$murder <- text.to.num(murders$murder)
# Combine the eleven single-indicator tables into one wide table keyed by
# country. all = TRUE makes every merge a full outer join, so a country that
# is missing from some table is kept with NA in that table's column.
alldata <- reduce(
  list(
    population, life_expectancy, income, babies_per_woman, child_mortality,
    co2_emissions, gdp_per_capita, health_spending, population_density,
    basic_water, murders
  ),
  function(left, right) merge(left, right, by = "country", all = TRUE)
)

# Derive each country's continent from its English country name.
alldata[["continent"]] <- countrycode(
  sourcevar = alldata[["country"]],
  origin = "country.name",
  destination = "continent"
)
## Warning: Some values were not matched unambiguously: Kosovo

# Keep complete cases only: any row with an NA in any column is removed.
# NOTE(review): Kosovo was not matched above, so presumably its continent is
# NA and it is removed here as well -- confirm.
alldata <- drop_na(alldata)
head(alldata)
## country population lifeexp income chdperwoman childmort co2
## 1 Afghanistan 28300000 60.5 4.50 6.20 88.00 0.29
## 2 Albania 2930000 78.1 9.77 1.65 13.30 2.26
## 3 Algeria 36200000 74.5 9.08 2.88 27.40 3.28
## 4 Andorra 80700 81.8 66.50 1.28 4.18 6.12
## 5 Angola 23300000 60.2 6.29 6.19 120.00 1.24
## 6 Antigua and Barbuda 85300 75.9 18.40 1.79 9.59 5.96
## gdpcapita healthspend popdensity water murder continent
## 1 2030 37.7 43.6 48.8 4130.00 Asia
## 2 10700 241.0 107.0 91.4 65.90 Europe
## 3 11000 178.0 15.2 92.3 530.00 Africa
## 4 60500 3100.0 172.0 100.0 0.50 Europe
## 5 7690 123.0 18.7 50.4 824.00 Africa
## 6 20100 690.0 194.0 98.4 5.05 Americas
# Print per-column summary statistics for the merged table.
summary(alldata)
## country population lifeexp income
## Length:177 Min. : 10000 Min. :32.50 Min. : 1.69
## Class :character 1st Qu.: 1710000 1st Qu.:64.00 1st Qu.: 4.91
## Mode :character Median : 7300000 Median :72.30 Median : 11.20
## Mean : 22512891 Mean :70.35 Mean : 19.85
## 3rd Qu.: 22500000 3rd Qu.:76.50 3rd Qu.: 23.70
## Max. :311000000 Max. :83.30 Max. :120.00
## chdperwoman childmort co2 gdpcapita
## Min. :0.930 Min. : 2.62 Min. : 0.030 Min. : 804
## 1st Qu.:1.790 1st Qu.: 8.53 1st Qu.: 0.678 1st Qu.: 3770
## Median :2.410 Median : 19.90 Median : 2.620 Median : 11300
## Mean :2.984 Mean : 37.90 Mean : 5.053 Mean : 18203
## 3rd Qu.:4.130 3rd Qu.: 59.90 3rd Qu.: 7.430 3rd Qu.: 24500
## Max. :7.640 Max. :209.00 Max. :26.400 Max. :112000
## healthspend popdensity water murder
## Min. : 11.9 Min. : 1.74 Min. : 34.00 Min. : 0.5
## 1st Qu.: 76.9 1st Qu.: 30.20 1st Qu.: 73.50 1st Qu.: 51.8
## Median : 320.0 Median : 75.00 Median : 93.00 Median : 264.0
## Mean : 998.0 Mean : 178.95 Mean : 84.69 Mean : 1804.9
## 3rd Qu.: 864.0 3rd Qu.: 162.00 3rd Qu.: 98.60 3rd Qu.: 797.0
## Max. :8360.0 Max. :7430.00 Max. :100.00 Max. :62900.0
## continent
## Length:177
## Class :character
## Mode :character
##
##
##
# Task 1: Scatterplot of Life Expectancy vs. Child Mortality. We begin by plotting Life Expectancy against Child Mortality to examine the relationship between these two variables. The plot shows a negative correlation: countries with higher child mortality tend to have lower life expectancy.
# Scatterplot with child mortality on the x-axis and life expectancy on the
# y-axis, one point per row of alldata.
alldata %>%
  ggplot(mapping = aes(x = childmort, y = lifeexp)) +
  geom_point() +
  labs(
    x = "Child Mortality",
    y = "Life Expectancy",
    title = "Life Expectancy vs. Child Mortality"
  )
We further analyze the relationship between Life Expectancy and Child Mortality by adding continent (color) and population (size) to the scatterplot. Key observations:

- African countries have higher child mortality and lower life expectancy.
- European and American countries have lower child mortality and higher life expectancy.
- Larger countries (bigger points) are more concentrated in certain regions.

We examine the distribution of Life Expectancy using a histogram. The distribution is left-skewed (negatively skewed): most countries have relatively high life expectancy, while a few countries (mainly in Africa) form a long tail of significantly lower values. This agrees with the summary statistics, where the mean (70.35) lies below the median (72.30).

A boxplot of Life Expectancy reveals the median, quartiles, and potential outliers. Key observations:

- The median life expectancy is around 72 years.
- Several outliers exist, primarily countries with very low life expectancy.

We compare Life Expectancy across continents using a grouped boxplot. Key observations:

- Europe and the Americas have the highest life expectancy, with little variability.
- Africa has the lowest life expectancy and the most variability.
- Asia and Oceania fall in between.

We visualize the number of countries in each continent. Key observations:

- Africa has the most countries in the dataset.
- Asia has the second-highest number of countries.
- Oceania has the fewest countries.

We create an indicator variable `child_indicator` to classify countries based on whether the average number of children per woman is less than or equal to 2. Key observations:

- Most European and American countries have low fertility rates (<= 2).
- Many African and some Asian countries have high fertility rates (> 2).

We visualize the distribution of `child_indicator` across continents. Key observations:

- Europe and the Americas have a high percentage of countries with low fertility rates (<= 2).
- Africa has a high percentage of countries with high fertility rates (> 2).