# load required packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Read the CIA "countries of the world" Kaggle CSV and run the column names
# through janitor::clean_names().
# NOTE(review): the path below is absolute and machine-specific; it has to be
# adjusted before this script can run anywhere else.
countries_of_the_world <- janitor::clean_names(
  read.csv(
    "C:/Users/tianm/Documents/DATA110/data/countries of the world_cia_kaggle.csv"
  )
)
# Preview the first rows of the imported table
head(countries_of_the_world)
## country region population area_sq_mi
## 1 Afghanistan ASIA (EX. NEAR EAST) 31056997 647500
## 2 Albania EASTERN EUROPE 3581655 28748
## 3 Algeria NORTHERN AFRICA 32930091 2381740
## 4 American Samoa OCEANIA 57794 199
## 5 Andorra WESTERN EUROPE 71201 468
## 6 Angola SUB-SAHARAN AFRICA 12127071 1246700
## pop_density_per_sq_mi coastline_coast_area_ratio net_migration
## 1 48,0 0,00 23,06
## 2 124,6 1,26 -4,93
## 3 13,8 0,04 -0,39
## 4 290,4 58,29 -20,71
## 5 152,1 0,00 6,6
## 6 9,7 0,13 0
## infant_mortality_per_1000_births gdp_per_capita literacy phones_per_1000
## 1 163,07 700 36,0 3,2
## 2 21,52 4500 86,5 71,2
## 3 31 6000 70,0 78,1
## 4 9,27 8000 97,0 259,5
## 5 4,05 19000 100,0 497,2
## 6 191,19 1900 42,0 7,8
## arable crops other climate birthrate deathrate agriculture industry service
## 1 12,13 0,22 87,65 1 46,6 20,34 0,38 0,24 0,38
## 2 21,09 4,42 74,49 3 15,11 5,22 0,232 0,188 0,579
## 3 3,22 0,25 96,53 1 17,14 4,61 0,101 0,6 0,298
## 4 10 15 75 2 22,46 3,27
## 5 2,22 0 97,78 3 8,71 6,25
## 6 2,41 0,24 97,35 45,11 24,2 0,096 0,658 0,246
# List the (cleaned) column names of the imported data frame
names(countries_of_the_world)
## [1] "country" "region"
## [3] "population" "area_sq_mi"
## [5] "pop_density_per_sq_mi" "coastline_coast_area_ratio"
## [7] "net_migration" "infant_mortality_per_1000_births"
## [9] "gdp_per_capita" "literacy"
## [11] "phones_per_1000" "arable"
## [13] "crops" "other"
## [15] "climate" "birthrate"
## [17] "deathrate" "agriculture"
## [19] "industry" "service"
# Keep the columns of interest and clean them up.
#
# `literacy` arrives as text with a decimal comma (e.g. "86,5"), and its
# missing entries are not NA at that point (presumably empty strings), so an
# is.na() filter applied before the conversion removes nothing. Convert to
# numeric first, then drop the rows with missing values, so the plots further
# down receive complete data instead of discarding rows with warnings.
selected_data <- countries_of_the_world |>
  select(country, region, population, gdp_per_capita, literacy) |>
  mutate(
    # the raw country/region strings carry trailing whitespace
    country = trimws(country),
    region = trimws(region),
    # decimal comma -> decimal point; unparseable/blank entries become NA
    literacy = as.numeric(gsub(",", ".", literacy, fixed = TRUE))
  ) |>
  filter(!is.na(gdp_per_capita), !is.na(literacy))
str(selected_data)
## 'data.frame': 226 obs. of 5 variables:
## $ country : chr "Afghanistan " "Albania " "Algeria " "American Samoa " ...
## $ region : chr "ASIA (EX. NEAR EAST) " "EASTERN EUROPE " "NORTHERN AFRICA " "OCEANIA " ...
## $ population : int 31056997 3581655 32930091 57794 71201 12127071 13477 69108 39921833 2976372 ...
## $ gdp_per_capita: int 700 4500 6000 8000 19000 1900 8600 11000 11200 3500 ...
## $ literacy : num 36 86.5 70 97 100 42 95 89 97.1 98.6 ...
library(ggplot2)
library(ggthemes)
# Scatterplot of literacy vs. GDP per capita, points coloured by region, with
# a single dashed GAM trend line over all countries.
selected_data |>
  ggplot(aes(x = literacy, y = gdp_per_capita)) +
  # size and alpha are point properties: they must be set on geom_point(),
  # because extra arguments given to ggplot() itself are silently ignored.
  geom_point(aes(color = region), size = 3, alpha = 0.5) +
  geom_smooth(
    method = "gam", se = FALSE,
    color = "black", linetype = 2, linewidth = 0.3
  ) +
  labs(
    title = "Scatterplot of Literacy vs. GDP by Region",
    x = "Literacy (%)", y = "GDP ($ per capita)",
    # only colour is mapped, so only the colour legend needs a title
    color = "region"
  ) +
  theme_minimal(base_size = 14, base_family = "serif")
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 17 rows containing missing values (`geom_point()`).
This scatterplot shows the correlation between GDP per capita and literacy rates for different countries, categorized by region.
Countries with high literacy rates (close to 100%), particularly those in Western Europe, Northern America, and Oceania, typically also have a higher GDP per capita.
In general, Sub-Saharan Africa has lower GDP per capita and literacy rates.
A large cluster of countries, mostly from Latin America and the Caribbean and from Asia (excluding the Near East), has literacy rates between 75 and 100 percent, although their GDP per capita varies greatly.
# Bubble chart of literacy vs. GDP per capita: colour = region, point size =
# population, plus one dashed GAM trend line over all countries.
# `label = country` is not drawn by any layer here; it is kept in the plot
# mapping so it stays available to whatever consumes `p1` afterwards.
p1 <- ggplot(
  selected_data,
  aes(
    x = literacy, y = gdp_per_capita,
    label = country, color = region, size = population
  )
) +
  # alpha belongs on the point layer (arguments passed to ggplot() are
  # ignored); a constant size is not set because size is mapped to population.
  geom_point(alpha = 0.5) +
  # The trend line only needs x and y. Not inheriting size/label/color avoids
  # the "size aesthetic for lines" deprecation warning and the "aesthetics
  # were dropped during statistical transformation" warning.
  geom_smooth(
    aes(x = literacy, y = gdp_per_capita),
    inherit.aes = FALSE,
    method = "gam", se = FALSE,
    color = "black", linetype = 2, linewidth = 0.3
  ) +
  labs(
    x = "Literacy(%)", y = "GDP ($ per capita)",
    title = "Scatterplot of Literacy vs. GDP by Regions"
  ) +
  scale_color_manual(
    values = RColorBrewer::brewer.pal(
      n = n_distinct(selected_data$region),
      name = "Paired"
    )
  ) +
  scale_size_continuous(labels = scales::comma) +
  # theme_bw() is a complete theme, so an earlier theme_minimal() call would
  # be replaced entirely; only the theme that actually takes effect is kept.
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
p1
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation: label,
## size
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 17 rows containing missing values (`geom_point()`).
The population of each country is indicated by the size of its circle. Interestingly, not all of the countries with larger populations (larger circles) have the highest literacy rates or GDP per capita. This suggests that there is no direct relationship between population size and either of these two measures.