# load required packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# load countries of the world_cia_kaggle data
countries_of_the_world <- read.csv("C:/Users/tianm/Documents/DATA110/data/countries of the world_cia_kaggle.csv") |>
  janitor::clean_names()

head(countries_of_the_world)
##           country                              region population area_sq_mi
## 1    Afghanistan        ASIA (EX. NEAR EAST)            31056997     647500
## 2        Albania  EASTERN EUROPE                         3581655      28748
## 3        Algeria  NORTHERN AFRICA                       32930091    2381740
## 4 American Samoa  OCEANIA                                  57794        199
## 5        Andorra  WESTERN EUROPE                           71201        468
## 6         Angola  SUB-SAHARAN AFRICA                    12127071    1246700
##   pop_density_per_sq_mi coastline_coast_area_ratio net_migration
## 1                  48,0                       0,00         23,06
## 2                 124,6                       1,26         -4,93
## 3                  13,8                       0,04         -0,39
## 4                 290,4                      58,29        -20,71
## 5                 152,1                       0,00           6,6
## 6                   9,7                       0,13             0
##   infant_mortality_per_1000_births gdp_per_capita literacy phones_per_1000
## 1                           163,07            700     36,0             3,2
## 2                            21,52           4500     86,5            71,2
## 3                               31           6000     70,0            78,1
## 4                             9,27           8000     97,0           259,5
## 5                             4,05          19000    100,0           497,2
## 6                           191,19           1900     42,0             7,8
##   arable crops other climate birthrate deathrate agriculture industry service
## 1  12,13  0,22 87,65       1      46,6     20,34        0,38     0,24    0,38
## 2  21,09  4,42 74,49       3     15,11      5,22       0,232    0,188   0,579
## 3   3,22  0,25 96,53       1     17,14      4,61       0,101      0,6   0,298
## 4     10    15    75       2     22,46      3,27                             
## 5   2,22     0 97,78       3      8,71      6,25                             
## 6   2,41  0,24 97,35             45,11      24,2       0,096    0,658   0,246
# show column names
colnames(countries_of_the_world)
##  [1] "country"                          "region"                          
##  [3] "population"                       "area_sq_mi"                      
##  [5] "pop_density_per_sq_mi"            "coastline_coast_area_ratio"      
##  [7] "net_migration"                    "infant_mortality_per_1000_births"
##  [9] "gdp_per_capita"                   "literacy"                        
## [11] "phones_per_1000"                  "arable"                          
## [13] "crops"                            "other"                           
## [15] "climate"                          "birthrate"                       
## [17] "deathrate"                        "agriculture"                     
## [19] "industry"                         "service"
# select columns of interest
selected_data <- countries_of_the_world |>
  select('country', 'region', 'population', 'gdp_per_capita', 'literacy') |>
  filter(!is.na(gdp_per_capita)) |>
  filter(!is.na(literacy)) |>
  mutate(literacy = as.numeric(gsub(",", ".", literacy)))


str(selected_data)
## 'data.frame':    226 obs. of  5 variables:
##  $ country       : chr  "Afghanistan " "Albania " "Algeria " "American Samoa " ...
##  $ region        : chr  "ASIA (EX. NEAR EAST)         " "EASTERN EUROPE                     " "NORTHERN AFRICA                    " "OCEANIA                            " ...
##  $ population    : int  31056997 3581655 32930091 57794 71201 12127071 13477 69108 39921833 2976372 ...
##  $ gdp_per_capita: int  700 4500 6000 8000 19000 1900 8600 11000 11200 3500 ...
##  $ literacy      : num  36 86.5 70 97 100 42 95 89 97.1 98.6 ...
library(ggplot2)
library(ggthemes)
# select columns of interest
selected_data |>
  ggplot(size = 3, alpha = 0.5, aes(x = literacy, y = gdp_per_capita)) +
  theme_minimal(base_size = 14, base_family = "serif")+
  geom_point(aes(color = region)) + 
  geom_smooth(method = "gam", se  =FALSE, color = "black", lty = 2, linewidth = 0.3 ) +
  labs(
    title = "Scatterplot of Literacy vs. GDP by Region",
    x = "Literacy (%)", y = "GDP ($ per capita)",
    color = "region", shape = "region"
  )
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 17 rows containing missing values (`geom_point()`).

This scatterplot shows the correlation between GDP per capita and literacy rates for different countries, categorized by region.

  1. The majority of regions, particularly those in Western Europe, Northern America, and Oceania, that have high literacy rates (almost 100%) also typically have greater GDP per capita.

  2. In general, Sub-Saharan Africa has lower GDP per capita and literacy rates.

  3. A significant cluster of regions, mostly from Latin America the Caribbean, and Asia (except from the Near East), with literacy rates between 75 and 100 percent, although their GDP per capita varies greatly.

p1 <- ggplot(selected_data, aes(x = `literacy`, y = `gdp_per_capita`, label = country, color = region, size = population),size = 3, alpha = 0.5) +
  theme_minimal(base_size = 20, base_family = "serif")+
  geom_point() +
  geom_smooth(method = "gam", se  =FALSE, color = "black", lty = 2, linewidth = 0.3 )+
  labs(x = "Literacy(%)", y = "GDP ($ per capita)", title = "Scatterplot of Literacy vs. GDP by Regions") +
  scale_color_manual(values = RColorBrewer::brewer.pal(n = n_distinct(selected_data$region), name = "Paired")) +
  scale_size_continuous(labels = scales::comma) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
p1
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation: label,
## size
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: Removed 17 rows containing missing values (`geom_point()`).

The population of each region is indicated by the different circle sizes. It’s interesting to point out that not all of the nations with larger populations (larger circles) also have the greatest rates of literacy or GDP per capita. This implies that there is no direct relationship between these two measures and population size.