Import data

# Read the world data sheet from its CSV export into a base data frame
data <- read.csv(file = "Worldmap - Sheet1.csv")

Introduction

Questions

Variation

# Bar chart: how many rows fall under each value of Continents
ggplot(data, aes(x = Continents)) +
  geom_bar()

Visualizing distributions

# Histogram of the X. column: row counts per bin, each bin 0.02 wide
ggplot(data, aes(x = X.)) +
  geom_histogram(binwidth = 0.02)

Typical values

# Convert the text columns Population and Land.Area.Km2 to numbers.
# as.numeric() was previously used here and warned "NAs introduced by
# coercion"; 232 rows were then dropped from the plot below as non-finite.
# parse_number() extracts the number from formatted text instead
# (presumably the values carry thousands separators -- confirm against the CSV).
# It expects character input, which is what read.csv() produced for these columns.
data <- data %>%
  mutate(
    Population = readr::parse_number(Population),
    Land.Area.Km2 = readr::parse_number(Land.Area.Km2)
  )

# Frequency polygon of population using 50-million-wide bins.
# The argument must be spelled `binwidth`; the earlier misspelling `bindwith`
# was ignored by ggplot2, which silently fell back to `bins = 30`.
ggplot(data, aes(x = Population)) +
  geom_freqpoly(binwidth = 50000000, color = "steelblue", linewidth = 1) +
  labs(
    title = "Frequency Polygon of Population",
    x = "Population",
    y = "Count"
  )

Unusual values

# Box plot of land area; outliers are drawn as large blue points so that
# unusually large or small values stand out.
ggplot(data, aes(y = Land.Area.Km2)) +
  geom_boxplot(outlier.color = "blue", outlier.size = 3) +
  labs(
    title = "Unusual Land Area Values",
    # Axis label fixed: the previous text "Land.Area.Km2)" had a stray ")".
    y = "Land Area (km²)"
  )
## Warning: Removed 183 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Missing Values

# Tally the NA entries in every column, then reshape to one row per variable
missing_tbl <- data %>%
  summarize(across(everything(), function(col) sum(is.na(col)))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "missing"
  )

# Column chart of the NA tally for each variable
ggplot(missing_tbl, aes(variable, missing)) +
  geom_col(fill = "steelblue") +
  theme_minimal() +
  labs(
    title = "Missing Values per Variable",
    x = "Variable",
    y = "Number of Missing Values"
  )

Covariation

# Rank countries by population so that rank 1 is the most populous, which is
# what the y-axis label below promises. rank() orders ascending, so the values
# are negated. na.last = "keep" leaves rows with a missing population unranked
# (NA) instead of assigning them arbitrary trailing ranks.
data_ranked <- data %>%
  mutate(X. = rank(-Population, na.last = "keep"))

# One point per row: population rank on the y-axis, grouped and coloured
# by continent.
ggplot(data_ranked, aes(x = Continents, y = X., color = Continents)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "Covariation: Country Population Rank by Continent",
    y = "Population Rank (1 = largest)"
  )

A categorical and continuous variable

# Load packages
library(tidyverse)


# Build the cleaned table used for the World % box plot:
#   * World. is parsed from text into a number, Continents is forced to text
#   * rows need a positive, non-missing World. value
#   * rows need a Continents value that contains neither "/" nor ","
#   * Continents is finally stored as a factor of the values that remain
data_clean <- data %>%
  mutate(
    Continents = as.character(Continents),
    World. = readr::parse_number(World.)
  ) %>%
  filter(!is.na(World.), World. > 0) %>%
  filter(
    !is.na(Continents),
    str_detect(Continents, "/", negate = TRUE),
    str_detect(Continents, ",", negate = TRUE)
  ) %>%
  mutate(Continents = factor(Continents))


# Horizontal box plots of World. for each continent in the cleaned table
ggplot(data_clean, aes(Continents, World., fill = Continents)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Covariation: World % by Continent (Cleaned)",
    x = "Continent",
    y = "World % of World Population"
  ) +
  coord_flip()

Two categorical variables

library(tidyverse)


# Re-read the sheet with readr, which keeps the original column names
# (hence the back-ticked `World%` below)
world <- readr::read_csv("Worldmap - Sheet1.csv")

# Parse `World%` into a number, then bucket it into four labelled ranges
world2 <- world %>%
  mutate(World_num = readr::parse_number(`World%`)) %>%
  mutate(
    World_cat = cut(
      World_num,
      breaks = c(-Inf, 1, 3, 10, Inf),
      labels = c("<1%", "1–3%", "3–10%", ">10%")
    )
  )

# Side-by-side bars: row count per continent, split by World % category
ggplot(world2, aes(Continents, fill = World_cat)) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(
    title = "Countries by Continent and World Population Share Category",
    x = "Continent",
    y = "Number of Countries",
    fill = "World %"
  )

Two continuous variables

library(tidyverse)

# Re-read the sheet with readr (original column names are kept)
world <- readr::read_csv("Worldmap - Sheet1.csv")

# Add numeric versions of the population and land-area columns
world2 <- world %>%
  mutate(Population_num = as.numeric(Population)) %>%
  mutate(LandArea_num = as.numeric(`Land Area Km2`))


# Scatter plot of population against land area, one point per row
ggplot(world2, aes(LandArea_num, Population_num)) +
  geom_point(alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Population vs Land Area",
    x = "Land Area (km²)",
    y = "Population"
  )

## Patterns and models

library(tidyverse)
library(modelr)
library(scales)

# Re-read the sheet with readr (original column names are kept)
world <- readr::read_csv("Worldmap - Sheet1.csv")

# Numeric population and land area, keeping only rows where both are present
# and strictly positive so that log() is defined for every row
world2 <- world %>%
  mutate(Population_num = as.numeric(Population)) %>%
  mutate(LandArea_num = as.numeric(`Land Area Km2`)) %>%
  filter(!is.na(Population_num), !is.na(LandArea_num)) %>%
  filter(Population_num > 0, LandArea_num > 0)

# Log-log linear model of population on land area
mod_world <- lm(log(Population_num) ~ log(LandArea_num), data = world2)

# Attach the model residuals (column `resid`) and their exponential
world2 <- world2 %>%
  add_residuals(mod_world, var = "resid") %>%
  mutate(resid_exp = exp(resid))

# g1: exponentiated residuals against land area, comma-formatted axes
g1 <- ggplot(world2, aes(LandArea_num, resid_exp)) +
  geom_point(alpha = 0.7) +
  scale_x_continuous(labels = scales::comma) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  labs(
    title = "Residuals from log(Population) ~ log(Land Area)",
    x = "Land Area (km²)",
    y = "Residual (back-transformed)"
  )

# g2: distribution of the exponentiated residuals within each continent
g2 <- ggplot(world2, aes(Continents, resid_exp)) +
  geom_boxplot() +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  labs(
    title = "Residuals by Continent",
    x = "Continent",
    y = "Residual (back-transformed)"
  )

# Display both plots
g1

g2