Part 2
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
us <- read_excel("US.xlsx")
sk <- read_excel("South Korea.xlsx")
cg <- read_excel("Congo.xlsx")
summary(us$Value)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 83512 966739 1497779 1376055 1882197 2370937
summary(sk$Value)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4 24284 113722 103647 172672 239173
summary(cg$Value)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 187 3395 7601 11262 18833 35674
us <- us %>%
mutate(Value = ifelse(Sex == "Male", -Value, Value))
sk <- sk %>%
mutate(Value = ifelse(Sex == "Male", -Value, Value))
cg <- cg %>%
mutate(Value = ifelse(Sex == "Male", -Value, Value))
## Population Pyramid
## US
ggplot(us, aes(x = `Age`, y = Value, fill = Sex)) +
geom_bar(stat = "identity", width = 0.7) +
coord_flip() +
scale_y_continuous(
breaks = seq(-2500000, 2500000, 1000000),
labels = function(x) format(abs(x), big.mark = ","),
limits = c(-2500000, 2500000)
) +
labs(title = "Population Pyramid by Sex, United States in 1990",
x = "Age", y = "Population") +
scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
theme_minimal() +
theme(legend.title = element_blank())

## SK
ggplot(sk, aes(x = `Age`, y = Value, fill = Sex)) +
geom_bar(stat = "identity", width = 0.7) +
coord_flip() +
scale_y_continuous(
breaks = seq(-250000, 250000, 100000),
labels = function(x) format(abs(x), big.mark = ","),
limits = c(-250000, 250000)
) +
labs(title = "Population Pyramid by Sex, South Korea in 1993",
x = "Age", y = "Population") +
scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
theme_minimal() +
theme(legend.title = element_blank())

## CG
ggplot(cg, aes(x = `Age`, y = Value, fill = Sex)) +
geom_bar(stat = "identity", width = 0.7) +
coord_flip() +
scale_y_continuous(
breaks = seq(-40000, 40000, 10000),
labels = function(x) format(abs(x), big.mark = ","),
limits = c(-40000, 40000)
) +
labs(title = "Population Pyramid by Sex, Congo in 2007",
x = "Age", y = "Population") +
scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
theme_minimal() +
theme(legend.title = element_blank())

###### Moving average and linear regression
library(readxl)
library(dplyr)
library(ggplot2)
sk1993 <- read_excel("SK1993.xlsx")
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
sk1993$population_ma <- rollmean(sk1993$Value, k = 5, fill = NA)
lm_model <- lm(Value ~ Age, data = sk1993)
sk1993$population_lm <- predict(lm_model)
coef_lm <- coef(lm_model)
slope <- round(coef_lm[2], 2)
intercept <- round(coef_lm[1], 2)
formula_text <- paste0("y = ", slope, " * x + ", intercept)
ggplot(sk1993, aes(x = Age)) +
geom_line(aes(y = Value, color = "Original Data"), size = 1) +
geom_line(aes(y = population_ma, color = "Moving Average"), size = 1) +
geom_line(aes(y = population_lm, color = "Linear Regression"), size = 1) +
labs(title = "South Korea Population Estimate: Moving Average vs Linear Regression",
x = "Age", y = "Population") +
scale_color_manual(values = c("blue", "red", "green")) +
theme_minimal() +
annotate("text", x = 70, y = max(sk1993$Value), label = formula_text, color = "black", size = 5, hjust = 1) +
guides(color = guide_legend(title = "Legend"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
