Part 1

library(readxl)
data <- read_excel("Assignment 3 Data_Part 1.xlsx")
## New names:
## • `` -> `...7`
library(ggplot2)


ggplot(data, aes(x = Age, y = Value, color = as.factor(Year), group = Year)) +
  geom_line() +                           
  scale_x_continuous(breaks = seq(0, 100, by = 5)) +  
  scale_y_continuous(labels = scales::comma) +        
  labs(x = "Age", y = "Population Estimate", color = "Year") +     
  theme_minimal()                                     

Part 2

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
us <- read_excel("US.xlsx")
sk <- read_excel("South Korea.xlsx")
cg <- read_excel("Congo.xlsx")


summary(us$Value)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   83512  966739 1497779 1376055 1882197 2370937
summary(sk$Value)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       4   24284  113722  103647  172672  239173
summary(cg$Value)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     187    3395    7601   11262   18833   35674
us <- us %>%
  mutate(Value = ifelse(Sex == "Male", -Value, Value))


sk <- sk %>%
  mutate(Value = ifelse(Sex == "Male", -Value, Value))


cg <- cg %>%
  mutate(Value = ifelse(Sex == "Male", -Value, Value))


## Population Pyramid

## US
  
ggplot(us, aes(x = `Age`, y = Value, fill = Sex)) +
  geom_bar(stat = "identity", width = 0.7) + 
  coord_flip() + 
  scale_y_continuous(
    breaks = seq(-2500000, 2500000, 1000000),  
    labels = function(x) format(abs(x), big.mark = ","),  
    limits = c(-2500000, 2500000) 
  ) +
  labs(title = "Population Pyramid by Sex, United States in 1990",
       x = "Age", y = "Population") +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +  
  theme_minimal() + 
  theme(legend.title = element_blank())  

## SK
  
ggplot(sk, aes(x = `Age`, y = Value, fill = Sex)) +
  geom_bar(stat = "identity", width = 0.7) + 
  coord_flip() + 
  scale_y_continuous(
    breaks = seq(-250000, 250000, 100000),  
    labels = function(x) format(abs(x), big.mark = ","),  
    limits = c(-250000, 250000) 
  ) +
  labs(title = "Population Pyramid by Sex, South Korea in 1993",
       x = "Age", y = "Population") +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +  
  theme_minimal() + 
  theme(legend.title = element_blank())  

## CG
  
ggplot(cg, aes(x = `Age`, y = Value, fill = Sex)) +
  geom_bar(stat = "identity", width = 0.7) + 
  coord_flip() + 
  scale_y_continuous(
    breaks = seq(-40000, 40000, 10000),  
    labels = function(x) format(abs(x), big.mark = ","),  
    limits = c(-40000, 40000) 
  ) +
  labs(title = "Population Pyramid by Sex, Congo in 2007",
       x = "Age", y = "Population") +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +  
  theme_minimal() + 
  theme(legend.title = element_blank())  

###### Moving average and linear regression



library(readxl)
library(dplyr)
library(ggplot2)

sk1993 <- read_excel("SK1993.xlsx")

library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
sk1993$population_ma <- rollmean(sk1993$Value, k = 5, fill = NA)
lm_model <- lm(Value ~ Age, data = sk1993)

sk1993$population_lm <- predict(lm_model)

coef_lm <- coef(lm_model)  
slope <- round(coef_lm[2], 2)  
intercept <- round(coef_lm[1], 2) 

formula_text <- paste0("y = ", slope, " * x + ", intercept)


ggplot(sk1993, aes(x = Age)) +
  geom_line(aes(y = Value, color = "Original Data"), size = 1) +  
  geom_line(aes(y = population_ma, color = "Moving Average"), size = 1) + 
  geom_line(aes(y = population_lm, color = "Linear Regression"), size = 1) +  
  labs(title = "South Korea Population Estimate: Moving Average vs Linear Regression", 
       x = "Age", y = "Population") +
  scale_color_manual(values = c("blue", "red", "green")) + 
  theme_minimal() +
  annotate("text", x = 70, y = max(sk1993$Value), label = formula_text, color = "black", size = 5, hjust = 1) +
  guides(color = guide_legend(title = "Legend"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).