library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)


# Load the Texas table from a tab-delimited text file; readr guesses the
# column types.
TX <- read_delim(file = "Texas.txt", delim = "\t")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 86 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): Notes, Age Group, Age Group Code, Gender, Gender Code
## dbl (1): Population
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the California table from a tab-delimited text file; readr guesses the
# column types.
CA <- read_delim(file = "California.txt", delim = "\t")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 86 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): Notes, Age Group, Age Group Code, Gender, Gender Code
## dbl (1): Population
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the rows that carry a Gender value; rows where Gender is NA are
# discarded.
TX <- TX[!is.na(TX[["Gender"]]), ]
CA <- CA[!is.na(CA[["Gender"]]), ]

# Convert `Age Group` to a factor whose levels follow the order in which the
# labels first appear in the data, so plots keep that order rather than
# sorting the labels alphabetically.
TX[["Age Group"]] <- factor(
  TX[["Age Group"]],
  levels = unique(TX[["Age Group"]])
)

CA[["Age Group"]] <- factor(
  CA[["Age Group"]],
  levels = unique(CA[["Age Group"]])
)

## Distribution

## Texas

# Line chart of Texas population by age group, one line (plus points) per
# gender.
ggplot(TX, aes(x = `Age Group`, y = Population, group = Gender, color = Gender)) +
  # Line thickness is set with `linewidth`; using `size` for lines has been
  # deprecated since ggplot2 3.4.0 and raises a lifecycle warning.
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  labs(
    title = "Age Distribution by Gender, Texas",
    x = "Age Group", y = "Population"
  ) +
  # Print y-axis values with thousands separators.
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  # Rotate the age-group labels so neighbouring labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## California

# Line chart of California population by age group, one line (plus points)
# per gender.
ggplot(CA, aes(x = `Age Group`, y = Population, group = Gender, color = Gender)) +
  # Line thickness is set with `linewidth`; using `size` for lines has been
  # deprecated since ggplot2 3.4.0 and raises a lifecycle warning.
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  labs(
    title = "Age Distribution by Gender, California",
    x = "Age Group", y = "Population"
  ) +
  # Print y-axis values with thousands separators.
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  # Rotate the age-group labels so neighbouring labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Put male counts on the negative side of the Population axis and female
# counts on the positive side, as the pyramid plots require.
# The sign is pinned with abs() instead of being flipped, so running this step
# more than once (e.g. interactively) cannot turn the male values positive
# again.
TX <- TX %>%
  mutate(
    Population = if_else(Gender == "Male", -abs(Population), abs(Population))
  )

CA <- CA %>%
  mutate(
    Population = if_else(Gender == "Male", -abs(Population), abs(Population))
  )


## Population Pyramid

## TX
  
# Population pyramid for Texas: one horizontal bar per age group and gender.
# Male values are negative at this point in the script, so male bars extend
# to the negative side of the axis and female bars to the positive side.
ggplot(TX, aes(x = `Age Group`, y = Population, fill = Gender)) +
  # geom_col() draws bar heights straight from the data
  # (equivalent to geom_bar(stat = "identity")).
  geom_col(width = 0.7) +
  # Flip the axes so age groups run along the vertical axis.
  coord_flip() +
  scale_y_continuous(
    breaks = seq(-1600000, 1600000, 400000),
    # Label ticks with magnitudes only, so the negative (male) side reads as
    # positive counts; scales::comma adds thousands separators without the
    # width padding that format() applies to a vector.
    labels = function(x) scales::comma(abs(x)),
    limits = c(-1600000, 1600000),
    # Keep bars whose value exceeds the limits instead of silently dropping
    # them; has no effect when every value is inside the limits.
    oob = scales::oob_keep
  ) +
  labs(
    title = "Population Pyramid by Gender, Texas",
    x = "Age Group", y = "Population"
  ) +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
  theme_minimal() +
  # The fill colours are self-explanatory, so the legend title is removed.
  theme(legend.title = element_blank())

## CA
  
# Population pyramid for California: one horizontal bar per age group and
# gender. Male values are negative at this point in the script, so male bars
# extend to the negative side of the axis and female bars to the positive side.
ggplot(CA, aes(x = `Age Group`, y = Population, fill = Gender)) +
  # geom_col() draws bar heights straight from the data
  # (equivalent to geom_bar(stat = "identity")).
  geom_col(width = 0.7) +
  # Flip the axes so age groups run along the vertical axis.
  coord_flip() +
  scale_y_continuous(
    # Breaks span exactly the axis limits (the lower bound previously had an
    # extra zero: -16000000).
    breaks = seq(-1600000, 1600000, 400000),
    # Label ticks with magnitudes only, so the negative (male) side reads as
    # positive counts; scales::comma adds thousands separators without the
    # width padding that format() applies to a vector.
    labels = function(x) scales::comma(abs(x)),
    limits = c(-1600000, 1600000),
    # Keep bars whose value exceeds the limits instead of silently dropping
    # them; has no effect when every value is inside the limits.
    oob = scales::oob_keep
  ) +
  labs(
    title = "Population Pyramid by Gender, California",
    x = "Age Group", y = "Population"
  ) +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
  theme_minimal() +
  # The fill colours are self-explanatory, so the legend title is removed.
  theme(legend.title = element_blank())