library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

loading and cleaning datasets

data3 <- read.csv("C:\\Users\\anami\\OneDrive\\Documents\\DEM\\Assignment2\\Bexar County Population estimates 2020.csv") 
data4 <- data3[apply(data3, 1, function(row) all(!is.na(row) & row != "")), ]
str(data4)
## 'data.frame':    1375 obs. of  9 variables:
##  $ Age           : chr  "< 1 year" "< 1 year" "< 1 year" "< 1 year" ...
##  $ Age.Code      : chr  "0" "0" "0" "0" ...
##  $ Race          : chr  "American Indian or Alaska Native" "American Indian or Alaska Native" "American Indian or Alaska Native" "American Indian or Alaska Native" ...
##  $ Race.Code     : chr  "1002-5" "1002-5" "1002-5" "1002-5" ...
##  $ Ethnicity     : chr  "Hispanic or Latino" "Hispanic or Latino" "Not Hispanic or Latino" "Not Hispanic or Latino" ...
##  $ Ethnicity.Code: chr  "2135-2" "2135-2" "2186-5" "2186-5" ...
##  $ Gender        : chr  "Female" "Male" "Female" "Male" ...
##  $ Gender.Code   : chr  "F" "M" "F" "M" ...
##  $ Population    : int  251 261 34 34 129 133 427 448 286 300 ...
data4 <- data4 %>%
  mutate(Age.Code = as.numeric(Age.Code))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Age.Code = as.numeric(Age.Code)`.
## Caused by warning:
## ! NAs introduced by coercion
data4a <- na.omit(data4)

Hispanic or Latino

data4b<-data4a%>%
filter(Ethnicity.Code=="2135-2")
summary(data4b)
##      Age               Age.Code      Race            Race.Code        
##  Length:680         Min.   : 0   Length:680         Length:680        
##  Class :character   1st Qu.:21   Class :character   Class :character  
##  Mode  :character   Median :42   Mode  :character   Mode  :character  
##                     Mean   :42                                        
##                     3rd Qu.:63                                        
##                     Max.   :84                                        
##   Ethnicity         Ethnicity.Code        Gender          Gender.Code       
##  Length:680         Length:680         Length:680         Length:680        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Population    
##  Min.   :   1.0  
##  1st Qu.:  69.0  
##  Median : 152.0  
##  Mean   :1797.6  
##  3rd Qu.: 576.5  
##  Max.   :9914.0
str(data4b)
## 'data.frame':    680 obs. of  9 variables:
##  $ Age           : chr  "< 1 year" "< 1 year" "< 1 year" "< 1 year" ...
##  $ Age.Code      : num  0 0 0 0 0 0 0 0 1 1 ...
##  $ Race          : chr  "American Indian or Alaska Native" "American Indian or Alaska Native" "Asian or Pacific Islander" "Asian or Pacific Islander" ...
##  $ Race.Code     : chr  "1002-5" "1002-5" "A-PI" "A-PI" ...
##  $ Ethnicity     : chr  "Hispanic or Latino" "Hispanic or Latino" "Hispanic or Latino" "Hispanic or Latino" ...
##  $ Ethnicity.Code: chr  "2135-2" "2135-2" "2135-2" "2135-2" ...
##  $ Gender        : chr  "Female" "Male" "Female" "Male" ...
##  $ Gender.Code   : chr  "F" "M" "F" "M" ...
##  $ Population    : int  251 261 129 133 286 300 8031 8320 229 223 ...
##  - attr(*, "na.action")= 'omit' Named int [1:16] 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 ...
##   ..- attr(*, "names")= chr [1:16] "2465" "2466" "2468" "2469" ...

Mean Age

data4b%>%
summarise(mean_age = weighted.mean(data4b$Age.Code, w = data4b$Population))
data4bfemale<-data4b%>%
filter(Gender=="Female")
summary(data4bfemale)
##      Age               Age.Code      Race            Race.Code        
##  Length:340         Min.   : 0   Length:340         Length:340        
##  Class :character   1st Qu.:21   Class :character   Class :character  
##  Mode  :character   Median :42   Mode  :character   Mode  :character  
##                     Mean   :42                                        
##                     3rd Qu.:63                                        
##                     Max.   :84                                        
##   Ethnicity         Ethnicity.Code        Gender          Gender.Code       
##  Length:340         Length:340         Length:340         Length:340        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Population     
##  Min.   :   3.00  
##  1st Qu.:  74.75  
##  Median : 153.50  
##  Mean   :1826.87  
##  3rd Qu.: 703.50  
##  Max.   :9530.00
data4bfemale%>%
summarise(mean_age = weighted.mean(data4bfemale$Age.Code, w =data4bfemale$Population))
data4bmale<-data4b%>%
filter(Gender=="Male")
summary(data4bmale)
##      Age               Age.Code      Race            Race.Code        
##  Length:340         Min.   : 0   Length:340         Length:340        
##  Class :character   1st Qu.:21   Class :character   Class :character  
##  Mode  :character   Median :42   Mode  :character   Mode  :character  
##                     Mean   :42                                        
##                     3rd Qu.:63                                        
##                     Max.   :84                                        
##   Ethnicity         Ethnicity.Code        Gender          Gender.Code       
##  Length:340         Length:340         Length:340         Length:340        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Population    
##  Min.   :   1.0  
##  1st Qu.:  64.0  
##  Median : 151.0  
##  Mean   :1768.2  
##  3rd Qu.: 576.5  
##  Max.   :9914.0
data4bmale%>%
summarise(mean_age = weighted.mean(data4bmale$Age.Code, w =data4bmale$Population))

Calculate the total population in each group

youth_population <- sum(data4b$Population[data4b$Age.Code >= 0 & data4b$Age.Code <= 14])
working_population <- sum(data4b$Population[data4b$Age.Code >= 15 & data4b$Age.Code <= 64])
old_age_population <- sum(data4b$Population[data4b$Age.Code >= 65])
youth_dependency_ratio <- (youth_population / working_population) * 100
old_age_dependency_ratio <- (old_age_population / working_population) * 100
total_dependency_ratio <- ((youth_population + old_age_population) / working_population) * 100
cat("Youth Dependency Ratio:", sprintf("%.2f", youth_dependency_ratio), "\n")
## Youth Dependency Ratio: 35.07
cat("Old-Age Dependency Ratio:", sprintf("%.2f", old_age_dependency_ratio), "\n")
## Old-Age Dependency Ratio: 13.48
cat("Total Dependency Ratio:", sprintf("%.2f", total_dependency_ratio), "\n")
## Total Dependency Ratio: 48.55

Age grouping (.<1,1-4,5-9,19-14….)

str(data4b)
## 'data.frame':    680 obs. of  9 variables:
##  $ Age           : chr  "< 1 year" "< 1 year" "< 1 year" "< 1 year" ...
##  $ Age.Code      : num  0 0 0 0 0 0 0 0 1 1 ...
##  $ Race          : chr  "American Indian or Alaska Native" "American Indian or Alaska Native" "Asian or Pacific Islander" "Asian or Pacific Islander" ...
##  $ Race.Code     : chr  "1002-5" "1002-5" "A-PI" "A-PI" ...
##  $ Ethnicity     : chr  "Hispanic or Latino" "Hispanic or Latino" "Hispanic or Latino" "Hispanic or Latino" ...
##  $ Ethnicity.Code: chr  "2135-2" "2135-2" "2135-2" "2135-2" ...
##  $ Gender        : chr  "Female" "Male" "Female" "Male" ...
##  $ Gender.Code   : chr  "F" "M" "F" "M" ...
##  $ Population    : int  251 261 129 133 286 300 8031 8320 229 223 ...
##  - attr(*, "na.action")= 'omit' Named int [1:16] 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 ...
##   ..- attr(*, "names")= chr [1:16] "2465" "2466" "2468" "2469" ...
data4b$age_group <- cut(data4b$Age.Code, 
                      breaks = c(-1,0, 4, 9, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84),
                      labels = c("<1", "1-4", "5-9", "10-14", "15-19", "20-24", "25-29", "30-34", 
                                 "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", 
                                 "65-69", "70-74", "75-79", "80-84"),
                      right = TRUE)  # Include the right endpoint in intervals (i.e., 4 means ages up to 4 years)

# Display the dataset with new age groups
head(data4b)
data4c<-subset(data4b,select=c(age_group,Population,Gender))%>%  
group_by(age_group,Gender) %>%  
  summarise(population = sum(Population, na.rm = TRUE))
## `summarise()` has grouped output by 'age_group'. You can override using the
## `.groups` argument.

Non Hispanic

data4d<-data4a%>%
filter(Ethnicity.Code=="2186-5")
summary(data4d)
##      Age               Age.Code         Race            Race.Code        
##  Length:679         Min.   : 0.00   Length:679         Length:679        
##  Class :character   1st Qu.:21.00   Class :character   Class :character  
##  Mode  :character   Median :42.00   Mode  :character   Mode  :character  
##                     Mean   :41.94                                        
##                     3rd Qu.:63.00                                        
##                     Max.   :84.00                                        
##   Ethnicity         Ethnicity.Code        Gender          Gender.Code       
##  Length:679         Length:679         Length:679         Length:679        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Population  
##  Min.   :   4  
##  1st Qu.:  62  
##  Median : 580  
##  Mean   :1142  
##  3rd Qu.:1480  
##  Max.   :4684
data4d%>%
summarise(mean_age = weighted.mean(data4d$Age.Code, w = data4d$Population))
data4dfemale<-data4d%>%
filter(Gender=="Female")
summary(data4dfemale)
##      Age               Age.Code      Race            Race.Code        
##  Length:340         Min.   : 0   Length:340         Length:340        
##  Class :character   1st Qu.:21   Class :character   Class :character  
##  Mode  :character   Median :42   Mode  :character   Mode  :character  
##                     Mean   :42                                        
##                     3rd Qu.:63                                        
##                     Max.   :84                                        
##   Ethnicity         Ethnicity.Code        Gender          Gender.Code       
##  Length:340         Length:340         Length:340         Length:340        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Population  
##  Min.   :   4  
##  1st Qu.:  67  
##  Median : 587  
##  Mean   :1136  
##  3rd Qu.:1460  
##  Max.   :4220
data4dfemale%>%
summarise(mean_age = weighted.mean(data4dfemale$Age.Code, w =data4dfemale$Population))
data4dmale<-data4d%>%
filter(Gender=="Male")
summary(data4dmale)
##      Age               Age.Code         Race            Race.Code        
##  Length:339         Min.   : 0.00   Length:339         Length:339        
##  Class :character   1st Qu.:21.00   Class :character   Class :character  
##  Mode  :character   Median :42.00   Mode  :character   Mode  :character  
##                     Mean   :41.88                                        
##                     3rd Qu.:63.00                                        
##                     Max.   :84.00                                        
##   Ethnicity         Ethnicity.Code        Gender          Gender.Code       
##  Length:339         Length:339         Length:339         Length:339        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Population  
##  Min.   :   4  
##  1st Qu.:  58  
##  Median : 571  
##  Mean   :1149  
##  3rd Qu.:1487  
##  Max.   :4684
data4dmale%>%
summarise(mean_age = weighted.mean(data4dmale$Age.Code, w =data4dmale$Population))

Calculate the total population in each group

youth_population1 <- sum(data4d$Population[data4d$Age.Code >= 0 & data4d$Age.Code <= 14])
working_population1 <- sum(data4d$Population[data4d$Age.Code >= 15 & data4d$Age.Code <= 64])
old_age_population1 <- sum(data4d$Population[data4d$Age.Code >= 65])
youth_dependency_ratio1 <- (youth_population1 / working_population1) * 100
old_age_dependency_ratio1 <- (old_age_population1 / working_population1) * 100
total_dependency_ratio1 <- ((youth_population1 + old_age_population1) / working_population1) * 100
cat("Youth Dependency Ratio1:", sprintf("%.2f", youth_dependency_ratio1), "\n")
## Youth Dependency Ratio1: 25.68
cat("Old-Age Dependency Ratio1:", sprintf("%.2f", old_age_dependency_ratio1), "\n")
## Old-Age Dependency Ratio1: 22.16
cat("Total Dependency Ratio1:", sprintf("%.2f", total_dependency_ratio1), "\n")
## Total Dependency Ratio1: 47.84

Age grouping (.<1,1-4,5-9,19-14….)

str(data4d)
## 'data.frame':    679 obs. of  9 variables:
##  $ Age           : chr  "< 1 year" "< 1 year" "< 1 year" "< 1 year" ...
##  $ Age.Code      : num  0 0 0 0 0 0 0 0 1 1 ...
##  $ Race          : chr  "American Indian or Alaska Native" "American Indian or Alaska Native" "Asian or Pacific Islander" "Asian or Pacific Islander" ...
##  $ Race.Code     : chr  "1002-5" "1002-5" "A-PI" "A-PI" ...
##  $ Ethnicity     : chr  "Not Hispanic or Latino" "Not Hispanic or Latino" "Not Hispanic or Latino" "Not Hispanic or Latino" ...
##  $ Ethnicity.Code: chr  "2186-5" "2186-5" "2186-5" "2186-5" ...
##  $ Gender        : chr  "Female" "Male" "Female" "Male" ...
##  $ Gender.Code   : chr  "F" "M" "F" "M" ...
##  $ Population    : int  34 34 427 448 1033 1073 2728 2853 29 28 ...
##  - attr(*, "na.action")= 'omit' Named int [1:16] 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 ...
##   ..- attr(*, "names")= chr [1:16] "2465" "2466" "2468" "2469" ...
data4d$age_group <- cut(data4d$Age.Code, 
                      breaks = c(-1,0, 4, 9, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84),
                      labels = c("<1", "1-4", "5-9", "10-14", "15-19", "20-24", "25-29", "30-34", 
                                 "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", 
                                 "65-69", "70-74", "75-79", "80-84"),
                      right = TRUE)  # Include the right endpoint in intervals (i.e., 4 means ages up to 4 years)

# Display the dataset with new age groups
head(data4d)
data4e<-subset(data4d,select=c(age_group,Population,Gender))%>%  
group_by(age_group,Gender) %>%  
  summarise(population = sum(Population, na.rm = TRUE))
## `summarise()` has grouped output by 'age_group'. You can override using the
## `.groups` argument.

Population Pyramid 1

data4b <- data4b %>%
  mutate(Population = ifelse(Gender == "Male", -Population, Population))

# Create the population pyramid
ggplot(data4b, aes(x = Age.Code, y = Population, fill = Gender)) +
  geom_bar(stat = "identity") +
  coord_flip() +  # Flip coordinates for pyramid style
  scale_y_continuous(labels = abs) +  # Show positive labels on the y-axis
  labs(y = "Population", x = "Age", title = "Bexar County Population Pyramid for Hispanics, 2020") +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
  theme_minimal()

### Population Pyramid 1a

data4c <- data4c %>%
  mutate(population = ifelse(Gender == "Male", -population, population))

# Create the population pyramid
ggplot(data4c, aes(x = age_group, y = population, fill = Gender)) +
  geom_bar(stat = "identity") +
  coord_flip() +  # Flip coordinates for pyramid style
  scale_y_continuous(labels = abs) +  # Show positive labels on the y-axis
  labs(y = "Population", x = "Age Interval", title = "Bexar County Population Pyramid for Hispanics, 2020") +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
  theme_minimal()

Population Pyramid 2

data4d <- data4d %>%
  mutate(Population = ifelse(Gender == "Male", -Population, Population))

# Create the population pyramid
ggplot(data4d, aes(x = Age.Code, y = Population, fill = Gender)) +
  geom_bar(stat = "identity") +
  coord_flip() +  # Flip coordinates for pyramid style
  scale_y_continuous(labels = abs) +  # Show positive labels on the y-axis
  labs(y = "Population", x = "Age", title = "Bexar County Population Pyramid for Non-hispanics, 2020") +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
  theme_minimal()

data4e <- data4e %>%
  mutate(population = ifelse(Gender == "Male", -population, population))

# Create the population pyramid
ggplot(data4e, aes(x = age_group, y = population, fill = Gender)) +
  geom_bar(stat = "identity") +
  coord_flip() +  # Flip coordinates for pyramid style
  scale_y_continuous(labels = abs) +  # Show positive labels on the y-axis
  labs(y = "Population", x = "Age Interval", title = "Bexar County Population Pyramid for Non-hispanics, 2020") +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
  theme_minimal()

### Graph: Hispanics

data4b_graph <- data4b %>% 
group_by(Age.Code) %>% 
summarize(sumpopulation = sum(Population))
data4b_graph1 <- data4b %>% 
group_by(Age.Code,Gender) %>% 
summarize(sumpopulation = sum(Population))
## `summarise()` has grouped output by 'Age.Code'. You can override using the
## `.groups` argument.
ggplot() + 
    geom_line(data = data4b_graph, aes(Age.Code, sumpopulation)) +
    geom_line(data = data4b_graph1, aes(Age.Code, sumpopulation, color = Gender)) +
    labs(
        title = "Bexar County age structure for Hispanics, 2020",  # Adding a title
        x = "Age",  # Label for the x-axis
        y = "Population"  # Label for the y-axis
    ) +
    scale_x_continuous(breaks = seq(0, 85, by = 5)) +scale_y_continuous(breaks = seq(0, 20000, by = 2500))

Graph: Non-hispanics

data4d_graph <- data4d %>% 
group_by(Age.Code) %>% 
summarize(sumpopulation = sum(Population))
data4d_graph1 <- data4d %>% 
group_by(Age.Code,Gender) %>% 
summarize(sumpopulation = sum(Population))
## `summarise()` has grouped output by 'Age.Code'. You can override using the
## `.groups` argument.
ggplot() + 
    geom_line(data = data4d_graph, aes(Age.Code, sumpopulation)) +
    geom_line(data = data4d_graph1, aes(Age.Code, sumpopulation, color = Gender)) +
    labs(
        title = "Bexar County age structure for Non-hispanics, 2020",  # Adding a title
        x = "Age",  # Label for the x-axis
        y = "Population"  # Label for the y-axis
    ) +
    scale_x_continuous(breaks = seq(0, 85, by = 5)) +scale_y_continuous(breaks = seq(0, 20000, by = 2500))