Data Acquisition

# Read the raw census extract from disk.
censusdata <- read.csv(
  "~/Desktop/Statistics 2 Homework/hw_wk3 3/DAV5303_HW3_Assignment_files/CC-EST2020-ALLDATA6.csv"
)
# Drop every row that has a missing value in any column, then keep only the
# identifier and population columns used in the rest of the analysis.
censusdata <- censusdata[complete.cases(censusdata), ] %>%
  select(STNAME, CTYNAME, YEAR, AGEGRP, TOT_POP, TOT_MALE, TOT_FEMALE)
# Store the trimmed table as censusdata.RData.
save(censusdata, file = "censusdata.RData")

Data Split

# Split the table in two: rows where AGEGRP is 0, and rows where it is not.
total_agegrp <- dplyr::filter(censusdata, AGEGRP == 0)
nottotal_agegrp <- dplyr::filter(censusdata, AGEGRP != 0)
# Combined row count of the two pieces.
sum(nrow(total_agegrp), nrow(nottotal_agegrp))
## [1] 836038

Aggregate across Age Groups

# Coerce the three population columns to numeric, one column at a time.
nottotal_agegrp[["TOT_POP"]] <- as.numeric(nottotal_agegrp[["TOT_POP"]])
## Warning: NAs introduced by coercion
nottotal_agegrp[["TOT_MALE"]] <- as.numeric(nottotal_agegrp[["TOT_MALE"]])
## Warning: NAs introduced by coercion
nottotal_agegrp[["TOT_FEMALE"]] <- as.numeric(nottotal_agegrp[["TOT_FEMALE"]])
## Warning: NAs introduced by coercion
# Sum the population columns over the age-group rows of every
# state / county / year combination.
reduction1 <- nottotal_agegrp %>%
  group_by(STNAME, CTYNAME, YEAR) %>%
  summarise(
    TOT_POP = sum(TOT_POP),
    TOT_MALE = sum(TOT_MALE),
    TOT_FEMALE = sum(TOT_FEMALE)
  )
## `summarise()` has grouped output by 'STNAME', 'CTYNAME'. You can override using
## the `.groups` argument.
print(reduction1)
## # A tibble: 44,002 × 6
## # Groups:   STNAME, CTYNAME [3,143]
##    STNAME  CTYNAME         YEAR TOT_POP TOT_MALE TOT_FEMALE
##    <chr>   <chr>          <int>   <dbl>    <dbl>      <dbl>
##  1 Alabama Autauga County     1   54571    26569      28002
##  2 Alabama Autauga County     2   54582    26576      28006
##  3 Alabama Autauga County     3   54761    26667      28094
##  4 Alabama Autauga County     4   55229    26980      28249
##  5 Alabama Autauga County     5   54970    26830      28140
##  6 Alabama Autauga County     6   54747    26588      28159
##  7 Alabama Autauga County     7   54922    26804      28118
##  8 Alabama Autauga County     8   54903    26752      28151
##  9 Alabama Autauga County     9   55302    26999      28303
## 10 Alabama Autauga County    10   55448    27041      28407
## # ℹ 43,992 more rows

Q3 Questions to ask

  • Does this reduced data set have the same dimensions as set1? No: reduction1 has only 6 variables, but it does have the same number of observations. *Comment more than just yes/no, e.g. should they?* It should have the same number of observations, because we are summing these variables across age groups within each state, county, and year rather than filtering out observations.
# Count the missing values in the AGEGRP column.
sum(is.na(nottotal_agegrp[["AGEGRP"]]))
## [1] 0

Aggregate across Years

# Keep only the rows whose YEAR code lies between 3 and 12 (inclusive), then
# total the population columns per state / county / year.
reduction2a <- reduction1 %>%
  filter(YEAR >= 3, YEAR <= 12) %>%
  group_by(STNAME, CTYNAME, YEAR) %>%
  summarise(
    TOT_POP = sum(TOT_POP),
    TOT_MALE = sum(TOT_MALE),
    TOT_FEMALE = sum(TOT_FEMALE)
  )
## `summarise()` has grouped output by 'STNAME', 'CTYNAME'. You can override using
## the `.groups` argument.
print(reduction2a)
## # A tibble: 31,430 × 6
## # Groups:   STNAME, CTYNAME [3,143]
##    STNAME  CTYNAME         YEAR TOT_POP TOT_MALE TOT_FEMALE
##    <chr>   <chr>          <int>   <dbl>    <dbl>      <dbl>
##  1 Alabama Autauga County     3   54761    26667      28094
##  2 Alabama Autauga County     4   55229    26980      28249
##  3 Alabama Autauga County     5   54970    26830      28140
##  4 Alabama Autauga County     6   54747    26588      28159
##  5 Alabama Autauga County     7   54922    26804      28118
##  6 Alabama Autauga County     8   54903    26752      28151
##  7 Alabama Autauga County     9   55302    26999      28303
##  8 Alabama Autauga County    10   55448    27041      28407
##  9 Alabama Autauga County    11   55533    27049      28484
## 10 Alabama Autauga County    12   55769    27078      28691
## # ℹ 31,420 more rows

Aggregate across Counties

# Collapse the county rows: total the population columns per state and year.
reduction3 <- reduction2a %>%
  group_by(STNAME, YEAR) %>%
  summarise(
    TOT_POP = sum(TOT_POP),
    TOT_MALE = sum(TOT_MALE),
    TOT_FEMALE = sum(TOT_FEMALE)
  )
## `summarise()` has grouped output by 'STNAME'. You can override using the
## `.groups` argument.
print(reduction3)
## # A tibble: 510 × 5
## # Groups:   STNAME [51]
##    STNAME   YEAR TOT_POP TOT_MALE TOT_FEMALE
##    <chr>   <int>   <dbl>    <dbl>      <dbl>
##  1 Alabama     3 4785514  2323013    2462501
##  2 Alabama     4 4799642  2328518    2471124
##  3 Alabama     5 4816632  2336196    2480436
##  4 Alabama     6 4831586  2343135    2488451
##  5 Alabama     7 4843737  2348012    2495725
##  6 Alabama     8 4854803  2352806    2501997
##  7 Alabama     9 4866824  2357211    2509613
##  8 Alabama    10 4877989  2360503    2517486
##  9 Alabama    11 4891628  2365445    2526183
## 10 Alabama    12 4907965  2371832    2536133
## # ℹ 500 more rows

Aggregate across States

# Total TOT_POP per state and year; TOT_MALE and TOT_FEMALE are not carried
# into this table.
reduction4 <- summarise(
  group_by(reduction3, STNAME, YEAR),
  TOT_POP = sum(TOT_POP)
)
## `summarise()` has grouped output by 'STNAME'. You can override using the
## `.groups` argument.
print(reduction4)
## # A tibble: 510 × 3
## # Groups:   STNAME [51]
##    STNAME   YEAR TOT_POP
##    <chr>   <int>   <dbl>
##  1 Alabama     3 4785514
##  2 Alabama     4 4799642
##  3 Alabama     5 4816632
##  4 Alabama     6 4831586
##  5 Alabama     7 4843737
##  6 Alabama     8 4854803
##  7 Alabama     9 4866824
##  8 Alabama    10 4877989
##  9 Alabama    11 4891628
## 10 Alabama    12 4907965
## # ℹ 500 more rows

Questions to ask

  • Is this true? I do get a single population by year for each state. The data reduction makes sense.