Data Acquisition
# Load the raw census CSV (the path is local to the author's machine).
censusdata <- read.csv(
  "~/Desktop/Statistics 2 Homework/hw_wk3 3/DAV5303_HW3_Assignment_files/CC-EST2020-ALLDATA6.csv"
)
# Drop every row that has a missing value in any column of the raw file.
censusdata <- censusdata[complete.cases(censusdata), ]
# Keep only the identifier and population columns.
censusdata <- dplyr::select(
  censusdata,
  STNAME, CTYNAME, YEAR, AGEGRP, TOT_POP, TOT_MALE, TOT_FEMALE
)
# Store the trimmed data frame as an .RData file in the working directory.
save(censusdata, file = "censusdata.RData")
Data Split
# Rows with AGEGRP equal to 0 go to one table and all other rows to the other
# (presumably AGEGRP 0 is the all-ages total, going by the variable names).
total_agegrp <- dplyr::filter(censusdata, AGEGRP == 0)
nottotal_agegrp <- dplyr::filter(censusdata, AGEGRP != 0)
# Combined row count of the two subsets.
sum(nrow(total_agegrp), nrow(nottotal_agegrp))
## [1] 836038
Aggregate across Age Groups
# Convert the three population columns to numeric, one statement per column.
# NOTE(review): the warnings recorded below show that some values cannot be
# parsed as numbers and become NA -- confirm how those should be treated.
nottotal_agegrp[["TOT_POP"]] <- as.numeric(nottotal_agegrp[["TOT_POP"]])
## Warning: NAs introduced by coercion
nottotal_agegrp[["TOT_MALE"]] <- as.numeric(nottotal_agegrp[["TOT_MALE"]])
## Warning: NAs introduced by coercion
nottotal_agegrp[["TOT_FEMALE"]] <- as.numeric(nottotal_agegrp[["TOT_FEMALE"]])
## Warning: NAs introduced by coercion
# Sum the population counts over the age-group rows, giving one row per
# state, county and year.
# NOTE(review): sum() is called without na.rm, so any NA produced by the
# numeric conversion propagates into that group's total -- confirm intended.
reduction1 <- nottotal_agegrp %>%
  # select(AGEGRP, TOT_POP, TOT_MALE, TOT_FEMALE) %>%  # step left disabled
  group_by(STNAME, CTYNAME, YEAR) %>%
  summarise(across(c(TOT_POP, TOT_MALE, TOT_FEMALE), sum))
## `summarise()` has grouped output by 'STNAME', 'CTYNAME'. You can override using
## the `.groups` argument.
print(reduction1)
## # A tibble: 44,002 × 6
## # Groups: STNAME, CTYNAME [3,143]
## STNAME CTYNAME YEAR TOT_POP TOT_MALE TOT_FEMALE
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Alabama Autauga County 1 54571 26569 28002
## 2 Alabama Autauga County 2 54582 26576 28006
## 3 Alabama Autauga County 3 54761 26667 28094
## 4 Alabama Autauga County 4 55229 26980 28249
## 5 Alabama Autauga County 5 54970 26830 28140
## 6 Alabama Autauga County 6 54747 26588 28159
## 7 Alabama Autauga County 7 54922 26804 28118
## 8 Alabama Autauga County 8 54903 26752 28151
## 9 Alabama Autauga County 9 55302 26999 28303
## 10 Alabama Autauga County 10 55448 27041 28407
## # ℹ 43,992 more rows
Q3 Questions to ask
- Does this reduced data set have the same dimensions as set1? No:
reduction1 has only 6 variables, but it does have the same number of
observations. *Comment more than just yes/no. E.g. should they?* It should
have the same number of observations because we are summing these
variables across age groups, not filtering out observations.
# Count missing AGEGRP values in the non-total subset.
sum(is.na(nottotal_agegrp[["AGEGRP"]]))
## [1] 0
Aggregate across Years
# Restrict the county/year table to YEAR codes 3 through 12 (inclusive).
# NOTE(review): reduction1 already holds one row per STNAME/CTYNAME/YEAR, so
# regrouping on the same keys sums single-row groups and leaves the values
# unchanged; only the filter alters the data here.
reduction2a <- reduction1 %>%
  dplyr::filter(YEAR >= 3, YEAR <= 12) %>%
  group_by(STNAME, CTYNAME, YEAR) %>%
  summarise(across(c(TOT_POP, TOT_MALE, TOT_FEMALE), sum))
## `summarise()` has grouped output by 'STNAME', 'CTYNAME'. You can override using
## the `.groups` argument.
print(reduction2a)
## # A tibble: 31,430 × 6
## # Groups: STNAME, CTYNAME [3,143]
## STNAME CTYNAME YEAR TOT_POP TOT_MALE TOT_FEMALE
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Alabama Autauga County 3 54761 26667 28094
## 2 Alabama Autauga County 4 55229 26980 28249
## 3 Alabama Autauga County 5 54970 26830 28140
## 4 Alabama Autauga County 6 54747 26588 28159
## 5 Alabama Autauga County 7 54922 26804 28118
## 6 Alabama Autauga County 8 54903 26752 28151
## 7 Alabama Autauga County 9 55302 26999 28303
## 8 Alabama Autauga County 10 55448 27041 28407
## 9 Alabama Autauga County 11 55533 27049 28484
## 10 Alabama Autauga County 12 55769 27078 28691
## # ℹ 31,420 more rows
Aggregate across Counties
# Roll the county rows up to one row per state and year by summing the three
# population columns.
reduction3 <- reduction2a %>%
  group_by(STNAME, YEAR) %>%
  summarise(across(c(TOT_POP, TOT_MALE, TOT_FEMALE), sum))
## `summarise()` has grouped output by 'STNAME'. You can override using the
## `.groups` argument.
print(reduction3)
## # A tibble: 510 × 5
## # Groups: STNAME [51]
## STNAME YEAR TOT_POP TOT_MALE TOT_FEMALE
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Alabama 3 4785514 2323013 2462501
## 2 Alabama 4 4799642 2328518 2471124
## 3 Alabama 5 4816632 2336196 2480436
## 4 Alabama 6 4831586 2343135 2488451
## 5 Alabama 7 4843737 2348012 2495725
## 6 Alabama 8 4854803 2352806 2501997
## 7 Alabama 9 4866824 2357211 2509613
## 8 Alabama 10 4877989 2360503 2517486
## 9 Alabama 11 4891628 2365445 2526183
## 10 Alabama 12 4907965 2371832 2536133
## # ℹ 500 more rows
Aggregate across States
# Keep only the total population for each state and year.
# NOTE(review): the grouping keys are the same as reduction3's, so no rows are
# combined here; the result is reduction3 without the male/female columns.
# If a single total per year across all states is wanted, group by YEAR only
# -- confirm against the assignment.
reduction4 <- reduction3 %>%
  group_by(STNAME, YEAR) %>%
  summarise(across(TOT_POP, sum))
## `summarise()` has grouped output by 'STNAME'. You can override using the
## `.groups` argument.
print(reduction4)
## # A tibble: 510 × 3
## # Groups: STNAME [51]
## STNAME YEAR TOT_POP
## <chr> <int> <dbl>
## 1 Alabama 3 4785514
## 2 Alabama 4 4799642
## 3 Alabama 5 4816632
## 4 Alabama 6 4831586
## 5 Alabama 7 4843737
## 6 Alabama 8 4854803
## 7 Alabama 9 4866824
## 8 Alabama 10 4877989
## 9 Alabama 11 4891628
## 10 Alabama 12 4907965
## # ℹ 500 more rows
Questions to ask
- Is this true? Yes, I get a single total population per year for each state.
The data reduction makes sense.