library(tidyverse)
library(magrittr)
library(readxl)
library(stargazer)
library(DT)
library(usmap)
data2018 <-
read_csv("G:/My Drive/homework/Matthew R/PEP_2018_PEPANNRES_with_ann.csv",
skip = 1)
data2018 %<>%
select(Geography, `April 1, 2010 - Census`, `Population Estimate (as of July 1) - 2018`) %>%
rename(state = Geography,
res2010 = `April 1, 2010 - Census`,
pep2018 = `Population Estimate (as of July 1) - 2018`)
data2018 %>% filter(!data2018$state %in% state.name)
## # A tibble: 3 x 3
## state res2010 pep2018
## <chr> <dbl> <dbl>
## 1 United States 308745538 327167434
## 2 District of Columbia 601723 702455
## 3 Puerto Rico 3725789 3195153
# Given US Total
data2018 %>%
filter(state == "United States") %>%
select(!pep2018)
## # A tibble: 1 x 2
## state res2010
## <chr> <dbl>
## 1 United States 308745538
# Given US Total includes D.C. but not Puerto Rico
data2018$res2010[2:52] %>% sum()
## [1] 308745538
data2018 %<>% filter(data2018$state %in% state.name)
data2018 %<>%
mutate(PercentChange2018 = (pep2018 - res2010) / res2010 * 100) %>%
select(!res2010)
data2010 <-
read_excel("G:/My Drive/homework/Matthew R/ApportionmentPopulation2010.xls",
skip = 8, n_max = 52)
## New names:
## * `` -> ...3
## * `` -> ...5
data2010 %<>%
slice(-c(1:2)) %>%
select(STATE, `(APRIL 1, 2010)`, `2010 CENSUS`) %>%
rename(state = STATE,
appor2010 = `(APRIL 1, 2010)`,
rep2010 = `2010 CENSUS`)
data2010 %>%
select(appor2010) %>%
as.data.frame() %>%
stargazer(.,
type = "text", # type = "html"
summary.stat = c("min", "max", "mean", "median", "sd"),
digits = 0)
##
## ==========================================================
## Statistic Min Max Mean Median St. Dev.
## ----------------------------------------------------------
## appor2010 568,300 37,341,989 6,183,669 4,452,284 6,869,559
## ----------------------------------------------------------
data2010 %>%
select(appor2010) %>%
pivot_longer(cols = appor2010, names_to = "Variable") %>%
group_by(Variable) %>%
summarize(across(.cols = value,
list(Min = min, Max = max, Mean = mean, Median = median, SD = sd),
.names = "{.fn}"
)
)
## # A tibble: 1 x 6
## Variable Min Max Mean Median SD
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 appor2010 568300 37341989 6183669. 4452284 6869559.
data2010 %>% slice_max(appor2010)
## # A tibble: 1 x 3
## state appor2010 rep2010
## <chr> <dbl> <dbl>
## 1 California 37341989 53
data2010 %>% slice_min(appor2010)
## # A tibble: 1 x 3
## state appor2010 rep2010
## <chr> <dbl> <dbl>
## 1 Wyoming 568300 1
data2010 %>% slice_max(appor2010, n = 5)
## # A tibble: 5 x 3
## state appor2010 rep2010
## <chr> <dbl> <dbl>
## 1 California 37341989 53
## 2 Texas 25268418 36
## 3 New York 19421055 27
## 4 Florida 18900773 27
## 5 Illinois 12864380 18
data2010 %>%
ggplot(aes(appor2010)) +
geom_histogram() +
geom_vline(xintercept = mean(data2010$appor2010), col = "red") +
geom_vline(xintercept = median(data2010$appor2010), col = "blue") +
labs(caption = "Red = Mean, Blue = Median") +
xlab("Apportionment Population") +
ggtitle("2010")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
data2010 %>%
ggplot(aes(appor2010)) +
geom_density() +
geom_vline(xintercept = mean(data2010$appor2010), col = "red") +
geom_vline(xintercept = median(data2010$appor2010), col = "blue") +
labs(caption = "Red = Mean, Blue = Median") +
xlab("Apportionment Population") +
ggtitle("2010")
data2010 %>%
ggplot(aes(log(appor2010))) +
geom_histogram() +
geom_vline(xintercept = mean(log(data2010$appor2010)), col = "red") +
geom_vline(xintercept = median(log(data2010$appor2010)), col = "blue") +
labs(caption = "Red = Mean, Blue = Median") +
xlab("Log of Apportionment Population") +
ggtitle("2010")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
data2010 %>%
ggplot(aes(log(appor2010))) +
geom_density() +
geom_vline(xintercept = mean(log(data2010$appor2010)), col = "red") +
geom_vline(xintercept = median(log(data2010$appor2010)), col = "blue") +
labs(caption = "Red = Mean, Blue = Median") +
xlab("Log of Apportionment Population") +
ggtitle("2010")
See vertical lines in Problem 4
data2010 %>%
ggplot(aes(appor2010, rep2010)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
xlab("Apportionment Population") +
ylab("Number of Representitives") +
ggtitle("2010")
## `geom_smooth()` using formula 'y ~ x'
data2010 %>% lm(rep2010 ~ appor2010, data = .) %>% summary()
##
## Call:
## lm(formula = rep2010 ~ appor2010, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48503 -0.21379 -0.00812 0.20630 0.55572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.868e-02 5.470e-02 -0.89 0.378
## appor2010 1.415e-06 5.951e-09 237.74 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2862 on 48 degrees of freedom
## Multiple R-squared: 0.9992, Adjusted R-squared: 0.9991
## F-statistic: 5.652e+04 on 1 and 48 DF, p-value: < 2.2e-16
data2010 %>%
ggplot(aes(log(appor2010), rep2010)) +
geom_smooth(method = "lm", se = FALSE) +
geom_smooth(method = "loess", se = FALSE, col = "gray65") +
geom_point() +
xlab("Log of Apportionment Population") +
ylab("Number of Representitives") +
ggtitle("2010")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
data2010 %>% lm(rep2010 ~ log(appor2010), data = .) %>% summary()
##
## Call:
## lm(formula = rep2010 ~ log(appor2010), data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.287 -3.566 -2.201 1.682 26.262
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -111.215 11.696 -9.509 1.28e-12 ***
## log(appor2010) 7.912 0.770 10.275 1.03e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.492 on 48 degrees of freedom
## Multiple R-squared: 0.6875, Adjusted R-squared: 0.6809
## F-statistic: 105.6 on 1 and 48 DF, p-value: 1.034e-13
https://www.census.gov/content/dam/Census/library/publications/2011/dec/c2010br-08.pdf#page=6
data.x <- data2010 %>% full_join(data2018)
## Joining, by = "state"
denom <- vector()
for (n in 2:60){denom[n - 1] <- 1 / sqrt(n*(n-1))}
PriorityValues <- tibble(state = rep(state.name, each = 59),
rank = c(t(outer(data.x$pep2018, denom))
)
)
# Alternative
PV <- matrix(nrow = 50, ncol = 59)
for (i in 1:50){
for (j in 2:60){
PV[i, j - 1] <- data.x$pep2018[i] * denom[j - 1]
}
}
PriorityValues %<>% slice_max(rank, n = 435 - 50)
# PriorityValues %>% count(state)
rep2020 <- table(PriorityValues$state) %>% as.data.frame()
names(rep2020) <- c("state", "rep2020est")
data.x %<>%
left_join(rep2020) %>%
replace_na(list(rep2020est = 0))
## Joining, by = "state"
data.x %<>% mutate(rep2020est = rep2020est + 1)
sum(data.x$rep2020est)
## [1] 435
data.x %>%
ggplot(aes(rep2020est, state)) +
geom_col()+
scale_y_discrete(limits = sort(state.name, decreasing = TRUE)) +
ggtitle("Estimated Total Representitives in 2020 Based on 2018 PEP") +
labs(x = "", y = "")
data.x %>% datatable(options = list(pageLength = 5))
data.x %>% summary()
## state appor2010 rep2010 pep2018
## Length:50 Min. : 568300 Min. : 1.00 Min. : 577737
## Class :character 1st Qu.: 1838822 1st Qu.: 3.00 1st Qu.: 1836691
## Mode :character Median : 4452284 Median : 6.00 Median : 4564190
## Mean : 6183669 Mean : 8.70 Mean : 6529300
## 3rd Qu.: 6704938 3rd Qu.: 9.75 3rd Qu.: 7444605
## Max. :37341989 Max. :53.00 Max. :39557045
## PercentChange2018 rep2020est
## Min. :-2.545 Min. : 1.00
## 1st Qu.: 1.832 1st Qu.: 2.25
## Median : 4.128 Median : 6.00
## Mean : 5.344 Mean : 8.70
## 3rd Qu.: 8.529 3rd Qu.:10.00
## Max. :14.372 Max. :53.00
data.x %>%
select(!state) %>%
pivot_longer(cols = everything(), names_to = "Variable") %>%
ggplot(aes(y = value)) +
geom_boxplot() +
facet_wrap(vars(Variable), scales = "free_y") +
theme(axis.text.x = element_blank())
# Top 3 estimated # Reps in 2020
data.x %>%
select(state, rep2010, rep2020est) %>%
slice_max(rep2020est, n = 3) %>%
mutate(PercentTotal2020 = rep2020est / 435 * 100)
## # A tibble: 3 x 4
## state rep2010 rep2020est PercentTotal2020
## <chr> <dbl> <dbl> <dbl>
## 1 California 53 53 12.2
## 2 Texas 36 38 8.74
## 3 Florida 27 28 6.44
# Estimated % top three reps in 2020
data.x %>%
select(rep2020est) %>%
slice_max(rep2020est, n = 3) %>%
sum() / 435 * 100
## [1] 27.35632
# Top 3 # Reps in 2010
data.x %>%
select(state, rep2010, rep2020est) %>%
slice_max(rep2010, n = 3)
## # A tibble: 4 x 3
## state rep2010 rep2020est
## <chr> <dbl> <dbl>
## 1 California 53 53
## 2 Texas 36 38
## 3 Florida 27 28
## 4 New York 27 26
data.x %>%
select(state, rep2010, rep2020est) %>%
filter(rep2010 == 1)
## # A tibble: 7 x 3
## state rep2010 rep2020est
## <chr> <dbl> <dbl>
## 1 Alaska 1 1
## 2 Delaware 1 1
## 3 Montana 1 1
## 4 North Dakota 1 1
## 5 South Dakota 1 1
## 6 Vermont 1 1
## 7 Wyoming 1 1
data.x %<>%
mutate(difference = as.character(rep2020est - rep2010))
data.x %>%
select(difference) %>%
table()
## .
## -1 0 1 2
## 7 37 5 1
data.x %>%
ggplot(aes(difference)) +
geom_bar() +
ggtitle("Estimated Representitive Change from 2010 to 2020 based on 2018 PEP") +
ylab("Number of States") +
xlab("Change")
plot_usmap(data = data.x,
values = "difference",
color = "black",
regions = "states",
labels = TRUE) +
scale_fill_brewer(type = 'qual', palette = 3, name = "Change") +
# labs(fill = "Change") +
theme(legend.position = "right") +
ggtitle("Estimated Representitive Change from 2010 to 2020 based on 2018 PEP")