library(tidyverse)
library(magrittr)
library(readxl)
library(stargazer)
library(DT)
library(usmap)

Problem 2

data2018 <-
  read_csv("G:/My Drive/homework/Matthew R/PEP_2018_PEPANNRES_with_ann.csv",
           skip = 1)

data2018 %<>%
  select(Geography, `April 1, 2010 - Census`, `Population Estimate (as of July 1) - 2018`) %>%
  rename(state = Geography,
         res2010 = `April 1, 2010 - Census`, 
         pep2018 = `Population Estimate (as of July 1) - 2018`)

Part (a)

data2018 %>% filter(!data2018$state %in% state.name)
## # A tibble: 3 x 3
##   state                  res2010   pep2018
##   <chr>                    <dbl>     <dbl>
## 1 United States        308745538 327167434
## 2 District of Columbia    601723    702455
## 3 Puerto Rico            3725789   3195153

Part (b)

# Given US Total
data2018 %>% 
  filter(state == "United States") %>%
  select(!pep2018)
## # A tibble: 1 x 2
##   state           res2010
##   <chr>             <dbl>
## 1 United States 308745538
# Given US Total includes D.C. but not Puerto Rico
data2018$res2010[2:52] %>% sum()
## [1] 308745538
data2018 %<>% filter(data2018$state %in% state.name)

Part (c)

data2018 %<>%
  mutate(PercentChange2018 = (pep2018 - res2010) / res2010 * 100) %>%
  select(!res2010)

Problem 3

data2010 <-
  read_excel("G:/My Drive/homework/Matthew R/ApportionmentPopulation2010.xls",
             skip = 8, n_max = 52)
## New names:
## * `` -> ...3
## * `` -> ...5
data2010 %<>%
  slice(-c(1:2)) %>%
  select(STATE, `(APRIL 1, 2010)`, `2010 CENSUS`) %>%
  rename(state = STATE,
         appor2010 = `(APRIL 1, 2010)`,
         rep2010 = `2010 CENSUS`)

Part (a)

data2010 %>%
  select(appor2010) %>%
  as.data.frame() %>%
  stargazer(.,
            type = "text", # type = "html"
            summary.stat = c("min", "max", "mean", "median", "sd"),
            digits = 0)
## 
## ==========================================================
## Statistic   Min      Max       Mean     Median   St. Dev. 
## ----------------------------------------------------------
## appor2010 568,300 37,341,989 6,183,669 4,452,284 6,869,559
## ----------------------------------------------------------
data2010 %>% 
  select(appor2010) %>%
  pivot_longer(cols = appor2010, names_to = "Variable") %>%
  group_by(Variable) %>%
  summarize(across(.cols = value,
                   list(Min = min, Max = max, Mean = mean, Median = median, SD = sd),
                   .names = "{.fn}"
                   )
            )
## # A tibble: 1 x 6
##   Variable     Min      Max     Mean  Median       SD
##   <chr>      <dbl>    <dbl>    <dbl>   <dbl>    <dbl>
## 1 appor2010 568300 37341989 6183669. 4452284 6869559.

Part (b)

data2010 %>% slice_max(appor2010)
## # A tibble: 1 x 3
##   state      appor2010 rep2010
##   <chr>          <dbl>   <dbl>
## 1 California  37341989      53
data2010 %>% slice_min(appor2010)
## # A tibble: 1 x 3
##   state   appor2010 rep2010
##   <chr>       <dbl>   <dbl>
## 1 Wyoming    568300       1
data2010 %>% slice_max(appor2010, n = 5)
## # A tibble: 5 x 3
##   state      appor2010 rep2010
##   <chr>          <dbl>   <dbl>
## 1 California  37341989      53
## 2 Texas       25268418      36
## 3 New York    19421055      27
## 4 Florida     18900773      27
## 5 Illinois    12864380      18

Problem 4

Part (a)

data2010 %>%
  ggplot(aes(appor2010)) +
  geom_histogram() +
  geom_vline(xintercept = mean(data2010$appor2010), col = "red") +
  geom_vline(xintercept = median(data2010$appor2010), col = "blue") +
  labs(caption = "Red = Mean, Blue = Median") + 
  xlab("Apportionment Population") +
  ggtitle("2010")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

data2010 %>%
  ggplot(aes(appor2010)) +
  geom_density() +
  geom_vline(xintercept = mean(data2010$appor2010), col = "red") +
  geom_vline(xintercept = median(data2010$appor2010), col = "blue") +
  labs(caption = "Red = Mean, Blue = Median") + 
  xlab("Apportionment Population") +
  ggtitle("2010")

Part (b)

data2010 %>%
  ggplot(aes(log(appor2010))) +
  geom_histogram() +
  geom_vline(xintercept = mean(log(data2010$appor2010)), col = "red") +
  geom_vline(xintercept = median(log(data2010$appor2010)), col = "blue") +
  labs(caption = "Red = Mean, Blue = Median") + 
  xlab("Log of Apportionment Population") +
  ggtitle("2010")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

data2010 %>%
  ggplot(aes(log(appor2010))) +
  geom_density() +
  geom_vline(xintercept = mean(log(data2010$appor2010)), col = "red") +
  geom_vline(xintercept = median(log(data2010$appor2010)), col = "blue") +
  labs(caption = "Red = Mean, Blue = Median") + 
  xlab("Log of Apportionment Population") +
  ggtitle("2010")

Problem 5

See vertical lines in Problem 4

Problem 6

Part (a)

data2010 %>%
  ggplot(aes(appor2010, rep2010)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) + 
  xlab("Apportionment Population") +
  ylab("Number of Representitives") +
  ggtitle("2010")
## `geom_smooth()` using formula 'y ~ x'

data2010 %>% lm(rep2010 ~ appor2010, data = .) %>% summary()
## 
## Call:
## lm(formula = rep2010 ~ appor2010, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48503 -0.21379 -0.00812  0.20630  0.55572 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.868e-02  5.470e-02   -0.89    0.378    
## appor2010    1.415e-06  5.951e-09  237.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2862 on 48 degrees of freedom
## Multiple R-squared:  0.9992, Adjusted R-squared:  0.9991 
## F-statistic: 5.652e+04 on 1 and 48 DF,  p-value: < 2.2e-16

Part (b)

data2010 %>%
  ggplot(aes(log(appor2010), rep2010)) +
  geom_smooth(method = "lm", se = FALSE) + 
  geom_smooth(method = "loess", se = FALSE, col = "gray65") +
  geom_point() +
  xlab("Log of Apportionment Population") +
  ylab("Number of Representitives") +
  ggtitle("2010")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

data2010 %>% lm(rep2010 ~ log(appor2010), data = .) %>% summary()
## 
## Call:
## lm(formula = rep2010 ~ log(appor2010), data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.287 -3.566 -2.201  1.682 26.262 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -111.215     11.696  -9.509 1.28e-12 ***
## log(appor2010)    7.912      0.770  10.275 1.03e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.492 on 48 degrees of freedom
## Multiple R-squared:  0.6875, Adjusted R-squared:  0.6809 
## F-statistic: 105.6 on 1 and 48 DF,  p-value: 1.034e-13

Problem 7

https://www.census.gov/content/dam/Census/library/publications/2011/dec/c2010br-08.pdf#page=6

data.x <- data2010 %>% full_join(data2018)
## Joining, by = "state"

Step (1)

denom <- vector()
for (n in 2:60){denom[n - 1] <- 1 / sqrt(n*(n-1))}

Step (2) (3)

PriorityValues <- tibble(state = rep(state.name, each = 59),
                         rank = c(t(outer(data.x$pep2018, denom))
                                )
                         )

# Alternative
PV <- matrix(nrow = 50, ncol = 59)
for (i in 1:50){
  for (j in 2:60){
    PV[i, j - 1] <- data.x$pep2018[i] * denom[j - 1] 
  }
}

Step (4)

PriorityValues %<>% slice_max(rank, n = 435 - 50)

Step (5)

# PriorityValues %>% count(state)
rep2020 <- table(PriorityValues$state) %>% as.data.frame()
names(rep2020) <- c("state", "rep2020est")

Step (6)

data.x %<>% 
  left_join(rep2020) %>% 
  replace_na(list(rep2020est = 0))
## Joining, by = "state"

Step (7)

data.x %<>% mutate(rep2020est = rep2020est + 1)
sum(data.x$rep2020est)
## [1] 435
data.x %>%
  ggplot(aes(rep2020est, state)) +
  geom_col()+
  scale_y_discrete(limits = sort(state.name, decreasing = TRUE)) +
  ggtitle("Estimated Total Representitives in 2020 Based on 2018 PEP") +
  labs(x = "", y = "")

data.x %>% datatable(options = list(pageLength = 5))
data.x %>% summary()
##     state             appor2010           rep2010         pep2018        
##  Length:50          Min.   :  568300   Min.   : 1.00   Min.   :  577737  
##  Class :character   1st Qu.: 1838822   1st Qu.: 3.00   1st Qu.: 1836691  
##  Mode  :character   Median : 4452284   Median : 6.00   Median : 4564190  
##                     Mean   : 6183669   Mean   : 8.70   Mean   : 6529300  
##                     3rd Qu.: 6704938   3rd Qu.: 9.75   3rd Qu.: 7444605  
##                     Max.   :37341989   Max.   :53.00   Max.   :39557045  
##  PercentChange2018   rep2020est   
##  Min.   :-2.545    Min.   : 1.00  
##  1st Qu.: 1.832    1st Qu.: 2.25  
##  Median : 4.128    Median : 6.00  
##  Mean   : 5.344    Mean   : 8.70  
##  3rd Qu.: 8.529    3rd Qu.:10.00  
##  Max.   :14.372    Max.   :53.00
data.x %>%
  select(!state) %>%
  pivot_longer(cols = everything(), names_to = "Variable") %>%
  ggplot(aes(y = value)) +
  geom_boxplot() +
  facet_wrap(vars(Variable), scales = "free_y") +
  theme(axis.text.x = element_blank())

Part (a)

# Top 3 estimated # Reps in 2020
data.x %>%
  select(state, rep2010, rep2020est) %>%
  slice_max(rep2020est, n = 3) %>%
  mutate(PercentTotal2020 = rep2020est / 435 * 100)
## # A tibble: 3 x 4
##   state      rep2010 rep2020est PercentTotal2020
##   <chr>        <dbl>      <dbl>            <dbl>
## 1 California      53         53            12.2 
## 2 Texas           36         38             8.74
## 3 Florida         27         28             6.44
# Estimated % top three reps in 2020
data.x %>%
  select(rep2020est) %>%
  slice_max(rep2020est, n = 3) %>%
  sum() / 435 * 100
## [1] 27.35632
# Top 3 # Reps in 2010
data.x %>%
  select(state, rep2010, rep2020est) %>%
  slice_max(rep2010, n = 3)
## # A tibble: 4 x 3
##   state      rep2010 rep2020est
##   <chr>        <dbl>      <dbl>
## 1 California      53         53
## 2 Texas           36         38
## 3 Florida         27         28
## 4 New York        27         26

Part (b)

data.x %>%
  select(state, rep2010, rep2020est) %>%
  filter(rep2010 == 1)
## # A tibble: 7 x 3
##   state        rep2010 rep2020est
##   <chr>          <dbl>      <dbl>
## 1 Alaska             1          1
## 2 Delaware           1          1
## 3 Montana            1          1
## 4 North Dakota       1          1
## 5 South Dakota       1          1
## 6 Vermont            1          1
## 7 Wyoming            1          1

Problem 8

data.x %<>%
  mutate(difference = as.character(rep2020est - rep2010))

data.x %>%
  select(difference) %>%
  table()
## .
## -1  0  1  2 
##  7 37  5  1
data.x %>%
  ggplot(aes(difference)) +
  geom_bar() +
  ggtitle("Estimated Representitive Change from 2010 to 2020 based on 2018 PEP") +
  ylab("Number of States") +
  xlab("Change")

Problem 9

Part (a)

plot_usmap(data = data.x,
           values = "difference",
           color = "black", 
           regions = "states",
           labels = TRUE) +
  scale_fill_brewer(type = 'qual', palette = 3, name = "Change") +
  # labs(fill = "Change") +
  theme(legend.position = "right") +
  ggtitle("Estimated Representitive Change from 2010 to 2020 based on 2018 PEP")