homework

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Load the voter survey. readr guesses column types from the first rows only;
# weight_18_24_2018 is empty there, so it was guessed as logical and every real
# weight further down failed to parse and became NA. Declaring the column as
# double keeps those values; all other columns are still guessed.
Vote <- read_csv(
  "~/Downloads/Voter.csv",
  col_types = cols(weight_18_24_2018 = col_double())
)

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   weight_18_24_2018 = col_logical(),
##   izip_2019 = col_character(),
##   housevote_other_2019 = col_character(),
##   senatevote_other_2019 = col_character(),
##   senatevote2_other_2019 = col_character(),
##   SenCand1Name_2019 = col_character(),
##   SenCand1Party_2019 = col_character(),
##   SenCand2Name_2019 = col_character(),
##   SenCand2Party_2019 = col_character(),
##   SenCand3Name_2019 = col_character(),
##   SenCand3Party_2019 = col_character(),
##   SenCand1Name2_2019 = col_character(),
##   SenCand1Party2_2019 = col_character(),
##   SenCand2Name2_2019 = col_character(),
##   SenCand2Party2_2019 = col_character(),
##   SenCand3Name2_2019 = col_character(),
##   SenCand3Party2_2019 = col_character(),
##   governorvote_other_2019 = col_character(),
##   GovCand1Name_2019 = col_character(),
##   GovCand1Party_2019 = col_character()
##   # ... with 108 more columns
## )

## See spec(...) for full column specifications.

## Warning: 800 parsing failures.
##  row               col           expected           actual                    file
## 2033 weight_18_24_2018 1/0/T/F/TRUE/FALSE .917710168467982 '~/Downloads/Voter.csv'
## 2828 weight_18_24_2018 1/0/T/F/TRUE/FALSE 1.41022291345592 '~/Downloads/Voter.csv'
## 4511 weight_18_24_2018 1/0/T/F/TRUE/FALSE 1.77501243840922 '~/Downloads/Voter.csv'
## 7264 weight_18_24_2018 1/0/T/F/TRUE/FALSE 1.29486870319614 '~/Downloads/Voter.csv'
## 7277 weight_18_24_2018 1/0/T/F/TRUE/FALSE 1.44972719707603 '~/Downloads/Voter.csv'
## .... ................. .................. ................ .......................
## See problems(...) for more details.

head(Vote)

## # A tibble: 6 x 1,282
##   weight_2016 weight_2017 weight_panel_20… weight_latino_2… weight_18_24_20…
##         <dbl>       <dbl>            <dbl>            <dbl> <lgl>           
## 1       0.358       0.438            0.503               NA NA              
## 2       0.563       0.366            0.389               NA NA              
## 3       0.552       0.550            0.684               NA NA              
## 4       0.208      NA               NA                   NA NA              
## 5       0.334       0.346            0.322               NA NA              
## 6       0.207       0.148            0.594               NA NA              
## # … with 1,277 more variables: weight_overall_2018 <dbl>, weight_2019 <dbl>,
## #   weight1_2018 <dbl>, weight1_2019 <dbl>, weight2_2019 <dbl>,
## #   weight3_2019 <dbl>, cassfullcd <dbl>, vote2020_2019 <dbl>,
## #   trumpapp_2019 <dbl>, fav_trump_2019 <dbl>, fav_obama_2019 <dbl>,
## #   fav_hrc_2019 <dbl>, fav_sanders_2019 <dbl>, fav_putin_2019 <dbl>,
## #   fav_schumer_2019 <dbl>, fav_pelosi_2019 <dbl>, fav_comey_2019 <dbl>,
## #   fav_mueller_2019 <dbl>, fav_mcconnell_2019 <dbl>, fav_kavanaugh_2019 <dbl>,
## #   fav_biden_2019 <dbl>, fav_warren_2019 <dbl>, fav_harris_2019 <dbl>,
## #   fav_gillibrand_2019 <dbl>, fav_patrick_2019 <dbl>, fav_booker_2019 <dbl>,
## #   fav_garcetti_2019 <dbl>, fav_klobuchar_2019 <dbl>, fav_gorsuch_2019 <dbl>,
## #   fav_kasich_2019 <dbl>, fav_haley_2019 <dbl>, fav_bloomberg_2019 <dbl>,
## #   fav_holder_2019 <dbl>, fav_avenatti_2019 <dbl>, fav_castro_2019 <dbl>,
## #   fav_landrieu_2019 <dbl>, fav_orourke_2019 <dbl>,
## #   fav_hickenlooper_2019 <dbl>, fav_pence_2019 <dbl>, add_confirm_2019 <dbl>,
## #   izip_2019 <chr>, votereg_2019 <dbl>, votereg_f_2019 <dbl>,
## #   regzip_2019 <dbl>, region_2019 <dbl>, turnout18post_2019 <dbl>,
## #   tsmart_G2018_2019 <dbl>, tsmart_G2018_vote_type_2019 <dbl>,
## #   tsmart_P2018_2019 <dbl>, tsmart_P2018_party_2019 <dbl>,
## #   tsmart_P2018_vote_type_2019 <dbl>, housevote_2019 <dbl>,
## #   housevote_other_2019 <chr>, senatevote_2019 <dbl>,
## #   senatevote_other_2019 <chr>, senatevote2_2019 <dbl>,
## #   senatevote2_other_2019 <chr>, SenCand1Name_2019 <chr>,
## #   SenCand1Party_2019 <chr>, SenCand2Name_2019 <chr>,
## #   SenCand2Party_2019 <chr>, SenCand3Name_2019 <chr>,
## #   SenCand3Party_2019 <chr>, SenCand1Name2_2019 <chr>,
## #   SenCand1Party2_2019 <chr>, SenCand2Name2_2019 <chr>,
## #   SenCand2Party2_2019 <chr>, SenCand3Name2_2019 <chr>,
## #   SenCand3Party2_2019 <chr>, governorvote_2019 <dbl>,
## #   governorvote_other_2019 <chr>, GovCand1Name_2019 <chr>,
## #   GovCand1Party_2019 <chr>, GovCand2Name_2019 <chr>,
## #   GovCand2Party_2019 <chr>, GovCand3Name_2019 <chr>,
## #   GovCand3Party_2019 <chr>, inst_court_2019 <dbl>, inst_media_2019 <dbl>,
## #   inst_congress_2019 <dbl>, inst_justice_2019 <dbl>, inst_FBI_2019 <dbl>,
## #   inst_military_2019 <dbl>, inst_church_2019 <dbl>, inst_business_2019 <dbl>,
## #   Democrats_2019 <dbl>, Republicans_2019 <dbl>, Men_2019 <dbl>,
## #   Women_2019 <dbl>, wm_2019 <dbl>, ww_2019 <dbl>, bm_2019 <dbl>,
## #   bw_2019 <dbl>, hm_2019 <dbl>, hw_2019 <dbl>, rwm_2019 <dbl>,
## #   rww_2019 <dbl>, rbm_2019 <dbl>, rbw_2019 <dbl>, pwm_2019 <dbl>, …

Data Preparation

# Turn the numeric 2020 vote-intention codes into labels, blank out thermometer
# ratings above 100, and keep only the two groups being compared.
Vote <- Vote %>%
  mutate(
    vote2020_2019 = case_when(
      vote2020_2019 == 1 ~ "Donald Trump",
      vote2020_2019 == 2 ~ "The Democratic candidate",
      vote2020_2019 == 3 ~ "I would not vote",
      vote2020_2019 == 4 ~ "Not sure",
      TRUE ~ NA_character_
    ),
    # Values above 100 are set to missing (presumably non-response codes --
    # confirm against the codebook).
    Women_2019 = replace(Women_2019, which(Women_2019 > 100), NA)
  ) %>%
  select(Women_2019, vote2020_2019) %>%
  filter(vote2020_2019 %in% c("Donald Trump", "The Democratic candidate"))

Actual Hypothesis / Averages for both groups

# Mean rating of women within each vote-intention group, ignoring missing
# ratings.
summarize(
  group_by(Vote, vote2020_2019),
  Womens_2019 = mean(Women_2019, na.rm = TRUE)
)

## # A tibble: 2 x 2
##   vote2020_2019            Womens_2019
##   <chr>                          <dbl>
## 1 Donald Trump                    73.0
## 2 The Democratic candidate        81.9

This shows that respondents who would vote for the Democratic candidate report, on average, warmer feelings toward women (81.9) than those who would vote for Donald Trump (73.0).

Data Visualization of Population Distributions

# Histogram of the ratings of women, one panel per vote-intention group.
two_groups <- c("Donald Trump", "The Democratic candidate")

Vote %>%
  filter(vote2020_2019 %in% two_groups) %>%
  ggplot(aes(x = Women_2019, fill = vote2020_2019)) +
  geom_histogram() +
  facet_wrap(~vote2020_2019)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 305 rows containing non-finite values (stat_bin).

It seems that those who would vote for the Democratic candidate cluster at the higher ratings of their feelings toward women. By contrast, those who would vote for Donald Trump cluster more around the 50s.

Sampling Distribution:

# Simulated sampling distributions of the mean rating of women, one per
# vote-intention group.

# Fix the random seed so the simulated distributions are reproducible.
set.seed(2019)

# Draw `n_draws` random samples of `sample_size` ratings and return their means
# as a one-column data frame (column `mean`).
#
# Missing ratings are removed BEFORE sampling so that every mean is based on
# exactly `sample_size` observations; dropping them after sampling would make
# the effective sample size smaller and different on every draw.
sample_means <- function(ratings, n_draws = 10000, sample_size = 40) {
  ratings <- ratings[!is.na(ratings)]
  stopifnot(length(ratings) >= sample_size)
  data.frame(
    mean = replicate(n_draws, mean(sample(ratings, sample_size)))
  )
}

Donald_vote <- Vote %>%
  filter(vote2020_2019 == "Donald Trump") %>%
  pull(Women_2019) %>%
  sample_means()

Dem_vote <- Vote %>%
  filter(vote2020_2019 == "The Democratic candidate") %>%
  pull(Women_2019) %>%
  sample_means()

Sampling Population

# Overlay the two sampling distributions: Trump voters in red, Democratic
# voters in blue.
ggplot(mapping = aes(x = mean)) +
  geom_histogram(data = Donald_vote, fill = "red") +
  geom_histogram(data = Dem_vote, fill = "blue")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The two sampling distributions overlap slightly between the 70s and 80s. However, the distribution for those who would vote for the Democratic candidate is centered in the low 80s, while the distribution for those who would vote for Donald Trump is centered in the low 70s. Both sampling distributions are approximately normally distributed.

Null Hypothesis

# Overall mean rating of women across both groups (the value each group would
# share under the null hypothesis).
summarize(Vote, Women_2019 = mean(Women_2019, na.rm = TRUE))

## # A tibble: 1 x 1
##   Women_2019
##        <dbl>
## 1       77.9

If a person’s vote choice (Donald Trump or the Democratic candidate) makes no difference in their feelings toward women, then we should find that the group-wise averages are about 77.9 for members of both groups.

T-Test

# Welch two-sample t-test comparing the mean rating of women between the two
# vote-intention groups (defaults written out explicitly).
t.test(
  Women_2019 ~ vote2020_2019,
  data = Vote,
  alternative = "two.sided",
  mu = 0,
  var.equal = FALSE,
  conf.level = 0.95
)

## 
##  Welch Two Sample t-test
## 
## data:  Women_2019 by vote2020_2019
## t = -16.006, df = 4588.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -9.954328 -7.781945
## sample estimates:
##             mean in group Donald Trump mean in group The Democratic candidate 
##                               73.03526                               81.90339

# Strongly prefer fixed over scientific notation when printing numbers.
# This only affects output printed after this call.
options(scipen = 9999)

Since the p-value is less than .05, we reject the null hypothesis that there is no difference between the two group means; the difference is statistically significant.