Read in data

Turk data

## Load the raw Turk export.
## - slice(-1:-3) drops the first three data rows
##   (NOTE(review): presumably non-response metadata rows -- confirm).
## - select(18:36) keeps columns 18 through 36 by position.
## - id is a sequential respondent index; row_number() is used instead of
##   1:n() because 1:n() evaluates to c(1, 0) on an empty table and makes
##   mutate() fail.
td <- read_csv("turk_data.csv") %>%
  slice(-1:-3) %>%
  select(18:36) %>%
  mutate(id = row_number())

## Lower-case the Sex responses, then store every column of td as a factor.
## The function is passed to mutate_all() directly: wrapping it in funs() is
## deprecated in dplyr and produces the same columns.
td <- td %>%
  mutate(Sex = tolower(Sex)) %>%
  mutate_all(as.factor)


## Lower-case every column name of td.
td <- setNames(td, tolower(names(td)))

Pew data

## Load the Pew survey file, rename var/response to variable/value, and
## store the lower-cased variable names as a factor.
pew <- read_csv("pewsurvey.csv") %>%
  rename(
    value = response,
    variable = var
  ) %>%
  mutate(variable = as.factor(tolower(variable)))

Compare to Pew data

## Prepare the Turk responses for comparison: drop columns 16-20 by
## position, give the pol*/techq20 items their q.* names, and recode intmob
## into the two internet-use labels.
## NOTE(review): the intmob comparison is case-sensitive -- only an exact
## "yes" maps to "use internet"; confirm this matches the raw coding.
munged.props <- td %>%
  select(-(16:20)) %>%
  rename(
    q.1 = pol1,
    q.1a = pol1aa,
    q.19f2 = pol19f2,
    q.20f1 = pol20f1,
    q.30f2 = pol30f2,
    q.62f1 = pol62f1a,
    q20 = techq20
  ) %>%
  mutate(
    intmob = ifelse(intmob == "yes", "use internet", "do not use internet")
  )
  
## Reshape to one row per (variable, value) response, count each pair, and
## turn the counts into within-variable proportions. summarise() peels off
## the innermost grouping level, so sum(n) in the following mutate() is the
## total per variable. The "attention" item is dropped, and value/variable
## are lower-cased (variable stored as a factor).
raw.props <- munged.props %>%
  gather(variable, value) %>%
  group_by(variable, value) %>%
  summarise(n = n()) %>%
  mutate(obs.prop = n / sum(n)) %>%
  ungroup() %>%
  filter(variable != "attention") %>%
  mutate(
    value = tolower(value),
    variable = as.factor(tolower(variable))
  )

## Line up the observed Turk proportions with the Pew estimates on
## (value, variable). Rows with a missing value are removed, a missing
## obs.prop is set to 0, and pew.estimates is divided by 100.
raw.d <- raw.props %>%
  full_join(pew, by = c("value", "variable")) %>%
  filter(!is.na(value)) %>%
  mutate(
    obs.prop = ifelse(is.na(obs.prop), 0, obs.prop),
    pew.estimates = pew.estimates / 100
  ) %>%
  arrange(variable)
  
  
## what's up with q.13

## Plot
## Scatter of observed proportions (x) against Pew estimates (y) with the
## y = x reference line; the y axis is limited to [0, 1].
ggplot(data = raw.d, mapping = aes(x = obs.prop, y = pew.estimates)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0) +
  scale_y_continuous(limits = c(0, 1)) +
  ggtitle("raw unweighted estimates") +
  theme_bw()

Correlations

## Pearson correlation test between observed proportions and Pew estimates.
cor.test(x = raw.d$obs.prop, y = raw.d$pew.estimates, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  raw.d$obs.prop and raw.d$pew.estimates
## t = 4.6907, df = 38, p-value = 3.472e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3624086 0.7715264
## sample estimates:
##       cor 
## 0.6055546

Population data

## Load the population table, store RACE5, HISP, STNAME and SEX as factors,
## and drop the DATE and STATE columns.
## mutate_at(vars(...), fn) replaces the deprecated mutate_each(funs(...), ...)
## and converts the same four columns in place.
pops <- read_csv("population.csv") %>%
  mutate_at(vars(RACE5, HISP, STNAME, SEX), as.factor) %>%
  select(-DATE, -STATE)

## Summarise every column of pops with skim() and render it as a table.
kable(skim(pops))
var type stat level value
AGE integer missing .all 0.00000
AGE integer complete .all 87720.00000
AGE integer n .all 87720.00000
AGE integer mean .all 42.50000
AGE integer sd .all 24.82452
AGE integer min .all 0.00000
AGE integer median .all 42.50000
AGE integer quantile 25% 21.00000
AGE integer quantile 75% 64.00000
AGE integer max .all 85.00000
AGE integer hist ▇▇▇▇▇▇▇▇▇▇ 0.00000
SEX factor missing .all 0.00000
SEX factor complete .all 87720.00000
SEX factor n .all 87720.00000
SEX factor count Female 43860.00000
SEX factor count Male 43860.00000
SEX factor count NA 0.00000
SEX factor n_unique .all 2.00000
RACE5 factor missing .all 0.00000
RACE5 factor complete .all 87720.00000
RACE5 factor n .all 87720.00000
RACE5 factor count AIAN alone or in combination 17544.00000
RACE5 factor count Asian alone or in combination 17544.00000
RACE5 factor count Black alone or in combination 17544.00000
RACE5 factor count NHPI alone or in combination 17544.00000
RACE5 factor count White alone or in combination 17544.00000
RACE5 factor count NA 0.00000
RACE5 factor n_unique .all 5.00000
HISP factor missing .all 0.00000
HISP factor complete .all 87720.00000
HISP factor n .all 87720.00000
HISP factor count Hispanic 43860.00000
HISP factor count Non-Hispanic 43860.00000
HISP factor count NA 0.00000
HISP factor n_unique .all 2.00000
STNAME factor missing .all 0.00000
STNAME factor complete .all 87720.00000
STNAME factor n .all 87720.00000
STNAME factor count Alabama 1720.00000
STNAME factor count Alaska 1720.00000
STNAME factor count Arizona 1720.00000
STNAME factor count Arkansas 1720.00000
STNAME factor count California 1720.00000
STNAME factor count Colorado 1720.00000
STNAME factor count Connecticut 1720.00000
STNAME factor count Delaware 1720.00000
STNAME factor count District of Columbia 1720.00000
STNAME factor count Florida 1720.00000
STNAME factor count Georgia 1720.00000
STNAME factor count Hawaii 1720.00000
STNAME factor count Idaho 1720.00000
STNAME factor count Illinois 1720.00000
STNAME factor count Indiana 1720.00000
STNAME factor count Iowa 1720.00000
STNAME factor count Kansas 1720.00000
STNAME factor count Kentucky 1720.00000
STNAME factor count Louisiana 1720.00000
STNAME factor count Maine 1720.00000
STNAME factor count Maryland 1720.00000
STNAME factor count Massachusetts 1720.00000
STNAME factor count Michigan 1720.00000
STNAME factor count Minnesota 1720.00000
STNAME factor count Mississippi 1720.00000
STNAME factor count Missouri 1720.00000
STNAME factor count Montana 1720.00000
STNAME factor count Nebraska 1720.00000
STNAME factor count Nevada 1720.00000
STNAME factor count New Hampshire 1720.00000
STNAME factor count New Jersey 1720.00000
STNAME factor count New Mexico 1720.00000
STNAME factor count New York 1720.00000
STNAME factor count North Carolina 1720.00000
STNAME factor count North Dakota 1720.00000
STNAME factor count Ohio 1720.00000
STNAME factor count Oklahoma 1720.00000
STNAME factor count Oregon 1720.00000
STNAME factor count Pennsylvania 1720.00000
STNAME factor count Rhode Island 1720.00000
STNAME factor count South Carolina 1720.00000
STNAME factor count South Dakota 1720.00000
STNAME factor count Tennessee 1720.00000
STNAME factor count Texas 1720.00000
STNAME factor count Utah 1720.00000
STNAME factor count Vermont 1720.00000
STNAME factor count Virginia 1720.00000
STNAME factor count Washington 1720.00000
STNAME factor count West Virginia 1720.00000
STNAME factor count Wisconsin 1720.00000
STNAME factor count Wyoming 1720.00000
STNAME factor count NA 0.00000
STNAME factor n_unique .all 51.00000
POP integer missing .all 0.00000
POP integer complete .all 87720.00000
POP integer n .all 87720.00000
POP integer mean .all 3735.17396
POP integer sd .all 11283.22860
POP integer min .all 0.00000
POP integer median .all 182.00000
POP integer quantile 25% 29.00000
POP integer quantile 75% 1541.00000
POP integer max .all 291539.00000
POP integer hist ▇▁▁▁▁▁▁▁▁▁ 0.00000