library(socsci)
library(car)
# Read the small CCES teaching extract straight from GitHub into `cces`.
cces <- read_csv(
  "https://raw.githubusercontent.com/ryanburge/cces/master/CCES%20for%20Methods/small_cces.csv"
)
Let’s start by taking a look at our dataset using the pipe (%>%) and the glimpse command. You can insert a pipe easily by hitting CTRL + SHIFT + M.
# Print a compact overview of the data: one line per column showing its
# name, type, and first few values.
cces %>%
  glimpse()
## Observations: 64,600
## Variables: 33
## $ X1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ X1_1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ id <dbl> 222168628, 273691199, 284214415, 287557695, 2903876...
## $ state <dbl> 33, 22, 29, 1, 8, 1, 48, 42, 13, 42, 15, 48, 12, 48...
## $ birthyr <dbl> 1969, 1994, 1964, 1988, 1982, 1963, 1962, 1991, 196...
## $ gender <dbl> 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, ...
## $ educ <dbl> 2, 2, 2, 2, 5, 2, 2, 1, 2, 2, 5, 3, 3, 4, 3, 3, 3, ...
## $ race <dbl> 1, 1, 2, 2, 1, 6, 1, 1, 1, 1, 4, 1, 6, 3, 6, 7, 1, ...
## $ marital <dbl> 1, 5, 5, 5, 1, 4, 2, 2, 1, 1, 1, 1, 1, 6, 5, 5, 5, ...
## $ natecon <dbl> 3, 4, 5, 4, 2, 4, 3, 5, 4, 5, 2, 6, 3, 3, 3, 3, 6, ...
## $ mymoney <dbl> 2, 3, 2, 4, 2, 4, 3, 5, 4, 3, 2, 1, 3, 3, 2, 5, 4, ...
## $ econfuture <dbl> 6, 5, 4, 5, 6, 4, 3, 6, 6, 5, 3, 6, 6, 2, 2, 5, 6, ...
## $ police <dbl> 2, 3, 2, 2, 2, 3, 2, 2, 2, 1, 2, 1, 3, 2, 1, 4, 2, ...
## $ background <dbl> 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, ...
## $ registry <dbl> 2, 1, 2, 2, 2, 1, 1, 2, 1, 1, 8, 1, 1, 2, 1, 2, 1, ...
## $ assaultban <dbl> 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 2, ...
## $ conceal <dbl> 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, ...
## $ pathway <dbl> 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 2, 1, 1, ...
## $ border <dbl> 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, ...
## $ dreamer <dbl> 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 1, 1, ...
## $ deport <dbl> 1, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, ...
## $ prochoice <dbl> 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 1, ...
## $ prolife <dbl> 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, ...
## $ gaym <dbl> 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ employ <dbl> 7, 7, 6, 6, 2, 6, 1, 4, 3, 1, 5, 1, 7, 2, 1, 4, 4, ...
## $ pid7 <dbl> 5, 4, 1, 4, 2, 2, 6, 4, 7, 7, 2, 4, 4, 1, 4, 1, 2, ...
## $ attend <dbl> 6, 8, 3, 4, 6, 2, 2, 5, 4, 5, 5, 5, 4, 6, 6, 6, 6, ...
## $ religion <dbl> 11, 98, 2, 11, 10, 1, 1, 11, 11, 2, 1, 1, 10, 11, 9...
## $ vote16 <dbl> 1, 1, NA, NA, 2, 99, 1, NA, 1, 1, 2, 5, 99, NA, 5, ...
## $ ideo5 <dbl> 3, 3, 5, 4, 2, 6, 5, 3, 4, 3, 3, 4, 6, 1, 3, 1, 2, ...
## $ union <dbl> 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, ...
## $ income <dbl> 97, 6, 4, 1, 7, 1, 3, 1, 4, 7, 10, 5, 2, 2, 3, 1, 6...
## $ sexuality <dbl> 1, 1, 1, 1, 1, 1, 1, 9, 1, 1, 1, 1, 6, 1, 1, 1, 1, ...
One of the most helpful ways to get a simple sense of how many respondents of each gender are in the dataset is the ct command, which produces a frequency table. Again, this uses cces and then the pipe. You need to take a look at the codebook to understand what one and two represent. I know that 1 = male and 2 = female.
# Frequency table of gender: ct() returns the count (n) and the share (pct)
# of each value.
cces %>%
  ct(gender)
## # A tibble: 2 x 3
## gender n pct
## <dbl> <int> <dbl>
## 1 1 29531 0.457
## 2 2 35069 0.543
R has a lot of cool functions. One of the most helpful is called filter. It will filter the data based on whatever parameters we have set up.
The racial breakdown of women in the dataset. It’s important that you do == (2 equal signs).
# Keep only the rows where gender equals 2 (women), then tabulate race.
cces %>%
  filter(gender == 2) %>%
  ct(race)
## # A tibble: 8 x 3
## race n pct
## <dbl> <int> <dbl>
## 1 1 24489 0.698
## 2 2 5137 0.146
## 3 3 2894 0.083
## 4 4 1115 0.032
## 5 5 248 0.007
## 6 6 858 0.024
## 7 7 261 0.007
## 8 8 67 0.002
The gender breakdown of just Hispanics OR Asians. You would use the |, which is just above the enter key. You have to hit shift.
# `|` is a logical OR: keep rows where race is 3 or race is 4, then
# tabulate gender within that subset.
cces %>%
  filter(race == 3 | race == 4) %>%
  ct(gender)
## # A tibble: 2 x 3
## gender n pct
## <dbl> <int> <dbl>
## 1 1 3507 0.467
## 2 2 4009 0.533
What about the party identification (the pid7 variable) of Black males? Combining two conditions uses the & symbol.
# `&` is a logical AND: keep rows where race is 2 and gender is 1, then
# tabulate pid7 within that subset.
cces %>%
  filter(race == 2 & gender == 1) %>%
  ct(pid7)
## # A tibble: 10 x 3
## pid7 n pct
## <dbl> <int> <dbl>
## 1 1 1311 0.47
## 2 2 524 0.188
## 3 3 321 0.115
## 4 4 323 0.116
## 5 5 66 0.024
## 6 6 68 0.024
## 7 7 95 0.034
## 8 8 78 0.028
## 9 98 2 0.001
## 10 99 1 0
The gender breakdown of just people of color. You use the exclamation point and the equals sign (!=). That translates to NOT EQUALS.
# `!=` means "not equal": drop every row where race is 1, then tabulate
# gender among the remaining respondents.
cces %>%
  filter(race != 1) %>%
  ct(gender)
## # A tibble: 2 x 3
## gender n pct
## <dbl> <int> <dbl>
## 1 1 7731 0.422
## 2 2 10580 0.578
Do you want to create a dichotomous variable?
# Build a 0/1 indicator: race code 1 becomes 1, every other code becomes 0.
# The string recode specification ("1=1; else =0") is car's syntax, so call
# car::recode() explicitly. A bare recode() only reaches car's version when
# car happens to be attached after dplyr, which also exports a recode().
cces %>%
  mutate(white = car::recode(race, "1=1; else =0")) %>%
  ct(white)
## # A tibble: 2 x 3
## white n pct
## <dbl> <int> <dbl>
## 1 0 18311 0.283
## 2 1 46289 0.717
Or do you want to have multiple levels of a variable?
# Turn the numeric race codes into an ordered, labelled factor with frcode().
# Codes that match none of the conditions are left as NA.
cces %>%
  mutate(
    race = frcode(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Hispanic",
      race == 4 ~ "Asian"
    )
  ) %>%
  ct(race)
## Warning: Factor `race` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 5 x 3
## race n pct
## <fct> <int> <dbl>
## 1 White 46289 0.717
## 2 Black 7926 0.123
## 3 Hispanic 5238 0.081
## 4 Asian 2278 0.035
## 5 <NA> 2869 0.044
But do you notice how you have some NA values? We can still do our count but exclude them by including show_na = FALSE.
# Same labelled race factor as before, but show_na = FALSE leaves the NA
# group out of the table, so pct is computed over the labelled rows only.
cces %>%
  mutate(
    race = frcode(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Hispanic",
      race == 4 ~ "Asian"
    )
  ) %>%
  ct(race, show_na = FALSE)
## # A tibble: 4 x 3
## race n pct
## <fct> <int> <dbl>
## 1 White 46289 0.75
## 2 Black 7926 0.128
## 3 Hispanic 5238 0.085
## 4 Asian 2278 0.037
Or, you can do simple math:
# Derive age from birth year (relative to 2016), then tabulate it.
cces %>%
  mutate(age = 2016 - birthyr) %>%
  ct(age)
## # A tibble: 80 x 3
## age n pct
## <dbl> <int> <dbl>
## 1 18 668 0.01
## 2 19 776 0.012
## 3 20 682 0.011
## 4 21 750 0.012
## 5 22 788 0.012
## 6 23 784 0.012
## 7 24 913 0.014
## 8 25 1002 0.016
## 9 26 1303 0.02
## 10 27 1126 0.017
## # ... with 70 more rows
# Step 1: tabulate the labelled race factor (NAs excluded) and store it.
graph <- cces %>%
  mutate(
    race = frcode(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Hispanic",
      race == 4 ~ "Asian"
    )
  ) %>%
  ct(race, show_na = FALSE)

# Step 2: draw one bar per racial group, with bar height equal to pct.
# The pipe already passes `graph` as ggplot()'s data argument.
graph %>%
  ggplot(aes(x = race, y = pct)) +
  geom_col()
If you want to make dodged bars here’s a tutorial for that:
Let’s visualize the age distribution of each racial group
# Step 1: label race (everything outside codes 1-4 becomes "All Others"),
# compute age, and tabulate age separately within each racial group.
graph <- cces %>%
  mutate(
    race = frcode(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Hispanic",
      race == 4 ~ "Asian",
      TRUE ~ "All Others"
    )
  ) %>%
  mutate(age = 2016 - birthyr) %>%
  group_by(race) %>%
  ct(age)

# Step 2: plot the share at each age, one coloured point-and-line series
# per racial group.
graph %>%
  ggplot(aes(x = age, y = pct, color = race, group = race)) +
  geom_point() +
  geom_line()
Let’s do smoothed lines now
# Step 1: same per-race age table as the line chart.
graph <- cces %>%
  mutate(
    race = frcode(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Hispanic",
      race == 4 ~ "Asian",
      TRUE ~ "All Others"
    )
  ) %>%
  mutate(age = 2016 - birthyr) %>%
  group_by(race) %>%
  ct(age)

# Step 2: keep the points but replace the connecting lines with a smoothed
# trend per group (geom_smooth() picks its default method).
graph %>%
  ggplot(aes(x = age, y = pct, color = race, group = race)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Finding the average age in the dataset
# Compute age, then summarise it with mean_ci(): mean, sd, n, standard
# error, and the lower/upper confidence bounds.
cces %>%
  mutate(age = 2016 - birthyr) %>%
  mean_ci(age)
## # A tibble: 1 x 7
## mean sd n level se lower upper
## <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 47.9 16.8 64600 0.05 0.0662 47.8 48.0
Finding the average age for each racial group
# Label race, compute age, and run mean_ci() once per racial group.
# Unlabelled race codes stay NA and form their own group.
cces %>%
  mutate(
    race = frcode(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Hispanic",
      race == 4 ~ "Asian"
    )
  ) %>%
  mutate(age = 2016 - birthyr) %>%
  group_by(race) %>%
  mean_ci(age)
## Warning: Factor `race` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 5 x 8
## race mean sd n level se lower upper
## <fct> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 White 49.8 16.9 46289 0.05 0.0786 49.6 49.9
## 2 Black 44.7 15.9 7926 0.05 0.178 44.4 45.1
## 3 Hispanic 40.9 14.9 5238 0.05 0.205 40.5 41.3
## 4 Asian 39.4 14.0 2278 0.05 0.293 38.8 40.0
## 5 <NA> 45.5 16.6 2869 0.05 0.310 44.9 46.1
Now, we can graph that
# Step 1: store the per-race mean age and its confidence bounds.
graph <- cces %>%
  mutate(
    race = frcode(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Hispanic",
      race == 4 ~ "Asian"
    )
  ) %>%
  mutate(age = 2016 - birthyr) %>%
  group_by(race) %>%
  mean_ci(age)
## Warning: Factor `race` contains implicit NA, consider using
## `forcats::fct_explicit_na`

# Step 2: plot each group's mean as a point, with an error bar running from
# the lower to the upper confidence bound. The NA group is still included.
graph %>%
  ggplot(aes(x = race, y = mean)) +
  geom_point() +
  geom_errorbar(aes(ymin = lower, ymax = upper))
Graph that without the NAs
# Step 1: store the per-race mean age and its confidence bounds. Race codes
# other than 1-4 are left unlabelled and therefore end up in an NA group.
graph <- cces %>%
  mutate(
    race = frcode(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Hispanic",
      race == 4 ~ "Asian"
    )
  ) %>%
  mutate(age = 2016 - birthyr) %>%
  group_by(race) %>%
  mean_ci(age)
## Warning: Factor `race` contains implicit NA, consider using
## `forcats::fct_explicit_na`

# Step 2: drop the NA group before plotting. Test for missingness with
# is.na(): comparing against the string "NA" (race != "NA") never matches a
# real missing value and only appeared to work because filter() discards
# rows whose condition evaluates to NA.
graph %>%
  filter(!is.na(race)) %>%
  ggplot(aes(x = race, y = mean)) +
  geom_point() +
  geom_errorbar(aes(ymin = lower, ymax = upper))