The R Basics

library(socsci)
library(car)
# Read the small CCES CSV from GitHub; the result is stored as `cces` and is
# the starting point of every example below.
cces <- read_csv("https://raw.githubusercontent.com/ryanburge/cces/master/CCES%20for%20Methods/small_cces.csv")

Taking A Quick Look At Data

Let’s start by taking a look at our dataset using the pipe (%>%) and the glimpse command. You can insert a pipe easily by hitting CTRL + SHIFT + M.

# Print every column of cces with its type and its first few values.
cces %>% glimpse()

## Rows: 64,600
## Columns: 33
## $ X1         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1...
## $ X1_1       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1...
## $ id         <dbl> 222168628, 273691199, 284214415, 287557695, 290387662, 2...
## $ state      <dbl> 33, 22, 29, 1, 8, 1, 48, 42, 13, 42, 15, 48, 12, 48, 20,...
## $ birthyr    <dbl> 1969, 1994, 1964, 1988, 1982, 1963, 1962, 1991, 1963, 19...
## $ gender     <dbl> 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2,...
## $ educ       <dbl> 2, 2, 2, 2, 5, 2, 2, 1, 2, 2, 5, 3, 3, 4, 3, 3, 3, 2, 5,...
## $ race       <dbl> 1, 1, 2, 2, 1, 6, 1, 1, 1, 1, 4, 1, 6, 3, 6, 7, 1, 1, 3,...
## $ marital    <dbl> 1, 5, 5, 5, 1, 4, 2, 2, 1, 1, 1, 1, 1, 6, 5, 5, 5, 1, 1,...
## $ natecon    <dbl> 3, 4, 5, 4, 2, 4, 3, 5, 4, 5, 2, 6, 3, 3, 3, 3, 6, 4, 6,...
## $ mymoney    <dbl> 2, 3, 2, 4, 2, 4, 3, 5, 4, 3, 2, 1, 3, 3, 2, 5, 4, 5, 3,...
## $ econfuture <dbl> 6, 5, 4, 5, 6, 4, 3, 6, 6, 5, 3, 6, 6, 2, 2, 5, 6, 4, 5,...
## $ police     <dbl> 2, 3, 2, 2, 2, 3, 2, 2, 2, 1, 2, 1, 3, 2, 1, 4, 2, 1, 2,...
## $ background <dbl> 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,...
## $ registry   <dbl> 2, 1, 2, 2, 2, 1, 1, 2, 1, 1, 8, 1, 1, 2, 1, 2, 1, 1, 1,...
## $ assaultban <dbl> 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1,...
## $ conceal    <dbl> 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1,...
## $ pathway    <dbl> 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2,...
## $ border     <dbl> 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2,...
## $ dreamer    <dbl> 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 1, 1, 2, 2,...
## $ deport     <dbl> 1, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2,...
## $ prochoice  <dbl> 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1,...
## $ prolife    <dbl> 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1,...
## $ gaym       <dbl> 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,...
## $ employ     <dbl> 7, 7, 6, 6, 2, 6, 1, 4, 3, 1, 5, 1, 7, 2, 1, 4, 4, 6, 4,...
## $ pid7       <dbl> 5, 4, 1, 4, 2, 2, 6, 4, 7, 7, 2, 4, 4, 1, 4, 1, 2, 2, 2,...
## $ attend     <dbl> 6, 8, 3, 4, 6, 2, 2, 5, 4, 5, 5, 5, 4, 6, 6, 6, 6, 1, 7,...
## $ religion   <dbl> 11, 98, 2, 11, 10, 1, 1, 11, 11, 2, 1, 1, 10, 11, 9, 11,...
## $ vote16     <dbl> 1, 1, NA, NA, 2, 99, 1, NA, 1, 1, 2, 5, 99, NA, 5, 4, 99...
## $ ideo5      <dbl> 3, 3, 5, 4, 2, 6, 5, 3, 4, 3, 3, 4, 6, 1, 3, 1, 2, 3, 3,...
## $ union      <dbl> 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ income     <dbl> 97, 6, 4, 1, 7, 1, 3, 1, 4, 7, 10, 5, 2, 2, 3, 1, 6, 3, ...
## $ sexuality  <dbl> 1, 1, 1, 1, 1, 1, 1, 9, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1,...

Counting

One of the most helpful ways to get a simple sense of how many respondents of each gender are in the dataset is to use the ct command. Again, this uses cces and then the pipe. You need to take a look at the codebook to understand what one and two represent. I know that 1 = male and 2 = female.

# Count respondents in each gender category; ct() reports the count (n) and
# the share of the total (pct).
cces %>% 
  ct(gender)

## # A tibble: 2 x 3
##   gender     n   pct
##    <dbl> <int> <dbl>
## 1      1 29531 0.457
## 2      2 35069 0.543

Filtering

R has a lot of cool functions. One of the most helpful is called filter. It will filter the data based on whatever parameters we have set up.

Here is the racial breakdown of women in the dataset. It’s important that you use == (2 equal signs) when testing for equality.

# Keep only rows where gender is 2 (female per the codebook), then tabulate race.
cces %>% 
  filter(gender == 2) %>% 
  ct(race)

## # A tibble: 8 x 3
##    race     n   pct
##   <dbl> <int> <dbl>
## 1     1 24489 0.698
## 2     2  5137 0.146
## 3     3  2894 0.083
## 4     4  1115 0.032
## 5     5   248 0.007
## 6     6   858 0.024
## 7     7   261 0.007
## 8     8    67 0.002

Recoding

Do you want to create a dichotomous variable? Use the case_when function. When looking at the codebook, we see that white is 1. So this race == 1 ~ 1, tells R to take the race variable and when it equals one, then convert that to the number 1. The TRUE ~ 0 means, take everything else and make it zero.

# Build a 0/1 indicator: 1 when race is coded 1 (white), 0 for every other row,
# then count how many respondents fall in each category.
cces %>% 
  mutate(white = case_when(race == 1 ~ 1,
                           TRUE ~ 0)) %>% 
  ct(white)

## # A tibble: 2 x 3
##   white     n   pct
##   <dbl> <int> <dbl>
## 1     0 18311 0.283
## 2     1 46289 0.717

Or do you want to have multiple levels of a variable?

# Give race codes 1-4 readable labels. frcode() returns a factor whose levels
# follow the order the conditions are listed in; codes without a label end up
# as NA and are counted in their own row.
cces %>%
  mutate(
    race = frcode(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Hispanic",
      race == 4 ~ "Asian"
    )
  ) %>%
  ct(race)

## # A tibble: 5 x 3
##   race         n   pct
##   <fct>    <int> <dbl>
## 1 White    46289 0.717
## 2 Black     7926 0.123
## 3 Hispanic  5238 0.081
## 4 Asian     2278 0.035
## 5 <NA>      2869 0.044

But do you notice how you have some NA values? Those are the respondents whose race code (5 through 8) was not given a label. We can still do our count but exclude them by including show_na = FALSE

# Same recode as before, but show_na = FALSE leaves the unlabelled (NA) rows
# out of the count, so pct is computed over the four labelled groups only.
cces %>%
  mutate(
    race = frcode(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Hispanic",
      race == 4 ~ "Asian"
    )
  ) %>%
  ct(race, show_na = FALSE)

## # A tibble: 4 x 3
##   race         n   pct
##   <fct>    <int> <dbl>
## 1 White    46289 0.75 
## 2 Black     7926 0.128
## 3 Hispanic  5238 0.085
## 4 Asian     2278 0.037

Or, you can do simple math:

# Derive age from birth year, then count respondents at each age.
# NOTE(review): 2016 is presumably the survey year; confirm against the codebook.
cces %>% 
  mutate(age = 2016 - birthyr) %>% 
  ct(age)

## # A tibble: 80 x 3
##      age     n   pct
##    <dbl> <int> <dbl>
##  1    18   668 0.01 
##  2    19   776 0.012
##  3    20   682 0.011
##  4    21   750 0.012
##  5    22   788 0.012
##  6    23   784 0.012
##  7    24   913 0.014
##  8    25  1002 0.016
##  9    26  1303 0.02 
## 10    27  1126 0.017
## # ... with 70 more rows

Making a Bar Chart

# Share of respondents in each labelled racial group; unlabelled (NA) rows are
# excluded from the count.
graph <- cces %>%
  mutate(
    race = frcode(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Hispanic",
      race == 4 ~ "Asian"
    )
  ) %>%
  ct(race, show_na = FALSE)

# One bar per group; bar height is that group's share of respondents.
ggplot(graph, aes(x = race, y = pct)) +
  geom_col()

If you want to make dodged bars here’s a tutorial for that:

Dodged Bars

Making a Line Graph

# Age distribution within each racial group.
# Codes 1-4 get their own label and every remaining code is pooled into
# "All Others". Because the data are grouped by race before ct(), the counts
# are produced separately for each group.
graph <- cces %>%
  mutate(
    race = frcode(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Hispanic",
      race == 4 ~ "Asian",
      TRUE ~ "All Others"
    ),
    age = 2016 - birthyr
  ) %>%
  group_by(race) %>%
  ct(age) %>%
  # Drop the grouping so the stored result behaves like a plain tibble in any
  # later step; the plot below maps race explicitly and does not rely on it.
  ungroup()

# One line (and set of points) per racial group across age.
graph %>%
  ggplot(aes(x = age, y = pct, color = race, group = race)) +
  geom_point() +
  geom_line()

Let’s do smoothed lines now.

# Age distribution within each racial group (same table as the line graph).
# Codes 1-4 get their own label and every remaining code is pooled into
# "All Others". Because the data are grouped by race before ct(), the counts
# are produced separately for each group.
graph <- cces %>%
  mutate(
    race = frcode(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Hispanic",
      race == 4 ~ "Asian",
      TRUE ~ "All Others"
    ),
    age = 2016 - birthyr
  ) %>%
  group_by(race) %>%
  ct(age) %>%
  # Drop the grouping so the stored result behaves like a plain tibble in any
  # later step; the plot below maps race explicitly and does not rely on it.
  ungroup()

# Points plus a smoothed trend per racial group; geom_smooth() picks its
# default smoother and reports which one it used.
graph %>%
  ggplot(aes(x = age, y = pct, color = race, group = race)) +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Finding the Mean + Graph

# Mean age across the full sample; mean_ci() also returns sd, n, the standard
# error and the lower/upper interval bounds.
cces %>% 
  mutate(age = 2016 - birthyr) %>% 
  mean_ci(age)

## # A tibble: 1 x 7
##    mean    sd     n level     se lower upper
##   <dbl> <dbl> <int> <dbl>  <dbl> <dbl> <dbl>
## 1  47.9  16.8 64600  0.05 0.0662  47.8  48.0

Finding the average age for each racial group.

# Mean age (with sd, n, standard error and interval bounds) for each racial
# group. Respondents whose race code has no label form their own NA group.
cces %>%
  mutate(
    race = frcode(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Hispanic",
      race == 4 ~ "Asian"
    ),
    age = 2016 - birthyr
  ) %>%
  group_by(race) %>%
  mean_ci(age)

## # A tibble: 5 x 8
##   race      mean    sd     n level     se lower upper
##   <fct>    <dbl> <dbl> <int> <dbl>  <dbl> <dbl> <dbl>
## 1 White     49.8  16.9 46289  0.05 0.0786  49.6  49.9
## 2 Black     44.7  15.9  7926  0.05 0.178   44.4  45.1
## 3 Hispanic  40.9  14.9  5238  0.05 0.205   40.5  41.3
## 4 Asian     39.4  14.0  2278  0.05 0.293   38.8  40.0
## 5 <NA>      45.5  16.6  2869  0.05 0.310   44.9  46.1

Now, we can graph that

# Mean age and interval bounds per racial group, stored for plotting.
# Respondents whose race code has no label form their own NA group.
graph <- cces %>%
  mutate(
    race = frcode(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Hispanic",
      race == 4 ~ "Asian"
    ),
    age = 2016 - birthyr
  ) %>%
  group_by(race) %>%
  mean_ci(age)

# A point at each group's mean age, with an error bar from lower to upper.
ggplot(graph, aes(x = race, y = mean)) +
  geom_point() +
  geom_errorbar(aes(ymin = lower, ymax = upper))

Graph that without NAs

# Same plot, leaving out the row for respondents with no race label.
# Missing values are tested with is.na(): comparing against the string "NA"
# only drops them as a side effect of the comparison itself returning NA, and
# it would also drop a genuine level that happened to be named "NA".
graph %>%
  filter(!is.na(race)) %>%
  ggplot(aes(x = race, y = mean)) +
  geom_point() +
  geom_errorbar(aes(ymin = lower, ymax = upper))