Chapter 15
Introduction
# Print a per-column summary of gss_cat (column types, missing values,
# distributions). skim() is called via `::`, so skimr need not be attached.
skimr::skim(gss_cat)
Data summary
Name |
gss_cat |
Number of rows |
21483 |
Number of columns |
9 |
_______________________ |
|
Column type frequency: |
|
factor |
6 |
numeric |
3 |
________________________ |
|
Group variables |
None |
Variable type: factor
marital |
0 |
1 |
FALSE |
6 |
Mar: 10117, Nev: 5416, Div: 3383, Wid: 1807 |
race |
0 |
1 |
FALSE |
3 |
Whi: 16395, Bla: 3129, Oth: 1959, Not: 0 |
rincome |
0 |
1 |
FALSE |
16 |
$25: 7363, Not: 7043, $20: 1283, $10: 1168 |
partyid |
0 |
1 |
FALSE |
10 |
Ind: 4119, Not: 3690, Str: 3490, Not: 3032 |
relig |
0 |
1 |
FALSE |
15 |
Pro: 10846, Cat: 5124, Non: 3523, Chr: 689 |
denom |
0 |
1 |
FALSE |
30 |
Not: 10072, Oth: 2534, No : 1683, Sou: 1536 |
Variable type: numeric
year |
0 |
1.00 |
2006.50 |
4.45 |
2000 |
2002 |
2006 |
2010 |
2014 |
▇▃▇▂▆ |
age |
76 |
1.00 |
47.18 |
17.29 |
18 |
33 |
46 |
59 |
89 |
▇▇▇▅▂ |
tvhours |
10146 |
0.53 |
2.98 |
2.59 |
0 |
1 |
2 |
4 |
24 |
▇▂▁▁▁ |
Creating Factors
# Two character vectors of month abbreviations; x2 contains the typo "Jam".
x1 <- c("Dec", "Apr", "Jan", "Mar")
x2 <- c("Dec", "Apr", "Jam", "Mar")
# The valid levels in calendar order. Base R's month.abb constant holds
# exactly "Jan", "Feb", ..., "Dec".
month_levels <- month.abb
# Build a factor from the clean vector using that level set.
y1 <- factor(x = x1, levels = month_levels)
y1
## [1] Dec Apr Jan Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# Sorting follows the level order rather than the alphabet.
sort(y1)
## [1] Jan Mar Apr Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# Values that are not in the level set (the typo "Jam") silently become NA.
y2 <- factor(x = x2, levels = month_levels)
y2
## [1] Dec Apr <NA> Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# parse_factor() raises a warning for values outside the level set instead
# of silently turning them into NA.
y2 <- parse_factor(x = x2, levels = month_levels)
## Warning: 1 parsing failure.
## row col expected actual
## 3 -- value in level set Jam
General Social Survey
# Print the gss_cat tibble to inspect its columns and first rows.
gss_cat
## # A tibble: 21,483 × 9
## year marital age race rincome partyid relig denom tvhours
## <int> <fct> <int> <fct> <fct> <fct> <fct> <fct> <int>
## 1 2000 Never married 26 White $8000 to 9999 Ind,near … Prot… Sout… 12
## 2 2000 Divorced 48 White $8000 to 9999 Not str r… Prot… Bapt… NA
## 3 2000 Widowed 67 White Not applicable Independe… Prot… No d… 2
## 4 2000 Never married 39 White Not applicable Ind,near … Orth… Not … 4
## 5 2000 Divorced 25 White Not applicable Not str d… None Not … 1
## 6 2000 Married 25 White $20000 - 24999 Strong de… Prot… Sout… NA
## 7 2000 Never married 36 White $25000 or more Not str r… Chri… Not … 3
## 8 2000 Divorced 44 White $7000 to 7999 Ind,near … Prot… Luth… NA
## 9 2000 Married 44 White $25000 or more Not str d… Prot… Other 0
## 10 2000 Married 47 White $25000 or more Strong re… Prot… Sout… 3
## # ℹ 21,473 more rows
Modifying Factor Order
Unordered Factor Levels
# Mean daily TV hours per religion; missing tvhours values are ignored.
tvhours_by_relig <- summarise(
  group_by(gss_cat, relig),
  avg_tvhours = mean(tvhours, na.rm = TRUE)
)
tvhours_by_relig
## # A tibble: 15 × 2
## relig avg_tvhours
## <fct> <dbl>
## 1 No answer 2.72
## 2 Don't know 4.62
## 3 Inter-nondenominational 2.87
## 4 Native american 3.46
## 5 Christian 2.79
## 6 Orthodox-christian 2.42
## 7 Moslem/islam 2.44
## 8 Other eastern 1.67
## 9 Hinduism 1.89
## 10 Buddhism 2.38
## 11 Other 2.73
## 12 None 2.71
## 13 Jewish 2.52
## 14 Catholic 2.96
## 15 Protestant 3.15
# Scatter plot of mean TV hours per religion, with the y axis following the
# factor's existing level order.
ggplot(data = tvhours_by_relig, mapping = aes(x = avg_tvhours, y = relig)) +
  geom_point()

Ordered Factor Levels
# Same scatter plot, with the religions reordered by their mean TV hours.
ggplot(
  data = tvhours_by_relig,
  mapping = aes(x = avg_tvhours, y = fct_reorder(relig, avg_tvhours))
) +
  geom_point() +
  # Give the x axis a readable title and drop the y-axis title.
  labs(x = "Mean Daily TV Hours Watched", y = NULL)

Moving a Single Level to the Front
# Reorder the religions by mean TV hours, then move "Don't know" to the
# front of the level order before plotting.
ggplot(
  data = tvhours_by_relig,
  mapping = aes(
    x = avg_tvhours,
    y = fct_relevel(fct_reorder(relig, avg_tvhours), "Don't know")
  )
) +
  geom_point() +
  # Give the x axis a readable title and drop the y-axis title.
  labs(x = "Mean Daily TV Hours Watched", y = NULL)

Modifying Factor Levels
# List the distinct party identification values present in the data.
distinct(gss_cat, partyid)
## # A tibble: 10 × 1
## partyid
## <fct>
## 1 Ind,near rep
## 2 Not str republican
## 3 Independent
## 4 Not str democrat
## 5 Strong democrat
## 6 Ind,near dem
## 7 Strong republican
## 8 Other party
## 9 No answer
## 10 Don't know
# Number of respondents per party identification level.
count(gss_cat, partyid)
## # A tibble: 10 × 2
## partyid n
## <fct> <int>
## 1 No answer 154
## 2 Don't know 1
## 3 Other party 393
## 4 Strong republican 2314
## 5 Not str republican 3032
## 6 Ind,near rep 1791
## 7 Independent 4119
## 8 Ind,near dem 2499
## 9 Not str democrat 3690
## 10 Strong democrat 3490
# List the distinct race values present in the data.
distinct(gss_cat, race)
## # A tibble: 3 × 1
## race
## <fct>
## 1 White
## 2 Black
## 3 Other
# Recode: rename the "Black" level to "POC" in a new column. Only rows whose
# original race is "Black" are shown, so the old and new values can be
# compared side by side.
gss_cat %>%
  filter(race == "Black") %>%
  mutate(race_rev = fct_recode(.f = race, POC = "Black")) %>%
  select(race, race_rev)
## # A tibble: 3,129 × 2
## race race_rev
## <fct> <fct>
## 1 Black POC
## 2 Black POC
## 3 Black POC
## 4 Black POC
## 5 Black POC
## 6 Black POC
## 7 Black POC
## 8 Black POC
## 9 Black POC
## 10 Black POC
## # ℹ 3,119 more rows
# Collapse the "Black" and "Other" levels into a single "Minority" level in
# a new column. Only the non-"White" rows are shown.
gss_cat %>%
  filter(race != "White") %>%
  mutate(race_col = fct_collapse(.f = race, Minority = c("Black", "Other"))) %>%
  select(race, race_col)
## # A tibble: 5,088 × 2
## race race_col
## <fct> <fct>
## 1 Black Minority
## 2 Black Minority
## 3 Black Minority
## 4 Other Minority
## 5 Black Minority
## 6 Other Minority
## 7 Black Minority
## 8 Other Minority
## 9 Black Minority
## 10 Black Minority
## # ℹ 5,078 more rows
# Lump small levels into an "Other" level.
# First, the number of respondents per race level before lumping:
count(gss_cat, race)
## # A tibble: 3 × 2
## race n
## <fct> <int>
## 1 Other 1959
## 2 Black 3129
## 3 White 16395
# Lump the least frequent race levels into "Other" and list the levels that
# remain. fct_lump() is superseded in forcats; when neither `n` nor `prop`
# is given it dispatches to fct_lump_lowfreq(), so the explicit variant is
# called here and yields the same result.
gss_cat %>%
  mutate(race_lump = fct_lump_lowfreq(race)) %>%
  distinct(race_lump)
## # A tibble: 2 × 1
## race_lump
## <fct>
## 1 White
## 2 Other
Chapter 16
Introduction
Creating Dates/Times
Date-Time Components
Time Spans