loops

loops, functions, apply family

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6     v purrr   0.3.4
## v tibble  3.1.8     v dplyr   1.0.9
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1

## Warning: package 'tidyr' was built under R version 4.0.5

## Warning: package 'readr' was built under R version 4.0.5

## Warning: package 'forcats' was built under R version 4.0.4

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Work on a copy of the built-in mtcars data frame.
dataset <- mtcars

# Printing preferences: `scipen = 3` biases output towards fixed notation,
# `digits = 4` sets the number of significant digits to print.
options(scipen = 3, digits = 4)

functions

If you are repeating a chunk of code more than once, it's a good idea to turn it into a function. For example:

# Standardize three columns by hand: z = (x - mean(x)) / sd(x).
# The subtraction needs its own parentheses; `/` binds tighter than `-`, so
# without them only the mean would be divided by the standard deviation.
dataset$Zdisp <- (dataset$disp - mean(dataset$disp)) / sd(dataset$disp)
dataset$Zhp <- (dataset$hp - mean(dataset$hp)) / sd(dataset$hp)
dataset$Zdrat <- (dataset$drat - mean(dataset$drat)) / sd(dataset$drat)

The code isn't easy to read line by line, and the repetition opens up opportunities for errors. Write a function to make the code more efficient:

# Start again from the untouched mtcars data.
dataset <- mtcars

# Convert a numeric vector to z-scores: subtract its mean, then divide by its
# standard deviation. Returns a vector of the same length as `x`.
standardize <- function(x) {
  centered <- x - mean(x)
  centered / sd(x)
}

# One call per column replaces the three hand-written formulas.
dataset[["Zdisp"]] <- standardize(dataset[["disp"]])
dataset[["Zhp"]] <- standardize(dataset[["hp"]])
dataset[["Zdrat"]] <- standardize(dataset[["drat"]])

Loops and control statements

Use of loops should be limited as much as possible because they are not efficient. The two most common loops are the for loop and the while loop.

# Keep only the raw measurements and the z-score columns used in the examples.
dataset <- dplyr::select(
  dataset,
  disp, hp, drat, wt, qsec, Zdisp, Zhp, Zdrat
)

print(dataset)

##                      disp  hp drat    wt  qsec    Zdisp      Zhp    Zdrat
## Mazda RX4           160.0 110 3.90 2.620 16.46 -0.57062 -0.53509  0.56751
## Mazda RX4 Wag       160.0 110 3.90 2.875 17.02 -0.57062 -0.53509  0.56751
## Datsun 710          108.0  93 3.85 2.320 18.61 -0.99018 -0.78304  0.47400
## Hornet 4 Drive      258.0 110 3.08 3.215 19.44  0.22009 -0.53509 -0.96612
## Hornet Sportabout   360.0 175 3.15 3.440 17.02  1.04308  0.41294 -0.83520
## Valiant             225.0 105 2.76 3.460 20.22 -0.04617 -0.60802 -1.56461
## Duster 360          360.0 245 3.21 3.570 15.84  1.04308  1.43390 -0.72298
## Merc 240D           146.7  62 3.69 3.190 20.00 -0.67793 -1.23518  0.17475
## Merc 230            140.8  95 3.92 3.150 22.90 -0.72554 -0.75387  0.60492
## Merc 280            167.6 123 3.92 3.440 18.30 -0.50930 -0.34549  0.60492
## Merc 280C           167.6 123 3.92 3.440 18.90 -0.50930 -0.34549  0.60492
## Merc 450SE          275.8 180 3.07 4.070 17.40  0.36371  0.48587 -0.98482
## Merc 450SL          275.8 180 3.07 3.730 17.60  0.36371  0.48587 -0.98482
## Merc 450SLC         275.8 180 3.07 3.780 18.00  0.36371  0.48587 -0.98482
## Cadillac Fleetwood  472.0 205 2.93 5.250 17.98  1.94675  0.85050 -1.24666
## Lincoln Continental 460.0 215 3.00 5.424 17.82  1.84993  0.99635 -1.11574
## Chrysler Imperial   440.0 230 3.23 5.345 17.42  1.68856  1.21513 -0.68558
## Fiat 128             78.7  66 4.08 2.200 19.47 -1.22659 -1.17684  0.90416
## Honda Civic          75.7  52 4.93 1.615 18.52 -1.25079 -1.38103  2.49390
## Toyota Corolla       71.1  65 4.22 1.835 19.90 -1.28791 -1.19142  1.16600
## Toyota Corona       120.1  97 3.70 2.465 20.01 -0.89255 -0.72470  0.19346
## Dodge Challenger    318.0 150 2.76 3.520 16.87  0.70420  0.04831 -1.56461
## AMC Javelin         304.0 150 3.15 3.435 17.30  0.59124  0.04831 -0.83520
## Camaro Z28          350.0 245 3.73 3.840 15.41  0.96240  1.43390  0.24957
## Pontiac Firebird    400.0 175 3.08 3.845 17.05  1.36582  0.41294 -0.96612
## Fiat X1-9            79.0  66 4.08 1.935 18.90 -1.22417 -1.17684  0.90416
## Porsche 914-2       120.3  91 4.43 2.140 16.70 -0.89094 -0.81221  1.55876
## Lotus Europa         95.1 113 3.77 1.513 16.90 -1.09427 -0.49134  0.32438
## Ford Pantera L      351.0 264 4.22 3.170 14.50  0.97046  1.71102  1.16600
## Ferrari Dino        145.0 175 3.62 2.770 15.50 -0.69165  0.41294  0.04383
## Maserati Bora       301.0 335 3.54 3.570 14.60  0.56704  2.74657 -0.10579
## Volvo 142E          121.0 109 4.11 2.780 18.60 -0.88529 -0.54968  0.96027

This example gives the name “i” to an index that starts at 1 and goes up to a maximum, taken to be the number of columns in the dataset.

loops are computationally slow and they are prone to bugs. Apply is usually a better approach which will be shown below

# Pre-allocate a numeric result vector with one slot per column.
# numeric() is used instead of vector(length = ...), which would create a
# logical vector that only becomes numeric on the first assignment.
# NOTE: this variable shares its name with base::colMeans(); a call such as
# colMeans(x) still finds the function, but a distinct name would be clearer.
colMeans <- numeric(ncol(dataset))

# seq_len() yields an empty sequence for zero columns, whereas 1:0 would
# iterate over c(1, 0).
for (i in seq_len(ncol(dataset))) {
  # [[i]] extracts column i as a plain vector.
  colMeans[i] <- mean(dataset[[i]])
}

colMeans

## [1]  2.307e+02  1.467e+02  3.597e+00  3.217e+00  1.785e+01 -9.085e-17  1.041e-17
## [8] -2.919e-16

# Pre-allocate a 5 x 5 character matrix. Starting from "" rather than 0 avoids
# silently coercing a numeric matrix to character on the first assignment.
emptyMatrix <- matrix("", nrow = 5, ncol = 5)

# Nested loop: the outer index walks the rows, the inner index the columns.
# Each cell is filled with its own "row,column" coordinates.
for (i in seq_len(nrow(emptyMatrix))) {
  for (j in seq_len(ncol(emptyMatrix))) {
    emptyMatrix[i, j] <- paste0(i, ",", j)
  }
}

emptyMatrix

##      [,1]  [,2]  [,3]  [,4]  [,5] 
## [1,] "1,1" "1,2" "1,3" "1,4" "1,5"
## [2,] "2,1" "2,2" "2,3" "2,4" "2,5"
## [3,] "3,1" "3,2" "3,3" "3,4" "3,5"
## [4,] "4,1" "4,2" "4,3" "4,4" "4,5"
## [5,] "5,1" "5,2" "5,3" "5,4" "5,5"

now look at while loops

# Fix the random seed so the simulated path is reproducible.
set.seed(555)
stock <- 300 # starting value
days <- 1 # day counter

# Keep simulating daily changes until the value reaches 350.
while (stock < 350) {
  # Daily change: one draw from a uniform(-5, 20) distribution.
  stock <- stock + runif(n = 1, min = -5, max = 20)

  days <- days + 1

  print(days)
}

## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6

stock

## [1] 353.2

ifelse statement

# Flag the cars whose horsepower is at or above the sample mean.
# Use the standardized column (Zhp), not raw hp: raw horsepower is always
# positive, so comparing it with 0 would flag every car as above average.
Zhp <- dataset$Zhp
# ifelse() is vectorized: TRUE where the z-score is >= 0, FALSE elsewhere.
# (Equivalent to `Zhp >= 0`; written out here to demonstrate ifelse().)
HP_aboveAverage <- ifelse(Zhp >= 0, TRUE, FALSE)

HP_df <- data.frame(Zhp = Zhp, HP_aboveAverage = HP_aboveAverage)
HP_df

##         Zhp HP_aboveAverage
## 1  -0.53509           FALSE
## 2  -0.53509           FALSE
## 3  -0.78304           FALSE
## 4  -0.53509           FALSE
## 5   0.41294            TRUE
## 6  -0.60802           FALSE
## 7   1.43390            TRUE
## 8  -1.23518           FALSE
## 9  -0.75387           FALSE
## 10 -0.34549           FALSE
## 11 -0.34549           FALSE
## 12  0.48587            TRUE
## 13  0.48587            TRUE
## 14  0.48587            TRUE
## 15  0.85050            TRUE
## 16  0.99635            TRUE
## 17  1.21513            TRUE
## 18 -1.17684           FALSE
## 19 -1.38103           FALSE
## 20 -1.19142           FALSE
## 21 -0.72470           FALSE
## 22  0.04831            TRUE
## 23  0.04831            TRUE
## 24  1.43390            TRUE
## 25  0.41294            TRUE
## 26 -1.17684           FALSE
## 27 -0.81221           FALSE
## 28 -0.49134           FALSE
## 29  1.71102            TRUE
## 30  0.41294            TRUE
## 31  2.74657            TRUE
## 32 -0.54968           FALSE

Apply

Apply is much faster than loop

dataset

##                      disp  hp drat    wt  qsec    Zdisp      Zhp    Zdrat
## Mazda RX4           160.0 110 3.90 2.620 16.46 -0.57062 -0.53509  0.56751
## Mazda RX4 Wag       160.0 110 3.90 2.875 17.02 -0.57062 -0.53509  0.56751
## Datsun 710          108.0  93 3.85 2.320 18.61 -0.99018 -0.78304  0.47400
## Hornet 4 Drive      258.0 110 3.08 3.215 19.44  0.22009 -0.53509 -0.96612
## Hornet Sportabout   360.0 175 3.15 3.440 17.02  1.04308  0.41294 -0.83520
## Valiant             225.0 105 2.76 3.460 20.22 -0.04617 -0.60802 -1.56461
## Duster 360          360.0 245 3.21 3.570 15.84  1.04308  1.43390 -0.72298
## Merc 240D           146.7  62 3.69 3.190 20.00 -0.67793 -1.23518  0.17475
## Merc 230            140.8  95 3.92 3.150 22.90 -0.72554 -0.75387  0.60492
## Merc 280            167.6 123 3.92 3.440 18.30 -0.50930 -0.34549  0.60492
## Merc 280C           167.6 123 3.92 3.440 18.90 -0.50930 -0.34549  0.60492
## Merc 450SE          275.8 180 3.07 4.070 17.40  0.36371  0.48587 -0.98482
## Merc 450SL          275.8 180 3.07 3.730 17.60  0.36371  0.48587 -0.98482
## Merc 450SLC         275.8 180 3.07 3.780 18.00  0.36371  0.48587 -0.98482
## Cadillac Fleetwood  472.0 205 2.93 5.250 17.98  1.94675  0.85050 -1.24666
## Lincoln Continental 460.0 215 3.00 5.424 17.82  1.84993  0.99635 -1.11574
## Chrysler Imperial   440.0 230 3.23 5.345 17.42  1.68856  1.21513 -0.68558
## Fiat 128             78.7  66 4.08 2.200 19.47 -1.22659 -1.17684  0.90416
## Honda Civic          75.7  52 4.93 1.615 18.52 -1.25079 -1.38103  2.49390
## Toyota Corolla       71.1  65 4.22 1.835 19.90 -1.28791 -1.19142  1.16600
## Toyota Corona       120.1  97 3.70 2.465 20.01 -0.89255 -0.72470  0.19346
## Dodge Challenger    318.0 150 2.76 3.520 16.87  0.70420  0.04831 -1.56461
## AMC Javelin         304.0 150 3.15 3.435 17.30  0.59124  0.04831 -0.83520
## Camaro Z28          350.0 245 3.73 3.840 15.41  0.96240  1.43390  0.24957
## Pontiac Firebird    400.0 175 3.08 3.845 17.05  1.36582  0.41294 -0.96612
## Fiat X1-9            79.0  66 4.08 1.935 18.90 -1.22417 -1.17684  0.90416
## Porsche 914-2       120.3  91 4.43 2.140 16.70 -0.89094 -0.81221  1.55876
## Lotus Europa         95.1 113 3.77 1.513 16.90 -1.09427 -0.49134  0.32438
## Ford Pantera L      351.0 264 4.22 3.170 14.50  0.97046  1.71102  1.16600
## Ferrari Dino        145.0 175 3.62 2.770 15.50 -0.69165  0.41294  0.04383
## Maserati Bora       301.0 335 3.54 3.570 14.60  0.56704  2.74657 -0.10579
## Volvo 142E          121.0 109 4.11 2.780 18.60 -0.88529 -0.54968  0.96027

Applies the mean() function over the columns of the dataset data frame to return a vector of means. The three arguments here are: an array or matrix (a data frame is not an array, so apply() first coerces it to a matrix with as.matrix()), a margin (1 for rows, 2 for columns), and a function to apply over that margin (here mean()).

# Column-wise means: MARGIN = 2 applies FUN to each column.
apply(X = dataset, MARGIN = 2, FUN = mean)

##       disp         hp       drat         wt       qsec      Zdisp        Zhp 
##  2.307e+02  1.467e+02  3.597e+00  3.217e+00  1.785e+01 -9.085e-17  1.041e-17 
##      Zdrat 
## -2.919e-16

there are many other helpful functions in this family, the most commonly used are lapply() and sapply(). Others are vapply(), tapply(), mapply()

# A list holding three vectors of different length and type.
x <- list(
  a = 1.5,
  beta = exp(-3:3),
  logic = c(TRUE, FALSE, FALSE, TRUE)
)

get the mean for each item from the list of different vectors

# Apply mean() to every element of the list; the result is a list.
lapply(X = x, FUN = mean)

## $a
## [1] 1.5
## 
## $beta
## [1] 4.535
## 
## $logic
## [1] 0.5

this functionality is not limited to functions which return singular values. example using quantile() below:

# Extra arguments (here `probs`) are forwarded to quantile() for each element.
lapply(X = x, FUN = quantile, probs = c(0.25, 0.5, 0.75))

## $a
## 25% 50% 75% 
## 1.5 1.5 1.5 
## 
## $beta
##    25%    50%    75% 
## 0.2516 1.0000 5.0537 
## 
## $logic
## 25% 50% 75% 
## 0.0 0.5 1.0

lapply() returns a list, whereas sapply() will simplify the output to a vector or a matrix where possible.

# Same computation as with lapply(), simplified to a named numeric vector.
sapply(X = x, FUN = mean)

##     a  beta logic 
## 1.500 4.535 0.500

# Three quantiles per element, simplified to a matrix (one column per element).
sapply(X = x, FUN = quantile, probs = c(0.25, 0.5, 0.75))

##       a   beta logic
## 25% 1.5 0.2516   0.0
## 50% 1.5 1.0000   0.5
## 75% 1.5 5.0537   1.0

these are very useful functions, going back to top example, repeat it using sapply

# Standardize the three raw columns in a single call; sapply() returns a
# matrix with one column per input column.
Zmatrix <- sapply(X = dataset[c("disp", "hp", "drat")], FUN = standardize)

# Convert to a data frame and give the columns their z-score names.
Zscores <- setNames(data.frame(Zmatrix), c("Zdisp", "Zhp", "Zdrat"))
print(Zscores)

##       Zdisp      Zhp    Zdrat
## 1  -0.57062 -0.53509  0.56751
## 2  -0.57062 -0.53509  0.56751
## 3  -0.99018 -0.78304  0.47400
## 4   0.22009 -0.53509 -0.96612
## 5   1.04308  0.41294 -0.83520
## 6  -0.04617 -0.60802 -1.56461
## 7   1.04308  1.43390 -0.72298
## 8  -0.67793 -1.23518  0.17475
## 9  -0.72554 -0.75387  0.60492
## 10 -0.50930 -0.34549  0.60492
## 11 -0.50930 -0.34549  0.60492
## 12  0.36371  0.48587 -0.98482
## 13  0.36371  0.48587 -0.98482
## 14  0.36371  0.48587 -0.98482
## 15  1.94675  0.85050 -1.24666
## 16  1.84993  0.99635 -1.11574
## 17  1.68856  1.21513 -0.68558
## 18 -1.22659 -1.17684  0.90416
## 19 -1.25079 -1.38103  2.49390
## 20 -1.28791 -1.19142  1.16600
## 21 -0.89255 -0.72470  0.19346
## 22  0.70420  0.04831 -1.56461
## 23  0.59124  0.04831 -0.83520
## 24  0.96240  1.43390  0.24957
## 25  1.36582  0.41294 -0.96612
## 26 -1.22417 -1.17684  0.90416
## 27 -0.89094 -0.81221  1.55876
## 28 -1.09427 -0.49134  0.32438
## 29  0.97046  1.71102  1.16600
## 30 -0.69165  0.41294  0.04383
## 31  0.56704  2.74657 -0.10579
## 32 -0.88529 -0.54968  0.96027

We will explore the “forcats” package of the tidyverse. In this tutorial we will examine the following things:

Factors and their levels
Modifying factor order
Modifying factor levels

Please note that some of the code in this tutorial was adapted from Chapter 15 of the book “R for Data Science” by Hadley Wickham and Garrett Grolemund. The full book can be found at: https://r4ds.had.co.nz/#

A good cheat sheet for forcats functions can be found at: https://rstudio.com/resources/cheatsheets/

Another good tutorial for forcats: https://www.r-bloggers.com/2020/06/working-with-factors-in-r-tutorial-forcats-package/

Factors and their levels

Let’s look at the motivation for factors.

# Month abbreviations stored as plain character vectors.
x1 <- c("Dec", "Apr", "Mar", "Jan")
# Same first three months, but the last entry is misspelled on purpose.
x2 <- c(x1[1:3], "Jam")

It is important to define the levels of factors. Because we are looking at month variables, our levels are the various months.

# Calendar-ordered month abbreviations; month.abb is base R's built-in
# constant holding "Jan" through "Dec".
month_levels <- month.abb

Now let’s see what happens when we pass these levels to the levels argument of the factor() function.

# Build a factor whose levels follow calendar order.
y1 <- factor(x = x1, levels = month_levels)
print(y1)

## [1] Dec Apr Mar Jan
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec

# Same levels, applied to the vector that contains the misspelled month.
y2 <- factor(x = x2, levels = month_levels)
print(y2)

## [1] Dec  Apr  Mar  <NA>
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec

Notice that in the y2 factor, the “Jam” typo does not match any level, so it has been silently converted to NA.

By default, if you create a factor variable but do not specify the levels, the levels will be in alphabetical order.

# No levels supplied: factor() sorts the unique values to build the levels.
f1 <- factor(x = x1)
print(f1)

## [1] Dec Apr Mar Jan
## Levels: Apr Dec Jan Mar

Alternatively, you can use the unique() function in the following, and the levels will be ordered in the order they appear in the data:

# Levels taken from unique(x1), i.e. in order of first appearance in the data.
o1 <- factor(x = x1, levels = unique(x1))
print(o1)

## [1] Dec Apr Mar Jan
## Levels: Dec Apr Mar Jan

It is clearest to see how important the order of levels is in graphs. Let’s generate some fake data as “counts” and generate some bar charts to see the real-world impact of the order of factor levels.

# Reproducible fake counts: four draws from a normal(mean 50, sd 10).
set.seed(36)
counts <- rnorm(n = 4, mean = 50, sd = 10)

# The same counts plotted three times; only the factor level order differs.
# geom_col() draws bars whose heights are the y values as given.

# Calendar order (levels supplied explicitly).
orderedData <- data.frame(Month = y1, Count = counts)
ggplot(orderedData, aes(x = Month, y = Count)) +
  geom_col()

# Alphabetical order (default levels).
alphabeticalData <- data.frame(Month = f1, Count = counts)
ggplot(alphabeticalData, aes(x = Month, y = Count)) +
  geom_col()

# Order of first appearance in the data.
firstAppearanceData <- data.frame(Month = o1, Count = counts)
ggplot(firstAppearanceData, aes(x = Month, y = Count)) +
  geom_col()

Modifying factor order

The forcats package has a number of helper functions, most of which involve either modifying the order of factor levels, or the levels themselves. We will start with the factor order. We will use the dataset gss_cat, which is built into the forcats package. Let’s start with the count() function. This is essentially a wrapper for the combination of group_by() followed by tally(), from the dplyr package. We will create and look at a new data frame, then create a tally by level of the variable “marital”.

# Copy the gss_cat survey data and preview its first six rows.
data <- gss_cat
head(data, n = 6)

## # A tibble: 6 x 9
##    year marital         age race  rincome        partyid     relig denom tvhours
##   <int> <fct>         <int> <fct> <fct>          <fct>       <fct> <fct>   <int>
## 1  2000 Never married    26 White $8000 to 9999  Ind,near r~ Prot~ Sout~      12
## 2  2000 Divorced         48 White $8000 to 9999  Not str re~ Prot~ Bapt~      NA
## 3  2000 Widowed          67 White Not applicable Independent Prot~ No d~       2
## 4  2000 Never married    39 White Not applicable Ind,near r~ Orth~ Not ~       4
## 5  2000 Divorced         25 White Not applicable Not str de~ None  Not ~       1
## 6  2000 Married          25 White $20000 - 24999 Strong dem~ Prot~ Sout~      NA

# Number of rows per level of `marital`.
count(data, marital)

## # A tibble: 6 x 2
##   marital           n
##   <fct>         <int>
## 1 No answer        17
## 2 Never married  5416
## 3 Separated       743
## 4 Divorced       3383
## 5 Widowed        1807
## 6 Married       10117

Alternatively, you can use the add_count() function to add these directly to the original data.

# Append the per-level row count as a new column named "MaritalCount".
add_count(data, marital, name = "MaritalCount")

## # A tibble: 21,483 x 10
##     year marital         age race  rincome   partyid relig denom tvhours Marit~1
##    <int> <fct>         <int> <fct> <fct>     <fct>   <fct> <fct>   <int>   <int>
##  1  2000 Never married    26 White $8000 to~ Ind,ne~ Prot~ Sout~      12    5416
##  2  2000 Divorced         48 White $8000 to~ Not st~ Prot~ Bapt~      NA    3383
##  3  2000 Widowed          67 White Not appl~ Indepe~ Prot~ No d~       2    1807
##  4  2000 Never married    39 White Not appl~ Ind,ne~ Orth~ Not ~       4    5416
##  5  2000 Divorced         25 White Not appl~ Not st~ None  Not ~       1    3383
##  6  2000 Married          25 White $20000 -~ Strong~ Prot~ Sout~      NA   10117
##  7  2000 Never married    36 White $25000 o~ Not st~ Chri~ Not ~       3    5416
##  8  2000 Divorced         44 White $7000 to~ Ind,ne~ Prot~ Luth~      NA    3383
##  9  2000 Married          44 White $25000 o~ Not st~ Prot~ Other       0   10117
## 10  2000 Married          47 White $25000 o~ Strong~ Prot~ Sout~       3   10117
## # ... with 21,473 more rows, and abbreviated variable name 1: MaritalCount

Let’s look at a simple bar chart of this variable.

# Bar chart of row counts per marital status, in the factor's stored order.
ggplot(data, aes(x = marital)) +
  geom_bar()

Notice these are out of order. We can use the fct_infreq() to reorder the levels of the factor from most to least frequent:

# Same chart with levels reordered from most to least frequent.
ggplot(data, aes(x = fct_infreq(marital))) +
  geom_bar()

However, you may wish to have these in reverse order. We can use the fct_rev() function to reverse the current order of factor levels; thus these functions work incredibly well together. We will use the mutate() function to alter the actual data.

# Order levels by frequency, then reverse that order, before plotting.
data %>%
  mutate(marital = fct_rev(fct_infreq(marital))) %>%
  ggplot(aes(x = marital)) +
  geom_bar()

Next, we are going to use the fct_reorder() function, which helps to change the order of the levels of factors by another variable. We will do this with the variable “relig”, and start by creating a summary dataframe based on this variable.

# One row per religion: mean age, mean TV hours (both ignoring NAs), and the
# number of respondents.
relig_summary <- summarise(
  group_by(data, relig),
  age = mean(age, na.rm = TRUE),
  tvhours = mean(tvhours, na.rm = TRUE),
  n = n()
)
print(relig_summary)

## # A tibble: 15 x 4
##    relig                     age tvhours     n
##    <fct>                   <dbl>   <dbl> <int>
##  1 No answer                49.5    2.72    93
##  2 Don't know               35.9    4.62    15
##  3 Inter-nondenominational  40.0    2.87   109
##  4 Native american          38.9    3.46    23
##  5 Christian                40.1    2.79   689
##  6 Orthodox-christian       50.4    2.42    95
##  7 Moslem/islam             37.6    2.44   104
##  8 Other eastern            45.9    1.67    32
##  9 Hinduism                 37.7    1.89    71
## 10 Buddhism                 44.7    2.38   147
## 11 Other                    41.0    2.73   224
## 12 None                     41.2    2.71  3523
## 13 Jewish                   52.4    2.52   388
## 14 Catholic                 46.9    2.96  5124
## 15 Protestant               49.9    3.15 10846

Let’s create a scatterplot of TV hours by religion to see a relationship:

# Mean TV hours (x) for each religion (y), in the factor's stored order.
ggplot(relig_summary, aes(x = tvhours, y = relig)) +
  geom_point()

Let’s rearrange using the fct_reorder() function:

# Same plot with the religion levels ordered by their mean TV hours.
ggplot(relig_summary, aes(x = tvhours, y = fct_reorder(relig, tvhours))) +
  geom_point()

Now let’s suppose, however, that we want to move one or more specific levels. The fct_relevel() function can help with this. We will move the “No answer” category to the front of the level order, which places it at the bottom of the y-axis.

# Order the religion levels by mean TV hours, then pull "No answer" out of
# that ordering; fct_relevel() moves the named level to the front by default.
relig_summary <- relig_summary %>%
  mutate(
    relig = relig %>%
      fct_reorder(tvhours) %>%
      fct_relevel("No answer")
  )
ggplot(relig_summary, aes(x = tvhours, y = relig)) +
  geom_point()

Modifying factor levels

Next, we can actually recode and modify the levels of factors. Let’s look at the “partyid” variable.

# Number of rows per level of `partyid`.
count(data, partyid)

## # A tibble: 10 x 2
##    partyid                n
##    <fct>              <int>
##  1 No answer            154
##  2 Don't know             1
##  3 Other party          393
##  4 Strong republican   2314
##  5 Not str republican  3032
##  6 Ind,near rep        1791
##  7 Independent         4119
##  8 Ind,near dem        2499
##  9 Not str democrat    3690
## 10 Strong democrat     3490

Notice there are many levels of these factors. We can change the names of these levels using the fct_recode() function.

# Rename six of the party levels (new name = old name); levels that are not
# listed keep their current names.
data <- mutate(
  data,
  partyid = fct_recode(
    partyid,
    "Republican, strong"    = "Strong republican",
    "Republican, weak"      = "Not str republican",
    "Independent, near rep" = "Ind,near rep",
    "Independent, near dem" = "Ind,near dem",
    "Democrat, weak"        = "Not str democrat",
    "Democrat, strong"      = "Strong democrat"
  )
)
count(data, partyid)

## # A tibble: 10 x 2
##    partyid                   n
##    <fct>                 <int>
##  1 No answer               154
##  2 Don't know                1
##  3 Other party             393
##  4 Republican, strong     2314
##  5 Republican, weak       3032
##  6 Independent, near rep  1791
##  7 Independent            4119
##  8 Independent, near dem  2499
##  9 Democrat, weak         3690
## 10 Democrat, strong       3490

These look a bit better. Next, we can collapse many levels into fewer using the fct_collapse() function.

# Merge the ten party levels into four broader groups
# (new level = vector of existing levels).
data <- mutate(
  data,
  partyid = fct_collapse(
    partyid,
    other = c("No answer", "Don't know", "Other party"),
    republican = c("Republican, strong", "Republican, weak"),
    independent = c(
      "Independent, near rep", "Independent", "Independent, near dem"
    ),
    democrat = c("Democrat, weak", "Democrat, strong")
  )
)
count(data, partyid)

## # A tibble: 4 x 2
##   partyid         n
##   <fct>       <int>
## 1 other         548
## 2 republican   5346
## 3 independent  8409
## 4 democrat     7180

Lastly, we have helper functions fct_lump_min(), fct_lump_prop(), fct_lump_n(), and fct_lump_lowfreq(). Definitions are as follows:

fct_lump_min(): lumps levels that appear fewer than min times.
fct_lump_prop(): lumps levels that appear fewer than prop * n times.
fct_lump_n(): lumps all levels except for the n most frequent (or least frequent if n < 0).
fct_lump_lowfreq() lumps together the least frequent levels, ensuring that “other” is still the smallest level.

Let’s see an example with the fct_lump_n() function:

# Lump marital status with fct_lump_n(n = 3), then count rows per level.
data %>%
  mutate(marital = marital %>% fct_lump_n(n = 3)) %>%
  count(marital)

## # A tibble: 4 x 2
##   marital           n
##   <fct>         <int>
## 1 Never married  5416
## 2 Divorced       3383
## 3 Married       10117
## 4 Other          2567

This leaves the top three levels by frequency, and lumps everything else together into the “Other” category.

loops

John Ross

2022-08-19

Factors and their levels

Modifying factor order

Modifying factor levels