loops, functions, apply family
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.8 v dplyr 1.0.9
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Work on a copy of the built-in mtcars data frame.
dataset <- mtcars
# Printing preferences, set in a single call: scipen = 3 and digits = 4.
options(scipen = 3, digits = 4)
functions
If you are repeating a chunk of code more than once, it's a good idea to turn it into a function. For example:
# Manual z-scores: (value - mean) / sd, written out once per column.
# The subtraction needs its own parentheses; otherwise `/` binds tighter than
# `-` and the expression evaluates to x - (mean / sd), which is not a z-score.
dataset$Zdisp <- (dataset$disp - mean(dataset$disp)) / sd(dataset$disp)
dataset$Zhp <- (dataset$hp - mean(dataset$hp)) / sd(dataset$hp)
dataset$Zdrat <- (dataset$drat - mean(dataset$drat)) / sd(dataset$drat)
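Code like this isn't easy to read line by line, and the repetition opens up opportunities for errors (for example, a misplaced parenthesis, or a column name that was not updated after copy-pasting). Writing a function makes the code more compact and less error-prone: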
# Start again from a fresh copy of mtcars, discarding the Z* columns added above.
dataset <- mtcars
#' Standardize a numeric vector to z-scores
#'
#' Centers `x` on its mean and scales it by its sample standard deviation.
#'
#' @param x A numeric vector.
#' @param na.rm If `TRUE`, missing values are ignored when computing the mean
#'   and standard deviation (they stay `NA` in the result). The default,
#'   `FALSE`, keeps the previous behaviour: any `NA` in `x` makes the whole
#'   result `NA`.
#' @return A numeric vector the same length as `x`.
standardize <- function(x, na.rm = FALSE) {
  (x - mean(x, na.rm = na.rm)) / sd(x, na.rm = na.rm)
}
# Add one z-score column per variable using the helper function.
dataset[["Zdisp"]] <- standardize(dataset[["disp"]])
dataset[["Zhp"]] <- standardize(dataset[["hp"]])
dataset[["Zdrat"]] <- standardize(dataset[["drat"]])
Loops and control statements
The use of explicit loops should be limited where a vectorized alternative exists, because loops are comparatively inefficient in R. The two most common loops are the for loop and the while loop.
# Keep only the five raw measurements and their three z-score columns,
# then print the result.
dataset <- dplyr::select(
  dataset,
  disp, hp, drat, wt, qsec, Zdisp, Zhp, Zdrat
)
dataset
## disp hp drat wt qsec Zdisp Zhp Zdrat
## Mazda RX4 160.0 110 3.90 2.620 16.46 -0.57062 -0.53509 0.56751
## Mazda RX4 Wag 160.0 110 3.90 2.875 17.02 -0.57062 -0.53509 0.56751
## Datsun 710 108.0 93 3.85 2.320 18.61 -0.99018 -0.78304 0.47400
## Hornet 4 Drive 258.0 110 3.08 3.215 19.44 0.22009 -0.53509 -0.96612
## Hornet Sportabout 360.0 175 3.15 3.440 17.02 1.04308 0.41294 -0.83520
## Valiant 225.0 105 2.76 3.460 20.22 -0.04617 -0.60802 -1.56461
## Duster 360 360.0 245 3.21 3.570 15.84 1.04308 1.43390 -0.72298
## Merc 240D 146.7 62 3.69 3.190 20.00 -0.67793 -1.23518 0.17475
## Merc 230 140.8 95 3.92 3.150 22.90 -0.72554 -0.75387 0.60492
## Merc 280 167.6 123 3.92 3.440 18.30 -0.50930 -0.34549 0.60492
## Merc 280C 167.6 123 3.92 3.440 18.90 -0.50930 -0.34549 0.60492
## Merc 450SE 275.8 180 3.07 4.070 17.40 0.36371 0.48587 -0.98482
## Merc 450SL 275.8 180 3.07 3.730 17.60 0.36371 0.48587 -0.98482
## Merc 450SLC 275.8 180 3.07 3.780 18.00 0.36371 0.48587 -0.98482
## Cadillac Fleetwood 472.0 205 2.93 5.250 17.98 1.94675 0.85050 -1.24666
## Lincoln Continental 460.0 215 3.00 5.424 17.82 1.84993 0.99635 -1.11574
## Chrysler Imperial 440.0 230 3.23 5.345 17.42 1.68856 1.21513 -0.68558
## Fiat 128 78.7 66 4.08 2.200 19.47 -1.22659 -1.17684 0.90416
## Honda Civic 75.7 52 4.93 1.615 18.52 -1.25079 -1.38103 2.49390
## Toyota Corolla 71.1 65 4.22 1.835 19.90 -1.28791 -1.19142 1.16600
## Toyota Corona 120.1 97 3.70 2.465 20.01 -0.89255 -0.72470 0.19346
## Dodge Challenger 318.0 150 2.76 3.520 16.87 0.70420 0.04831 -1.56461
## AMC Javelin 304.0 150 3.15 3.435 17.30 0.59124 0.04831 -0.83520
## Camaro Z28 350.0 245 3.73 3.840 15.41 0.96240 1.43390 0.24957
## Pontiac Firebird 400.0 175 3.08 3.845 17.05 1.36582 0.41294 -0.96612
## Fiat X1-9 79.0 66 4.08 1.935 18.90 -1.22417 -1.17684 0.90416
## Porsche 914-2 120.3 91 4.43 2.140 16.70 -0.89094 -0.81221 1.55876
## Lotus Europa 95.1 113 3.77 1.513 16.90 -1.09427 -0.49134 0.32438
## Ford Pantera L 351.0 264 4.22 3.170 14.50 0.97046 1.71102 1.16600
## Ferrari Dino 145.0 175 3.62 2.770 15.50 -0.69165 0.41294 0.04383
## Maserati Bora 301.0 335 3.54 3.570 14.60 0.56704 2.74657 -0.10579
## Volvo 142E 121.0 109 4.11 2.780 18.60 -0.88529 -0.54968 0.96027
This example uses an index “i” that starts at 1 and runs up to a maximum value, taken to be the number of columns in the dataset.
Loops are computationally slow in R and prone to bugs. The apply family is usually a better approach, as shown further below.
# Column means computed with an explicit for loop.
# Pre-allocate a numeric result (vector(length = n) would create a logical
# vector that is only coerced to numeric on the first assignment).
# NOTE: this variable shares its name with the base function colMeans().
colMeans <- numeric(ncol(dataset))
# seq_len() is safe even when there are zero columns; 1:0 would yield c(1, 0).
for (i in seq_len(ncol(dataset))) {
  colMeans[i] <- mean(dataset[[i]])
}
colMeans
## [1] 2.307e+02 1.467e+02 3.597e+00 3.217e+00 1.785e+01 -9.085e-17 1.041e-17
## [8] -2.919e-16
# Fill a 5 x 5 matrix with "row,column" labels using nested for loops.
# The matrix is created as character from the start, because the cells hold
# strings; starting from numeric 0 would force a coercion of the whole matrix
# on the first assignment.
emptyMatrix <- matrix("", nrow = 5, ncol = 5)
for (i in seq_len(nrow(emptyMatrix))) {
  for (j in seq_len(ncol(emptyMatrix))) {
    emptyMatrix[i, j] <- paste0(i, ",", j)
  }
}
emptyMatrix
## [,1] [,2] [,3] [,4] [,5]
## [1,] "1,1" "1,2" "1,3" "1,4" "1,5"
## [2,] "2,1" "2,2" "2,3" "2,4" "2,5"
## [3,] "3,1" "3,2" "3,3" "3,4" "3,5"
## [4,] "4,1" "4,2" "4,3" "4,4" "4,5"
## [5,] "5,1" "5,2" "5,3" "5,4" "5,5"
now look at while loops
# Simulate a stock value that moves by a random amount each day until it
# reaches at least 350.
stock <- 300   # starting value
days <- 1      # day counter, starts at 1
set.seed(555)  # make the random draws reproducible
repeat {
  if (stock >= 350) {
    break
  }
  # Daily change drawn from a Uniform(-5, 20) distribution.
  stock <- stock + runif(1, min = -5, max = 20)
  days <- days + 1
  print(days)
}
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
stock
## [1] 353.2
ifelse statement
# Flag cars whose horsepower is at or above the sample mean.
# The comparison must use the standardized column (Zhp): raw hp is always
# positive, so testing `hp >= 0` would be TRUE for every row.
Zhp <- dataset$Zhp
# A comparison already yields a logical vector, so wrapping it in
# ifelse(cond, TRUE, FALSE) is unnecessary.
HP_aboveAverage <- Zhp >= 0
HP_df <- data.frame(Zhp = Zhp, HP_aboveAverage = HP_aboveAverage)
HP_df
## Zhp HP_aboveAverage
## 1 -0.53509 FALSE
## 2 -0.53509 FALSE
## 3 -0.78304 FALSE
## 4 -0.53509 FALSE
## 5 0.41294 TRUE
## 6 -0.60802 FALSE
## 7 1.43390 TRUE
## 8 -1.23518 FALSE
## 9 -0.75387 FALSE
## 10 -0.34549 FALSE
## 11 -0.34549 FALSE
## 12 0.48587 TRUE
## 13 0.48587 TRUE
## 14 0.48587 TRUE
## 15 0.85050 TRUE
## 16 0.99635 TRUE
## 17 1.21513 TRUE
## 18 -1.17684 FALSE
## 19 -1.38103 FALSE
## 20 -1.19142 FALSE
## 21 -0.72470 FALSE
## 22 0.04831 TRUE
## 23 0.04831 TRUE
## 24 1.43390 TRUE
## 25 0.41294 TRUE
## 26 -1.17684 FALSE
## 27 -0.81221 FALSE
## 28 -0.49134 FALSE
## 29 1.71102 TRUE
## 30 0.41294 TRUE
## 31 2.74657 TRUE
## 32 -0.54968 FALSE
Apply
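The apply family is usually more concise and less error-prone than an explicit loop. It is not necessarily much faster, however, because apply() still loops over the margin internally.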
# Print the data frame that the apply() example below operates on.
dataset
## disp hp drat wt qsec Zdisp Zhp Zdrat
## Mazda RX4 160.0 110 3.90 2.620 16.46 -0.57062 -0.53509 0.56751
## Mazda RX4 Wag 160.0 110 3.90 2.875 17.02 -0.57062 -0.53509 0.56751
## Datsun 710 108.0 93 3.85 2.320 18.61 -0.99018 -0.78304 0.47400
## Hornet 4 Drive 258.0 110 3.08 3.215 19.44 0.22009 -0.53509 -0.96612
## Hornet Sportabout 360.0 175 3.15 3.440 17.02 1.04308 0.41294 -0.83520
## Valiant 225.0 105 2.76 3.460 20.22 -0.04617 -0.60802 -1.56461
## Duster 360 360.0 245 3.21 3.570 15.84 1.04308 1.43390 -0.72298
## Merc 240D 146.7 62 3.69 3.190 20.00 -0.67793 -1.23518 0.17475
## Merc 230 140.8 95 3.92 3.150 22.90 -0.72554 -0.75387 0.60492
## Merc 280 167.6 123 3.92 3.440 18.30 -0.50930 -0.34549 0.60492
## Merc 280C 167.6 123 3.92 3.440 18.90 -0.50930 -0.34549 0.60492
## Merc 450SE 275.8 180 3.07 4.070 17.40 0.36371 0.48587 -0.98482
## Merc 450SL 275.8 180 3.07 3.730 17.60 0.36371 0.48587 -0.98482
## Merc 450SLC 275.8 180 3.07 3.780 18.00 0.36371 0.48587 -0.98482
## Cadillac Fleetwood 472.0 205 2.93 5.250 17.98 1.94675 0.85050 -1.24666
## Lincoln Continental 460.0 215 3.00 5.424 17.82 1.84993 0.99635 -1.11574
## Chrysler Imperial 440.0 230 3.23 5.345 17.42 1.68856 1.21513 -0.68558
## Fiat 128 78.7 66 4.08 2.200 19.47 -1.22659 -1.17684 0.90416
## Honda Civic 75.7 52 4.93 1.615 18.52 -1.25079 -1.38103 2.49390
## Toyota Corolla 71.1 65 4.22 1.835 19.90 -1.28791 -1.19142 1.16600
## Toyota Corona 120.1 97 3.70 2.465 20.01 -0.89255 -0.72470 0.19346
## Dodge Challenger 318.0 150 2.76 3.520 16.87 0.70420 0.04831 -1.56461
## AMC Javelin 304.0 150 3.15 3.435 17.30 0.59124 0.04831 -0.83520
## Camaro Z28 350.0 245 3.73 3.840 15.41 0.96240 1.43390 0.24957
## Pontiac Firebird 400.0 175 3.08 3.845 17.05 1.36582 0.41294 -0.96612
## Fiat X1-9 79.0 66 4.08 1.935 18.90 -1.22417 -1.17684 0.90416
## Porsche 914-2 120.3 91 4.43 2.140 16.70 -0.89094 -0.81221 1.55876
## Lotus Europa 95.1 113 3.77 1.513 16.90 -1.09427 -0.49134 0.32438
## Ford Pantera L 351.0 264 4.22 3.170 14.50 0.97046 1.71102 1.16600
## Ferrari Dino 145.0 175 3.62 2.770 15.50 -0.69165 0.41294 0.04383
## Maserati Bora 301.0 335 3.54 3.570 14.60 0.56704 2.74657 -0.10579
## Volvo 142E 121.0 109 4.11 2.780 18.60 -0.88529 -0.54968 0.96027
Applies the mean() function over the columns of the dataset data frame to return a vector of means. The three arguments here are: an array or matrix (a data frame is coerced to a matrix first), a margin (1 for rows, 2 for columns), and a function to apply over that margin (mean()).
# Column-wise means (MARGIN = 2). apply() coerces the data frame to a matrix
# first, which is safe here because every column is numeric.
apply(dataset, 2, mean)
## disp hp drat wt qsec Zdisp Zhp
## 2.307e+02 1.467e+02 3.597e+00 3.217e+00 1.785e+01 -9.085e-17 1.041e-17
## Zdrat
## -2.919e-16
there are many other helpful functions in this family, the most commonly used are lapply() and sapply(). Others are vapply(), tapply(), mapply()
# Example list holding three vectors of different lengths and types:
# a single number, seven values of exp(-3), ..., exp(3), and a logical vector.
x <- list(a = 1.5, beta = exp(-3:3), logic = c(TRUE, FALSE, FALSE, TRUE))
get the mean for each item from the list of different vectors
# Apply mean() to each element of x; the result is a list with the same names.
lapply(x, mean)
## $a
## [1] 1.5
##
## $beta
## [1] 4.535
##
## $logic
## [1] 0.5
This functionality is not limited to functions that return a single value. An example using quantile() is shown below:
# Extra arguments after the function are passed on to quantile().
# `:` binds tighter than `/`, so 1:3/4 is c(0.25, 0.5, 0.75), i.e. the quartiles.
lapply(x, quantile, probs = 1:3/4)
## $a
## 25% 50% 75%
## 1.5 1.5 1.5
##
## $beta
## 25% 50% 75%
## 0.2516 1.0000 5.0537
##
## $logic
## 25% 50% 75%
## 0.0 0.5 1.0
lapply() always returns a list, whereas sapply() simplifies the result to a vector or a matrix when possible.
# Same computation as lapply(x, mean), simplified to a named numeric vector.
sapply(x, mean)
## a beta logic
## 1.500 4.535 0.500
# Each call returns three quantiles, so sapply() binds them into a matrix with
# one column per list element and one row per probability.
sapply(x, quantile, probs = 1:3/4)
## a beta logic
## 25% 1.5 0.2516 0.0
## 50% 1.5 1.0000 0.5
## 75% 1.5 5.0537 1.0
these are very useful functions, going back to top example, repeat it using sapply
# Standardize the three raw columns in one call; sapply() returns a matrix
# with one column per input variable.
Zmatrix <- sapply(dataset[, c("disp", "hp", "drat")], standardize)
# Convert to a data frame and label the columns as z-scores.
Zscores <- setNames(data.frame(Zmatrix), c("Zdisp", "Zhp", "Zdrat"))
Zscores
## Zdisp Zhp Zdrat
## 1 -0.57062 -0.53509 0.56751
## 2 -0.57062 -0.53509 0.56751
## 3 -0.99018 -0.78304 0.47400
## 4 0.22009 -0.53509 -0.96612
## 5 1.04308 0.41294 -0.83520
## 6 -0.04617 -0.60802 -1.56461
## 7 1.04308 1.43390 -0.72298
## 8 -0.67793 -1.23518 0.17475
## 9 -0.72554 -0.75387 0.60492
## 10 -0.50930 -0.34549 0.60492
## 11 -0.50930 -0.34549 0.60492
## 12 0.36371 0.48587 -0.98482
## 13 0.36371 0.48587 -0.98482
## 14 0.36371 0.48587 -0.98482
## 15 1.94675 0.85050 -1.24666
## 16 1.84993 0.99635 -1.11574
## 17 1.68856 1.21513 -0.68558
## 18 -1.22659 -1.17684 0.90416
## 19 -1.25079 -1.38103 2.49390
## 20 -1.28791 -1.19142 1.16600
## 21 -0.89255 -0.72470 0.19346
## 22 0.70420 0.04831 -1.56461
## 23 0.59124 0.04831 -0.83520
## 24 0.96240 1.43390 0.24957
## 25 1.36582 0.41294 -0.96612
## 26 -1.22417 -1.17684 0.90416
## 27 -0.89094 -0.81221 1.55876
## 28 -1.09427 -0.49134 0.32438
## 29 0.97046 1.71102 1.16600
## 30 -0.69165 0.41294 0.04383
## 31 0.56704 2.74657 -0.10579
## 32 -0.88529 -0.54968 0.96027
We will explore the “forcats” package of the tidyverse. In this tutorial we will examine the following things:
Please note that some of the code in this tutorial was adapted from Chapter 15 of the book “R for Data Science” by Hadley Wickham and Garrett Grolemund. The full book can be found at: https://r4ds.had.co.nz/#
A good cheat sheet for forcats functions can be found at: https://rstudio.com/resources/cheatsheets/
Another good tutorial for forcats: https://www.r-bloggers.com/2020/06/working-with-factors-in-r-tutorial-forcats-package/
Let’s look at the motivation for factors.
# Two character vectors of month abbreviations used to motivate factors.
x1 <- c("Dec", "Apr", "Mar", "Jan")
x2 <- c("Dec", "Apr", "Mar", "Jam") # Contains a typo: "Jam" instead of "Jan"
It is important to define the levels of factors. Because we are looking at month variables, our levels are the various months.
# Valid factor levels in calendar order. month.abb is the built-in constant
# holding the three-letter English month abbreviations "Jan" through "Dec".
month_levels <- month.abb
Now let’s see what happens when we pass these levels to the `levels` argument of the `factor()` function.
# Build a factor whose levels follow calendar order rather than the data.
y1 <- factor(x1, levels = month_levels)
y1
## [1] Dec Apr Mar Jan
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# "Jam" is not one of month_levels, so factor() turns it into NA without
# raising an error or warning.
y2 <- factor(x2, levels = month_levels)
y2
## [1] Dec Apr Mar <NA>
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
Notice that in the `y2` factor, the “Jam” typo has been silently converted to `NA`, because it does not match any of the specified levels.
By default, if you create a factor variable but do not specify the levels, the levels will be in alphabetical order.
# No levels supplied: the levels are the sorted unique values of x1.
f1 <- factor(x1)
f1
## [1] Dec Apr Mar Jan
## Levels: Apr Dec Jan Mar
Alternatively, you can use the `unique()` function as follows, and the levels will be ordered in the order they first appear in the data:
# unique() keeps first-appearance order, so the levels follow the data order.
o1 <- factor(x1, levels = unique(x1))
o1
## [1] Dec Apr Mar Jan
## Levels: Dec Apr Mar Jan
It is clearest to see how important the order of levels is in graphs. Let’s generate some fake data as “counts” and generate some bar charts to see the real-world impact of the order of factor levels.
# Simulate four counts from a Normal(mean = 50, sd = 10) distribution;
# the seed makes the draws reproducible.
set.seed(36)
counts <- rnorm(4, mean = 50, sd = 10)

# The same counts are plotted three times; only the factor level order differs.
# geom_col() draws bars whose heights are the y values.

# Levels supplied explicitly via month_levels (y1).
orderedData <- data.frame(Month = y1, Count = counts)
ggplot(orderedData, aes(x = Month, y = Count)) +
  geom_col()

# Default alphabetical levels (f1).
alphabeticalData <- data.frame(Month = f1, Count = counts)
ggplot(alphabeticalData, aes(x = Month, y = Count)) +
  geom_col()

# Levels in order of first appearance (o1).
firstAppearanceData <- data.frame(Month = o1, Count = counts)
ggplot(firstAppearanceData, aes(x = Month, y = Count)) +
  geom_col()
The forcats package has a number of helper functions, most of which involve either modifying the order of factor levels, or the levels themselves. We will start with the factor order. We will use the dataset `gss_cat`, which is built into the forcats package. Let’s start with the `count()` function. This is essentially a wrapper for the combination of `group_by()` followed by `tally()`, from the dplyr package. We will create and look at a new data frame, then create a tally by level of the variable “marital”.
# Copy the gss_cat tibble into `data` and preview its first rows.
# NOTE: the variable shares its name with the utils function data().
data <- gss_cat
head(data)
## # A tibble: 6 x 9
## year marital age race rincome partyid relig denom tvhours
## <int> <fct> <int> <fct> <fct> <fct> <fct> <fct> <int>
## 1 2000 Never married 26 White $8000 to 9999 Ind,near r~ Prot~ Sout~ 12
## 2 2000 Divorced 48 White $8000 to 9999 Not str re~ Prot~ Bapt~ NA
## 3 2000 Widowed 67 White Not applicable Independent Prot~ No d~ 2
## 4 2000 Never married 39 White Not applicable Ind,near r~ Orth~ Not ~ 4
## 5 2000 Divorced 25 White Not applicable Not str de~ None Not ~ 1
## 6 2000 Married 25 White $20000 - 24999 Strong dem~ Prot~ Sout~ NA
# Number of rows for each level of marital.
count(data, marital)
## # A tibble: 6 x 2
## marital n
## <fct> <int>
## 1 No answer 17
## 2 Never married 5416
## 3 Separated 743
## 4 Divorced 3383
## 5 Widowed 1807
## 6 Married 10117
Alternatively, you can use the `add_count()` function to add these counts directly to the original data.
# Keep every row and append the per-level count as a new MaritalCount column.
add_count(data, marital, name = "MaritalCount")
## # A tibble: 21,483 x 10
## year marital age race rincome partyid relig denom tvhours Marit~1
## <int> <fct> <int> <fct> <fct> <fct> <fct> <fct> <int> <int>
## 1 2000 Never married 26 White $8000 to~ Ind,ne~ Prot~ Sout~ 12 5416
## 2 2000 Divorced 48 White $8000 to~ Not st~ Prot~ Bapt~ NA 3383
## 3 2000 Widowed 67 White Not appl~ Indepe~ Prot~ No d~ 2 1807
## 4 2000 Never married 39 White Not appl~ Ind,ne~ Orth~ Not ~ 4 5416
## 5 2000 Divorced 25 White Not appl~ Not st~ None Not ~ 1 3383
## 6 2000 Married 25 White $20000 -~ Strong~ Prot~ Sout~ NA 10117
## 7 2000 Never married 36 White $25000 o~ Not st~ Chri~ Not ~ 3 5416
## 8 2000 Divorced 44 White $7000 to~ Ind,ne~ Prot~ Luth~ NA 3383
## 9 2000 Married 44 White $25000 o~ Not st~ Prot~ Other 0 10117
## 10 2000 Married 47 White $25000 o~ Strong~ Prot~ Sout~ 3 10117
## # ... with 21,473 more rows, and abbreviated variable name 1: MaritalCount
Let’s look at a simple bar chart of this variable.
# Bar chart of row counts per marital level, in the factor's current order.
ggplot(data, aes(x = marital)) +
  geom_bar()
Notice these are out of order. We can use the `fct_infreq()` function to reorder the levels of the factor from most to least frequent:
# Same bar chart, with the levels reordered on the fly by fct_infreq().
ggplot(data, aes(x = fct_infreq(marital))) +
  geom_bar()
However, you may wish to have these in reverse order. We can use the `fct_rev()` function to reverse the current order of factor levels; thus these functions work incredibly well together. We will use the `mutate()` function to alter the actual data.
# Reorder marital with fct_infreq(), reverse that order with fct_rev(),
# store the result back in the column, and plot it.
data %>%
  mutate(marital = fct_rev(fct_infreq(marital))) %>%
  ggplot(aes(x = marital)) +
  geom_bar()
Next, we are going to use the `fct_reorder()` function, which helps to change the order of the levels of a factor by another variable. We will do this with the variable “relig”, and start by creating a summary data frame based on this variable.
# One row per religion: mean age, mean TV hours (both ignoring missing values)
# and the number of respondents in that group.
relig_summary <- data %>%
group_by(relig) %>%
summarise(
age = mean(age, na.rm = TRUE),
tvhours = mean(tvhours, na.rm = TRUE),
n = n()
)
relig_summary
## # A tibble: 15 x 4
## relig age tvhours n
## <fct> <dbl> <dbl> <int>
## 1 No answer 49.5 2.72 93
## 2 Don't know 35.9 4.62 15
## 3 Inter-nondenominational 40.0 2.87 109
## 4 Native american 38.9 3.46 23
## 5 Christian 40.1 2.79 689
## 6 Orthodox-christian 50.4 2.42 95
## 7 Moslem/islam 37.6 2.44 104
## 8 Other eastern 45.9 1.67 32
## 9 Hinduism 37.7 1.89 71
## 10 Buddhism 44.7 2.38 147
## 11 Other 41.0 2.73 224
## 12 None 41.2 2.71 3523
## 13 Jewish 52.4 2.52 388
## 14 Catholic 46.9 2.96 5124
## 15 Protestant 49.9 3.15 10846
Let’s create a scatterplot of TV hours by religion to see a relationship:
# Mean TV hours (x) by religion (y), with religions in their existing level order.
ggplot(relig_summary, aes(tvhours, relig)) + geom_point()
Let’s rearrange using the `fct_reorder()` function:
# Same plot, with the religion levels reordered by tvhours inside aes();
# relig_summary itself is not modified.
ggplot(relig_summary, aes(tvhours, fct_reorder(relig, tvhours))) + geom_point()
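Now let’s suppose, however, that we want to move one or more specific levels. The `fct_relevel()` function can help with this. By default it moves the named levels to the front of the level order, so here the “No answer” category becomes the first level and is drawn at the bottom of the y-axis. (Use the argument `after = Inf` to move a level to the end instead.)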
# Order religions by tvhours, then move "No answer" to the first level
# position (fct_relevel()'s default placement), and plot the result.
relig_summary <- relig_summary %>%
  mutate(relig = relig %>% fct_reorder(tvhours) %>% fct_relevel("No answer"))
ggplot(relig_summary, aes(tvhours, relig)) +
  geom_point()
Next, we can actually recode and modify the levels of factors. Let’s look at the “partyid” variable.
# Number of rows for each level of partyid.
count(data, partyid)
## # A tibble: 10 x 2
## partyid n
## <fct> <int>
## 1 No answer 154
## 2 Don't know 1
## 3 Other party 393
## 4 Strong republican 2314
## 5 Not str republican 3032
## 6 Ind,near rep 1791
## 7 Independent 4119
## 8 Ind,near dem 2499
## 9 Not str democrat 3690
## 10 Strong democrat 3490
Notice there are many levels of this factor. We can change the names of these levels using the `fct_recode()` function.
# Rename six partyid levels (new name on the left, existing level on the
# right); levels that are not mentioned keep their names.
data <- mutate(
  data,
  partyid = fct_recode(
    partyid,
    "Republican, strong" = "Strong republican",
    "Republican, weak" = "Not str republican",
    "Independent, near rep" = "Ind,near rep",
    "Independent, near dem" = "Ind,near dem",
    "Democrat, weak" = "Not str democrat",
    "Democrat, strong" = "Strong democrat"
  )
)
# Tally the rows per level to check the new names.
count(data, partyid)
## # A tibble: 10 x 2
## partyid n
## <fct> <int>
## 1 No answer 154
## 2 Don't know 1
## 3 Other party 393
## 4 Republican, strong 2314
## 5 Republican, weak 3032
## 6 Independent, near rep 1791
## 7 Independent 4119
## 8 Independent, near dem 2499
## 9 Democrat, weak 3690
## 10 Democrat, strong 3490
These look a bit better. Next, we can collapse many levels into fewer using the `fct_collapse()` function.
# Collapse the ten partyid levels into four groups
# (new group name = vector of existing levels to merge).
data <- mutate(
  data,
  partyid = fct_collapse(
    partyid,
    "other" = c("No answer", "Don't know", "Other party"),
    "republican" = c("Republican, strong", "Republican, weak"),
    "independent" = c(
      "Independent, near rep", "Independent", "Independent, near dem"
    ),
    "democrat" = c("Democrat, weak", "Democrat, strong")
  )
)
# Tally the rows per collapsed level.
count(data, partyid)
## # A tibble: 4 x 2
## partyid n
## <fct> <int>
## 1 other 548
## 2 republican 5346
## 3 independent 8409
## 4 democrat 7180
Lastly, we have the helper functions `fct_lump_min()`, `fct_lump_prop()`, `fct_lump_n()`, and `fct_lump_lowfreq()`. Definitions are as follows:
- `fct_lump_min()`: lumps levels that appear fewer than `min` times.
- `fct_lump_prop()`: lumps levels that appear fewer than `prop * n` times.
- `fct_lump_n()`: lumps all levels except for the `n` most frequent (or least frequent if `n < 0`).
- `fct_lump_lowfreq()`: lumps together the least frequent levels, ensuring that “other” is still the smallest level.

Let’s see an example with the `fct_lump_n()` function:
# Keep the 3 most frequent marital levels, lump all remaining levels into
# "Other", then count the rows per resulting level. `data` is not modified.
data %>%
mutate(marital = fct_lump_n(marital, n = 3)) %>%
count(marital)
## # A tibble: 4 x 2
## marital n
## <fct> <int>
## 1 Never married 5416
## 2 Divorced 3383
## 3 Married 10117
## 4 Other 2567
This leaves the top three levels by frequency, and lumps everything else together into the “Other” category.