Module 13: Apply it to your data 12

Import your data

# Built-in example data used for the worked examples in this module.
data("mtcars")
# Rolling Stone album data from the TidyTuesday 2024-05-07 release.
rolling_stone <- readr::read_csv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-05-07/rolling_stone.csv"
)

## Rows: 691 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): sort_name, clean_name, album, genre, type, spotify_url, artist_gen...
## dbl (13): rank_2003, rank_2012, rank_2020, differential, release_year, weeks...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Repeat the same operation over different columns of a data frame

Case of numeric variables

# Mean of every mtcars column, written with an explicit formula lambda: each
# column is passed in as `.x`, and map_dbl() returns a named numeric vector.
mtcars %>% map_dbl(.x = ., .f = ~mean(x = .x))

##        mpg        cyl       disp         hp       drat         wt       qsec 
##  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250  17.848750 
##         vs         am       gear       carb 
##   0.437500   0.406250   3.687500   2.812500

# Same column means, passing `mean` directly instead of wrapping it in a lambda.
mtcars %>% map_dbl(mean)

##        mpg        cyl       disp         hp       drat         wt       qsec 
##  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250  17.848750 
##         vs         am       gear       carb 
##   0.437500   0.406250   3.687500   2.812500

# Trimmed mean of every column (trim = .1 drops 10% of the observations from
# each end before averaging), with the extra argument set inside the lambda.
mtcars %>% map_dbl(.x = ., .f = ~mean(x = .x, trim = .1))

##         mpg         cyl        disp          hp        drat          wt 
##  19.6961538   6.2307692 222.5230769 141.1923077   3.5792308   3.1526923 
##        qsec          vs          am        gear        carb 
##  17.8276923   0.4230769   0.3846154   3.6153846   2.6538462

# Same trimmed means; `trim = .1` is forwarded by map_dbl() to every mean() call.
mtcars %>% map_dbl(mean, trim = .1)

##         mpg         cyl        disp          hp        drat          wt 
##  19.6961538   6.2307692 222.5230769 141.1923077   3.5792308   3.1526923 
##        qsec          vs          am        gear        carb 
##  17.8276923   0.4230769   0.3846154   3.6153846   2.6538462

# Keep only the mpg column; the pipe supplies mtcars as select()'s first argument.
mtcars %>% select(mpg)

##                      mpg
## Mazda RX4           21.0
## Mazda RX4 Wag       21.0
## Datsun 710          22.8
## Hornet 4 Drive      21.4
## Hornet Sportabout   18.7
## Valiant             18.1
## Duster 360          14.3
## Merc 240D           24.4
## Merc 230            22.8
## Merc 280            19.2
## Merc 280C           17.8
## Merc 450SE          16.4
## Merc 450SL          17.3
## Merc 450SLC         15.2
## Cadillac Fleetwood  10.4
## Lincoln Continental 10.4
## Chrysler Imperial   14.7
## Fiat 128            32.4
## Honda Civic         30.4
## Toyota Corolla      33.9
## Toyota Corona       21.5
## Dodge Challenger    15.5
## AMC Javelin         15.2
## Camaro Z28          13.3
## Pontiac Firebird    19.2
## Fiat X1-9           27.3
## Porsche 914-2       26.0
## Lotus Europa        30.4
## Ford Pantera L      15.8
## Ferrari Dino        19.7
## Maserati Bora       15.0
## Volvo 142E          21.4

# Same selection with the pipe placeholder `.` passed explicitly as `.data`.
mtcars %>% select(.data = ., mpg)

##                      mpg
## Mazda RX4           21.0
## Mazda RX4 Wag       21.0
## Datsun 710          22.8
## Hornet 4 Drive      21.4
## Hornet Sportabout   18.7
## Valiant             18.1
## Duster 360          14.3
## Merc 240D           24.4
## Merc 230            22.8
## Merc 280            19.2
## Merc 280C           17.8
## Merc 450SE          16.4
## Merc 450SL          17.3
## Merc 450SLC         15.2
## Cadillac Fleetwood  10.4
## Lincoln Continental 10.4
## Chrysler Imperial   14.7
## Fiat 128            32.4
## Honda Civic         30.4
## Toyota Corolla      33.9
## Toyota Corona       21.5
## Dodge Challenger    15.5
## AMC Javelin         15.2
## Camaro Z28          13.3
## Pontiac Firebird    19.2
## Fiat X1-9           27.3
## Porsche 914-2       26.0
## Lotus Europa        30.4
## Ford Pantera L      15.8
## Ferrari Dino        19.7
## Maserati Bora       15.0
## Volvo 142E          21.4

Create your own function

# Scale `x` by `factor`.
#
# x       Value(s) to scale.
# factor  Multiplier applied to `x`.
# Returns `x * factor`.
double_by_factor <- function(x, factor) {
    scaled <- x * factor
    scaled
}
10 %>% double_by_factor(factor = 2)

## [1] 20

# Multiply every column by 10 using an explicit lambda; map_dfr() combines the
# per-column results into a tibble, so the car names (row names) are not kept.
mtcars %>% map_dfr(.x = ., .f = ~double_by_factor(x = .x,factor = 10))

## # A tibble: 32 × 11
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1   210    60  1600  1100  39    26.2  165.     0    10    40    40
##  2   210    60  1600  1100  39    28.8  170.     0    10    40    40
##  3   228    40  1080   930  38.5  23.2  186.    10    10    40    10
##  4   214    60  2580  1100  30.8  32.2  194.    10     0    30    10
##  5   187    80  3600  1750  31.5  34.4  170.     0     0    30    20
##  6   181    60  2250  1050  27.6  34.6  202.    10     0    30    10
##  7   143    80  3600  2450  32.1  35.7  158.     0     0    30    40
##  8   244    40  1467   620  36.9  31.9  200     10     0    40    20
##  9   228    40  1408   950  39.2  31.5  229     10     0    40    20
## 10   192    60  1676  1230  39.2  34.4  183     10     0    40    40
## # ℹ 22 more rows

# Same scaling; `factor = 10` is forwarded to double_by_factor() for each column.
mtcars %>% map_dfr(double_by_factor, factor = 10)

## # A tibble: 32 × 11
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1   210    60  1600  1100  39    26.2  165.     0    10    40    40
##  2   210    60  1600  1100  39    28.8  170.     0    10    40    40
##  3   228    40  1080   930  38.5  23.2  186.    10    10    40    10
##  4   214    60  2580  1100  30.8  32.2  194.    10     0    30    10
##  5   187    80  3600  1750  31.5  34.4  170.     0     0    30    20
##  6   181    60  2250  1050  27.6  34.6  202.    10     0    30    10
##  7   143    80  3600  2450  32.1  35.7  158.     0     0    30    40
##  8   244    40  1467   620  36.9  31.9  200     10     0    40    20
##  9   228    40  1408   950  39.2  31.5  229     10     0    40    20
## 10   192    60  1676  1230  39.2  34.4  183     10     0    40    40
## # ℹ 22 more rows

Repeat the same operation over different elements of a list

When you have a grouping variable (factor)

# Regress mpg on wt over all cars. `data = .` routes the piped data frame to
# lm()'s `data` argument, since lm()'s first argument is the formula.
mtcars %>% lm(formula = mpg ~ wt, data = .)

## 
## Call:
## lm(formula = mpg ~ wt, data = .)
## 
## Coefficients:
## (Intercept)           wt  
##      37.285       -5.344

# List the distinct cylinder counts present in the data.
mtcars %>% distinct(cyl)

##                   cyl
## Mazda RX4           6
## Datsun 710          4
## Hornet Sportabout   8

# Fit mpg ~ wt separately for each cylinder count, tidy every fit (with
# confidence intervals), stack the results with the cylinder count stored in
# `cyl`, and keep only the slope row for `wt`.
reg_coef_tbl <- mtcars %>%
    split(.$cyl) %>%
    map(.f = ~lm(formula = mpg ~ wt, data = .x)) %>%
    map(.f = broom::tidy, conf.int = TRUE) %>%
    bind_rows(.id = "cyl") %>%
    filter(term == "wt")

# Plot the per-cylinder wt slope with its confidence interval. The estimate and
# both bounds are sign-flipped first, so after the flip `conf.low` holds the
# larger bound and `conf.high` the smaller one.
reg_coef_tbl %>%
    mutate(across(c(estimate, conf.low, conf.high), ~ -.x)) %>%
    ggplot(mapping = aes(x = estimate, y = cyl)) +
    geom_point() +
    geom_errorbar(mapping = aes(xmin = conf.low, xmax = conf.high))

Create your own

Choose either one of the two cases above and apply it to your data

# Mean of every numeric column, ignoring missing values (explicit lambda form).
# select(where(is.numeric)) replaces the superseded select_if(is.numeric).
rolling_stone %>%
    select(where(is.numeric)) %>%
    map_dbl(.x = ., .f = ~mean(x = .x, na.rm = TRUE))

##                rank_2003                rank_2012                rank_2020 
##               250.504000               250.500000               250.500000 
##             differential             release_year       weeks_on_billboard 
##               -12.322721              1982.872648                64.270979 
##  peak_billboard_position       spotify_popularity      artist_member_count 
##                61.193922                55.805810                 2.746356 
##    artist_birth_year_sum debut_album_release_year       ave_age_at_top_500 
##              5363.214286              1976.871720                29.609107 
##            years_between 
##                 5.928571

# Same column means; `na.rm = TRUE` is forwarded to mean() for each column.
# select(where(is.numeric)) replaces the superseded select_if(is.numeric).
rolling_stone %>%
    select(where(is.numeric)) %>%
    map_dbl(mean, na.rm = TRUE)

##                rank_2003                rank_2012                rank_2020 
##               250.504000               250.500000               250.500000 
##             differential             release_year       weeks_on_billboard 
##               -12.322721              1982.872648                64.270979 
##  peak_billboard_position       spotify_popularity      artist_member_count 
##                61.193922                55.805810                 2.746356 
##    artist_birth_year_sum debut_album_release_year       ave_age_at_top_500 
##              5363.214286              1976.871720                29.609107 
##            years_between 
##                 5.928571

# Trimmed mean (10% dropped from each end) of every numeric column, ignoring
# missing values, written with an explicit lambda.
# select(where(is.numeric)) replaces the superseded select_if(is.numeric).
rolling_stone %>%
    select(where(is.numeric)) %>%
    map_dbl(.x = ., .f = ~mean(x = .x, trim = .1, na.rm = TRUE))

##                rank_2003                rank_2012                rank_2020 
##               250.502500               250.500000               250.500000 
##             differential             release_year       weeks_on_billboard 
##               -10.667269              1981.853526                51.159389 
##  peak_billboard_position       spotify_popularity      artist_member_count 
##                51.260398                56.221374                 2.480000 
##    artist_birth_year_sum debut_album_release_year       ave_age_at_top_500 
##              4845.996364              1976.001818                27.780227 
##            years_between 
##                 4.067273

# Same trimmed means; `trim` and `na.rm` are forwarded to mean() for each column.
# select(where(is.numeric)) replaces the superseded select_if(is.numeric).
rolling_stone %>%
    select(where(is.numeric)) %>%
    map_dbl(mean, trim = .1, na.rm = TRUE)

##                rank_2003                rank_2012                rank_2020 
##               250.502500               250.500000               250.500000 
##             differential             release_year       weeks_on_billboard 
##               -10.667269              1981.853526                51.159389 
##  peak_billboard_position       spotify_popularity      artist_member_count 
##                51.260398                56.221374                 2.480000 
##    artist_birth_year_sum debut_album_release_year       ave_age_at_top_500 
##              4845.996364              1976.001818                27.780227 
##            years_between 
##                 4.067273

# Keep only the spotify_popularity column.
rolling_stone %>% select(spotify_popularity)

## # A tibble: 691 × 1
##    spotify_popularity
##                 <dbl>
##  1                 48
##  2                 50
##  3                 58
##  4                 62
##  5                 64
##  6                 73
##  7                 67
##  8                 47
##  9                 75
## 10                 52
## # ℹ 681 more rows

# Same selection with the pipe placeholder `.` passed explicitly as `.data`.
rolling_stone %>% select(.data = ., spotify_popularity)

## # A tibble: 691 × 1
##    spotify_popularity
##                 <dbl>
##  1                 48
##  2                 50
##  3                 58
##  4                 62
##  5                 64
##  6                 73
##  7                 67
##  8                 47
##  9                 75
## 10                 52
## # ℹ 681 more rows

# Min-max rescale `x`: (x - min_val) / (max(x) - min_val).
#
# x        Numeric vector to rescale. Missing values are ignored when taking
#          the maximum and remain NA in the result.
# min_val  Value that maps to 0. Defaults to the smallest non-missing value of
#          `x`, so `normalize(x)` scales `x` onto [0, 1] without the caller
#          having to look up the minimum first.
# Returns a numeric vector the same length as `x`. When max(x) equals
# `min_val` the denominator is zero and the result is not finite.
normalize <- function(x, min_val = min(x, na.rm = TRUE)) {
    (x - min_val) / (max(x, na.rm = TRUE) - min_val)
}
100 %>% normalize(min_val = 1)

## [1] 1

# Min-max scale every numeric column using that column's own minimum.
# select(where(is.numeric)) replaces the superseded select_if(is.numeric).
rolling_stone %>%
    select(where(is.numeric)) %>%
    map_dfr(.x = ., .f = ~normalize(x = .x, min_val = min(.x, na.rm = TRUE)))

## # A tibble: 691 × 13
##    rank_2003 rank_2012 rank_2020 differential release_year weeks_on_billboard
##        <dbl>     <dbl>     <dbl>        <dbl>        <dbl>              <dbl>
##  1    0.198     0.200     0.563         0.324       0                 0.0176 
##  2    0.427     0.431     0.910         0.264       0                NA      
##  3    0.108     0.110     0.663         0.227       0.0156            0.134  
##  4    0.611     0.615    NA             0.311       0.0156           NA      
##  5    0.0982    0.0982    0.453         0.329       0.0312            0.00541
##  6   NA        NA         0.0621        0.985       0.953             0.116  
##  7   NA         0.902     0.0641        0.984       0.797             0.232  
##  8    0.842     0.840    NA             0.427       0.0312           NA      
##  9   NA        NA         0.134         0.948       0.469             0.0351 
## 10    0.0220    0.0220    0.0601        0.489       0.0625           NA      
## # ℹ 681 more rows
## # ℹ 7 more variables: peak_billboard_position <dbl>, spotify_popularity <dbl>,
## #   artist_member_count <dbl>, artist_birth_year_sum <dbl>,
## #   debut_album_release_year <dbl>, ave_age_at_top_500 <dbl>,
## #   years_between <dbl>

# Min-max scale every numeric column using that column's own minimum. A fixed
# `min_val = 1` is only correct for columns whose minimum happens to be 1; for
# any other column (e.g. `differential`, which has negative values) it yields
# results outside [0, 1].
rolling_stone %>%
    select(where(is.numeric)) %>%
    map_dfr(function(col) normalize(col, min_val = min(col, na.rm = TRUE)))

## # A tibble: 691 × 13
##    rank_2003 rank_2012 rank_2020 differential release_year weeks_on_billboard
##        <dbl>     <dbl>     <dbl>        <dbl>        <dbl>              <dbl>
##  1    0.198     0.200     0.563         0.324       0                 0.0176 
##  2    0.427     0.431     0.910         0.264       0                NA      
##  3    0.108     0.110     0.663         0.227       0.0156            0.134  
##  4    0.611     0.615    NA             0.311       0.0156           NA      
##  5    0.0982    0.0982    0.453         0.329       0.0312            0.00541
##  6   NA        NA         0.0621        0.985       0.953             0.116  
##  7   NA         0.902     0.0641        0.984       0.797             0.232  
##  8    0.842     0.840    NA             0.427       0.0312           NA      
##  9   NA        NA         0.134         0.948       0.469             0.0351 
## 10    0.0220    0.0220    0.0601        0.489       0.0625           NA      
## # ℹ 681 more rows
## # ℹ 7 more variables: peak_billboard_position <dbl>, spotify_popularity <dbl>,
## #   artist_member_count <dbl>, artist_birth_year_sum <dbl>,
## #   debut_album_release_year <dbl>, ave_age_at_top_500 <dbl>,
## #   years_between <dbl>

# Regress Spotify popularity on the 2020 rank over all albums. `data = .`
# routes the piped tibble to lm()'s `data` argument.
rolling_stone %>% lm(formula = spotify_popularity ~ rank_2020, data = .)

## 
## Call:
## lm(formula = spotify_popularity ~ rank_2020, data = .)
## 
## Coefficients:
## (Intercept)    rank_2020  
##    63.34844     -0.02232

# List the distinct genre labels (NA included). Note that the result contains
# both "Blues/Blues Rock" and "Blues/Blues ROck", which differ only in
# capitalisation and are therefore treated as two separate genres.
rolling_stone %>% distinct(genre)

## # A tibble: 17 × 1
##    genre                              
##    <chr>                              
##  1 Big Band/Jazz                      
##  2 Rock n' Roll/Rhythm & Blues        
##  3 <NA>                               
##  4 Soul/Gospel/R&B                    
##  5 Hip-Hop/Rap                        
##  6 Blues/Blues Rock                   
##  7 Country/Folk/Country Rock/Folk Rock
##  8 Indie/Alternative Rock             
##  9 Punk/Post-Punk/New Wave/Power Pop  
## 10 Electronic                         
## 11 Funk/Disco                         
## 12 Latin                              
## 13 Hard Rock/Metal                    
## 14 Singer-Songwriter/Heartland Rock   
## 15 Blues/Blues ROck                   
## 16 Reggae                             
## 17 Afrobeat

# Per-genre regression of Spotify popularity on the 2020 rank, keeping only the
# slope row (with its confidence interval) for each genre.
reg_coef_tbl <- rolling_stone %>%
    # "Blues/Blues ROck" is a capitalisation variant of "Blues/Blues Rock";
    # merge it so that genre is not split into two groups.
    mutate(genre = sub("ROck", "Rock", genre, fixed = TRUE)) %>%
    # split() drops NA genres and lm() drops incomplete rows anyway; removing
    # them up front makes the per-genre counts below reflect usable albums.
    filter(!is.na(genre), !is.na(spotify_popularity), !is.na(rank_2020)) %>%
    # A slope with a confidence interval needs at least one residual degree of
    # freedom, i.e. three complete albums. Smaller genres produced NaN
    # intervals (qt() warnings) and an NA slope.
    group_by(genre) %>%
    filter(n() >= 3) %>%
    ungroup() %>%
    split(.$genre) %>%
    map(.f = ~lm(formula = spotify_popularity ~ rank_2020, data = .x)) %>%
    map(.f = broom::tidy, conf.int = TRUE) %>%
    bind_rows(.id = "genre") %>%
    filter(term == "rank_2020")

# Plot each genre's slope with its confidence interval. The estimate and both
# bounds are sign-flipped first -- presumably so that a positive value reads as
# "better (numerically lower) rank goes with higher popularity"; confirm intent.
reg_coef_tbl %>%
    mutate(estimate = -estimate,
           conf.low = -conf.low,
           conf.high = -conf.high) %>%
    ggplot(aes(x = estimate, y = genre)) +
    geom_point() +
    geom_errorbar(aes(xmin = conf.low, xmax = conf.high))

Module 13: Apply it to your data 12

Chapter 21 Iteration

Kyle Jasper

Import your data

Repeat the same operation over different columns of a data frame

Repeat the same operation over different elements of a list

Create your own