Module 13: Apply it to your data 12

Import your data

# Load the built-in mtcars data set into the workspace.
data("mtcars")
# Print a per-column summary (completeness, mean, sd, quantiles, histogram).
skimr::skim(mtcars)

Data summary
Name	mtcars
Number of rows	32
Number of columns	11
_______________________
Column type frequency:
numeric	11
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
mpg	1	20.09	6.03	10.40	15.43	19.20	22.80	33.90	▃▇▅▁▂
cyl	1	6.19	1.79	4.00	4.00	6.00	8.00	8.00	▆▁▃▁▇
disp	1	230.72	123.94	71.10	120.83	196.30	326.00	472.00	▇▃▃▃▂
hp	1	146.69	68.56	52.00	96.50	123.00	180.00	335.00	▇▇▆▃▁
drat	1	3.60	0.53	2.76	3.08	3.70	3.92	4.93	▇▃▇▅▁
wt	1	3.22	0.98	1.51	2.58	3.33	3.61	5.42	▃▃▇▁▂
qsec	1	17.85	1.79	14.50	16.89	17.71	18.90	22.90	▃▇▇▂▁
vs	1	0.44	0.50	0.00	0.00	0.00	1.00	1.00	▇▁▁▁▆
am	1	0.41	0.50	0.00	0.00	0.00	1.00	1.00	▇▁▁▁▆
gear	1	3.69	0.74	3.00	3.00	4.00	4.00	5.00	▇▁▆▁▂
carb	1	2.81	1.62	1.00	2.00	2.00	4.00	8.00	▇▂▅▁▁

# Show the distinct values of the cyl column.
distinct(mtcars, cyl)

##                   cyl
## Mazda RX4           6
## Datsun 710          4
## Hornet Sportabout   8

# Read wcmatches.csv straight from the TidyTuesday repository on GitHub.
myData <- readr::read_csv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-29/wcmatches.csv"
)

## Rows: 900 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (11): country, city, stage, home_team, away_team, outcome, win_conditio...
## dbl   (3): year, home_score, away_score
## date  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Repeat the same operation over different columns of a data frame

Case of numeric variables

# Apply mean() to every column of mtcars; the result is a named list with
# one element per column.
mtcars %>%

    # Optional experiment: add a character column before mapping.
    # mutate(char_var = "A") %>%

    map(.x = ., .f = mean) # short form: map(mean)

## $mpg
## [1] 20.09062
## 
## $cyl
## [1] 6.1875
## 
## $disp
## [1] 230.7219
## 
## $hp
## [1] 146.6875
## 
## $drat
## [1] 3.596563
## 
## $wt
## [1] 3.21725
## 
## $qsec
## [1] 17.84875
## 
## $vs
## [1] 0.4375
## 
## $am
## [1] 0.40625
## 
## $gear
## [1] 3.6875
## 
## $carb
## [1] 2.8125

Create your own function

# Multiply a value (or vector) by a constant.
#
# Arguments:
#   x       numeric value or vector to scale
#   factor  numeric multiplier
# Returns:
#   x * factor, computed with R's ordinary vectorised arithmetic.
multiply_by_factor <- function(x, factor) {
    x * factor
}

# Pipe a single number into the function; the piped value becomes x.
10 %>% multiply_by_factor(x = ., factor = 2)

## [1] 20

# Double every column, spelling the per-column call out as an anonymous function.
mtcars %>% map(function(col) multiply_by_factor(x = col, factor = 2))

## $mpg
##  [1] 42.0 42.0 45.6 42.8 37.4 36.2 28.6 48.8 45.6 38.4 35.6 32.8 34.6 30.4 20.8
## [16] 20.8 29.4 64.8 60.8 67.8 43.0 31.0 30.4 26.6 38.4 54.6 52.0 60.8 31.6 39.4
## [31] 30.0 42.8
## 
## $cyl
##  [1] 12 12  8 12 16 12 16  8  8 12 12 16 16 16 16 16 16  8  8  8  8 16 16 16 16
## [26]  8  8  8 16 12 16  8
## 
## $disp
##  [1] 320.0 320.0 216.0 516.0 720.0 450.0 720.0 293.4 281.6 335.2 335.2 551.6
## [13] 551.6 551.6 944.0 920.0 880.0 157.4 151.4 142.2 240.2 636.0 608.0 700.0
## [25] 800.0 158.0 240.6 190.2 702.0 290.0 602.0 242.0
## 
## $hp
##  [1] 220 220 186 220 350 210 490 124 190 246 246 360 360 360 410 430 460 132 104
## [20] 130 194 300 300 490 350 132 182 226 528 350 670 218
## 
## $drat
##  [1] 7.80 7.80 7.70 6.16 6.30 5.52 6.42 7.38 7.84 7.84 7.84 6.14 6.14 6.14 5.86
## [16] 6.00 6.46 8.16 9.86 8.44 7.40 5.52 6.30 7.46 6.16 8.16 8.86 7.54 8.44 7.24
## [31] 7.08 8.22
## 
## $wt
##  [1]  5.240  5.750  4.640  6.430  6.880  6.920  7.140  6.380  6.300  6.880
## [11]  6.880  8.140  7.460  7.560 10.500 10.848 10.690  4.400  3.230  3.670
## [21]  4.930  7.040  6.870  7.680  7.690  3.870  4.280  3.026  6.340  5.540
## [31]  7.140  5.560
## 
## $qsec
##  [1] 32.92 34.04 37.22 38.88 34.04 40.44 31.68 40.00 45.80 36.60 37.80 34.80
## [13] 35.20 36.00 35.96 35.64 34.84 38.94 37.04 39.80 40.02 33.74 34.60 30.82
## [25] 34.10 37.80 33.40 33.80 29.00 31.00 29.20 37.20
## 
## $vs
##  [1] 0 0 2 2 0 2 0 2 2 2 2 0 0 0 0 0 0 2 2 2 2 0 0 0 0 2 0 2 0 0 0 2
## 
## $am
##  [1] 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 0 0 0 0 0 2 2 2 2 2 2 2
## 
## $gear
##  [1]  8  8  8  6  6  6  6  8  8  8  8  6  6  6  6  6  6  8  8  8  6  6  6  6  6
## [26]  8 10 10 10 10 10  8
## 
## $carb
##  [1]  8  8  2  2  4  2  8  4  4  8  8  6  6  6  8  8  8  2  4  2  2  4  4  8  4
## [26]  2  4  4  8 12 16  4

# Same doubling as above, but the per-column results are combined column-wise
# into a tibble instead of being returned as a list.
mtcars %>% map_dfc(function(col) multiply_by_factor(x = col, factor = 2))

## # A tibble: 32 × 11
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  42      12  320    220  7.8   5.24  32.9     0     2     8     8
##  2  42      12  320    220  7.8   5.75  34.0     0     2     8     8
##  3  45.6     8  216    186  7.7   4.64  37.2     2     2     8     2
##  4  42.8    12  516    220  6.16  6.43  38.9     2     0     6     2
##  5  37.4    16  720    350  6.3   6.88  34.0     0     0     6     4
##  6  36.2    12  450    210  5.52  6.92  40.4     2     0     6     2
##  7  28.6    16  720    490  6.42  7.14  31.7     0     0     6     8
##  8  48.8     8  293.   124  7.38  6.38  40       2     0     8     4
##  9  45.6     8  282.   190  7.84  6.3   45.8     2     0     8     4
## 10  38.4    12  335.   246  7.84  6.88  36.6     2     0     8     8
## # ℹ 22 more rows

# Extra arguments after the function (here factor = 2) are passed on to it
# for every column.
map(.x = mtcars, .f = multiply_by_factor, factor = 2)

## $mpg
##  [1] 42.0 42.0 45.6 42.8 37.4 36.2 28.6 48.8 45.6 38.4 35.6 32.8 34.6 30.4 20.8
## [16] 20.8 29.4 64.8 60.8 67.8 43.0 31.0 30.4 26.6 38.4 54.6 52.0 60.8 31.6 39.4
## [31] 30.0 42.8
## 
## $cyl
##  [1] 12 12  8 12 16 12 16  8  8 12 12 16 16 16 16 16 16  8  8  8  8 16 16 16 16
## [26]  8  8  8 16 12 16  8
## 
## $disp
##  [1] 320.0 320.0 216.0 516.0 720.0 450.0 720.0 293.4 281.6 335.2 335.2 551.6
## [13] 551.6 551.6 944.0 920.0 880.0 157.4 151.4 142.2 240.2 636.0 608.0 700.0
## [25] 800.0 158.0 240.6 190.2 702.0 290.0 602.0 242.0
## 
## $hp
##  [1] 220 220 186 220 350 210 490 124 190 246 246 360 360 360 410 430 460 132 104
## [20] 130 194 300 300 490 350 132 182 226 528 350 670 218
## 
## $drat
##  [1] 7.80 7.80 7.70 6.16 6.30 5.52 6.42 7.38 7.84 7.84 7.84 6.14 6.14 6.14 5.86
## [16] 6.00 6.46 8.16 9.86 8.44 7.40 5.52 6.30 7.46 6.16 8.16 8.86 7.54 8.44 7.24
## [31] 7.08 8.22
## 
## $wt
##  [1]  5.240  5.750  4.640  6.430  6.880  6.920  7.140  6.380  6.300  6.880
## [11]  6.880  8.140  7.460  7.560 10.500 10.848 10.690  4.400  3.230  3.670
## [21]  4.930  7.040  6.870  7.680  7.690  3.870  4.280  3.026  6.340  5.540
## [31]  7.140  5.560
## 
## $qsec
##  [1] 32.92 34.04 37.22 38.88 34.04 40.44 31.68 40.00 45.80 36.60 37.80 34.80
## [13] 35.20 36.00 35.96 35.64 34.84 38.94 37.04 39.80 40.02 33.74 34.60 30.82
## [25] 34.10 37.80 33.40 33.80 29.00 31.00 29.20 37.20
## 
## $vs
##  [1] 0 0 2 2 0 2 0 2 2 2 2 0 0 0 0 0 0 2 2 2 2 0 0 0 0 2 0 2 0 0 0 2
## 
## $am
##  [1] 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 0 0 0 0 0 2 2 2 2 2 2 2
## 
## $gear
##  [1]  8  8  8  6  6  6  6  8  8  8  8  6  6  6  6  6  6  8  8  8  6  6  6  6  6
## [26]  8 10 10 10 10 10  8
## 
## $carb
##  [1]  8  8  2  2  4  2  8  4  4  8  8  6  6  6  8  8  8  2  4  2  2  4  4  8  4
## [26]  2  4  4  8 12 16  4

Repeat the same operation over different elements of a list

When you have a grouping variable (factor)

# Fit mpg ~ wt separately for each value of cyl and collect the wt
# coefficient (with its confidence interval) from every fit into one table.
reg_coeff_tbl <- mtcars %>%

    # One data frame per cyl value, returned as a named list
    split(.$cyl) %>%

    # Fit the same linear model on each piece
    map(function(cyl_df) lm(mpg ~ wt, data = cyl_df)) %>%

    # Turn each fitted model into a tidy coefficient table with interval bounds
    map(function(fit) broom::tidy(fit, conf.int = TRUE)) %>%

    # Stack the tables; the list names end up in a "cyl" column
    bind_rows(.id = "cyl") %>%

    # Keep only the slope rows and drop the intercepts
    filter(term == "wt")

# Plot the wt coefficient per cyl group with its confidence interval.
reg_coeff_tbl %>%

    # Flip the sign of the estimate and of both interval bounds before plotting
    mutate(across(c(estimate, conf.low, conf.high), function(v) -v)) %>%

    ggplot(mapping = aes(x = estimate, y = cyl)) +
    geom_point() +
    geom_errorbarh(mapping = aes(xmin = conf.low, xmax = conf.high)) +

    theme(legend.position = "none")

Create your own

Choose one of the two cases above and apply it to your data.

# Keep the listed columns and drop every row that has a missing value in
# any of them, then print the result.
myData1 <- na.omit(
    myData[, c(
        "year", "country", "city", "stage",
        "home_team", "away_team", "home_score", "away_score",
        "outcome", "winning_team", "losing_team",
        "date", "month", "dayofweek"
    )]
)
myData1

## # A tibble: 731 × 14
##     year country city    stage home_team away_team home_score away_score outcome
##    <dbl> <chr>   <chr>   <chr> <chr>     <chr>          <dbl>      <dbl> <chr>  
##  1  1930 Uruguay Montev… Grou… France    Mexico             4          1 H      
##  2  1930 Uruguay Montev… Grou… Belgium   United S…          0          3 A      
##  3  1930 Uruguay Montev… Grou… Brazil    Yugoslav…          1          2 A      
##  4  1930 Uruguay Montev… Grou… Peru      Romania            1          3 A      
##  5  1930 Uruguay Montev… Grou… Argentina France             1          0 H      
##  6  1930 Uruguay Montev… Grou… Chile     Mexico             3          0 H      
##  7  1930 Uruguay Montev… Grou… Bolivia   Yugoslav…          0          4 A      
##  8  1930 Uruguay Montev… Grou… Paraguay  United S…          0          3 A      
##  9  1930 Uruguay Montev… Grou… Uruguay   Peru               1          0 H      
## 10  1930 Uruguay Montev… Grou… Argentina Mexico             6          3 H      
## # ℹ 721 more rows
## # ℹ 5 more variables: winning_team <chr>, losing_team <chr>, date <date>,
## #   month <chr>, dayofweek <chr>

# Numeric-only subset (year and both scores) with incomplete rows removed,
# then print the result.
myData55 <- na.omit(
    myData[, c("year", "home_score", "away_score")]
)
myData55

## # A tibble: 900 × 3
##     year home_score away_score
##    <dbl>      <dbl>      <dbl>
##  1  1930          4          1
##  2  1930          0          3
##  3  1930          1          2
##  4  1930          1          3
##  5  1930          1          0
##  6  1930          3          0
##  7  1930          0          4
##  8  1930          0          3
##  9  1930          1          0
## 10  1930          6          3
## # ℹ 890 more rows

# Multiply a value (or vector) by a constant.
# Note: despite the name, the result is only doubled when factor = 2.
#
# Arguments:
#   x       numeric value or vector to scale
#   factor  numeric multiplier
# Returns:
#   x * factor, computed with R's ordinary vectorised arithmetic.
double_by_vector <- function(x, factor) {
    x * factor
}
# Quick check on a single number; the piped value becomes x.
10 %>% double_by_vector(x = ., factor = 2)

## [1] 20

# Second check with a different input value.
100 %>% double_by_vector(x = ., factor = 2)

## [1] 200

# Multiply every column of myData55 by 10 and return the result as a tibble.
# Each mapped result is a whole column, so the pieces are combined
# column-wise with map_dfc(), matching the mtcars example above.
myData55 %>% map_dfc(double_by_vector, factor = 10)

## # A tibble: 900 × 3
##     year home_score away_score
##    <dbl>      <dbl>      <dbl>
##  1 19300         40         10
##  2 19300          0         30
##  3 19300         10         20
##  4 19300         10         30
##  5 19300         10          0
##  6 19300         30          0
##  7 19300          0         40
##  8 19300          0         30
##  9 19300         10          0
## 10 19300         60         30
## # ℹ 890 more rows

# Home scores for three selected teams.
myData4 <- myData1 %>%

    # Keep only matches where one of these teams is listed as the home team
    filter(home_team %in% c("Brazil", "Uruguay", "Argentina")) %>%
    select(home_team, home_score) %>%

    # Reshape the single home_team column: the team name moves to "hometeam"
    # and the former column name is stored in "home_or_away"
    pivot_longer(
        cols = home_team,
        names_to = "home_or_away",
        values_to = "hometeam"
    )

# Per-team total and standard deviation of home scores.
standard_deviation <- summarise(
    group_by(myData4, hometeam),
    total_home_score = sum(home_score),
    stddev_home_score = sd(home_score)
)

print(standard_deviation)

## # A tibble: 3 × 3
##   hometeam  total_home_score stddev_home_score
##   <chr>                <dbl>             <dbl>
## 1 Argentina              110              1.63
## 2 Brazil                 170              1.51
## 3 Uruguay                 30              1.59

# Per-team summary of home scores: total, mean, standard deviation and the
# number of matches each statistic is based on.
myData5 <- myData4 %>%
    group_by(hometeam) %>%
    summarise(total_home_score = sum(home_score),
              mean_home_score = mean(home_score),
              stddev_home_score = sd(home_score),
              # Sample size per team; it has to be captured here, while the
              # data is still grouped by team.
              n_matches = n())

# 95% t-based confidence interval for each team's mean home score.
# The sample size must be the per-team match count: calling n() in this
# ungrouped mutate() would return the number of rows of the summary table
# (one per team) and give every team the same, wrong, degrees of freedom.
myData6 <- myData5 %>%
    mutate(margin = qt(1 - 0.05 / 2, df = n_matches - 1) *
               (stddev_home_score / sqrt(n_matches)),
           conf_int_lower = mean_home_score - margin,
           conf_int_upper = mean_home_score + margin) %>%
    # margin is only a helper for the two bounds
    select(-margin)

# Plot each team's mean home score with its confidence interval.
myData6 %>%
    ggplot(aes(x = mean_home_score, y = hometeam)) +
    geom_point() +
    # The end caps of a horizontal error bar are sized with `height`;
    # `width` is not a parameter here and was ignored with a warning.
    geom_errorbarh(aes(xmin = conf_int_lower, xmax = conf_int_upper), height = 0.4) +

    labs(x = "Goals", y = "Team", title = "Mean Home Scores with Confidence Intervals")

## Warning in geom_errorbarh(aes(xmin = conf_int_lower, xmax = conf_int_upper), :
## Ignoring unknown parameters: `width`

Module 13: Apply it to your data 12

Chapter 21 Iteration

Sondre Asheim

Import your data

Repeat the same operation over different columns of a data frame

Repeat the same operation over different elements of a list

Create your own