library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rvest)
## Warning: package 'rvest' was built under R version 4.0.5
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
# Print-view URL of the ESPNcricinfo batting-records query used as the data
# source. NOTE(review): team=289 is presumably Australia Women -- confirm.
site <- "http://stats.espncricinfo.com/ci/engine/stats/index.html?class=10;page=1;team=289;template=results;type=batting;wrappertype=print"

# Download and parse the page, then display the parsed document.
raw_html <- site %>%
  read_html()
print(raw_html)
## {html_document}
## <html xmlns="http://www.w3.org/1999/xhtml">
## [1] <head>\n<title>Batting records | Women's Twenty20 Internationals | Cricin ...
## [2] <body onload="return guruStart();">\n<div id="ciMainContainer">\n <div id ...
# Convert every <table> element of the parsed page into a data frame.
tables <- raw_html %>%
  html_table(fill = TRUE)

# The batting records are taken from the third table on the page; the index is
# positional, so it depends on the current page layout.
ausw_t20 <- tables[[3]]

# Inspect column names and types before cleaning.
ausw_t20 %>%
  glimpse()
## Rows: 50
## Columns: 15
## $ Player <chr> "MM Lanning", "AJ Healy", "BL Mooney", "EJ Villani", "AJ Blackw~
## $ Span   <chr> "2010-2021", "2010-2021", "2016-2021", "2009-2018", "2005-2017"~
## $ Mat    <int> 110, 118, 58, 62, 95, 123, 64, 46, 73, 40, 36, 54, 85, 15, 37, ~
## $ Inns   <chr> "104", "103", "55", "58", "81", "74", "55", "40", "52", "40", "~
## $ NO     <chr> "23", "17", "12", "10", "19", "31", "10", "6", "22", "2", "2", ~
## $ Runs   <chr> "2914", "2121", "1554", "1369", "1314", "1243", "941", "813", "~
## $ HS     <chr> "133*", "148*", "117*", "90*", "61", "60*", "68*", "93", "69*",~
## $ Ave    <chr> "35.97", "24.66", "36.13", "28.52", "21.19", "28.90", "20.91", ~
## $ BF     <chr> "2520", "1628", "1269", "1158", "1414", "1173", "875", "620", "~
## $ SR     <chr> "115.63", "130.28", "122.45", "118.22", "92.92", "105.96", "107~
## $ `100`  <chr> "2", "1", "2", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0"~
## $ `50`   <chr> "13", "12", "10", "12", "1", "4", "3", "4", "3", "2", "3", "1",~
## $ `0`    <chr> "1", "10", "3", "4", "5", "4", "4", "4", "1", "2", "2", "4", "2~
## $ `4s`   <chr> "345", "274", "200", "177", "87", "106", "82", "91", "64", "80"~
## $ `6s`   <chr> "39", "41", "8", "12", "1", "23", "20", "30", "10", "11", "3", ~

Modification 1: Remove empty characters. Change the HS column so that any occurrence of "*" is replaced with an empty string "". (Hint: when using the function str_replace() for the character "*", the pattern will be "\\*", because "*" is a special character in regular expressions and must be escaped.)

# Strip the "*" marker from the highest-score column. The asterisk is a regex
# metacharacter, so it is escaped; only the first match per value is replaced.
ausw_t20 <- mutate(
  ausw_t20,
  HS = str_replace(HS, "\\*", "")
)

Modification 2: Make the data long form. Modify the ausw_t20 data so that it is in long form by pivoting all the variables except for Player and Span.

# Reshape to long form: one row per player/statistic pair, keeping Player and
# Span as identifiers. Mat is an integer column while the other statistics are
# character, so every value is cast to character to let them share one column.
ausw_t20_long <- pivot_longer(
  ausw_t20,
  cols = !c(Player, Span),
  names_to = "statistic",
  values_to = "value",
  values_transform = list(value = as.character)
)

Modify the value column so that the pattern "-" is replaced with an empty string (use str_replace() again). Then change the value column to a numeric variable with the as.numeric() function.

# Clean the value column in two steps: first blank out the "-" placeholder,
# then convert to numeric. The resulting empty strings become NA.
ausw_t20_long <- ausw_t20_long %>%
  mutate(
    value = str_replace(value, "-", ""),
    value = as.numeric(value)
  )

Change the data set back to wide form, with each row now representing a player from the cricket team.

# Return to wide form with one row per player and one numeric column per
# statistic. Only Player is used as the identifier, so the Span column is not
# carried over into the result.
ausw_t20 <- pivot_wider(
  ausw_t20_long,
  id_cols = Player,
  names_from = statistic,
  values_from = value
)
print(ausw_t20)
## # A tibble: 50 x 14
##    Player        Mat  Inns    NO  Runs    HS   Ave    BF    SR `100`  `50`   `0`
##    <chr>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 MM Lanning    110   104    23  2914   133  36.0  2520 116.      2    13     1
##  2 AJ Healy      118   103    17  2121   148  24.7  1628 130.      1    12    10
##  3 BL Mooney      58    55    12  1554   117  36.1  1269 122.      2    10     3
##  4 EJ Villani     62    58    10  1369    90  28.5  1158 118.      0    12     4
##  5 AJ Blackwe~    95    81    19  1314    61  21.2  1414  92.9     0     1     5
##  6 EA Perry      123    74    31  1243    60  28.9  1173 106.      0     4     4
##  7 JE Duffin      64    55    10   941    68  20.9   875 108.      0     3     4
##  8 A Gardner      46    40     6   813    93  23.9   620 131.      0     4     4
##  9 RL Haynes      73    52    22   793    69  26.4   675 117.      0     3     1
## 10 LJ Poulton     40    40     2   784    61  20.6   752 104.      0     2     2
## # ... with 40 more rows, and 2 more variables: 4s <dbl>, 6s <dbl>
library(ggplot2)

# Scatter plot of batting average (Ave) against strike rate (SR), one point
# per player. Rows with a missing Ave or SR are dropped by geom_point() with a
# warning.
ausw_t20 %>%
  ggplot(mapping = aes(x = Ave, y = SR)) +
  geom_point()
## Warning: Removed 5 rows containing missing values (geom_point).