library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(rvest)
## Warning: package 'rvest' was built under R version 4.0.5
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
# Print-template stats query on ESPNcricinfo (class=10, team=289, batting).
site <- "http://stats.espncricinfo.com/ci/engine/stats/index.html?class=10;page=1;team=289;template=results;type=batting;wrappertype=print"
# Fetch the page and parse it into an HTML document, then echo the parsed
# document so its top-level nodes can be inspected.
raw_html <- site %>%
  read_html()
raw_html
## {html_document}
## <html xmlns="http://www.w3.org/1999/xhtml">
## [1] <head>\n<title>Batting records | Women's Twenty20 Internationals | Cricin ...
## [2] <body onload="return guruStart();">\n<div id="ciMainContainer">\n <div id ...
# Convert the tables found in the parsed page; `tables` is indexed as a list
# below, one entry per table.
tables <- raw_html %>%
  html_table(fill = TRUE)
# The batting records are picked by position: the third table on the page.
ausw_t20 <- tables[[3]]
# Quick overview of column names, types and leading values.
glimpse(ausw_t20)
## Rows: 50
## Columns: 15
## $ Player <chr> "MM Lanning", "AJ Healy", "BL Mooney", "EJ Villani", "AJ Blackw~
## $ Span <chr> "2010-2021", "2010-2021", "2016-2021", "2009-2018", "2005-2017"~
## $ Mat <int> 110, 118, 58, 62, 95, 123, 64, 46, 73, 40, 36, 54, 85, 15, 37, ~
## $ Inns <chr> "104", "103", "55", "58", "81", "74", "55", "40", "52", "40", "~
## $ NO <chr> "23", "17", "12", "10", "19", "31", "10", "6", "22", "2", "2", ~
## $ Runs <chr> "2914", "2121", "1554", "1369", "1314", "1243", "941", "813", "~
## $ HS <chr> "133*", "148*", "117*", "90*", "61", "60*", "68*", "93", "69*",~
## $ Ave <chr> "35.97", "24.66", "36.13", "28.52", "21.19", "28.90", "20.91", ~
## $ BF <chr> "2520", "1628", "1269", "1158", "1414", "1173", "875", "620", "~
## $ SR <chr> "115.63", "130.28", "122.45", "118.22", "92.92", "105.96", "107~
## $ `100` <chr> "2", "1", "2", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0"~
## $ `50` <chr> "13", "12", "10", "12", "1", "4", "3", "4", "3", "2", "3", "1",~
## $ `0` <chr> "1", "10", "3", "4", "5", "4", "4", "4", "1", "2", "2", "4", "2~
## $ `4s` <chr> "345", "274", "200", "177", "87", "106", "82", "91", "64", "80"~
## $ `6s` <chr> "39", "41", "8", "12", "1", "23", "20", "30", "10", "11", "3", ~
Modification 1: Remove empty characters. Change the HS column to replace any occurrence of "*" with an empty character "". (Hint: when using the function str_replace() for the character "*", the pattern will be "\\*".)
# Drop the asterisk from high-score strings (e.g. "133*" becomes "133").
# The asterisk is escaped because "*" is a regex quantifier; str_remove()
# deletes the first match in each string.
ausw_t20 <- mutate(
  ausw_t20,
  HS = str_remove(HS, "\\*")
)
Modification 2: Make the data long form. Modify the ausw_t20 data so that it is in long form by pivoting all the variables except for Player and Span.
# Reshape to long form: one row per player and statistic.
# Every pivoted column is converted to character first, so the integer Mat
# column can be stacked together with the character-typed statistics.
ausw_t20_long <- pivot_longer(
  ausw_t20,
  cols = Mat:`6s`,
  names_to = "statistic",
  values_to = "value",
  values_transform = list(value = as.character)
)
Modify the value column so the pattern “-” is replaced with an empty character (use str_replace() again). Then, change the value column to be a numeric variable with the as.numeric() function.
# Turn the stacked statistics into numbers.
# A cell that consists solely of "-" (presumably the site's placeholder for
# "no value") is blanked, and as.numeric("") then yields NA without a warning.
# The pattern is anchored with ^ and $ so a hyphen inside a real value (for
# example a minus sign) is never stripped and silently turned into a
# different number.
ausw_t20_long <-
  ausw_t20_long %>%
  mutate(value = as.numeric(str_remove(value, "^-$")))
Change the data set back to wide form, so that each row represents a player from the cricket team.
# Spread the statistics back out to one numeric column each, one row per
# Player. Only Player is kept as the identifier, so Span does not appear in
# the widened result.
ausw_t20 <- pivot_wider(
  ausw_t20_long,
  id_cols = Player,
  names_from = statistic,
  values_from = value
)
# Echo the widened tibble.
ausw_t20
## # A tibble: 50 x 14
## Player Mat Inns NO Runs HS Ave BF SR `100` `50` `0`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 MM Lanning 110 104 23 2914 133 36.0 2520 116. 2 13 1
## 2 AJ Healy 118 103 17 2121 148 24.7 1628 130. 1 12 10
## 3 BL Mooney 58 55 12 1554 117 36.1 1269 122. 2 10 3
## 4 EJ Villani 62 58 10 1369 90 28.5 1158 118. 0 12 4
## 5 AJ Blackwe~ 95 81 19 1314 61 21.2 1414 92.9 0 1 5
## 6 EA Perry 123 74 31 1243 60 28.9 1173 106. 0 4 4
## 7 JE Duffin 64 55 10 941 68 20.9 875 108. 0 3 4
## 8 A Gardner 46 40 6 813 93 23.9 620 131. 0 4 4
## 9 RL Haynes 73 52 22 793 69 26.4 675 117. 0 3 1
## 10 LJ Poulton 40 40 2 784 61 20.6 752 104. 0 2 2
## # ... with 40 more rows, and 2 more variables: 4s <dbl>, 6s <dbl>
library(ggplot2)
# Scatter plot with Ave on the x-axis and SR on the y-axis, one point per row.
# Rows with a missing Ave or SR are dropped by geom_point() with a warning.
ausw_t20 %>%
  ggplot(mapping = aes(x = Ave, y = SR)) +
  geom_point()
## Warning: Removed 5 rows containing missing values (geom_point).