R Notebook

library(htmltab)

# Download the gentlemen's singles champions table from Wikipedia.
linkPage <- "https://en.wikipedia.org/wiki/List_of_Wimbledon_gentlemen%27s_singles_champions"
# XPath: a <table> that is the fourth table child of its parent <div>.
# Fixed from "///div/table[4]": a triple slash is not valid XPath 1.0
# abbreviated syntax, and the ladies' section of this notebook uses "//".
linkTabla <- "//div/table[4]"
# NOTE(review): the name `open` shadows base::open() for variable lookup; it is
# kept because the rest of the notebook refers to it.
open <- htmltab(doc = linkPage, which = linkTabla)

# Column names of the scraped table (note that "Country" appears twice:
# once for the champion and once for the runner-up).
names(open)

## [1] "Year"               "Country"            "Champion"          
## [4] "Country"            "Runner-up"          "Score in the final"

# Compact overview of the column types and their first values.
str(open)

## 'data.frame':    52 obs. of  6 variables:
##  $ Year              : chr  "1968" "1969" "1970" "1971" ...
##  $ Country           : chr  " AUS" " AUS" " AUS" " AUS" ...
##  $ Champion          : chr  "Rod Laver" "Rod Laver" "John Newcombe" "John Newcombe" ...
##  $ Country           : chr  " AUS" " AUS" " AUS" " USA" ...
##  $ Runner-up         : chr  "Tony Roche" "John Newcombe" "Ken Rosewall" "Stan Smith" ...
##  $ Score in the final: chr  "6–3, 6–4, 6–2" "6–4, 5–7, 6–4, 6–4" "5–7, 6–3, 6–2, 3–6, 6–1" "6–3, 5–7, 2–6, 6–4, 6–4" ...

# Keep only the second column (the champion's country) as a one-column
# data frame; selecting by position sidesteps the duplicated "Country" name.
open <- open[2]

names(open)

## [1] "Country"

# Check the reduced data frame: a single character column whose values
# carry leading whitespace (e.g. " AUS").
str(open)

## 'data.frame':    52 obs. of  1 variable:
##  $ Country: chr  " AUS" " AUS" " AUS" " AUS" ...

# Print the whole data frame, then only its first rows.
open

head(open)

library(questionr)
library(magrittr)

# Frequency table of the champions' countries, including cumulative
# percentages. `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned.
NOMINAL <- freq(open$Country, cum = TRUE) %>% data.frame()
# Move the country labels from the row names into a regular `variable`
# column so they can be mapped to a plot axis.
NOMINAL <- data.frame(variable = row.names(NOMINAL), NOMINAL, row.names = NULL)

# Display the frequency table that was just built (this line previously
# re-printed the raw `open` data instead of the table).
NOMINAL

library(ggplot2)

# Bar chart of counts per country: `variable` on the x axis, `n` as the
# bar height (stat = "identity" plots the precomputed counts as they are).
base <- ggplot(NOMINAL, aes(x = variable, y = n))
bar1 <- base + geom_bar(stat = "identity")

bar1

# Title, axis labels and caption for the bar chart.
text1 <- "Datos en la tabla estadística - países"
text2 <- "Countries"
text3 <- "Conteo"
text4 <- "Fuente: Wikipedia"

# Add the labels on top of the unlabelled chart and display the result.
bar2 <- bar1 +
  labs(title = text1, x = text2, y = text3, caption = text4)
bar2

library(qcc)

## Package 'qcc' version 2.7

## Type 'citation("qcc")' for citing this R package in publications.

# Pareto chart of the country counts; `cumperc` supplies the percentage
# values (0, 50, 80, 100) used as marks on the cumulative-percentage axis.
pareto.chart(table(open$Country),cumperc = c(0,50,80,100))

##        
## Pareto chart analysis for table(open$Country)
##          Frequency  Cum.Freq. Percentage Cum.Percent.
##    USA   15.000000  15.000000  28.846154    28.846154
##     SUI   8.000000  23.000000  15.384615    44.230769
##    SWE    7.000000  30.000000  13.461538    57.692308
##    AUS    6.000000  36.000000  11.538462    69.230769
##    SRB    5.000000  41.000000   9.615385    78.846154
##    FRG    3.000000  44.000000   5.769231    84.615385
##    ESP    2.000000  46.000000   3.846154    88.461538
##    GBR    2.000000  48.000000   3.846154    92.307692
##    CRO    1.000000  49.000000   1.923077    94.230769
##    GER    1.000000  50.000000   1.923077    96.153846
##    NED    1.000000  51.000000   1.923077    98.076923
##    TCH    1.000000  52.000000   1.923077   100.000000

El 80% acumulado se alcanza entre Serbia (SRB, 78.8%) y Alemania Federal (FRG, 84.6%)

library(DescTools)

# Most frequent country among the champions.
Mode(open$Country)

## [1] " USA"

USA es el país que más se repite

# Variation ratio: share of observations that fall outside the modal
# category (1 minus the largest relative frequency).
dataTable <- table(open$Country)
1 - max(dataTable / sum(dataTable))

## [1] 0.7115385

El 71% de los casos no corresponde a la moda; es decir, la moda solo representa el 29%

# Herfindahl concentration index computed on the country counts.
dataTable <- table(open$Country)
Herfindahl(dataTable)

## [1] 0.1553254

La moda no es significativa

MUJERES

library(htmltab)

# Download the ladies' singles champions table from Wikipedia; the XPath
# picks a <table> that is the fourth table child of its parent <div>.
linkPage <- "https://en.wikipedia.org/wiki/List_of_Wimbledon_ladies%27_singles_champions"
linkTabla <- "//div/table[4]"
openn <- htmltab(doc = linkPage, which = linkTabla)

# Column names of the scraped table (note that "Country" appears twice:
# once for the champion and once for the runner-up).
names(openn)

## [1] "Year"               "Country"            "Champion"          
## [4] "Country"            "Runner-up"          "Score in the final"

# Compact overview of the column types and their first values.
str(openn)

## 'data.frame':    52 obs. of  6 variables:
##  $ Year              : chr  "1968" "1969" "1970" "1971" ...
##  $ Country           : chr  " USA" " GBR" " AUS" " AUS" ...
##  $ Champion          : chr  "Billie Jean King" "Ann Jones" "Margaret Court" "Evonne Goolagong" ...
##  $ Country           : chr  " AUS" " USA" " USA" " AUS" ...
##  $ Runner-up         : chr  "Judy Tegart" "Billie Jean King" "Billie Jean King" "Margaret Court" ...
##  $ Score in the final: chr  "9–7, 7–5" "3–6, 6–3, 6–2" "14–12, 11–9" "6–4, 6–1" ...

# Keep only the second column (the champion's country) as a one-column
# data frame; selecting by position sidesteps the duplicated "Country" name.
openn <- openn[2]

names(openn)

## [1] "Country"

# Check the reduced data frame: a single character column whose values
# carry leading whitespace (e.g. " USA").
str(openn)

## 'data.frame':    52 obs. of  1 variable:
##  $ Country: chr  " USA" " GBR" " AUS" " AUS" ...

# Print the whole data frame, then only its first rows.
openn

head(openn)

library(questionr)
library(magrittr)

# Frequency table of the champions' countries, including cumulative
# percentages. `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned.
NOMINALL <- freq(openn$Country, cum = TRUE) %>% data.frame()
# Move the country labels from the row names into a regular `variable`
# column so they can be mapped to a plot axis.
NOMINALL <- data.frame(
  variable = row.names(NOMINALL),
  NOMINALL,
  row.names = NULL
)

# Display the frequency table for the ladies' champions.
NOMINALL

library(ggplot2)

# Bar chart of counts per country: `variable` on the x axis, `n` as the
# bar height (stat = "identity" plots the precomputed counts as they are).
# This reuses, and therefore overwrites, the `base` and `bar1` objects.
base <- ggplot(NOMINALL, aes(x = variable, y = n))
bar1 <- base + geom_bar(stat = "identity")

bar1

# Title, axis labels and caption for the ladies' bar chart.
text1 <- "Datos en la tabla estadística - países II"
text2 <- "países"
text3 <- "Conteo"
text4 <- "Fuente: Wikipedia"

# Add the labels on top of the unlabelled chart and display the result.
bar2 <- bar1 +
  labs(title = text1, x = text2, y = text3, caption = text4)
bar2

library(qcc)

# Pareto chart of the country counts; `cumperc` supplies the percentage
# values (0, 50, 80, 100) used as marks on the cumulative-percentage axis.
pareto.chart(table(openn$Country),cumperc = c(0,50,80,100))

##        
## Pareto chart analysis for table(openn$Country)
##          Frequency  Cum.Freq. Percentage Cum.Percent.
##    USA   29.000000  29.000000  55.769231    55.769231
##    GER    6.000000  35.000000  11.538462    67.307692
##    AUS    3.000000  38.000000   5.769231    73.076923
##    CZE    3.000000  41.000000   5.769231    78.846154
##    ESP    2.000000  43.000000   3.846154    82.692308
##    FRA    2.000000  45.000000   3.846154    86.538462
##    FRG    2.000000  47.000000   3.846154    90.384615
##    GBR    2.000000  49.000000   3.846154    94.230769
##     SUI   1.000000  50.000000   1.923077    96.153846
##    ROU    1.000000  51.000000   1.923077    98.076923
##    RUS    1.000000  52.000000   1.923077   100.000000

El 80% acumulado se alcanza entre la República Checa (CZE, 78.8%) y España (ESP, 82.7%)

library(DescTools)

# Most frequent country among the champions.
Mode(openn$Country)

## [1] " USA"

El país que más se repite es USA

# Variation ratio: share of observations that fall outside the modal
# category (1 minus the largest relative frequency).
dataTable <- table(openn$Country)
1 - max(dataTable / sum(dataTable))

## [1] 0.4423077

El 44% de los casos no corresponde a la moda; es decir, la moda representa el 56%

# Herfindahl concentration index computed on the country counts.
dataTable <- table(openn$Country)
Herfindahl(dataTable)

## [1] 0.3380178

La moda se diferencia de los demás