SEGUNDA PRÁCTICA CALIFICADA

I. Guarda el link de wiki en un objeto de R, aqui el objeto se llama LINKtennisMen: “https://en.wikipedia.org/wiki/List_of_Wimbledon_gentlemen%27s_singles_champions”

Visita ese link y ve a la tabla de los ganadores de la “Open Era”. Analiza la columna Country del ganador (la segunda):

Calcula
Produce los gráficos
Calcula los estadísticos

Resuelvo 1:

# Scrape the table of Open Era gentlemen's singles champions from Wikipedia.
library(htmltab)
linkPage <- "https://en.wikipedia.org/wiki/List_of_Wimbledon_gentlemen%27s_singles_champions"
# XPath of the table to extract. The abbreviated descendant axis is "//";
# the previous "///" had a stray slash and differed from the expression
# used for the ladies' table later in this document.
linkTabla <- "//div/table[4]"
oper <- htmltab(doc = linkPage, which = linkTabla)

names(oper)

## [1] "Year"               "Country"            "Champion"          
## [4] "Country"            "Runner-up"          "Score in the final"

str(oper)

## 'data.frame':    52 obs. of  6 variables:
##  $ Year              : chr  "1968" "1969" "1970" "1971" ...
##  $ Country           : chr  " AUS" " AUS" " AUS" " AUS" ...
##  $ Champion          : chr  "Rod Laver" "Rod Laver" "John Newcombe" "John Newcombe" ...
##  $ Country           : chr  " AUS" " AUS" " AUS" " USA" ...
##  $ Runner-up         : chr  "Tony Roche" "John Newcombe" "Ken Rosewall" "Stan Smith" ...
##  $ Score in the final: chr  "6–3, 6–4, 6–2" "6–4, 5–7, 6–4, 6–4" "5–7, 6–3, 6–2, 3–6, 6–1" "6–3, 5–7, 2–6, 6–4, 6–4" ...

# Keep only the second column: the champion's country. The fourth column is
# also named "Country" but sits next to "Runner-up", so select by position.
oper <- oper[2]

names(oper)

## [1] "Country"

str(oper)

## 'data.frame':    52 obs. of  1 variable:
##  $ Country: chr  " AUS" " AUS" " AUS" " AUS" ...

oper

##    Country
## 2      AUS
## 3      AUS
## 4      AUS
## 5      AUS
## 6      USA
## 7      TCH
## 8      USA
## 9      USA
## 10     SWE
## 11     SWE
## 12     SWE
## 13     SWE
## 14     SWE
## 15     USA
## 16     USA
## 17     USA
## 18     USA
## 19     FRG
## 20     FRG
## 21     AUS
## 22     SWE
## 23     FRG
## 24     SWE
## 25     GER
## 26     USA
## 27     USA
## 28     USA
## 29     USA
## 30     NED
## 31     USA
## 32     USA
## 33     USA
## 34     USA
## 35     CRO
## 36     AUS
## 37     SUI
## 38     SUI
## 39     SUI
## 40     SUI
## 41     SUI
## 42     ESP
## 43     SUI
## 44     ESP
## 45     SRB
## 46     SUI
## 47     GBR
## 48     SRB
## 49     SRB
## 50     GBR
## 51     SUI
## 52     SRB
## 53     SRB

head(oper)

##   Country
## 2     AUS
## 3     AUS
## 4     AUS
## 5     AUS
## 6     USA
## 7     TCH

# Frequency table of the champions' countries: counts, percentages and
# cumulative percentages.
library(questionr)
library(magrittr)
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
NomOper <- freq(oper$Country, cum = TRUE) %>% data.frame()
# Move the country codes from the row names into a regular column "variable".
NomOper <- data.frame(variable = row.names(NomOper), NomOper, row.names = NULL)

Compruebo….

NomOper

##    variable  n   X. val. X.cum val.cum
## 1       SUI  8 15.4 15.4  15.4    15.4
## 2       AUS  6 11.5 11.5  26.9    26.9
## 3       CRO  1  1.9  1.9  28.8    28.8
## 4       ESP  2  3.8  3.8  32.7    32.7
## 5       FRG  3  5.8  5.8  38.5    38.5
## 6       GBR  2  3.8  3.8  42.3    42.3
## 7       GER  1  1.9  1.9  44.2    44.2
## 8       NED  1  1.9  1.9  46.2    46.2
## 9       SRB  5  9.6  9.6  55.8    55.8
## 10      SWE  7 13.5 13.5  69.2    69.2
## 11      TCH  1  1.9  1.9  71.2    71.2
## 12      USA 15 28.8 28.8 100.0   100.0

Produciendo los gráficos:

library(ggplot2)
# Bar chart of the count (n) for each country in the frequency table.
base <- ggplot(NomOper, aes(x = variable, y = n))
# stat = "identity" plots the precomputed counts instead of re-counting rows.
bar1 <- base + geom_bar(stat = "identity")
bar1

# Title, axis labels and caption for the bar chart.
text1 <- "Country Open era"
text2 <- "Países"
text3 <- "Conteo"
text4 <- "Fuente: Wikipedia"

# Attach the labels to the existing chart and display the result.
bar2 <- bar1 +
  labs(title = text1, x = text2, y = text3, caption = text4)
bar2

library (qcc)

## Package 'qcc' version 2.7

## Type 'citation("qcc")' for citing this R package in publications.

# Pareto chart of the champion counts per country. The call is kept verbatim
# because the printed analysis header echoes the argument expression.
# cumperc presumably sets the cumulative-percentage reference marks
# (0, 50, 80, 100) - confirm against the qcc documentation.
pareto.chart(table(oper$Country),cumperc = c(0,50,80,100))

##        
## Pareto chart analysis for table(oper$Country)
##          Frequency  Cum.Freq. Percentage Cum.Percent.
##    USA   15.000000  15.000000  28.846154    28.846154
##     SUI   8.000000  23.000000  15.384615    44.230769
##    SWE    7.000000  30.000000  13.461538    57.692308
##    AUS    6.000000  36.000000  11.538462    69.230769
##    SRB    5.000000  41.000000   9.615385    78.846154
##    FRG    3.000000  44.000000   5.769231    84.615385
##    ESP    2.000000  46.000000   3.846154    88.461538
##    GBR    2.000000  48.000000   3.846154    92.307692
##    CRO    1.000000  49.000000   1.923077    94.230769
##    GER    1.000000  50.000000   1.923077    96.153846
##    NED    1.000000  51.000000   1.923077    98.076923
##    TCH    1.000000  52.000000   1.923077   100.000000

Conclusión: 3. calculando los estadísticos

library(DescTools)

MODA

# Most frequent country among the champions.
# NOTE(review): the result prints with leading whitespace (" USA"), so the
# scraped values are not trimmed; consider trimws() right after scraping.
Mode(oper$Country)

## [1] " USA"

USA es el país que se repite más entre los campeones

Dispersión

# Variation ratio: share of observations that fall outside the modal country.
dataTable <- table(oper$Country)
1 - max(prop.table(dataTable))

## [1] 0.7115385

La moda (USA) no representa al 71% de los casos; es decir, el 71% de los títulos corresponde a países distintos de la moda.

Concentracion: Herfindahl- Hirschman

# Herfindahl-Hirschman concentration index computed on the country counts.
dataTable <- table(oper$Country)
Herfindahl(dataTable)

## [1] 0.1553254

< 0.01 : indica que la moda no es significativa, las categorias tienen pesos similares.
< 0.15 : indica que la moda no es significativa, varias categorias tienen pesos similares.
entre 0.15 - 0.25: hay una moda.
> 0.25: La moda se diferencia de los demás.

Por lo tanto, como el índice (0.155) está entre 0.15 y 0.25, hay una moda, aunque apenas supera el umbral de 0.15.

#Segunda parte de la práctica

# Scrape the table of Open Era ladies' singles champions from Wikipedia.
library(htmltab)
linkPage <- "https://en.wikipedia.org/wiki/List_of_Wimbledon_ladies%27_singles_champions"
# XPath of the table to extract from the page.
linkTabla <- "//div/table[4]"
opengirl <- htmltab(doc = linkPage, which = linkTabla)

names(opengirl)

## [1] "Year"               "Country"            "Champion"          
## [4] "Country"            "Runner-up"          "Score in the final"

str(opengirl)

## 'data.frame':    52 obs. of  6 variables:
##  $ Year              : chr  "1968" "1969" "1970" "1971" ...
##  $ Country           : chr  " USA" " GBR" " AUS" " AUS" ...
##  $ Champion          : chr  "Billie Jean King" "Ann Jones" "Margaret Court" "Evonne Goolagong" ...
##  $ Country           : chr  " AUS" " USA" " USA" " AUS" ...
##  $ Runner-up         : chr  "Judy Tegart" "Billie Jean King" "Billie Jean King" "Margaret Court" ...
##  $ Score in the final: chr  "9–7, 7–5" "3–6, 6–3, 6–2" "14–12, 11–9" "6–4, 6–1" ...

# Keep only the second column: the champion's country. The fourth column is
# also named "Country" but sits next to "Runner-up", so select by position.
opengirl <- opengirl[2]

names(opengirl)

## [1] "Country"

str(opengirl)

## 'data.frame':    52 obs. of  1 variable:
##  $ Country: chr  " USA" " GBR" " AUS" " AUS" ...

opengirl

##    Country
## 2      USA
## 3      GBR
## 4      AUS
## 5      AUS
## 6      USA
## 7      USA
## 8      USA
## 9      USA
## 10     USA
## 11     GBR
## 12     USA
## 13     USA
## 14     AUS
## 15     USA
## 16     USA
## 17     USA
## 18     USA
## 19     USA
## 20     USA
## 21     USA
## 22     FRG
## 23     FRG
## 24     USA
## 25     GER
## 26     GER
## 27     GER
## 28     ESP
## 29     GER
## 30     GER
## 31     SUI
## 32     CZE
## 33     USA
## 34     USA
## 35     USA
## 36     USA
## 37     USA
## 38     RUS
## 39     USA
## 40     FRA
## 41     USA
## 42     USA
## 43     USA
## 44     USA
## 45     CZE
## 46     USA
## 47     FRA
## 48     CZE
## 49     USA
## 50     USA
## 51     ESP
## 52     GER
## 53     ROU

head(opengirl)

##   Country
## 2     USA
## 3     GBR
## 4     AUS
## 5     AUS
## 6     USA
## 7     USA

Análisis

# Frequency table of the ladies' champions' countries: counts, percentages
# and cumulative percentages.
library(questionr)
library(magrittr)
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
NomOEG <- freq(opengirl$Country, cum = TRUE) %>% data.frame()
# Move the country codes from the row names into a regular column "variable".
NomOEG <- data.frame(variable = row.names(NomOEG), NomOEG, row.names = NULL)

NomOEG

##    variable  n   X. val. X.cum val.cum
## 1       SUI  1  1.9  1.9   1.9     1.9
## 2       AUS  3  5.8  5.8   7.7     7.7
## 3       CZE  3  5.8  5.8  13.5    13.5
## 4       ESP  2  3.8  3.8  17.3    17.3
## 5       FRA  2  3.8  3.8  21.2    21.2
## 6       FRG  2  3.8  3.8  25.0    25.0
## 7       GBR  2  3.8  3.8  28.8    28.8
## 8       GER  6 11.5 11.5  40.4    40.4
## 9       ROU  1  1.9  1.9  42.3    42.3
## 10      RUS  1  1.9  1.9  44.2    44.2
## 11      USA 29 55.8 55.8 100.0   100.0

GRÁFICOS

library(ggplot2)
# Bar chart of the count (n) for each country in the frequency table.
base <- ggplot(NomOEG, aes(x = variable, y = n))
# stat = "identity" plots the precomputed counts instead of re-counting rows.
bar1 <- base + geom_bar(stat = "identity")
bar1

# Pin the x-axis order to the row order of NomOEG, then redraw.
bar1 <- bar1 + scale_x_discrete(limits = NomOEG$variable)
bar1

# Title, axis labels and caption for the bar chart.
text1 <- "Open era girl"
text2 <- "Países"
text3 <- "Conteo"
text4 <- "Fuente: Wikipedia"

# Attach the labels to the existing chart and display the result.
bar2 <- bar1 +
  labs(title = text1, x = text2, y = text3, caption = text4)
bar2

library(qcc)

# Pareto chart of the champion counts per country. The call is kept verbatim
# because the printed analysis header echoes the argument expression.
# cumperc presumably sets the cumulative-percentage reference marks
# (0, 50, 80, 100) - confirm against the qcc documentation.
pareto.chart(table(opengirl$Country),cumperc = c(0,50,80,100))

##        
## Pareto chart analysis for table(opengirl$Country)
##          Frequency  Cum.Freq. Percentage Cum.Percent.
##    USA   29.000000  29.000000  55.769231    55.769231
##    GER    6.000000  35.000000  11.538462    67.307692
##    AUS    3.000000  38.000000   5.769231    73.076923
##    CZE    3.000000  41.000000   5.769231    78.846154
##    ESP    2.000000  43.000000   3.846154    82.692308
##    FRA    2.000000  45.000000   3.846154    86.538462
##    FRG    2.000000  47.000000   3.846154    90.384615
##    GBR    2.000000  49.000000   3.846154    94.230769
##     SUI   1.000000  50.000000   1.923077    96.153846
##    ROU    1.000000  51.000000   1.923077    98.076923
##    RUS    1.000000  52.000000   1.923077   100.000000

ESTADÍSTICOS

library(DescTools)

MODA

# Most frequent country among the champions.
# NOTE(review): the result prints with leading whitespace (" USA"), so the
# scraped values are not trimmed; consider trimws() right after scraping.
Mode(opengirl$Country)

## [1] " USA"

Nuevamente, USA es el país que más se repite.

Dispersión

# Variation ratio: share of observations that fall outside the modal country.
dataTable <- table(opengirl$Country)
1 - max(prop.table(dataTable))

## [1] 0.4423077

La moda (USA) no representa al 44% de los casos; es decir, el 44% de los títulos corresponde a países distintos de la moda.

Concentración

# Herfindahl-Hirschman concentration index computed on the country counts.
dataTable <- table(opengirl$Country)
Herfindahl(dataTable)

## [1] 0.3380178

La moda se diferencia de los demás