Práctica calificada II

PARTE 1

library(htmltab)
# Scrape the Wikipedia list of Wimbledon gentlemen's singles champions.
# `which` is an XPath expression selecting the 4th <table> found under a <div>
# (presumably the Open Era table -- confirm against the live page, since
# Wikipedia layouts change). The path uses the standard `//` abbreviation;
# a triple slash is not well-formed XPath.
linkPage <- "https://en.wikipedia.org/wiki/List_of_Wimbledon_gentlemen%27s_singles_champions"
linkTabla <- "//div/table[4]"
openera <- htmltab(doc = linkPage, which = linkTabla)

# Column names of the scraped table (note that "Country" appears twice).
names(openera)

## [1] "Year"               "Country"            "Champion"          
## [4] "Country"            "Runner-up"          "Score in the final"

# Compact structure: number of rows/columns and the type of each column.
str(openera)

## 'data.frame':    52 obs. of  6 variables:
##  $ Year              : chr  "1968" "1969" "1970" "1971" ...
##  $ Country           : chr  " AUS" " AUS" " AUS" " AUS" ...
##  $ Champion          : chr  "Rod Laver" "Rod Laver" "John Newcombe" "John Newcombe" ...
##  $ Country           : chr  " AUS" " AUS" " AUS" " USA" ...
##  $ Runner-up         : chr  "Tony Roche" "John Newcombe" "Ken Rosewall" "Stan Smith" ...
##  $ Score in the final: chr  "6–3, 6–4, 6–2" "6–4, 5–7, 6–4, 6–4" "5–7, 6–3, 6–2, 3–6, 6–1" "6–3, 5–7, 2–6, 6–4, 6–4" ...

# Keep only column 2. Two columns are named "Country", so the column is picked
# by position; judging by the column order, position 2 is the one paired with
# "Champion" (confirm against the page).
openera <- openera[2]

names(openera)

## [1] "Country"

str(openera)

## 'data.frame':    52 obs. of  1 variable:
##  $ Country: chr  " AUS" " AUS" " AUS" " AUS" ...

# Display the complete one-column data frame.
openera

# Preview only the leading rows (six is head()'s default, made explicit here).
head(openera, n = 6L)

Esta variable es nominal, así que prosigo con el análisis respectivo:

library(questionr)
library(magrittr)
# Frequency table of the champion countries from questionr::freq(), with the
# cumulative columns requested, converted to a plain data frame.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
NomOE <- freq(openera$Country, cum = TRUE) %>% data.frame()
# Move the country labels out of the row names into a regular column
# (`variable`) so they can be mapped to a plot aesthetic.
NomOE <- data.frame(variable = row.names(NomOE), NomOE, row.names = NULL)

NomOE

Ya creé mi tabla de frecuencias, ahora muestro mis gráficos

library(ggplot2)
# Bar chart of the count per country, built from the frequency table.
base <- ggplot(data = NomOE, mapping = aes(x = variable, y = n))
bar1 <- base + geom_bar(stat = "identity")  # bar heights come from column `n`

bar1

# Title, axis labels and source caption for the final plot.
text1 <- "Países - Open Era hombres"
text2 <- "países"
text3 <- "Conteo"
text4 <- "Fuente: Wikipedia"

bar2 <- bar1 +
  labs(title = text1, x = text2, y = text3, caption = text4)
bar2

library(qcc)

## Package 'qcc' version 2.7

## Type 'citation("qcc")' for citing this R package in publications.

# Pareto chart of the champion countries; `cumperc` lists the percentages
# (0, 50, 80, 100) used as tick marks on the cumulative-percentage axis.
# The call also prints the summary table shown below, whose header echoes the
# argument expression, so the call is left exactly as written.
pareto.chart(table(openera$Country),cumperc = c(0,50,80,100))

##        
## Pareto chart analysis for table(openera$Country)
##          Frequency  Cum.Freq. Percentage Cum.Percent.
##    USA   15.000000  15.000000  28.846154    28.846154
##     SUI   8.000000  23.000000  15.384615    44.230769
##    SWE    7.000000  30.000000  13.461538    57.692308
##    AUS    6.000000  36.000000  11.538462    69.230769
##    SRB    5.000000  41.000000   9.615385    78.846154
##    FRG    3.000000  44.000000   5.769231    84.615385
##    ESP    2.000000  46.000000   3.846154    88.461538
##    GBR    2.000000  48.000000   3.846154    92.307692
##    CRO    1.000000  49.000000   1.923077    94.230769
##    GER    1.000000  50.000000   1.923077    96.153846
##    NED    1.000000  51.000000   1.923077    98.076923
##    TCH    1.000000  52.000000   1.923077   100.000000

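*El 80% acumulado de los títulos se alcanza entre SRB (78.8%) y FRG (84.6%); es decir, seis países (USA, SUI, SWE, AUS, SRB y FRG) concentran alrededor del 80% de los campeonatos.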

ESTADÍSTICOS

library(DescTools)

Moda

# Mode: the most frequent value in the country column (DescTools::Mode).
Mode(openera$Country)

## [1] " USA"

*El país que más se repite es Estados Unidos

Dispersión | Variación modal

# Variation ratio: share of observations that fall outside the modal category.
dataTable <- table(openera$Country)
1 - max(dataTable) / sum(dataTable)

## [1] 0.7115385

*La moda no representa al 71% de los casos: el 71% de los títulos corresponde a países distintos de Estados Unidos.

Concentración

# Concentration of the country counts, measured with DescTools::Herfindahl().
dataTable <- table(openera$Country)
Herfindahl(x = dataTable)

## [1] 0.1553254

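*La moda no es significativa: con un índice de Herfindahl de 0.155, los títulos están relativamente repartidos entre varios países.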

PARTE 2

library(htmltab)
# Scrape the Wikipedia list of Wimbledon ladies' singles champions; `which`
# is an XPath expression selecting the 4th <table> found under a <div>.
linkPage <- "https://en.wikipedia.org/wiki/List_of_Wimbledon_ladies%27_singles_champions"
linkTabla <- "//div/table[4]"
openerax <- htmltab(doc = linkPage, which = linkTabla)

# Column names of the scraped table (note that "Country" appears twice).
names(openerax)

## [1] "Year"               "Country"            "Champion"          
## [4] "Country"            "Runner-up"          "Score in the final"

# Compact structure: number of rows/columns and the type of each column.
str(openerax)

## 'data.frame':    52 obs. of  6 variables:
##  $ Year              : chr  "1968" "1969" "1970" "1971" ...
##  $ Country           : chr  " USA" " GBR" " AUS" " AUS" ...
##  $ Champion          : chr  "Billie Jean King" "Ann Jones" "Margaret Court" "Evonne Goolagong" ...
##  $ Country           : chr  " AUS" " USA" " USA" " AUS" ...
##  $ Runner-up         : chr  "Judy Tegart" "Billie Jean King" "Billie Jean King" "Margaret Court" ...
##  $ Score in the final: chr  "9–7, 7–5" "3–6, 6–3, 6–2" "14–12, 11–9" "6–4, 6–1" ...

# Column names again, right before subsetting; `openerax` has not been
# modified since the previous names() call, so the output is the same.
names(openerax)

## [1] "Year"               "Country"            "Champion"          
## [4] "Country"            "Runner-up"          "Score in the final"

# Keep only column 2. Two columns are named "Country", so the column is picked
# by position; judging by the column order, position 2 is the one paired with
# "Champion" (confirm against the page).
openerax <- openerax[2]

names(openerax)

## [1] "Country"

str(openerax)

## 'data.frame':    52 obs. of  1 variable:
##  $ Country: chr  " USA" " GBR" " AUS" " AUS" ...

# Display the complete one-column data frame.
openerax

# Preview only the leading rows (six is head()'s default, made explicit here).
head(openerax, n = 6L)

Esta variable es nominal, así que prosigo con el análisis respectivo:

library(questionr)
library(magrittr)
# Frequency table of the champion countries from questionr::freq(), with the
# cumulative columns requested, converted to a plain data frame.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
NomOEX <- freq(openerax$Country, cum = TRUE) %>% data.frame()
# Move the country labels out of the row names into a regular column
# (`variable`) so they can be mapped to a plot aesthetic.
NomOEX <- data.frame(variable = row.names(NomOEX), NomOEX, row.names = NULL)

NomOEX

Ya creé mi tabla de frecuencias, ahora muestro mis gráficos

library(ggplot2)
# Bar chart of the count per country, built from the frequency table.
base <- ggplot(data = NomOEX, mapping = aes(x = variable, y = n))
bar1 <- base + geom_bar(stat = "identity")  # bar heights come from column `n`

bar1

# Pin the x-axis categories to the row order of the frequency table.
bar1 <- bar1 + scale_x_discrete(limits = NomOEX$variable)
bar1

# Title, axis labels and source caption for the final plot.
text1 <- "Países - Open Era mujeres"
text2 <- "países"
text3 <- "Conteo"
text4 <- "Fuente: Wikipedia"

bar2 <- bar1 +
  labs(title = text1, x = text2, y = text3, caption = text4)
bar2

library(qcc)

# Pareto chart of the champion countries; `cumperc` lists the percentages
# (0, 50, 80, 100) used as tick marks on the cumulative-percentage axis.
# The call also prints the summary table shown below, whose header echoes the
# argument expression, so the call is left exactly as written.
pareto.chart(table(openerax$Country),cumperc = c(0,50,80,100))

##        
## Pareto chart analysis for table(openerax$Country)
##          Frequency  Cum.Freq. Percentage Cum.Percent.
##    USA   29.000000  29.000000  55.769231    55.769231
##    GER    6.000000  35.000000  11.538462    67.307692
##    AUS    3.000000  38.000000   5.769231    73.076923
##    CZE    3.000000  41.000000   5.769231    78.846154
##    ESP    2.000000  43.000000   3.846154    82.692308
##    FRA    2.000000  45.000000   3.846154    86.538462
##    FRG    2.000000  47.000000   3.846154    90.384615
##    GBR    2.000000  49.000000   3.846154    94.230769
##     SUI   1.000000  50.000000   1.923077    96.153846
##    ROU    1.000000  51.000000   1.923077    98.076923
##    RUS    1.000000  52.000000   1.923077   100.000000

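*Se nota que el 80% acumulado de los títulos se alcanza entre la República Checa (78.8%) y España (82.7%); es decir, cinco países (USA, GER, AUS, CZE y ESP) concentran alrededor del 80% de los campeonatos.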

ESTADÍSTICOS

library(DescTools)

Moda

# Mode: the most frequent value in the country column (DescTools::Mode).
Mode(openerax$Country)

## [1] " USA"

*Estados Unidos es el país que más se repite

Dispersión | Variación modal

# Variation ratio: share of observations that fall outside the modal category.
dataTable <- table(openerax$Country)
1 - max(dataTable) / sum(dataTable)

## [1] 0.4423077

*La moda no representa al 44% de los casos: el 44% de los títulos corresponde a países distintos de Estados Unidos.

Concentración

# Concentration of the country counts, measured with DescTools::Herfindahl().
dataTable <- table(openerax$Country)
Herfindahl(x = dataTable)

## [1] 0.3380178

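*La moda se diferencia de las demás categorías: con un índice de Herfindahl de 0.338, los títulos están bastante concentrados en Estados Unidos.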