I. Guarda el link de wiki en un objeto de R, aqui el objeto se llama LINKtennisMen: “https://en.wikipedia.org/wiki/List_of_Wimbledon_gentlemen%27s_singles_champions”
Visita ese link y ve a la tabla de los ganadores de la “Open Era”. Analiza la columna Country del ganador (la segunda):
Resuelvo 1:
library(htmltab)

# Wikipedia page listing the Wimbledon gentlemen's singles champions.
linkPage <- "https://en.wikipedia.org/wiki/List_of_Wimbledon_gentlemen%27s_singles_champions"
# XPath of the "Open Era" table. The previous value began with "///", which is
# not a valid XPath location path; "//div/table[4]" is the form this file uses
# for the ladies' champions table.
# NOTE(review): confirm table[4] is still the Open Era table on the live page.
linkTabla <- "//div/table[4]"
# Scrape the selected table into a data frame and list its column names.
oper <- htmltab(doc = linkPage, which = linkTabla)
names(oper)
## [1] "Year" "Country" "Champion"
## [4] "Country" "Runner-up" "Score in the final"
str(oper)
## 'data.frame': 52 obs. of 6 variables:
## $ Year : chr "1968" "1969" "1970" "1971" ...
## $ Country : chr " AUS" " AUS" " AUS" " AUS" ...
## $ Champion : chr "Rod Laver" "Rod Laver" "John Newcombe" "John Newcombe" ...
## $ Country : chr " AUS" " AUS" " AUS" " USA" ...
## $ Runner-up : chr "Tony Roche" "John Newcombe" "Ken Rosewall" "Stan Smith" ...
## $ Score in the final: chr "6–3, 6–4, 6–2" "6–4, 5–7, 6–4, 6–4" "5–7, 6–3, 6–2, 3–6, 6–1" "6–3, 5–7, 2–6, 6–4, 6–4" ...
# Keep only the second column (the champion's country) as a one-column
# data frame, then confirm the remaining column name.
oper <- oper[2]
names(oper)
## [1] "Country"
str(oper)
## 'data.frame': 52 obs. of 1 variable:
## $ Country: chr " AUS" " AUS" " AUS" " AUS" ...
oper
## Country
## 2 AUS
## 3 AUS
## 4 AUS
## 5 AUS
## 6 USA
## 7 TCH
## 8 USA
## 9 USA
## 10 SWE
## 11 SWE
## 12 SWE
## 13 SWE
## 14 SWE
## 15 USA
## 16 USA
## 17 USA
## 18 USA
## 19 FRG
## 20 FRG
## 21 AUS
## 22 SWE
## 23 FRG
## 24 SWE
## 25 GER
## 26 USA
## 27 USA
## 28 USA
## 29 USA
## 30 NED
## 31 USA
## 32 USA
## 33 USA
## 34 USA
## 35 CRO
## 36 AUS
## 37 SUI
## 38 SUI
## 39 SUI
## 40 SUI
## 41 SUI
## 42 ESP
## 43 SUI
## 44 ESP
## 45 SRB
## 46 SUI
## 47 GBR
## 48 SRB
## 49 SRB
## 50 GBR
## 51 SUI
## 52 SRB
## 53 SRB
head(oper)
## Country
## 2 AUS
## 3 AUS
## 4 AUS
## 5 AUS
## 6 USA
## 7 TCH
library(questionr)
library(magrittr)

# Frequency table of the champions' countries, including cumulative columns.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
NomOper <- freq(oper$Country, cum = TRUE) %>% data.frame()
# Move the category labels from the row names into a regular `variable` column.
NomOper <- data.frame(variable = row.names(NomOper), NomOper, row.names = NULL)
Compruebo….
NomOper
## variable n X. val. X.cum val.cum
## 1 SUI 8 15.4 15.4 15.4 15.4
## 2 AUS 6 11.5 11.5 26.9 26.9
## 3 CRO 1 1.9 1.9 28.8 28.8
## 4 ESP 2 3.8 3.8 32.7 32.7
## 5 FRG 3 5.8 5.8 38.5 38.5
## 6 GBR 2 3.8 3.8 42.3 42.3
## 7 GER 1 1.9 1.9 44.2 44.2
## 8 NED 1 1.9 1.9 46.2 46.2
## 9 SRB 5 9.6 9.6 55.8 55.8
## 10 SWE 7 13.5 13.5 69.2 69.2
## 11 TCH 1 1.9 1.9 71.2 71.2
## 12 USA 15 28.8 28.8 100.0 100.0
library(ggplot2)

# Bar chart with one bar per row of NomOper; bar height is the count `n`.
base <- ggplot(data = NomOper, aes(x = variable, y = n))
bar1 <- base + geom_bar(stat = "identity")
bar1

# Same chart with a title, axis labels and a source caption.
text1 <- "Country Open era"
text2 <- "Países"
text3 <- "Conteo"
text4 <- "Fuente: Wikipedia"
bar2 <- bar1 +
  labs(title = text1, x = text2, y = text3, caption = text4)
bar2
library (qcc)
## Package 'qcc' version 2.7
## Type 'citation("qcc")' for citing this R package in publications.
# Pareto chart of the champion-country counts. The table() call is kept inline
# because the printed header echoes the argument expression exactly as written.
pareto.chart(table(oper$Country),cumperc = c(0,50,80,100))
##
## Pareto chart analysis for table(oper$Country)
## Frequency Cum.Freq. Percentage Cum.Percent.
## USA 15.000000 15.000000 28.846154 28.846154
## SUI 8.000000 23.000000 15.384615 44.230769
## SWE 7.000000 30.000000 13.461538 57.692308
## AUS 6.000000 36.000000 11.538462 69.230769
## SRB 5.000000 41.000000 9.615385 78.846154
## FRG 3.000000 44.000000 5.769231 84.615385
## ESP 2.000000 46.000000 3.846154 88.461538
## GBR 2.000000 48.000000 3.846154 92.307692
## CRO 1.000000 49.000000 1.923077 94.230769
## GER 1.000000 50.000000 1.923077 96.153846
## NED 1.000000 51.000000 1.923077 98.076923
## TCH 1.000000 52.000000 1.923077 100.000000
Conclusión: 3. calculando los estadísticos
library(DescTools)
MODA
Mode(oper$Country)
## [1] " USA"
USA es el país que se repite más entre los campeones
Dispersión
# Share of observations that fall outside the most frequent country.
dataTable <- table(oper$Country)
1 - max(dataTable) / sum(dataTable)
## [1] 0.7115385
La moda no representa al 71% de los casos (campeonatos): solo el 29% de los títulos corresponde a USA.
Concentracion: Herfindahl- Hirschman
# Herfindahl-Hirschman concentration index of the country counts.
dataTable <- table(oper$Country)
Herfindahl(dataTable)
## [1] 0.1553254
Regla: si el índice es mayor a 0.25, la moda se diferencia de los demás.
Aquí el índice es 0.155, menor a 0.25; por lo tanto, nuestra moda no es significativa.
#Segunda parte de la práctica
library(htmltab)

# Wikipedia page listing the Wimbledon ladies' singles champions.
linkPage <- "https://en.wikipedia.org/wiki/List_of_Wimbledon_ladies%27_singles_champions"
# XPath of the table to scrape from that page.
linkTabla <- "//div/table[4]"
# Scrape the selected table into a data frame and list its column names.
opengirl <- htmltab(doc = linkPage, which = linkTabla)
names(opengirl)
## [1] "Year" "Country" "Champion"
## [4] "Country" "Runner-up" "Score in the final"
str(opengirl)
## 'data.frame': 52 obs. of 6 variables:
## $ Year : chr "1968" "1969" "1970" "1971" ...
## $ Country : chr " USA" " GBR" " AUS" " AUS" ...
## $ Champion : chr "Billie Jean King" "Ann Jones" "Margaret Court" "Evonne Goolagong" ...
## $ Country : chr " AUS" " USA" " USA" " AUS" ...
## $ Runner-up : chr "Judy Tegart" "Billie Jean King" "Billie Jean King" "Margaret Court" ...
## $ Score in the final: chr "9–7, 7–5" "3–6, 6–3, 6–2" "14–12, 11–9" "6–4, 6–1" ...
# Keep only the second column (the champion's country) as a one-column
# data frame, then confirm the remaining column name.
opengirl <- opengirl[2]
names(opengirl)
## [1] "Country"
str(opengirl)
## 'data.frame': 52 obs. of 1 variable:
## $ Country: chr " USA" " GBR" " AUS" " AUS" ...
opengirl
## Country
## 2 USA
## 3 GBR
## 4 AUS
## 5 AUS
## 6 USA
## 7 USA
## 8 USA
## 9 USA
## 10 USA
## 11 GBR
## 12 USA
## 13 USA
## 14 AUS
## 15 USA
## 16 USA
## 17 USA
## 18 USA
## 19 USA
## 20 USA
## 21 USA
## 22 FRG
## 23 FRG
## 24 USA
## 25 GER
## 26 GER
## 27 GER
## 28 ESP
## 29 GER
## 30 GER
## 31 SUI
## 32 CZE
## 33 USA
## 34 USA
## 35 USA
## 36 USA
## 37 USA
## 38 RUS
## 39 USA
## 40 FRA
## 41 USA
## 42 USA
## 43 USA
## 44 USA
## 45 CZE
## 46 USA
## 47 FRA
## 48 CZE
## 49 USA
## 50 USA
## 51 ESP
## 52 GER
## 53 ROU
head(opengirl)
## Country
## 2 USA
## 3 GBR
## 4 AUS
## 5 AUS
## 6 USA
## 7 USA
Análisis
library(questionr)
library(magrittr)

# Frequency table of the champions' countries, including cumulative columns.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
NomOEG <- freq(opengirl$Country, cum = TRUE) %>% data.frame()
# Move the category labels from the row names into a regular `variable` column.
NomOEG <- data.frame(variable = row.names(NomOEG), NomOEG, row.names = NULL)
NomOEG
## variable n X. val. X.cum val.cum
## 1 SUI 1 1.9 1.9 1.9 1.9
## 2 AUS 3 5.8 5.8 7.7 7.7
## 3 CZE 3 5.8 5.8 13.5 13.5
## 4 ESP 2 3.8 3.8 17.3 17.3
## 5 FRA 2 3.8 3.8 21.2 21.2
## 6 FRG 2 3.8 3.8 25.0 25.0
## 7 GBR 2 3.8 3.8 28.8 28.8
## 8 GER 6 11.5 11.5 40.4 40.4
## 9 ROU 1 1.9 1.9 42.3 42.3
## 10 RUS 1 1.9 1.9 44.2 44.2
## 11 USA 29 55.8 55.8 100.0 100.0
GRÁFICOS
library(ggplot2)

# Bar chart with one bar per row of NomOEG; bar height is the count `n`.
base <- ggplot(data = NomOEG, aes(x = variable, y = n))
bar1 <- base + geom_bar(stat = "identity")
bar1

# Pin the bar order on the x axis to the row order of NomOEG.
bar1 <- bar1 + scale_x_discrete(limits = NomOEG$variable)
bar1

# Same chart with a title, axis labels and a source caption.
text1 <- "Open era girl"
text2 <- "Países"
text3 <- "Conteo"
text4 <- "Fuente: Wikipedia"
bar2 <- bar1 +
  labs(title = text1, x = text2, y = text3, caption = text4)
bar2
library(qcc)
# Pareto chart of the champion-country counts. The table() call is kept inline
# because the printed header echoes the argument expression exactly as written.
pareto.chart(table(opengirl$Country),cumperc = c(0,50,80,100))
##
## Pareto chart analysis for table(opengirl$Country)
## Frequency Cum.Freq. Percentage Cum.Percent.
## USA 29.000000 29.000000 55.769231 55.769231
## GER 6.000000 35.000000 11.538462 67.307692
## AUS 3.000000 38.000000 5.769231 73.076923
## CZE 3.000000 41.000000 5.769231 78.846154
## ESP 2.000000 43.000000 3.846154 82.692308
## FRA 2.000000 45.000000 3.846154 86.538462
## FRG 2.000000 47.000000 3.846154 90.384615
## GBR 2.000000 49.000000 3.846154 94.230769
## SUI 1.000000 50.000000 1.923077 96.153846
## ROU 1.000000 51.000000 1.923077 98.076923
## RUS 1.000000 52.000000 1.923077 100.000000
ESTADÍSTICOS
library(DescTools)
MODA
Mode(opengirl$Country)
## [1] " USA"
Nuevamente USA es el país que más se repite
Dispersión
# Share of observations that fall outside the most frequent country.
dataTable <- table(opengirl$Country)
1 - max(dataTable) / sum(dataTable)
## [1] 0.4423077
La moda no representa al 44% de los casos; USA concentra el 56% de los títulos.
Concentración
# Herfindahl-Hirschman concentration index of the country counts.
dataTable <- table(opengirl$Country)
Herfindahl(dataTable)
## [1] 0.3380178
Como el índice (0.338) es mayor a 0.25, la moda se diferencia de los demás.