Solución
# Load the Excel reader and import the country-level workbook.
library(openxlsx)
datafile <- "mundo.xlsx"
datat <- read.xlsx(datafile)
# Preview the first six rows.
head(datat, n = 6L)
## codigopais pais accFT ruralElec mujFueraEscuela
## 1 AFG Afghanistan 17.71918 70.129379 NA
## 2 ALB Albania 65.75154 100.000000 5.64351
## 3 DZA Algeria 99.99000 99.248154 NA
## 4 AND Andorra 100.00000 100.000000 NA
## 5 AGO Angola 45.58080 4.386689 NA
## 6 ATG Antigua and Barbuda 99.99000 99.999336 NA
## corruCont mujerFuerzaL polStabNoTerror
## 1 -1.6357235 17.63448 -2.5791519
## 2 -0.5250779 39.86474 -0.1914829
## 3 -0.5248498 19.20092 -1.2593676
## 4 1.3080720 NA 1.2782722
## 5 -1.3257300 45.23737 -0.2261823
## 6 1.3080720 NA 0.9049994
# Show the structure of datat: number of rows/columns and each column's type.
str(datat)
## 'data.frame': 201 obs. of 8 variables:
## $ codigopais : chr "AFG" "ALB" "DZA" "AND" ...
## $ pais : chr "Afghanistan" "Albania" "Algeria" "Andorra" ...
## $ accFT : num 17.7 65.8 100 100 45.6 ...
## $ ruralElec : num 70.13 100 99.25 100 4.39 ...
## $ mujFueraEscuela: num NA 5.64 NA NA NA ...
## $ corruCont : num -1.636 -0.525 -0.525 1.308 -1.326 ...
## $ mujerFuerzaL : num 17.6 39.9 19.2 NA 45.2 ...
## $ polStabNoTerror: num -2.579 -0.191 -1.259 1.278 -0.226 ...
Previos: Si tuviera que prescindir de alguna de estas variables, ¿cuál sería la candidata más evidente, pues complicaría el análisis estadístico?
# Per-column summary of datat; for numeric columns the printout also
# reports how many values are missing (NA's).
summary(datat)
## codigopais pais accFT ruralElec
## Length:201 Length:201 Min. : 2.00 Min. : 0.70
## Class :character Class :character 1st Qu.: 23.57 1st Qu.: 44.54
## Mode :character Mode :character Median : 80.55 Median : 97.57
## Mean : 63.61 Mean : 73.62
## 3rd Qu.: 99.99 3rd Qu.:100.00
## Max. :100.00 Max. :100.00
## NA's :13 NA's :4
## mujFueraEscuela corruCont mujerFuerzaL polStabNoTerror
## Min. : 0.2612 Min. :-1.7392 Min. :12.48 Min. :-3.13097
## 1st Qu.: 3.0057 1st Qu.:-0.7600 1st Qu.:38.28 1st Qu.:-0.72194
## Median : 8.2741 Median :-0.2947 Median :44.16 Median : 0.02709
## Mean :12.9939 Mean :-0.0357 Mean :40.99 Mean :-0.05160
## 3rd Qu.:15.5726 3rd Qu.: 0.6338 3rd Qu.:47.42 3rd Qu.: 0.79192
## Max. :65.3875 Max. : 2.3590 Max. :54.54 Max. : 1.64520
## NA's :125 NA's :2 NA's :21 NA's :1
¿En cuántos grupos de países se podría organizar esta data usando la data original entregada?
# Standardize every column except the first two (country code and name).
# Missing values are not imputed at this step.
test <- scale(datat[, -(1:2)])
library(NbClust)
# Let NbClust vote on the number of clusters, using method "complete".
nb <- NbClust(data = test, method = "complete")
## Warning in pf(beale, pp, df2): NaNs produced
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 1 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 6 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 3 proposed 9 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
¿En cuántos grupos de países se podría organizar esta data si cada valor perdido fuera reemplazado por la media de esa columna?
# Replace every NA with the mean of its own column (the first two columns,
# country code and name, are excluded), then standardize the result.
datasc <- as.data.frame(
  lapply(datat[, -c(1, 2)], function(column) {
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    column[is.na(column)] <- mean(column, na.rm = TRUE)
    column
  })
)
test2 <- scale(datasc)
library(NbClust)
# Let NbClust vote on the number of clusters for the imputed data.
nb <- NbClust(test2, method = "complete")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 5 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 3 proposed 5 as the best number of clusters
## * 2 proposed 8 as the best number of clusters
## * 2 proposed 12 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
# Reload the data and draw a two-dimensional map of the countries using
# classical multidimensional scaling on the standardized variables.
library(openxlsx)
datafile <- "mundo.xlsx"
MUNDO <- read.xlsx(datafile)
row.names(MUNDO) <- MUNDO$pais # label rows by country name
MUNDO <- MUNDO[, -(1:2)] # drop the code and name columns

MUNDO_s <- scale(MUNDO)
MUNDO_d <- dist(MUNDO_s)
MUNDO_r <- cmdscale(MUNDO_d, k = 2, eig = TRUE)

# Coordinates of each country on the two retained dimensions.
x <- MUNDO_r$points[, 1]
y <- MUNDO_r$points[, 2]

columnForLabels <- row.names(MUNDO)
# "si" when polStabNoTerror is above zero, "no" otherwise; the factor
# levels sort as "no", "si", so they index paleta as red, green.
colorForLabels <- factor(ifelse(MUNDO$polStabNoTerror > 0, "si", "no"))
paleta <- c("red", "green")

# Empty canvas first, then the country names as the plotted points.
plot(x, y, type = "n", main = "Mi Grafico")
text(x, y, labels = columnForLabels, col = paleta[colorForLabels], cex = 0.5)
legend(
  "bottomright",
  legend = levels(colorForLabels),
  fill = paleta,
  title = "terror positivo?",
  cex = 0.5
)
REALICEMOS ANÁLISIS DE COMPONENTES PRINCIPALES
# Reload the data and fill in missing values before the PCA steps.
library(openxlsx)
datafile <- "mundo.xlsx"
MUNDO <- read.xlsx(datafile)
subTable <- MUNDO[, -c(1, 2)] # keep only the columns after the first two
for (i in seq_len(ncol(subTable))) { # for each column:
  # Mean (not median) of the column, ignoring missing values.
  MEDIA <- mean(subTable[, i], na.rm = TRUE)
  # Put that mean, rounded to zero decimals, wherever the column has an NA.
  subTable[is.na(subTable[, i]), i] <- round(MEDIA, 0)
}
¿DATA SUFICIENTE?
library(psych)
# Correlation matrix of the imputed variables (Pearson, the cor() default).
matrizCor <- cor(subTable, method = "pearson")
# Kaiser-Meyer-Olkin measure of sampling adequacy, overall and per item.
KMO(r = matrizCor)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = matrizCor)
## Overall MSA = 0.7
## MSA for each item =
## accFT ruralElec mujFueraEscuela corruCont
## 0.68 0.67 0.91 0.73
## mujerFuerzaL polStabNoTerror
## 0.59 0.69
¿TENEMOS MATRIZ IDENTIDAD?
# Bartlett's test on the correlation matrix; the sample size is taken from
# the number of rows of subTable.
cortest.bartlett(matrizCor, n=nrow(subTable))
## $chisq
## [1] 534.425
##
## $p.value
## [1] 2.911896e-104
##
## $df
## [1] 15
# Extract two principal components from the correlation matrix and apply a
# varimax rotation.
# TRUE replaces T, which is an ordinary variable that can be reassigned.
# NOTE(review): only a correlation matrix is passed in, so component scores
# presumably cannot be computed despite scores = TRUE - confirm.
resultadoPr <- principal(
  matrizCor,
  nfactors = 2,
  rotate = "varimax",
  scores = TRUE
)
# Print with 3 decimals; cut = 0.4 blanks out the smaller loadings.
print(resultadoPr, digits = 3, cut = 0.4)
## Principal Components Analysis
## Call: principal(r = matrizCor, nfactors = 2, rotate = "varimax", scores = T)
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 h2 u2 com
## accFT 0.918 0.843 0.157 1.00
## ruralElec 0.911 0.841 0.159 1.03
## mujFueraEscuela -0.507 0.270 0.730 1.11
## corruCont 0.664 0.567 0.762 0.238 1.95
## mujerFuerzaL 0.831 0.763 0.237 1.21
## polStabNoTerror 0.524 0.703 0.768 0.232 1.85
##
## RC1 RC2
## SS loadings 2.717 1.531
## Proportion Var 0.453 0.255
## Cumulative Var 0.453 0.708
## Proportion Explained 0.640 0.360
## Cumulative Proportion 0.640 1.000
##
## Mean item complexity = 1.4
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.098
##
## Fit based upon off diagonal values = 0.944
# Reload the data and recode two numeric indicators as three-level ordered
# factors.
library(openxlsx)
datafile <- "mundo.xlsx"
datat <- read.xlsx(datafile)

# breaks = 3 asks cut() for three intervals; the labels are assigned from
# the lowest interval to the highest. TRUE replaces the reassignable T.
etiquetas1 <- c("bajo", "medio", "alto")
datat$mujerFuerzaLO <- cut(
  datat$mujerFuerzaL,
  breaks = 3,
  labels = etiquetas1,
  ordered_result = TRUE
)

etiquetas2 <- c("malo", "regular", "bueno")
datat$polStabNoTerrorO <- cut(
  datat$polStabNoTerror,
  breaks = 3,
  labels = etiquetas2,
  ordered_result = TRUE
)

# The summary now includes level counts for the two new factors.
summary(datat)
## codigopais pais accFT ruralElec
## Length:201 Length:201 Min. : 2.00 Min. : 0.70
## Class :character Class :character 1st Qu.: 23.57 1st Qu.: 44.54
## Mode :character Mode :character Median : 80.55 Median : 97.57
## Mean : 63.61 Mean : 73.62
## 3rd Qu.: 99.99 3rd Qu.:100.00
## Max. :100.00 Max. :100.00
## NA's :13 NA's :4
## mujFueraEscuela corruCont mujerFuerzaL polStabNoTerror
## Min. : 0.2612 Min. :-1.7392 Min. :12.48 Min. :-3.13097
## 1st Qu.: 3.0057 1st Qu.:-0.7600 1st Qu.:38.28 1st Qu.:-0.72194
## Median : 8.2741 Median :-0.2947 Median :44.16 Median : 0.02709
## Mean :12.9939 Mean :-0.0357 Mean :40.99 Mean :-0.05160
## 3rd Qu.:15.5726 3rd Qu.: 0.6338 3rd Qu.:47.42 3rd Qu.: 0.79192
## Max. :65.3875 Max. : 2.3590 Max. :54.54 Max. : 1.64520
## NA's :125 NA's :2 NA's :21 NA's :1
## mujerFuerzaLO polStabNoTerrorO
## bajo : 20 malo :18
## medio: 44 regular:84
## alto :116 bueno :98
## NA's : 21 NA's : 1
##
##
##
# Cross-tabulate the two ordered factors (rows: mujerFuerzaLO, columns:
# polStabNoTerrorO).
tablaTE <- table(datat$mujerFuerzaLO, datat$polStabNoTerrorO)
# Chi-squared test with a simulated p-value. The simulation is random, so
# the p-value varies between runs unless a seed is set beforehand.
# TRUE replaces T, which is an ordinary variable that can be reassigned.
chisq.test(tablaTE, simulate.p.value = TRUE)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: tablaTE
## X-squared = 29.765, df = NA, p-value = 0.0004998
library(ca)
# Simple correspondence analysis of the contingency table.
tablaCA_te <- ca(tablaTE)
# Map of the result; plot() dispatches to plot.ca for objects returned by
# ca(). The two colours are for the row and column points respectively.
plot(tablaCA_te, col = c("red", "blue"))