Cargo la data:
# Remote CSV holding the governor campaign-contribution records.
link <- "https://github.com/PoliticayGobiernoPUCP/EstadisticaPoliticaGobiernoII/blob/master/sesiones/data/governor.csv?raw=true"
# Read it, keeping text columns as character vectors rather than factors.
governor <- read.csv(file = link, stringsAsFactors = FALSE)
Saco estructura:
# Show every column's type together with a preview of its first values.
utils::str(governor)
## 'data.frame': 329866 obs. of 5 variables:
## $ party : chr "REPUBLICAN" "REPUBLICAN" "REPUBLICAN" "REPUBLICAN" ...
## $ election_year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ amount : num 25 100 100 25 50 100 50 25 25 100 ...
## $ contributor_occupation: chr "" "" "RETIRED" "" ...
## $ code : chr "Individual" "Individual" "Individual" "Individual" ...
Saco tabla de frecuencias:
# Number of contributions per party.
table(governor[["party"]])
##
## DEMOCRAT INDEPENDENT NONE REPUBLICAN
## 167667 20 4 162175
# Number of contributions per election year.
table(governor[["election_year"]])
##
## 2008 2012 2016
## 155124 130461 44281
# Party-by-year contingency table.
table(governor[["party"]], governor[["election_year"]])
##
## 2008 2012 2016
## DEMOCRAT 70400 62520 34747
## INDEPENDENT 11 8 1
## NONE 4 0 0
## REPUBLICAN 84709 67933 9533
# Keep only the rows whose party is one of the two listed categories.
byCategories <- c("DEMOCRAT", "REPUBLICAN")
governorDR <- governor[is.element(governor$party, byCategories), ]
# Cross-tabulate again to confirm that only those parties remain.
table(governorDR[["party"]], governorDR[["election_year"]])
##
## 2008 2012 2016
## DEMOCRAT 70400 62520 34747
## REPUBLICAN 84709 67933 9533
library(gmodels)
# Counts-only cross-tabulation (every proportion panel switched off) followed
# by a Pearson chi-squared test between party and election year.
CrossTable(
  governorDR$party,
  governorDR$election_year,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.t = FALSE,
  prop.chisq = FALSE,
  chisq = TRUE
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## |-------------------------|
##
##
## Total Observations in Table: 329842
##
##
## | governorDR$election_year
## governorDR$party | 2008 | 2012 | 2016 | Row Total |
## -----------------|-----------|-----------|-----------|-----------|
## DEMOCRAT | 70400 | 62520 | 34747 | 167667 |
## -----------------|-----------|-----------|-----------|-----------|
## REPUBLICAN | 84709 | 67933 | 9533 | 162175 |
## -----------------|-----------|-----------|-----------|-----------|
## Column Total | 155109 | 130453 | 44280 | 329842 |
## -----------------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 15814.97 d.f. = 2 p = 0
##
##
##
# Party-by-year contingency table that feeds the bar chart.
bartable <- table(governorDR$party, governorDR$election_year)
# Legend labels: the table's row labels, i.e. the sorted distinct party names.
legendPlot <- rownames(bartable)
# Grouped bars: one group per election year, one bar per party.
barplot(bartable, beside = TRUE, legend.text = legendPlot)
Hago un subconjunto con las contribuciones hechas por empresa:
# Subset: contributions whose contributor code is "Business".
business <- governorDR[governorDR$code == "Business", ]
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
# Box plot of the contribution amount for each party.
p <- ggplot(data = business, mapping = aes(x = party, y = amount))
p + geom_boxplot()
Intervalos de confianza:
# Standard error of the mean: sd(x) / sqrt(n).
#
# x:     numeric vector of observations.
# na.rm: if TRUE, missing values are dropped first, so n counts only the
#        observed values. The default (FALSE) keeps the previous behaviour:
#        any NA in x yields NA.
# Returns a single numeric value; it is NA when fewer than two values are
# available, because sd() is NA in that case.
seMean <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  sd(x) / sqrt(length(x))
}
# Mean contribution amount per party.
means <- aggregate(
  x = list(mean = business$amount),
  by = list(party = business$party),
  FUN = mean
)
# Standard error of that mean per party, computed with the seMean() helper.
sems <- aggregate(
  x = list(seMean = business$amount),
  by = list(party = business$party),
  FUN = seMean
)
print(means)
## party mean
## 1 DEMOCRAT 755.9014
## 2 REPUBLICAN 748.6163
print(sems)
## party seMean
## 1 DEMOCRAT 11.25733
## 2 REPUBLICAN 12.49764
# Join both summaries on their shared key column.
data <- merge(x = means, y = sems, by = "party")
print(data)
## party mean seMean
## 1 DEMOCRAT 755.9014 11.25733
## 2 REPUBLICAN 748.6163 12.49764
# Interval bounds: the mean minus/plus two standard errors.
data[["lower"]] <- data[["mean"]] - 2 * data[["seMean"]]
data[["upper"]] <- data[["mean"]] + 2 * data[["seMean"]]
# Final table, now including the interval columns.
print(data)
## party mean seMean lower upper
## 1 DEMOCRAT 755.9014 11.25733 733.3867 778.4160
## 2 REPUBLICAN 748.6163 12.49764 723.6210 773.6115
# Base layer: one point per party placed at its mean amount.
meanPlot <- ggplot(data = data, mapping = aes(x = party, y = mean)) +
  geom_point()
# Overlay error bars running from `lower` to `upper`.
errorPlot <- meanPlot +
  geom_errorbar(mapping = aes(ymin = lower, ymax = upper))
# Render the finished plot.
errorPlot
Ahora sí saco t-test:
# Two-sample t-test of the contribution amount by party, treating both group
# variances as equal.
t.test(business$amount ~ business$party, var.equal = TRUE)
##
## Two Sample t-test
##
## data: business$amount by business$party
## t = 0.43123, df = 5253, p-value = 0.6663
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -25.83383 40.40406
## sample estimates:
## mean in group DEMOCRAT mean in group REPUBLICAN
## 755.9014 748.6163
# Mean contribution amount for every combination of contributor code and party.
tapply(
  X = governorDR$amount,
  INDEX = list(code = governorDR$code, party = governorDR$party),
  FUN = mean
)
## party
## code DEMOCRAT REPUBLICAN
## Business 755.9014 748.6163
## Caucus 25000.0000 1875.0000
## Individual 147.6418 157.4498
## Other 666.3455 124.7814
## Party 32767.7546 66312.0519
## Political Action Committee 986.4145 1358.1671
## Union 937.4966 NA
Hago ANOVA:
# One-way ANOVA of the contribution amount across contributor codes.
model <- aov(formula = amount ~ code, data = governorDR)
# Print the ANOVA table.
summary(model)
## Df Sum Sq Mean Sq F value Pr(>F)
## code 6 5.207e+11 8.678e+10 7575 <2e-16 ***
## Residuals 329835 3.779e+12 1.146e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Hay diferencias, pero necesito saber dónde están. Hago pruebas post-hoc; en este caso, Tukey:
# Tukey pairwise comparisons between contributor codes at the 95% family-wise
# confidence level.
TukeyHSD(model, conf.level = 0.95)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = amount ~ code, data = governorDR)
##
## $code
## diff lwr
## Caucus-Business 12685.35833 5627.68706
## Individual-Business -601.41172 -740.60096
## Other-Business -611.28193 -752.99952
## Party-Business 40027.78584 39448.04857
## Political Action Committee-Business 272.25626 -84.38534
## Union-Business 185.35492 -402.39816
## Individual-Caucus -13286.77005 -20343.12866
## Other-Caucus -13296.64026 -20353.04920
## Party-Caucus 27342.42752 20263.66219
## Political Action Committee-Caucus -12413.10207 -19477.09644
## Union-Caucus -12500.00341 -19579.42973
## Other-Individual -9.87021 -49.32937
## Party-Individual 40629.19756 40065.66544
## Political Action Committee-Individual 873.66798 544.02214
## Union-Individual 786.76664 214.99148
## Party-Other 40639.06777 40074.90584
## Political Action Committee-Other 883.53819 552.81684
## Union-Other 796.63685 224.24095
## Political Action Committee-Party -39755.52958 -40407.74737
## Union-Party -39842.43093 -40644.70799
## Union-Political Action Committee -86.90134 -746.25438
## upr p adj
## Caucus-Business 19743.02960 0.0000024
## Individual-Business -462.22248 0.0000000
## Other-Business -469.56434 0.0000000
## Party-Business 40607.52311 0.0000000
## Political Action Committee-Business 628.89786 0.2684587
## Union-Business 773.10799 0.9678611
## Individual-Caucus -6230.41143 0.0000006
## Other-Caucus -6240.23131 0.0000006
## Party-Caucus 34421.19285 0.0000000
## Political Action Committee-Caucus -5349.10769 0.0000046
## Union-Caucus -5420.57709 0.0000040
## Other-Individual 29.58895 0.9902604
## Party-Individual 41192.72968 0.0000000
## Political Action Committee-Individual 1203.31383 0.0000000
## Union-Individual 1358.54179 0.0009778
## Party-Other 41203.22970 0.0000000
## Political Action Committee-Other 1214.25955 0.0000000
## Union-Other 1369.03274 0.0008043
## Political Action Committee-Party -39103.31180 0.0000000
## Union-Party -39040.15386 0.0000000
## Union-Political Action Committee 572.45169 0.9997333
# Tukey comparisons for `code` as a data frame; its fourth column is "p adj".
result <- as.data.frame(TukeyHSD(model)[["code"]])
# Flag the pairwise comparisons whose adjusted p-value is below 0.05.
result["p adj"] < 0.05
## p adj
## Caucus-Business TRUE
## Individual-Business TRUE
## Other-Business TRUE
## Party-Business TRUE
## Political Action Committee-Business FALSE
## Union-Business FALSE
## Individual-Caucus TRUE
## Other-Caucus TRUE
## Party-Caucus TRUE
## Political Action Committee-Caucus TRUE
## Union-Caucus TRUE
## Other-Individual FALSE
## Party-Individual TRUE
## Political Action Committee-Individual TRUE
## Union-Individual TRUE
## Party-Other TRUE
## Political Action Committee-Other TRUE
## Union-Other TRUE
## Political Action Committee-Party TRUE
## Union-Party TRUE
## Union-Political Action Committee FALSE
# The data is collected through an API; only the 2014 records are requested.
library(jsonlite)
apiResponse <- "https://data.wa.gov/resource/2cup-2fnu.json?year=2014"
medicare <- fromJSON(txt = apiResponse)
# Preview the first rows.
head(medicare)
## average_age average_hcc_score beneficiaries_with_part_a_and_part_b
## 1 71.0 0.90 1098715
## 2 73.0 0.86 1557
## 3 71.0 0.93 5426
## 4 71.0 0.92 28303
## 5 72.0 0.86 11040
## 6 73.0 0.81 22429
## county ffs_beneficiaries ma_beneficiaries ma_participation_rate
## 1 STATE TOTAL 739717 358998 32.7
## 2 ADAMS 1333 224 14.4
## 3 ASOTIN 4515 911 16.8
## 4 BENTON 24054 4249 15.0
## 5 CHELAN 8884 2156 19.5
## 6 CLALLAM 20336 2093 9.3
## percent_african_american percent_eligible_for_medicaid percent_female
## 1 2.6 19.1 53.2
## 2 <NA> 15.6 51.2
## 3 <NA> 18.5 51.8
## 4 0.8 15.5 53.8
## 5 0.2 20.2 51.3
## 6 0.3 12.5 52.6
## percent_hispanic percent_male percent_non_hispanic_white
## 1 3.4 46.9 86.3
## 2 <NA> 48.8 <NA>
## 3 <NA> 48.2 <NA>
## 4 5.1 46.2 90.0
## 5 5.4 48.7 91.7
## 6 1.3 47.4 93.3
## percent_other_unknown state_and_county_fips_code
## 1 7.7 .
## 2 <NA> 53001
## 3 <NA> 53003
## 4 4.1 53005
## 5 2.7 53007
## 6 5.1 53009
## to_sort_by_county_and_year to_sort_by_year_and_county year
## 1 0.2014 2014 2014
## 2 530012014 201453001 2014
## 3 530032014 201453003 2014
## 4 530052014 201453005 2014
## 5 530072014 201453007 2014
## 6 530092014 201453009 2014
Eliminar fila:
# Drop row 1, which holds the "STATE TOTAL" aggregate instead of a county.
medicare <- medicare[-1, ]
head(medicare)
## average_age average_hcc_score beneficiaries_with_part_a_and_part_b
## 2 73.0 0.86 1557
## 3 71.0 0.93 5426
## 4 71.0 0.92 28303
## 5 72.0 0.86 11040
## 6 73.0 0.81 22429
## 7 69.0 0.95 70906
## county ffs_beneficiaries ma_beneficiaries ma_participation_rate
## 2 ADAMS 1333 224 14.4
## 3 ASOTIN 4515 911 16.8
## 4 BENTON 24054 4249 15.0
## 5 CHELAN 8884 2156 19.5
## 6 CLALLAM 20336 2093 9.3
## 7 CLARK 31144 39762 56.1
## percent_african_american percent_eligible_for_medicaid percent_female
## 2 <NA> 15.6 51.2
## 3 <NA> 18.5 51.8
## 4 0.8 15.5 53.8
## 5 0.2 20.2 51.3
## 6 0.3 12.5 52.6
## 7 1.9 26.0 52.8
## percent_hispanic percent_male percent_non_hispanic_white
## 2 <NA> 48.8 <NA>
## 3 <NA> 48.2 <NA>
## 4 5.1 46.2 90.0
## 5 5.4 48.7 91.7
## 6 1.3 47.4 93.3
## 7 2.8 47.2 87.8
## percent_other_unknown state_and_county_fips_code
## 2 <NA> 53001
## 3 <NA> 53003
## 4 4.1 53005
## 5 2.7 53007
## 6 5.1 53009
## 7 7.5 53011
## to_sort_by_county_and_year to_sort_by_year_and_county year
## 2 530012014 201453001 2014
## 3 530032014 201453003 2014
## 4 530052014 201453005 2014
## 5 530072014 201453007 2014
## 6 530092014 201453009 2014
## 7 530112014 201453011 2014
Reseteo el índice:
# Renumber the rows from 1 after the removal.
rownames(medicare) <- NULL
head(medicare)
## average_age average_hcc_score beneficiaries_with_part_a_and_part_b
## 1 73.0 0.86 1557
## 2 71.0 0.93 5426
## 3 71.0 0.92 28303
## 4 72.0 0.86 11040
## 5 73.0 0.81 22429
## 6 69.0 0.95 70906
## county ffs_beneficiaries ma_beneficiaries ma_participation_rate
## 1 ADAMS 1333 224 14.4
## 2 ASOTIN 4515 911 16.8
## 3 BENTON 24054 4249 15.0
## 4 CHELAN 8884 2156 19.5
## 5 CLALLAM 20336 2093 9.3
## 6 CLARK 31144 39762 56.1
## percent_african_american percent_eligible_for_medicaid percent_female
## 1 <NA> 15.6 51.2
## 2 <NA> 18.5 51.8
## 3 0.8 15.5 53.8
## 4 0.2 20.2 51.3
## 5 0.3 12.5 52.6
## 6 1.9 26.0 52.8
## percent_hispanic percent_male percent_non_hispanic_white
## 1 <NA> 48.8 <NA>
## 2 <NA> 48.2 <NA>
## 3 5.1 46.2 90.0
## 4 5.4 48.7 91.7
## 5 1.3 47.4 93.3
## 6 2.8 47.2 87.8
## percent_other_unknown state_and_county_fips_code
## 1 <NA> 53001
## 2 <NA> 53003
## 3 4.1 53005
## 4 2.7 53007
## 5 5.1 53009
## 6 7.5 53011
## to_sort_by_county_and_year to_sort_by_year_and_county year
## 1 530012014 201453001 2014
## 2 530032014 201453003 2014
## 3 530052014 201453005 2014
## 4 530072014 201453007 2014
## 5 530092014 201453009 2014
## 6 530112014 201453011 2014
# Inspect the column types before converting them.
utils::str(medicare)
## 'data.frame': 39 obs. of 18 variables:
## $ average_age : chr "73.0" "71.0" "71.0" "72.0" ...
## $ average_hcc_score : chr "0.86" "0.93" "0.92" "0.86" ...
## $ beneficiaries_with_part_a_and_part_b: chr "1557" "5426" "28303" "11040" ...
## $ county : chr "ADAMS" "ASOTIN" "BENTON" "CHELAN" ...
## $ ffs_beneficiaries : chr "1333" "4515" "24054" "8884" ...
## $ ma_beneficiaries : chr "224" "911" "4249" "2156" ...
## $ ma_participation_rate : chr "14.4" "16.8" "15.0" "19.5" ...
## $ percent_african_american : chr NA NA "0.8" "0.2" ...
## $ percent_eligible_for_medicaid : chr "15.6" "18.5" "15.5" "20.2" ...
## $ percent_female : chr "51.2" "51.8" "53.8" "51.3" ...
## $ percent_hispanic : chr NA NA "5.1" "5.4" ...
## $ percent_male : chr "48.8" "48.2" "46.2" "48.7" ...
## $ percent_non_hispanic_white : chr NA NA "90.0" "91.7" ...
## $ percent_other_unknown : chr NA NA "4.1" "2.7" ...
## $ state_and_county_fips_code : chr "53001" "53003" "53005" "53007" ...
## $ to_sort_by_county_and_year : chr "530012014" "530032014" "530052014" "530072014" ...
## $ to_sort_by_year_and_county : chr "201453001" "201453003" "201453005" "201453007" ...
## $ year : chr "2014" "2014" "2014" "2014" ...
# Convert every column to numeric except 4 (county) and 16-17 (the two
# "to_sort_by_*" keys), which stay as character.
medicare[-c(4, 16, 17)] <- lapply(medicare[-c(4, 16, 17)], as.numeric)
str(medicare)
## 'data.frame': 39 obs. of 18 variables:
## $ average_age : num 73 71 71 72 73 69 72 69 73 70 ...
## $ average_hcc_score : num 0.86 0.93 0.92 0.86 0.81 0.95 0.83 0.99 0.88 0.76 ...
## $ beneficiaries_with_part_a_and_part_b: num 1557 5426 28303 11040 22429 ...
## $ county : chr "ADAMS" "ASOTIN" "BENTON" "CHELAN" ...
## $ ffs_beneficiaries : num 1333 4515 24054 8884 20336 ...
## $ ma_beneficiaries : num 224 911 4249 2156 2093 ...
## $ ma_participation_rate : num 14.4 16.8 15 19.5 9.3 56.1 3.5 47.2 21.2 4.9 ...
## $ percent_african_american : num NA NA 0.8 0.2 0.3 1.9 NA 0.7 0.2 NA ...
## $ percent_eligible_for_medicaid : num 15.6 18.5 15.5 20.2 12.5 26 19.1 25.1 15.7 20.1 ...
## $ percent_female : num 51.2 51.8 53.8 51.3 52.6 52.8 51.7 50.8 54.1 47.4 ...
## $ percent_hispanic : num NA NA 5.1 5.4 1.3 2.8 NA 1.6 6.5 NA ...
## $ percent_male : num 48.8 48.2 46.2 48.7 47.4 47.2 48.3 49.2 45.9 52.6 ...
## $ percent_non_hispanic_white : num NA NA 90 91.7 93.3 87.8 NA 94.1 90.8 NA ...
## $ percent_other_unknown : num NA NA 4.1 2.7 5.1 7.5 NA 3.6 2.5 NA ...
## $ state_and_county_fips_code : num 53001 53003 53005 53007 53009 ...
## $ to_sort_by_county_and_year : chr "530012014" "530032014" "530052014" "530072014" ...
## $ to_sort_by_year_and_county : chr "201453001" "201453003" "201453005" "201453007" ...
## $ year : num 2014 2014 2014 2014 2014 ...
# Pearson correlation matrix (cor()'s default method) of columns 8 to 14, the
# seven percent_* variables, using only rows with no missing value in any of
# them. The matrix is computed once; the repeated identical call was redundant.
res <- cor(medicare[, 8:14], use = "complete.obs")
# Show the coefficients rounded to two decimals.
round(res, 2)
## percent_african_american
## percent_african_american 1.00
## percent_eligible_for_medicaid 0.28
## percent_female 0.43
## percent_hispanic 0.12
## percent_male -0.43
## percent_non_hispanic_white -0.68
## percent_other_unknown 0.73
## percent_eligible_for_medicaid percent_female
## percent_african_american 0.28 0.43
## percent_eligible_for_medicaid 1.00 0.08
## percent_female 0.08 1.00
## percent_hispanic 0.41 0.22
## percent_male -0.08 -1.00
## percent_non_hispanic_white -0.52 -0.38
## percent_other_unknown 0.29 0.25
## percent_hispanic percent_male
## percent_african_american 0.12 -0.43
## percent_eligible_for_medicaid 0.41 -0.08
## percent_female 0.22 -1.00
## percent_hispanic 1.00 -0.22
## percent_male -0.22 1.00
## percent_non_hispanic_white -0.75 0.38
## percent_other_unknown -0.11 -0.25
## percent_non_hispanic_white
## percent_african_american -0.68
## percent_eligible_for_medicaid -0.52
## percent_female -0.38
## percent_hispanic -0.75
## percent_male 0.38
## percent_non_hispanic_white 1.00
## percent_other_unknown -0.55
## percent_other_unknown
## percent_african_american 0.73
## percent_eligible_for_medicaid 0.29
## percent_female 0.25
## percent_hispanic -0.11
## percent_male -0.25
## percent_non_hispanic_white -0.55
## percent_other_unknown 1.00
library(car) # install the package first if it is missing
# Scatterplot of Medicaid eligibility against the non-Hispanic white share.
# The columns are referenced by name (previously positions 9 and 13) so the
# axes get readable labels and the plot does not depend on column order.
scatterplot(
  percent_eligible_for_medicaid ~ percent_non_hispanic_white,
  data = medicare,
  main = "Scatterplot"
)
¿A mayor porcentaje de gente blanca en un condado, menor porcentaje de personas elegibles para medicaid?
Cargo data del IDE (Índice de Densidad del Estado). La data viene de Excel. La abro con openxlsx:
library(openxlsx)
# Build the relative path to the workbook and read it.
folder <- "data"
fileName <- "IDE_limpio.xlsx"
fileToRead <- file.path(folder, fileName)
provinciasNew <- read.xlsx(xlsxFile = fileToRead)
Siempre guarda tu Markdown en tu carpeta Estadística 2. Eso lo haces dándole clic por primera vez a Preview.
# Inspect the column types of the loaded sheet.
utils::str(provinciasNew)
## 'data.frame': 195 obs. of 12 variables:
## $ regionUbigeo : chr "010000" "010000" "010000" "010000" ...
## $ provinciaUbigeo : chr "010200" "010300" "010400" "010500" ...
## $ regionNombre : chr "AMAZONAS" "AMAZONAS" "AMAZONAS" "AMAZONAS" ...
## $ provinciaNombre : chr "Bagua" "Bongará" "Condorcanqui" "Luya" ...
## $ pob2012 : num 77438 32317 51802 52185 30236 ...
## $ ide2012 : num 0.662 0.632 0.46 0.605 0.631 ...
## $ identificacion2012 : num 94.6 97.5 86.2 96.2 97.3 ...
## $ medicos2012 : num 14.61 9.01 8.56 12.42 14.88 ...
## $ escolaridad2012 : num 79.8 76.4 52.2 74.7 79.4 ...
## $ AguaDesague2012 : num 64.5 54.8 37.7 43.3 46.5 ...
## $ electrificacion2012: num 67.9 72.2 39.5 67.4 67.5 ...
## $ mapProv : chr "0102" "0103" "0104" "0105" ...
# List the column names together with their positions.
colnames(provinciasNew)
## [1] "regionUbigeo" "provinciaUbigeo" "regionNombre"
## [4] "provinciaNombre" "pob2012" "ide2012"
## [7] "identificacion2012" "medicos2012" "escolaridad2012"
## [10] "AguaDesague2012" "electrificacion2012" "mapProv"
# Keep the five component indicators (columns 7 to 11), selected by name.
prov_sub <- provinciasNew[c(
  "identificacion2012", "medicos2012", "escolaridad2012",
  "AguaDesague2012", "electrificacion2012"
)]
head(prov_sub)
## identificacion2012 medicos2012 escolaridad2012 AguaDesague2012
## 1 94.60787 14.609121 79.79018 64.47904
## 2 97.46807 9.010207 76.42404 54.83408
## 3 86.23196 8.556959 52.21494 37.71451
## 4 96.19272 12.418003 74.72597 43.34842
## 5 97.34310 14.878682 79.42439 46.50182
## 6 95.17449 10.110167 77.16833 52.51951
## electrificacion2012
## 1 67.91462
## 2 72.16926
## 3 39.48908
## 4 67.39611
## 5 67.54610
## 6 63.11765
# Label every row with its province name.
rownames(prov_sub) <- provinciasNew$provinciaNombre
head(prov_sub)
## identificacion2012 medicos2012 escolaridad2012
## Bagua 94.60787 14.609121 79.79018
## Bongará 97.46807 9.010207 76.42404
## Condorcanqui 86.23196 8.556959 52.21494
## Luya 96.19272 12.418003 74.72597
## Rodríguez de Mendoza 97.34310 14.878682 79.42439
## Utcubamba 95.17449 10.110167 77.16833
## AguaDesague2012 electrificacion2012
## Bagua 64.47904 67.91462
## Bongará 54.83408 72.16926
## Condorcanqui 37.71451 39.48908
## Luya 43.34842 67.39611
## Rodríguez de Mendoza 46.50182 67.54610
## Utcubamba 52.51951 63.11765
# Standardize each column: centre it and divide by its standard deviation.
prov_sub.scaled <- scale(prov_sub, center = TRUE, scale = TRUE)
head(prov_sub.scaled)
## identificacion2012 medicos2012 escolaridad2012
## Bagua -1.09436397 0.36776299 -0.05610104
## Bongará 0.06856468 -0.39755431 -0.35513210
## Condorcanqui -4.49992185 -0.45950881 -2.50574699
## Luya -0.44997860 0.06825845 -0.50598013
## Rodríguez de Mendoza 0.01775379 0.40460944 -0.08859598
## Utcubamba -0.86397982 -0.24720043 -0.28901303
## AguaDesague2012 electrificacion2012
## Bagua 0.39995304 -0.267919475
## Bongará -0.06962008 0.003631957
## Condorcanqui -0.90310156 -2.082170112
## Luya -0.62880972 -0.301012951
## Rodríguez de Mendoza -0.47528375 -0.291439587
## Utcubamba -0.18230687 -0.574083951
Creo distancias. [El paquete NbClust lo hará por nosotros.]
Identifico número óptimo de clusters:
library(NbClust)
# Let NbClust propose the number of clusters, using the "complete" method.
nb <- NbClust(data = prov_sub.scaled, method = "complete")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 5 proposed 4 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
El algoritmo me dice que debo sacar 3 clusters.
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
algoritmo <- "hclust" # hierarchical method
cuantosClusters <- 3
# eclust() receives the linkage through `hc_method`. The earlier call passed
# `method = "complete"`, which is not one of eclust()'s formal arguments, so
# the requested linkage did not take effect and the default ("ward.D2")
# applied. NOTE(review): re-run the chunks that depend on this object, since
# cluster sizes and silhouettes may change with the corrected linkage.
solucionJerarquica1 <- eclust(prov_sub.scaled,
  FUNcluster = algoritmo,
  k = cuantosClusters,
  hc_method = "complete", # linkage, same as in the NbClust call
  graph = FALSE
)
fviz_dend(solucionJerarquica1, rect = TRUE, show_labels = TRUE) # dendrogram
# One-column data frame with the cluster label of every province.
idehc <- as.data.frame(solucionJerarquica1$cluster)
colnames(idehc) <- c("hc") # renamed so the technique is recognisable
Hago merge:
# Attach the hierarchical cluster label to every province, matching
# provinciaNombre against the row names of idehc.
provinciasClust <- merge(
  x = provinciasNew,
  y = idehc,
  by.x = "provinciaNombre",
  by.y = "row.names"
)
head(provinciasClust)
## provinciaNombre regionUbigeo provinciaUbigeo regionNombre pob2012
## 1 Abancay 030000 030100 APURÍMAC 105694
## 2 Acobamba 090000 090200 HUANCAVELICA 73243
## 3 Acomayo 080000 080200 CUSCO 28318
## 4 Aija 020000 020200 ANCASH 7974
## 5 Alto Amazonas 160000 160200 LORETO 117163
## 6 Ambo 100000 100200 HUÁNUCO 57957
## ide2012 identificacion2012 medicos2012 escolaridad2012 AguaDesague2012
## 1 0.7466421 97.90419 16.931097 93.69086 63.00113
## 2 0.5771310 98.75156 2.628194 77.54075 31.17210
## 3 0.6331165 97.19903 10.453706 83.12670 48.23934
## 4 0.6579253 98.23439 8.593985 89.49891 47.78407
## 5 0.5685318 94.03978 10.813959 61.71118 44.49094
## 6 0.5594230 97.92044 11.182912 74.88339 25.58357
## electrificacion2012 mapProv hc
## 1 90.50638 0301 3
## 2 76.72076 0902 1
## 3 70.57030 0802 1
## 4 79.12196 0202 1
## 5 66.00075 1602 2
## 6 62.68590 1002 2
¿Cuántas provincias hay en cada cluster?
# Number of provinces in each hierarchical cluster.
table(provinciasClust[["hc"]])
##
## 1 2 3
## 91 56 48
¿Qué provincias están en el cluster 3?
# Names of the provinces assigned to hierarchical cluster 3.
provinciasClust$provinciaNombre[provinciasClust$hc == 3]
## [1] "Abancay" "Andahuaylas" "Arequipa" "Ascope"
## [5] "Barranca" "Cajamarca" "Callao" "Camaná"
## [9] "Canchis" "Cañete" "Casma" "Castilla"
## [13] "Chachapoyas" "Chepén" "Chiclayo" "Chincha"
## [17] "Cusco" "Huamanga" "Huancayo" "Huaral"
## [21] "Huaraz" "Huarmey" "Huarochirí" "Huaura"
## [25] "Ica" "Ilo" "Islay" "Jorge Basadre"
## [29] "Lima" "Mariscal Nieto" "Nazca" "Pacasmayo"
## [33] "Palpa" "Pisco" "Piura" "Puno"
## [37] "Recuay" "San Martín" "San Román" "Santa"
## [41] "Sullana" "Tacna" "Talara" "Tambopata"
## [45] "Tarata" "Trujillo" "Tumbes" "Yauli"
Saco la media por cluster de cada variable componente del IDE:
# Mean of each component indicator within every hierarchical cluster.
mediasPorCluster <- aggregate(
  cbind(
    identificacion2012, medicos2012, escolaridad2012,
    AguaDesague2012, electrificacion2012
  ) ~ hc,
  data = provinciasClust,
  FUN = mean
)
mediasPorCluster
## hc identificacion2012 medicos2012 escolaridad2012 AguaDesague2012
## 1 1 97.66534 9.397934 83.42101 50.44582
## 2 2 95.35047 8.664781 66.79834 43.46882
## 3 3 98.87954 20.493638 90.62941 82.22228
## electrificacion2012
## 1 73.76446
## 2 54.59970
## 3 89.41167
# Silhouette plot of the hierarchical solution, with the per-cluster summary.
fviz_silhouette(solucionJerarquica1, print.summary = TRUE)
## cluster size ave.sil.width
## 1 1 91 0.27
## 2 2 56 0.16
## 3 3 48 0.41
Creo un objeto con la información de las siluetas:
# Per-province silhouette table (cluster, neighbor, sil_width).
siluetas <- solucionJerarquica1[["silinfo"]][["widths"]]
¿Quiénes están mal agrupados?
# Provinces with a negative silhouette width.
siluetas[siluetas[["sil_width"]] < 0, ]
## cluster neighbor sil_width
## Paucar del Sara Sara 1 3 -0.029485712
## Huánuco 1 3 -0.162745955
## Tahuamanú 1 3 -0.230168491
## Paruro 2 1 -0.007983778
## Lamas 2 1 -0.012407030
## San Miguel 2 1 -0.090568597
## Ambo 2 1 -0.098618357
## Huaylas 2 1 -0.105335588
## Luya 2 1 -0.135692443
## Sihuas 2 1 -0.140644894
## Chota 2 1 -0.181545362
## Recuay 3 1 -0.080046984
algoritmo <- "kmeans"
cuantosClusters <- 3
# k-means partition with the same number of clusters as before; no plot drawn.
solucionKmeans1 <- eclust(
  prov_sub.scaled,
  FUNcluster = algoritmo,
  k = cuantosClusters,
  graph = FALSE
)
En k-medias no veo dendrograma, sino un mapa de cercanía/lejanía:
# Cluster map drawn with points only, without ellipses.
fviz_cluster(solucionKmeans1, geom = "point", ellipse = FALSE)
# One-column data frame of k-means labels, named after the technique.
ideK <- as.data.frame(solucionKmeans1$cluster)
colnames(ideK) <- "km"
# Append the k-means label, matching provinciaNombre against ideK's row names.
provinciasClust <- merge(
  x = provinciasClust,
  y = ideK,
  by.x = "provinciaNombre",
  by.y = "row.names"
)
head(provinciasClust)
## provinciaNombre regionUbigeo provinciaUbigeo regionNombre pob2012
## 1 Abancay 030000 030100 APURÍMAC 105694
## 2 Acobamba 090000 090200 HUANCAVELICA 73243
## 3 Acomayo 080000 080200 CUSCO 28318
## 4 Aija 020000 020200 ANCASH 7974
## 5 Alto Amazonas 160000 160200 LORETO 117163
## 6 Ambo 100000 100200 HUÁNUCO 57957
## ide2012 identificacion2012 medicos2012 escolaridad2012 AguaDesague2012
## 1 0.7466421 97.90419 16.931097 93.69086 63.00113
## 2 0.5771310 98.75156 2.628194 77.54075 31.17210
## 3 0.6331165 97.19903 10.453706 83.12670 48.23934
## 4 0.6579253 98.23439 8.593985 89.49891 47.78407
## 5 0.5685318 94.03978 10.813959 61.71118 44.49094
## 6 0.5594230 97.92044 11.182912 74.88339 25.58357
## electrificacion2012 mapProv hc km
## 1 90.50638 0301 3 1
## 2 76.72076 0902 1 3
## 3 70.57030 0802 1 3
## 4 79.12196 0202 1 3
## 5 66.00075 1602 2 2
## 6 62.68590 1002 2 3
Veo cuántas provincias hay por cluster en k-medias:
# Number of provinces in each k-means cluster.
table(provinciasClust[["km"]])
##
## 1 2 3
## 46 53 96
¿Qué provincias están en el cluster 1?
# Names of the provinces assigned to k-means cluster 1.
provinciasClust$provinciaNombre[provinciasClust$km == 1]
## [1] "Abancay" "Arequipa" "Ascope" "Barranca"
## [5] "Cajamarca" "Callao" "Camaná" "Cañete"
## [9] "Casma" "Castilla" "Chachapoyas" "Chepén"
## [13] "Chiclayo" "Chincha" "Cusco" "Huamanga"
## [17] "Huancayo" "Huánuco" "Huaral" "Huaraz"
## [21] "Huarmey" "Huarochirí" "Huaura" "Ica"
## [25] "Ilo" "Islay" "Jorge Basadre" "Lima"
## [29] "Mariscal Nieto" "Nazca" "Pacasmayo" "Palpa"
## [33] "Piura" "Puno" "San Martín" "San Román"
## [37] "Santa" "Sullana" "Tacna" "Tahuamanú"
## [41] "Talara" "Tambopata" "Tarata" "Trujillo"
## [45] "Tumbes" "Yauli"
Saco las medias por cluster para cada variable componente del IDE. Guardo como data frame:
# Mean of each component indicator within every k-means cluster.
mediasPorCluster2 <- aggregate(
  cbind(
    identificacion2012, medicos2012, escolaridad2012,
    AguaDesague2012, electrificacion2012
  ) ~ km,
  data = provinciasClust,
  FUN = mean
)
mediasPorCluster2
## km identificacion2012 medicos2012 escolaridad2012 AguaDesague2012
## 1 1 98.91312 21.42943 90.55211 82.22638
## 2 2 95.13081 8.52126 66.56416 43.64946
## 3 3 97.72347 9.23702 83.21805 50.78811
## electrificacion2012
## 1 89.55139
## 2 53.96373
## 3 73.77570
Veo las siluetas:
# Silhouette plot of the k-means solution, with the per-cluster summary.
fviz_silhouette(solucionKmeans1, print.summary = TRUE)
## cluster size ave.sil.width
## 1 1 46 0.42
## 2 2 53 0.17
## 3 3 96 0.29
Veo a los mal agrupados:
# Per-province silhouette table of the k-means solution.
siluetasPorProvincia <- solucionKmeans1[["silinfo"]][["widths"]]
# Keep only the rows with a negative silhouette width.
siluetasPorProvincia[siluetasPorProvincia[["sil_width"]] < 0, ]
## cluster neighbor sil_width
## Cangallo 2 3 -0.0003014385
## Lamas 2 3 -0.0278815757
## Paruro 2 3 -0.0405800275
## Manú 2 3 -0.0547122644
## Utcubamba 2 3 -0.0845866649
## Bellavista 2 3 -0.1003152596
## Andahuaylas 3 1 -0.0083392252
## Pisco 3 1 -0.0106442083
## Canchis 3 1 -0.0642255093
Cargo la data:
folder <- "data"
fileName <- "idhPeru.xlsx"
fileToRead <- file.path(folder, fileName)
library(openxlsx)
# Read sheet 2 starting at row 5, skipping empty rows and empty columns.
datos <- read.xlsx(
  xlsxFile = fileToRead,
  sheet = 2,
  startRow = 5,
  skipEmptyRows = TRUE,
  skipEmptyCols = TRUE
)
# Preview the first ten rows.
head(datos, n = 10)
## X1 X2 Distrito habitantes ranking IDH ranking
## 1 000000 PERÚ a/ <NA> 29797694 - 0.4906275 -
## 2 010000 AMAZONAS <NA> 415466 19 0.3689683 19
## 3 010100 <NA> Chachapoyas 54593 115 0.4201136 60
## 4 010101 1 Chachapoyas 27356 195 0.5298996 134
## 5 010102 2 Asuncion 299 1825 0.1642529 1764
## 6 010103 3 Balsas 1575 1468 0.2582213 1220
## 7 010104 4 Cheto 614 1763 0.3206963 816
## 8 010105 5 Chiliquin 818 1703 0.2078854 1578
## 9 010106 6 Chuquibamba 2169 1320 0.2296774 1437
## 10 010107 7 Granada 425 1805 0.3193414 831
## años ranking % ranking años ranking N.S..mes ranking
## 1 73.98881 - 66.2762426 - 8.794300 - 659.0511 -
## 2 73.90743 10 50.6301745 19 6.486100 20 406.6160 18
## 3 73.84635 82 44.6108289 100 7.773447 65 553.2603 44
## 4 73.26092 915 62.7192982 476 9.967432 151 794.1962 95
## 5 73.42032 873 0.7086522 1823 6.086056 1024 327.6701 826
## 6 72.10407 1113 14.4676104 1709 5.259462 1334 322.9567 846
## 7 77.52977 254 44.5600664 899 4.954838 1439 330.1395 814
## 8 76.07404 435 8.0030467 1792 4.379055 1635 250.7874 1111
## 9 71.01417 1278 10.3299153 1770 6.133492 1011 254.3352 1084
## 10 71.45103 1209 40.0996214 1014 5.251665 1341 367.1224 691
¿Qué variables tengo?
# List the column names; several of them are repeated.
colnames(datos)
## [1] "X1" "X2" "Distrito" "habitantes" "ranking"
## [6] "IDH" "ranking" "años" "ranking" "%"
## [11] "ranking" "años" "ranking" "N.S..mes" "ranking"
Voy a la cola:
# Inspect the last ten rows.
tail(datos, n = 10)
## X1
## 2053 250302
## 2054 250303
## 2055 250400
## 2056 250401
## 2057 a/ Incluye las cifras estimadas del distrito de Carmen Alto en la provincia de Huamanga, departamento de Ayacucho, donde. Autoridades locales no permitieron la ejecución del Censo de Población y Vivienda 2007.
## 2058 1/ Cifras estimadas. Autoridades locales no permitieron la ejecución del Censo de Población y Vivienda 2007.
## 2059 2/ Incluye a la población ubicada en área temporal por límites de fronteras de los distritos de Pangoa y Mazamari.
## 2060 3/ Provincias de Lima y Callao.
## 2061 Fuente: INEI. Censo de Población y Vivienda 2007. ENAHO 2007.
## 2062 Elaboración: PNUD-Perú.
## X2 Distrito habitantes ranking IDH ranking años ranking
## 2053 2 Irazola 22100 245 0.3385872 738 80.03469 44
## 2054 3 Curimana 7316 658 0.3150442 852 79.82107 55
## 2055 <NA> Purús 4174 195 0.2955708 130 69.31508 157
## 2056 1 Purus 4174 972 0.2955708 967 69.31508 1476
## 2057 <NA> <NA> NA <NA> NA <NA> NA <NA>
## 2058 <NA> <NA> NA <NA> NA <NA> NA <NA>
## 2059 <NA> <NA> NA <NA> NA <NA> NA <NA>
## 2060 <NA> <NA> NA <NA> NA <NA> NA <NA>
## 2061 <NA> <NA> NA <NA> NA <NA> NA <NA>
## 2062 <NA> <NA> NA <NA> NA <NA> NA <NA>
## % ranking años ranking N.S..mes ranking
## 2053 31.03903 1267 6.583683 866 357.5909 722
## 2054 28.94143 1320 6.728139 811 301.1814 926
## 2055 17.43332 191 7.198020 82 369.7634 92
## 2056 17.43332 1634 7.198020 658 369.7634 679
## 2057 NA <NA> NA <NA> NA <NA>
## 2058 NA <NA> NA <NA> NA <NA>
## 2059 NA <NA> NA <NA> NA <NA>
## 2060 NA <NA> NA <NA> NA <NA>
## 2061 NA <NA> NA <NA> NA <NA>
## 2062 NA <NA> NA <NA> NA <NA>
Elimino filas innecesarias:
# Remove rows 2057 to 2062, which hold footnotes and source credits, not data.
datos <- datos[-(2057:2062), ]
tail(datos, n = 10)
## X1 X2 Distrito habitantes ranking IDH ranking
## 2047 250201 1 Raymondi 31815 168 0.2752737 1101
## 2048 250202 2 Sepahua 7790 634 0.3041428 915
## 2049 250203 3 Tahuania 7798 633 0.2069411 1581
## 2050 250204 4 Yurua 2080 1346 0.1880110 1674
## 2051 250300 <NA> Padre Abad 55866 112 0.3793717 74
## 2052 250301 1 Padre Abad 26450 205 0.4216082 422
## 2053 250302 2 Irazola 22100 245 0.3385872 738
## 2054 250303 3 Curimana 7316 658 0.3150442 852
## 2055 250400 <NA> Purús 4174 195 0.2955708 130
## 2056 250401 1 Purus 4174 972 0.2955708 967
## años ranking % ranking años ranking N.S..mes ranking
## 2047 62.97039 1781 20.89495 1539 6.323261 938 349.9306 749
## 2048 62.71963 1785 26.77310 1383 6.983010 732 387.8883 623
## 2049 66.33663 1662 11.40517 1762 5.497847 1249 218.9890 1267
## 2050 63.83347 1754 13.18217 1728 4.413204 1625 197.5084 1384
## 2051 78.35136 17 38.74236 120 7.119100 86 432.3284 78
## 2052 77.49887 257 44.32234 913 7.603725 559 531.0493 374
## 2053 80.03469 44 31.03903 1267 6.583683 866 357.5909 722
## 2054 79.82107 55 28.94143 1320 6.728139 811 301.1814 926
## 2055 69.31508 157 17.43332 191 7.198020 82 369.7634 92
## 2056 69.31508 1476 17.43332 1634 7.198020 658 369.7634 679
Vuelvo a la cabeza:
# Inspect the first ten rows again.
head(datos, n = 10)
## X1 X2 Distrito habitantes ranking IDH ranking
## 1 000000 PERÚ a/ <NA> 29797694 - 0.4906275 -
## 2 010000 AMAZONAS <NA> 415466 19 0.3689683 19
## 3 010100 <NA> Chachapoyas 54593 115 0.4201136 60
## 4 010101 1 Chachapoyas 27356 195 0.5298996 134
## 5 010102 2 Asuncion 299 1825 0.1642529 1764
## 6 010103 3 Balsas 1575 1468 0.2582213 1220
## 7 010104 4 Cheto 614 1763 0.3206963 816
## 8 010105 5 Chiliquin 818 1703 0.2078854 1578
## 9 010106 6 Chuquibamba 2169 1320 0.2296774 1437
## 10 010107 7 Granada 425 1805 0.3193414 831
## años ranking % ranking años ranking N.S..mes ranking
## 1 73.98881 - 66.2762426 - 8.794300 - 659.0511 -
## 2 73.90743 10 50.6301745 19 6.486100 20 406.6160 18
## 3 73.84635 82 44.6108289 100 7.773447 65 553.2603 44
## 4 73.26092 915 62.7192982 476 9.967432 151 794.1962 95
## 5 73.42032 873 0.7086522 1823 6.086056 1024 327.6701 826
## 6 72.10407 1113 14.4676104 1709 5.259462 1334 322.9567 846
## 7 77.52977 254 44.5600664 899 4.954838 1439 330.1395 814
## 8 76.07404 435 8.0030467 1792 4.379055 1635 250.7874 1111
## 9 71.01417 1278 10.3299153 1770 6.133492 1011 254.3352 1084
## 10 71.45103 1209 40.0996214 1014 5.251665 1341 367.1224 691
Elimino la primera fila:
# Remove the first row by position and preview the result.
datos <- datos[-1, ]
head(datos)
## X1 X2 Distrito habitantes ranking IDH ranking
## 2 010000 AMAZONAS <NA> 415466 19 0.3689683 19
## 3 010100 <NA> Chachapoyas 54593 115 0.4201136 60
## 4 010101 1 Chachapoyas 27356 195 0.5298996 134
## 5 010102 2 Asuncion 299 1825 0.1642529 1764
## 6 010103 3 Balsas 1575 1468 0.2582213 1220
## 7 010104 4 Cheto 614 1763 0.3206963 816
## años ranking % ranking años ranking N.S..mes ranking
## 2 73.90743 10 50.6301745 19 6.486100 20 406.6160 18
## 3 73.84635 82 44.6108289 100 7.773447 65 553.2603 44
## 4 73.26092 915 62.7192982 476 9.967432 151 794.1962 95
## 5 73.42032 873 0.7086522 1823 6.086056 1024 327.6701 826
## 6 72.10407 1113 14.4676104 1709 5.259462 1334 322.9567 846
## 7 77.52977 254 44.5600664 899 4.954838 1439 330.1395 814
# List the column names to locate the repeated "ranking" columns.
colnames(datos)
## [1] "X1" "X2" "Distrito" "habitantes" "ranking"
## [6] "IDH" "ranking" "años" "ranking" "%"
## [11] "ranking" "años" "ranking" "N.S..mes" "ranking"
Elimino columnas innecesarias:
# Positions 5, 7, ..., 15 are the "ranking" columns reported by the
# column-name listing; drop them by position and preview the result.
columnas <- seq(from = 5, to = 15, by = 2)
datos <- datos[, -columnas]
head(datos)
## X1 X2 Distrito habitantes IDH años %
## 2 010000 AMAZONAS <NA> 415466 0.3689683 73.90743 50.6301745
## 3 010100 <NA> Chachapoyas 54593 0.4201136 73.84635 44.6108289
## 4 010101 1 Chachapoyas 27356 0.5298996 73.26092 62.7192982
## 5 010102 2 Asuncion 299 0.1642529 73.42032 0.7086522
## 6 010103 3 Balsas 1575 0.2582213 72.10407 14.4676104
## 7 010104 4 Cheto 614 0.3206963 77.52977 44.5600664
## años.1 N.S..mes
## 2 6.486100 406.6160
## 3 7.773447 553.2603
## 4 9.967432 794.1962
## 5 6.086056 327.6701
## 6 5.259462 322.9567
## 7 4.954838 330.1395
Aíslo REGIONES:
# Keep only the rows whose `Distrito` value is missing.
regiones <- datos[is.na(datos[["Distrito"]]), ]
head(regiones)
## X1 X2 Distrito habitantes IDH años %
## 2 010000 AMAZONAS <NA> 415466 0.3689683 73.90743 50.63017
## 94 020000 ANCASH <NA> 1122792 0.4295926 73.92538 55.30838
## 281 030000 APURÍMAC <NA> 449365 0.3184201 73.19803 53.79581
## 369 040000 AREQUIPA <NA> 1231553 0.5528506 75.51374 85.21130
## 487 050000 AYACUCHO <NA> 658400 0.3337594 70.85430 43.59124
## 610 060000 CAJAMARCA <NA> 1507486 0.3633385 73.66707 52.22049
## años.1 N.S..mes
## 2 6.4861 406.6160
## 94 7.6707 536.1965
## 281 5.5644 297.3386
## 369 9.6657 755.1269
## 487 6.3373 356.3175
## 610 6.3766 390.3197
Reseteo el índice:
# Discard the row names inherited from `datos` so rows are renumbered 1..n.
rownames(regiones) <- NULL
head(regiones)
## X1 X2 Distrito habitantes IDH años % años.1
## 1 010000 AMAZONAS <NA> 415466 0.3689683 73.90743 50.63017 6.4861
## 2 020000 ANCASH <NA> 1122792 0.4295926 73.92538 55.30838 7.6707
## 3 030000 APURÍMAC <NA> 449365 0.3184201 73.19803 53.79581 5.5644
## 4 040000 AREQUIPA <NA> 1231553 0.5528506 75.51374 85.21130 9.6657
## 5 050000 AYACUCHO <NA> 658400 0.3337594 70.85430 43.59124 6.3373
## 6 060000 CAJAMARCA <NA> 1507486 0.3633385 73.66707 52.22049 6.3766
## N.S..mes
## 1 406.6160
## 2 536.1965
## 3 297.3386
## 4 755.1269
## 5 356.3175
## 6 390.3197
Elimino las filas 15 y 16; elimino las columnas innecesarias; cambio el nombre de las columnas:
# Drop rows 15 and 16 by position and keep only the first two columns
# (X1 and X2), then give them descriptive names.
regiones <- regiones[-c(15, 16), c(1, 2)]
colnames(regiones) <- c("ubigeoRegion", "nombreRegion")
head(regiones, n = 10)
## ubigeoRegion nombreRegion
## 1 010000 AMAZONAS
## 2 020000 ANCASH
## 3 030000 APURÍMAC
## 4 040000 AREQUIPA
## 5 050000 AYACUCHO
## 6 060000 CAJAMARCA
## 7 080000 CUSCO
## 8 090000 HUANCAVELICA
## 9 100000 HUÁNUCO
## 10 110000 ICA
# Keep only the rows whose `X2` value is missing.
provincias <- datos[is.na(datos[["X2"]]), ]
head(provincias)
## X1 X2 Distrito habitantes IDH años
## 3 010100 <NA> Chachapoyas 54593 0.4201136 73.84635
## 25 010200 <NA> Bagua 77537 0.3734377 75.99719
## 32 010300 <NA> Bongará 31769 0.3398606 73.37774
## 45 010400 <NA> Condorcanqui 50742 0.1863721 70.87631
## 49 010500 <NA> Luya 52248 0.2996638 73.68010
## 73 010600 <NA> Rodríguez de Mendoza 29895 0.3253056 74.32124
## % años.1 N.S..mes
## 3 44.610829 7.773447 553.2603
## 25 39.679426 6.932584 433.8135
## 32 28.908538 6.442591 425.3696
## 45 7.976181 5.557132 178.6617
## 49 33.566048 5.470024 312.5744
## 73 26.226472 6.059564 403.0446
# Discard the row names inherited from `datos` so rows are renumbered 1..n.
rownames(provincias) <- NULL
head(provincias)
## X1 X2 Distrito habitantes IDH años %
## 1 010100 <NA> Chachapoyas 54593 0.4201136 73.84635 44.610829
## 2 010200 <NA> Bagua 77537 0.3734377 75.99719 39.679426
## 3 010300 <NA> Bongará 31769 0.3398606 73.37774 28.908538
## 4 010400 <NA> Condorcanqui 50742 0.1863721 70.87631 7.976181
## 5 010500 <NA> Luya 52248 0.2996638 73.68010 33.566048
## 6 010600 <NA> Rodríguez de Mendoza 29895 0.3253056 74.32124 26.226472
## años.1 N.S..mes
## 1 7.773447 553.2603
## 2 6.932584 433.8135
## 3 6.442591 425.3696
## 4 5.557132 178.6617
## 5 5.470024 312.5744
## 6 6.059564 403.0446
# Drop row 129 by position and keep only columns 1 and 3 (X1 and Distrito),
# then give them descriptive names.
# NOTE(review): "UbigeoProvincia" is capitalised, unlike "ubigeoRegion" and
# "ubigeoDistrito"; kept as-is because later code may refer to it by name.
provincias <- provincias[-129, c(1, 3)]
colnames(provincias) <- c("UbigeoProvincia", "nombreProvincia")
head(provincias, n = 10)
## UbigeoProvincia nombreProvincia
## 1 010100 Chachapoyas
## 2 010200 Bagua
## 3 010300 Bongará
## 4 010400 Condorcanqui
## 5 010500 Luya
## 6 010600 Rodríguez de Mendoza
## 7 010700 Utcubamba
## 8 020100 Huaraz
## 9 020200 Aija
## 10 020300 Antonio Raymondi
Aíslo DISTRITOS:
# Keep only the rows with no missing value in any column.
# NOTE(review): besides requiring X2 and Distrito, this also drops any row
# with an NA in an indicator column - confirm that is intended.
distritos <- datos[complete.cases(datos), ]
head(distritos, n = 10)
## X1 X2 Distrito habitantes IDH años %
## 4 010101 1 Chachapoyas 27356 0.5298996 73.26092 62.7192982
## 5 010102 2 Asuncion 299 0.1642529 73.42032 0.7086522
## 6 010103 3 Balsas 1575 0.2582213 72.10407 14.4676104
## 7 010104 4 Cheto 614 0.3206963 77.52977 44.5600664
## 8 010105 5 Chiliquin 818 0.2078854 76.07404 8.0030467
## 9 010106 6 Chuquibamba 2169 0.2296774 71.01417 10.3299153
## 10 010107 7 Granada 425 0.3193414 71.45103 40.0996214
## 11 010108 8 Huancas 1063 0.2901601 74.85428 25.6923436
## 12 010109 9 La Jalca 5669 0.2260517 75.27414 18.7753124
## 13 010110 10 Leimebamba 4225 0.2739669 72.25067 11.4461999
## años.1 N.S..mes
## 4 9.967432 794.1962
## 5 6.086056 327.6701
## 6 5.259462 322.9567
## 7 4.954838 330.1395
## 8 4.379055 250.7874
## 9 6.133492 254.3352
## 10 5.251665 367.1224
## 11 5.725090 306.9544
## 12 5.235605 194.4394
## 13 6.411637 368.8416
# Remove the second column (X2) by position.
distritos <- distritos[, -2]
head(distritos, n = 10)
## X1 Distrito habitantes IDH años % años.1
## 4 010101 Chachapoyas 27356 0.5298996 73.26092 62.7192982 9.967432
## 5 010102 Asuncion 299 0.1642529 73.42032 0.7086522 6.086056
## 6 010103 Balsas 1575 0.2582213 72.10407 14.4676104 5.259462
## 7 010104 Cheto 614 0.3206963 77.52977 44.5600664 4.954838
## 8 010105 Chiliquin 818 0.2078854 76.07404 8.0030467 4.379055
## 9 010106 Chuquibamba 2169 0.2296774 71.01417 10.3299153 6.133492
## 10 010107 Granada 425 0.3193414 71.45103 40.0996214 5.251665
## 11 010108 Huancas 1063 0.2901601 74.85428 25.6923436 5.725090
## 12 010109 La Jalca 5669 0.2260517 75.27414 18.7753124 5.235605
## 13 010110 Leimebamba 4225 0.2739669 72.25067 11.4461999 6.411637
## N.S..mes
## 4 794.1962
## 5 327.6701
## 6 322.9567
## 7 330.1395
## 8 250.7874
## 9 254.3352
## 10 367.1224
## 11 306.9544
## 12 194.4394
## 13 368.8416
# Replace the eight column names with descriptive ones, in column order.
colnames(distritos) <- c(
  "ubigeoDistrito", "nombreDistrito", "habitantes", "IDH",
  "esperanza", "secundaria", "educacion", "ingresos"
)
head(distritos, n = 10)
## ubigeoDistrito nombreDistrito habitantes IDH esperanza secundaria
## 4 010101 Chachapoyas 27356 0.5298996 73.26092 62.7192982
## 5 010102 Asuncion 299 0.1642529 73.42032 0.7086522
## 6 010103 Balsas 1575 0.2582213 72.10407 14.4676104
## 7 010104 Cheto 614 0.3206963 77.52977 44.5600664
## 8 010105 Chiliquin 818 0.2078854 76.07404 8.0030467
## 9 010106 Chuquibamba 2169 0.2296774 71.01417 10.3299153
## 10 010107 Granada 425 0.3193414 71.45103 40.0996214
## 11 010108 Huancas 1063 0.2901601 74.85428 25.6923436
## 12 010109 La Jalca 5669 0.2260517 75.27414 18.7753124
## 13 010110 Leimebamba 4225 0.2739669 72.25067 11.4461999
## educacion ingresos
## 4 9.967432 794.1962
## 5 6.086056 327.6701
## 6 5.259462 322.9567
## 7 4.954838 330.1395
## 8 4.379055 250.7874
## 9 6.133492 254.3352
## 10 5.251665 367.1224
## 11 5.725090 306.9544
## 12 5.235605 194.4394
## 13 6.411637 368.8416