Repaso Estadística 2

Bivariada

Relación Categórica - Categórica

Cargo la data:

# Download the governor data set (CSV) from the course repository, keeping
# text columns as character vectors instead of factors.
link <- "https://github.com/PoliticayGobiernoPUCP/EstadisticaPoliticaGobiernoII/blob/master/sesiones/data/governor.csv?raw=true"
governor <- read.csv(link, stringsAsFactors = FALSE)

Saco estructura:

str(governor)

## 'data.frame':    329866 obs. of  5 variables:
##  $ party                 : chr  "REPUBLICAN" "REPUBLICAN" "REPUBLICAN" "REPUBLICAN" ...
##  $ election_year         : int  2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
##  $ amount                : num  25 100 100 25 50 100 50 25 25 100 ...
##  $ contributor_occupation: chr  "" "" "RETIRED" "" ...
##  $ code                  : chr  "Individual" "Individual" "Individual" "Individual" ...

Saco tabla de frecuencias:

table(governor$party)

## 
##    DEMOCRAT INDEPENDENT        NONE  REPUBLICAN 
##      167667          20           4      162175

table(governor$election_year)

## 
##   2008   2012   2016 
## 155124 130461  44281

table(governor$party,governor$election_year)

##              
##                2008  2012  2016
##   DEMOCRAT    70400 62520 34747
##   INDEPENDENT    11     8     1
##   NONE            4     0     0
##   REPUBLICAN  84709 67933  9533

# Keep only the rows whose party is DEMOCRAT or REPUBLICAN.
byCategories <- c("DEMOCRAT", "REPUBLICAN")
governorDR <- governor[is.element(governor$party, byCategories), ]

table(governorDR$party,governorDR$election_year)

##             
##               2008  2012  2016
##   DEMOCRAT   70400 62520 34747
##   REPUBLICAN 84709 67933  9533

# Party-by-year contingency table showing counts only (every proportion panel
# is switched off), followed by Pearson's chi-squared test.
library(gmodels)
CrossTable(
  governorDR$party,
  governorDR$election_year,
  prop.t = FALSE,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  chisq = TRUE
)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |-------------------------|
## 
##  
## Total Observations in Table:  329842 
## 
##  
##                  | governorDR$election_year 
## governorDR$party |      2008 |      2012 |      2016 | Row Total | 
## -----------------|-----------|-----------|-----------|-----------|
##         DEMOCRAT |     70400 |     62520 |     34747 |    167667 | 
## -----------------|-----------|-----------|-----------|-----------|
##       REPUBLICAN |     84709 |     67933 |      9533 |    162175 | 
## -----------------|-----------|-----------|-----------|-----------|
##     Column Total |    155109 |    130453 |     44280 |    329842 | 
## -----------------|-----------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  15814.97     d.f. =  2     p =  0 
## 
## 
##

# Grouped bar chart of the party-by-year counts.
# The legend labels are read from the row names of the very table being
# plotted, so they always agree with the bars in number and order, and the
# legend argument is spelled out in full instead of relying on partial matching.
bartable <- table(governorDR$party, governorDR$election_year)  # contingency table
legendPlot <- rownames(bartable)
barplot(bartable, beside = TRUE, legend.text = legendPlot)  # plot

Relación Categórica - Numérica

Hago un subconjunto con las contribuciones hechas por empresa:

# Rows of governorDR whose contributor code is "Business".
business <- governorDR[governorDR$code == "Business", ]

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.4.4

# Box plot of the contribution amount for each party (business rows only).
p <- ggplot(data = business, mapping = aes(x = party, y = amount))

p + geom_boxplot()

Intervalos de confianza:

# Standard error of the mean: sd(x) / sqrt(n).
#   x     : numeric vector.
#   na.rm : if TRUE, missing values are dropped before computing both the
#           standard deviation and the sample size. The default (FALSE)
#           keeps the original behaviour, where any NA makes the result NA.
# Returns a single numeric value.
seMean <- function(x, na.rm = FALSE) {
  if (na.rm) {
    # Drop NAs up front so that length() counts only the observed values.
    x <- x[!is.na(x)]
  }
  sd(x) / sqrt(length(x))
}

# Mean amount per party.
means <- aggregate(
  x = list(mean = business$amount),
  by = list(party = business$party),
  FUN = mean
)

# Standard error of the mean per party (using the seMean function created above).
sems <- aggregate(
  x = list(seMean = business$amount),
  by = list(party = business$party),
  FUN = seMean
)

means

##        party     mean
## 1   DEMOCRAT 755.9014
## 2 REPUBLICAN 748.6163

sems

##        party   seMean
## 1   DEMOCRAT 11.25733
## 2 REPUBLICAN 12.49764

# Join both summaries into one table; `party` is the only column they share.
data <- merge(means, sems, by = "party")
data

##        party     mean   seMean
## 1   DEMOCRAT 755.9014 11.25733
## 2 REPUBLICAN 748.6163 12.49764

# Add the interval bounds: the mean minus / plus two standard errors.
data$lower <- data$mean - 2 * data$seMean
data$upper <- data$mean + 2 * data$seMean

# Final version of the table.
data

##        party     mean   seMean    lower    upper
## 1   DEMOCRAT 755.9014 11.25733 733.3867 778.4160
## 2 REPUBLICAN 748.6163 12.49764 723.6210 773.6115

# Draw one point per party at its mean.
meanPlot <- ggplot(data, aes(x = party, y = mean)) + geom_point()

# Overlay the error bars running from `lower` to `upper`.
errorPlot <- meanPlot + geom_errorbar(aes(ymin = lower, ymax = upper))

# Render the result.
errorPlot

Ahora sí saco t-test:

# Two-sample t-test of amount by party, treating both variances as equal.
t.test(business$amount ~ business$party, var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  business$amount by business$party
## t = 0.43123, df = 5253, p-value = 0.6663
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -25.83383  40.40406
## sample estimates:
##   mean in group DEMOCRAT mean in group REPUBLICAN 
##                 755.9014                 748.6163

ANOVA

# Mean amount for every combination of contributor code (rows) and party (columns).
tapply(
  governorDR$amount,
  list(code = governorDR$code, party = governorDR$party),
  mean
)

##                             party
## code                           DEMOCRAT REPUBLICAN
##   Business                     755.9014   748.6163
##   Caucus                     25000.0000  1875.0000
##   Individual                   147.6418   157.4498
##   Other                        666.3455   124.7814
##   Party                      32767.7546 66312.0519
##   Political Action Committee   986.4145  1358.1671
##   Union                        937.4966         NA

Hago ANOVA:

# One-way ANOVA of the contribution amount across contributor codes.
model <- aov(amount ~ code, data = governorDR)
summary(model)

##                 Df    Sum Sq   Mean Sq F value Pr(>F)    
## code             6 5.207e+11 8.678e+10    7575 <2e-16 ***
## Residuals   329835 3.779e+12 1.146e+07                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Hay diferencias, pero necesito saber dónde están. Para eso hago pruebas post-hoc; en este caso, Tukey:

TukeyHSD(model)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = amount ~ code, data = governorDR)
## 
## $code
##                                               diff          lwr
## Caucus-Business                        12685.35833   5627.68706
## Individual-Business                     -601.41172   -740.60096
## Other-Business                          -611.28193   -752.99952
## Party-Business                         40027.78584  39448.04857
## Political Action Committee-Business      272.25626    -84.38534
## Union-Business                           185.35492   -402.39816
## Individual-Caucus                     -13286.77005 -20343.12866
## Other-Caucus                          -13296.64026 -20353.04920
## Party-Caucus                           27342.42752  20263.66219
## Political Action Committee-Caucus     -12413.10207 -19477.09644
## Union-Caucus                          -12500.00341 -19579.42973
## Other-Individual                          -9.87021    -49.32937
## Party-Individual                       40629.19756  40065.66544
## Political Action Committee-Individual    873.66798    544.02214
## Union-Individual                         786.76664    214.99148
## Party-Other                            40639.06777  40074.90584
## Political Action Committee-Other         883.53819    552.81684
## Union-Other                              796.63685    224.24095
## Political Action Committee-Party      -39755.52958 -40407.74737
## Union-Party                           -39842.43093 -40644.70799
## Union-Political Action Committee         -86.90134   -746.25438
##                                                upr     p adj
## Caucus-Business                        19743.02960 0.0000024
## Individual-Business                     -462.22248 0.0000000
## Other-Business                          -469.56434 0.0000000
## Party-Business                         40607.52311 0.0000000
## Political Action Committee-Business      628.89786 0.2684587
## Union-Business                           773.10799 0.9678611
## Individual-Caucus                      -6230.41143 0.0000006
## Other-Caucus                           -6240.23131 0.0000006
## Party-Caucus                           34421.19285 0.0000000
## Political Action Committee-Caucus      -5349.10769 0.0000046
## Union-Caucus                           -5420.57709 0.0000040
## Other-Individual                          29.58895 0.9902604
## Party-Individual                       41192.72968 0.0000000
## Political Action Committee-Individual   1203.31383 0.0000000
## Union-Individual                        1358.54179 0.0009778
## Party-Other                            41203.22970 0.0000000
## Political Action Committee-Other        1214.25955 0.0000000
## Union-Other                             1369.03274 0.0008043
## Political Action Committee-Party      -39103.31180 0.0000000
## Union-Party                           -39040.15386 0.0000000
## Union-Political Action Committee         572.45169 0.9997333

# Flag the pairwise comparisons that are significant at the 5% level.
# The adjusted p-value column is selected by its name ("p adj") rather than by
# position, so the check does not depend on the column order of the Tukey table.
result <- as.data.frame(TukeyHSD(model)$code)
result["p adj"] < 0.05

##                                       p adj
## Caucus-Business                        TRUE
## Individual-Business                    TRUE
## Other-Business                         TRUE
## Party-Business                         TRUE
## Political Action Committee-Business   FALSE
## Union-Business                        FALSE
## Individual-Caucus                      TRUE
## Other-Caucus                           TRUE
## Party-Caucus                           TRUE
## Political Action Committee-Caucus      TRUE
## Union-Caucus                           TRUE
## Other-Individual                      FALSE
## Party-Individual                       TRUE
## Political Action Committee-Individual  TRUE
## Union-Individual                       TRUE
## Party-Other                            TRUE
## Political Action Committee-Other       TRUE
## Union-Other                            TRUE
## Political Action Committee-Party       TRUE
## Union-Party                            TRUE
## Union-Political Action Committee      FALSE

Relación Numérica-Numérica

# The data is collected through the data.wa.gov API; only 2014 is requested.
# Note that `apiResponse` holds the request URL; fromJSON() performs the fetch.
library(jsonlite)
apiResponse <- "https://data.wa.gov/resource/2cup-2fnu.json?year=2014"
medicare <- fromJSON(apiResponse)
head(medicare)

##   average_age average_hcc_score beneficiaries_with_part_a_and_part_b
## 1        71.0              0.90                              1098715
## 2        73.0              0.86                                 1557
## 3        71.0              0.93                                 5426
## 4        71.0              0.92                                28303
## 5        72.0              0.86                                11040
## 6        73.0              0.81                                22429
##        county ffs_beneficiaries ma_beneficiaries ma_participation_rate
## 1 STATE TOTAL            739717           358998                  32.7
## 2       ADAMS              1333              224                  14.4
## 3      ASOTIN              4515              911                  16.8
## 4      BENTON             24054             4249                  15.0
## 5      CHELAN              8884             2156                  19.5
## 6     CLALLAM             20336             2093                   9.3
##   percent_african_american percent_eligible_for_medicaid percent_female
## 1                      2.6                          19.1           53.2
## 2                     <NA>                          15.6           51.2
## 3                     <NA>                          18.5           51.8
## 4                      0.8                          15.5           53.8
## 5                      0.2                          20.2           51.3
## 6                      0.3                          12.5           52.6
##   percent_hispanic percent_male percent_non_hispanic_white
## 1              3.4         46.9                       86.3
## 2             <NA>         48.8                       <NA>
## 3             <NA>         48.2                       <NA>
## 4              5.1         46.2                       90.0
## 5              5.4         48.7                       91.7
## 6              1.3         47.4                       93.3
##   percent_other_unknown state_and_county_fips_code
## 1                   7.7                          .
## 2                  <NA>                      53001
## 3                  <NA>                      53003
## 4                   4.1                      53005
## 5                   2.7                      53007
## 6                   5.1                      53009
##   to_sort_by_county_and_year to_sort_by_year_and_county year
## 1                     0.2014                       2014 2014
## 2                  530012014                  201453001 2014
## 3                  530032014                  201453003 2014
## 4                  530052014                  201453005 2014
## 5                  530072014                  201453007 2014
## 6                  530092014                  201453009 2014

Eliminar fila:

# Drop the aggregate "STATE TOTAL" row so that only county rows remain.
# The row is identified by its county value instead of by position, so the
# result does not depend on the order in which the API returns the rows.
medicare <- medicare[!(medicare$county %in% "STATE TOTAL"), ]
head(medicare)

##   average_age average_hcc_score beneficiaries_with_part_a_and_part_b
## 2        73.0              0.86                                 1557
## 3        71.0              0.93                                 5426
## 4        71.0              0.92                                28303
## 5        72.0              0.86                                11040
## 6        73.0              0.81                                22429
## 7        69.0              0.95                                70906
##    county ffs_beneficiaries ma_beneficiaries ma_participation_rate
## 2   ADAMS              1333              224                  14.4
## 3  ASOTIN              4515              911                  16.8
## 4  BENTON             24054             4249                  15.0
## 5  CHELAN              8884             2156                  19.5
## 6 CLALLAM             20336             2093                   9.3
## 7   CLARK             31144            39762                  56.1
##   percent_african_american percent_eligible_for_medicaid percent_female
## 2                     <NA>                          15.6           51.2
## 3                     <NA>                          18.5           51.8
## 4                      0.8                          15.5           53.8
## 5                      0.2                          20.2           51.3
## 6                      0.3                          12.5           52.6
## 7                      1.9                          26.0           52.8
##   percent_hispanic percent_male percent_non_hispanic_white
## 2             <NA>         48.8                       <NA>
## 3             <NA>         48.2                       <NA>
## 4              5.1         46.2                       90.0
## 5              5.4         48.7                       91.7
## 6              1.3         47.4                       93.3
## 7              2.8         47.2                       87.8
##   percent_other_unknown state_and_county_fips_code
## 2                  <NA>                      53001
## 3                  <NA>                      53003
## 4                   4.1                      53005
## 5                   2.7                      53007
## 6                   5.1                      53009
## 7                   7.5                      53011
##   to_sort_by_county_and_year to_sort_by_year_and_county year
## 2                  530012014                  201453001 2014
## 3                  530032014                  201453003 2014
## 4                  530052014                  201453005 2014
## 5                  530072014                  201453007 2014
## 6                  530092014                  201453009 2014
## 7                  530112014                  201453011 2014

Reseteo el índice:

# Reset the row names to the default 1..n sequence after dropping a row.
rownames(medicare) <- NULL
head(medicare)

##   average_age average_hcc_score beneficiaries_with_part_a_and_part_b
## 1        73.0              0.86                                 1557
## 2        71.0              0.93                                 5426
## 3        71.0              0.92                                28303
## 4        72.0              0.86                                11040
## 5        73.0              0.81                                22429
## 6        69.0              0.95                                70906
##    county ffs_beneficiaries ma_beneficiaries ma_participation_rate
## 1   ADAMS              1333              224                  14.4
## 2  ASOTIN              4515              911                  16.8
## 3  BENTON             24054             4249                  15.0
## 4  CHELAN              8884             2156                  19.5
## 5 CLALLAM             20336             2093                   9.3
## 6   CLARK             31144            39762                  56.1
##   percent_african_american percent_eligible_for_medicaid percent_female
## 1                     <NA>                          15.6           51.2
## 2                     <NA>                          18.5           51.8
## 3                      0.8                          15.5           53.8
## 4                      0.2                          20.2           51.3
## 5                      0.3                          12.5           52.6
## 6                      1.9                          26.0           52.8
##   percent_hispanic percent_male percent_non_hispanic_white
## 1             <NA>         48.8                       <NA>
## 2             <NA>         48.2                       <NA>
## 3              5.1         46.2                       90.0
## 4              5.4         48.7                       91.7
## 5              1.3         47.4                       93.3
## 6              2.8         47.2                       87.8
##   percent_other_unknown state_and_county_fips_code
## 1                  <NA>                      53001
## 2                  <NA>                      53003
## 3                   4.1                      53005
## 4                   2.7                      53007
## 5                   5.1                      53009
## 6                   7.5                      53011
##   to_sort_by_county_and_year to_sort_by_year_and_county year
## 1                  530012014                  201453001 2014
## 2                  530032014                  201453003 2014
## 3                  530052014                  201453005 2014
## 4                  530072014                  201453007 2014
## 5                  530092014                  201453009 2014
## 6                  530112014                  201453011 2014

str(medicare)

## 'data.frame':    39 obs. of  18 variables:
##  $ average_age                         : chr  "73.0" "71.0" "71.0" "72.0" ...
##  $ average_hcc_score                   : chr  "0.86" "0.93" "0.92" "0.86" ...
##  $ beneficiaries_with_part_a_and_part_b: chr  "1557" "5426" "28303" "11040" ...
##  $ county                              : chr  "ADAMS" "ASOTIN" "BENTON" "CHELAN" ...
##  $ ffs_beneficiaries                   : chr  "1333" "4515" "24054" "8884" ...
##  $ ma_beneficiaries                    : chr  "224" "911" "4249" "2156" ...
##  $ ma_participation_rate               : chr  "14.4" "16.8" "15.0" "19.5" ...
##  $ percent_african_american            : chr  NA NA "0.8" "0.2" ...
##  $ percent_eligible_for_medicaid       : chr  "15.6" "18.5" "15.5" "20.2" ...
##  $ percent_female                      : chr  "51.2" "51.8" "53.8" "51.3" ...
##  $ percent_hispanic                    : chr  NA NA "5.1" "5.4" ...
##  $ percent_male                        : chr  "48.8" "48.2" "46.2" "48.7" ...
##  $ percent_non_hispanic_white          : chr  NA NA "90.0" "91.7" ...
##  $ percent_other_unknown               : chr  NA NA "4.1" "2.7" ...
##  $ state_and_county_fips_code          : chr  "53001" "53003" "53005" "53007" ...
##  $ to_sort_by_county_and_year          : chr  "530012014" "530032014" "530052014" "530072014" ...
##  $ to_sort_by_year_and_county          : chr  "201453001" "201453003" "201453005" "201453007" ...
##  $ year                                : chr  "2014" "2014" "2014" "2014" ...

# Convert every column to numeric except the three that must stay as text.
# The text columns are listed by name (they sit at positions 4, 16 and 17),
# so the conversion does not depend on the column order returned by the API.
textCols <- c("county", "to_sort_by_county_and_year", "to_sort_by_year_and_county")
numCols <- setdiff(names(medicare), textCols)
medicare[numCols] <- lapply(medicare[numCols], as.numeric)

str(medicare)

## 'data.frame':    39 obs. of  18 variables:
##  $ average_age                         : num  73 71 71 72 73 69 72 69 73 70 ...
##  $ average_hcc_score                   : num  0.86 0.93 0.92 0.86 0.81 0.95 0.83 0.99 0.88 0.76 ...
##  $ beneficiaries_with_part_a_and_part_b: num  1557 5426 28303 11040 22429 ...
##  $ county                              : chr  "ADAMS" "ASOTIN" "BENTON" "CHELAN" ...
##  $ ffs_beneficiaries                   : num  1333 4515 24054 8884 20336 ...
##  $ ma_beneficiaries                    : num  224 911 4249 2156 2093 ...
##  $ ma_participation_rate               : num  14.4 16.8 15 19.5 9.3 56.1 3.5 47.2 21.2 4.9 ...
##  $ percent_african_american            : num  NA NA 0.8 0.2 0.3 1.9 NA 0.7 0.2 NA ...
##  $ percent_eligible_for_medicaid       : num  15.6 18.5 15.5 20.2 12.5 26 19.1 25.1 15.7 20.1 ...
##  $ percent_female                      : num  51.2 51.8 53.8 51.3 52.6 52.8 51.7 50.8 54.1 47.4 ...
##  $ percent_hispanic                    : num  NA NA 5.1 5.4 1.3 2.8 NA 1.6 6.5 NA ...
##  $ percent_male                        : num  48.8 48.2 46.2 48.7 47.4 47.2 48.3 49.2 45.9 52.6 ...
##  $ percent_non_hispanic_white          : num  NA NA 90 91.7 93.3 87.8 NA 94.1 90.8 NA ...
##  $ percent_other_unknown               : num  NA NA 4.1 2.7 5.1 7.5 NA 3.6 2.5 NA ...
##  $ state_and_county_fips_code          : num  53001 53003 53005 53007 53009 ...
##  $ to_sort_by_county_and_year          : chr  "530012014" "530032014" "530052014" "530072014" ...
##  $ to_sort_by_year_and_county          : chr  "201453001" "201453003" "201453005" "201453007" ...
##  $ year                                : num  2014 2014 2014 2014 2014 ...

# Pearson correlations (the default method) among the seven percent_* columns
# (positions 8 to 14), using only the rows with no missing value in any of
# them. The matrix is computed once and then printed rounded to two decimals.
res <- cor(medicare[, 8:14], use = "complete.obs")
round(res, 2)

##                               percent_african_american
## percent_african_american                          1.00
## percent_eligible_for_medicaid                     0.28
## percent_female                                    0.43
## percent_hispanic                                  0.12
## percent_male                                     -0.43
## percent_non_hispanic_white                       -0.68
## percent_other_unknown                             0.73
##                               percent_eligible_for_medicaid percent_female
## percent_african_american                               0.28           0.43
## percent_eligible_for_medicaid                          1.00           0.08
## percent_female                                         0.08           1.00
## percent_hispanic                                       0.41           0.22
## percent_male                                          -0.08          -1.00
## percent_non_hispanic_white                            -0.52          -0.38
## percent_other_unknown                                  0.29           0.25
##                               percent_hispanic percent_male
## percent_african_american                  0.12        -0.43
## percent_eligible_for_medicaid             0.41        -0.08
## percent_female                            0.22        -1.00
## percent_hispanic                          1.00        -0.22
## percent_male                             -0.22         1.00
## percent_non_hispanic_white               -0.75         0.38
## percent_other_unknown                    -0.11        -0.25
##                               percent_non_hispanic_white
## percent_african_american                           -0.68
## percent_eligible_for_medicaid                      -0.52
## percent_female                                     -0.38
## percent_hispanic                                   -0.75
## percent_male                                        0.38
## percent_non_hispanic_white                          1.00
## percent_other_unknown                              -0.55
##                               percent_other_unknown
## percent_african_american                       0.73
## percent_eligible_for_medicaid                  0.29
## percent_female                                 0.25
## percent_hispanic                              -0.11
## percent_male                                  -0.25
## percent_non_hispanic_white                    -0.55
## percent_other_unknown                          1.00

library(car)  # install the package first if it is missing
# Percent eligible for Medicaid against percent non-Hispanic white, by county.
# The columns are referenced by name (they are columns 9 and 13) so the axes
# are labelled with the variable names instead of positional expressions.
scatterplot(percent_eligible_for_medicaid ~ percent_non_hispanic_white,
            data = medicare, main = "Scatterplot")

¿A mayor porcentaje de gente blanca en un condado, menor porcentaje de personas elegibles para medicaid?

Clusters

Cargo data del IDE (Índice de Densidad del Estado). La data viene de Excel. La abro con openxlsx:

# Read the Excel workbook from the `data` folder, resolved relative to the
# current working directory.
library(openxlsx)
folder <- "data"
fileName <- "IDE_limpio.xlsx"
fileToRead <- file.path(folder, fileName)
provinciasNew <- read.xlsx(fileToRead)

Siempre guarda tu Markdown en tu carpeta Estadística 2. Eso lo haces dándole clic por primera vez a Preview.

str(provinciasNew)

## 'data.frame':    195 obs. of  12 variables:
##  $ regionUbigeo       : chr  "010000" "010000" "010000" "010000" ...
##  $ provinciaUbigeo    : chr  "010200" "010300" "010400" "010500" ...
##  $ regionNombre       : chr  "AMAZONAS" "AMAZONAS" "AMAZONAS" "AMAZONAS" ...
##  $ provinciaNombre    : chr  "Bagua" "Bongará" "Condorcanqui" "Luya" ...
##  $ pob2012            : num  77438 32317 51802 52185 30236 ...
##  $ ide2012            : num  0.662 0.632 0.46 0.605 0.631 ...
##  $ identificacion2012 : num  94.6 97.5 86.2 96.2 97.3 ...
##  $ medicos2012        : num  14.61 9.01 8.56 12.42 14.88 ...
##  $ escolaridad2012    : num  79.8 76.4 52.2 74.7 79.4 ...
##  $ AguaDesague2012    : num  64.5 54.8 37.7 43.3 46.5 ...
##  $ electrificacion2012: num  67.9 72.2 39.5 67.4 67.5 ...
##  $ mapProv            : chr  "0102" "0103" "0104" "0105" ...

names(provinciasNew)

##  [1] "regionUbigeo"        "provinciaUbigeo"     "regionNombre"       
##  [4] "provinciaNombre"     "pob2012"             "ide2012"            
##  [7] "identificacion2012"  "medicos2012"         "escolaridad2012"    
## [10] "AguaDesague2012"     "electrificacion2012" "mapProv"

Hago un subconjunto con mis variables de interés

# Keep columns 7 to 11: identificacion2012, medicos2012, escolaridad2012,
# AguaDesague2012 and electrificacion2012.
prov_sub <- provinciasNew[7:11]
head(prov_sub)

##   identificacion2012 medicos2012 escolaridad2012 AguaDesague2012
## 1           94.60787   14.609121        79.79018        64.47904
## 2           97.46807    9.010207        76.42404        54.83408
## 3           86.23196    8.556959        52.21494        37.71451
## 4           96.19272   12.418003        74.72597        43.34842
## 5           97.34310   14.878682        79.42439        46.50182
## 6           95.17449   10.110167        77.16833        52.51951
##   electrificacion2012
## 1            67.91462
## 2            72.16926
## 3            39.48908
## 4            67.39611
## 5            67.54610
## 6            63.11765

Pongo el nombre de las provincias como nombre de las filas

# Label every row with its province name.
rownames(prov_sub) <- provinciasNew$provinciaNombre
head(prov_sub)

##                      identificacion2012 medicos2012 escolaridad2012
## Bagua                          94.60787   14.609121        79.79018
## Bongará                        97.46807    9.010207        76.42404
## Condorcanqui                   86.23196    8.556959        52.21494
## Luya                           96.19272   12.418003        74.72597
## Rodríguez de Mendoza           97.34310   14.878682        79.42439
## Utcubamba                      95.17449   10.110167        77.16833
##                      AguaDesague2012 electrificacion2012
## Bagua                       64.47904            67.91462
## Bongará                     54.83408            72.16926
## Condorcanqui                37.71451            39.48908
## Luya                        43.34842            67.39611
## Rodríguez de Mendoza        46.50182            67.54610
## Utcubamba                   52.51951            63.11765

Estandarizo con scale:

# Standardise each column: subtract its mean and divide by its standard deviation.
prov_sub.scaled <- scale(prov_sub, center = TRUE, scale = TRUE)
head(prov_sub.scaled)

##                      identificacion2012 medicos2012 escolaridad2012
## Bagua                       -1.09436397  0.36776299     -0.05610104
## Bongará                      0.06856468 -0.39755431     -0.35513210
## Condorcanqui                -4.49992185 -0.45950881     -2.50574699
## Luya                        -0.44997860  0.06825845     -0.50598013
## Rodríguez de Mendoza         0.01775379  0.40460944     -0.08859598
## Utcubamba                   -0.86397982 -0.24720043     -0.28901303
##                      AguaDesague2012 electrificacion2012
## Bagua                     0.39995304        -0.267919475
## Bongará                  -0.06962008         0.003631957
## Condorcanqui             -0.90310156        -2.082170112
## Luya                     -0.62880972        -0.301012951
## Rodríguez de Mendoza     -0.47528375        -0.291439587
## Utcubamba                -0.18230687        -0.574083951

Creo distancias. [El paquete NbClust lo hará por nosotros.]
Identifico número óptimo de clusters:

# Let NbClust compare its indices and propose a number of clusters, using
# complete linkage as the agglomeration method.
library(NbClust)
nb <- NbClust(data = prov_sub.scaled, method = "complete")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 5 proposed 4 as the best number of clusters 
## * 2 proposed 5 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

El algoritmo me dice que debo sacar 3 clusters.

Método jerárquico

library(factoextra)

## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

# Hierarchical clustering of the standardised data into three groups.
# eclust() receives the linkage through `hc_method` (default "ward.D2"); it has
# no `method` argument, so the linkage must be passed under that name for
# "complete" to actually be applied.
# NOTE(review): results previously obtained with `method = "complete"` may have
# come from the default linkage instead -- re-run and re-check them.
algoritmo <- "hclust"  # hierarchical method
cuantosClusters <- 3
solucionJerarquica1 <- eclust(prov_sub.scaled,
                              FUNcluster = algoritmo,
                              k = cuantosClusters,
                              hc_method = "complete",  # linkage
                              graph = FALSE)

fviz_dend(solucionJerarquica1, rect = TRUE, show_labels = TRUE)  # show the dendrogram

Creo una columna donde guardo los resultados del método jerárquico para luego unirlos a mi data original

# Store the cluster assignment in a one-column data frame.
idehc <- as.data.frame(solucionJerarquica1$cluster)
names(idehc) <- "hc"  # renamed so the technique is easy to recognise

Hago merge:

# Attach the cluster labels to the original data: province names on the left
# are matched against the row names of idehc on the right.
provinciasClust <- merge(
  provinciasNew,
  idehc,
  by.x = "provinciaNombre",
  by.y = "row.names"  # same as by.y = 0: match on the row names
)
head(provinciasClust)

##   provinciaNombre regionUbigeo provinciaUbigeo regionNombre pob2012
## 1         Abancay       030000          030100     APURÍMAC  105694
## 2        Acobamba       090000          090200 HUANCAVELICA   73243
## 3         Acomayo       080000          080200        CUSCO   28318
## 4            Aija       020000          020200       ANCASH    7974
## 5   Alto Amazonas       160000          160200       LORETO  117163
## 6            Ambo       100000          100200      HUÁNUCO   57957
##     ide2012 identificacion2012 medicos2012 escolaridad2012 AguaDesague2012
## 1 0.7466421           97.90419   16.931097        93.69086        63.00113
## 2 0.5771310           98.75156    2.628194        77.54075        31.17210
## 3 0.6331165           97.19903   10.453706        83.12670        48.23934
## 4 0.6579253           98.23439    8.593985        89.49891        47.78407
## 5 0.5685318           94.03978   10.813959        61.71118        44.49094
## 6 0.5594230           97.92044   11.182912        74.88339        25.58357
##   electrificacion2012 mapProv hc
## 1            90.50638    0301  3
## 2            76.72076    0902  1
## 3            70.57030    0802  1
## 4            79.12196    0202  1
## 5            66.00075    1602  2
## 6            62.68590    1002  2

Exploro

¿Cuántas provincias hay en cada cluster?

# Count how many provinces fall in each hierarchical cluster.
table(provinciasClust$hc)

## 
##  1  2  3 
## 91 56 48

¿Qué provincias están en el cluster 3?

# Names of the provinces assigned to hierarchical cluster 3.
provinciasClust$provinciaNombre[provinciasClust$hc == 3]

##  [1] "Abancay"        "Andahuaylas"    "Arequipa"       "Ascope"        
##  [5] "Barranca"       "Cajamarca"      "Callao"         "Camaná"        
##  [9] "Canchis"        "Cañete"         "Casma"          "Castilla"      
## [13] "Chachapoyas"    "Chepén"         "Chiclayo"       "Chincha"       
## [17] "Cusco"          "Huamanga"       "Huancayo"       "Huaral"        
## [21] "Huaraz"         "Huarmey"        "Huarochirí"     "Huaura"        
## [25] "Ica"            "Ilo"            "Islay"          "Jorge Basadre" 
## [29] "Lima"           "Mariscal Nieto" "Nazca"          "Pacasmayo"     
## [33] "Palpa"          "Pisco"          "Piura"          "Puno"          
## [37] "Recuay"         "San Martín"     "San Román"      "Santa"         
## [41] "Sullana"        "Tacna"          "Talara"         "Tambopata"     
## [45] "Tarata"         "Trujillo"       "Tumbes"         "Yauli"

Saco la media por cluster de cada variable componente del IDE:

# Mean of each of the five component variables within each "hc" cluster.
mediasPorCluster <- aggregate(
  cbind(
    identificacion2012, medicos2012, escolaridad2012,
    AguaDesague2012, electrificacion2012
  ) ~ hc,
  data = provinciasClust,
  FUN = mean
)
mediasPorCluster

##   hc identificacion2012 medicos2012 escolaridad2012 AguaDesague2012
## 1  1           97.66534    9.397934        83.42101        50.44582
## 2  2           95.35047    8.664781        66.79834        43.46882
## 3  3           98.87954   20.493638        90.62941        82.22228
##   electrificacion2012
## 1            73.76446
## 2            54.59970
## 3            89.41167

¿Qué tan buena ha sido la agrupación? Veo las siluetas:

# Silhouette plot for the hierarchical solution; a per-cluster summary
# (size and average silhouette width) is printed as well.
fviz_silhouette(solucionJerarquica1)

##   cluster size ave.sil.width
## 1       1   91          0.27
## 2       2   56          0.16
## 3       3   48          0.41

Creo un objeto con la información de las siluetas:

# Per-observation silhouette information from the hierarchical solution.
siluetas <- solucionJerarquica1$silinfo$widths

¿Quiénes están mal agrupados?

# Rows with a negative silhouette width.
siluetas[siluetas$sil_width < 0, ]

##                      cluster neighbor    sil_width
## Paucar del Sara Sara       1        3 -0.029485712
## Huánuco                    1        3 -0.162745955
## Tahuamanú                  1        3 -0.230168491
## Paruro                     2        1 -0.007983778
## Lamas                      2        1 -0.012407030
## San Miguel                 2        1 -0.090568597
## Ambo                       2        1 -0.098618357
## Huaylas                    2        1 -0.105335588
## Luya                       2        1 -0.135692443
## Sihuas                     2        1 -0.140644894
## Chota                      2        1 -0.181545362
## Recuay                     3        1 -0.080046984

Método k-medias

Aplico el algoritmo con eclust:

# k-means clustering of the same scaled data, using the same number of
# clusters as before. graph = FALSE is spelled out because `F` can be
# reassigned.
algoritmo <- "kmeans"
cuantosClusters <- 3
solucionKmeans1 <- eclust(
  prov_sub.scaled,
  FUNcluster = algoritmo,
  k = cuantosClusters,
  graph = FALSE
)

En k-medias no veo dendrograma, sino mapa de cercanía/lejanía:

# Cluster map of the k-means solution, drawn with points only and no
# ellipses. FALSE is spelled out because `F` can be reassigned.
fviz_cluster(solucionKmeans1, geom = "point", ellipse = FALSE)

Guardo los resultados de la agrupación:

# One-column data frame with the k-means cluster of each observation,
# stored under the column name "km".
ideK <- as.data.frame(solucionKmeans1$cluster)
names(ideK) <- "km"

Hago merge:

# Add the "km" labels to the data that already carries "hc"; the province
# name is matched against the row names of ideK (by.y = 0 means row names).
provinciasClust <- merge(
  provinciasClust,
  ideK,
  by.x = "provinciaNombre",
  by.y = 0
)
head(provinciasClust)

##   provinciaNombre regionUbigeo provinciaUbigeo regionNombre pob2012
## 1         Abancay       030000          030100     APURÍMAC  105694
## 2        Acobamba       090000          090200 HUANCAVELICA   73243
## 3         Acomayo       080000          080200        CUSCO   28318
## 4            Aija       020000          020200       ANCASH    7974
## 5   Alto Amazonas       160000          160200       LORETO  117163
## 6            Ambo       100000          100200      HUÁNUCO   57957
##     ide2012 identificacion2012 medicos2012 escolaridad2012 AguaDesague2012
## 1 0.7466421           97.90419   16.931097        93.69086        63.00113
## 2 0.5771310           98.75156    2.628194        77.54075        31.17210
## 3 0.6331165           97.19903   10.453706        83.12670        48.23934
## 4 0.6579253           98.23439    8.593985        89.49891        47.78407
## 5 0.5685318           94.03978   10.813959        61.71118        44.49094
## 6 0.5594230           97.92044   11.182912        74.88339        25.58357
##   electrificacion2012 mapProv hc km
## 1            90.50638    0301  3  1
## 2            76.72076    0902  1  3
## 3            70.57030    0802  1  3
## 4            79.12196    0202  1  3
## 5            66.00075    1602  2  2
## 6            62.68590    1002  2  3

Veo cuántas provincias hay por cluster en k-medias:

# Count how many provinces fall in each k-means cluster.
table(provinciasClust$km)

## 
##  1  2  3 
## 46 53 96

¿Qué provincias están en el cluster 1?

# Names of the provinces assigned to k-means cluster 1.
provinciasClust$provinciaNombre[provinciasClust$km == 1]

##  [1] "Abancay"        "Arequipa"       "Ascope"         "Barranca"      
##  [5] "Cajamarca"      "Callao"         "Camaná"         "Cañete"        
##  [9] "Casma"          "Castilla"       "Chachapoyas"    "Chepén"        
## [13] "Chiclayo"       "Chincha"        "Cusco"          "Huamanga"      
## [17] "Huancayo"       "Huánuco"        "Huaral"         "Huaraz"        
## [21] "Huarmey"        "Huarochirí"     "Huaura"         "Ica"           
## [25] "Ilo"            "Islay"          "Jorge Basadre"  "Lima"          
## [29] "Mariscal Nieto" "Nazca"          "Pacasmayo"      "Palpa"         
## [33] "Piura"          "Puno"           "San Martín"     "San Román"     
## [37] "Santa"          "Sullana"        "Tacna"          "Tahuamanú"     
## [41] "Talara"         "Tambopata"      "Tarata"         "Trujillo"      
## [45] "Tumbes"         "Yauli"

Saco las medias por cluster para cada variable componente del IDE. Guardo como data frame:

# Mean of each of the five component variables within each "km" cluster.
mediasPorCluster2 <- aggregate(
  cbind(
    identificacion2012, medicos2012, escolaridad2012,
    AguaDesague2012, electrificacion2012
  ) ~ km,
  data = provinciasClust,
  FUN = mean
)
mediasPorCluster2

##   km identificacion2012 medicos2012 escolaridad2012 AguaDesague2012
## 1  1           98.91312    21.42943        90.55211        82.22638
## 2  2           95.13081     8.52126        66.56416        43.64946
## 3  3           97.72347     9.23702        83.21805        50.78811
##   electrificacion2012
## 1            89.55139
## 2            53.96373
## 3            73.77570

Veo las siluetas:

# Silhouette plot for the k-means solution; a per-cluster summary
# (size and average silhouette width) is printed as well.
fviz_silhouette(solucionKmeans1)

##   cluster size ave.sil.width
## 1       1   46          0.42
## 2       2   53          0.17
## 3       3   96          0.29

Veo a los mal agrupados:

# Per-observation silhouette information from the k-means solution.
siluetasPorProvincia <- solucionKmeans1$silinfo$widths

# Keep only the rows whose silhouette width is negative.
siluetasPorProvincia[siluetasPorProvincia$sil_width < 0, ]

##             cluster neighbor     sil_width
## Cangallo          2        3 -0.0003014385
## Lamas             2        3 -0.0278815757
## Paruro            2        3 -0.0405800275
## Manú              2        3 -0.0547122644
## Utcubamba         2        3 -0.0845866649
## Bellavista        2        3 -0.1003152596
## Andahuaylas       3        1 -0.0083392252
## Pisco             3        1 -0.0106442083
## Canchis           3        1 -0.0642255093

Limpieza de datos

Cargo la data:

# Build the path to the workbook and read its second sheet with openxlsx,
# starting at row 5 and skipping empty rows and columns.
# TRUE is spelled out because `T` can be reassigned.
folder <- "data"
fileName <- "idhPeru.xlsx"
fileToRead <- file.path(folder, fileName)
library(openxlsx)
datos <- read.xlsx(
  fileToRead,
  sheet = 2,
  startRow = 5,
  skipEmptyRows = TRUE,
  skipEmptyCols = TRUE
)

# Inspect the first ten rows as read from the sheet.
head(datos, n = 10)

##        X1       X2      Distrito habitantes ranking       IDH ranking
## 1  000000 PERÚ  a/          <NA>   29797694       - 0.4906275       -
## 2  010000 AMAZONAS          <NA>     415466      19 0.3689683      19
## 3  010100     <NA>   Chachapoyas      54593     115 0.4201136      60
## 4  010101        1  Chachapoyas       27356     195 0.5298996     134
## 5  010102        2  Asuncion            299    1825 0.1642529    1764
## 6  010103        3     Balsas          1575    1468 0.2582213    1220
## 7  010104        4       Cheto          614    1763 0.3206963     816
## 8  010105        5    Chiliquin         818    1703 0.2078854    1578
## 9  010106        6 Chuquibamba         2169    1320 0.2296774    1437
## 10 010107        7   Granada            425    1805 0.3193414     831
##        años ranking          % ranking     años ranking N.S..mes ranking
## 1  73.98881       - 66.2762426       - 8.794300       - 659.0511       -
## 2  73.90743      10 50.6301745      19 6.486100      20 406.6160      18
## 3  73.84635      82 44.6108289     100 7.773447      65 553.2603      44
## 4  73.26092     915 62.7192982     476 9.967432     151 794.1962      95
## 5  73.42032     873  0.7086522    1823 6.086056    1024 327.6701     826
## 6  72.10407    1113 14.4676104    1709 5.259462    1334 322.9567     846
## 7  77.52977     254 44.5600664     899 4.954838    1439 330.1395     814
## 8  76.07404     435  8.0030467    1792 4.379055    1635 250.7874    1111
## 9  71.01417    1278 10.3299153    1770 6.133492    1011 254.3352    1084
## 10 71.45103    1209 40.0996214    1014 5.251665    1341 367.1224     691

¿Qué variables tengo?

# List the column names as read from the sheet.
names(datos)

##  [1] "X1"         "X2"         "Distrito"   "habitantes" "ranking"   
##  [6] "IDH"        "ranking"    "años"       "ranking"    "%"         
## [11] "ranking"    "años"       "ranking"    "N.S..mes"   "ranking"

Voy a la cola:

# Inspect the last ten rows.
tail(datos, n = 10)

##                                                                                                                                                                                                                      X1
## 2053                                                                                                                                                                                                             250302
## 2054                                                                                                                                                                                                             250303
## 2055                                                                                                                                                                                                             250400
## 2056                                                                                                                                                                                                             250401
## 2057 a/ Incluye las cifras estimadas del distrito de Carmen Alto en la provincia de Huamanga, departamento de Ayacucho, donde. Autoridades locales no permitieron la ejecución del Censo de Población y Vivienda 2007. 
## 2058                                                                                                       1/ Cifras estimadas. Autoridades locales no permitieron la ejecución del Censo de Población y Vivienda 2007.
## 2059                                                                                                 2/ Incluye a la población ubicada en área temporal por límites de fronteras de los distritos de Pangoa y Mazamari.
## 2060                                                                                                                                                                                    3/ Provincias de Lima y Callao.
## 2061                                                                                                                                                      Fuente: INEI. Censo de Población y Vivienda 2007. ENAHO 2007.
## 2062                                                                                                                                                                                            Elaboración: PNUD-Perú.
##        X2   Distrito habitantes ranking       IDH ranking     años ranking
## 2053    2 Irazola         22100     245 0.3385872     738 80.03469      44
## 2054    3 Curimana         7316     658 0.3150442     852 79.82107      55
## 2055 <NA>      Purús       4174     195 0.2955708     130 69.31508     157
## 2056    1     Purus        4174     972 0.2955708     967 69.31508    1476
## 2057 <NA>       <NA>         NA    <NA>        NA    <NA>       NA    <NA>
## 2058 <NA>       <NA>         NA    <NA>        NA    <NA>       NA    <NA>
## 2059 <NA>       <NA>         NA    <NA>        NA    <NA>       NA    <NA>
## 2060 <NA>       <NA>         NA    <NA>        NA    <NA>       NA    <NA>
## 2061 <NA>       <NA>         NA    <NA>        NA    <NA>       NA    <NA>
## 2062 <NA>       <NA>         NA    <NA>        NA    <NA>       NA    <NA>
##             % ranking     años ranking N.S..mes ranking
## 2053 31.03903    1267 6.583683     866 357.5909     722
## 2054 28.94143    1320 6.728139     811 301.1814     926
## 2055 17.43332     191 7.198020      82 369.7634      92
## 2056 17.43332    1634 7.198020     658 369.7634     679
## 2057       NA    <NA>       NA    <NA>       NA    <NA>
## 2058       NA    <NA>       NA    <NA>       NA    <NA>
## 2059       NA    <NA>       NA    <NA>       NA    <NA>
## 2060       NA    <NA>       NA    <NA>       NA    <NA>
## 2061       NA    <NA>       NA    <NA>       NA    <NA>
## 2062       NA    <NA>       NA    <NA>       NA    <NA>

Elimino filas innecesarias:

# Drop rows 2057 to 2062, then look at the tail again.
datos <- datos[-(2057:2062), ]
tail(datos, n = 10)

##          X1   X2      Distrito habitantes ranking       IDH ranking
## 2047 250201    1    Raymondi        31815     168 0.2752737    1101
## 2048 250202    2     Sepahua         7790     634 0.3041428     915
## 2049 250203    3    Tahuania         7798     633 0.2069411    1581
## 2050 250204    4      Yurua          2080    1346 0.1880110    1674
## 2051 250300 <NA>    Padre Abad      55866     112 0.3793717      74
## 2052 250301    1 Padre Abad         26450     205 0.4216082     422
## 2053 250302    2    Irazola         22100     245 0.3385872     738
## 2054 250303    3    Curimana         7316     658 0.3150442     852
## 2055 250400 <NA>         Purús       4174     195 0.2955708     130
## 2056 250401    1        Purus        4174     972 0.2955708     967
##          años ranking        % ranking     años ranking N.S..mes ranking
## 2047 62.97039    1781 20.89495    1539 6.323261     938 349.9306     749
## 2048 62.71963    1785 26.77310    1383 6.983010     732 387.8883     623
## 2049 66.33663    1662 11.40517    1762 5.497847    1249 218.9890    1267
## 2050 63.83347    1754 13.18217    1728 4.413204    1625 197.5084    1384
## 2051 78.35136      17 38.74236     120 7.119100      86 432.3284      78
## 2052 77.49887     257 44.32234     913 7.603725     559 531.0493     374
## 2053 80.03469      44 31.03903    1267 6.583683     866 357.5909     722
## 2054 79.82107      55 28.94143    1320 6.728139     811 301.1814     926
## 2055 69.31508     157 17.43332     191 7.198020      82 369.7634      92
## 2056 69.31508    1476 17.43332    1634 7.198020     658 369.7634     679

Vuelvo a la cabeza:

# Back to the first ten rows.
head(datos, n = 10)

##        X1       X2      Distrito habitantes ranking       IDH ranking
## 1  000000 PERÚ  a/          <NA>   29797694       - 0.4906275       -
## 2  010000 AMAZONAS          <NA>     415466      19 0.3689683      19
## 3  010100     <NA>   Chachapoyas      54593     115 0.4201136      60
## 4  010101        1  Chachapoyas       27356     195 0.5298996     134
## 5  010102        2  Asuncion            299    1825 0.1642529    1764
## 6  010103        3     Balsas          1575    1468 0.2582213    1220
## 7  010104        4       Cheto          614    1763 0.3206963     816
## 8  010105        5    Chiliquin         818    1703 0.2078854    1578
## 9  010106        6 Chuquibamba         2169    1320 0.2296774    1437
## 10 010107        7   Granada            425    1805 0.3193414     831
##        años ranking          % ranking     años ranking N.S..mes ranking
## 1  73.98881       - 66.2762426       - 8.794300       - 659.0511       -
## 2  73.90743      10 50.6301745      19 6.486100      20 406.6160      18
## 3  73.84635      82 44.6108289     100 7.773447      65 553.2603      44
## 4  73.26092     915 62.7192982     476 9.967432     151 794.1962      95
## 5  73.42032     873  0.7086522    1823 6.086056    1024 327.6701     826
## 6  72.10407    1113 14.4676104    1709 5.259462    1334 322.9567     846
## 7  77.52977     254 44.5600664     899 4.954838    1439 330.1395     814
## 8  76.07404     435  8.0030467    1792 4.379055    1635 250.7874    1111
## 9  71.01417    1278 10.3299153    1770 6.133492    1011 254.3352    1084
## 10 71.45103    1209 40.0996214    1014 5.251665    1341 367.1224     691

Elimino la primera fila:

# Remove the first row.
datos <- datos[-1, ]
head(datos)

##       X1       X2     Distrito habitantes ranking       IDH ranking
## 2 010000 AMAZONAS         <NA>     415466      19 0.3689683      19
## 3 010100     <NA>  Chachapoyas      54593     115 0.4201136      60
## 4 010101        1 Chachapoyas       27356     195 0.5298996     134
## 5 010102        2 Asuncion            299    1825 0.1642529    1764
## 6 010103        3    Balsas          1575    1468 0.2582213    1220
## 7 010104        4      Cheto          614    1763 0.3206963     816
##       años ranking          % ranking     años ranking N.S..mes ranking
## 2 73.90743      10 50.6301745      19 6.486100      20 406.6160      18
## 3 73.84635      82 44.6108289     100 7.773447      65 553.2603      44
## 4 73.26092     915 62.7192982     476 9.967432     151 794.1962      95
## 5 73.42032     873  0.7086522    1823 6.086056    1024 327.6701     826
## 6 72.10407    1113 14.4676104    1709 5.259462    1334 322.9567     846
## 7 77.52977     254 44.5600664     899 4.954838    1439 330.1395     814

# Column names before the column removal that follows.
names(datos)

##  [1] "X1"         "X2"         "Distrito"   "habitantes" "ranking"   
##  [6] "IDH"        "ranking"    "años"       "ranking"    "%"         
## [11] "ranking"    "años"       "ranking"    "N.S..mes"   "ranking"

Elimino columnas innecesarias:

# Drop every second column from position 5 through 15.
columnas <- seq(5, 15, by = 2)
datos <- datos[, -columnas]
head(datos)

##       X1       X2     Distrito habitantes       IDH     años          %
## 2 010000 AMAZONAS         <NA>     415466 0.3689683 73.90743 50.6301745
## 3 010100     <NA>  Chachapoyas      54593 0.4201136 73.84635 44.6108289
## 4 010101        1 Chachapoyas       27356 0.5298996 73.26092 62.7192982
## 5 010102        2 Asuncion            299 0.1642529 73.42032  0.7086522
## 6 010103        3    Balsas          1575 0.2582213 72.10407 14.4676104
## 7 010104        4      Cheto          614 0.3206963 77.52977 44.5600664
##     años.1 N.S..mes
## 2 6.486100 406.6160
## 3 7.773447 553.2603
## 4 9.967432 794.1962
## 5 6.086056 327.6701
## 6 5.259462 322.9567
## 7 4.954838 330.1395

Aíslo REGIONES:

# Keep the rows where Distrito is missing.
regiones <- datos[is.na(datos$Distrito), ]
head(regiones)

##         X1        X2 Distrito habitantes       IDH     años        %
## 2   010000  AMAZONAS     <NA>     415466 0.3689683 73.90743 50.63017
## 94  020000    ANCASH     <NA>    1122792 0.4295926 73.92538 55.30838
## 281 030000  APURÍMAC     <NA>     449365 0.3184201 73.19803 53.79581
## 369 040000  AREQUIPA     <NA>    1231553 0.5528506 75.51374 85.21130
## 487 050000  AYACUCHO     <NA>     658400 0.3337594 70.85430 43.59124
## 610 060000 CAJAMARCA     <NA>    1507486 0.3633385 73.66707 52.22049
##     años.1 N.S..mes
## 2   6.4861 406.6160
## 94  7.6707 536.1965
## 281 5.5644 297.3386
## 369 9.6657 755.1269
## 487 6.3373 356.3175
## 610 6.3766 390.3197

Reseteo el índice:

# Reset the row names so they run 1..n again.
rownames(regiones) <- NULL
head(regiones)

##       X1        X2 Distrito habitantes       IDH     años        % años.1
## 1 010000  AMAZONAS     <NA>     415466 0.3689683 73.90743 50.63017 6.4861
## 2 020000    ANCASH     <NA>    1122792 0.4295926 73.92538 55.30838 7.6707
## 3 030000  APURÍMAC     <NA>     449365 0.3184201 73.19803 53.79581 5.5644
## 4 040000  AREQUIPA     <NA>    1231553 0.5528506 75.51374 85.21130 9.6657
## 5 050000  AYACUCHO     <NA>     658400 0.3337594 70.85430 43.59124 6.3373
## 6 060000 CAJAMARCA     <NA>    1507486 0.3633385 73.66707 52.22049 6.3766
##   N.S..mes
## 1 406.6160
## 2 536.1965
## 3 297.3386
## 4 755.1269
## 5 356.3175
## 6 390.3197

Elimino las filas 15 y 16; elimino las columnas innecesarias; cambio el nombre a las columnas:

# Drop rows 15 and 16, keep only the first two columns, and rename them.
regiones <- regiones[-c(15, 16), c(1, 2)]
colnames(regiones) <- c("ubigeoRegion", "nombreRegion")
head(regiones, n = 10)

##    ubigeoRegion nombreRegion
## 1        010000     AMAZONAS
## 2        020000       ANCASH
## 3        030000     APURÍMAC
## 4        040000     AREQUIPA
## 5        050000     AYACUCHO
## 6        060000    CAJAMARCA
## 7        080000        CUSCO
## 8        090000 HUANCAVELICA
## 9        100000      HUÁNUCO
## 10       110000          ICA

# Keep the rows where X2 is missing.
provincias <- datos[is.na(datos$X2), ]
head(provincias)

##        X1   X2             Distrito habitantes       IDH     años
## 3  010100 <NA>          Chachapoyas      54593 0.4201136 73.84635
## 25 010200 <NA>                Bagua      77537 0.3734377 75.99719
## 32 010300 <NA>              Bongará      31769 0.3398606 73.37774
## 45 010400 <NA>         Condorcanqui      50742 0.1863721 70.87631
## 49 010500 <NA>                 Luya      52248 0.2996638 73.68010
## 73 010600 <NA> Rodríguez de Mendoza      29895 0.3253056 74.32124
##            %   años.1 N.S..mes
## 3  44.610829 7.773447 553.2603
## 25 39.679426 6.932584 433.8135
## 32 28.908538 6.442591 425.3696
## 45  7.976181 5.557132 178.6617
## 49 33.566048 5.470024 312.5744
## 73 26.226472 6.059564 403.0446

# Reset the row names so they run 1..n again.
rownames(provincias) <- NULL
head(provincias)

##       X1   X2             Distrito habitantes       IDH     años         %
## 1 010100 <NA>          Chachapoyas      54593 0.4201136 73.84635 44.610829
## 2 010200 <NA>                Bagua      77537 0.3734377 75.99719 39.679426
## 3 010300 <NA>              Bongará      31769 0.3398606 73.37774 28.908538
## 4 010400 <NA>         Condorcanqui      50742 0.1863721 70.87631  7.976181
## 5 010500 <NA>                 Luya      52248 0.2996638 73.68010 33.566048
## 6 010600 <NA> Rodríguez de Mendoza      29895 0.3253056 74.32124 26.226472
##     años.1 N.S..mes
## 1 7.773447 553.2603
## 2 6.932584 433.8135
## 3 6.442591 425.3696
## 4 5.557132 178.6617
## 5 5.470024 312.5744
## 6 6.059564 403.0446

# Drop row 129, keep columns 1 and 3, and rename them.
provincias <- provincias[-129, c(1, 3)]
colnames(provincias) <- c("UbigeoProvincia", "nombreProvincia")
head(provincias, n = 10)

##    UbigeoProvincia      nombreProvincia
## 1           010100          Chachapoyas
## 2           010200                Bagua
## 3           010300              Bongará
## 4           010400         Condorcanqui
## 5           010500                 Luya
## 6           010600 Rodríguez de Mendoza
## 7           010700            Utcubamba
## 8           020100               Huaraz
## 9           020200                 Aija
## 10          020300     Antonio Raymondi

Aíslo DISTRITOS:

# Keep only the rows that have no missing value in any column.
distritos <- datos[complete.cases(datos), ]
head(distritos, n = 10)

##        X1 X2      Distrito habitantes       IDH     años          %
## 4  010101  1  Chachapoyas       27356 0.5298996 73.26092 62.7192982
## 5  010102  2  Asuncion            299 0.1642529 73.42032  0.7086522
## 6  010103  3     Balsas          1575 0.2582213 72.10407 14.4676104
## 7  010104  4       Cheto          614 0.3206963 77.52977 44.5600664
## 8  010105  5    Chiliquin         818 0.2078854 76.07404  8.0030467
## 9  010106  6 Chuquibamba         2169 0.2296774 71.01417 10.3299153
## 10 010107  7   Granada            425 0.3193414 71.45103 40.0996214
## 11 010108  8     Huancas         1063 0.2901601 74.85428 25.6923436
## 12 010109  9     La Jalca        5669 0.2260517 75.27414 18.7753124
## 13 010110 10   Leimebamba        4225 0.2739669 72.25067 11.4461999
##      años.1 N.S..mes
## 4  9.967432 794.1962
## 5  6.086056 327.6701
## 6  5.259462 322.9567
## 7  4.954838 330.1395
## 8  4.379055 250.7874
## 9  6.133492 254.3352
## 10 5.251665 367.1224
## 11 5.725090 306.9544
## 12 5.235605 194.4394
## 13 6.411637 368.8416

# Remove the second column.
distritos <- distritos[, -2]
head(distritos, n = 10)

##        X1      Distrito habitantes       IDH     años          %   años.1
## 4  010101  Chachapoyas       27356 0.5298996 73.26092 62.7192982 9.967432
## 5  010102  Asuncion            299 0.1642529 73.42032  0.7086522 6.086056
## 6  010103     Balsas          1575 0.2582213 72.10407 14.4676104 5.259462
## 7  010104       Cheto          614 0.3206963 77.52977 44.5600664 4.954838
## 8  010105    Chiliquin         818 0.2078854 76.07404  8.0030467 4.379055
## 9  010106 Chuquibamba         2169 0.2296774 71.01417 10.3299153 6.133492
## 10 010107   Granada            425 0.3193414 71.45103 40.0996214 5.251665
## 11 010108     Huancas         1063 0.2901601 74.85428 25.6923436 5.725090
## 12 010109     La Jalca        5669 0.2260517 75.27414 18.7753124 5.235605
## 13 010110   Leimebamba        4225 0.2739669 72.25067 11.4461999 6.411637
##    N.S..mes
## 4  794.1962
## 5  327.6701
## 6  322.9567
## 7  330.1395
## 8  250.7874
## 9  254.3352
## 10 367.1224
## 11 306.9544
## 12 194.4394
## 13 368.8416

# Give the eight remaining columns descriptive names.
colnames(distritos) <- c(
  "ubigeoDistrito", "nombreDistrito", "habitantes", "IDH",
  "esperanza", "secundaria", "educacion", "ingresos"
)
head(distritos, n = 10)

##    ubigeoDistrito nombreDistrito habitantes       IDH esperanza secundaria
## 4          010101   Chachapoyas       27356 0.5298996  73.26092 62.7192982
## 5          010102   Asuncion            299 0.1642529  73.42032  0.7086522
## 6          010103      Balsas          1575 0.2582213  72.10407 14.4676104
## 7          010104        Cheto          614 0.3206963  77.52977 44.5600664
## 8          010105     Chiliquin         818 0.2078854  76.07404  8.0030467
## 9          010106  Chuquibamba         2169 0.2296774  71.01417 10.3299153
## 10         010107    Granada            425 0.3193414  71.45103 40.0996214
## 11         010108      Huancas         1063 0.2901601  74.85428 25.6923436
## 12         010109      La Jalca        5669 0.2260517  75.27414 18.7753124
## 13         010110    Leimebamba        4225 0.2739669  72.25067 11.4461999
##    educacion ingresos
## 4   9.967432 794.1962
## 5   6.086056 327.6701
## 6   5.259462 322.9567
## 7   4.954838 330.1395
## 8   4.379055 250.7874
## 9   6.133492 254.3352
## 10  5.251665 367.1224
## 11  5.725090 306.9544
## 12  5.235605 194.4394
## 13  6.411637 368.8416