Leyendo la data(URL)

link1 <- "https://raw.githubusercontent.com/Hurtado20161224/Proyectoaplicativo/master/dataVictimizacion2.csv"
bd = read.csv(link1,header=TRUE,na.strings = "-",quote = "",stringsAsFactors = FALSE,row.names =NULL,encoding = "utf-08")

bd$DEPARTAMENTO <- as.factor(bd$DEPARTAMENTO)

levels(bd$DEPARTAMENTO) <- c("ANCASH","AMAZONAS","APURIMAC","AREQUIPA","AYACUCHO","CAJAMARCA","CUSCO","HUANUCO","HUANCAVELICA","ICA","JUNIN","LA LIBERTAD","LAMBAYEQUE","LIMA","LORETO","MADRE DE DIOS","MOQUEGUA","PASCO","PIURA","CALLAO","PUNO","SAN MARTIN","TACNA","TUMBES","UCAYALI")

head(bd)
##   Ubigeo DEPARTAMENTO    Distrito numero.de.victimizacion poblacion2016
## 1  10101     AMAZONAS CHACHAPOYAS                    4313         28937
## 2  10109     AMAZONAS    LA JALCA                     213          5503
## 3  10201     AMAZONAS       BAGUA                    2114         26005
## 4  10202     AMAZONAS    ARAMANGO                     246         10969
## 5  10205     AMAZONAS       IMAZA                     123         24201
## 6  10206     AMAZONAS     LA PECA                     111          8019
##   victimas.en.su.distrito victimas.en.su.barrio vigilancia.zona
## 1                     867                   401            4228
## 2                      25                     9             213
## 3                     432                   175            2078
## 4                      34                    18             245
## 5                      26                    13             123
## 6                      12                    12             108
##   vigilancia.policia.nacional presencia.de.policia.nacional
## 1                        1596                          1306
## 2                          60                            69
## 3                         583                           613
## 4                         114                            56
## 5                          32                            46
## 6                          34                            77
##   confianza.segurirdad.municipal confianza.en.poder.judicial
## 1                           4229                        4115
## 2                            209                         191
## 3                           2067                        1968
## 4                            243                         227
## 5                            122                         112
## 6                            109                          95
##   confianza.en.ministerio.publico
## 1                            4121
## 2                             195
## 3                            1973
## 4                             227
## 5                             112
## 6                             100

Eliminacion de variables que presentan problemas de multicolinealidad

tail(bd)
##     Ubigeo DEPARTAMENTO   Distrito numero.de.victimizacion poblacion2016
## 656 250107      UCAYALI   MANANTAY                    3146         81150
## 657 250201      UCAYALI   RAYMONDI                    1059         34213
## 658 250202      UCAYALI    SEPAHUA                     305          8939
## 659 250301      UCAYALI PADRE ABAD                    1028         26132
## 660 250302      UCAYALI    IRAZOLA                     716         10527
## 661 250303      UCAYALI   CURIMANA                      37          8697
##     victimas.en.su.distrito victimas.en.su.barrio vigilancia.zona
## 656                     906                   402            3110
## 657                     276                    86            1043
## 658                      69                    23             302
## 659                     315                    98            1019
## 660                     189                    50             716
## 661                      11                    11              37
##     vigilancia.policia.nacional presencia.de.policia.nacional
## 656                         707                          1239
## 657                         222                           370
## 658                          90                            93
## 659                         165                           318
## 660                         210                           203
## 661                          10                            27
##     confianza.segurirdad.municipal confianza.en.poder.judicial
## 656                           3087                        2995
## 657                           1032                         994
## 658                            290                         273
## 659                           1008                         964
## 660                            692                         671
## 661                             34                          35
##     confianza.en.ministerio.publico
## 656                            2983
## 657                             991
## 658                             267
## 659                             959
## 660                             662
## 661                              34
colnames(bd)
##  [1] "Ubigeo"                          "DEPARTAMENTO"                   
##  [3] "Distrito"                        "numero.de.victimizacion"        
##  [5] "poblacion2016"                   "victimas.en.su.distrito"        
##  [7] "victimas.en.su.barrio"           "vigilancia.zona"                
##  [9] "vigilancia.policia.nacional"     "presencia.de.policia.nacional"  
## [11] "confianza.segurirdad.municipal"  "confianza.en.poder.judicial"    
## [13] "confianza.en.ministerio.publico"
bdtemp <- bd
bd <- bd[,-c(1,2,3,
             #4,
             5,8,13)] # Elimina ubigeo

Imputacion por la mediana

library(mlr)
## Loading required package: ParamHelpers
data_parametrica <- impute(bd, classes = list(factor = imputeMode(), 
                                                             integer = imputeMedian(),
                                                             numeric = imputeMedian()),
                           dummy.classes = c("integer","factor"), dummy.type = "numeric")
bd1=data_parametrica$data[,1:min(dim(bd))]
summary(bd1)
##  numero.de.victimizacion victimas.en.su.distrito victimas.en.su.barrio
##  Min.   :   15.0         Min.   :   2.0          Min.   :   1.00      
##  1st Qu.:   99.0         1st Qu.:  22.0          1st Qu.:  12.00      
##  Median :  236.0         Median :  54.0          Median :  26.00      
##  Mean   :  691.8         Mean   : 220.2          Mean   :  89.25      
##  3rd Qu.:  591.0         3rd Qu.: 170.0          3rd Qu.:  64.00      
##  Max.   :14334.0         Max.   :5143.0          Max.   :2115.00      
##  vigilancia.policia.nacional presencia.de.policia.nacional
##  Min.   :   1.0              Min.   :   8.0               
##  1st Qu.:  17.0              1st Qu.:  51.0               
##  Median :  52.5              Median :  93.0               
##  Mean   : 162.1              Mean   : 244.7               
##  3rd Qu.: 135.0              3rd Qu.: 196.0               
##  Max.   :3596.0              Max.   :5244.0               
##  confianza.segurirdad.municipal confianza.en.poder.judicial
##  Min.   :   15.0                Min.   :   13.0            
##  1st Qu.:   95.0                1st Qu.:   89.0            
##  Median :  228.0                Median :  214.0            
##  Mean   :  677.4                Mean   :  642.7            
##  3rd Qu.:  582.0                3rd Qu.:  549.0            
##  Max.   :14208.0                Max.   :13902.0
dim(bd1)
## [1] 661   7
tem.num.vict <- bd1$numero.de.victimizacion
bd1 <- bd1[,-1]

Reduccion de variables para formar latentes

library(psych)
#
# Matriz de correlación
corrMatrix<-lowerCor(bd1)
##                                vctms.n.s.d vctms.n.s.b vgl.. pr... cnf..
## victimas.en.su.distrito        1.00                                     
## victimas.en.su.barrio          0.99        1.00                         
## vigilancia.policia.nacional    0.85        0.84        1.00             
## presencia.de.policia.nacional  0.98        0.98        0.83  1.00       
## confianza.segurirdad.municipal 0.98        0.97        0.90  0.98  1.00 
## confianza.en.poder.judicial    0.98        0.97        0.90  0.98  1.00 
##                                cn...
## victimas.en.su.distrito             
## victimas.en.su.barrio               
## vigilancia.policia.nacional         
## presencia.de.policia.nacional       
## confianza.segurirdad.municipal      
## confianza.en.poder.judicial    1.00
#
cat("Nº de variables que no correlacionan bien: ",
    sum( sapply(as.data.frame(corrMatrix), function(x) all(abs(x) < 0.3)) ) )
## Nº de variables que no correlacionan bien:  0
#
cat("Nº de variables que podrian causar multicolinealidad: ",
    sum( sapply(as.data.frame(corrMatrix), function(x) any(abs(x) >= 0.9 & abs(x) != 1)) ) )
## Nº de variables que podrian causar multicolinealidad:  5

Realizacion del analisis factorial que nos sugiere formar dos latentes

library(usdm)
## Loading required package: sp
## Loading required package: raster
## 
## Attaching package: 'raster'
## The following object is masked from 'package:mlr':
## 
##     resample
## The following object is masked from 'package:ParamHelpers':
## 
##     getValues
vifstep(bd1,th=10)
## 4 variables from the 6 input variables have collinearity problem: 
##  
## confianza.segurirdad.municipal confianza.en.poder.judicial victimas.en.su.barrio victimas.en.su.distrito 
## 
## After excluding the collinear variables, the linear correlation coefficients ranges between: 
## min correlation ( presencia.de.policia.nacional ~ vigilancia.policia.nacional ):  0.8287628 
## max correlation ( presencia.de.policia.nacional ~ vigilancia.policia.nacional ):  0.8287628 
## 
## ---------- VIFs of the remained variables -------- 
##                       Variables      VIF
## 1   vigilancia.policia.nacional 3.193335
## 2 presencia.de.policia.nacional 3.193335
# Prueba de KMO
KMO(bd1)[1:2]
## $MSA
## [1] 0.7569126
## 
## $MSAi
##        victimas.en.su.distrito          victimas.en.su.barrio 
##                      0.7325947                      0.7247999 
##    vigilancia.policia.nacional  presencia.de.policia.nacional 
##                      0.7261371                      0.7326029 
## confianza.segurirdad.municipal    confianza.en.poder.judicial 
##                      0.7807750                      0.8512290
# Prueba de esfericidad de Bartlett  
psych::mardia(bd1,na.rm = TRUE,plot=TRUE)

## Call: psych::mardia(x = bd1, na.rm = TRUE, plot = TRUE)
## 
## Mardia tests of multivariate skew and kurtosis
## Use describe(x) the to get univariate tests
## n.obs = 661   num.vars =  6 
## b1p =  188.75   skew =  20794.43  with probability =  0
##  small sample skew =  20915.89  with probability =  0
## b2p =  426.32   kurtosis =  496.36  with probability =  0
cortest.bartlett(bd1)$p.value<0.05
## R was not square, finding R from data
## [1] TRUE
sedimentacion= princomp(bd1,score= TRUE, cor= TRUE)
plot(sedimentacion,type="lines")

Por medio de la rotacion varimax,evidenciamos la formacion de dos variables latente.En este caso,se usa este medio de rotacion para un mejor reparto de la proporcion de variacion explicada

# factorial
mod1 = fa(bd1,2,rotate="varimax", scores=T)
## The estimated weights for the factor scores are probably incorrect.  Try a different factor extraction method.
print(mod1,digits=2,cut = 0.2)
## Factor Analysis using method =  minres
## Call: fa(r = bd1, nfactors = 2, rotate = "varimax", scores = T)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                                 MR1  MR2   h2     u2 com
## victimas.en.su.distrito        0.83 0.54 0.98 0.0169 1.7
## victimas.en.su.barrio          0.85 0.51 0.98 0.0208 1.7
## vigilancia.policia.nacional    0.50 0.81 0.89 0.1056 1.7
## presencia.de.policia.nacional  0.85 0.51 0.99 0.0143 1.6
## confianza.segurirdad.municipal 0.76 0.65 1.00 0.0024 2.0
## confianza.en.poder.judicial    0.76 0.65 1.00 0.0037 2.0
## 
##                        MR1  MR2
## SS loadings           3.53 2.31
## Proportion Var        0.59 0.38
## Cumulative Var        0.59 0.97
## Proportion Explained  0.60 0.40
## Cumulative Proportion 0.60 1.00
## 
## Mean item complexity =  1.8
## Test of the hypothesis that 2 factors are sufficient.
## 
## The degrees of freedom for the null model are  15  and the objective function was  20.7 with Chi Square of  13606.5
## The degrees of freedom for the model are 4  and the objective function was  2.38 
## 
## The root mean square of the residuals (RMSR) is  0 
## The df corrected root mean square of the residuals is  0.01 
## 
## The harmonic number of observations is  661 with the empirical chi square  0.39  with prob <  0.98 
## The total number of observations was  661  with Likelihood Chi Square =  1559.43  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.57
## RMSEA index =  0.77  and the 90 % confidence intervals are  0.736 0.8
## BIC =  1533.46
## Fit based upon off diagonal values = 1
puntaje <-mod1$scores

colnames(puntaje) <- c("victconf","cappol")

ormacion d elos clusters por medio Kmeans,basado en los scores

library(factoextra)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
mod2  = eclust(puntaje,
                         FUNcluster ="kmeans",
                         k = 2, #número de clusters -ESTO CAMBIA-
                         graph = F)

DEPARTAMENTO.scaled <- data.frame(puntaje, kmeans=mod2$cluster)

table(DEPARTAMENTO.scaled$kmeans)
## 
##   1   2 
## 624  37
aggregate(cbind(victconf,cappol) ~ kmeans, data=DEPARTAMENTO.scaled,FUN=mean)
##   kmeans    victconf     cappol
## 1      1 -0.09051543 -0.1889171
## 2      2  1.52653041  3.1860613
siluetas = mod2$silinfo$widths
siluetas[siluetas$sil_width<0,]
##     cluster neighbor   sil_width
## 21        2        1 -0.05908159
## 3         2        1 -0.06743154
## 46        2        1 -0.07247878
## 462       2        1 -0.11838391
## 236       2        1 -0.14698598
## 464       2        1 -0.16536916
## 43        2        1 -0.23447023
## 382       2        1 -0.23855867
## 216       2        1 -0.25531841
## 463       2        1 -0.32130389

Aplicacion regresion lineal

subdata <- cbind(puntaje,tem.num.vict)

head(x = subdata, n = 30)
##          victconf       cappol tem.num.vict
##  [1,] -1.15035516  5.604313092         4313
##  [2,] -0.47018052 -0.008816447          213
##  [3,] -1.01392325  2.925062745         2114
##  [4,] -0.70044123  0.365144853          246
##  [5,] -0.40775879 -0.158570772          123
##  [6,] -0.13517816 -0.567812016          111
##  [7,] -0.47767106  0.020191398          179
##  [8,] -0.42464575  0.388547427          619
##  [9,] -0.48361093 -0.049547523          165
## [10,] -0.20338068 -0.370714001          183
## [11,]  0.09393278 -0.847816157          187
## [12,] -0.38724388 -0.076190089          262
## [13,] -0.31855450 -0.112014623          304
## [14,] -0.60401968  0.067797969          108
## [15,] -0.50669779  0.208570019          334
## [16,] -1.55935232  4.218679292         2623
## [17,] -0.66909385  0.811032617          698
## [18,] -0.42706478  0.119688246          346
## [19,] -0.24135224 -0.149123085          361
## [20,]  1.03319633  2.556108218         3292
## [21,]  1.04615399  2.179289636         3119
## [22,] -0.47063002 -0.056273214          134
## [23,] -0.44851881 -0.063498943          135
## [24,] -0.52495990  0.050336435          160
## [25,] -0.16751657 -0.627511301           33
## [26,] -0.46609834  0.191566774          388
## [27,] -0.52387977  0.045761694          173
## [28,] -0.32594542 -0.272256876          117
## [29,] -0.34603033 -0.226487090          145
## [30,]  0.03704827 -0.796320543           77
colnames(subdata)[3]="numero.de.victimizacion"

modelo1 <- glm(formula = numero.de.victimizacion ~ ., family = "gaussian", 
               data = as.data.frame(subdata))
modelo1
## 
## Call:  glm(formula = numero.de.victimizacion ~ ., family = "gaussian", 
##     data = as.data.frame(subdata))
## 
## Coefficients:
## (Intercept)     victconf       cappol  
##       691.8       1018.4        821.9  
## 
## Degrees of Freedom: 660 Total (i.e. Null);  658 Residual
## Null Deviance:       1.137e+09 
## Residual Deviance: 5447000   AIC: 7844

Residios cercanos acero: Cumple este requisito

mean(modelo1$residuals)
## [1] -1.218486e-12

Puntos influyentes y heteroceasticidad:Se observa la formacion de un embudo en las graficas de la parte izquierda.Ademàs,en la derecha se observan distribuciones parece no haber un comportamiento normal

par(mfrow=c(2,2))
plot(modelo1)

Baja correlacion de las variables independientes con los residuos

corr <- rep(0,2)
bd <- subdata
for(i in 2){
  x1 <- modelo1$residuals
  y1 <- bd[,i]

  t<-cor.test(x1,y1)
  corr[i]<-t$p.value
}
corr
## [1] 0 1

Existe una correlcion entre los residuos con la primera latente

No multicolinealidad

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:usdm':
## 
##     vif
## The following object is masked from 'package:psych':
## 
##     logit
car::vif(modelo1)
## victconf   cappol 
##  1.00271  1.00271

No se presenta la multicolinealidad

Normalidad de los residuos(Shapiro-Wilk)

shapiro.test(modelo1$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  modelo1$residuals
## W = 0.74156, p-value < 2.2e-16
x <- modelo1$residuals 
h<-hist(x) 
xfit<-seq(min(x),max(x),length=40) 
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x)) 
yfit <- yfit*diff(h$mids[1:2])*length(x) 
lines(xfit, yfit, col="blue", lwd=2)

Se verifica graficamente no hay normalidad de los residuos

Creacio de una subdata y recodificacion de la variable dependiente para aplicar una regresion logistica ordinal

bd4=subdata
colnames(bd4)[3]<-"seguridad"
head(bd4)
##        victconf       cappol seguridad
## [1,] -1.1503552  5.604313092      4313
## [2,] -0.4701805 -0.008816447       213
## [3,] -1.0139233  2.925062745      2114
## [4,] -0.7004412  0.365144853       246
## [5,] -0.4077588 -0.158570772       123
## [6,] -0.1351782 -0.567812016       111
bd4 <- as.data.frame(bd4)

bp=c(-Inf,99,236,591,Inf)
names <-c(" Muy inseguro", "inseguro", "seguro","Muy seguro")
bd4$seguridadcod <- cut(bd4$seguridad ,breaks =bp,labels = names)
head(bd4) 
##     victconf       cappol seguridad seguridadcod
## 1 -1.1503552  5.604313092      4313   Muy seguro
## 2 -0.4701805 -0.008816447       213     inseguro
## 3 -1.0139233  2.925062745      2114   Muy seguro
## 4 -0.7004412  0.365144853       246       seguro
## 5 -0.4077588 -0.158570772       123     inseguro
## 6 -0.1351782 -0.567812016       111     inseguro
library("oglmx")
## Loading required package: maxLik
## Loading required package: miscTools
## 
## Please cite the 'maxLik' package as:
## Henningsen, Arne and Toomet, Ott (2011). maxLik: A package for maximum likelihood estimation in R. Computational Statistics 26(3), 443-458. DOI 10.1007/s00180-010-0217-1.
## 
## If you have questions, suggestions, or comments regarding the 'maxLik' package, please use a forum or 'tracker' at maxLik's R-Forge site:
## https://r-forge.r-project.org/projects/maxlik/
model2=ologit.reg(seguridadcod ~ .,
                  data=bd4)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a
## factor response will be ignored
summary(model2)
## Ordered Logit Regression 
## Log-Likelihood: -0.006012488 
## No. Iterations: 300 
## McFadden's R2: 0.9999934 
## AIC: 12.01202 
##           Estimate Std. error t value Pr(>|t|)
## victconf  -112.835   2340.435 -0.0482   0.9615
## cappol     -86.456   1813.638 -0.0477   0.9620
## seguridad   13.398     25.860  0.5181   0.6044
## ----- Threshold Parameters -----
##                                     Estimate Std. error t value Pr(>|t|)
## Threshold ( Muy inseguro->inseguro)   1398.8     2846.9  0.4913   0.6232
## Threshold (inseguro->seguro)          3226.8     6188.3  0.5214   0.6021
## Threshold (seguro->Muy seguro)        7938.6    15284.6  0.5194   0.6035

Latentes epxlican en un 99% la variabilidad de la variable dependiente,pero no presenta una buen ajuste.Esto lo podemos verificar con los odds-ratio.

odds=exp(coef(model2))
ci=exp(confint(model2))
cbind(odds,ci)
##                                             odds        2.5 %       97.5 %
## victconf                            9.919101e-50 0.000000e+00          Inf
## cappol                              2.836430e-38 0.000000e+00          Inf
## seguridad                           6.589956e+05 6.409823e-17 6.775152e+27
## Threshold ( Muy inseguro->inseguro)          Inf 0.000000e+00          Inf
## Threshold (inseguro->seguro)                 Inf 0.000000e+00          Inf
## Threshold (seguro->Muy seguro)               Inf 0.000000e+00          Inf