library(ggplot2)
library(readxl)  # read_excel
library(tidyverse)  # %>%
library(dplyr)
library(flextable)
library(writexl)  # write_xlsx


# Gráficos

library(GGally)          # ggpairs
library(ggcorrplot)      # cor_pmat, ggMarginal


# Mapas

library(sf)           # read_sf st_crs
library(classInt)     # classIntervals
library(ggspatial)    # annotation_north_arrow
library(wesanderson)  # wes_palette
library(DT)

Análisis descriptivo

ITER_GRO <- read_excel("C:/Documetos MGM_OS_Dell/Documentos Maria_Dell/Maestria_MCDySI/Docencia_MCDySI/Modelos_matematicos/Prácticas_Rstudio/ITER_GRO.xlsx")

View(ITER_GRO )

#colnames(ITER_GRO)  # Nombre de las columnas del data.frame
dat<-ITER_GRO%>%
     dplyr::select(mun, nom_mun,
                   loc, nom_loc,
                   p15ym_an,    # pob. 15 años y mas analfabetas
                   vph_aguafv,  # v sin agua en casa
                   vph_s_elec   # v sin luz
                   )%>%
    filter(mun!=0,loc!=0)%>%
    mutate(   p15ym_an=as.numeric(p15ym_an),
            vph_aguafv=as.numeric(vph_aguafv),
            vph_s_elec=as.numeric(vph_s_elec))%>%
    group_by(mun, nom_mun)%>%
    dplyr::summarise( t.m_p15ym_an=sum(p15ym_an,  na.rm = T), # na.rm = T, quitar NA
                      t.m_vph_aguafv=sum(vph_aguafv, na.rm = T),
                      t.m_vph_s_elec=sum(vph_s_elec, na.rm = T)
                     )%>%
     rename(CVE_MUN=mun)

  
View(dat)


datatable(dat)

#flextable(dat)%>%
 # autofit()%>%
  #set_caption("Totales por municipios, de tres variables")%>%
#  set_header_labels(CVE_MUN="Cve mun",
 #                   nom_mun="Nom. mun",
#                    t.m_p15ym_an="Tot. pbo analfabeta",
 #                   t.m_vph_aguafv="Tot. casas sin agua",
#                    t.m_vph_s_elec="Tot. casas sin luz" )

#head(dat)

hist(dat$t.m_p15ym_an)

hist(dat$t.m_vph_aguafv)

hist(dat$t.m_vph_s_elec)

ggpairs(dat[,3:5])+
theme_bw()

\(H_0\): \(\rho=0\) vs \(H_1\): \(\rho\neq 0\)

Se rechaza \(H_0\) si el p-valor es menor que un nivel de significancia dado por el investigador:

Niveles de significancia, es decir, valores que toma \(\alpha\):

signo *** [0, 0.001]
signo ** (0.001, 0.01]
signo * (0.01, 0.05]
signo . (0.05, 0.1]
signo (0.1, 1]

#install.packages("ggcorrplot")

cor_dat<-round(cor(dat[,3:5]),3)


p_mat <- cor_pmat(dat[,3:5]) # Cálculo de los p-valores


ggcorrplot(cor_dat, 
           hc.order = TRUE,
           digits = 3,
           type = "lower",
           lab = T,
           insig = "pch",
           pch.cex = 5 ,
           p.mat = p_mat,
           sig.level = 0.001, # Los valores no significativos a este nivel coloca *
           legend.title = "Correlación",
           outline.col = "white", 
           lab_size = 4.5)

Matrices de datos

Dadas \(X_{1},...,X_{p}\), \(p\) variables aleatorias, con \(n\) datos cada variable, se tiene la siguiente matriz de datos:

\[ \mathbf{X}=\left( \begin{array}{cccc} x_{11} & x_{12} & \cdots & x_{1p} \\ x_{21} & x_{22} & \cdots & x_{2p} \\ \vdots & \vdots & \ddots & \vdots \\ x_{n1} & x_{n2} & \cdots & x_{np} \end{array} \right) \]

De esta matriz \(\mathbf{X}\), se calculas el vector de promedios, la matriz de covarianzas y la matriz de correlaciones.

Vector de promedios

Vector de promedios \[ \bar{\mathbf{x}}=(\bar{x}_{1},\bar{x}_{2},...,\bar{x}_{p}) \]

dat_mean<-apply(dat[,3:5],2,mean)
dat_mean

##   t.m_p15ym_an t.m_vph_aguafv t.m_vph_s_elec 
##      4621.3210      3752.0370       415.5062

}

Matriz de covarianzas:

Dado \(X_1\) y \(X_2\) v.a definidas en el mismo espacio de probabilidad con media \(\mu _{X_1}\) y \(\mu_{X_2}\) respectivamente. Entonces la covarianza de \(X_1\) y \(X_2\) esta dada por \[ Cov(X_1,X_2)=E[(X_1-\mu _{X_1})(X_2-\mu _{X_2})]. \] Observaciones:

La covarianza mide la relación lineal que existe entre\(X_1\) y \(X_2\)
La función de covarianza depende de la escala de \(X_1\) y \(X_2\)
\(Cov(X_1,X_2)=Cov(X_2,X_1)\)
Como la función de covarianza depende de la escala de las v.a y toma valores positivos y negativos, entonces \[ -\infty <Cov(X_1,X_2)<\infty \]
Si \(Cov(X_1,X_2)>0\), entonces \(X_1-\mu _{X_1}>0\) y \(X_2-\mu _{X_2}>0\).
Si \(Cov(X_1,X_2)<0\), entonces \(X_1-\mu _{X_1}<0\) '{o} \(X_2-\mu _{X_2}<0\).
Si \(X_2=X_1\) entonces

\[ Cov(X_1,X_1)=V(X_1) \]

Un estimador muestral de la covarianza entre dos variables aleatorias, está dado por

\[ s_{12}=\frac{1}{n}\sum_{j=1}^n(x_{j1}-\bar{x}_{1})(x_{j2}-\bar{x}_{2}) \] Dadas \(p\) variables aleatorias, se tiene que \[ s_{ik}=\frac{1}{n}\sum_{j=1}^n(x_{ji}-\bar{x}_{i})(x_{jk}-\bar{x}_{k}) \] donde \(i=1,2,...,p\), y \(k=1,2,3,...,p\).

\[ S=\left( \begin{array}{cccc} s_{11} & s_{12} & \cdots & s_{1p} \\ s_{21} & s_{22} & \cdots & s_{2p} \\ \vdots & \vdots & \ddots & \vdots \\ s_{p1} & s_{p1} & \cdots & s_{pp} \end{array} \right) \]

Matriz de correlaciones

El coeficiente de correlación entre dos variables aleatorias \(X\) y \(Y\), está dado por \[ \rho _{X,Y}=\frac{Cov(X,Y)}{\sigma _{X}\sigma _{Y}} \] donde \(\sigma _{X}=\sqrt{\sigma _{X}^{2}}\) y \(\sigma _{Y}=\sqrt{\sigma_{Y}^{2}}\), donde \(V(X)=\sigma _{X}^{2}\) y \(V(Y)=\sigma _{Y}^{2}\).

Observaciones:

El coeficiente de correlación mide la relación lineal que existe entre \(X_1\) y \(X_2\).
Si \(X_1\) y \(X_2\) son independientes, entonces \[ \rho _{12}=0 \]
Si \(X_1\) y \(X_2\) son v.a con distribución normal y \(\rho _{12}=0\), entonces \(X_1\) y \(X_2\) son independientes.
El coeficiente de correlación está entre: \(-1\leq\rho_{12}\leq 1\).
El coeficiente de correlación no depende de la escala de \(X_1\) y \(X_2\).
El \(\rho _{12}\) es invariante bajo transformaciones de escala y localidad.
A mayor asociación lineal entre las v.a \(X_1\) y \(X_2\), más cercano estará \(\rho _{12}\) de \(-1\) o \(1\).
Este coeficiente de correlación se utiliza para v.a cuantitativas, es decir, las v.a discretas y continuas.

Un estimador muestral del coeficiente de correlación, \(\rho_{12}\) está dado por

\[ r_{12}=\frac{s_{12}}{\sqrt{s_{11}}\sqrt{s_{22}}}=\frac{\sum_{j=1}^n(x_{j1}-\bar{x}_{1})(x_{j2}-\bar{x}_{2})}{\sqrt{\sum_{j=1}^n(x_{j1}-\bar{x}_{1})^2}\sqrt{\sum_{j=1}^n(x_{j2}-\bar{x}_{2})^2}} \] \(j=1,...,n\) es el número de observaciones.

De manera general, se tiene

\[ r_{ik}=\frac{s_{ik}}{\sqrt{s_{ii}}\sqrt{s_{kk}}}=\frac{\sum_{j=1}^n(x_{ji}-\bar{x}_{i})(x_{jk}-\bar{x}_{k})}{\sqrt{\sum_{j=1}^n(x_{ji}-\bar{x}_{i})^2}\sqrt{\sum_{j=1}^n(x_{jk}-\bar{x}_{k})^2}} \]

donde \(i,k=1,2,...,p\).

Estos elementos pueden ser acomodados en una matriz de correlaciones.

\[ R=\left( \begin{array}{cccc} r_{11} & r_{12} & \cdots & r_{1p} \\ r_{21} & r_{22} & \cdots & r_{2p} \\ \vdots & \vdots & \ddots & \vdots \\ r_{p1} & r_{p1} & \cdots & r_{pp} \end{array} \right) \] # Representación de la información

shp_guerrero<- sf::st_read("C:/Documetos MGM_OS_Dell/Documentos Maria_Dell/Maestria_MCDySI/Docencia_MCDySI/Modelos_matematicos/Prácticas_Rstudio/12_guerrero/conjunto_de_datos/12mun.shp",
                         options = "ENCODING=WINDOWS-1252"
                          # encoding = "UTF-8"
                           )

## options:        ENCODING=WINDOWS-1252 
## Reading layer `12mun' from data source 
##   `C:\Documetos MGM_OS_Dell\Documentos Maria_Dell\Maestria_MCDySI\Docencia_MCDySI\Modelos_matematicos\Prácticas_Rstudio\12_guerrero\conjunto_de_datos\12mun.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 81 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 2480497 ymin: 485730.8 xmax: 2925387 ymax: 766619.5
## Projected CRS: MEXICO_ITRF_2008_LCC

shp_guerrero$CVE_MUN<-as.numeric(shp_guerrero$CVE_MUN)

dat_pob<-ITER_GRO%>%
         dplyr::select(mun, nom_mun,
                       loc, nom_loc,
                       p15ym_an,    # pob. 15 años y más analfabetas
                       p_15ymas     # pob de 15 años y más
                      )%>%
         filter(mun!=0,loc!=0)%>%
         mutate(p15ym_an=as.numeric(p15ym_an),
                p_15ymas=as.numeric(p_15ymas))%>%
         group_by(mun, nom_mun)%>%
  dplyr::summarise(t.m_p15ym_an=sum(p15ym_an,na.rm = T), # na.rm = T, quitar NA
                        t.m_p15ym=sum(p_15ymas,na.rm = T)  # Pob de 15 años y más
                      )%>%
        mutate(Frec_p15ym.an=round((t.m_p15ym_an/t.m_p15ym)*100,2))%>%
        rename(CVE_MUN=mun)


View(dat_pob)

datatable(dat_pob)

#flextable(dat_pob)%>%
#  autofit()

dat_shp<-merge(shp_guerrero, dat_pob, by="CVE_MUN")

ggplot() +
      geom_sf(data =dat_shp,  color="black", aes(fill=Frec_p15ym.an)) +
      coord_sf(crs = 4326, # maya cuadrada
               label_graticule="NW",
               label_axes = list(right="N", bottom="E"))+ # coloca ejer derecho y superior
      theme_bw()+    # Cambia el tema
      scale_fill_gradient(low = "blue", high = "red") +
      annotation_north_arrow(location='tr',  # br  Coloca el norte
                             height = unit(1, "cm"),
                             width = unit(1, "cm"))+
      annotation_scale(location = "bl" )+  # Barra
      labs(x= " ",y=" ", fill="% de p15ym_an")

Referencias:

https://rpubs.com/RH_Rogelio/847557

Matrices y vectores aleatorios

MGM

Análisis descriptivo

Matrices de datos

Vector de promedios

Matriz de covarianzas:

Matriz de correlaciones