setwd("C:/Estadistica 2/Sesión 2")
rm(list = ls()) # limpiar el working environment

linkADrive='https://docs.google.com/spreadsheets/d/e/2PACX-1vS2ZSNM8BIZtoufVTO4Mw3ZmTWW1rAAtsGzFg0shHTJXX-3GmtLsgU-Nqkw5RzDgrNX31GTC9L7LnEz/pub?gid=0&single=true&output=csv'

estadoLocales=read.csv(linkADrive)

head(estadoLocales)
##   DEPARTAMENTO UBIGEO buenEstado
## 1     AMAZONAS  10000       18.6
## 2       ÁNCASH  20000       13.9
## 3     APURÍMAC  30000        8.7
## 4     AREQUIPA  40000       27.4
## 5     AYACUCHO  50000       17.0
## 6    CAJAMARCA  60000       18.0
str(estadoLocales)
## 'data.frame':    25 obs. of  3 variables:
##  $ DEPARTAMENTO: chr  "AMAZONAS" "ÁNCASH" "APURÍMAC" "AREQUIPA" ...
##  $ UBIGEO      : int  10000 20000 30000 40000 50000 60000 70000 80000 90000 100000 ...
##  $ buenEstado  : num  18.6 13.9 8.7 27.4 17 18 33.8 11.9 10.1 15.6 ...
summary(estadoLocales$buenEstado)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7.70   13.90   17.30   18.61   21.80   40.50
library(DescTools)

allStats=c(summary(estadoLocales$buenEstado),
  sd=sd(estadoLocales$buenEstado),
  skew=Skew(estadoLocales$buenEstado),
  kurt=Kurt(estadoLocales$buenEstado),
  cv=CoefVar(estadoLocales$buenEstado))
allStats
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max.         sd 
##  7.7000000 13.9000000 17.3000000 18.6120000 21.8000000 40.5000000  8.2596065 
##       skew       kurt         cv 
##  0.9008641  0.2026850  0.4437786
library(ggplot2)

base=ggplot(data=estadoLocales,
            aes(x=buenEstado))
histogram= base + geom_histogram(aes(y = after_stat(density)),
                 colour = 1, fill = "white",bins=10) +  
    stat_function(fun = dnorm,
                  args = list(mean = allStats['Mean'],
                              sd = allStats['sd']),col='red')
    
histogram

base=ggplot(data=estadoLocales,
            aes(y=buenEstado))
boxplot=base + geom_boxplot()

boxplot

data_boxLocales=ggplot_build(boxplot)$data[[1]]
data_boxLocales
##   ymin lower middle upper ymax   outliers notchupper notchlower x flipped_aes
## 1  7.7  13.9   17.3  21.8 31.4 33.8, 40.5    19.7964    14.8036 0       FALSE
##   PANEL group ymin_final ymax_final   xmin  xmax xid newx new_width weight
## 1     1    -1        7.7       40.5 -0.375 0.375   1    0      0.75      1
##   colour  fill alpha shape linetype linewidth
## 1 grey20 white    NA    19    solid       0.5
data_boxLocales$outliers
## [[1]]
## [1] 33.8 40.5
data_boxLocales[c('ymin','ymax')]
##   ymin ymax
## 1  7.7 31.4
linkPea="https://docs.google.com/spreadsheets/d/e/2PACX-1vS2ZSNM8BIZtoufVTO4Mw3ZmTWW1rAAtsGzFg0shHTJXX-3GmtLsgU-Nqkw5RzDgrNX31GTC9L7LnEz/pub?gid=1924082402&single=true&output=csv"
peaOcu=read.csv(linkPea)
head(peaOcu)
##   UBIGEO peaOcupada
## 1  10000    130,019
## 2  20000    387,976
## 3  30000    140,341
## 4  40000    645,001
## 5  50000    235,857
## 6  60000    461,312
gsub(pattern = ",",replacement = "",peaOcu$peaOcupada)
##  [1] "130019"  "387976"  "140341"  "645001"  "235857"  "461312"  "445072" 
##  [8] "496399"  "111996"  "253200"  "369753"  "512532"  "691563"  "459254" 
## [15] "4536507" "300663"  "64206"   "82399"   "97392"   "665465"  "454941" 
## [22] "321613"  "161903"  "90571"   "189783"
peaOcu$peaOcupada=gsub(pattern = ",",replacement = "",peaOcu$peaOcupada)
peaOcu$peaOcupada=as.numeric(peaOcu$peaOcupada)

# veamos
str(peaOcu)
## 'data.frame':    25 obs. of  2 variables:
##  $ UBIGEO    : int  10000 20000 30000 40000 50000 60000 70000 80000 90000 100000 ...
##  $ peaOcupada: num  130019 387976 140341 645001 235857 ...
EstPea=merge(estadoLocales,peaOcu, by = "UBIGEO")
EstPea
##    UBIGEO  DEPARTAMENTO buenEstado peaOcupada
## 1   10000      AMAZONAS       18.6     130019
## 2   20000        ÁNCASH       13.9     387976
## 3   30000      APURÍMAC        8.7     140341
## 4   40000      AREQUIPA       27.4     645001
## 5   50000      AYACUCHO       17.0     235857
## 6   60000     CAJAMARCA       18.0     461312
## 7   70000        CALLAO       33.8     445072
## 8   80000         CUSCO       11.9     496399
## 9   90000  HUANCAVELICA       10.1     111996
## 10 100000       HUÁNUCO       15.6     253200
## 11 110000           ICA       28.3     369753
## 12 120000         JUNÍN       11.6     512532
## 13 130000   LA LIBERTAD       21.8     691563
## 14 140000    LAMBAYEQUE       19.4     459254
## 15 150000          LIMA       31.4    4536507
## 16 160000        LORETO       15.1     300663
## 17 170000 MADRE DE DIOS       17.3      64206
## 18 180000      MOQUEGUA       19.0      82399
## 19 190000         PASCO        7.8      97392
## 20 200000         PIURA       22.0     665465
## 21 210000          PUNO        7.7     454941
## 22 220000    SAN MARTÍN       16.2     321613
## 23 230000         TACNA       40.5     161903
## 24 240000        TUMBES       17.3      90571
## 25 250000       UCAYALI       14.9     189783
library(ggrepel)
base=ggplot(data=EstPea, aes(x=peaOcupada, y=buenEstado))
scatter = base + geom_point()
scatterText = scatter + geom_text_repel(aes(label=DEPARTAMENTO),size=2)
scatterText

f1=formula(~peaOcupada + buenEstado)
pearsonf1=cor.test(f1,data=EstPea)[c('estimate','p.value')]
pearsonf1
## $estimate
##       cor 
## 0.3567442 
## 
## $p.value
## [1] 0.08002776
spearmanf1=cor.test(f1,data=EstPea,method='spearman',exact=F)[c('estimate','p.value')]
spearmanf1
## $estimate
##       rho 
## 0.2785151 
## 
## $p.value
## [1] 0.1776146
linkPobla="https://docs.google.com/spreadsheets/d/e/2PACX-1vS2ZSNM8BIZtoufVTO4Mw3ZmTWW1rAAtsGzFg0shHTJXX-3GmtLsgU-Nqkw5RzDgrNX31GTC9L7LnEz/pub?gid=1758328391&single=true&output=csv"
poblas=read.csv(linkPobla)
head(poblas)
##   UBIGEO   TOTAL  URBANO  RURAL
## 1  10000  417365  205976 211389
## 2  20000 1139115  806065 333050
## 3  30000  424259  243354 180905
## 4  40000 1460433 1383694  76739
## 5  50000  650940  444473 206467
## 6  60000 1427527  567141 860386
EstPeaPob=merge(EstPea, poblas,by="UBIGEO")
head(EstPeaPob)
##   UBIGEO DEPARTAMENTO buenEstado peaOcupada   TOTAL  URBANO  RURAL
## 1  10000     AMAZONAS       18.6     130019  417365  205976 211389
## 2  20000       ÁNCASH       13.9     387976 1139115  806065 333050
## 3  30000     APURÍMAC        8.7     140341  424259  243354 180905
## 4  40000     AREQUIPA       27.4     645001 1460433 1383694  76739
## 5  50000     AYACUCHO       17.0     235857  650940  444473 206467
## 6  60000    CAJAMARCA       18.0     461312 1427527  567141 860386
library(magrittr)
EstPeaPob$populoso=ifelse(EstPeaPob$TOTAL>1000000,'Si','No')%>%as.factor()

# veamos conteo de cada categoría
table(EstPeaPob$populoso)
## 
## No Si 
## 14 11
base=ggplot(data=EstPeaPob, aes(x=populoso, y=buenEstado))
base + geom_boxplot(notch = T) +  geom_jitter(color="black", size=0.4, alpha=0.9)
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?

f2=formula(buenEstado~populoso)


tablag= aggregate(f2, EstPeaPob,
          FUN = function(x) {y <- shapiro.test(x); c(y$statistic, y$p.value)})




shapiroTest=as.data.frame(tablag[,2])
names(shapiroTest)=c("W","Prob")
shapiroTest
##           W        Prob
## 1 0.8126339 0.007182561
## 2 0.9583503 0.750951875
library(knitr)
library(magrittr)
library(kableExtra)
kable(cbind(tablag[1],shapiroTest))%>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
populoso W Prob
No 0.8126339 0.0071826
Si 0.9583503 0.7509519
(student_T=t.test(f2,data=EstPeaPob)[c('estimate','p.value')])
## $estimate
## mean in group No mean in group Si 
##             17.6             19.9 
## 
## $p.value
## [1] 0.5025783
(Mann_Whitnery=wilcox.test(f2,data=EstPeaPob,exact=F)['p.value'])
## $p.value
## [1] 0.3960432
modelo1=formula(buenEstado~peaOcupada + populoso)
# la pea como porcentaje
EstPeaPob$peaOcu_pct=EstPeaPob$peaOcupada/EstPeaPob$TOTAL


modelo2=formula(buenEstado~peaOcu_pct + populoso)
regre2=lm(modelo2,data = EstPeaPob)
summary(regre2)
## 
## Call:
## lm(formula = modelo2, data = EstPeaPob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.9294  -4.0314  -0.0228   4.8027  11.3151 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -26.3941    10.4069  -2.536 0.018814 *  
## peaOcu_pct  119.8259    27.9708   4.284 0.000302 ***
## populosoSi    0.5927     2.5719   0.230 0.819861    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.306 on 22 degrees of freedom
## Multiple R-squared:  0.4657, Adjusted R-squared:  0.4171 
## F-statistic: 9.586 on 2 and 22 DF,  p-value: 0.001014