setwd("C:/Estadistica 2/Sesión 2")
rm(list = ls()) # limpiar el working environment
linkADrive='https://docs.google.com/spreadsheets/d/e/2PACX-1vS2ZSNM8BIZtoufVTO4Mw3ZmTWW1rAAtsGzFg0shHTJXX-3GmtLsgU-Nqkw5RzDgrNX31GTC9L7LnEz/pub?gid=0&single=true&output=csv'
estadoLocales=read.csv(linkADrive)
head(estadoLocales)
## DEPARTAMENTO UBIGEO buenEstado
## 1 AMAZONAS 10000 18.6
## 2 ÁNCASH 20000 13.9
## 3 APURÍMAC 30000 8.7
## 4 AREQUIPA 40000 27.4
## 5 AYACUCHO 50000 17.0
## 6 CAJAMARCA 60000 18.0
str(estadoLocales)
## 'data.frame': 25 obs. of 3 variables:
## $ DEPARTAMENTO: chr "AMAZONAS" "ÁNCASH" "APURÍMAC" "AREQUIPA" ...
## $ UBIGEO : int 10000 20000 30000 40000 50000 60000 70000 80000 90000 100000 ...
## $ buenEstado : num 18.6 13.9 8.7 27.4 17 18 33.8 11.9 10.1 15.6 ...
summary(estadoLocales$buenEstado)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.70 13.90 17.30 18.61 21.80 40.50
library(DescTools)
allStats=c(summary(estadoLocales$buenEstado),
sd=sd(estadoLocales$buenEstado),
skew=Skew(estadoLocales$buenEstado),
kurt=Kurt(estadoLocales$buenEstado),
cv=CoefVar(estadoLocales$buenEstado))
allStats
## Min. 1st Qu. Median Mean 3rd Qu. Max. sd
## 7.7000000 13.9000000 17.3000000 18.6120000 21.8000000 40.5000000 8.2596065
## skew kurt cv
## 0.9008641 0.2026850 0.4437786
library(ggplot2)
base=ggplot(data=estadoLocales,
aes(x=buenEstado))
histogram= base + geom_histogram(aes(y = after_stat(density)),
colour = 1, fill = "white",bins=10) +
stat_function(fun = dnorm,
args = list(mean = allStats['Mean'],
sd = allStats['sd']),col='red')
histogram

base=ggplot(data=estadoLocales,
aes(y=buenEstado))
boxplot=base + geom_boxplot()
boxplot

data_boxLocales=ggplot_build(boxplot)$data[[1]]
data_boxLocales
## ymin lower middle upper ymax outliers notchupper notchlower x flipped_aes
## 1 7.7 13.9 17.3 21.8 31.4 33.8, 40.5 19.7964 14.8036 0 FALSE
## PANEL group ymin_final ymax_final xmin xmax xid newx new_width weight
## 1 1 -1 7.7 40.5 -0.375 0.375 1 0 0.75 1
## colour fill alpha shape linetype linewidth
## 1 grey20 white NA 19 solid 0.5
data_boxLocales$outliers
## [[1]]
## [1] 33.8 40.5
data_boxLocales[c('ymin','ymax')]
## ymin ymax
## 1 7.7 31.4
linkPea="https://docs.google.com/spreadsheets/d/e/2PACX-1vS2ZSNM8BIZtoufVTO4Mw3ZmTWW1rAAtsGzFg0shHTJXX-3GmtLsgU-Nqkw5RzDgrNX31GTC9L7LnEz/pub?gid=1924082402&single=true&output=csv"
peaOcu=read.csv(linkPea)
head(peaOcu)
## UBIGEO peaOcupada
## 1 10000 130,019
## 2 20000 387,976
## 3 30000 140,341
## 4 40000 645,001
## 5 50000 235,857
## 6 60000 461,312
gsub(pattern = ",",replacement = "",peaOcu$peaOcupada)
## [1] "130019" "387976" "140341" "645001" "235857" "461312" "445072"
## [8] "496399" "111996" "253200" "369753" "512532" "691563" "459254"
## [15] "4536507" "300663" "64206" "82399" "97392" "665465" "454941"
## [22] "321613" "161903" "90571" "189783"
peaOcu$peaOcupada=gsub(pattern = ",",replacement = "",peaOcu$peaOcupada)
peaOcu$peaOcupada=as.numeric(peaOcu$peaOcupada)
# veamos
str(peaOcu)
## 'data.frame': 25 obs. of 2 variables:
## $ UBIGEO : int 10000 20000 30000 40000 50000 60000 70000 80000 90000 100000 ...
## $ peaOcupada: num 130019 387976 140341 645001 235857 ...
EstPea=merge(estadoLocales,peaOcu, by = "UBIGEO")
EstPea
## UBIGEO DEPARTAMENTO buenEstado peaOcupada
## 1 10000 AMAZONAS 18.6 130019
## 2 20000 ÁNCASH 13.9 387976
## 3 30000 APURÍMAC 8.7 140341
## 4 40000 AREQUIPA 27.4 645001
## 5 50000 AYACUCHO 17.0 235857
## 6 60000 CAJAMARCA 18.0 461312
## 7 70000 CALLAO 33.8 445072
## 8 80000 CUSCO 11.9 496399
## 9 90000 HUANCAVELICA 10.1 111996
## 10 100000 HUÁNUCO 15.6 253200
## 11 110000 ICA 28.3 369753
## 12 120000 JUNÍN 11.6 512532
## 13 130000 LA LIBERTAD 21.8 691563
## 14 140000 LAMBAYEQUE 19.4 459254
## 15 150000 LIMA 31.4 4536507
## 16 160000 LORETO 15.1 300663
## 17 170000 MADRE DE DIOS 17.3 64206
## 18 180000 MOQUEGUA 19.0 82399
## 19 190000 PASCO 7.8 97392
## 20 200000 PIURA 22.0 665465
## 21 210000 PUNO 7.7 454941
## 22 220000 SAN MARTÍN 16.2 321613
## 23 230000 TACNA 40.5 161903
## 24 240000 TUMBES 17.3 90571
## 25 250000 UCAYALI 14.9 189783
library(ggrepel)
base=ggplot(data=EstPea, aes(x=peaOcupada, y=buenEstado))
scatter = base + geom_point()
scatterText = scatter + geom_text_repel(aes(label=DEPARTAMENTO),size=2)
scatterText

f1=formula(~peaOcupada + buenEstado)
pearsonf1=cor.test(f1,data=EstPea)[c('estimate','p.value')]
pearsonf1
## $estimate
## cor
## 0.3567442
##
## $p.value
## [1] 0.08002776
spearmanf1=cor.test(f1,data=EstPea,method='spearman',exact=F)[c('estimate','p.value')]
spearmanf1
## $estimate
## rho
## 0.2785151
##
## $p.value
## [1] 0.1776146
linkPobla="https://docs.google.com/spreadsheets/d/e/2PACX-1vS2ZSNM8BIZtoufVTO4Mw3ZmTWW1rAAtsGzFg0shHTJXX-3GmtLsgU-Nqkw5RzDgrNX31GTC9L7LnEz/pub?gid=1758328391&single=true&output=csv"
poblas=read.csv(linkPobla)
head(poblas)
## UBIGEO TOTAL URBANO RURAL
## 1 10000 417365 205976 211389
## 2 20000 1139115 806065 333050
## 3 30000 424259 243354 180905
## 4 40000 1460433 1383694 76739
## 5 50000 650940 444473 206467
## 6 60000 1427527 567141 860386
EstPeaPob=merge(EstPea, poblas,by="UBIGEO")
head(EstPeaPob)
## UBIGEO DEPARTAMENTO buenEstado peaOcupada TOTAL URBANO RURAL
## 1 10000 AMAZONAS 18.6 130019 417365 205976 211389
## 2 20000 ÁNCASH 13.9 387976 1139115 806065 333050
## 3 30000 APURÍMAC 8.7 140341 424259 243354 180905
## 4 40000 AREQUIPA 27.4 645001 1460433 1383694 76739
## 5 50000 AYACUCHO 17.0 235857 650940 444473 206467
## 6 60000 CAJAMARCA 18.0 461312 1427527 567141 860386
library(magrittr)
EstPeaPob$populoso=ifelse(EstPeaPob$TOTAL>1000000,'Si','No')%>%as.factor()
# veamos conteo de cada categoría
table(EstPeaPob$populoso)
##
## No Si
## 14 11
base=ggplot(data=EstPeaPob, aes(x=populoso, y=buenEstado))
base + geom_boxplot(notch = T) + geom_jitter(color="black", size=0.4, alpha=0.9)
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?

f2=formula(buenEstado~populoso)
tablag= aggregate(f2, EstPeaPob,
FUN = function(x) {y <- shapiro.test(x); c(y$statistic, y$p.value)})
shapiroTest=as.data.frame(tablag[,2])
names(shapiroTest)=c("W","Prob")
shapiroTest
## W Prob
## 1 0.8126339 0.007182561
## 2 0.9583503 0.750951875
library(knitr)
library(magrittr)
library(kableExtra)
kable(cbind(tablag[1],shapiroTest))%>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
|
populoso
|
W
|
Prob
|
|
No
|
0.8126339
|
0.0071826
|
|
Si
|
0.9583503
|
0.7509519
|
(student_T=t.test(f2,data=EstPeaPob)[c('estimate','p.value')])
## $estimate
## mean in group No mean in group Si
## 17.6 19.9
##
## $p.value
## [1] 0.5025783
(Mann_Whitnery=wilcox.test(f2,data=EstPeaPob,exact=F)['p.value'])
## $p.value
## [1] 0.3960432
modelo1=formula(buenEstado~peaOcupada + populoso)
# la pea como porcentaje
EstPeaPob$peaOcu_pct=EstPeaPob$peaOcupada/EstPeaPob$TOTAL
modelo2=formula(buenEstado~peaOcu_pct + populoso)
regre2=lm(modelo2,data = EstPeaPob)
summary(regre2)
##
## Call:
## lm(formula = modelo2, data = EstPeaPob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.9294 -4.0314 -0.0228 4.8027 11.3151
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -26.3941 10.4069 -2.536 0.018814 *
## peaOcu_pct 119.8259 27.9708 4.284 0.000302 ***
## populosoSi 0.5927 2.5719 0.230 0.819861
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.306 on 22 degrees of freedom
## Multiple R-squared: 0.4657, Adjusted R-squared: 0.4171
## F-statistic: 9.586 on 2 and 22 DF, p-value: 0.001014