library(readxl)
# Import the department-level table from the Excel workbook in the working
# directory.
data <- read_excel(path = "dataPeru.xlsx")
## PREGUNTA 1: Al querer probar la hipótesis de que el buen estado de los locales escolares depende del porcentaje de la población que contribuye a la SUNAT y del porcentaje de la PEA que está laborando, se llega a comprobar que (con una significancia de 0.05):
# Quick look at the imported table: column names first, then the type and
# leading values of every column.
names(data)
## [1] "DEPARTAMENTO" "UBIGEO" "buenEstado"
## [4] "contribuyentesSunat" "peaOcupada" "pobUrbana"
## [7] "PobRural" "pobTotal"
str(data)
## tibble [25 × 8] (S3: tbl_df/tbl/data.frame)
## $ DEPARTAMENTO : chr [1:25] "AMAZONAS" "ÁNCASH" "APURÍMAC" "AREQUIPA" ...
## $ UBIGEO : chr [1:25] "010000" "020000" "030000" "040000" ...
## $ buenEstado : num [1:25] 18.6 13.9 8.7 27.4 17 18 33.8 11.9 10.1 15.6 ...
## $ contribuyentesSunat: num [1:25] 75035 302906 103981 585628 151191 ...
## $ peaOcupada : num [1:25] 130019 387976 140341 645001 235857 ...
## $ pobUrbana : num [1:25] 205976 806065 243354 1383694 444473 ...
## $ PobRural : num [1:25] 211389 333050 180905 76739 206467 ...
## $ pobTotal : num [1:25] 417365 1139115 424259 1460433 650940 ...
# Response: logit of the share of school buildings in good condition.
# buenEstado is treated as a percentage on a 0-100 scale, so the odds are
# p / (100 - p).
data$buenEstado_logit <- log(data$buenEstado / (100 - data$buenEstado))

# Predictors: taxpayers and employed labour force as percentages of the total
# population. Inside a model formula `/` is the nesting operator
# (a/b expands to a + a:b), so the arithmetic has to be wrapped in I() to be
# evaluated as an actual division.
# NOTE(review): the printed summary that follows this call in the file was
# produced by the earlier un-wrapped formula (it shows the a:b interaction
# terms); re-run the script to refresh it and re-check the written answer.
# NOTE(review): pobTotal is used as the denominator for both shares, as in the
# original formula; confirm that is the intended base for the PEA percentage.
reg1 <- lm(
  buenEstado_logit ~ I(100 * contribuyentesSunat / pobTotal) +
    I(100 * peaOcupada / pobTotal),
  data = data
)
summary(reg1)
##
## Call:
## lm(formula = buenEstado_logit ~ contribuyentesSunat/pobTotal +
## peaOcupada/pobTotal, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7962 -0.2235 0.0577 0.2200 1.0849
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.658e+00 2.652e-01 -6.251 4.19e-06 ***
## contribuyentesSunat 7.476e-06 4.025e-06 1.858 0.078 .
## peaOcupada -6.395e-06 4.447e-06 -1.438 0.166
## contribuyentesSunat:pobTotal -2.579e-12 2.452e-12 -1.052 0.305
## pobTotal:peaOcupada 2.635e-12 2.680e-12 0.983 0.337
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4939 on 20 degrees of freedom
## Multiple R-squared: 0.3022, Adjusted R-squared: 0.1627
## F-statistic: 2.166 on 4 and 20 DF, p-value: 0.1101
# Otra posible opción: regresión lineal sobre las variables sin transformar.
# Alternative specification: ordinary least squares on the untransformed
# buenEstado value, with both predictors entered exactly as stored in the
# table.
# NOTE(review): the predictors here are raw head counts rather than shares of
# the population; confirm this is the comparison that was intended.
reg1.2 <- lm(
  formula = buenEstado ~ contribuyentesSunat + peaOcupada,
  data = data
)
summary(reg1.2)
##
## Call:
## lm(formula = buenEstado ~ contribuyentesSunat + peaOcupada, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.589 -3.966 -1.347 1.907 21.518
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.865e+01 2.694e+00 6.922 5.98e-07 ***
## contribuyentesSunat 1.786e-05 2.060e-05 0.867 0.395
## peaOcupada -1.596e-05 2.241e-05 -0.712 0.484
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.925 on 22 degrees of freedom
## Multiple R-squared: 0.1561, Adjusted R-squared: 0.07939
## F-statistic: 2.035 on 2 and 22 DF, p-value: 0.1546
# RPTA: Ninguna de las variables influye significativamente (al 0.05) en el porcentaje de locales escolares en buen estado.
## PREGUNTA 2: Ahora, probar la hipótesis de que la cantidad de PEA ocupada depende de la cantidad de contribuyentes a la SUNAT y del porcentaje de locales escolares en buen estado.
# Poisson regression of the employed labour-force count on the number of
# taxpayers and the good-condition school percentage.
regpoisson <- glm(
  formula = peaOcupada ~ contribuyentesSunat + buenEstado,
  family = poisson,
  data = data
)
summary(regpoisson)
##
## Call:
## glm(formula = peaOcupada ~ contribuyentesSunat + buenEstado,
## family = poisson, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -495.36 -329.75 -4.22 233.32 467.86
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.238e+01 9.007e-04 13744.7 <2e-16 ***
## contribuyentesSunat 5.575e-07 1.786e-10 3121.7 <2e-16 ***
## buenEstado 7.924e-03 4.530e-05 174.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 16555865 on 24 degrees of freedom
## Residual deviance: 2148535 on 22 degrees of freedom
## AIC: 2148901
##
## Number of Fisher Scoring iterations: 4
library(AER)
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
# Test the Poisson fit for overdispersion; the result decides whether the
# Poisson model can be kept.
dispersiontest(object = regpoisson)
##
## Overdispersion test
##
## data: regpoisson
## z = 5.4114, p-value = 3.128e-08
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 80162.34
library(MASS)
# Negative binomial regression with the same response and predictors as the
# Poisson fit.
reg2 <- glm.nb(
  formula = peaOcupada ~ contribuyentesSunat + buenEstado,
  data = data
)
summary(reg2)
##
## Call:
## glm.nb(formula = peaOcupada ~ contribuyentesSunat + buenEstado,
## data = data, init.theta = 3.198717665, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.04373 -1.15744 0.08482 0.78672 1.29108
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.230e+01 2.841e-01 43.293 < 2e-16 ***
## contribuyentesSunat 6.633e-07 1.306e-07 5.079 3.8e-07 ***
## buenEstado 8.246e-03 1.487e-02 0.554 0.579
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(3.1987) family taken to be 1)
##
## Null deviance: 84.727 on 24 degrees of freedom
## Residual deviance: 26.290 on 22 degrees of freedom
## AIC: 681.31
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 3.199
## Std. Err.: 0.862
##
## 2 x log-likelihood: -673.314
# RPTA: Solo la variable contribuyentesSunat tiene un efecto significativo (al 0.05) sobre la PEA ocupada; buenEstado no es significativa.
##SEGUNDA PARTE (2Puntos)
library(readr)
# Import the Lima CSV export as-is; its real header is not on the first line,
# so the columns come in with placeholder names.
dataLima <- read_csv(file = "Lima.xlsx - data.csv")
## New names:
## Rows: 60 Columns: 3
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): TASA DE DENUNCIAS POR COMISION DE DELITOS,SEGUN DISTRITO, ...2, ...3
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
## • `` -> `...3`
# Tidy the raw Lima sheet:
#   * the fourth imported row holds the real column titles,
#   * the second column is discarded,
#   * the last 13 rows are discarded (NOTE(review): presumably footnotes or
#     totals — confirm against the source file).
# The positions below are tied to the current layout of the export.
lima_header_row <- 4L
lima_footer_rows <- 13L

# Capture the titles before the header row itself is removed.
lima_titles <- as.character(dataLima[lima_header_row, -2])

# Keep only the rows below the header and drop the unused second column.
dataLima <- dataLima[-seq_len(lima_header_row), -2]
colnames(dataLima) <- lima_titles

# head() with a negative n drops the trailing rows and, unlike
# 1:(nrow(x) - k), yields zero rows instead of counting backwards when fewer
# than k + 1 rows remain.
dataLima <- head(dataLima, -lima_footer_rows)
# Import the municipal-waste CSV export; the sheet title occupies the header
# line, so the real column titles arrive as the first data row.
dataresiduos <- read_csv(file = "residuosPeru.xlsx - Sheet 1.csv")
## New names:
## Rows: 14979 Columns: 15
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (15): Residuos municipales generados anualmente, ...2, ...3, ...4, ...5,...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
# Promote the first imported row to column names and remove it from the data.
residuos_titles <- as.character(dataresiduos[1, ])
dataresiduos <- dataresiduos[-1, ]
colnames(dataresiduos) <- residuos_titles
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Give the waste table the key name "Distrito" that the merge below joins on.
dataresiduos <- rename(dataresiduos, Distrito = DISTRITO)

# Label the second Lima column.
names(dataLima)[2] <- "tasadenuncias"

# Join the two tables on the district name; with merge() defaults only
# districts present in both tables are kept.
# NOTE(review): the key is matched as an exact string — verify that both
# sources spell district names the same way (case, accents).
datamergee <- merge(x = dataresiduos, y = dataLima, by = "Distrito")