library(readxl)
# Load the department-level indicators from the Excel workbook.
data <- read_excel(path = "dataPeru.xlsx")

##PREGUNTA 1 Al querer probar la hipótesis de que el buen estado de los locales escolares depende del porcentaje de la población que contribuye a la SUNAT y del porcentaje de la PEA que está laborando, se llega a comprobar que (con una significancia del 0.05):

# Quick structural check of the loaded table: column names first, then the
# type and a preview of every column.
names(data)
## [1] "DEPARTAMENTO"        "UBIGEO"              "buenEstado"         
## [4] "contribuyentesSunat" "peaOcupada"          "pobUrbana"          
## [7] "PobRural"            "pobTotal"
str(data)
## tibble [25 × 8] (S3: tbl_df/tbl/data.frame)
##  $ DEPARTAMENTO       : chr [1:25] "AMAZONAS" "ÁNCASH" "APURÍMAC" "AREQUIPA" ...
##  $ UBIGEO             : chr [1:25] "010000" "020000" "030000" "040000" ...
##  $ buenEstado         : num [1:25] 18.6 13.9 8.7 27.4 17 18 33.8 11.9 10.1 15.6 ...
##  $ contribuyentesSunat: num [1:25] 75035 302906 103981 585628 151191 ...
##  $ peaOcupada         : num [1:25] 130019 387976 140341 645001 235857 ...
##  $ pobUrbana          : num [1:25] 205976 806065 243354 1383694 444473 ...
##  $ PobRural           : num [1:25] 211389 333050 180905 76739 206467 ...
##  $ pobTotal           : num [1:25] 417365 1139115 424259 1460433 650940 ...
# Logit-transform the response: buenEstado is on a 0-100 scale, so it is mapped
# to log-odds before fitting a linear model.
data$buenEstado_logit <- log(data$buenEstado / (100 - data$buenEstado))

# Linear model of the logit response on two population shares.
#
# Inside a model formula `a / b` is the nesting operator (it expands to
# `a + a:b`), NOT arithmetic division. Written bare, the model is fitted on the
# raw counts plus count-by-population interactions instead of on the shares.
# Wrapping each ratio in I() makes R evaluate it arithmetically, so each
# coefficient is the change in log-odds per unit change in the share.
#
# NOTE: any summary output pasted from the un-wrapped formula (terms such as
# `contribuyentesSunat:pobTotal`) no longer applies and must be regenerated.
reg1 <- lm(
  buenEstado_logit ~ I(contribuyentesSunat / pobTotal) +
    I(peaOcupada / pobTotal),
  data = data
)
summary(reg1)
## 
## Call:
## lm(formula = buenEstado_logit ~ contribuyentesSunat/pobTotal + 
##     peaOcupada/pobTotal, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7962 -0.2235  0.0577  0.2200  1.0849 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -1.658e+00  2.652e-01  -6.251 4.19e-06 ***
## contribuyentesSunat           7.476e-06  4.025e-06   1.858    0.078 .  
## peaOcupada                   -6.395e-06  4.447e-06  -1.438    0.166    
## contribuyentesSunat:pobTotal -2.579e-12  2.452e-12  -1.052    0.305    
## pobTotal:peaOcupada           2.635e-12  2.680e-12   0.983    0.337    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4939 on 20 degrees of freedom
## Multiple R-squared:  0.3022, Adjusted R-squared:  0.1627 
## F-statistic: 2.166 on 4 and 20 DF,  p-value: 0.1101

Otra posible opción: regresión lineal con la variable dependiente y las explicativas sin transformar.

# Alternative specification: ordinary linear regression of the untransformed
# buenEstado on the two untransformed regressors.
# NOTE(review): contribuyentesSunat and peaOcupada enter here as raw counts,
# not as shares of pobTotal — confirm this matches the hypothesis being tested.
reg1.2 <- lm(buenEstado ~ contribuyentesSunat + peaOcupada, data = data)
summary(reg1.2)
## 
## Call:
## lm(formula = buenEstado ~ contribuyentesSunat + peaOcupada, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.589  -3.966  -1.347   1.907  21.518 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.865e+01  2.694e+00   6.922 5.98e-07 ***
## contribuyentesSunat  1.786e-05  2.060e-05   0.867    0.395    
## peaOcupada          -1.596e-05  2.241e-05  -0.712    0.484    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.925 on 22 degrees of freedom
## Multiple R-squared:  0.1561, Adjusted R-squared:  0.07939 
## F-statistic: 2.035 on 2 and 22 DF,  p-value: 0.1546

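RPTA: Con una significancia de 0.05, ninguna de las variables explicativas tiene un efecto estadísticamente significativo sobre el porcentaje de locales escolares en buen estado (todos los p-valores de los coeficientes son mayores a 0.05).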

#PREGUNTA 2: Ahora: probar la hipótesis de que la cantidad de PEA ocupada depende de la cantidad de contribuyentes a la SUNAT y del porcentaje de locales escolares en buen estado

# Poisson regression of the employed-population count on the number of SUNAT
# contributors and the buenEstado indicator.
regpoisson <- glm(
  formula = peaOcupada ~ contribuyentesSunat + buenEstado,
  family = poisson,
  data = data
)
summary(regpoisson)
## 
## Call:
## glm(formula = peaOcupada ~ contribuyentesSunat + buenEstado, 
##     family = poisson, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -495.36  -329.75    -4.22   233.32   467.86  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.238e+01  9.007e-04 13744.7   <2e-16 ***
## contribuyentesSunat 5.575e-07  1.786e-10  3121.7   <2e-16 ***
## buenEstado          7.924e-03  4.530e-05   174.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 16555865  on 24  degrees of freedom
## Residual deviance:  2148535  on 22  degrees of freedom
## AIC: 2148901
## 
## Number of Fisher Scoring iterations: 4
library(AER)
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
# Test the Poisson fit for overdispersion (alternative: dispersion > 1, which
# is also the function's default).
dispersiontest(regpoisson, alternative = "greater")
## 
##  Overdispersion test
## 
## data:  regpoisson
## z = 5.4114, p-value = 3.128e-08
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##   80162.34
library(MASS)
# Negative binomial regression with the same response and regressors as the
# Poisson model above; the extra theta parameter absorbs the overdispersion.
reg2 <- glm.nb(
  formula = peaOcupada ~ contribuyentesSunat + buenEstado,
  data = data
)
summary(reg2)
## 
## Call:
## glm.nb(formula = peaOcupada ~ contribuyentesSunat + buenEstado, 
##     data = data, init.theta = 3.198717665, link = log)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.04373  -1.15744   0.08482   0.78672   1.29108  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.230e+01  2.841e-01  43.293  < 2e-16 ***
## contribuyentesSunat 6.633e-07  1.306e-07   5.079  3.8e-07 ***
## buenEstado          8.246e-03  1.487e-02   0.554    0.579    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(3.1987) family taken to be 1)
## 
##     Null deviance: 84.727  on 24  degrees of freedom
## Residual deviance: 26.290  on 22  degrees of freedom
## AIC: 681.31
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  3.199 
##           Std. Err.:  0.862 
## 
##  2 x log-likelihood:  -673.314

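RPTA: Solo la variable contribuyentesSunat tiene un efecto estadísticamente significativo sobre la PEA ocupada (p = 3.8e-07); buenEstado no es significativa (p = 0.579).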

##SEGUNDA PARTE (2Puntos)

library(readr)
# Read the raw Lima sheet as exported to CSV. The file's first line is a title
# rather than a header, so the real column names are recovered further down.
dataLima <- read_csv(file = "Lima.xlsx - data.csv")
## New names:
## Rows: 60 Columns: 3
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): TASA DE DENUNCIAS POR COMISION DE DELITOS,SEGUN DISTRITO, ...2, ...3
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
## • `` -> `...3`
# Clean the raw Lima sheet into a two-column table.
#
# Drop the three leading rows that come before the real header row, and the
# second column, in a single indexing step.
dataLima <- dataLima[-seq_len(3L), -2]

# The first remaining row holds the real column headers: promote it to the
# column names and then remove it from the data.
colnames(dataLima) <- as.character(dataLima[1, ])
dataLima <- dataLima[-1, ]

# Drop the 13 trailing rows (presumably totals/footnotes — verify against the
# source sheet). head(x, -13) is used instead of 1:(nrow(x) - 13) because the
# latter silently yields c(1, 0) / descending indices when the table has 13
# rows or fewer.
dataLima <- head(dataLima, -13)
# Read the raw municipal-waste sheet as exported to CSV. As with the Lima file,
# the first line is a title, so the header row is promoted afterwards.
dataresiduos <- read_csv(file = "residuosPeru.xlsx - Sheet 1.csv")
## New names:
## Rows: 14979 Columns: 15
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (15): Residuos municipales generados anualmente, ...2, ...3, ...4, ...5,...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
# Promote the first data row to column names and remove it from the table.
# Every column was read as character, so the values stay character after this.
residuos_header <- as.character(dataresiduos[1, ])
dataresiduos <- setNames(dataresiduos[-1, ], residuos_header)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Align the join key: the waste table calls it DISTRITO, so rename it to
# Distrito before merging on that name.
dataresiduos <- rename(dataresiduos, Distrito = DISTRITO)

# Give the second Lima column a short, syntactic name.
names(dataLima)[2] <- "tasadenuncias"

#merge

# Inner join of the waste table with the Lima table on the district name; only
# districts present in both tables are kept.
# NOTE(review): the key is the district name alone and is matched exactly. If
# the waste table has several rows per district name, or the two sources spell
# names differently, rows will fan out or be dropped — verify the result.
datamergee <- merge(
  x = dataresiduos,
  y = dataLima,
  by = "Distrito",
  all = FALSE
)