Pregunta 1

Cargamos la base datos para realizar el analisis:

setwd("C:/Users/atita/Desktop/preg1")
data<- read.csv("Estudiantes_mate.csv",sep = ";")

Despues vemos cuantos valores ausentes hay en cada columna de datos:

library(kableExtra)
library(scales)
library(tidyverse)
data_na<- as.data.frame(colSums(is.na(data)))
names(data_na)<- "Valores_NA"
data_na %>% kbl() %>% kable_material_dark() %>% 
  column_spec(column = 2, width = "3cm", background = alpha("blue",0.3))
Valores_NA
school 0
sex 0
age 0
address 0
famsize 0
Pstatus 0
Medu 0
Fedu 0
Mjob 0
Fjob 0
reason 0
guardian 0
traveltime 0
studytime 0
failures 0
schoolsup 0
famsup 0
paid 0
activities 0
nursery 0
higher 0
internet 0
romantic 0
famrel 0
freetime 0
goout 0
Dalc 0
Walc 0
health 0
absences 0

Nota: En nuestra base datos se puede observar que no posee ningun valor ausente NA.

Ahora despues de verificar que nuestra base datos no posee ningun valor faltante se procede a realizar el analisis exploratorio de 6 variables.

Primer grafico

data %>% select(sex) %>% group_by(sex) %>% count() %>% 
  as.data.frame() %>% ggplot(aes(sex,n)) + geom_col(mapping = aes(c("Mujeres","Varones")),
                                                    fill=c(alpha("pink",0.4),alpha("blue",0.4)),
                                                    size=1.5)+
  labs(title="Sexo de los estudiantes",x="",y="cantidad de Varones y de Mujeres")+
  theme(
        axis.title = element_text(colour="steelblue4",family = "Helvetica",
                                  size = rel(1.2)), 
        axis.text = element_text(family = "Helvetica",colour = "steelblue1",
                                 size = rel(1.0)), 
        axis.line = element_line(size = 1,colour = "black"), 
        axis.ticks = element_line(colour="grey",size = rel(1.2)),
        panel.grid.minor = element_blank(), 
        panel.background = element_rect(fill = "whitesmoke"), 
        plot.title = element_text(colour = "steelblue4", face = "bold",
                                  size = rel(1.7),family = "Helvetica"))

Segunda variable

data %>% select(Pstatus) %>% group_by(Pstatus) %>% count() %>% 
  as.data.frame() %>% arrange(desc(n)) %>% ggplot(aes(Pstatus,n)) + geom_col(mapping = aes(c("Convivientes","No convivientes")),
                                                    fill=c(alpha("pink",0.4),alpha("blue",0.4)),
                                                    size=1.5)+
  labs(title="Estado de convivencia de los padres",x="",y="cantidad de convivientes y no convivientes")+
  theme(
    axis.title = element_text(colour="steelblue4",family = "Helvetica",
                              size = rel(1.2)), 
    axis.text = element_text(family = "Helvetica",colour = "steelblue1",
                             size = rel(1.0)), 
    axis.line = element_line(size = 1,colour = "black"), 
    axis.ticks = element_line(colour="grey",size = rel(1.2)),
    panel.grid.minor = element_blank(), 
    panel.background = element_rect(fill = "whitesmoke"), 
    plot.title = element_text(colour = "steelblue4", face = "bold",
                              size = rel(1.7),family = "Helvetica"))

Tercera variable

data %>% select(schoolsup) %>% group_by(schoolsup) %>% count() %>% 
  as.data.frame() %>% arrange(desc(n)) %>% ggplot(aes(Pstatus,n)) + geom_col(mapping = aes(c("no","si")),
                                                                             fill=c(alpha("pink",0.4),alpha("blue",0.4)),
                                                                             size=1.5)+
  labs(title="Apoyo educativo escolar",x="",y="Cantidad de personas que \n resiven apoyo educativo")+
  theme(
    axis.title = element_text(colour="steelblue4",family = "Helvetica",
                              size = rel(1.2)), 
    axis.text = element_text(family = "Helvetica",colour = "steelblue1",
                             size = rel(1.0)), 
    axis.line = element_line(size = 1,colour = "black"), 
    axis.ticks = element_line(colour="grey",size = rel(1.2)),
    panel.grid.minor = element_blank(), 
    panel.background = element_rect(fill = "whitesmoke"), 
    plot.title = element_text(colour = "steelblue4", face = "bold",
                              size = rel(1.7),family = "Helvetica"))

Cuarta variable

data %>% select(schoolsup) %>% group_by(schoolsup) %>% count() %>% 
  as.data.frame() %>% arrange(desc(n)) %>% ggplot(aes(Pstatus,n)) + geom_col(mapping = aes(c("no","si")),
                                                                             fill=c(alpha("pink",0.4),alpha("blue",0.4)),
                                                                             size=1.5)+
  labs(title="Apoyo educativo escolar",x="",y="Cantidad de personas que \n resiven apoyo educativo")+
  theme(
    axis.title = element_text(colour="steelblue4",family = "Helvetica",
                              size = rel(1.2)), 
    axis.text = element_text(family = "Helvetica",colour = "steelblue1",
                             size = rel(1.0)), 
    axis.line = element_line(size = 1,colour = "black"), 
    axis.ticks = element_line(colour="grey",size = rel(1.2)),
    panel.grid.minor = element_blank(), 
    panel.background = element_rect(fill = "whitesmoke"), 
    plot.title = element_text(colour = "steelblue4", face = "bold",
                              size = rel(1.7),family = "Helvetica"))

Quinta variable

data %>% select(absences) %>% ggplot()+geom_density(aes(absences),fill=alpha("steelblue",0.4))+
  xlab("Número de asusencias escolares")+labs(title="Asusencias escolares")+
  theme(
    axis.title = element_text(colour="steelblue4",family = "Helvetica",
                              size = rel(1.2)), 
    axis.text = element_text(family = "Helvetica",colour = "steelblue1",
                             size = rel(1.0)), 
    axis.line = element_line(size = 1,colour = "black"), 
    axis.ticks = element_line(colour="grey",size = rel(1.2)),
    panel.grid.minor = element_blank(), 
    panel.background = element_rect(fill = "whitesmoke"), 
    plot.title = element_text(colour = "steelblue4", face = "bold",
                              size = rel(1.7),family = "Helvetica"))

Sexta variable

data %>% select(absences,sex) %>% mutate(sex1=ifelse(sex=="F","Femenino","Masculino"),position = 'stack') %>% 
  select(-sex) %>% ggplot()+geom_density(aes(absences,fill=sex1))+
  xlab("Número de asusencias escolares")+labs(title="Asusencias escolares")+
  facet_grid(sex1~., scales = 'free')+
  theme(
    axis.title = element_text(colour="steelblue4",family = "Helvetica",
                              size = rel(1.2)), 
    axis.text = element_text(family = "Helvetica",colour = "steelblue1",
                             size = rel(1.0)), 
    axis.line = element_line(size = 1,colour = "black"), 
    axis.ticks = element_line(colour="grey",size = rel(1.2)),
    panel.grid.minor = element_blank(), 
    panel.background = element_rect(fill = "whitesmoke"), 
    plot.title = element_text(colour = "steelblue4", face = "bold",
                              size = rel(1.7),family = "Helvetica"),
    legend.position="none")

Pregunta 2

library(hrbrthemes)
library(ggpubr)
library(funModeling) 
library(naniar)
library(cowplot)

Comenzamos a cargar los datos:

setwd("C:/Users/atita/Desktop/Preg2")
data<- read.csv("PNAD2015.csv")
colnames(data)[5]<- "Anios_estudio"

Realizemos la siguiente transfomación en algunas variables categorcias:

data %>% mutate(Sexo=ifelse(Sexo==0,"Masculino","Femenino"),
                ColorPiel=ifelse(ColorPiel==0,"Indigena",ifelse(ColorPiel==2,"Blanca",                                                             ifelse(ColorPiel==4,"Negra",ifelse(ColorPiel==6,"Amarillo",                                                                                                  ifelse(ColorPiel==8,"Marror","No_precisa"))))))-> data1

Vemos los valores asuntes de la base de datos:

vis_miss(data1)

>> Nota: Se puede ver que la base de datos no tiene ningun valor ausente.

Realizamos una descriccion de la base de datos:

describe(data1)
## data1 
## 
##  6  Variables      76840  Observations
## --------------------------------------------------------------------------------
## Region 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    76840        0       27    0.995    31.91    12.62       13       15 
##      .25      .50      .75      .90      .95 
##       25       31       41       50       52 
## 
## lowest : 11 12 13 14 15, highest: 43 50 51 52 53
## --------------------------------------------------------------------------------
## Sexo 
##        n  missing distinct 
##    76840        0        2 
##                               
## Value       Femenino Masculino
## Frequency      23590     53250
## Proportion     0.307     0.693
## --------------------------------------------------------------------------------
## Edad 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    76840        0       84    0.999    44.07    14.18       25       28 
##      .25      .50      .75      .90      .95 
##       34       43       53       61       65 
## 
## lowest : 13 14 15 16 17, highest: 92 94 95 97 99
## --------------------------------------------------------------------------------
## ColorPiel 
##        n  missing distinct 
##    76840        0        5 
## 
## lowest : Amarillo Blanca   Indigena Marror   Negra   
## highest: Amarillo Blanca   Indigena Marror   Negra   
##                                                        
## Value      Amarillo   Blanca Indigena   Marror    Negra
## Frequency       352    31815      357    35925     8391
## Proportion    0.005    0.414    0.005    0.468    0.109
## --------------------------------------------------------------------------------
## Anios_estudio 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    76840        0       17    0.975     9.47    5.133        1        3 
##      .25      .50      .75      .90      .95 
##        6       11       12       16       16 
## 
## lowest :  1  2  3  4  5, highest: 13 14 15 16 17
##                                                                             
## Value          1     2     3     4     5     6     7     8     9    10    11
## Frequency   5849  1388  2101  2891  6729  4499  2445  2689  7980  1840  2118
## Proportion 0.076 0.018 0.027 0.038 0.088 0.059 0.032 0.035 0.104 0.024 0.028
##                                               
## Value         12    13    14    15    16    17
## Frequency  20848  1836  1253  1388 10795   191
## Proportion 0.271 0.024 0.016 0.018 0.140 0.002
## --------------------------------------------------------------------------------
## Ingresos 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    76840        0     1606    0.998     2000     2097      100      350 
##      .25      .50      .75      .90      .95 
##      788     1200     2000     4000     6000 
## 
## lowest :      0      5      6      8     10, highest:  80000  90000 100000 120000 200000
## --------------------------------------------------------------------------------

Libreria funModeling

Con la libreria funModeling se puede hacer el analisis exploratorio de una mejor manera:

Datos categoricos:

freq(data1)

##        Sexo frequency percentage cumulative_perc
## 1 Masculino     53250       69.3            69.3
## 2  Femenino     23590       30.7           100.0

##   ColorPiel frequency percentage cumulative_perc
## 1    Marror     35925      46.75           46.75
## 2    Blanca     31815      41.40           88.15
## 3     Negra      8391      10.92           99.07
## 4  Indigena       357       0.46           99.53
## 5  Amarillo       352       0.46          100.00
## [1] "Variables processed: Sexo, ColorPiel"

Datos numericos:

plot_num(data1)

Analizando los 5 paises con una mayor cantidad de encuestados:

if (!require("pacman")) install.packages("pacman")
pacman::p_load(jpeg, png, ggplot2, grid, neuropsychology)
setwd("C:/Users/atita/Desktop/Preg2")
img <- png::readPNG("Flag_of_Brazil.svg.png")
data %>% select(Region) %>% group_by(Region) %>% count() %>% 
  as.data.frame()%>% ggplot(aes(reorder(as.factor(Region),n),n,fill=n))+
  annotation_custom(rasterGrob(img, 
                               width = unit(1,"npc"), 
                               height = unit(1,"npc")), 
                    -Inf, Inf, -Inf, Inf) +
  geom_col(alpha=0.8)+
  scale_fill_gradient(low = "blue", high = "red")+
  coord_flip()+labs(title="Ciudades encuestadas",x="",y="")+
  theme(plot.subtitle = element_text(family = "NewCenturySchoolbook", 
                                     size = 10, face = "bold", colour = "gray25", 
                                     hjust = 0.1), axis.line = element_line(colour = NA), 
        axis.ticks = element_line(colour = NA, 
                                  linetype = "dashed"), axis.title = element_text(family = "AvantGarde", 
                                                                                  face = "bold"), axis.text = element_text(family = "mono", 
                                                                                                                           face = "bold"), axis.text.x = element_text(family = "serif", 
                                                                                                                                                                      vjust = 0.25), plot.title = element_text(family = "AvantGarde", 
                                                                                                                                                                                                               face = "bold", hjust = 0.5), legend.text = element_text(family = "mono"),
        legend.position = "none")->grafico
  

data %>% select(Region,Edad,Anios_estudio,
                Ingresos) %>% group_by(Region)%>% 
      summarise(Promdio_Edad=round(mean(Edad),3),
                promedio_ingresos=round(mean(Ingresos),3),
                promedio_estudio=round(mean(Anios_estudio),3))->data2

data %>% select(Region) %>% group_by(Region)%>% count()-> data_3

data2<- cbind(data2,data_3$n)
names(data2)[5]<- "N_encuestados"
data2<-data2[order(data2$N_encuestados,decreasing = T),]

data2<- data2[1:5,]

tabla<- ggtexttable(data2, rows = NULL, theme = ttheme("mBlue")) 
 

ggarrange(grafico, tabla, 
          ncol = 1, nrow = 2,
          heights = c(1, 0.5))

data1 %>% filter(Region==c(35,31,43,29,33))-> new_data

p= ggscatterhist(
  new_data, x = "Edad", y = "Ingresos",
  color = "Sexo", size = 6, alpha = 0.6,
  palette = c("#798E87", "#046C9A", "#C7B19C"),
  margin.params = list(fill = "Sexo", 
                       color = "black", size = 1)
) 

q= ggscatterhist(
  new_data, x = "Edad", y = "Ingresos",
  color = "ColorPiel", size = 6, alpha = 0.6,
  palette = c("#798E87", "#046C9A", "#C7B19C","#AF7AC5","#85929E"),
  margin.params = list(fill = "ColorPiel", 
                       color = "black", size = 1)
) 

Pregunta 3

Primero se analiza cual es el procentaje de N.A en la base datos, viendo el porcentaje se elimina las variables con más del 7% con N.A:

setwd("C:/Users/atita/Desktop/Preg3")
data<- read.csv("CalCOFI.csv")
ata<-apply(data,2,function(x) sum(is.na(x))/length(x))
ata<- as.data.frame(round(ata,2))
names(ata)<- "N.A"
valores<- which(ata$N.A>0.07)
data_new<- data[,-valores]

Función para automatizar el proceso

eli_na<- function(data){
  na<- apply(data,2,function(x) sum(is.na(x))/length(x))
  na<- as.data.frame(na)
  names(ata)<- "N.A"
  v<- which(ata$N.A>0.07)
  na1<- na[,-valores]
  return("new_data"=na,"Valores_na"=na)
}

Eliminación de valores atipicos

zx<- (data_new$R_Depth-mean(data_new$R_Depth))/sd(data_new$R_Depth) 
p<-which(abs(zx)>3)
new_data<- data_new[-p,]

Primer modelo

library(stargazer)
modelo1<- lm(R_SALINITY~R_Depth,data = new_data)
stargazer(modelo1,type="html")
Dependent variable:
R_SALINITY
R_Depth 0.001***
(0.00000)
Constant 33.558***
(0.001)
Observations 806,781
R2 0.423
Adjusted R2 0.423
Residual Std. Error 0.348 (df = 806779)
F Statistic 591,367.000*** (df = 1; 806779)
Note: p<0.1; p<0.05; p<0.01
modelo2<- lm(R_SALINITY~R_TEMP,data = new_data)
stargazer(modelo2,type = "html")
Dependent variable:
R_SALINITY
R_TEMP -0.053***
(0.0001)
Constant 34.416***
(0.001)
Observations 803,534
R2 0.233
Adjusted R2 0.233
Residual Std. Error 0.401 (df = 803532)
F Statistic 243,732.200*** (df = 1; 803532)
Note: p<0.1; p<0.05; p<0.01
modelo3<- lm(R_SALINITY~R_TEMP+R_Depth,data = new_data)
stargazer(modelo3,type = "html")
Dependent variable:
R_SALINITY
R_TEMP 0.005***
(0.0001)
R_Depth 0.001***
(0.00000)
Constant 33.484***
(0.002)
Observations 803,534
R2 0.424
Adjusted R2 0.424
Residual Std. Error 0.347 (df = 803531)
F Statistic 296,143.300*** (df = 2; 803531)
Note: p<0.1; p<0.05; p<0.01

Perfomace del modelo

ipak <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg)) 
    install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}
packages <- c("see","jtools","olsrr","parameters","stats","ggplot2", "palmerpenguins", "plot3D" , "plot3Drgl","apaTables","gvlma","broomExtra","performance")
ipak(packages)
##            see         jtools          olsrr     parameters          stats 
##           TRUE           TRUE           TRUE           TRUE           TRUE 
##        ggplot2 palmerpenguins         plot3D      plot3Drgl      apaTables 
##           TRUE           TRUE           TRUE           TRUE           TRUE 
##          gvlma     broomExtra    performance 
##           TRUE           TRUE           TRUE
plot(compare_performance(modelo1,modelo2,modelo3,rank = T))

Nota: El mejor modelo es Modelo 3 por su mejor ajuste.

plot_summs(modelo1,modelo2,modelo3,scale=T,robust=T)

Pregunta 4

setwd("C:/Users/atita/Desktop/Preg4")

library(readxl)
data <- read_excel("Reactiva_Peru_Lista_de_empresas_al_30102020.xlsx", 
                                                          skip = 2)
data %>% select(`SECTOR ECONÓMICO`,`NOMBRE DE ENTIDAD OTORGANTE DEL CRÉDITO`,
                `MONTO PRÉSTAMO (S/)`,`MONTO COBERTURADO (S/)`,
                DEPARTAMENTO) -> new_data

names(new_data)<- c("S.E","Entidad","Monto_prestamo","Monto_cobertura",
                    "Departamento")
data_na<- as.data.frame(colSums(is.na(new_data)))
names(data_na)<- "Valores_NA"
data_na %>% kbl() %>% kable_material_dark() %>% 
  column_spec(column = 2, width = "3cm", background = alpha("blue",0.3))
Valores_NA
S.E 3
Entidad 3
Monto_prestamo 3
Monto_cobertura 3
Departamento 3

Nota: Al somo tener 3 Valores N.A en cada variable procedere a eliminarlas

Analisis de variables categoricas

new_data<- na.omit(new_data)
freq(new_data)

##                                         S.E frequency percentage
## 1                                  COMERCIO    237995      47.48
## 2     TRANSPORTE, ALMACENAMIENTO Y COMUNIC.     59661      11.90
## 3                   INDUSTRIA MANUFACTURERA     48576       9.69
## 4  ACTIV. INMOBILIARIAS, EMPRESARIALES ALQ.     40003       7.98
## 5                           OTROS SERVICIOS     27441       5.47
## 6                              CONSTRUCCION     27117       5.41
## 7                    HOTELES Y RESTAURANTES     24567       4.90
## 8   AGRICULTUR, GANADERIA, CAZA Y SILVICULT     21797       4.35
## 9             SERVICIOS SOCIALES Y DE SALUD      5342       1.07
## 10                                ENSEÑANZA      3094       0.62
## 11                                    PESCA      2942       0.59
## 12                                  MINERIA      1350       0.27
## 13                 ELECTRICIDAD, GAS Y AGUA       717       0.14
## 14                INTERMEDIACION FINANCIERA       696       0.14
##    cumulative_perc
## 1            47.48
## 2            59.38
## 3            69.07
## 4            77.05
## 5            82.52
## 6            87.93
## 7            92.83
## 8            97.18
## 9            98.25
## 10           98.87
## 11           99.46
## 12           99.73
## 13           99.87
## 14          100.00

##                              Entidad frequency percentage cumulative_perc
## 1                     0049 - MIBANCO    255671      51.00           51.00
## 2                     0002 - CRÉDITO     64832      12.93           63.93
## 3             0011 - BANCO BBVA PERU     25101       5.01           68.94
## 4               0803 - CMAC AREQUIPA     22095       4.41           73.35
## 5                   0003 - INTERBANK     19677       3.93           77.28
## 6                  0806 - CMAC CUSCO     19331       3.86           81.14
## 7            0808 - CMAC DE HUANCAYO     19101       3.81           84.95
## 8                   0836 - CRAC RAIZ     15847       3.16           88.11
## 9                  0009 - SCOTIABANK     12296       2.45           90.56
## 10 0098 - FINANCIERA PROEMPRESA S.A.      6849       1.37           91.93
## 11              0802 - CMAC TRUJILLO      6669       1.33           93.26
## 12        0073 - FINANCIERA CREDINKA      4883       0.97           94.23
## 13               0805 - CMAC SULLANA      4424       0.88           95.11
## 14                      0093 - QAPAQ      4293       0.86           95.97
## 15                 0813 - CMAC TACNA      3697       0.74           96.71
## 16                0811 - CMAC MAYNAS      3527       0.70           97.41
## 17     0091 - FINANCIERA COMPARTAMOS      3073       0.61           98.02
## 18     0043 - FINANCIERA CREDISCOTIA      2631       0.52           98.54
## 19                0809 - CMAC DE ICA      2038       0.41           98.95
## 20                  0800 - CMCP LIMA      1999       0.40           99.35
## 21                 0801 - CMAC PIURA      1514       0.30           99.65
## 22            0035 - BANCO PICHINCHA       679       0.14           99.79
## 23             0038 - INTERAMERICANO       466       0.09           99.88
## 24       0099 - FINANCIERA CONFIANZA       269       0.05           99.93
## 25               0850 - CRAC PRYMERA       171       0.03           99.96
## 26                   0023 - COMERCIO       127       0.03           99.99
## 27        0096 - FINANCIERA EFECTIVA        29       0.01          100.00
## 28        0056 - SANTANDER PERÚ S.A.         9       0.00          100.00

##     Departamento frequency percentage cumulative_perc
## 1           LIMA    153251      30.57           30.57
## 2           PUNO     37377       7.46           38.03
## 3          PIURA     33406       6.66           44.69
## 4       AREQUIPA     31450       6.27           50.96
## 5          CUSCO     29427       5.87           56.83
## 6      CAJAMARCA     27663       5.52           62.35
## 7    LA LIBERTAD     25833       5.15           67.50
## 8     LAMBAYEQUE     24117       4.81           72.31
## 9          JUNIN     23252       4.64           76.95
## 10        ANCASH     22045       4.40           81.35
## 11         TACNA     11645       2.32           83.67
## 12      AYACUCHO     10081       2.01           85.68
## 13    SAN MARTIN     10050       2.00           87.68
## 14           ICA      9549       1.90           89.58
## 15        CALLAO      8619       1.72           91.30
## 16       HUANUCO      8131       1.62           92.92
## 17      AMAZONAS      5276       1.05           93.97
## 18      APURIMAC      5153       1.03           95.00
## 19        TUMBES      4472       0.89           95.89
## 20      MOQUEGUA      4432       0.88           96.77
## 21       UCAYALI      4314       0.86           97.63
## 22        LORETO      4134       0.82           98.45
## 23 MADRE DE DIOS      3424       0.68           99.13
## 24         PASCO      2469       0.49           99.62
## 25  HUANCAVELICA      1728       0.34          100.00
## [1] "Variables processed: S.E, Entidad, Departamento"
plot_num(new_data)

Corelacion entre las dos varibles numericas

titulo<- "Relalcion entre Monto de prestamo vs Monto de cobertura"
library(ggpmisc)
scientific_10 <- function(x) {
  gsub("e", " x 10^", scientific_format()(x))
}
ggplot(new_data, aes(x =  Monto_prestamo, y =Monto_cobertura)) +
  geom_point()+
  stat_smooth(method = lm, formula = y ~ x ) + 
  stat_poly_eq(formula =  y ~ x,
               aes(label = paste(..eq.label.. , ..rr.label.., sep = "~~")),
               parse = TRUE )+scale_y_continuous(label=scientific_10)+
  scale_x_continuous(labels = scientific_10)+labs(title = titulo,
                    x="Monto de presamo",y="Monto de cobertura")+theme_minimal()