Cargamos la base datos para realizar el analisis:
setwd("C:/Users/atita/Desktop/preg1")
data<- read.csv("Estudiantes_mate.csv",sep = ";")Despues vemos cuantos valores ausentes hay en cada columna de datos:
library(kableExtra)
library(scales)
library(tidyverse)
data_na<- as.data.frame(colSums(is.na(data)))
names(data_na)<- "Valores_NA"
data_na %>% kbl() %>% kable_material_dark() %>%
column_spec(column = 2, width = "3cm", background = alpha("blue",0.3))| Valores_NA | |
|---|---|
| school | 0 |
| sex | 0 |
| age | 0 |
| address | 0 |
| famsize | 0 |
| Pstatus | 0 |
| Medu | 0 |
| Fedu | 0 |
| Mjob | 0 |
| Fjob | 0 |
| reason | 0 |
| guardian | 0 |
| traveltime | 0 |
| studytime | 0 |
| failures | 0 |
| schoolsup | 0 |
| famsup | 0 |
| paid | 0 |
| activities | 0 |
| nursery | 0 |
| higher | 0 |
| internet | 0 |
| romantic | 0 |
| famrel | 0 |
| freetime | 0 |
| goout | 0 |
| Dalc | 0 |
| Walc | 0 |
| health | 0 |
| absences | 0 |
Nota: En nuestra base datos se puede observar que no posee ningun valor ausente NA.
Ahora despues de verificar que nuestra base datos no posee ningun valor faltante se procede a realizar el analisis exploratorio de 6 variables.
data %>% select(sex) %>% group_by(sex) %>% count() %>%
as.data.frame() %>% ggplot(aes(sex,n)) + geom_col(mapping = aes(c("Mujeres","Varones")),
fill=c(alpha("pink",0.4),alpha("blue",0.4)),
size=1.5)+
labs(title="Sexo de los estudiantes",x="",y="cantidad de Varones y de Mujeres")+
theme(
axis.title = element_text(colour="steelblue4",family = "Helvetica",
size = rel(1.2)),
axis.text = element_text(family = "Helvetica",colour = "steelblue1",
size = rel(1.0)),
axis.line = element_line(size = 1,colour = "black"),
axis.ticks = element_line(colour="grey",size = rel(1.2)),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "whitesmoke"),
plot.title = element_text(colour = "steelblue4", face = "bold",
size = rel(1.7),family = "Helvetica"))data %>% select(Pstatus) %>% group_by(Pstatus) %>% count() %>%
as.data.frame() %>% arrange(desc(n)) %>% ggplot(aes(Pstatus,n)) + geom_col(mapping = aes(c("Convivientes","No convivientes")),
fill=c(alpha("pink",0.4),alpha("blue",0.4)),
size=1.5)+
labs(title="Estado de convivencia de los padres",x="",y="cantidad de convivientes y no convivientes")+
theme(
axis.title = element_text(colour="steelblue4",family = "Helvetica",
size = rel(1.2)),
axis.text = element_text(family = "Helvetica",colour = "steelblue1",
size = rel(1.0)),
axis.line = element_line(size = 1,colour = "black"),
axis.ticks = element_line(colour="grey",size = rel(1.2)),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "whitesmoke"),
plot.title = element_text(colour = "steelblue4", face = "bold",
size = rel(1.7),family = "Helvetica"))data %>% select(schoolsup) %>% group_by(schoolsup) %>% count() %>%
as.data.frame() %>% arrange(desc(n)) %>% ggplot(aes(Pstatus,n)) + geom_col(mapping = aes(c("no","si")),
fill=c(alpha("pink",0.4),alpha("blue",0.4)),
size=1.5)+
labs(title="Apoyo educativo escolar",x="",y="Cantidad de personas que \n resiven apoyo educativo")+
theme(
axis.title = element_text(colour="steelblue4",family = "Helvetica",
size = rel(1.2)),
axis.text = element_text(family = "Helvetica",colour = "steelblue1",
size = rel(1.0)),
axis.line = element_line(size = 1,colour = "black"),
axis.ticks = element_line(colour="grey",size = rel(1.2)),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "whitesmoke"),
plot.title = element_text(colour = "steelblue4", face = "bold",
size = rel(1.7),family = "Helvetica"))data %>% select(schoolsup) %>% group_by(schoolsup) %>% count() %>%
as.data.frame() %>% arrange(desc(n)) %>% ggplot(aes(Pstatus,n)) + geom_col(mapping = aes(c("no","si")),
fill=c(alpha("pink",0.4),alpha("blue",0.4)),
size=1.5)+
labs(title="Apoyo educativo escolar",x="",y="Cantidad de personas que \n resiven apoyo educativo")+
theme(
axis.title = element_text(colour="steelblue4",family = "Helvetica",
size = rel(1.2)),
axis.text = element_text(family = "Helvetica",colour = "steelblue1",
size = rel(1.0)),
axis.line = element_line(size = 1,colour = "black"),
axis.ticks = element_line(colour="grey",size = rel(1.2)),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "whitesmoke"),
plot.title = element_text(colour = "steelblue4", face = "bold",
size = rel(1.7),family = "Helvetica"))data %>% select(absences) %>% ggplot()+geom_density(aes(absences),fill=alpha("steelblue",0.4))+
xlab("Número de asusencias escolares")+labs(title="Asusencias escolares")+
theme(
axis.title = element_text(colour="steelblue4",family = "Helvetica",
size = rel(1.2)),
axis.text = element_text(family = "Helvetica",colour = "steelblue1",
size = rel(1.0)),
axis.line = element_line(size = 1,colour = "black"),
axis.ticks = element_line(colour="grey",size = rel(1.2)),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "whitesmoke"),
plot.title = element_text(colour = "steelblue4", face = "bold",
size = rel(1.7),family = "Helvetica"))data %>% select(absences,sex) %>% mutate(sex1=ifelse(sex=="F","Femenino","Masculino"),position = 'stack') %>%
select(-sex) %>% ggplot()+geom_density(aes(absences,fill=sex1))+
xlab("Número de asusencias escolares")+labs(title="Asusencias escolares")+
facet_grid(sex1~., scales = 'free')+
theme(
axis.title = element_text(colour="steelblue4",family = "Helvetica",
size = rel(1.2)),
axis.text = element_text(family = "Helvetica",colour = "steelblue1",
size = rel(1.0)),
axis.line = element_line(size = 1,colour = "black"),
axis.ticks = element_line(colour="grey",size = rel(1.2)),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "whitesmoke"),
plot.title = element_text(colour = "steelblue4", face = "bold",
size = rel(1.7),family = "Helvetica"),
legend.position="none")library(hrbrthemes)
library(ggpubr)
library(funModeling)
library(naniar)
library(cowplot)Comenzamos a cargar los datos:
setwd("C:/Users/atita/Desktop/Preg2")
data<- read.csv("PNAD2015.csv")
colnames(data)[5]<- "Anios_estudio"Realizemos la siguiente transfomación en algunas variables categorcias:
data %>% mutate(Sexo=ifelse(Sexo==0,"Masculino","Femenino"),
ColorPiel=ifelse(ColorPiel==0,"Indigena",ifelse(ColorPiel==2,"Blanca", ifelse(ColorPiel==4,"Negra",ifelse(ColorPiel==6,"Amarillo", ifelse(ColorPiel==8,"Marror","No_precisa"))))))-> data1Vemos los valores asuntes de la base de datos:
vis_miss(data1) >> Nota: Se puede ver que la base de datos no tiene ningun valor ausente.
Realizamos una descriccion de la base de datos:
describe(data1)## data1
##
## 6 Variables 76840 Observations
## --------------------------------------------------------------------------------
## Region
## n missing distinct Info Mean Gmd .05 .10
## 76840 0 27 0.995 31.91 12.62 13 15
## .25 .50 .75 .90 .95
## 25 31 41 50 52
##
## lowest : 11 12 13 14 15, highest: 43 50 51 52 53
## --------------------------------------------------------------------------------
## Sexo
## n missing distinct
## 76840 0 2
##
## Value Femenino Masculino
## Frequency 23590 53250
## Proportion 0.307 0.693
## --------------------------------------------------------------------------------
## Edad
## n missing distinct Info Mean Gmd .05 .10
## 76840 0 84 0.999 44.07 14.18 25 28
## .25 .50 .75 .90 .95
## 34 43 53 61 65
##
## lowest : 13 14 15 16 17, highest: 92 94 95 97 99
## --------------------------------------------------------------------------------
## ColorPiel
## n missing distinct
## 76840 0 5
##
## lowest : Amarillo Blanca Indigena Marror Negra
## highest: Amarillo Blanca Indigena Marror Negra
##
## Value Amarillo Blanca Indigena Marror Negra
## Frequency 352 31815 357 35925 8391
## Proportion 0.005 0.414 0.005 0.468 0.109
## --------------------------------------------------------------------------------
## Anios_estudio
## n missing distinct Info Mean Gmd .05 .10
## 76840 0 17 0.975 9.47 5.133 1 3
## .25 .50 .75 .90 .95
## 6 11 12 16 16
##
## lowest : 1 2 3 4 5, highest: 13 14 15 16 17
##
## Value 1 2 3 4 5 6 7 8 9 10 11
## Frequency 5849 1388 2101 2891 6729 4499 2445 2689 7980 1840 2118
## Proportion 0.076 0.018 0.027 0.038 0.088 0.059 0.032 0.035 0.104 0.024 0.028
##
## Value 12 13 14 15 16 17
## Frequency 20848 1836 1253 1388 10795 191
## Proportion 0.271 0.024 0.016 0.018 0.140 0.002
## --------------------------------------------------------------------------------
## Ingresos
## n missing distinct Info Mean Gmd .05 .10
## 76840 0 1606 0.998 2000 2097 100 350
## .25 .50 .75 .90 .95
## 788 1200 2000 4000 6000
##
## lowest : 0 5 6 8 10, highest: 80000 90000 100000 120000 200000
## --------------------------------------------------------------------------------
Con la libreria funModeling se puede hacer el analisis exploratorio de una mejor manera:
freq(data1)## Sexo frequency percentage cumulative_perc
## 1 Masculino 53250 69.3 69.3
## 2 Femenino 23590 30.7 100.0
## ColorPiel frequency percentage cumulative_perc
## 1 Marror 35925 46.75 46.75
## 2 Blanca 31815 41.40 88.15
## 3 Negra 8391 10.92 99.07
## 4 Indigena 357 0.46 99.53
## 5 Amarillo 352 0.46 100.00
## [1] "Variables processed: Sexo, ColorPiel"
plot_num(data1)if (!require("pacman")) install.packages("pacman")
pacman::p_load(jpeg, png, ggplot2, grid, neuropsychology)
setwd("C:/Users/atita/Desktop/Preg2")
img <- png::readPNG("Flag_of_Brazil.svg.png")data %>% select(Region) %>% group_by(Region) %>% count() %>%
as.data.frame()%>% ggplot(aes(reorder(as.factor(Region),n),n,fill=n))+
annotation_custom(rasterGrob(img,
width = unit(1,"npc"),
height = unit(1,"npc")),
-Inf, Inf, -Inf, Inf) +
geom_col(alpha=0.8)+
scale_fill_gradient(low = "blue", high = "red")+
coord_flip()+labs(title="Ciudades encuestadas",x="",y="")+
theme(plot.subtitle = element_text(family = "NewCenturySchoolbook",
size = 10, face = "bold", colour = "gray25",
hjust = 0.1), axis.line = element_line(colour = NA),
axis.ticks = element_line(colour = NA,
linetype = "dashed"), axis.title = element_text(family = "AvantGarde",
face = "bold"), axis.text = element_text(family = "mono",
face = "bold"), axis.text.x = element_text(family = "serif",
vjust = 0.25), plot.title = element_text(family = "AvantGarde",
face = "bold", hjust = 0.5), legend.text = element_text(family = "mono"),
legend.position = "none")->grafico
data %>% select(Region,Edad,Anios_estudio,
Ingresos) %>% group_by(Region)%>%
summarise(Promdio_Edad=round(mean(Edad),3),
promedio_ingresos=round(mean(Ingresos),3),
promedio_estudio=round(mean(Anios_estudio),3))->data2
data %>% select(Region) %>% group_by(Region)%>% count()-> data_3
data2<- cbind(data2,data_3$n)
names(data2)[5]<- "N_encuestados"
data2<-data2[order(data2$N_encuestados,decreasing = T),]
data2<- data2[1:5,]
tabla<- ggtexttable(data2, rows = NULL, theme = ttheme("mBlue"))
ggarrange(grafico, tabla,
ncol = 1, nrow = 2,
heights = c(1, 0.5))data1 %>% filter(Region==c(35,31,43,29,33))-> new_data
p= ggscatterhist(
new_data, x = "Edad", y = "Ingresos",
color = "Sexo", size = 6, alpha = 0.6,
palette = c("#798E87", "#046C9A", "#C7B19C"),
margin.params = list(fill = "Sexo",
color = "black", size = 1)
) q= ggscatterhist(
new_data, x = "Edad", y = "Ingresos",
color = "ColorPiel", size = 6, alpha = 0.6,
palette = c("#798E87", "#046C9A", "#C7B19C","#AF7AC5","#85929E"),
margin.params = list(fill = "ColorPiel",
color = "black", size = 1)
) Primero se analiza cual es el procentaje de N.A en la base datos, viendo el porcentaje se elimina las variables con más del 7% con N.A:
setwd("C:/Users/atita/Desktop/Preg3")
data<- read.csv("CalCOFI.csv")
ata<-apply(data,2,function(x) sum(is.na(x))/length(x))
ata<- as.data.frame(round(ata,2))
names(ata)<- "N.A"
valores<- which(ata$N.A>0.07)
data_new<- data[,-valores]eli_na<- function(data){
na<- apply(data,2,function(x) sum(is.na(x))/length(x))
na<- as.data.frame(na)
names(ata)<- "N.A"
v<- which(ata$N.A>0.07)
na1<- na[,-valores]
return("new_data"=na,"Valores_na"=na)
}zx<- (data_new$R_Depth-mean(data_new$R_Depth))/sd(data_new$R_Depth)
p<-which(abs(zx)>3)
new_data<- data_new[-p,]library(stargazer)
modelo1<- lm(R_SALINITY~R_Depth,data = new_data)
stargazer(modelo1,type="html")| Dependent variable: | |
| R_SALINITY | |
| R_Depth | 0.001*** |
| (0.00000) | |
| Constant | 33.558*** |
| (0.001) | |
| Observations | 806,781 |
| R2 | 0.423 |
| Adjusted R2 | 0.423 |
| Residual Std. Error | 0.348 (df = 806779) |
| F Statistic | 591,367.000*** (df = 1; 806779) |
| Note: | p<0.1; p<0.05; p<0.01 |
modelo2<- lm(R_SALINITY~R_TEMP,data = new_data)
stargazer(modelo2,type = "html")| Dependent variable: | |
| R_SALINITY | |
| R_TEMP | -0.053*** |
| (0.0001) | |
| Constant | 34.416*** |
| (0.001) | |
| Observations | 803,534 |
| R2 | 0.233 |
| Adjusted R2 | 0.233 |
| Residual Std. Error | 0.401 (df = 803532) |
| F Statistic | 243,732.200*** (df = 1; 803532) |
| Note: | p<0.1; p<0.05; p<0.01 |
modelo3<- lm(R_SALINITY~R_TEMP+R_Depth,data = new_data)
stargazer(modelo3,type = "html")| Dependent variable: | |
| R_SALINITY | |
| R_TEMP | 0.005*** |
| (0.0001) | |
| R_Depth | 0.001*** |
| (0.00000) | |
| Constant | 33.484*** |
| (0.002) | |
| Observations | 803,534 |
| R2 | 0.424 |
| Adjusted R2 | 0.424 |
| Residual Std. Error | 0.347 (df = 803531) |
| F Statistic | 296,143.300*** (df = 2; 803531) |
| Note: | p<0.1; p<0.05; p<0.01 |
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
packages <- c("see","jtools","olsrr","parameters","stats","ggplot2", "palmerpenguins", "plot3D" , "plot3Drgl","apaTables","gvlma","broomExtra","performance")
ipak(packages)## see jtools olsrr parameters stats
## TRUE TRUE TRUE TRUE TRUE
## ggplot2 palmerpenguins plot3D plot3Drgl apaTables
## TRUE TRUE TRUE TRUE TRUE
## gvlma broomExtra performance
## TRUE TRUE TRUE
plot(compare_performance(modelo1,modelo2,modelo3,rank = T))Nota: El mejor modelo es Modelo 3 por su mejor ajuste.
plot_summs(modelo1,modelo2,modelo3,scale=T,robust=T)setwd("C:/Users/atita/Desktop/Preg4")
library(readxl)
data <- read_excel("Reactiva_Peru_Lista_de_empresas_al_30102020.xlsx",
skip = 2)
data %>% select(`SECTOR ECONÓMICO`,`NOMBRE DE ENTIDAD OTORGANTE DEL CRÉDITO`,
`MONTO PRÉSTAMO (S/)`,`MONTO COBERTURADO (S/)`,
DEPARTAMENTO) -> new_data
names(new_data)<- c("S.E","Entidad","Monto_prestamo","Monto_cobertura",
"Departamento")
data_na<- as.data.frame(colSums(is.na(new_data)))
names(data_na)<- "Valores_NA"
data_na %>% kbl() %>% kable_material_dark() %>%
column_spec(column = 2, width = "3cm", background = alpha("blue",0.3))| Valores_NA | |
|---|---|
| S.E | 3 |
| Entidad | 3 |
| Monto_prestamo | 3 |
| Monto_cobertura | 3 |
| Departamento | 3 |
Nota: Al somo tener 3 Valores N.A en cada variable procedere a eliminarlas
new_data<- na.omit(new_data)
freq(new_data)## S.E frequency percentage
## 1 COMERCIO 237995 47.48
## 2 TRANSPORTE, ALMACENAMIENTO Y COMUNIC. 59661 11.90
## 3 INDUSTRIA MANUFACTURERA 48576 9.69
## 4 ACTIV. INMOBILIARIAS, EMPRESARIALES ALQ. 40003 7.98
## 5 OTROS SERVICIOS 27441 5.47
## 6 CONSTRUCCION 27117 5.41
## 7 HOTELES Y RESTAURANTES 24567 4.90
## 8 AGRICULTUR, GANADERIA, CAZA Y SILVICULT 21797 4.35
## 9 SERVICIOS SOCIALES Y DE SALUD 5342 1.07
## 10 ENSEÑANZA 3094 0.62
## 11 PESCA 2942 0.59
## 12 MINERIA 1350 0.27
## 13 ELECTRICIDAD, GAS Y AGUA 717 0.14
## 14 INTERMEDIACION FINANCIERA 696 0.14
## cumulative_perc
## 1 47.48
## 2 59.38
## 3 69.07
## 4 77.05
## 5 82.52
## 6 87.93
## 7 92.83
## 8 97.18
## 9 98.25
## 10 98.87
## 11 99.46
## 12 99.73
## 13 99.87
## 14 100.00
## Entidad frequency percentage cumulative_perc
## 1 0049 - MIBANCO 255671 51.00 51.00
## 2 0002 - CRÉDITO 64832 12.93 63.93
## 3 0011 - BANCO BBVA PERU 25101 5.01 68.94
## 4 0803 - CMAC AREQUIPA 22095 4.41 73.35
## 5 0003 - INTERBANK 19677 3.93 77.28
## 6 0806 - CMAC CUSCO 19331 3.86 81.14
## 7 0808 - CMAC DE HUANCAYO 19101 3.81 84.95
## 8 0836 - CRAC RAIZ 15847 3.16 88.11
## 9 0009 - SCOTIABANK 12296 2.45 90.56
## 10 0098 - FINANCIERA PROEMPRESA S.A. 6849 1.37 91.93
## 11 0802 - CMAC TRUJILLO 6669 1.33 93.26
## 12 0073 - FINANCIERA CREDINKA 4883 0.97 94.23
## 13 0805 - CMAC SULLANA 4424 0.88 95.11
## 14 0093 - QAPAQ 4293 0.86 95.97
## 15 0813 - CMAC TACNA 3697 0.74 96.71
## 16 0811 - CMAC MAYNAS 3527 0.70 97.41
## 17 0091 - FINANCIERA COMPARTAMOS 3073 0.61 98.02
## 18 0043 - FINANCIERA CREDISCOTIA 2631 0.52 98.54
## 19 0809 - CMAC DE ICA 2038 0.41 98.95
## 20 0800 - CMCP LIMA 1999 0.40 99.35
## 21 0801 - CMAC PIURA 1514 0.30 99.65
## 22 0035 - BANCO PICHINCHA 679 0.14 99.79
## 23 0038 - INTERAMERICANO 466 0.09 99.88
## 24 0099 - FINANCIERA CONFIANZA 269 0.05 99.93
## 25 0850 - CRAC PRYMERA 171 0.03 99.96
## 26 0023 - COMERCIO 127 0.03 99.99
## 27 0096 - FINANCIERA EFECTIVA 29 0.01 100.00
## 28 0056 - SANTANDER PERÚ S.A. 9 0.00 100.00
## Departamento frequency percentage cumulative_perc
## 1 LIMA 153251 30.57 30.57
## 2 PUNO 37377 7.46 38.03
## 3 PIURA 33406 6.66 44.69
## 4 AREQUIPA 31450 6.27 50.96
## 5 CUSCO 29427 5.87 56.83
## 6 CAJAMARCA 27663 5.52 62.35
## 7 LA LIBERTAD 25833 5.15 67.50
## 8 LAMBAYEQUE 24117 4.81 72.31
## 9 JUNIN 23252 4.64 76.95
## 10 ANCASH 22045 4.40 81.35
## 11 TACNA 11645 2.32 83.67
## 12 AYACUCHO 10081 2.01 85.68
## 13 SAN MARTIN 10050 2.00 87.68
## 14 ICA 9549 1.90 89.58
## 15 CALLAO 8619 1.72 91.30
## 16 HUANUCO 8131 1.62 92.92
## 17 AMAZONAS 5276 1.05 93.97
## 18 APURIMAC 5153 1.03 95.00
## 19 TUMBES 4472 0.89 95.89
## 20 MOQUEGUA 4432 0.88 96.77
## 21 UCAYALI 4314 0.86 97.63
## 22 LORETO 4134 0.82 98.45
## 23 MADRE DE DIOS 3424 0.68 99.13
## 24 PASCO 2469 0.49 99.62
## 25 HUANCAVELICA 1728 0.34 100.00
## [1] "Variables processed: S.E, Entidad, Departamento"
plot_num(new_data)titulo<- "Relalcion entre Monto de prestamo vs Monto de cobertura"
library(ggpmisc)
scientific_10 <- function(x) {
gsub("e", " x 10^", scientific_format()(x))
}
ggplot(new_data, aes(x = Monto_prestamo, y =Monto_cobertura)) +
geom_point()+
stat_smooth(method = lm, formula = y ~ x ) +
stat_poly_eq(formula = y ~ x,
aes(label = paste(..eq.label.. , ..rr.label.., sep = "~~")),
parse = TRUE )+scale_y_continuous(label=scientific_10)+
scale_x_continuous(labels = scientific_10)+labs(title = titulo,
x="Monto de presamo",y="Monto de cobertura")+theme_minimal()