library(data.table)
library(ggplot2)
library(caret)
library(jtools)
library(scales)
library(dplyr)
library(lattice)
library(MASS)
library(faraway)
library(cvTools)
library(lmtest)
library(faraway)
library(latex2exp)
library(cvTools)
library(Metrics)
library(lubridate)
library(plotly)
# Workspace wipe removed: `rm(list = ls())` deletes every object in the
# caller's global environment when this script is sourced, while leaving
# attached packages and options untouched, so it does not give a clean session.
# Every object used below is assigned before it is read; run the script in a
# fresh R session when isolation is needed.
Carga de bases
# Load the 2013 and 2014 teacher files (semicolon-separated, header row) and
# wrap each one in a data.table.
# - The 2014 path no longer ends in a space; a path with a trailing space fails
#   on file systems that treat the space as part of the file name.
# - read.csv() recognises "latin1" / "UTF-8" as encoding markers; the previous
#   spelling "Latin-1" is the one used by data.table::fread() and is not one of
#   the values read.csv() acts on.
cargo_13 <- data.table(read.csv(
  "2013/20130904_Docentes_2013_20130709_PUBL.csv",
  header = TRUE, sep = ";", encoding = "latin1"
))
cargo_14 <- data.table(read.csv(
  "2014/20140819_Docentes_2014_20140704_PUBL.csv",
  header = TRUE, sep = ";", encoding = "latin1"
))
Combinación de bases
# Stack both years into one table, matching columns by name and padding
# columns that exist in only one year with NA.
combined_data <- rbindlist(
  list(cargo_13, cargo_14),
  use.names = TRUE,
  fill = TRUE
)
Generación de nueva variable
# Flag (1/0) every 2013 record whose teacher ID (MRUN) does not appear in any
# 2014 record.
# NOTE(review): this marks teachers absent from the 2014 file, which is not
# necessarily a change of school - confirm this is the intended definition.
mrun_2014 <- combined_data$MRUN[combined_data$AGNO == 2014]
combined_data$change <- ifelse(
  combined_data$AGNO == 2013 & !combined_data$MRUN %in% mrun_2014,
  1,
  0
)
# Number of flagged records between 2013 and 2014
n_changes <- sum(combined_data$change)
n_changes
## [1] 18308
# The numerator is taken from the data instead of being copied by hand from the
# printed output, so it cannot drift if the input files change.
# NOTE(review): 234168 is still hard-coded; presumably the 2013 teacher/record
# count - confirm its origin and derive it from `combined_data` as well.
percentage <- (n_changes / 234168) * 100
print(paste0(percentage, " % de los profesores se han cambiado de trabajo entre 2013 y 2014"))
## [1] "7.81831847220799 % de los profesores se han cambiado de trabajo entre 2013 y 2014"
Confirmación de datos
# Keep only the rows flagged as a change (change == 1).
subset_data <- combined_data[which(combined_data$change == 1), ]
Confirmación de valores únicos
# TRUE when no teacher ID (MRUN) occurs more than once among the flagged rows.
all_unique <- anyDuplicated(subset_data$MRUN) == 0L
# Report the outcome of the check
print(
  if (all_unique) {
    "All values in MRUN are unique."
  } else {
    "There are duplicate values in MRUN."
  }
)
## [1] "There are duplicate values in MRUN."
Valores repetidos
# MRUN values that repeat an earlier row of the flagged subset
# (the second and later occurrences of each ID).
duplicated_values <- subset_data$MRUN[duplicated(subset_data$MRUN)]
# Each repeated row contributes exactly one entry above, so its length is the
# number of repeated records.
num_repeated <- length(duplicated_values)
# Print the result
print(paste("Número de datos repetidos en MRUN:", num_repeated))
## [1] "Número de datos repetidos en MRUN: 853"
Gráfico de proporción de cambios
# Bar chart of the number of records for each value of the `change` flag.
# The axis labels now describe what is drawn: the flag on x and the row count
# on y (they previously read 'Colegios' / 'Cambio de colegio').
# The colour label and legend position were dropped because this plot maps no
# colour aesthetic and therefore has no legend.
graph1 <- ggplot(combined_data, aes(x = change)) +
  geom_bar() +
  theme_minimal() +
  labs(x = "Cambio de colegio", y = "Número de registros")
graph1

Gráficos de cambios por edad
# Birth year: the first four characters of DOC_FEC_NAC, as a number.
combined_data$birth <- as.numeric(substr(combined_data$DOC_FEC_NAC, 1, 4))
# Birth date parsed as year, month, day.
# NOTE(review): assumes DOC_FEC_NAC carries a full year-month-day value;
# entries without a day component would parse to NA - confirm the format.
combined_data$date_birth <- as.Date(strptime(combined_data$DOC_FEC_NAC, "%Y %m %d"))
# Scatter of the change flag against birth year, coloured by DOC_GENERO.
# Columns are referenced by bare name inside aes(): `combined_data$birth`
# bypasses ggplot2's data masking and triggers its "use of `$` is discouraged"
# warning. The x label now says birth year, which is what is plotted.
ggplot(combined_data, aes(x = birth, y = change, color = DOC_GENERO)) +
  geom_point() +
  theme_minimal() +
  labs(x = "Año de nacimiento", y = "Cambio") +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Birth year of each flagged record: first four characters of DOC_FEC_NAC.
subset_data$birth <- as.numeric(
  substring(subset_data$DOC_FEC_NAC, first = 1, last = 4)
)
# Number of flagged records per birth year.
ggplot(data = subset_data, mapping = aes(x = birth)) +
  geom_bar() +
  theme_minimal() +
  labs(x = "Nacimiento", y = "Numero de Personas", color = NULL) +
  theme(legend.position = "bottom")

# Counts of flagged records by RURAL_RBD, DOC_GENERO and COD_REG_RBD.
# Columns are referenced by bare name inside aes() so ggplot2 looks them up in
# `subset_data`; the previous `subset_data$col` form bypasses data masking and
# raises ggplot2's "use of `$` is discouraged" warning. The colour label and
# legend position were dropped because these plots have no legend.

# Flagged records by rural indicator
ggplot(subset_data, aes(x = RURAL_RBD)) +
  geom_bar() +
  theme_minimal() +
  labs(x = "Rural", y = "Numero de Personas")

# Flagged records by gender
ggplot(subset_data, aes(x = DOC_GENERO)) +
  geom_bar() +
  theme_minimal() +
  labs(x = "Genero", y = "Numero de Personas")

# Flagged records by region code
ggplot(subset_data, aes(x = COD_REG_RBD)) +
  geom_bar() +
  theme_minimal() +
  labs(x = "Región", y = "Numero de Personas")

Regresión lineal tipo
# Ordinary least squares of the 0/1 `change` flag on AGNO, RBD, RURAL_RBD,
# HORAS_AULA and birth year (a linear probability model).
# NOTE(review): `change` is 0 for every 2014 row by construction, so the AGNO
# coefficient largely reflects how the flag was built - confirm this is wanted.
f01 <- change ~ AGNO + RBD + RURAL_RBD + HORAS_AULA + birth
reg1 <- lm(formula = f01, data = combined_data)
summary(reg1)
##
## Call:
## lm(formula = f01, data = combined_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.10587 -0.07775 -0.01278 0.00317 0.93905
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.616e+02 1.133e+00 142.642 < 2e-16 ***
## AGNO -8.038e-02 5.631e-04 -142.744 < 2e-16 ***
## RBD 2.348e-07 3.283e-08 7.153 8.53e-13 ***
## RURAL_RBD -2.245e-03 8.775e-04 -2.559 0.0105 *
## HORAS_AULA -6.098e-04 2.127e-05 -28.671 < 2e-16 ***
## birth 1.660e-04 2.350e-05 7.064 1.61e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1909 on 461040 degrees of freedom
## Multiple R-squared: 0.04462, Adjusted R-squared: 0.04461
## F-statistic: 4307 on 5 and 461040 DF, p-value: < 2.2e-16
2019-2020
# Free the objects created by the 2013-2014 analysis before loading the next
# pair of years. Only those names are removed: `rm(list = ls())` would also
# delete unrelated objects from the caller's workspace. intersect() skips names
# that do not exist, so no "object not found" warning is raised.
rm(list = intersect(
  ls(),
  c(
    "cargo_13", "cargo_14", "combined_data", "subset_data", "percentage",
    "all_unique", "duplicated_values", "num_repeated", "graph1", "f01", "reg1"
  )
))
Carga de bases
# Load the 2019 and 2020 teacher files (semicolon-separated, header row) and
# wrap each one in a data.table.
# - The 2020 path no longer ends in a space; a path with a trailing space fails
#   on file systems that treat the space as part of the file name.
# - read.csv() recognises "latin1" / "UTF-8" as encoding markers; the previous
#   spelling "Latin-1" is the one used by data.table::fread() and is not one of
#   the values read.csv() acts on.
cargo_19 <- data.table(read.csv(
  "2019/20191009_Docentes_2019_20190630_PUBL.csv",
  header = TRUE, sep = ";", encoding = "latin1"
))
cargo_20 <- data.table(read.csv(
  "2020/20200727_Docentes_2020_20200630_PUBL.csv",
  header = TRUE, sep = ";", encoding = "latin1"
))
Combinación de bases
# Stack both years into one table, matching columns by name and padding
# columns that exist in only one year with NA.
combined_data <- rbindlist(
  list(cargo_19, cargo_20),
  use.names = TRUE,
  fill = TRUE
)
Generación de nueva variable
# Flag (1/0) every 2019 record whose teacher ID (MRUN) does not appear in any
# 2020 record.
# NOTE(review): this marks teachers absent from the 2020 file, which is not
# necessarily a change of school - confirm this is the intended definition.
mrun_2020 <- combined_data$MRUN[combined_data$AGNO == 2020]
combined_data$change <- ifelse(
  combined_data$AGNO == 2019 & !combined_data$MRUN %in% mrun_2020,
  1,
  0
)
# Number of flagged records between 2019 and 2020
n_changes <- sum(combined_data$change)
n_changes
## [1] 20608
# The numerator is taken from the data instead of being copied by hand from the
# printed output. The message previously said "2013 y 2014" (left over from the
# earlier section) although this block analyses 2019 and 2020.
# NOTE(review): 263281 is still hard-coded; presumably the 2019 teacher/record
# count - confirm its origin and derive it from `combined_data` as well.
percentage <- (n_changes / 263281) * 100
print(paste0(percentage, " % de los profesores se han cambiado de trabajo entre 2019 y 2020"))
## [1] "7.82737835240674 % de los profesores se han cambiado de trabajo entre 2019 y 2020"
Confirmación de datos
# Keep only the rows flagged as a change (change == 1).
subset_data <- combined_data[which(combined_data$change == 1), ]
Confirmación de valores únicos
# TRUE when no teacher ID (MRUN) occurs more than once among the flagged rows.
all_unique <- anyDuplicated(subset_data$MRUN) == 0L
# Report the outcome of the check
print(
  if (all_unique) {
    "All values in MRUN are unique."
  } else {
    "There are duplicate values in MRUN."
  }
)
## [1] "There are duplicate values in MRUN."
Valores repetidos
# MRUN values that repeat an earlier row of the flagged subset
# (the second and later occurrences of each ID).
duplicated_values <- subset_data$MRUN[duplicated(subset_data$MRUN)]
# Each repeated row contributes exactly one entry above, so its length is the
# number of repeated records.
num_repeated <- length(duplicated_values)
# Print the result
print(paste("Número de datos repetidos en MRUN:", num_repeated))
## [1] "Número de datos repetidos en MRUN: 803"
Gráfico de proporción de cambios
# Bar chart of the number of records for each value of the `change` flag.
# The axis labels now describe what is drawn: the flag on x and the row count
# on y (they previously read 'Colegios' / 'Cambio de colegio').
# The colour label and legend position were dropped because this plot maps no
# colour aesthetic and therefore has no legend.
graph1 <- ggplot(combined_data, aes(x = change)) +
  geom_bar() +
  theme_minimal() +
  labs(x = "Cambio de colegio", y = "Número de registros")
graph1

Gráficos de cambios por edad
# Birth year: the first four characters of DOC_FEC_NAC, as a number.
combined_data$birth <- as.numeric(substr(combined_data$DOC_FEC_NAC, 1, 4))
# Birth date parsed as year, month, day.
# NOTE(review): assumes DOC_FEC_NAC carries a full year-month-day value;
# entries without a day component would parse to NA - confirm the format.
combined_data$date_birth <- as.Date(strptime(combined_data$DOC_FEC_NAC, "%Y %m %d"))
# Scatter of the change flag against birth year, coloured by DOC_GENERO.
# Columns are referenced by bare name inside aes(): `combined_data$birth`
# bypasses ggplot2's data masking and triggers its "use of `$` is discouraged"
# warning. The x label now says birth year, which is what is plotted.
ggplot(combined_data, aes(x = birth, y = change, color = DOC_GENERO)) +
  geom_point() +
  theme_minimal() +
  labs(x = "Año de nacimiento", y = "Cambio") +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Birth year of each flagged record: first four characters of DOC_FEC_NAC.
subset_data$birth <- as.numeric(
  substring(subset_data$DOC_FEC_NAC, first = 1, last = 4)
)
# Number of flagged records per birth year.
ggplot(data = subset_data, mapping = aes(x = birth)) +
  geom_bar() +
  theme_minimal() +
  labs(x = "Nacimiento", y = "Numero de Personas", color = NULL) +
  theme(legend.position = "bottom")

# Counts of flagged records by RURAL_RBD, DOC_GENERO and COD_REG_RBD.
# Columns are referenced by bare name inside aes() so ggplot2 looks them up in
# `subset_data`; the previous `subset_data$col` form bypasses data masking and
# raises ggplot2's "use of `$` is discouraged" warning. The colour label and
# legend position were dropped because these plots have no legend.

# Flagged records by rural indicator
ggplot(subset_data, aes(x = RURAL_RBD)) +
  geom_bar() +
  theme_minimal() +
  labs(x = "Rural", y = "Numero de Personas")

# Flagged records by gender
ggplot(subset_data, aes(x = DOC_GENERO)) +
  geom_bar() +
  theme_minimal() +
  labs(x = "Genero", y = "Numero de Personas")

# Flagged records by region code
ggplot(subset_data, aes(x = COD_REG_RBD)) +
  geom_bar() +
  theme_minimal() +
  labs(x = "Región", y = "Numero de Personas")

Regresión lineal tipo
# Ordinary least squares of the 0/1 `change` flag on AGNO, RBD, RURAL_RBD,
# HORAS_AULA and birth year (a linear probability model).
# NOTE(review): `change` is 0 for every 2020 row by construction, so the AGNO
# coefficient largely reflects how the flag was built - confirm this is wanted.
f01 <- change ~ AGNO + RBD + RURAL_RBD + HORAS_AULA + birth
reg1 <- lm(formula = f01, data = combined_data)
summary(reg1)
##
## Call:
## lm(formula = f01, data = combined_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.10188 -0.07586 -0.01022 0.00168 0.94560
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.555e+02 1.049e+00 148.177 < 2e-16 ***
## AGNO -7.713e-02 5.196e-04 -148.446 < 2e-16 ***
## RBD 3.887e-07 2.961e-08 13.128 < 2e-16 ***
## RURAL_RBD -3.049e-03 7.973e-04 -3.824 0.000131 ***
## HORAS_AULA -3.994e-04 2.000e-05 -19.973 < 2e-16 ***
## birth 1.725e-04 2.171e-05 7.944 1.96e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1891 on 530183 degrees of freedom
## (471 observations deleted due to missingness)
## Multiple R-squared: 0.04095, Adjusted R-squared: 0.04094
## F-statistic: 4528 on 5 and 530183 DF, p-value: < 2.2e-16