library(data.table)
library(ggplot2)
library(caret)
library(jtools)
library(scales)
library(dplyr)
library(lattice)
library(MASS)
library(faraway)
library(cvTools)
library(lmtest)
library(faraway)
library(latex2exp)
library(cvTools)
library(Metrics)
library(lubridate)
library(plotly)
# Remove every (non-hidden) object from the current environment so the analysis
# starts from an empty workspace. Packages attached above stay loaded.
rm(list = ls())

Carga de bases

# Read the 2013 and 2014 teacher files (semicolon-separated, with a header row)
# and convert each one to a data.table.
# `encoding` only recognises "latin1" and "UTF-8"; it marks the strings as
# Latin-1 and does not re-encode the input.
cargo_13 <- data.table(read.csv(
  "2013/20130904_Docentes_2013_20130709_PUBL.csv",
  header = TRUE, sep = ";", encoding = "latin1"
))

cargo_14 <- data.table(read.csv(
  "2014/20140819_Docentes_2014_20140704_PUBL.csv",
  header = TRUE, sep = ";", encoding = "latin1"
))

Combinación de bases

# Stack both years into one table. fill = TRUE keeps columns that exist in only
# one of the files and pads the missing cells with NA.
combined_data <- rbind(cargo_13, cargo_14, fill = TRUE)

Generación de nueva variable

# Flag (1) the 2013 rows whose MRUN has no row at all in 2014; everything else,
# including every 2014 row, gets 0.
# NOTE(review): this marks MRUNs that are absent from the 2014 file, which is
# not necessarily a move to another job -- confirm the intended definition.
seen_in_2014 <- combined_data$MRUN %in%
  combined_data$MRUN[combined_data$AGNO == 2014]
combined_data$change <- ifelse(
  combined_data$AGNO == 2013 & !seen_in_2014, 1, 0
)

# Number of flagged rows, computed from the data rather than typed in by hand.
n_changed <- sum(combined_data$change)
n_changed
## [1] 18308

# NOTE(review): 234168 is a hard-coded base; presumably a row or teacher count
# taken from the data -- confirm its source and derive it from combined_data.
percentage <- (n_changed / 234168) * 100
print(paste0(
  percentage,
  " % de los profesores se han cambiado de trabajo entre 2013 y 2014"
))
## [1] "7.81831847220799 % de los profesores se han cambiado de trabajo entre 2013 y 2014"

Confirmación de datos

# Keep only the rows flagged with change == 1 for closer inspection.
subset_data <- combined_data[which(combined_data[["change"]] == 1), ]

Confirmación de valores únicos

# MRUN is unique within the flagged subset exactly when no value repeats;
# anyDuplicated() returns 0 in that case.
all_unique <- anyDuplicated(subset_data$MRUN) == 0

# Report the outcome of the uniqueness check.
print(
  if (all_unique) {
    "All values in MRUN are unique."
  } else {
    "There are duplicate values in MRUN."
  }
)
## [1] "There are duplicate values in MRUN."

Valores repetidos

# Every occurrence of an MRUN after its first one within the flagged subset.
duplicated_values <- subset_data$MRUN[duplicated(subset_data$MRUN)]

# One entry per repeated occurrence, so the vector length is the repeat count.
num_repeated <- length(duplicated_values)

# Print the result.
print(paste("Número de datos repetidos en MRUN:", num_repeated))
## [1] "Número de datos repetidos en MRUN: 853"

Gráfico de proporción de cambios

# Bar chart with one bar per value of the change flag; bar height is the number
# of rows carrying that value.
# NOTE(review): the axis titles do not obviously describe "flag value" and
# "row count" -- confirm they are the intended labels.
graph1 <- ggplot() +
  geom_bar(data = combined_data, mapping = aes(x = change)) +
  labs(x = "Colegios", y = "Cambio de colegio", color = NULL) +
  theme_minimal() +
  theme(legend.position = "bottom")
graph1

Gráficos de cambios por edad

# Birth year: the first four characters of DOC_FEC_NAC, converted to a number.
combined_data$birth <- as.numeric(substr(combined_data$DOC_FEC_NAC, 1, 4))

# Birth date parsed from DOC_FEC_NAC as year, month, day.
# NOTE(review): the format string has blanks between the fields -- confirm it
# matches how DOC_FEC_NAC is actually stored.
combined_data$date_birth <- as.Date(
  strptime(combined_data$DOC_FEC_NAC, "%Y %m %d")
)

# Scatter of the change flag against birth year, coloured by DOC_GENERO.
# Columns are named bare inside aes() so they are looked up in the plot data;
# ggplot2 discourages `data$column` there. The x axis shows birth year, so it
# is titled accordingly.
ggplot(combined_data, aes(x = birth, y = change, color = DOC_GENERO)) +
  geom_point() +
  theme_minimal() +
  labs(x = "Nacimiento", y = "Cambio") +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Birth year for the flagged subset, taken from the first four characters of
# DOC_FEC_NAC.
subset_data$birth <- as.numeric(substr(subset_data$DOC_FEC_NAC, 1, 4))

# The four charts below count flagged rows per value of one column each.
# Columns are named bare inside aes() so they are looked up in the layer data;
# ggplot2 discourages `data$column` there.

# Flagged rows per birth year.
ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = birth)) +
  theme_minimal() +
  labs(x = "Nacimiento", y = "Numero de Personas", color = NULL) +
  theme(legend.position = "bottom")

# Flagged rows per RURAL_RBD value.
ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = RURAL_RBD)) +
  theme_minimal() +
  labs(x = "Rural", y = "Numero de Personas", color = NULL) +
  theme(legend.position = "bottom")

# Flagged rows per DOC_GENERO value.
ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = DOC_GENERO)) +
  theme_minimal() +
  labs(x = "Genero", y = "Numero de Personas", color = NULL) +
  theme(legend.position = "bottom")

# Flagged rows per COD_REG_RBD value.
ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = COD_REG_RBD)) +
  theme_minimal() +
  labs(x = "Región", y = "Numero de Personas", color = NULL) +
  theme(legend.position = "bottom")

Regresión lineal tipo

# Ordinary least squares fit of the 0/1 change flag on AGNO, RBD, RURAL_RBD,
# HORAS_AULA and birth, followed by the coefficient summary.
# NOTE(review): change is 0 for every 2014 row by construction, so the AGNO
# term largely reflects how the flag was built -- confirm this is intended.
f01 <- change ~ AGNO + RBD + RURAL_RBD + HORAS_AULA + birth
reg1 <- lm(f01, data = combined_data)
summary(reg1)
## 
## Call:
## lm(formula = f01, data = combined_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.10587 -0.07775 -0.01278  0.00317  0.93905 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  1.616e+02  1.133e+00  142.642  < 2e-16 ***
## AGNO        -8.038e-02  5.631e-04 -142.744  < 2e-16 ***
## RBD          2.348e-07  3.283e-08    7.153 8.53e-13 ***
## RURAL_RBD   -2.245e-03  8.775e-04   -2.559   0.0105 *  
## HORAS_AULA  -6.098e-04  2.127e-05  -28.671  < 2e-16 ***
## birth        1.660e-04  2.350e-05    7.064 1.61e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1909 on 461040 degrees of freedom
## Multiple R-squared:  0.04462,    Adjusted R-squared:  0.04461 
## F-statistic:  4307 on 5 and 461040 DF,  p-value: < 2.2e-16

2019-2020

# Remove every (non-hidden) object left in the current environment by the
# analysis above before the same steps are repeated for the next pair of years.
# Attached packages stay loaded.
rm(list = ls())

Carga de bases

# Read the 2019 and 2020 teacher files (semicolon-separated, with a header row)
# and convert each one to a data.table.
# `encoding` only recognises "latin1" and "UTF-8"; it marks the strings as
# Latin-1 and does not re-encode the input.
cargo_19 <- data.table(read.csv(
  "2019/20191009_Docentes_2019_20190630_PUBL.csv",
  header = TRUE, sep = ";", encoding = "latin1"
))

cargo_20 <- data.table(read.csv(
  "2020/20200727_Docentes_2020_20200630_PUBL.csv",
  header = TRUE, sep = ";", encoding = "latin1"
))

Combinación de bases

# Stack both years into one table. fill = TRUE keeps columns that exist in only
# one of the files and pads the missing cells with NA.
combined_data <- rbind(cargo_19, cargo_20, fill = TRUE)

Generación de nueva variable

# Flag (1) the 2019 rows whose MRUN has no row at all in 2020; everything else,
# including every 2020 row, gets 0.
# NOTE(review): this marks MRUNs that are absent from the 2020 file, which is
# not necessarily a move to another job -- confirm the intended definition.
seen_in_2020 <- combined_data$MRUN %in%
  combined_data$MRUN[combined_data$AGNO == 2020]
combined_data$change <- ifelse(
  combined_data$AGNO == 2019 & !seen_in_2020, 1, 0
)

# Number of flagged rows between 2019 and 2020, computed from the data rather
# than typed in by hand.
n_changed <- sum(combined_data$change)
n_changed
## [1] 20608

# NOTE(review): 263281 is a hard-coded base; presumably a row or teacher count
# taken from the data -- confirm its source and derive it from combined_data.
percentage <- (n_changed / 263281) * 100
# The message names the years this section actually compares.
print(paste0(
  percentage,
  " % de los profesores se han cambiado de trabajo entre 2019 y 2020"
))
## [1] "7.82737835240674 % de los profesores se han cambiado de trabajo entre 2019 y 2020"

Confirmación de datos

# Keep only the rows flagged with change == 1 for closer inspection.
subset_data <- combined_data[which(combined_data[["change"]] == 1), ]

Confirmación de valores únicos

# MRUN is unique within the flagged subset exactly when no value repeats;
# anyDuplicated() returns 0 in that case.
all_unique <- anyDuplicated(subset_data$MRUN) == 0

# Report the outcome of the uniqueness check.
print(
  if (all_unique) {
    "All values in MRUN are unique."
  } else {
    "There are duplicate values in MRUN."
  }
)
## [1] "There are duplicate values in MRUN."

Valores repetidos

# Every occurrence of an MRUN after its first one within the flagged subset.
duplicated_values <- subset_data$MRUN[duplicated(subset_data$MRUN)]

# One entry per repeated occurrence, so the vector length is the repeat count.
num_repeated <- length(duplicated_values)

# Print the result.
print(paste("Número de datos repetidos en MRUN:", num_repeated))
## [1] "Número de datos repetidos en MRUN: 803"

Gráfico de proporción de cambios

# Bar chart with one bar per value of the change flag; bar height is the number
# of rows carrying that value.
# NOTE(review): the axis titles do not obviously describe "flag value" and
# "row count" -- confirm they are the intended labels.
graph1 <- ggplot() +
  geom_bar(data = combined_data, mapping = aes(x = change)) +
  labs(x = "Colegios", y = "Cambio de colegio", color = NULL) +
  theme_minimal() +
  theme(legend.position = "bottom")
graph1

Gráficos de cambios por edad

# Birth year: the first four characters of DOC_FEC_NAC, converted to a number.
combined_data$birth <- as.numeric(substr(combined_data$DOC_FEC_NAC, 1, 4))

# Birth date parsed from DOC_FEC_NAC as year, month, day.
# NOTE(review): the format string has blanks between the fields -- confirm it
# matches how DOC_FEC_NAC is actually stored.
combined_data$date_birth <- as.Date(
  strptime(combined_data$DOC_FEC_NAC, "%Y %m %d")
)

# Scatter of the change flag against birth year, coloured by DOC_GENERO.
# Columns are named bare inside aes() so they are looked up in the plot data;
# ggplot2 discourages `data$column` there. The x axis shows birth year, so it
# is titled accordingly.
ggplot(combined_data, aes(x = birth, y = change, color = DOC_GENERO)) +
  geom_point() +
  theme_minimal() +
  labs(x = "Nacimiento", y = "Cambio") +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Birth year for the flagged subset, taken from the first four characters of
# DOC_FEC_NAC.
subset_data$birth <- as.numeric(substr(subset_data$DOC_FEC_NAC, 1, 4))

# The four charts below count flagged rows per value of one column each.
# Columns are named bare inside aes() so they are looked up in the layer data;
# ggplot2 discourages `data$column` there.

# Flagged rows per birth year.
ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = birth)) +
  theme_minimal() +
  labs(x = "Nacimiento", y = "Numero de Personas", color = NULL) +
  theme(legend.position = "bottom")

# Flagged rows per RURAL_RBD value.
ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = RURAL_RBD)) +
  theme_minimal() +
  labs(x = "Rural", y = "Numero de Personas", color = NULL) +
  theme(legend.position = "bottom")

# Flagged rows per DOC_GENERO value.
ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = DOC_GENERO)) +
  theme_minimal() +
  labs(x = "Genero", y = "Numero de Personas", color = NULL) +
  theme(legend.position = "bottom")

# Flagged rows per COD_REG_RBD value.
ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = COD_REG_RBD)) +
  theme_minimal() +
  labs(x = "Región", y = "Numero de Personas", color = NULL) +
  theme(legend.position = "bottom")

Regresión lineal tipo

# Ordinary least squares fit of the 0/1 change flag on AGNO, RBD, RURAL_RBD,
# HORAS_AULA and birth, followed by the coefficient summary.
# NOTE(review): change is 0 for every 2020 row by construction, so the AGNO
# term largely reflects how the flag was built -- confirm this is intended.
f01 <- change ~ AGNO + RBD + RURAL_RBD + HORAS_AULA + birth
reg1 <- lm(f01, data = combined_data)
summary(reg1)
## 
## Call:
## lm(formula = f01, data = combined_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.10188 -0.07586 -0.01022  0.00168  0.94560 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  1.555e+02  1.049e+00  148.177  < 2e-16 ***
## AGNO        -7.713e-02  5.196e-04 -148.446  < 2e-16 ***
## RBD          3.887e-07  2.961e-08   13.128  < 2e-16 ***
## RURAL_RBD   -3.049e-03  7.973e-04   -3.824 0.000131 ***
## HORAS_AULA  -3.994e-04  2.000e-05  -19.973  < 2e-16 ***
## birth        1.725e-04  2.171e-05    7.944 1.96e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1891 on 530183 degrees of freedom
##   (471 observations deleted due to missingness)
## Multiple R-squared:  0.04095,    Adjusted R-squared:  0.04094 
## F-statistic:  4528 on 5 and 530183 DF,  p-value: < 2.2e-16