library(data.table)
library(ggplot2)
library(caret)
library(jtools)
library(scales)
library(dplyr)
library(lattice)
library(MASS)
library(faraway)
library(cvTools)
library(lmtest)
library(faraway)
library(latex2exp)
library(cvTools)
library(Metrics)
library(lubridate)
library(plotly)

# Remove every visible (non dot-prefixed) object from the current environment
# so that the analysis below starts from an empty workspace.
# NOTE(review): this also wipes whatever the user had loaded when the script
# is sourced interactively; running in a fresh R session avoids the need.
rm(list = ls())

Carga de bases

# Read the 2013 and 2014 "Docentes" files (semicolon-separated, first row is
# the header) and convert each one to a data.table.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
# NOTE(review): read.csv() documents "latin1" and "UTF-8" as the recognised
# values of `encoding`; confirm that 'Latin-1' has the intended effect.
cargo_13 <- data.table(read.csv(
  '2013/20130904_Docentes_2013_20130709_PUBL.csv',
  header = TRUE, sep = ";", encoding = 'Latin-1'
))

# The path used to end with a blank inside the quotes; a file system that
# treats that blank as part of the name cannot find the file.
cargo_14 <- data.table(read.csv(
  '2014/20140819_Docentes_2014_20140704_PUBL.csv',
  header = TRUE, sep = ";", encoding = 'Latin-1'
))

Combinación de bases

# Stack both years into a single table. fill = TRUE lets the two inputs have
# different column sets; a column missing from one input is filled with NA.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
combined_data <- rbind(cargo_13, cargo_14, fill = TRUE)

Generación de nueva variable

# change = 1 for every 2013 row whose MRUN has no row in 2014, 0 otherwise.
# The helper vectors live inside local() so they do not leak into the workspace.
combined_data$change <- local({
  in_base_year <- combined_data$AGNO == 2013
  next_year_ids <- combined_data$MRUN[combined_data$AGNO == 2014]
  ifelse(in_base_year & !(combined_data$MRUN %in% next_year_ids), 1, 0)
})

# Number of rows flagged in `change` for 2013-2014
sum(combined_data[["change"]])

## [1] 18308

# Share of the 2013 rows that are flagged in `change`. Both counts are taken
# from the data instead of being typed in from an earlier run, so the figure
# stays correct when the input files change.
# NOTE(review): `change` marks 2013 rows whose MRUN is absent from 2014, and a
# MRUN may appear on several rows; confirm that "changed jobs" and a per-row
# count are the intended reading.
n_flagged <- sum(combined_data$change)
n_base_year <- sum(combined_data$AGNO == 2013)
percentage <- (n_flagged / n_base_year) * 100
print(paste0(
  percentage,
  " % de los profesores se han cambiado de trabajo entre 2013 y 2014"
))

# Recorded output of the original run, which used the literals 18308 / 234168:
## [1] "7.81831847220799 % de los profesores se han cambiado de trabajo entre 2013 y 2014"

Confirmación de datos

# Keep only the rows flagged as a change (data.table row filter on `change`).
subset_data <- combined_data[change == 1]

Confirmación de valores únicos

# MRUN is unique within the subset exactly when no value occurs twice;
# anyDuplicated() returns 0 in that case.
all_unique <- anyDuplicated(subset_data$MRUN) == 0

# Report the outcome
print(
  if (all_unique) {
    "All values in MRUN are unique."
  } else {
    "There are duplicate values in MRUN."
  }
)

## [1] "There are duplicate values in MRUN."

Valores repetidos

# Every occurrence of an MRUN after its first one
duplicated_values <- subset_data$MRUN[duplicated(subset_data$MRUN)]

# One element was extracted per repeated row, so the number of repeats is the
# length of that vector.
num_repeated <- length(duplicated_values)

# Show the result
print(paste("Número de datos repetidos en MRUN:", num_repeated))

## [1] "Número de datos repetidos en MRUN: 853"

Gráfico de proporción de cambios

# Bar chart with the number of rows for each value of `change`.
# NOTE(review): the x axis carries the `change` values and the y axis the row
# count; confirm that the axis titles below describe them as intended.
graph1 <- ggplot() +
  geom_bar(data = combined_data, mapping = aes(x = change)) +
  labs(x = 'Colegios', y = 'Cambio de colegio', color = NULL) +
  theme_minimal() +
  theme(legend.position = 'bottom')
graph1

Gráficos de cambios por edad

# Birth year: the first four characters of DOC_FEC_NAC, converted to a number
combined_data$birth <- as.numeric(substr(combined_data$DOC_FEC_NAC, 1, 4))

# Birth date: DOC_FEC_NAC parsed as year, month, day and stored as a Date.
# NOTE(review): values that do not match "%Y %m %d" become NA; confirm that
# DOC_FEC_NAC really carries a day component.
combined_data$date_birth <- as.Date(strptime(combined_data$DOC_FEC_NAC, "%Y %m %d"))

# Scatter plot of the change flag against birth year, coloured by DOC_GENERO.
# Columns are named directly inside aes(): ggplot2 discourages `data$column`
# there and warns about it, because the mapping then bypasses the plot data.
ggplot(combined_data, aes(x = birth, y = change, color = DOC_GENERO)) +
  geom_point() +
  theme_minimal() +
  # The x axis holds the birth year, not an age, so the title says so.
  labs(x = 'Nacimiento', y = 'Cambio') +
  theme(legend.position = 'bottom',
        axis.text.x = element_text(angle = 45, hjust = 1))

# Birth year for the flagged rows (first four characters of DOC_FEC_NAC),
# followed by a bar chart of the number of rows per birth year.
subset_data$birth <- as.numeric(
  substr(subset_data$DOC_FEC_NAC, start = 1, stop = 4)
)

ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = birth)) +
  labs(x = 'Nacimiento', y = 'Numero de Personas', color = NULL) +
  theme_minimal() +
  theme(legend.position = 'bottom')

# Bar chart of the flagged rows by RURAL_RBD.
# The column is named directly inside aes(): ggplot2 discourages
# `data$column` there and warns about it.
ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = RURAL_RBD)) +
  theme_minimal() +
  labs(x = 'Rural', y = 'Numero de Personas', color = NULL) +
  theme(legend.position = 'bottom')

# Bar chart of the flagged rows by DOC_GENERO.
# The column is named directly inside aes(): ggplot2 discourages
# `data$column` there and warns about it.
ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = DOC_GENERO)) +
  theme_minimal() +
  labs(x = 'Genero', y = 'Numero de Personas', color = NULL) +
  theme(legend.position = 'bottom')

# Bar chart of the flagged rows by COD_REG_RBD.
# The column is named directly inside aes(): ggplot2 discourages
# `data$column` there and warns about it.
ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = COD_REG_RBD)) +
  theme_minimal() +
  labs(x = 'Región', y = 'Numero de Personas', color = NULL) +
  theme(legend.position = 'bottom')

Regresión lineal tipo

# Ordinary least squares fit of the 0/1 change flag on year, school id (RBD),
# RURAL_RBD, HORAS_AULA and birth year.
# NOTE(review): `change` is built to be 1 only on 2013 rows, so AGNO is tied
# to the outcome by construction; confirm it belongs among the regressors.
f01 <- change ~ AGNO + RBD + RURAL_RBD + HORAS_AULA + birth
reg1 <- lm(formula = f01, data = combined_data)
summary(reg1)

## 
## Call:
## lm(formula = f01, data = combined_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.10587 -0.07775 -0.01278  0.00317  0.93905 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  1.616e+02  1.133e+00  142.642  < 2e-16 ***
## AGNO        -8.038e-02  5.631e-04 -142.744  < 2e-16 ***
## RBD          2.348e-07  3.283e-08    7.153 8.53e-13 ***
## RURAL_RBD   -2.245e-03  8.775e-04   -2.559   0.0105 *  
## HORAS_AULA  -6.098e-04  2.127e-05  -28.671  < 2e-16 ***
## birth        1.660e-04  2.350e-05    7.064 1.61e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1909 on 461040 degrees of freedom
## Multiple R-squared:  0.04462,    Adjusted R-squared:  0.04461 
## F-statistic:  4307 on 5 and 461040 DF,  p-value: < 2.2e-16

2019-2020

# Remove every visible (non dot-prefixed) object from the current environment
# before the 2019-2020 analysis, so nothing from the 2013-2014 run is reused.
# NOTE(review): this also wipes whatever else the user had in the workspace
# when the script is sourced interactively.
rm(list = ls())

Carga de bases

# Read the 2019 and 2020 "Docentes" files (semicolon-separated, first row is
# the header) and convert each one to a data.table.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
# NOTE(review): read.csv() documents "latin1" and "UTF-8" as the recognised
# values of `encoding`; confirm that 'Latin-1' has the intended effect.
cargo_19 <- data.table(read.csv(
  '2019/20191009_Docentes_2019_20190630_PUBL.csv',
  header = TRUE, sep = ";", encoding = 'Latin-1'
))

# The path used to end with a blank inside the quotes; a file system that
# treats that blank as part of the name cannot find the file.
cargo_20 <- data.table(read.csv(
  '2020/20200727_Docentes_2020_20200630_PUBL.csv',
  header = TRUE, sep = ";", encoding = 'Latin-1'
))

Combinación de bases

# Stack both years into a single table. fill = TRUE lets the two inputs have
# different column sets; a column missing from one input is filled with NA.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
combined_data <- rbind(cargo_19, cargo_20, fill = TRUE)

Generación de nueva variable

# change = 1 for every 2019 row whose MRUN has no row in 2020, 0 otherwise.
# The helper vectors live inside local() so they do not leak into the workspace.
combined_data$change <- local({
  in_base_year <- combined_data$AGNO == 2019
  next_year_ids <- combined_data$MRUN[combined_data$AGNO == 2020]
  ifelse(in_base_year & !(combined_data$MRUN %in% next_year_ids), 1, 0)
})

# Number of rows flagged in `change` for 2019-2020
sum(combined_data[["change"]])

## [1] 20608

# Share of the 2019 rows that are flagged in `change`. Both counts are taken
# from the data instead of being typed in from an earlier run, and the message
# now names the years this section analyses (it used to say 2013 and 2014).
# NOTE(review): `change` marks 2019 rows whose MRUN is absent from 2020, and a
# MRUN may appear on several rows; confirm that "changed jobs" and a per-row
# count are the intended reading.
n_flagged <- sum(combined_data$change)
n_base_year <- sum(combined_data$AGNO == 2019)
percentage <- (n_flagged / n_base_year) * 100
print(paste0(
  percentage,
  " % de los profesores se han cambiado de trabajo entre 2019 y 2020"
))

# Recorded output of the original run, which used the literals 20608 / 263281
# and the old 2013/2014 wording:
## [1] "7.82737835240674 % de los profesores se han cambiado de trabajo entre 2013 y 2014"

Confirmación de datos

# Keep only the rows flagged as a change (data.table row filter on `change`).
subset_data <- combined_data[change == 1]

Confirmación de valores únicos

# MRUN is unique within the subset exactly when no value occurs twice;
# anyDuplicated() returns 0 in that case.
all_unique <- anyDuplicated(subset_data$MRUN) == 0

# Report the outcome
print(
  if (all_unique) {
    "All values in MRUN are unique."
  } else {
    "There are duplicate values in MRUN."
  }
)

## [1] "There are duplicate values in MRUN."

Valores repetidos

# Every occurrence of an MRUN after its first one
duplicated_values <- subset_data$MRUN[duplicated(subset_data$MRUN)]

# One element was extracted per repeated row, so the number of repeats is the
# length of that vector.
num_repeated <- length(duplicated_values)

# Show the result
print(paste("Número de datos repetidos en MRUN:", num_repeated))

## [1] "Número de datos repetidos en MRUN: 803"

Gráfico de proporción de cambios

# Bar chart with the number of rows for each value of `change`.
# NOTE(review): the x axis carries the `change` values and the y axis the row
# count; confirm that the axis titles below describe them as intended.
graph1 <- ggplot() +
  geom_bar(data = combined_data, mapping = aes(x = change)) +
  labs(x = 'Colegios', y = 'Cambio de colegio', color = NULL) +
  theme_minimal() +
  theme(legend.position = 'bottom')
graph1

Gráficos de cambios por edad

# Birth year: the first four characters of DOC_FEC_NAC, converted to a number
combined_data$birth <- as.numeric(substr(combined_data$DOC_FEC_NAC, 1, 4))

# Birth date: DOC_FEC_NAC parsed as year, month, day and stored as a Date.
# NOTE(review): values that do not match "%Y %m %d" become NA; confirm that
# DOC_FEC_NAC really carries a day component.
combined_data$date_birth <- as.Date(strptime(combined_data$DOC_FEC_NAC, "%Y %m %d"))

# Scatter plot of the change flag against birth year, coloured by DOC_GENERO.
# Columns are named directly inside aes(): ggplot2 discourages `data$column`
# there and warns about it, because the mapping then bypasses the plot data.
ggplot(combined_data, aes(x = birth, y = change, color = DOC_GENERO)) +
  geom_point() +
  theme_minimal() +
  # The x axis holds the birth year, not an age, so the title says so.
  labs(x = 'Nacimiento', y = 'Cambio') +
  theme(legend.position = 'bottom',
        axis.text.x = element_text(angle = 45, hjust = 1))

# Birth year for the flagged rows (first four characters of DOC_FEC_NAC),
# followed by a bar chart of the number of rows per birth year.
subset_data$birth <- as.numeric(
  substr(subset_data$DOC_FEC_NAC, start = 1, stop = 4)
)

ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = birth)) +
  labs(x = 'Nacimiento', y = 'Numero de Personas', color = NULL) +
  theme_minimal() +
  theme(legend.position = 'bottom')

# Bar chart of the flagged rows by RURAL_RBD.
# The column is named directly inside aes(): ggplot2 discourages
# `data$column` there and warns about it.
ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = RURAL_RBD)) +
  theme_minimal() +
  labs(x = 'Rural', y = 'Numero de Personas', color = NULL) +
  theme(legend.position = 'bottom')

# Bar chart of the flagged rows by DOC_GENERO.
# The column is named directly inside aes(): ggplot2 discourages
# `data$column` there and warns about it.
ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = DOC_GENERO)) +
  theme_minimal() +
  labs(x = 'Genero', y = 'Numero de Personas', color = NULL) +
  theme(legend.position = 'bottom')

# Bar chart of the flagged rows by COD_REG_RBD.
# The column is named directly inside aes(): ggplot2 discourages
# `data$column` there and warns about it.
ggplot() +
  geom_bar(data = subset_data, mapping = aes(x = COD_REG_RBD)) +
  theme_minimal() +
  labs(x = 'Región', y = 'Numero de Personas', color = NULL) +
  theme(legend.position = 'bottom')

Regresión lineal tipo

# Ordinary least squares fit of the 0/1 change flag on year, school id (RBD),
# RURAL_RBD, HORAS_AULA and birth year.
# NOTE(review): `change` is built to be 1 only on 2019 rows, so AGNO is tied
# to the outcome by construction; confirm it belongs among the regressors.
f01 <- change ~ AGNO + RBD + RURAL_RBD + HORAS_AULA + birth
reg1 <- lm(formula = f01, data = combined_data)
summary(reg1)

## 
## Call:
## lm(formula = f01, data = combined_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.10188 -0.07586 -0.01022  0.00168  0.94560 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  1.555e+02  1.049e+00  148.177  < 2e-16 ***
## AGNO        -7.713e-02  5.196e-04 -148.446  < 2e-16 ***
## RBD          3.887e-07  2.961e-08   13.128  < 2e-16 ***
## RURAL_RBD   -3.049e-03  7.973e-04   -3.824 0.000131 ***
## HORAS_AULA  -3.994e-04  2.000e-05  -19.973  < 2e-16 ***
## birth        1.725e-04  2.171e-05    7.944 1.96e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1891 on 530183 degrees of freedom
##   (471 observations deleted due to missingness)
## Multiple R-squared:  0.04095,    Adjusted R-squared:  0.04094 
## F-statistic:  4528 on 5 and 530183 DF,  p-value: < 2.2e-16

Untitled

Sebastián Lillo

2023-07-14

Carga de bases

Combinación de bases

Generación de nueva variable

Confirmación de datos

Confirmación de valores únicos

Valores repetidos

Gráfico de proporción de cambios

Gráficos de cambios por edad

Regresión lineal tipo

2019-2020

Carga de bases

Combinación de bases

Generación de nueva variable

Confirmación de datos

Confirmación de valores únicos

Valores repetidos

Gráfico de proporción de cambios

Gráficos de cambios por edad

Regresión lineal tipo