library(readxl)
# Load the "VARIABLES" sheet of the workbook.
variables <- read_excel(
  path = "Base de datos RETO.xlsx",
  sheet = "VARIABLES"
)

# Tally the missing values of every column and display the counts.
na_count <- colSums(is.na(variables))
print(na_count)

##   edad_meses         sexo  escolaridad      estrato    violencia    deprimido 
##            0            0            8            0            0          440 
##  crecimiento      castigo      soledad soledadmenor 
##           10          154            0            0

# Listwise deletion: keep only the rows that have no missing value in any
# column (per the NA counts printed above, `deprimido` alone is missing in
# 440 rows), then show the remaining data.
variables <- na.omit(variables)
print(variables)

## # A tibble: 393 × 10
##    edad_meses sexo   escolaridad estrato violencia deprimido crecimiento castigo
##         <dbl> <chr>        <dbl> <chr>       <dbl>     <dbl>       <dbl> <chr>  
##  1         33 Hombre           0 Rural …         0         0           4 No     
##  2         25 Hombre          12 Urbano…        21         0           0 No     
##  3         35 Hombre          15 Urbano…        21         0           1 No     
##  4         26 Hombre          15 Urbano…         4         0           5 No     
##  5         30 Hombre          15 Urbano…         0         0          12 Sí     
##  6         35 Hombre          15 Urbano…         4         0          10 No     
##  7         31 Hombre          15 Metrop…        10         0           0 No     
##  8         31 Hombre          12 Metrop…        19         0           4 No     
##  9         25 Hombre          12 Metrop…         0        52           4 No     
## 10         32 Hombre           0 Metrop…         8         0           0 No     
## # ℹ 383 more rows
## # ℹ 2 more variables: soledad <dbl>, soledadmenor <dbl>

# Correlation matrix of the numeric variables

numeric_vars <- c("edad_meses", "escolaridad", "violencia", "deprimido", "crecimiento")
cor_matrix <- cor(variables[, numeric_vars])
print(cor_matrix)

##              edad_meses escolaridad   violencia   deprimido crecimiento
## edad_meses   1.00000000 -0.01809124 -0.02381185  0.03037982  0.09759053
## escolaridad -0.01809124  1.00000000  0.02459147 -0.07958463  0.16591401
## violencia   -0.02381185  0.02459147  1.00000000  0.04739506  0.02985501
## deprimido    0.03037982 -0.07958463  0.04739506  1.00000000 -0.08988854
## crecimiento  0.09759053  0.16591401  0.02985501 -0.08988854  1.00000000

# Map a correlation coefficient to a qualitative strength label.
#
# The label depends only on the magnitude |r|, so a strong negative
# correlation (e.g. -0.9) is reported as strong instead of "Muy debil".
#
# @param cor_value A single numeric correlation coefficient.
# @return One of "Muy Fuerte", "Fuerte", "Moderado", "Debil" or "Muy debil";
#   NA_character_ when `cor_value` is NA.
interpret_correlation <- function(cor_value) {
  if (is.na(cor_value)) {
    # An NA coefficient would make the comparisons below fail with an error.
    return(NA_character_)
  }
  strength <- abs(cor_value)
  if (strength > 0.8) {
    "Muy Fuerte"
  } else if (strength > 0.6) {
    "Fuerte"
  } else if (strength > 0.4) {
    "Moderado"
  } else if (strength > 0.2) {
    "Debil"
  } else {
    "Muy debil"
  }
}
# Label every cell of the correlation matrix with its qualitative strength.
interpreted_matrix <- apply(
  X = cor_matrix,
  MARGIN = c(1, 2),
  FUN = interpret_correlation
)
print(interpreted_matrix)

##             edad_meses   escolaridad  violencia    deprimido    crecimiento 
## edad_meses  "Muy Fuerte" "Muy debil"  "Muy debil"  "Muy debil"  "Muy debil" 
## escolaridad "Muy debil"  "Muy Fuerte" "Muy debil"  "Muy debil"  "Muy debil" 
## violencia   "Muy debil"  "Muy debil"  "Muy Fuerte" "Muy debil"  "Muy debil" 
## deprimido   "Muy debil"  "Muy debil"  "Muy debil"  "Muy Fuerte" "Muy debil" 
## crecimiento "Muy debil"  "Muy debil"  "Muy debil"  "Muy debil"  "Muy Fuerte"

# Heat map of the correlation matrix.
# Base R heatmap() has no `annot` or `cmap` arguments (they were forwarded to
# the low-level plotting calls and only produced "not a graphical parameter"
# warnings). The colour ramp is supplied through `col`, and `symm = TRUE`
# treats the square correlation matrix symmetrically, which also switches off
# the default row scaling. The text labels remain available from
# print(interpreted_matrix) above; heatmap() cannot draw them in the cells.
heatmap(cor_matrix,
        symm = TRUE,
        col = colorRampPalette(c("blue", "white", "red"))(50), # diverging color map
        main = "Correlation Matrix with Interpretations")

# Visualización de Datos

# NOTE(review): ggplot2 is not attached in the visible part of this file;
# confirm that library(ggplot2) is called before this point.

# Age distribution by gender (violin plot)
ggplot(data = variables, mapping = aes(x = sexo, y = edad_meses, fill = sexo)) +
  geom_violin() +
  labs(title = "Distribución de Edades Por Género")

# Number of observations per stratum
bar_social_status <- ggplot(data = variables, mapping = aes(x = factor(estrato))) +
  geom_bar() +
  labs(
    title = "Bar Chart: Distribución de Estrato",
    x = "Estrato",
    y = "Conteo"
  )
print(bar_social_status)

# Violence by age (bar heights taken directly from the data)
bar_age_violence <- ggplot(data = variables, mapping = aes(x = edad_meses, y = violencia)) +
  geom_col(position = "dodge") +
  labs(
    title = "Grouped Bar Chart: Violencia por Edad",
    x = "Edad en meses",
    y = "Violencia"
  )
print(bar_age_violence)

# Distribution of depression hours
hist_depression <- ggplot(data = variables, mapping = aes(x = deprimido)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "white", alpha = 0.7) +
  labs(
    title = "Histogram: Depression Hours",
    x = "Depression Hours",
    y = "Count"
  )
print(hist_depression)

# Depression hours by gender
box_gender_depression <- ggplot(
  data = variables,
  mapping = aes(x = sexo, y = deprimido, fill = sexo)
) +
  geom_boxplot() +
  labs(
    title = "Box Plot: Depression Hours by Gender",
    x = "Gender",
    y = "Depression Hours"
  )
print(box_gender_depression)

# Share of each depression value within the punishment groups (yes/no)
bar_punishment_depression <- ggplot(
  data = variables,
  mapping = aes(x = castigo, fill = factor(deprimido))
) +
  geom_bar(position = "fill") +
  labs(
    title = "Stacked Bar Chart: Depression por castigo (si/no)",
    x = "Punishment",
    y = "Proportion"
  )
print(bar_punishment_depression)

# Linear Regression - DEPRIMIDO

# Ordinary least squares fit of `deprimido` on all other predictors.
lm_model <- lm(
  formula = deprimido ~ edad_meses + escolaridad + estrato + violencia +
    crecimiento + soledad + soledadmenor,
  data = variables
)
summary(lm_model)

## 
## Call:
## lm(formula = deprimido ~ edad_meses + escolaridad + estrato + 
##     violencia + crecimiento + soledad + soledadmenor, data = variables)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -44.60 -20.46 -14.78  -5.98 357.26 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)          25.0060    19.4750   1.284    0.200
## edad_meses            0.2585     0.3362   0.769    0.443
## escolaridad          -1.3529     1.0021  -1.350    0.178
## estratoRural  ( <25 -20.3519    27.6223  -0.737    0.462
## estratoUrbano ( 250   4.5690     7.6478   0.597    0.551
## violencia             0.3931     0.4359   0.902    0.368
## crecimiento          -1.1766     0.7598  -1.549    0.122
## soledad              -0.2351     1.9669  -0.120    0.905
## soledadmenor         -0.0347     1.6315  -0.021    0.983
## 
## Residual standard error: 66.03 on 384 degrees of freedom
## Multiple R-squared:  0.01904,    Adjusted R-squared:  -0.001396 
## F-statistic: 0.9317 on 8 and 384 DF,  p-value: 0.49

# Report, coefficient by coefficient, whether its t-test p-value falls below
# the significance level alpha.
summary_result <- summary(lm_model)
p_values <- summary_result$coefficients[, "Pr(>|t|)"]
alpha <- 0.05
for (idx in seq_along(p_values)) {
  verdict <- if (p_values[idx] < alpha) {
    "is statistically significant (p-value <"
  } else {
    "is not statistically significant (p-value >="
  }
  cat(names(p_values)[idx], verdict, alpha, ")\n")
}

## (Intercept) is not statistically significant (p-value >= 0.05 )
## edad_meses is not statistically significant (p-value >= 0.05 )
## escolaridad is not statistically significant (p-value >= 0.05 )
## estratoRural  ( <25 is not statistically significant (p-value >= 0.05 )
## estratoUrbano ( 250 is not statistically significant (p-value >= 0.05 )
## violencia is not statistically significant (p-value >= 0.05 )
## crecimiento is not statistically significant (p-value >= 0.05 )
## soledad is not statistically significant (p-value >= 0.05 )
## soledadmenor is not statistically significant (p-value >= 0.05 )

# Linear Regression - VIOLENCIA

# Ordinary least squares fit of `violencia` on all other predictors.
lm_model2 <- lm(
  formula = violencia ~ edad_meses + escolaridad + estrato + deprimido +
    crecimiento + soledad + soledadmenor,
  data = variables
)
summary(lm_model2)

## 
## Call:
## lm(formula = violencia ~ edad_meses + escolaridad + estrato + 
##     deprimido + crecimiento + soledad + soledadmenor, data = variables)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.401  -5.907  -3.057   3.208  17.371 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          6.373743   2.259065   2.821  0.00503 **
## edad_meses          -0.021207   0.039336  -0.539  0.59011   
## escolaridad          0.098444   0.117364   0.839  0.40211   
## estratoRural  ( <25 -1.082388   3.232057  -0.335  0.73789   
## estratoUrbano ( 250  2.241118   0.887433   2.525  0.01196 * 
## deprimido            0.005377   0.005961   0.902  0.36768   
## crecimiento          0.078810   0.089036   0.885  0.37663   
## soledad             -0.081812   0.229989  -0.356  0.72224   
## soledadmenor        -0.570494   0.188559  -3.026  0.00265 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.722 on 384 degrees of freedom
## Multiple R-squared:  0.0477, Adjusted R-squared:  0.02787 
## F-statistic: 2.405 on 8 and 384 DF,  p-value: 0.01532

# Report, coefficient by coefficient, whether its t-test p-value falls below
# the significance level alpha.
summary_result2 <- summary(lm_model2)
p_values2 <- summary_result2$coefficients[, "Pr(>|t|)"]
alpha <- 0.05
for (idx in seq_along(p_values2)) {
  verdict <- if (p_values2[idx] < alpha) {
    "is statistically significant (p-value <"
  } else {
    "is not statistically significant (p-value >="
  }
  cat(names(p_values2)[idx], verdict, alpha, ")\n")
}

## (Intercept) is statistically significant (p-value < 0.05 )
## edad_meses is not statistically significant (p-value >= 0.05 )
## escolaridad is not statistically significant (p-value >= 0.05 )
## estratoRural  ( <25 is not statistically significant (p-value >= 0.05 )
## estratoUrbano ( 250 is statistically significant (p-value < 0.05 )
## deprimido is not statistically significant (p-value >= 0.05 )
## crecimiento is not statistically significant (p-value >= 0.05 )
## soledad is not statistically significant (p-value >= 0.05 )
## soledadmenor is statistically significant (p-value < 0.05 )

# Linear Regression - CRECIMIENTO

# Ordinary least squares fit of `crecimiento` on all other predictors.
lm_model3 <- lm(
  formula = crecimiento ~ edad_meses + escolaridad + estrato + deprimido +
    violencia + soledad + soledadmenor,
  data = variables
)
summary(lm_model3)

## 
## Call:
## lm(formula = crecimiento ~ edad_meses + escolaridad + estrato + 
##     deprimido + violencia + soledad + soledadmenor, data = variables)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.216 -3.655 -1.153  3.684 19.579 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          0.670655   1.306359   0.513  0.60798   
## edad_meses           0.045145   0.022413   2.014  0.04468 * 
## escolaridad          0.207143   0.066425   3.118  0.00196 **
## estratoRural  ( <25  0.511338   1.850660   0.276  0.78247   
## estratoUrbano ( 250 -0.993561   0.509803  -1.949  0.05203 . 
## deprimido           -0.005275   0.003406  -1.549  0.12228   
## violencia            0.025837   0.029189   0.885  0.37663   
## soledad             -0.181846   0.131379  -1.384  0.16712   
## soledadmenor         0.060336   0.109199   0.553  0.58090   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.421 on 384 degrees of freedom
## Multiple R-squared:  0.05919,    Adjusted R-squared:  0.03959 
## F-statistic:  3.02 on 8 and 384 DF,  p-value: 0.002662

# Report, coefficient by coefficient, whether its t-test p-value falls below
# the significance level alpha.
summary_result3 <- summary(lm_model3)
p_values3 <- summary_result3$coefficients[, "Pr(>|t|)"]
alpha <- 0.05
for (idx in seq_along(p_values3)) {
  verdict <- if (p_values3[idx] < alpha) {
    "is statistically significant (p-value <"
  } else {
    "is not statistically significant (p-value >="
  }
  cat(names(p_values3)[idx], verdict, alpha, ")\n")
}

## (Intercept) is not statistically significant (p-value >= 0.05 )
## edad_meses is statistically significant (p-value < 0.05 )
## escolaridad is statistically significant (p-value < 0.05 )
## estratoRural  ( <25 is not statistically significant (p-value >= 0.05 )
## estratoUrbano ( 250 is not statistically significant (p-value >= 0.05 )
## deprimido is not statistically significant (p-value >= 0.05 )
## violencia is not statistically significant (p-value >= 0.05 )
## soledad is not statistically significant (p-value >= 0.05 )
## soledadmenor is not statistically significant (p-value >= 0.05 )

# Multiple Regression - DEPRIMIDO

# Multiple regression of `deprimido`; the formula is the same one used for
# `lm_model` earlier in this file.
multi_model <- lm(
  formula = deprimido ~ edad_meses + escolaridad + estrato + violencia +
    crecimiento + soledad + soledadmenor,
  data = variables
)
summary(multi_model)

## 
## Call:
## lm(formula = deprimido ~ edad_meses + escolaridad + estrato + 
##     violencia + crecimiento + soledad + soledadmenor, data = variables)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -44.60 -20.46 -14.78  -5.98 357.26 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)          25.0060    19.4750   1.284    0.200
## edad_meses            0.2585     0.3362   0.769    0.443
## escolaridad          -1.3529     1.0021  -1.350    0.178
## estratoRural  ( <25 -20.3519    27.6223  -0.737    0.462
## estratoUrbano ( 250   4.5690     7.6478   0.597    0.551
## violencia             0.3931     0.4359   0.902    0.368
## crecimiento          -1.1766     0.7598  -1.549    0.122
## soledad              -0.2351     1.9669  -0.120    0.905
## soledadmenor         -0.0347     1.6315  -0.021    0.983
## 
## Residual standard error: 66.03 on 384 degrees of freedom
## Multiple R-squared:  0.01904,    Adjusted R-squared:  -0.001396 
## F-statistic: 0.9317 on 8 and 384 DF,  p-value: 0.49

# Report, coefficient by coefficient, whether its t-test p-value falls below
# the significance level alpha_multi.
summary_result_multi <- summary(multi_model)
p_values_multi <- summary_result_multi$coefficients[, "Pr(>|t|)"]
alpha_multi <- 0.05
for (idx in seq_along(p_values_multi)) {
  verdict <- if (p_values_multi[idx] < alpha_multi) {
    "is statistically significant (p-value <"
  } else {
    "is not statistically significant (p-value >="
  }
  cat(names(p_values_multi)[idx], verdict, alpha_multi, ")\n")
}

## (Intercept) is not statistically significant (p-value >= 0.05 )
## edad_meses is not statistically significant (p-value >= 0.05 )
## escolaridad is not statistically significant (p-value >= 0.05 )
## estratoRural  ( <25 is not statistically significant (p-value >= 0.05 )
## estratoUrbano ( 250 is not statistically significant (p-value >= 0.05 )
## violencia is not statistically significant (p-value >= 0.05 )
## crecimiento is not statistically significant (p-value >= 0.05 )
## soledad is not statistically significant (p-value >= 0.05 )
## soledadmenor is not statistically significant (p-value >= 0.05 )

# Multiple Regression - VIOLENCIA

# Multiple regression of `violencia`; the formula is the same one used for
# `lm_model2` earlier in this file.
multi_model2 <- lm(
  formula = violencia ~ edad_meses + escolaridad + estrato + deprimido +
    crecimiento + soledad + soledadmenor,
  data = variables
)
summary(multi_model2)

## 
## Call:
## lm(formula = violencia ~ edad_meses + escolaridad + estrato + 
##     deprimido + crecimiento + soledad + soledadmenor, data = variables)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.401  -5.907  -3.057   3.208  17.371 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          6.373743   2.259065   2.821  0.00503 **
## edad_meses          -0.021207   0.039336  -0.539  0.59011   
## escolaridad          0.098444   0.117364   0.839  0.40211   
## estratoRural  ( <25 -1.082388   3.232057  -0.335  0.73789   
## estratoUrbano ( 250  2.241118   0.887433   2.525  0.01196 * 
## deprimido            0.005377   0.005961   0.902  0.36768   
## crecimiento          0.078810   0.089036   0.885  0.37663   
## soledad             -0.081812   0.229989  -0.356  0.72224   
## soledadmenor        -0.570494   0.188559  -3.026  0.00265 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.722 on 384 degrees of freedom
## Multiple R-squared:  0.0477, Adjusted R-squared:  0.02787 
## F-statistic: 2.405 on 8 and 384 DF,  p-value: 0.01532

# Report, coefficient by coefficient, whether its t-test p-value falls below
# the significance level alpha_multi.
summary_result_multi2 <- summary(multi_model2)
p_values_multi2 <- summary_result_multi2$coefficients[, "Pr(>|t|)"]
alpha_multi <- 0.05
for (idx in seq_along(p_values_multi2)) {
  verdict <- if (p_values_multi2[idx] < alpha_multi) {
    "is statistically significant (p-value <"
  } else {
    "is not statistically significant (p-value >="
  }
  cat(names(p_values_multi2)[idx], verdict, alpha_multi, ")\n")
}

## (Intercept) is statistically significant (p-value < 0.05 )
## edad_meses is not statistically significant (p-value >= 0.05 )
## escolaridad is not statistically significant (p-value >= 0.05 )
## estratoRural  ( <25 is not statistically significant (p-value >= 0.05 )
## estratoUrbano ( 250 is statistically significant (p-value < 0.05 )
## deprimido is not statistically significant (p-value >= 0.05 )
## crecimiento is not statistically significant (p-value >= 0.05 )
## soledad is not statistically significant (p-value >= 0.05 )
## soledadmenor is statistically significant (p-value < 0.05 )

# Multiple Regression - CRECIMIENTO

# Multiple regression of `crecimiento`; the formula is the same one used for
# `lm_model3` earlier in this file.
multi_model3 <- lm(
  formula = crecimiento ~ edad_meses + escolaridad + estrato + deprimido +
    violencia + soledad + soledadmenor,
  data = variables
)
summary(multi_model3)

## 
## Call:
## lm(formula = crecimiento ~ edad_meses + escolaridad + estrato + 
##     deprimido + violencia + soledad + soledadmenor, data = variables)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.216 -3.655 -1.153  3.684 19.579 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          0.670655   1.306359   0.513  0.60798   
## edad_meses           0.045145   0.022413   2.014  0.04468 * 
## escolaridad          0.207143   0.066425   3.118  0.00196 **
## estratoRural  ( <25  0.511338   1.850660   0.276  0.78247   
## estratoUrbano ( 250 -0.993561   0.509803  -1.949  0.05203 . 
## deprimido           -0.005275   0.003406  -1.549  0.12228   
## violencia            0.025837   0.029189   0.885  0.37663   
## soledad             -0.181846   0.131379  -1.384  0.16712   
## soledadmenor         0.060336   0.109199   0.553  0.58090   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.421 on 384 degrees of freedom
## Multiple R-squared:  0.05919,    Adjusted R-squared:  0.03959 
## F-statistic:  3.02 on 8 and 384 DF,  p-value: 0.002662

# Report, coefficient by coefficient, whether its t-test p-value falls below
# the significance level alpha_multi.
summary_result_multi3 <- summary(multi_model3)
p_values_multi3 <- summary_result_multi3$coefficients[, "Pr(>|t|)"]
alpha_multi <- 0.05
for (idx in seq_along(p_values_multi3)) {
  verdict <- if (p_values_multi3[idx] < alpha_multi) {
    "is statistically significant (p-value <"
  } else {
    "is not statistically significant (p-value >="
  }
  cat(names(p_values_multi3)[idx], verdict, alpha_multi, ")\n")
}

## (Intercept) is not statistically significant (p-value >= 0.05 )
## edad_meses is statistically significant (p-value < 0.05 )
## escolaridad is statistically significant (p-value < 0.05 )
## estratoRural  ( <25 is not statistically significant (p-value >= 0.05 )
## estratoUrbano ( 250 is not statistically significant (p-value >= 0.05 )
## deprimido is not statistically significant (p-value >= 0.05 )
## violencia is not statistically significant (p-value >= 0.05 )
## soledad is not statistically significant (p-value >= 0.05 )
## soledadmenor is not statistically significant (p-value >= 0.05 )

# ANOVA - DEPRIMIDO

# Sequential ANOVA table for `deprimido`; terms are tested in the order in
# which they appear in the formula.
anova_model <- aov(
  formula = deprimido ~ edad_meses + escolaridad + estrato + violencia +
    crecimiento + soledad + soledadmenor,
  data = variables
)
summary(anova_model)

##               Df  Sum Sq Mean Sq F value Pr(>F)
## edad_meses     1    1575    1575   0.361  0.548
## escolaridad    1   10664   10664   2.446  0.119
## estrato        2    6543    3272   0.750  0.473
## violencia      1    3234    3234   0.742  0.390
## crecimiento    1   10404   10404   2.386  0.123
## soledad        1      72      72   0.017  0.897
## soledadmenor   1       2       2   0.000  0.983
## Residuals    384 1674185    4360

# Overall model F-test p-value.
# The first row of the ANOVA table is only the sequential test for the first
# term (`edad_meses`), so it must not be reported as the overall p-value.
# The F statistic of the whole model is taken from the linear-model summary
# of the same fit instead.
f_stat <- summary.lm(anova_model)$fstatistic
p_value_overall <- unname(
  pf(f_stat["value"], f_stat["numdf"], f_stat["dendf"], lower.tail = FALSE)
)
cat("Overall p-value:", p_value_overall, "\n")

# Rendered output must be regenerated. The value should agree with the
# F-statistic p-value (0.49) shown by the lm summary of the same formula
# earlier in this file.

# ANOVA - CRECIMIENTO

# Sequential ANOVA table for `crecimiento`; terms are tested in the order in
# which they appear in the formula.
anova_model2 <- aov(
  formula = crecimiento ~ edad_meses + escolaridad + estrato + violencia +
    deprimido + soledad + soledadmenor,
  data = variables
)
summary(anova_model2)

##               Df Sum Sq Mean Sq F value   Pr(>F)    
## edad_meses     1     76   75.98   3.887 0.049371 *  
## escolaridad    1    224  224.39  11.480 0.000776 ***
## estrato        2     73   36.66   1.875 0.154688    
## violencia      1     13   13.30   0.680 0.409979    
## deprimido      1     47   46.88   2.398 0.122276    
## soledad        1     32   32.39   1.657 0.198776    
## soledadmenor   1      6    5.97   0.305 0.580903    
## Residuals    384   7506   19.55                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Overall model F-test p-value.
# The first row of the ANOVA table is only the sequential test for the first
# term (`edad_meses`), so it must not be reported as the overall p-value.
# The F statistic of the whole model is taken from the linear-model summary
# of the same fit instead.
f_stat2 <- summary.lm(anova_model2)$fstatistic
p_value_overall2 <- unname(
  pf(f_stat2["value"], f_stat2["numdf"], f_stat2["dendf"], lower.tail = FALSE)
)
cat("Overall p-value:", p_value_overall2, "\n")

# Rendered output must be regenerated. The value should agree with the
# F-statistic p-value (0.002662) shown by the lm summary of `crecimiento`
# on the same predictors earlier in this file.

# ANOVA - VIOLENCIA

# Sequential ANOVA table for `violencia`; terms are tested in the order in
# which they appear in the formula.
anova_model3 <- aov(
  formula = violencia ~ edad_meses + escolaridad + estrato + crecimiento +
    deprimido + soledad + soledadmenor,
  data = variables
)
summary(anova_model3)

##               Df Sum Sq Mean Sq F value  Pr(>F)   
## edad_meses     1     14    13.6   0.229 0.63281   
## escolaridad    1     14    14.0   0.235 0.62778   
## estrato        2    399   199.3   3.342 0.03638 * 
## crecimiento    1     41    41.3   0.693 0.40578   
## deprimido      1     52    52.4   0.879 0.34899   
## soledad        1     81    81.2   1.362 0.24400   
## soledadmenor   1    546   545.8   9.154 0.00265 **
## Residuals    384  22896    59.6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Overall model F-test p-value.
# The first row of the ANOVA table is only the sequential test for the first
# term (`edad_meses`), so it must not be reported as the overall p-value.
# The F statistic of the whole model is taken from the linear-model summary
# of the same fit instead.
f_stat3 <- summary.lm(anova_model3)$fstatistic
p_value_overall3 <- unname(
  pf(f_stat3["value"], f_stat3["numdf"], f_stat3["dendf"], lower.tail = FALSE)
)
cat("Overall p-value:", p_value_overall3, "\n")

# Rendered output must be regenerated. The value should agree with the
# F-statistic p-value (0.01532) shown by the lm summary of `violencia`
# on the same predictors earlier in this file.

# ANOVA - VIOLENCIA (reduced model: without crecimiento and deprimido)

# ANOVA for `violencia` without the `crecimiento` and `deprimido` terms.
# Note: this reuses the name `anova_model`, replacing the `deprimido` ANOVA
# object created earlier in the file.
anova_model <- aov(
  formula = violencia ~ edad_meses + escolaridad + estrato + soledad +
    soledadmenor,
  data = variables
)
summary(anova_model)

##               Df Sum Sq Mean Sq F value  Pr(>F)   
## edad_meses     1     14    13.6   0.229 0.63258   
## escolaridad    1     14    14.0   0.236 0.62755   
## estrato        2    399   199.3   3.347 0.03622 * 
## soledad        1     90    90.1   1.513 0.21946   
## soledadmenor   1    542   542.2   9.106 0.00272 **
## Residuals    386  22984    59.5                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Histogram

# Distribution of `deprimido`, asking hist() for about 20 bins.
hist(variables$deprimido, breaks = 20, col = "skyblue", main = "Histograma de deprimido")
# Mark the median of `deprimido` with a dashed (lty = 2) red vertical line.
abline(v = median(variables$deprimido), col = "red", lty = 2)

# Logistic Regression

# Frequency of each reported value of `deprimido`.
table(variables$deprimido)

## 
##   0  12  52 365 
## 339  13  28  13

# Binary outcome: 1 when any depressed time was reported, 0 otherwise.
# The previous cut-off (`> 365`) could never be met because 365 is the largest
# observed value (see the table above); that produced an all-zero response on
# which the logistic regression cannot converge.
# NOTE: the glm output rendered further down in this file was produced with
# the old all-zero response and must be regenerated.
variables$binary_deprimido <- ifelse(variables$deprimido > 0, 1, 0)
table(variables$binary_deprimido)

## 
##   0   1 
## 339  54

# Logistic regression of the binary depression indicator on the predictors.
logistic_model_deprimido <- glm(
  formula = binary_deprimido ~ edad_meses + escolaridad + estrato + violencia +
    crecimiento + soledad + soledadmenor,
  family = binomial,
  data = variables
)

## Warning: glm.fit: algorithm did not converge

summary(logistic_model_deprimido)

## 
## Call:
## glm(formula = binary_deprimido ~ edad_meses + escolaridad + estrato + 
##     violencia + crecimiento + soledad + soledadmenor, family = binomial, 
##     data = variables)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -2.409e-06  -2.409e-06  -2.409e-06  -2.409e-06  -2.409e-06  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)         -2.657e+01  1.050e+05       0        1
## edad_meses          -1.547e-15  1.813e+03       0        1
## escolaridad         -1.180e-14  5.405e+03       0        1
## estratoRural  ( <25  1.019e-12  1.490e+05       0        1
## estratoUrbano ( 250  4.711e-16  4.125e+04       0        1
## violencia           -1.174e-15  2.351e+03       0        1
## crecimiento          1.461e-15  4.098e+03       0        1
## soledad              1.221e-15  1.061e+04       0        1
## soledadmenor        -2.130e-15  8.799e+03       0        1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 0.00e+00  on 392  degrees of freedom
## Residual deviance: 2.28e-09  on 384  degrees of freedom
## AIC: 18
## 
## Number of Fisher Scoring iterations: 25

# Interpretation of the logistic regression
# Note: this reuses the name `summary_result`, replacing the lm summary object
# stored under that name earlier in the file.

summary_result <- summary(logistic_model_deprimido)

# Interpretation of Null Deviance
cat("Null Deviance:\n")

## Null Deviance:

cat("The null deviance measures how well the model explains the response variable without any predictors.\n")

## The null deviance measures how well the model explains the response variable without any predictors.

# The null deviance is the deviance of the null model itself, so it cannot
# show that "the model" beats a null model; the informative quantity is the
# drop from null deviance to residual deviance.
cat("A large drop from the null deviance to the residual deviance indicates that the predictors improve on a null model with no predictors.\n")

## A large drop from the null deviance to the residual deviance indicates that the predictors improve on a null model with no predictors.

cat("Null Deviance:", summary_result$null.deviance, "on", summary_result$df.null, "degrees of freedom\n\n")

## Null Deviance: 0 on 392 degrees of freedom

# Interpretation of Residual Deviance
cat("Residual Deviance:\n")

## Residual Deviance:

cat("The residual deviance measures how well the model explains the response variable after including predictors.\n")

## The residual deviance measures how well the model explains the response variable after including predictors.

cat("A smaller residual deviance indicates a better fit of the model.\n")

## A smaller residual deviance indicates a better fit of the model.

cat("Residual Deviance:", summary_result$deviance, "on", summary_result$df.residual, "degrees of freedom\n\n")

## Residual Deviance: 2.280022e-09 on 384 degrees of freedom

# Interpretation of AIC
cat("AIC (Akaike Information Criterion):\n")

## AIC (Akaike Information Criterion):

cat("AIC is a measure of the model's goodness of fit, balancing the fit of the model with its complexity (number of parameters).\n")

## AIC is a measure of the model's goodness of fit, balancing the fit of the model with its complexity (number of parameters).

cat("A lower AIC indicates a better-fitting model.\n")

## A lower AIC indicates a better-fitting model.

cat("AIC:", summary_result$aic, "\n\n")

## AIC: 18

# Number of Fisher Scoring Iterations
cat("Number of Fisher Scoring Iterations:\n")

## Number of Fisher Scoring Iterations:

cat("The number of Fisher scoring iterations indicates how many iterations were performed by the optimization algorithm to find the maximum likelihood estimates of the model parameters.\n")

## The number of Fisher scoring iterations indicates how many iterations were performed by the optimization algorithm to find the maximum likelihood estimates of the model parameters.

cat("Number of Fisher Scoring Iterations:", summary_result$iter, "\n")

## Number of Fisher Scoring Iterations: 25

# Predictive model - DEPRIMIDO

print(paste("Modelo Predictivo - DEPRIMIDO"))

## [1] "Modelo Predictivo - DEPRIMIDO"

# Keep only the modelling columns and the rows without missing values.
model_cols <- c("deprimido", "edad_meses", "escolaridad", "estrato", "soledad", "soledadmenor", "violencia", "crecimiento")
df_cleaned <- variables[, model_cols]
df_cleaned <- df_cleaned[complete.cases(df_cleaned), ]

# Reproducible 80/20 train/test split.
set.seed(123)
n_rows <- nrow(df_cleaned)
split_index <- sample(seq_len(n_rows), 0.8 * n_rows)
train_data <- df_cleaned[split_index, ]
test_data <- df_cleaned[-split_index, ]

# NOTE(review): unlike the other predictive models in this file, this formula
# leaves out `soledad` and `soledadmenor` although they are selected above;
# confirm this is intended.
lm_model <- lm(
  formula = deprimido ~ edad_meses + escolaridad + estrato + violencia + crecimiento,
  data = train_data
)
predictions <- predict(lm_model, newdata = test_data)

# Error metrics on the held-out rows.
test_errors <- test_data$deprimido - predictions
mse <- mean(test_errors^2)
mae <- mean(abs(test_errors))
rmse <- sqrt(mse)

# R-squared values come from the fit on the training rows.
fit_summary <- summary(lm_model)
r_squared <- fit_summary$r.squared
adjusted_r_squared <- fit_summary$adj.r.squared

# Print the evaluation metrics of the DEPRIMIDO model.
print(paste("Model Evaluation Metrics:"))

## [1] "Model Evaluation Metrics:"

print(paste("Mean Squared Error (MSE): ", mse))

## [1] "Mean Squared Error (MSE):  3515.18060247854"

print(paste("Mean Absolute Error (MAE): ", mae))

## [1] "Mean Absolute Error (MAE):  26.7148500999942"

print(paste("Root Mean Squared Error (RMSE): ", rmse))

## [1] "Root Mean Squared Error (RMSE):  59.288958520778"

print(paste("R-squared: ", r_squared))

## [1] "R-squared:  0.0230286612368626"

print(paste("Adjusted R-squared: ", adjusted_r_squared))

## [1] "Adjusted R-squared:  0.00393475885061223"

# Typical size of a prediction error, in units of `deprimido`.
accuracy_note <- if (mse == 0) {
  "The model perfectly predicts the 'deprimido' variable."
} else {
  paste("En promedio, las predicciones del modelo están equivocadas por aproximadamente", round(sqrt(mse), 2), "unidades en la escala de 'deprimido.'")
}
print(accuracy_note)

## [1] "En promedio, las predicciones del modelo están equivocadas por aproximadamente 59.29 unidades en la escala de 'deprimido.'"

# Qualitative reading of the RMSE (threshold: 10).
cat(if (rmse < 10) {
  "El RMSE es bajo, sugiriendo que el modelo tiene un buen ajuste.\n"
} else {
  "El RMSE es alto, indicando que el modelo puede tener limitaciones.\n"
})

## El RMSE es alto, indicando que el modelo puede tener limitaciones.

# Qualitative reading of R-squared (threshold: 0.5).
cat(if (r_squared > 0.5) {
  "El R-squared es alto, indicando una buena proporción de varianza explicada por el modelo.\n"
} else {
  "El R-squared is bajo, lo que sugiere que el modelo puede no capturar gran parte de la varianza en los datos.\n"
})

## El R-squared is bajo, lo que sugiere que el modelo puede no capturar gran parte de la varianza en los datos.

# Qualitative reading of the adjusted R-squared (threshold: 0.5).
cat(if (adjusted_r_squared > 0.5) {
  "El Adjusted R-squared es alto, lo cual es favorable al considerar el número de predictores.\n"
} else {
  "El Adjusted R-squared es bajo, sugiriendo que predictores adicionales pueden mejorar el modelo.\n"
})

## El Adjusted R-squared es bajo, sugiriendo que predictores adicionales pueden mejorar el modelo.

# Predictive model - VIOLENCIA
#
# Fits a linear model for `violencia` on an 80/20 train/test split and reports
# its metrics. Fixes over the previous version: the RMSE is derived from the
# MSE (it was sqrt(MAE)), and every print/threshold check uses this model's
# own values (mae2, rmse2, r_squared2, adjusted_r_squared2) instead of the
# DEPRIMIDO model's mae, rmse, r_squared and adjusted_r_squared.

print(paste("Modelo Predictivo - VIOLENCIA"))

## [1] "Modelo Predictivo - VIOLENCIA"

# Keep only the modelling columns and the rows without missing values.
df_cleaned <- variables[, c("deprimido", "edad_meses", "escolaridad", "estrato", "soledad", "soledadmenor", "violencia", "crecimiento")]
df_cleaned <- df_cleaned[complete.cases(df_cleaned), ]

# Reproducible 80/20 train/test split (same seed, hence the same split).
set.seed(123)
split_index <- sample(seq_len(nrow(df_cleaned)), 0.8 * nrow(df_cleaned))
train_data <- df_cleaned[split_index, ]
test_data <- df_cleaned[-split_index, ]

lm_model2 <- lm(violencia ~ edad_meses + escolaridad + estrato + deprimido + crecimiento + soledad + soledadmenor, data = train_data)
predictions2 <- predict(lm_model2, newdata = test_data)

# Held-out error metrics; R-squared values come from the training fit.
mse <- mean((test_data$violencia - predictions2)^2)
r_squared2 <- summary(lm_model2)$r.squared
adjusted_r_squared2 <- summary(lm_model2)$adj.r.squared
mae2 <- mean(abs(test_data$violencia - predictions2))
rmse2 <- sqrt(mse)


print(paste("Model Evaluation Metrics:"))

## [1] "Model Evaluation Metrics:"

print(paste("Mean Squared Error (MSE): ", mse))

## [1] "Mean Squared Error (MSE):  55.7045582737328"

print(paste("Mean Absolute Error (MAE): ", mae2))

# Rendered output must be regenerated: the previous run printed the MAE of
# the DEPRIMIDO model here.

print(paste("Root Mean Squared Error (RMSE): ", rmse2))

# Rendered output must be regenerated: the previous run printed the RMSE of
# the DEPRIMIDO model here. Expect about 7.46, the square root of the MSE.

print(paste("R-squared: ", r_squared2))

## [1] "R-squared:  0.0472922481856356"

print(paste("Adjusted R-squared: ", adjusted_r_squared2))

## [1] "Adjusted R-squared:  0.0223031924003408"

if (mse == 0) {
  print("The model perfectly predicts the 'violencia' variable.")
} else {
  print(paste("En promedio, las predicciones del modelo están equivocadas por aproximadamente", round(sqrt(mse), 2), "unidades en la escala de 'violencia'"))
}

## [1] "En promedio, las predicciones del modelo están equivocadas por aproximadamente 7.46 unidades en la escala de 'violencia'"

if (rmse2 < 10) {
  cat("El RMSE es bajo, sugiriendo que el modelo tiene un buen ajuste.\n")
} else {
  cat("El RMSE es alto, indicando que el modelo puede tener limitaciones.\n")
}

## El RMSE es bajo, sugiriendo que el modelo tiene un buen ajuste.

if (r_squared2 > 0.5) {
  cat("El R-squared es alto, indicando una buena proporción de varianza explicada por el modelo.\n")
} else {
  cat("El R-squared is bajo, lo que sugiere que el modelo puede no capturar gran parte de la varianza en los datos.\n")
}

## El R-squared is bajo, lo que sugiere que el modelo puede no capturar gran parte de la varianza en los datos.

if (adjusted_r_squared2 > 0.5) {
  cat("El Adjusted R-squared es alto, lo cual es favorable al considerar el número de predictores.\n")
} else {
  cat("El Adjusted R-squared es bajo, sugiriendo que predictores adicionales pueden mejorar el modelo.\n")
}

## El Adjusted R-squared es bajo, sugiriendo que predictores adicionales pueden mejorar el modelo.

# Predictive model - CRECIMIENTO
#
# Fits a linear model for `crecimiento` on an 80/20 train/test split and
# reports its metrics. Fixes over the previous version: the RMSE is derived
# from the MSE (it was sqrt(MAE)); every print/threshold check uses this
# model's own values (mae3, rmse3, r_squared3, adjusted_r_squared3) instead of
# the DEPRIMIDO model's; and the messages name 'crecimiento' rather than
# 'violencia'.

print(paste("Modelo Predictivo - CRECIMIENTO"))

## [1] "Modelo Predictivo - CRECIMIENTO"

# Keep only the modelling columns and the rows without missing values.
df_cleaned <- variables[, c("deprimido", "edad_meses", "escolaridad", "estrato", "soledad", "soledadmenor", "violencia", "crecimiento")]
df_cleaned <- df_cleaned[complete.cases(df_cleaned), ]

# Reproducible 80/20 train/test split (same seed, hence the same split).
set.seed(123)
split_index <- sample(seq_len(nrow(df_cleaned)), 0.8 * nrow(df_cleaned))
train_data <- df_cleaned[split_index, ]
test_data <- df_cleaned[-split_index, ]

lm_model3 <- lm(crecimiento ~ edad_meses + escolaridad + estrato + deprimido + violencia + soledad + soledadmenor, data = train_data)
predictions3 <- predict(lm_model3, newdata = test_data)

# Held-out error metrics; R-squared values come from the training fit.
mse <- mean((test_data$crecimiento - predictions3)^2)
r_squared3 <- summary(lm_model3)$r.squared
adjusted_r_squared3 <- summary(lm_model3)$adj.r.squared
mae3 <- mean(abs(test_data$crecimiento - predictions3))
rmse3 <- sqrt(mse)

print(paste("Model Evaluation Metrics:"))

## [1] "Model Evaluation Metrics:"

print(paste("Mean Squared Error (MSE): ", mse))

## [1] "Mean Squared Error (MSE):  16.5057489199769"

print(paste("Mean Absolute Error (MAE): ", mae3))

# Rendered output must be regenerated: the previous run printed the MAE of
# the DEPRIMIDO model here.

print(paste("Root Mean Squared Error (RMSE): ", rmse3))

# Rendered output must be regenerated: the previous run printed the RMSE of
# the DEPRIMIDO model here. Expect about 4.06, the square root of the MSE.

print(paste("R-squared: ", r_squared3))

## [1] "R-squared:  0.0620773525638316"

print(paste("Adjusted R-squared: ", adjusted_r_squared3))

## [1] "Adjusted R-squared:  0.0374761027950142"

if (mse == 0) {
  print("The model perfectly predicts the 'crecimiento' variable.")
} else {
  print(paste("En promedio, las predicciones del modelo están equivocadas por aproximadamente", round(sqrt(mse), 2), "unidades en la escala de 'crecimiento'"))
}

## [1] "En promedio, las predicciones del modelo están equivocadas por aproximadamente 4.06 unidades en la escala de 'crecimiento'"

if (rmse3 < 10) {
  cat("El RMSE es bajo, sugiriendo que el modelo tiene un buen ajuste.\n")
} else {
  cat("El RMSE es alto, indicando que el modelo puede tener limitaciones.\n")
}

## El RMSE es bajo, sugiriendo que el modelo tiene un buen ajuste.

if (r_squared3 > 0.5) {
  cat("El R-squared es alto, indicando una buena proporción de varianza explicada por el modelo.\n")
} else {
  cat("El R-squared is bajo, lo que sugiere que el modelo puede no capturar gran parte de la varianza en los datos.\n")
}

## El R-squared is bajo, lo que sugiere que el modelo puede no capturar gran parte de la varianza en los datos.

if (adjusted_r_squared3 > 0.5) {
  cat("El Adjusted R-squared es alto, lo cual es favorable al considerar el número de predictores.\n")
} else {
  cat("El Adjusted R-squared es bajo, sugiriendo que predictores adicionales pueden mejorar el modelo.\n")
}

## El Adjusted R-squared es bajo, sugiriendo que predictores adicionales pueden mejorar el modelo.

# RETO FINAL

# Lorena Villarreal

# 2023-11-09

# Visualización de Datos

# Linear Regression - DEPRIMIDO

# Linear Regression - VIOLENCIA

# Linear Regression - CRECIMIENTO

# Multiple Regression - DEPRIMIDO

# Multiple Regression - VIOLENCIA

# Multiple Regression - CRECIMIENTO

# ANOVA - DEPRIMIDO

# ANOVA - CRECIMIENTO

# ANOVA - VIOLENCIA

# ANOVA - VIOLENCIA

# Logistic Regression