#Librería para excel
library(readxl)
library(ggplot2)
1.Cargamos los datos:
mis_datos <- read_xlsx("ch8_cps.xlsx")
set.seed(3)
muestra <- sample(1:nrow(mis_datos), 1000)
mis_datos_muestra <- mis_datos[muestra, ]
head(mis_datos_muestra)
# Primero, asegúrate de correr este bloque para activar las herramientas
library(ggplot2)
Warning: package ‘ggplot2’ was built under R version 4.4.3
# Ahora, el código corregido para la gráfica
ggplot(mis_datos_muestra, aes(x = age, y = ahe)) +
geom_point(alpha = 0.2, color = "darkblue") +
labs(x = "Edad",
y = "Salario por hora",
title = "Relación entre Edad y Salario") +
theme_minimal()
# Estimación del modelo lineal
m_lineal <- lm(ahe ~ age, data = mis_datos_muestra)
# Mostrar los coeficientes y estadísticos
summary(m_lineal)
Call:
lm(formula = ahe ~ age, data = mis_datos_muestra)
Residuals:
Min 1Q Median 3Q Max
-23.561 -10.744 -4.415 5.931 120.591
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 17.70465 1.83653 9.640 < 2e-16
age 0.15619 0.04118 3.793 0.000158
(Intercept) ***
age ***
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 16.51 on 998 degrees of freedom
Multiple R-squared: 0.01421, Adjusted R-squared: 0.01322
F-statistic: 14.39 on 1 and 998 DF, p-value: 0.0001579
Observamos el modelo lineal:
ggplot(mis_datos_muestra, aes(x = age, y = ahe)) +
geom_point(alpha = 0.1) +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "red") +
labs(x = "Edad", y = "Salario por hora", title = "Ajuste Lineal")
# Estimamos el modelo agregando el término de edad al cuadrado
m_cuadratico <- lm(ahe ~ age + I(age^2), data = mis_datos_muestra)
# Mostramos los resultados para ver la significancia de los coeficientes
summary(m_cuadratico)
Call:
lm(formula = ahe ~ age + I(age^2), data = mis_datos_muestra)
Residuals:
Min 1Q Median 3Q Max
-24.906 -10.494 -4.339 5.832 118.897
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -7.564761 5.289163 -1.430 0.153
age 1.394651 0.246874 5.649 2.10e-08
I(age^2) -0.013919 0.002737 -5.086 4.37e-07
(Intercept)
age ***
I(age^2) ***
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 16.31 on 997 degrees of freedom
Multiple R-squared: 0.03914, Adjusted R-squared: 0.03721
F-statistic: 20.31 on 2 and 997 DF, p-value: 2.271e-09
graficamos el modelo cuadrático:
# Graficamos los puntos y añadimos la curva cuadrática
ggplot(mis_datos_muestra, aes(x = age, y = ahe)) +
geom_point(alpha = 0.2, color = "gray") +
geom_smooth(method = "lm", formula = y ~ x + I(x^2), se = FALSE, color = "blue") +
labs(x = "Edad",
y = "Salario por hora",
title = "Ajuste del Modelo Cuadrático") +
theme_minimal()
Comencemos con grado 4:
# Estimamos el polinomio de grado 4
m_poli4 <- lm(ahe ~ poly(age, 4, raw = TRUE), data = mis_datos_muestra)
# Revisamos los coeficientes
summary(m_poli4)
Call:
lm(formula = ahe ~ poly(age, 4, raw = TRUE), data = mis_datos_muestra)
Residuals:
Min 1Q Median 3Q Max
-24.598 -10.709 -3.870 6.064 117.777
Coefficients:
Estimate Std. Error
(Intercept) -3.831e+01 3.507e+01
poly(age, 4, raw = TRUE)1 3.538e+00 3.371e+00
poly(age, 4, raw = TRUE)2 -5.655e-02 1.154e-01
poly(age, 4, raw = TRUE)3 1.620e-04 1.672e-03
poly(age, 4, raw = TRUE)4 1.482e-06 8.699e-06
t value Pr(>|t|)
(Intercept) -1.093 0.275
poly(age, 4, raw = TRUE)1 1.050 0.294
poly(age, 4, raw = TRUE)2 -0.490 0.624
poly(age, 4, raw = TRUE)3 0.097 0.923
poly(age, 4, raw = TRUE)4 0.170 0.865
Residual standard error: 16.26 on 995 degrees of freedom
Multiple R-squared: 0.04699, Adjusted R-squared: 0.04316
F-statistic: 12.27 on 4 and 995 DF, p-value: 9.711e-10
Lo graficamos:
# Graficamos el ajuste polinomial de grado 4
ggplot(mis_datos_muestra, aes(x = age, y = ahe)) +
geom_point(alpha = 0.2, color = "gray") +
geom_smooth(method = "lm", formula = y ~ poly(x, 4, raw = TRUE), se = FALSE, color = "darkgreen") +
labs(x = "Edad",
y = "Salario por hora",
title = "Ajuste Polinomial de Grado 4") +
theme_minimal()
Modelo lin-log:
# Estimamos el modelo: Salario (ahe) contra el logaritmo de la Edad (age)
m_linlog <- lm(ahe ~ log(age), data = mis_datos_muestra)
# Vemos los resultados
summary(m_linlog)
Call:
lm(formula = ahe ~ log(age), data = mis_datos_muestra)
Residuals:
Min 1Q Median 3Q Max
-23.571 -10.537 -4.299 5.979 120.414
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.953 6.238 -0.794 0.427
log(age) 7.909 1.676 4.719 2.71e-06
(Intercept)
log(age) ***
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 16.45 on 998 degrees of freedom
Multiple R-squared: 0.02183, Adjusted R-squared: 0.02085
F-statistic: 22.27 on 1 and 998 DF, p-value: 2.708e-06
# Si quieres ver cómo se ve la curva logarítmica:
ggplot(mis_datos_muestra, aes(x = age, y = ahe)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm", formula = y ~ log(x), se = FALSE, color = "purple") +
labs(x = "Edad (escala original)", y = "Salario por hora", title = "Modelo Lin-Log")
# Estimamos el modelo con una variable continua (age) y una binaria (female)
m_binario <- lm(ahe ~ age + female, data = mis_datos_muestra)
# Mostramos el resumen de los resultados
summary(m_binario)
Call:
lm(formula = ahe ~ age + female, data = mis_datos_muestra)
Residuals:
Min 1Q Median 3Q Max
-26.610 -10.764 -3.801 5.727 117.850
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 20.0796 1.8466 10.874 < 2e-16
age 0.1658 0.0405 4.095 4.57e-05
female -6.2612 1.0332 -6.060 1.93e-09
(Intercept) ***
age ***
female ***
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 16.22 on 997 degrees of freedom
Multiple R-squared: 0.04923, Adjusted R-squared: 0.04732
F-statistic: 25.81 on 2 and 997 DF, p-value: 1.177e-11
Veamos si afecta que sea mujer de noreste:
# El asterisco crea: female + northeast + (female x northeast)
m_interaccion <- lm(ahe ~ age + female * northeast, data = mis_datos_muestra)
# Resultados finales
summary(m_interaccion)
Call:
lm(formula = ahe ~ age + female * northeast, data = mis_datos_muestra)
Residuals:
Min 1Q Median 3Q Max
-29.270 -10.758 -4.043 5.890 114.260
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 19.82797 1.84107 10.770 < 2e-16 ***
age 0.15781 0.04015 3.931 9.06e-05 ***
female -6.92271 1.10293 -6.277 5.16e-10 ***
northeast 4.14581 1.95355 2.122 0.0341 *
female:northeast 5.04327 2.95203 1.708 0.0879 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 16.07 on 995 degrees of freedom
Multiple R-squared: 0.06951, Adjusted R-squared: 0.06577
F-statistic: 18.58 on 4 and 995 DF, p-value: 9.653e-15