Pregunta 1

Modelo de Regresión Lineal

# Load the kidiq data set. fread() already returns a data.table, so wrapping
# it in data.table() only made a redundant copy; request the class explicitly
# instead so the result does not depend on the datatable.fread.datatable option.
base <- fread('kidiq.csv', encoding = 'Latin-1', data.table = TRUE)

a)

# Model 1: simple linear regression of the child's score on the mother's IQ.
# NOTE(review): the formula refers to the columns as base$..., so the fit is
# tied to the current contents of `base`; predict(reg1, newdata = ...) would
# not pick up new data. Kept as-is so the printed coefficient names match.
f01 <- base$kid_score ~ base$mom_iq
reg1 <- lm(formula = f01, data = base)
summary(reg1)
## 
## Call:
## lm(formula = f01, data = base)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.753 -12.074   2.217  11.710  47.691 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 25.79978    5.91741    4.36 1.63e-05 ***
## base$mom_iq  0.60997    0.05852   10.42  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.27 on 432 degrees of freedom
## Multiple R-squared:  0.201,  Adjusted R-squared:  0.1991 
## F-statistic: 108.6 on 1 and 432 DF,  p-value: < 2.2e-16

Generación de la variable con los valores predichos

# Store the in-sample predictions of reg1 as column kid_score1
# (the column is added to `base` by reference).
base[, `:=`(kid_score1 = predict(reg1))]

Gráfico Modelo 1 (kid_score ~ mom_iq)

# Plot for model 1: observed scores, the fitted line, and a vertical segment
# from each observation to its predicted value (the residual).
#
# Fixes relative to the previous version:
#   * The axis labels were swapped: x is the mother's IQ and y is the
#     child's score.
#   * ylim() replaced the earlier scale_y_continuous(), silently discarding
#     the label formatter; limits and labels now live in a single y scale.
#   * Columns are referenced by name inside aes() instead of base$..., and
#     TRUE/FALSE are spelled out.
graph1 <- ggplot(base, aes(x = mom_iq)) +
  geom_segment(aes(xend = mom_iq, y = kid_score, yend = kid_score1)) +
  geom_point(aes(y = kid_score, color = 'Valores reales')) +
  # A linear fit through the predicted values reproduces the regression line;
  # fullrange = TRUE extends it over the whole x axis.
  geom_smooth(
    aes(y = kid_score1, color = 'Valores predichos'),
    method = 'lm', se = FALSE, fullrange = TRUE
  ) +
  theme_minimal() +
  labs(x = 'IQ de la madre', y = 'Puntaje de los niños', color = NULL) +
  theme(legend.position = 'bottom') +
  xlim(0, 150) +
  scale_y_continuous(labels = number_format(scale = 1), limits = c(0, 150))
graph1

b)

# Model 2: child's score on the mother's IQ plus the mom_hs indicator.
# This reuses the names f01 / reg1, so the model 1 objects are overwritten.
# NOTE(review): the formula still refers to base$... columns, so the fit is
# tied to the current contents of `base` rather than to the `data` argument.
f01 <- base$kid_score ~ base$mom_iq + base$mom_hs
reg1 <- lm(formula = f01, data = base)
summary(reg1)
## 
## Call:
## lm(formula = f01, data = base)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.873 -12.663   2.404  11.356  49.545 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 25.73154    5.87521   4.380 1.49e-05 ***
## base$mom_iq  0.56391    0.06057   9.309  < 2e-16 ***
## base$mom_hs  5.95012    2.21181   2.690  0.00742 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.14 on 431 degrees of freedom
## Multiple R-squared:  0.2141, Adjusted R-squared:  0.2105 
## F-statistic: 58.72 on 2 and 431 DF,  p-value: < 2.2e-16

Gráfico Modelo 2 (kid_score ~ mom_iq + mom_hs)

# Overwrite kid_score1 with the in-sample predictions of the current reg1
# (model 2 at this point); the column is updated by reference.
base[, `:=`(kid_score1 = predict(reg1))]

# Plot for model 2: observed scores, residual segments, and one fitted line
# per value of mom_hs (green for mom_hs == 0, cyan for mom_hs == 1).
#
# Fixes relative to the previous version:
#   * The axis labels were swapped: x is the mother's IQ and y is the
#     child's score.
#   * ylim() replaced the earlier scale_y_continuous(), silently discarding
#     the label formatter; limits and labels now live in a single y scale.
#   * The two fixed text labels use annotate() rather than geom_text() with
#     constant aesthetics, so they can never be repeated once per data row.
#   * Columns are referenced by name inside aes() instead of base$..., and
#     TRUE/FALSE are spelled out.
graph1 <- ggplot(base, aes(x = mom_iq)) +
  geom_segment(aes(xend = mom_iq, y = kid_score, yend = kid_score1)) +
  geom_point(aes(y = kid_score, color = 'Valores Reales')) +
  # Within each mom_hs group the predictions lie on a straight line, so a
  # linear fit through them recovers that group's regression line.
  geom_smooth(
    data = base[mom_hs == 0], aes(y = kid_score1),
    color = 'green', method = 'lm', se = FALSE, fullrange = TRUE
  ) +
  geom_smooth(
    data = base[mom_hs == 1], aes(y = kid_score1),
    color = 'cyan', method = 'lm', se = FALSE, fullrange = TRUE
  ) +
  theme_minimal() +
  labs(x = 'IQ de la madre', y = 'Puntaje de los niños', color = NULL) +
  theme(legend.position = 'bottom') +
  xlim(0, 150) +
  scale_y_continuous(labels = number_format(scale = 1), limits = c(0, 150)) +
  annotate('text', x = 10, y = 55, label = "Si mom_hs=1", color = "cyan") +
  annotate('text', x = 10, y = 17, label = "Si mom_hs=0", color = "green")
graph1

c)

Si la madre NO terminó la escuela

# Model 3a: regression of the child's score on the mother's IQ, restricted to
# the rows with mom_hs == 0.
# NOTE(review): this overwrites `base` with the subset, so the full data set
# is no longer available under that name after this point.
base <- base[mom_hs == 0]
f02 <- base$kid_score ~ base$mom_iq
reg2 <- lm(formula = f02, data = base)
# Keep the in-sample predictions next to the data for plotting.
base[, `:=`(kid_score2 = predict(reg2))]
summary(reg2)
## 
## Call:
## lm(formula = f02, data = base)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.452  -9.868   0.661  13.289  43.149 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11.4820    14.6011  -0.786    0.434    
## base$mom_iq   0.9689     0.1574   6.154    2e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.07 on 91 degrees of freedom
## Multiple R-squared:  0.2939, Adjusted R-squared:  0.2861 
## F-statistic: 37.87 on 1 and 91 DF,  p-value: 1.997e-08

Gráfico Modelo 3 (mom_hs == 0)

# Plot for the mom_hs == 0 model: observed scores, the fitted line, and a
# vertical segment from each observation to its predicted value.
#
# Fixes relative to the previous version:
#   * The axis labels were swapped: x is the mother's IQ and y is the
#     child's score.
#   * ylim() replaced the earlier scale_y_continuous(), silently discarding
#     the label formatter; limits and labels now live in a single y scale.
#   * Columns are referenced by name inside aes() instead of base$..., and
#     TRUE/FALSE are spelled out.
graph1 <- ggplot(base, aes(x = mom_iq)) +
  geom_segment(aes(xend = mom_iq, y = kid_score, yend = kid_score2)) +
  geom_point(aes(y = kid_score, color = 'Valores reales')) +
  geom_smooth(
    aes(y = kid_score2, color = 'Valores predichos'),
    method = 'lm', se = FALSE, fullrange = TRUE
  ) +
  theme_minimal() +
  labs(x = 'IQ de la madre', y = 'Puntaje de los niños', color = NULL) +
  theme(legend.position = 'bottom') +
  xlim(0, 150) +
  # The lower limit is negative so the fitted line, whose intercept is below
  # zero for this subset, is not clipped at x = 0.
  scale_y_continuous(labels = number_format(scale = 1), limits = c(-12, 150))
graph1

Si la madre SÍ terminó la escuela

# Model 3b: regression of the child's score on the mother's IQ, restricted to
# the rows with mom_hs == 1.
#
# `base` was reduced to the mom_hs == 0 rows earlier in the script, so
# filtering it again for mom_hs == 1 would leave zero rows and lm() would
# fail. Reload the full data set before taking the subset.
base <- fread('kidiq.csv', encoding = 'Latin-1', data.table = TRUE)
base <- base[mom_hs == 1]
# NOTE(review): the formula refers to base$... columns, so the fit is tied to
# the current contents of `base`. Kept so the printed coefficient names match.
f02 <- formula(base$kid_score ~ base$mom_iq)
reg2 <- lm(f02, data = base)
# Keep the in-sample predictions next to the data for plotting.
base[, kid_score2 := predict(reg2)]
summary(reg2)
## 
## Call:
## lm(formula = f02, data = base)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.092 -11.904   2.331  10.457  43.880 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.78620    6.66341   5.971 5.96e-09 ***
## base$mom_iq  0.48461    0.06452   7.511 5.24e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.66 on 339 degrees of freedom
## Multiple R-squared:  0.1427, Adjusted R-squared:  0.1402 
## F-statistic: 56.42 on 1 and 339 DF,  p-value: 5.237e-13

Gráfico Modelo 3 (mom_hs == 1)

# Plot for the mom_hs == 1 model: observed scores, the fitted line, and a
# vertical segment from each observation to its predicted value.
#
# Fixes relative to the previous version:
#   * The axis labels were swapped: x is the mother's IQ and y is the
#     child's score.
#   * ylim() replaced the earlier scale_y_continuous(), silently discarding
#     the label formatter; limits and labels now live in a single y scale.
#   * Columns are referenced by name inside aes() instead of base$..., and
#     TRUE/FALSE are spelled out.
graph1 <- ggplot(base, aes(x = mom_iq)) +
  geom_segment(aes(xend = mom_iq, y = kid_score, yend = kid_score2)) +
  geom_point(aes(y = kid_score, color = 'Valores reales')) +
  geom_smooth(
    aes(y = kid_score2, color = 'Valores predichos'),
    method = 'lm', se = FALSE, fullrange = TRUE
  ) +
  theme_minimal() +
  labs(x = 'IQ de la madre', y = 'Puntaje de los niños', color = NULL) +
  theme(legend.position = 'bottom') +
  xlim(0, 150) +
  scale_y_continuous(labels = number_format(scale = 1), limits = c(0, 150))
graph1