rm (list = ls())
library(data.table)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
library(knitr)
library(moderndive)
library(rpart.plot)
## Loading required package: rpart
library(rpart)
library(readr)
datos <- read_delim("datos_tarea_5.csv",
";", escape_double = FALSE, na = "NA",
trim_ws = TRUE)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## a6_otro = col_character(),
## b16_otro = col_character(),
## e3_7 = col_logical(),
## e3_8 = col_logical(),
## e3_11 = col_logical(),
## e3_12 = col_logical(),
## e19_otro = col_character(),
## d1_monto = col_number(),
## ss_t = col_number(),
## d2_1_monto = col_number(),
## d2_1_porcentaje = col_logical(),
## d2_2_monto = col_number(),
## d2_2_porcentaje = col_logical(),
## d2_3_monto = col_number(),
## d2_3_porcentaje = col_logical(),
## d2_3_opcionb = col_logical(),
## d2_4_monto = col_number(),
## d2_4_porcentaje = col_logical(),
## d2_5_monto = col_number(),
## d2_5_porcentaje = col_logical()
## # ... with 98 more columns
## )
## i Use `spec()` for the full column specifications.
class(datos$ss_t)
## [1] "numeric"
datos=as.data.table(datos)
ggplot(datos[0<ss_t&ss_t<2000000], aes(x=ss_t,fill=region))+
geom_histogram(bins=30)+facet_wrap(facets = "region")
### P3 ### Arregle los gráficos para que cada uno tenga un eje x legible, además arregle el eje y. Agregue color a cada gráfico:(4 puntos) Hint: utilice la función scales = ‘free_y’ dentro de facet_wrap para dejar libre el eje y. También recuerde chequear la clase de la variable region para que se asigne correctamente un color a cada región.
datos$region=as.character(datos$region)
ggplot(datos[0<ss_t&ss_t<2000000], aes(x=ss_t,fill=region))+
geom_histogram(bins=30)+facet_wrap(facets = "region",scales = 'free_y')+
theme(axis.text.x = element_text(angle=90,vjust=1))
ggplot(datos,aes(x=ss_t, y=edad)) + geom_point()+scale_x_continuous(labels=scales::comma)+
labs(title = "Diagrama de dispersión",
subtitle = "ScatterPlot",
caption = "Fuente: INE",
x = "Edad",
y = "Ingresos")
datos= datos[!is.na(datos$ss_t),]
datos= datos[!is.na(datos$sexo),]
datos= datos[!is.na(datos$cine),]
reg_1 <- lm(ss_t~sexo + cine , data = datos)
get_regression_table(reg_1) %>% kable()
| term | estimate | std_error | statistic | p_value | lower_ci | upper_ci |
|---|---|---|---|---|---|---|
| intercept | 1.004020e+14 | 1.768835e+12 | 56.762 | 0.000 | 9.693508e+13 | 1.038689e+14 |
| sexo | -2.326727e+13 | 1.103550e+12 | -21.084 | 0.000 | -2.543021e+13 | -2.110432e+13 |
| cine | 4.564607e+10 | 1.359551e+10 | 3.357 | 0.001 | 1.899902e+10 | 7.229312e+10 |
summary(reg_1)
##
## Call:
## lm(formula = ss_t ~ sexo + cine, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.227e+14 -7.727e+13 -5.410e+13 -5.400e+13 9.452e+14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.004e+14 1.769e+12 56.762 < 2e-16 ***
## sexo -2.327e+13 1.104e+12 -21.084 < 2e-16 ***
## cine 4.565e+10 1.360e+10 3.357 0.000787 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.71e+14 on 96237 degrees of freedom
## Multiple R-squared: 0.004709, Adjusted R-squared: 0.004688
## F-statistic: 227.6 on 2 and 96237 DF, p-value: < 2.2e-16
pred1<-predict(reg_1) ## calcular los y predichos para cada observación
datos[,pred1:=predict(reg_1)]
predicciones1<-data.table(RMSE=RMSE(pred1,datos$ss_t),
MAE=MAE(pred1,datos$ss_t))
predicciones1
## RMSE MAE
## 1: 1.71014e+14 1.076885e+14
datos[cine==7,.N,by=.(sexo,pred1)]
## sexo pred1 N
## 1: 1 7.745423e+13 6224
## 2: 2 5.418696e+13 6922
datos[ss_t==0 & cine==7,.N,by=sexo]
## sexo N
## 1: 1 3079
## 2: 2 3508
datos[, factsexo := as.factor(sexo)]
datos[, facttipo := as.factor(tipo)]
reg_2 <- lm(ss_t~factsexo + cine + facttipo +edad, data = datos,na.action=na.exclude)
summary(reg_2)
##
## Call:
## lm(formula = ss_t ~ factsexo + cine + facttipo + edad, data = datos,
## na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.423e+14 -7.349e+13 -6.005e+13 -3.956e+13 9.652e+14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.116e+13 1.254e+12 48.772 < 2e-16 ***
## factsexo2 -2.497e+13 1.102e+12 -22.668 < 2e-16 ***
## cine 3.162e+10 1.356e+10 2.332 0.01971 *
## facttipo2 -4.733e+12 1.623e+12 -2.917 0.00353 **
## facttipo3 -1.826e+13 1.393e+12 -13.105 < 2e-16 ***
## edad 5.504e+11 2.381e+10 23.115 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.704e+14 on 96234 degrees of freedom
## Multiple R-squared: 0.01175, Adjusted R-squared: 0.0117
## F-statistic: 228.9 on 5 and 96234 DF, p-value: < 2.2e-16
pred2<-predict(reg_2) ## calcular los y predichos para cada observación
datos[,pred1:=predict(reg_2)]
predicciones2<-data.table(RMSE=RMSE(pred2,datos$ss_t),
MAE=MAE(pred2,datos$ss_t))
predicciones2
## RMSE MAE
## 1: 1.704077e+14 1.067312e+14
predicciones1<-data.table(RMSE=RMSE(pred1,datos$ss_t),
MAE=MAE(pred1,datos$ss_t))
predicciones1
## RMSE MAE
## 1: 1.71014e+14 1.076885e+14
predicciones2<-data.table(RMSE=RMSE(pred2,datos$ss_t),
MAE=MAE(pred2,datos$ss_t))
predicciones2
## RMSE MAE
## 1: 1.704077e+14 1.067312e+14
library(jtools)
set.seed(12345) ## setear una semilla
setupKCV <- trainControl(method = "cv" , number = 5)
predkfolds1<-train(ss_t~sexo + cine ,data=datos,method="lm",trControl= setupKCV)
predkfolds2<-train(ss_t~factsexo + cine + facttipo +edad,data=datos,method="lm",trControl= setupKCV)
print(predkfolds1)
## Linear Regression
##
## 96240 samples
## 2 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 76992, 76992, 76992, 76992, 76992
## Resampling results:
##
## RMSE Rsquared MAE
## 1.710092e+14 0.004790278 1.07691e+14
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
print(predkfolds2)
## Linear Regression
##
## 96240 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 76992, 76992, 76992, 76992, 76992
## Resampling results:
##
## RMSE Rsquared MAE
## 1.703979e+14 0.01171366 1.067353e+14
##
## Tuning parameter 'intercept' was held constant at a value of TRUE