Se nos proporcionó dos datasets correspondientes al Censo de Vivienda en el estado de California de 1990. Cada una de las observaciones representa un conjunto de hogares.
El objetivo del mismo es llegar a predecir el valor medio del conjunto de hogares (median_house_value) utilizando métodos de regresión lineal.
El dataset de entrenamiento consta de 14,447 observaciones y 11 variables.
df = read.csv("train.csv")
df
df_test = read.csv("test.csv")
#Breve análisis
summary(df)
id longitude latitude housing_median_age total_rooms total_bedrooms population households
Min. : 2 Min. :-124.3 Min. :32.54 Min. : 1.00 Min. : 6 Min. : 2.0 Min. : 3 Min. : 2.0
1st Qu.: 5204 1st Qu.:-121.8 1st Qu.:33.94 1st Qu.:18.00 1st Qu.: 1450 1st Qu.: 297.0 1st Qu.: 791 1st Qu.: 281.0
Median :10344 Median :-118.5 Median :34.26 Median :29.00 Median : 2131 Median : 436.0 Median : 1170 Median : 410.0
Mean :10335 Mean :-119.6 Mean :35.63 Mean :28.73 Mean : 2629 Mean : 536.6 Mean : 1422 Mean : 498.6
3rd Qu.:15502 3rd Qu.:-118.0 3rd Qu.:37.71 3rd Qu.:37.00 3rd Qu.: 3149 3rd Qu.: 646.0 3rd Qu.: 1722 3rd Qu.: 604.5
Max. :20640 Max. :-114.3 Max. :41.95 Max. :52.00 Max. :39320 Max. :6445.0 Max. :28566 Max. :6082.0
NA's :144
median_income median_house_value ocean_proximity
Min. : 0.4999 Min. : 14999 Length:14447
1st Qu.: 2.5675 1st Qu.:119400 Class :character
Median : 3.5313 Median :178700 Mode :character
Mean : 3.8725 Mean :207144
3rd Qu.: 4.7457 3rd Qu.:265600
Max. :15.0001 Max. :500001
A simple vista, Lat. y Lon. son variables que no agregan valor; son puramente descriptivas.
La distribución de la variable housing_median_age es casi normal.
Hasta el momento, total_bedrooms es la única variable que posee NAs.
Las variables population, households y median_house_value poseen un sesgo positivo. Se podría decir lo mismo de median_income pero la escala es distinta. Se validará mediante una gráfica.
la variable ocean_proximity es la única categórica.
#installed.packages("leaps")
library(ggplot2)
library(dplyr)
library(corrplot)
library(randomForest)
library(mlbench)
library(PerformanceAnalytics)
library(MASS)
library(xgboost)
library(randomForest)
#Creación del data set de pruebas
df_cambios = df
df_cambios
#Densidad de latitude
df %>%
ggplot(aes(x=latitude, y=..density..))+
geom_density(fill="blue")+
theme_minimal()
#Densidad de longitude
df %>%
ggplot(aes(x=longitude, y=..density..))+
geom_density(fill="blue")+
theme_minimal()
#Densidad de housing_median_age
df %>%
ggplot(aes(x=housing_median_age))+
geom_bar()+
theme_minimal()
#Intento de normalizar la data / tentativo a borrarse
#df_normal = data.frame(row.names = 1:14447)
#df_normal
#library(dlookr)
#df_normal$housing_median_age = transform(df$housing_median_age, method = "log+1")
#plot(df_normal$housing_median_age)
#df_normal
#Distribución de total_rooms
df %>%
ggplot(aes(x=total_rooms))+
geom_bar()+
theme_minimal()
#Cálculo del IQR
iqr = IQR(df$total_rooms)
#Cálculo del límite inferior y superior
LI_total_rooms = round(quantile(df$total_rooms, prob=0.25) - (iqr * 1.75))
LS_total_rooms = round(quantile(df$total_rooms, prob=0.75) + (iqr * 1.75))
LI_total_rooms
25%
-1522
LS_total_rooms
75%
6121
df_cambios$total_rooms_CP = ifelse(df$total_rooms > LS_total_rooms, LS_total_rooms,
ifelse(df$total_rooms < LI_total_rooms, LI_total_rooms, df$total_rooms))
df_cambios %>%
ggplot(aes(x=total_rooms_CP))+
geom_histogram()+
theme_minimal()
df %>%
ggplot(aes(x=total_rooms, y=median_house_value))+
geom_point(shape="circle", col="green3")+
theme_minimal()
df_cambios %>%
ggplot(aes(x=total_rooms_CP, y=median_house_value))+
geom_point(shape="circle", col="blue3")+
theme_minimal()
La dispersión es absurda. No hay forma que esta variable pueda explicar el median_house_value.
#Distribución de total_bedrooms
df %>%
ggplot(aes(x=total_bedrooms))+
geom_bar()+
theme_minimal()
*Existencia de NAs. Se van a tratar.
df_cambios$total_bedrooms_sinNA = ifelse(is.na(df_cambios$total_bedrooms), median(df_cambios$total_bedrooms, na.rm=T), df_cambios$total_bedrooms)
summary(df_cambios$total_bedrooms_sinNA)
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.0 298.0 436.0 535.6 642.0 6445.0
#Nuevo feature sin NAs
df_cambios %>%
ggplot(aes(x=total_bedrooms_sinNA))+
geom_histogram(binwidth=50)+
theme_minimal()
#Cálculo del IQR
iqr = IQR(df_cambios$total_bedrooms_sinNA)
#Cálculo del límite inferior y superior
LI_total_bedrooms_sinNA = round(quantile(df_cambios$total_bedrooms_sinNA, prob=0.25) - (iqr * 1.75))
LS_total_bedrooms_sinNA = round(quantile(df_cambios$total_bedrooms_sinNA, prob=0.75) + (iqr * 1.75))
LI_total_bedrooms_sinNA
25%
-304
LS_total_bedrooms_sinNA
75%
1244
df_cambios$total_bedrooms_sinNA_CP = ifelse(df_cambios$total_bedrooms_sinNA > LS_total_bedrooms_sinNA, LS_total_bedrooms_sinNA, ifelse(df_cambios$total_bedrooms_sinNA < LI_total_bedrooms_sinNA, LI_total_bedrooms_sinNA, df_cambios$total_bedrooms_sinNA))
df_cambios %>%
ggplot(aes(x=total_bedrooms_sinNA_CP))+
geom_histogram()+
theme_minimal()
#Scatter plot sin NAs
df_cambios %>%
ggplot(aes(x=total_bedrooms_sinNA, y=median_house_value))+
geom_point(shape="circle", col="orange3")+
theme_minimal()
#Scatter plot sin NAs y sin outliers
df_cambios %>%
ggplot(aes(x=total_bedrooms_sinNA_CP, y=median_house_value))+
geom_point(shape="circle", col="red3")+
theme_minimal()
Existe una gran dispersión aún en los datos, ya sea con y sin outliers.
#Densidad de population
df %>%
ggplot(aes(x=population))+
geom_bar()+
theme_minimal()
*Existencia de outliers. Se van a tratar.
#Cálculo del IQR
iqr = IQR(df$population)
#Cálculo del límite inferior y superior
LI_population = round(quantile(df$population, prob=0.25) - (iqr * 1.75))
LS_population = round(quantile(df$population, prob=0.75) + (iqr * 1.75))
LI_population
25%
-838
LS_population
75%
3351
df_cambios$population_CP = ifelse(df$population > LS_population, LS_population,
ifelse(df$population < LI_population, LI_population, df$population))
df_cambios %>%
ggplot(aes(x=total_rooms_CP))+
geom_histogram()+
theme_minimal()
df %>%
ggplot(aes(x=population, y=median_house_value))+
geom_point(shape="circle", col="green3")+
theme_minimal()
df_cambios %>%
ggplot(aes(x=population_CP, y=median_house_value))+
geom_point(shape="circle", col="blue3")+
theme_minimal()
Pese a que se trataron los outliers, la data de por sí es bastante dispersa.
#Distribución de households
df %>%
ggplot(aes(x=households))+
geom_bar()+
theme_minimal()
Existen outliers. Se van a tratar.
#Cálculo del IQR
iqr = IQR(df$households)
#Cálculo del límite inferior y superior
LI_households = round(quantile(df$households, prob=0.25) - (iqr * 1.75))
LS_households = round(quantile(df$households, prob=0.75) + (iqr * 1.75))
LI_households
25%
-285
LS_households
75%
1171
df_cambios$households_CP = ifelse(df$households > LS_households, LS_households,
ifelse(df$households < LI_households, LI_households, df$households))
df_cambios %>%
ggplot(aes(x=households_CP))+
geom_histogram()+
theme_minimal()
df %>%
ggplot(aes(x=households, y=median_house_value))+
geom_point(shape="circle", col="green3")+
theme_minimal()
df_cambios %>%
ggplot(aes(x=households_CP, y=median_house_value))+
geom_point(shape="circle", col="blue3")+
theme_minimal()
#Densidad de median_income
df %>%
ggplot(aes(x=median_income, y=..density..))+
geom_density(fill="red")+
theme_minimal()
#Cálculo del IQR
iqr = IQR(df$median_income)
#Cálculo del límite inferior y superior
LI_median_income = quantile(df$median_income, prob=0.25) - (iqr * 1.75)
LS_median_income = quantile(df$median_income, prob=0.75) + (iqr * 1.75)
LI_median_income
25%
-1.244262
LS_median_income
75%
8.557412
df_cambios$median_income_CP = ifelse(df$median_income > LS_median_income, LS_median_income, ifelse(df$median_income < LI_median_income, LI_median_income, df$median_income))
df_cambios %>%
ggplot(aes(x=median_income_CP, y=..density..))+
geom_density(fill="red")+
theme_minimal()
df %>%
ggplot(aes(x=median_income, y=median_house_value))+
geom_point(shape="circle", col="green3")+
theme_minimal()
df_cambios %>%
ggplot(aes(x=median_income_CP, y=median_house_value))+
geom_point(shape="circle", col="blue3")+
theme_minimal()
#Densidad de median_house_value
df %>%
ggplot(aes(x=median_house_value, y=..density..))+
geom_density(fill="red")+
theme_minimal()
summary(df$median_house_value)
Min. 1st Qu. Median Mean 3rd Qu. Max.
14999 119400 178700 207144 265600 500001
iqrt = IQR(df$median_house_value)
iqrt
[1] 146200
quantile(df$median_house_value, prob=0.75) + iqrt*1.75
75%
521450
quantile(df$median_house_value, prob=0.25) - iqrt*1.75
25%
-136450
*No existen outliers en la variable a predecir.
df %>%
ggplot(aes(x = ocean_proximity))+
geom_bar()+
theme_minimal()
library(caret)
dmy <- dummyVars(" ~ .", data = df_cambios, fullRank = T)
dmy
Dummy Variable Object
Formula: ~.
<environment: 0x00000163390f5dd0>
17 variables, 0 factors
Variables and levels will be separated by '.'
A full rank encoding is used
df_dummies <- data.frame(predict(dmy, newdata = df_cambios))
glimpse(df_dummies)
Rows: 14,447
Columns: 20
$ id <dbl> 71, 12792, 12520, 10660, 5215, 10357, 327, 5408, 9212, 12165, 4797, 6403, 1433, 18251, 16859, 7508, 8674, 19784, 1~
$ longitude <dbl> -122.29, -121.46, -121.46, -117.81, -118.25, -117.67, -122.19, -118.45, -120.31, -117.22, -118.35, -118.02, -122.0~
$ latitude <dbl> 37.81, 38.65, 38.55, 33.67, 33.94, 33.60, 37.73, 34.03, 37.11, 33.74, 34.03, 34.14, 37.99, 37.39, 37.63, 33.91, 33~
$ housing_median_age <dbl> 26, 8, 52, 24, 44, 25, 45, 41, 38, 7, 43, 34, 37, 39, 39, 42, 35, 20, 25, 18, 9, 47, 20, 32, 13, 10, 8, 19, 15, 21~
$ total_rooms <dbl> 768, 3746, 2094, 3930, 1463, 3164, 1528, 1240, 1696, 1810, 2122, 1077, 2247, 2210, 4220, 1786, 2152, 3758, 2054, 1~
$ total_bedrooms <dbl> 152, 767, 463, 661, 312, 449, 291, 320, 301, 386, 524, 257, 416, 483, 1055, 358, 454, 798, 495, 332, 2640, 335, 36~
$ population <dbl> 392, 2161, 1364, 1831, 940, 1517, 801, 711, 985, 931, 1510, 478, 1237, 1023, 2720, 1318, 902, 1685, 835, 733, 6837~
$ households <dbl> 127, 744, 407, 616, 312, 453, 287, 304, 278, 355, 436, 199, 397, 450, 1046, 373, 414, 757, 475, 318, 2358, 329, 30~
$ median_income <dbl> 1.7719, 3.2039, 1.2235, 6.3767, 2.3333, 6.7921, 1.2625, 3.3482, 2.4054, 2.5221, 2.2273, 2.6316, 4.4500, 4.5833, 2.~
$ median_house_value <dbl> 82500, 103400, 68500, 269000, 99800, 266000, 84700, 318100, 112500, 109200, 123300, 252800, 161900, 342400, 242500~
$ ocean_proximityINLAND <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ~
$ ocean_proximityISLAND <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ ocean_proximityNEAR.BAY <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ~
$ ocean_proximityNEAR.OCEAN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ~
$ total_rooms_CP <dbl> 768, 3746, 2094, 3930, 1463, 3164, 1528, 1240, 1696, 1810, 2122, 1077, 2247, 2210, 4220, 1786, 2152, 3758, 2054, 1~
$ total_bedrooms_sinNA <dbl> 152, 767, 463, 661, 312, 449, 291, 320, 301, 386, 524, 257, 416, 483, 1055, 358, 454, 798, 495, 332, 2640, 335, 36~
$ total_bedrooms_sinNA_CP <dbl> 152, 767, 463, 661, 312, 449, 291, 320, 301, 386, 524, 257, 416, 483, 1055, 358, 454, 798, 495, 332, 1244, 335, 36~
$ population_CP <dbl> 392, 2161, 1364, 1831, 940, 1517, 801, 711, 985, 931, 1510, 478, 1237, 1023, 2720, 1318, 902, 1685, 835, 733, 3351~
$ households_CP <dbl> 127, 744, 407, 616, 312, 453, 287, 304, 278, 355, 436, 199, 397, 450, 1046, 373, 414, 757, 475, 318, 1171, 329, 30~
$ median_income_CP <dbl> 1.771900, 3.203900, 1.223500, 6.376700, 2.333300, 6.792100, 1.262500, 3.348200, 2.405400, 2.522100, 2.227300, 2.63~
#Gráfico de correlación - df_dummies
df_dummies_corr = df_dummies %>%
dplyr::select(longitude, latitude, ocean_proximityINLAND:total_rooms_CP, total_bedrooms_sinNA_CP:median_income_CP, median_house_value)
corrplot(cor(df_dummies_corr), method = "color", type = "upper", tl.col="black")
#Ocean Proximity INLAND
df_dummies %>%
ggplot(aes(x=ocean_proximityINLAND, y=median_house_value))+
geom_point(shape="circle", col="blue3")+
theme_minimal()
#Median Income CP
df_cambios %>%
ggplot(aes(x=median_income_CP, y=median_house_value))+
geom_point(shape="circle", col="green3")+
theme_minimal()
#Latitude
df_cambios %>%
ggplot(aes(x=total_rooms_CP, y=median_house_value))+
geom_point(shape="circle", col="orange3")+
theme_minimal()
#Índice
trainIndex = createDataPartition(df_dummies_corr$median_house_value, p=0.7, list=FALSE )
#Set de entrenamiento
df_trainSet = df_dummies_corr[trainIndex, ]
#Set de prueba
df_testSet = df_dummies_corr[-trainIndex, ]
df_trainSet
df_testSet
#Explicado por median_income + ocean_proximityINLAND
lm2 = df_trainSet %>%
lm(formula = median_house_value ~ median_income_CP + ocean_proximityINLAND)
summary(lm2)
Call:
lm(formula = median_house_value ~ median_income_CP + ocean_proximityINLAND,
data = .)
Residuals:
Min 1Q Median 3Q Max
-322576 -48342 -12721 32114 487298
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 73763.6 2046.0 36.05 <2e-16 ***
median_income_CP 41756.3 447.7 93.27 <2e-16 ***
ocean_proximityINLAND -81934.9 1644.8 -49.81 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 74640 on 10112 degrees of freedom
Multiple R-squared: 0.5861, Adjusted R-squared: 0.5861
F-statistic: 7161 on 2 and 10112 DF, p-value: < 2.2e-16
#Explicado por median_income + ocean_proximityINLAND
lm3 = df_trainSet %>%
lm(formula = median_house_value ~ median_income_CP + ocean_proximityINLAND + total_rooms_CP)
summary(lm3)
Call:
lm(formula = median_house_value ~ median_income_CP + ocean_proximityINLAND +
total_rooms_CP, data = .)
Residuals:
Min 1Q Median 3Q Max
-317082 -48117 -12969 31891 490078
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.116e+04 2.196e+03 32.400 < 2e-16 ***
median_income_CP 4.137e+04 4.631e+02 89.336 < 2e-16 ***
ocean_proximityINLAND -8.233e+04 1.648e+03 -49.942 < 2e-16 ***
total_rooms_CP 1.705e+00 5.253e-01 3.246 0.00117 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 74610 on 10111 degrees of freedom
Multiple R-squared: 0.5866, Adjusted R-squared: 0.5865
F-statistic: 4782 on 3 and 10111 DF, p-value: < 2.2e-16
#Explicado por median_income + ocean_proximityINLAND + total_rooms_CP + ocean_proximityNEAR.BAY
lm4 = df_trainSet %>%
lm(formula = median_house_value ~ median_income_CP + ocean_proximityINLAND + total_rooms_CP + ocean_proximityNEAR.BAY)
summary(lm4)
Call:
lm(formula = median_house_value ~ median_income_CP + ocean_proximityINLAND +
total_rooms_CP + ocean_proximityNEAR.BAY, data = .)
Residuals:
Min 1Q Median 3Q Max
-313842 -47823 -12579 31605 490111
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.806e+04 2.226e+03 30.568 < 2e-16 ***
median_income_CP 4.135e+04 4.617e+02 89.556 < 2e-16 ***
ocean_proximityINLAND -7.925e+04 1.691e+03 -46.870 < 2e-16 ***
total_rooms_CP 1.741e+00 5.238e-01 3.325 0.000888 ***
ocean_proximityNEAR.BAY 1.863e+04 2.407e+03 7.740 1.09e-14 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 74390 on 10110 degrees of freedom
Multiple R-squared: 0.589, Adjusted R-squared: 0.5889
F-statistic: 3622 on 4 and 10110 DF, p-value: < 2.2e-16
#Explicado por median_income + ocean_proximityINLAND + total_rooms_CP + ocean_proximityNEAR.BAY + ocean_proximityNEAR.OCEAN
lm4 = df_trainSet %>%
lm(formula = median_house_value ~ median_income_CP + ocean_proximityINLAND + total_rooms_CP + ocean_proximityNEAR.BAY + ocean_proximityNEAR.OCEAN)
summary(lm4)
Call:
lm(formula = median_house_value ~ median_income_CP + ocean_proximityINLAND +
total_rooms_CP + ocean_proximityNEAR.BAY + ocean_proximityNEAR.OCEAN,
data = .)
Residuals:
Min 1Q Median 3Q Max
-310912 -47832 -12008 31906 490458
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.407e+04 2.305e+03 27.798 < 2e-16 ***
median_income_CP 4.149e+04 4.613e+02 89.944 < 2e-16 ***
ocean_proximityINLAND -7.568e+04 1.775e+03 -42.646 < 2e-16 ***
total_rooms_CP 1.728e+00 5.227e-01 3.306 0.000951 ***
ocean_proximityNEAR.BAY 2.208e+04 2.460e+03 8.974 < 2e-16 ***
ocean_proximityNEAR.OCEAN 1.521e+04 2.339e+03 6.504 8.22e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 74240 on 10109 degrees of freedom
Multiple R-squared: 0.5907, Adjusted R-squared: 0.5905
F-statistic: 2918 on 5 and 10109 DF, p-value: < 2.2e-16
#Agregamos efecto de interacción
int_lm1 = df_trainSet %>%
lm(formula = median_house_value ~ median_income_CP*ocean_proximityINLAND)
summary(int_lm1)
Call:
lm(formula = median_house_value ~ median_income_CP * ocean_proximityINLAND,
data = .)
Residuals:
Min 1Q Median 3Q Max
-329727 -46533 -13917 31319 468466
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 65300.4 2269.4 28.774 <2e-16 ***
median_income_CP 43817.5 507.9 86.274 <2e-16 ***
ocean_proximityINLAND -51158.5 3977.8 -12.861 <2e-16 ***
median_income_CP:ocean_proximityINLAND -9023.5 1062.7 -8.491 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 74380 on 10111 degrees of freedom
Multiple R-squared: 0.5891, Adjusted R-squared: 0.589
F-statistic: 4832 on 3 and 10111 DF, p-value: < 2.2e-16
int_lm2 = df_trainSet %>%
lm(formula = median_house_value ~ median_income_CP*ocean_proximityINLAND + total_rooms_CP)
summary(int_lm2)
Call:
lm(formula = median_house_value ~ median_income_CP * ocean_proximityINLAND +
total_rooms_CP, data = .)
Residuals:
Min 1Q Median 3Q Max
-323959 -46532 -13962 31521 471237
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.246e+04 2.413e+03 25.887 < 2e-16 ***
median_income_CP 4.343e+04 5.200e+02 83.505 < 2e-16 ***
ocean_proximityINLAND -5.128e+04 3.976e+03 -12.897 < 2e-16 ***
total_rooms_CP 1.812e+00 5.235e-01 3.460 0.000542 ***
median_income_CP:ocean_proximityINLAND -9.111e+03 1.062e+03 -8.576 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 74340 on 10110 degrees of freedom
Multiple R-squared: 0.5896, Adjusted R-squared: 0.5894
F-statistic: 3631 on 4 and 10110 DF, p-value: < 2.2e-16
int_lm3 = df_trainSet %>%
lm(formula = median_house_value ~ median_income_CP*ocean_proximityINLAND + total_rooms_CP + ocean_proximityNEAR.BAY)
summary(int_lm3)
Call:
lm(formula = median_house_value ~ median_income_CP * ocean_proximityINLAND +
total_rooms_CP + ocean_proximityNEAR.BAY, data = .)
Residuals:
Min 1Q Median 3Q Max
-320715 -46445 -13381 31170 471292
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 59365.620 2438.321 24.347 < 2e-16 ***
median_income_CP 43406.010 518.535 83.709 < 2e-16 ***
ocean_proximityINLAND -48246.206 3983.445 -12.112 < 2e-16 ***
total_rooms_CP 1.847 0.522 3.539 0.000403 ***
ocean_proximityNEAR.BAY 18604.920 2398.583 7.757 9.56e-15 ***
median_income_CP:ocean_proximityINLAND -9099.716 1059.300 -8.590 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 74130 on 10109 degrees of freedom
Multiple R-squared: 0.592, Adjusted R-squared: 0.5918
F-statistic: 2933 on 5 and 10109 DF, p-value: < 2.2e-16
int_lm4 = df_trainSet %>%
lm(formula = median_house_value ~ median_income_CP*ocean_proximityINLAND + total_rooms_CP + ocean_proximityNEAR.BAY + ocean_proximityNEAR.OCEAN)
summary(int_lm4)
Call:
lm(formula = median_house_value ~ median_income_CP * ocean_proximityINLAND +
total_rooms_CP + ocean_proximityNEAR.BAY + ocean_proximityNEAR.OCEAN,
data = .)
Residuals:
Min 1Q Median 3Q Max
-317822 -46184 -12859 31279 471274
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.507e+04 2.515e+03 21.897 < 2e-16 ***
median_income_CP 4.359e+04 5.181e+02 84.133 < 2e-16 ***
ocean_proximityINLAND -4.393e+04 4.026e+03 -10.912 < 2e-16 ***
total_rooms_CP 1.836e+00 5.209e-01 3.524 0.000427 ***
ocean_proximityNEAR.BAY 2.217e+04 2.451e+03 9.045 < 2e-16 ***
ocean_proximityNEAR.OCEAN 1.574e+04 2.331e+03 6.750 1.56e-11 ***
median_income_CP:ocean_proximityINLAND -9.282e+03 1.057e+03 -8.779 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 73960 on 10108 degrees of freedom
Multiple R-squared: 0.5938, Adjusted R-squared: 0.5936
F-statistic: 2463 on 6 and 10108 DF, p-value: < 2.2e-16
# lm4
train_control = trainControl(method = "cv", number = 5)
step.model_lm4 = train(median_house_value ~ ., data = df_trainSet,
method = "leapForward",
tuneGrid = data.frame(nvmax = 1:9),
trControl = train_control)
summary(step.model_lm4$finalModel)
Subset selection object
11 Variables (and intercept)
Forced in Forced out
longitude FALSE FALSE
latitude FALSE FALSE
ocean_proximityINLAND FALSE FALSE
ocean_proximityISLAND FALSE FALSE
ocean_proximityNEAR.BAY FALSE FALSE
ocean_proximityNEAR.OCEAN FALSE FALSE
total_rooms_CP FALSE FALSE
total_bedrooms_sinNA_CP FALSE FALSE
population_CP FALSE FALSE
households_CP FALSE FALSE
median_income_CP FALSE FALSE
1 subsets of each size up to 9
Selection Algorithm: forward
longitude latitude ocean_proximityINLAND ocean_proximityISLAND ocean_proximityNEAR.BAY ocean_proximityNEAR.OCEAN total_rooms_CP
1 ( 1 ) " " " " " " " " " " " " " "
2 ( 1 ) " " " " "*" " " " " " " " "
3 ( 1 ) " " " " "*" " " " " " " " "
4 ( 1 ) " " " " "*" " " " " " " " "
5 ( 1 ) " " " " "*" " " " " " " " "
6 ( 1 ) " " " " "*" " " " " " " "*"
7 ( 1 ) "*" " " "*" " " " " " " "*"
8 ( 1 ) "*" "*" "*" " " " " " " "*"
9 ( 1 ) "*" "*" "*" "*" " " " " "*"
total_bedrooms_sinNA_CP population_CP households_CP median_income_CP
1 ( 1 ) " " " " " " "*"
2 ( 1 ) " " " " " " "*"
3 ( 1 ) "*" " " " " "*"
4 ( 1 ) "*" "*" " " "*"
5 ( 1 ) "*" "*" "*" "*"
6 ( 1 ) "*" "*" "*" "*"
7 ( 1 ) "*" "*" "*" "*"
8 ( 1 ) "*" "*" "*" "*"
9 ( 1 ) "*" "*" "*" "*"
#Modelo mediante Forward Selection (FS)
train_control = trainControl(method = "cv", number = 5)
model_v1 = train(median_house_value ~ longitude + latitude + ocean_proximityINLAND + ocean_proximityISLAND + total_rooms_CP + total_bedrooms_sinNA_CP + population_CP + households_CP + median_income_CP, data = df_trainSet,
method = "lm",
trControl = train_control)
model_v1
Linear Regression
10115 samples
9 predictor
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 8092, 8091, 8092, 8093, 8092
Resampling results:
RMSE Rsquared MAE
67952.82 0.657127 50252.03
Tuning parameter 'intercept' was held constant at a value of TRUE
*Este modelo posee menor RMSE. Por ende se procederá a crear uno nuevo utilizando solo estas variables.
#Interacción entre Median_income_CP y Ocean Proximity INLAND
train_control = trainControl(method = "cv", number = 5)
model_v3 = train(median_house_value ~ longitude + latitude + median_income_CP*ocean_proximityINLAND + total_rooms_CP + total_bedrooms_sinNA_CP + population_CP + households_CP, data = df_trainSet,
method = "lm",
trControl = train_control)
model_v3
Linear Regression
10115 samples
8 predictor
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 8091, 8092, 8093, 8092, 8092
Resampling results:
RMSE Rsquared MAE
67545.47 0.6614535 49524.18
Tuning parameter 'intercept' was held constant at a value of TRUE
¡Mejoró!
#Entre Median_income_CP y Ocean Proximity INLAND y Latitude*Total_Rooms_CP
train_control = trainControl(method = "cv", number = 5)
model_v4 = train(median_house_value ~ longitude + latitude*total_rooms_CP + median_income_CP*ocean_proximityINLAND + total_bedrooms_sinNA_CP + population_CP + households_CP, data = df_trainSet,
method = "lm",
trControl = train_control)
model_v4
Linear Regression
10115 samples
8 predictor
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 8091, 8091, 8092, 8094, 8092
Resampling results:
RMSE Rsquared MAE
67550.86 0.6610678 49526.39
Tuning parameter 'intercept' was held constant at a value of TRUE
Las dos interacciones hicieron que el modelo empeorara, pero de igual manera se va a hacer un submission.
#Cross validation
customControl <- trainControl(method = "repeatedcv",
number=10,
repeats=5,
verboseIter = F) # Verbose False es para no desplegar un log de que iteracion se va ejecutando
Se escogió este parámetro ya que consideramos que existen algunas variables que definitivamente no aportan al modelo. Esto lo logramos percibir en el análisis de correlación.
lasso<-train(median_house_value ~. ,
df_trainSet,
method="glmnet",
tuneGrid = expand.grid(alpha = 1,
lambda=seq(0.0001, 1, length=5)),
trControl=customControl)
plot(lasso)
Conforme crece el parámetro de regularización, el RMSE no se ve afectado.
plot(lasso$finalModel, xvar = "lambda", label=TRUE)
plot(lasso$finalModel, xvar = "dev", label=TRUE)
plot(varImp(lasso, scale=T))
Las variables con mayor impacto son:
#Variables seleccionadas mediante LASSO
train_control = trainControl(method = "cv", number = 5)
model_v2 = train(median_house_value ~ longitude + latitude + ocean_proximityINLAND + ocean_proximityISLAND + median_income_CP, data = df_trainSet,
method = "lm",
trControl = train_control)
model_v2
Linear Regression
10115 samples
5 predictor
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 8093, 8092, 8092, 8091, 8092
Resampling results:
RMSE Rsquared MAE
73076.54 0.6033179 54281.31
Tuning parameter 'intercept' was held constant at a value of TRUE
Pese a que el RMSE fue más alto, es el modelo más “imparcial” al momento de evaluar data nueva. Será interesante evaluar con interacciones.
#Modelo mediante Forward Selection (FS)
train_control = trainControl(method = "cv", number = 5)
model_v5 = train(median_house_value ~ longitude + latitude + ocean_proximityISLAND:median_income_CP + ocean_proximityINLAND, data = df_trainSet,
method = "lm",
trControl = train_control)
model_v5
Linear Regression
10115 samples
5 predictor
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 8092, 8093, 8091, 8093, 8091
Resampling results:
RMSE Rsquared MAE
99236.13 0.2682878 75784.38
Tuning parameter 'intercept' was held constant at a value of TRUE
Lejos de mejorar, el RMSE subió bastante.
df.rf=randomForest(median_house_value ~ ., data = df_trainSet)
df.rf
Call:
randomForest(formula = median_house_value ~ ., data = df_trainSet)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 3
Mean of squared residuals: 2659765261
% Var explained: 80.24
#Graficamos el modelo segun la cantidad de arboles creados
plot(df.rf)
#Conocer la importancia de cada una de las variables
df.rf$importance
IncNodePurity
longitude 1.754306e+13
latitude 1.577971e+13
ocean_proximityINLAND 1.848250e+13
ocean_proximityISLAND 2.036068e+10
ocean_proximityNEAR.BAY 1.031703e+12
ocean_proximityNEAR.OCEAN 1.232271e+12
total_rooms_CP 6.996495e+12
total_bedrooms_sinNA_CP 4.853193e+12
population_CP 7.989039e+12
households_CP 4.865709e+12
median_income_CP 4.995148e+13
#Genera la cantidad de arboles optimos para el modelo
which.min(df.rf$mse)
[1] 494
pred<-predict(df.rf,df_testSet)
test_rmse <- sqrt(mean((pred - df_testSet$median_house_value)^2))
paste("Error de test (rmse) del árbol:", round(test_rmse,2))
[1] "Error de test (rmse) del árbol: 52480.84"
Obtuvimos un RMSE bajo a comparación de los modelos anteriores, sin embargo se procederá a hacer otras modificaciones.
library(ranger)
features <- setdiff(names(df_trainSet), "median_house_value")
rg <- ranger(
formula = median_house_value~ .,
data = df_trainSet,
num.trees = 500,
mtry = 11
)
rg
Ranger result
Call:
ranger(formula = median_house_value ~ ., data = df_trainSet, num.trees = 500, mtry = 11)
Type: Regression
Number of trees: 500
Sample size: 10115
Number of independent variables: 11
Mtry: 11
Target node size: 5
Variable importance mode: none
Splitrule: variance
OOB prediction error (MSE): 2449839212
R squared (OOB): 0.8179937
pred_ranger<-predict(rg,df_testSet)
test_rmse <- sqrt(mean((pred_ranger$predictions - df_testSet$median_house_value)^2))
paste("Error de test (rmse) del árbol:", round(test_rmse,2))
[1] "Error de test (rmse) del árbol: 50323.34"
Se obtuvo un RMSE mucho inferior a los anteriores. De igual forma se intentará bajo otro método.
train<- as.matrix(df_trainSet, rownames.force=NA)
test<- as.matrix(df_testSet, rownames.force=NA)
train <- as(train, "sparseMatrix")
test <- as(test, "sparseMatrix")
# Never forget to exclude objective variable in 'data option'
train_Data <- xgb.DMatrix(data = train[,1:11], label = train[,"median_house_value"])
# Tuning the parameters #
cv.ctrl <- trainControl(method = "repeatedcv", repeats = 1,number = 3)
xgb.grid <- expand.grid(nrounds = 500,
max_depth = 7,
eta = 0.1,
gamma = 1,
colsample_bytree = 1,
min_child_weight=100,
subsample = 1)
xgb_tune <-train(median_house_value ~.,
data=df_trainSet,
objective = "reg:squarederror",
method="xgbTree",
metric = "RMSE",
trControl=cv.ctrl,
tuneGrid = xgb.grid)
The following parameters were provided multiple times:
objective
Only the last value for each of them will be used.
The following parameters were provided multiple times:
objective
Only the last value for each of them will be used.
The following parameters were provided multiple times:
objective
Only the last value for each of them will be used.
The following parameters were provided multiple times:
objective
Only the last value for each of them will be used.
print(xgb.grid)
test_data <- xgb.DMatrix(data = test[,1:11])
prediction <- predict(xgb_tune, df_testSet)
test_rmse <- sqrt(mean((prediction - df_testSet$median_house_value)^2))
paste("Error de test (rmse) del árbol:", round(test_rmse,2))
[1] "Error de test (rmse) del árbol: 49632.8"
glimpse(df_test)
Rows: 6,193
Columns: 10
$ id <int> 14769, 16487, 2879, 19071, 12053, 17077, 10624, 10320, 5406, 10840, 20172, 12112, 9762, 13614, 8503, 3237, 2088, 7275, 38~
$ longitude <dbl> -117.10, -121.05, -118.97, -122.52, -117.57, -122.20, -117.77, -117.79, -118.44, -117.93, -119.30, -117.34, -121.74, -117~
$ latitude <dbl> 32.57, 38.14, 35.38, 38.27, 33.87, 37.48, 33.67, 33.84, 34.03, 33.66, 34.39, 34.02, 36.49, 34.13, 33.87, 36.10, 36.75, 33~
$ housing_median_age <int> 26, 19, 42, 18, 33, 32, 12, 9, 37, 18, 35, 28, 33, 37, 27, 21, 52, 30, 22, 34, 36, 36, 37, 12, 35, 39, 36, 21, 43, 32, 14~
$ total_rooms <int> 2343, 3326, 1185, 2405, 2076, 640, 4329, 10484, 975, 2043, 3079, 2683, 2952, 2498, 3144, 1382, 377, 861, 2516, 2782, 2028~
$ total_bedrooms <int> 371, 561, 358, 390, 517, 166, 1068, 1603, 189, 250, 579, 708, 565, 472, 722, 327, 97, 250, 826, 502, 523, 285, 301, 356, ~
$ population <int> 1221, 1544, 1038, 872, 1374, 991, 1913, 4005, 489, 702, 1807, 2047, 1264, 1291, 1510, 1469, 530, 1062, 3350, 1219, 2751, ~
$ households <int> 372, 511, 299, 367, 480, 160, 978, 1419, 202, 246, 589, 636, 517, 487, 680, 355, 96, 231, 713, 507, 496, 272, 265, 367, 5~
$ median_income <dbl> 4.3601, 2.9875, 0.9951, 5.2155, 2.2197, 1.9844, 4.5094, 8.3931, 4.2434, 9.6062, 4.6900, 2.2750, 4.4209, 3.0000, 3.1597, 1~
$ ocean_proximity <chr> "NEAR OCEAN", "INLAND", "INLAND", "<1H OCEAN", "INLAND", "NEAR BAY", "<1H OCEAN", "<1H OCEAN", "<1H OCEAN", "<1H OCEAN", ~
#Tratamiento de NAs - Total_bedrooms
df_test$total_bedrooms_sinNA = ifelse(is.na(df_test$total_bedrooms), median(df_test$total_bedrooms, na.rm=T), df_test$total_bedrooms)
##Tratamiento de outliers - Total_bedrooms
df_test$total_bedrooms_sinNA_CP = ifelse(df_test$total_bedrooms_sinNA > LS_total_bedrooms_sinNA, LS_total_bedrooms_sinNA, ifelse(df_test$total_bedrooms_sinNA < LI_total_bedrooms_sinNA, LI_total_bedrooms_sinNA, df_test$total_bedrooms_sinNA))
#Tratamiento de outliers - Total Rooms
df_test$total_rooms_CP = ifelse(df_test$total_rooms > LS_total_rooms, LS_total_rooms, ifelse(df_test$total_rooms < LI_total_rooms, LI_total_rooms, df_test$total_rooms))
#Tratamiento de outliers - Population
df_test$population_CP = ifelse(df_test$population > LS_population, LS_population, ifelse(df_test$population < LI_population, LI_population, df_test$population))
#Tratamiento de outliers - Households
df_test$households_CP = ifelse(df_test$households > LS_households, LS_households, ifelse(df_test$households < LI_households, LI_households, df_test$households))
#Tratamiento de outliers - Median_income
df_test$median_income_CP = ifelse(df_test$median_income > LS_median_income, LS_median_income, ifelse(df_test$median_income < LI_median_income, LI_median_income, df_test$median_income))
#Dummy variables
dmy <- dummyVars(" ~ .", data = df_test, fullRank = T)
dfTest <- data.frame(predict(dmy, newdata = df_test))
glimpse(dfTest)
Rows: 6,193
Columns: 19
$ id <dbl> 14769, 16487, 2879, 19071, 12053, 17077, 10624, 10320, 5406, 10840, 20172, 12112, 9762, 13614, 8503, 3237, 2088, 7~
$ longitude <dbl> -117.10, -121.05, -118.97, -122.52, -117.57, -122.20, -117.77, -117.79, -118.44, -117.93, -119.30, -117.34, -121.7~
$ latitude <dbl> 32.57, 38.14, 35.38, 38.27, 33.87, 37.48, 33.67, 33.84, 34.03, 33.66, 34.39, 34.02, 36.49, 34.13, 33.87, 36.10, 36~
$ housing_median_age <dbl> 26, 19, 42, 18, 33, 32, 12, 9, 37, 18, 35, 28, 33, 37, 27, 21, 52, 30, 22, 34, 36, 36, 37, 12, 35, 39, 36, 21, 43,~
$ total_rooms <dbl> 2343, 3326, 1185, 2405, 2076, 640, 4329, 10484, 975, 2043, 3079, 2683, 2952, 2498, 3144, 1382, 377, 861, 2516, 278~
$ total_bedrooms <dbl> 371, 561, 358, 390, 517, 166, 1068, 1603, 189, 250, 579, 708, 565, 472, 722, 327, 97, 250, 826, 502, 523, 285, 301~
$ population <dbl> 1221, 1544, 1038, 872, 1374, 991, 1913, 4005, 489, 702, 1807, 2047, 1264, 1291, 1510, 1469, 530, 1062, 3350, 1219,~
$ households <dbl> 372, 511, 299, 367, 480, 160, 978, 1419, 202, 246, 589, 636, 517, 487, 680, 355, 96, 231, 713, 507, 496, 272, 265,~
$ median_income <dbl> 4.3601, 2.9875, 0.9951, 5.2155, 2.2197, 1.9844, 4.5094, 8.3931, 4.2434, 9.6062, 4.6900, 2.2750, 4.4209, 3.0000, 3.~
$ ocean_proximityINLAND <dbl> 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, ~
$ ocean_proximityISLAND <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ ocean_proximityNEAR.BAY <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ~
$ ocean_proximityNEAR.OCEAN <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ total_bedrooms_sinNA <dbl> 371, 561, 358, 390, 517, 166, 1068, 1603, 189, 250, 579, 708, 565, 472, 722, 327, 97, 250, 826, 502, 523, 285, 301~
$ total_bedrooms_sinNA_CP <dbl> 371, 561, 358, 390, 517, 166, 1068, 1244, 189, 250, 579, 708, 565, 472, 722, 327, 97, 250, 826, 502, 523, 285, 301~
$ total_rooms_CP <dbl> 2343, 3326, 1185, 2405, 2076, 640, 4329, 6121, 975, 2043, 3079, 2683, 2952, 2498, 3144, 1382, 377, 861, 2516, 2782~
$ population_CP <dbl> 1221, 1544, 1038, 872, 1374, 991, 1913, 3351, 489, 702, 1807, 2047, 1264, 1291, 1510, 1469, 530, 1062, 3350, 1219,~
$ households_CP <dbl> 372, 511, 299, 367, 480, 160, 978, 1171, 202, 246, 589, 636, 517, 487, 680, 355, 96, 231, 713, 507, 496, 272, 265,~
$ median_income_CP <dbl> 4.360100, 2.987500, 0.995100, 5.215500, 2.219700, 1.984400, 4.509400, 8.393100, 4.243400, 8.557412, 4.690000, 2.27~
salida = round(predict(object = xgb_tune,
newdata = dfTest))
#Creación del data set final
dfSalida = data.frame(dfTest$id, salida)
colnames(dfSalida) = c("id", "median_house_value")
dfSalida
#Producimos Salida.
write.csv(dfSalida, "Test13-censoCA.csv", row.names = FALSE, quote = FALSE)