#REGRESIÓN LINEAL MULTIPLE EN R

Importamos librerias

library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
# Para utilizar la funcion one_hot()
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:xts':
## 
##     first, last
library(mltools)
## 
## Attaching package: 'mltools'
## The following object is masked from 'package:PerformanceAnalytics':
## 
##     skewness
#Para transformar a Camel CASE
library(tools)
#Para utilizar kable
library(knitr)
#Para el diagrama de dispersión
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(MASS)
library(ggfortify)
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:MASS':
## 
##     cement
## The following object is masked from 'package:datasets':
## 
##     rivers
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ✔ purrr   0.3.4      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()    masks data.table::between()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks data.table::first(), xts::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::last()       masks data.table::last(), xts::last()
## ✖ tidyr::replace_na() masks mltools::replace_na()
## ✖ dplyr::select()     masks MASS::select()
## ✖ dplyr::src()        masks Hmisc::src()
## ✖ dplyr::summarize()  masks Hmisc::summarize()
## ✖ purrr::transpose()  masks data.table::transpose()

##RECONOCIMIENTO DEL DATASET: CALIFORNIA HOUSING PRICES

Tenemos los datos del precio de las casas para diferentes distritos en California.

Cada fila representa un distrito.

Tenemos 10 atributos:

-Longitud

-Latitud

-Edad media de la casa

-Número de habitaciones

-Número de camas

-Población del distrito

-Hogares en el distrito

-Ingresos medios

-Proximidad al Oceano

#LEEMOS EL DATASET
df <- read.csv('housing.csv')
plot_map = ggplot(df, 
                  aes(x = longitude, y = latitude, color = median_house_value 
                    )) +
              geom_point(aes(size = population), alpha = 0.4) +
              xlab("Longitude") +
              ylab("Latitude") +
              ggtitle("Mapa de Longitud y Latitud : Precio Medio de las Casas") +
              
              scale_color_distiller(palette = "Spectral") +
              labs(color = "Median House Value ($USD)", size = "Población")
plot_map

head(df)
##   longitude latitude housing_median_age total_rooms total_bedrooms population
## 1   -122.23    37.88                 41         880            129        322
## 2   -122.22    37.86                 21        7099           1106       2401
## 3   -122.24    37.85                 52        1467            190        496
## 4   -122.25    37.85                 52        1274            235        558
## 5   -122.25    37.85                 52        1627            280        565
## 6   -122.25    37.85                 52         919            213        413
##   households median_income median_house_value ocean_proximity
## 1        126        8.3252             452600        NEAR BAY
## 2       1138        8.3014             358500        NEAR BAY
## 3        177        7.2574             352100        NEAR BAY
## 4        219        5.6431             341300        NEAR BAY
## 5        259        3.8462             342200        NEAR BAY
## 6        193        4.0368             269700        NEAR BAY
str(df)
## 'data.frame':    20640 obs. of  10 variables:
##  $ longitude         : num  -122 -122 -122 -122 -122 ...
##  $ latitude          : num  37.9 37.9 37.9 37.9 37.9 ...
##  $ housing_median_age: num  41 21 52 52 52 52 52 52 42 52 ...
##  $ total_rooms       : num  880 7099 1467 1274 1627 ...
##  $ total_bedrooms    : num  129 1106 190 235 280 ...
##  $ population        : num  322 2401 496 558 565 ...
##  $ households        : num  126 1138 177 219 259 ...
##  $ median_income     : num  8.33 8.3 7.26 5.64 3.85 ...
##  $ median_house_value: num  452600 358500 352100 341300 342200 ...
##  $ ocean_proximity   : chr  "NEAR BAY" "NEAR BAY" "NEAR BAY" "NEAR BAY" ...
summary(df)
##    longitude         latitude     housing_median_age  total_rooms   
##  Min.   :-124.3   Min.   :32.54   Min.   : 1.00      Min.   :    2  
##  1st Qu.:-121.8   1st Qu.:33.93   1st Qu.:18.00      1st Qu.: 1448  
##  Median :-118.5   Median :34.26   Median :29.00      Median : 2127  
##  Mean   :-119.6   Mean   :35.63   Mean   :28.64      Mean   : 2636  
##  3rd Qu.:-118.0   3rd Qu.:37.71   3rd Qu.:37.00      3rd Qu.: 3148  
##  Max.   :-114.3   Max.   :41.95   Max.   :52.00      Max.   :39320  
##                                                                     
##  total_bedrooms     population      households     median_income    
##  Min.   :   1.0   Min.   :    3   Min.   :   1.0   Min.   : 0.4999  
##  1st Qu.: 296.0   1st Qu.:  787   1st Qu.: 280.0   1st Qu.: 2.5634  
##  Median : 435.0   Median : 1166   Median : 409.0   Median : 3.5348  
##  Mean   : 537.9   Mean   : 1425   Mean   : 499.5   Mean   : 3.8707  
##  3rd Qu.: 647.0   3rd Qu.: 1725   3rd Qu.: 605.0   3rd Qu.: 4.7432  
##  Max.   :6445.0   Max.   :35682   Max.   :6082.0   Max.   :15.0001  
##  NA's   :207                                                        
##  median_house_value ocean_proximity   
##  Min.   : 14999     Length:20640      
##  1st Qu.:119600     Class :character  
##  Median :179700     Mode  :character  
##  Mean   :206856                       
##  3rd Qu.:264725                       
##  Max.   :500001                       
## 

OBSERVEMOS LA VARIABLE DE INTERES: El valor Medio de una Casa de California

hist(df$median_house_value, breaks=100, main="Y : Precio Medio de las Casas", border="darkgoldenrod2", col="darkblue")

OBSERVEMOS UN HISTOGRAMA DE TODOS LOS DATOS

par(mfrow = c(3,3))
hist(df$longitude, breaks = 140, main = "longitude", border="darkorange", col="dodgerblue")
hist(df$latitude, breaks = 140, main = "latitude", border="darkorange", col="dodgerblue")
hist(df$housing_median_age, breaks = 140, main = "housing_median_age", border="darkorange", col="dodgerblue")
hist(df$total_rooms, breaks = 140, main = "total_rooms", border="darkorange", col="dodgerblue")
hist(df$total_bedrooms, breaks = 140, main = "total_bedrooms", border="darkorange", col="dodgerblue")
hist(df$population, breaks = 140, main = "population", border="darkorange", col="dodgerblue")
hist(df$households, breaks = 140, main = "households", border="darkorange", col="dodgerblue")
hist(df$median_income, breaks = 140, main = "median_income", border="darkorange", col="dodgerblue")
hist(df$median_house_value, breaks = 140, main = "median_house_value", border="darkorange", col="dodgerblue")

## OBSERVEMOS UNA DIAGRAMA DE DISPERSIÓN DE TODOS LOS DATOS EXCEPTO LONGITUD Y LATITUD

Z = df[,-1:-2]
ggpairs(Z, columns=1:7, ggplot2::aes(colour=ocean_proximity), progress = FALSE)
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 207 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 207 rows containing missing values
## Warning: Removed 207 rows containing missing values (geom_point).
## Removed 207 rows containing missing values (geom_point).
## Warning: Removed 207 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 207 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 207 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 207 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 207 rows containing missing values
## Warning: Removed 207 rows containing missing values (geom_point).
## Removed 207 rows containing missing values (geom_point).
## Removed 207 rows containing missing values (geom_point).
## Removed 207 rows containing missing values (geom_point).

Z = df[, -10]
Z <- na.omit(Z)
corrmatrix = cor(Z)
t(corrmatrix)
##                      longitude    latitude housing_median_age total_rooms
## longitude           1.00000000 -0.92461611        -0.10935655  0.04548017
## latitude           -0.92461611  1.00000000         0.01189907 -0.03666681
## housing_median_age -0.10935655  0.01189907         1.00000000 -0.36062830
## total_rooms         0.04548017 -0.03666681        -0.36062830  1.00000000
## total_bedrooms      0.06960802 -0.06698283        -0.32045104  0.93037950
## population          0.10027030 -0.10899734        -0.29578730  0.85728125
## households          0.05651277 -0.07177419        -0.30276797  0.91899153
## median_income      -0.01555015 -0.07962632        -0.11827772  0.19788152
## median_house_value -0.04539822 -0.14463821         0.10643205  0.13329413
##                    total_bedrooms   population  households median_income
## longitude              0.06960802  0.100270301  0.05651277  -0.015550150
## latitude              -0.06698283 -0.108997344 -0.07177419  -0.079626319
## housing_median_age    -0.32045104 -0.295787297 -0.30276797  -0.118277723
## total_rooms            0.93037950  0.857281251  0.91899153   0.197881519
## total_bedrooms         1.00000000  0.877746743  0.97972827  -0.007722850
## population             0.87774674  1.000000000  0.90718590   0.005086624
## households             0.97972827  0.907185900  1.00000000   0.013433892
## median_income         -0.00772285  0.005086624  0.01343389   1.000000000
## median_house_value     0.04968618 -0.025299732  0.06489355   0.688355475
##                    median_house_value
## longitude                 -0.04539822
## latitude                  -0.14463821
## housing_median_age         0.10643205
## total_rooms                0.13329413
## total_bedrooms             0.04968618
## population                -0.02529973
## households                 0.06489355
## median_income              0.68835548
## median_house_value         1.00000000

DIAGRAMAS DE DISPERSION

par(mfrow = c(2,3))
plot( df$housing_median_age,  df$median_house_value)
plot(df$total_rooms,df$median_house_value)
plot(df$total_bedrooms, df$median_house_value)
plot(df$population, df$median_house_value)
plot(df$households, df$median_house_value)
plot(df$median_income, df$median_house_value)

DESBALANCE EN ALGUNAS CARACTERISTICAS

summary(df$ocean_proximity) 
##    Length     Class      Mode 
##     20640 character character
ggplot(df, aes(x = factor(ocean_proximity))) + geom_bar(stat = "count", color = "black", fill = "black")+xlab("categoria") +
              ylab("n") +
              ggtitle("Proximidad al Oceano") 

## PROBLEMAS DE DATOS CONCENTRADOS EN UN VALOR PARA ALGUNAS CARACTERISTICAS

par(mfrow = c(1,3))
plot(df$median_house_value~df$median_income, main="MEDIAN_INCOME")
hist(df$median_income, breaks = 140, main = "MEDIAN_INCOME", border="chartreuse3", col="dodgerblue")
out_median_income=boxplot(df$households, main="MEDIAN_INCOME")

par(mfrow = c(1,3))
plot(df$median_house_value~df$total_rooms, main="TOTAL_ROOMS")
hist(df$median_income, breaks = 140, main = "TOTAL_ROOMS", border="chartreuse4", col="darkred")
out_total_rooms=boxplot(df$total_rooms, main="TOTAL_ROOMS")

par(mfrow = c(1,3))
plot(df$median_house_value~df$total_bedrooms, main="TOTAL_BEDROOMS")
hist(df$median_income, breaks = 140, main = "TOTAL_BEDROOMS", border="chartreuse4", col="darkred")
out_total_bedrooms= boxplot(df$total_bedrooms, main="TOTAL_BEDROOMS")

par(mfrow = c(1,3))
plot(df$median_house_value~df$population, main="POPULATION")
hist(df$median_income, breaks = 140, main = "POPULATION", border="chartreuse4", col="darkred")
out_population= boxplot(df$population, main="POPULATION")

par(mfrow = c(1,3))
plot(df$median_house_value~df$households, main="HOUSEHOLDS")
hist(df$households, breaks = 140, main = "HOUSEHOLDS",border="chartreuse4", col="darkred")
out_households= boxplot(df$households, main="HOUSEHOLDS")

par(mfrow = c(1,2))
hist(df$median_house_value, breaks = 140, main = "MEDIAN_HOUSE_VALUE", border="chartreuse3", col="dodgerblue")
out_median_value=boxplot(df$median_house_value, main="MEDIAN_HOUSE_VALUE")

Pasamos las variables categoricas a ‘ONE-HOT’ y dejamos listo ‘df’ para regresión

str(df)
## 'data.frame':    20640 obs. of  10 variables:
##  $ longitude         : num  -122 -122 -122 -122 -122 ...
##  $ latitude          : num  37.9 37.9 37.9 37.9 37.9 ...
##  $ housing_median_age: num  41 21 52 52 52 52 52 52 42 52 ...
##  $ total_rooms       : num  880 7099 1467 1274 1627 ...
##  $ total_bedrooms    : num  129 1106 190 235 280 ...
##  $ population        : num  322 2401 496 558 565 ...
##  $ households        : num  126 1138 177 219 259 ...
##  $ median_income     : num  8.33 8.3 7.26 5.64 3.85 ...
##  $ median_house_value: num  452600 358500 352100 341300 342200 ...
##  $ ocean_proximity   : chr  "NEAR BAY" "NEAR BAY" "NEAR BAY" "NEAR BAY" ...
df$ocean_proximity = as.factor(df$ocean_proximity)
df<- one_hot(as.data.table(df))
colnames(df)[10]  <- "ocean_proximity_1H_OCEAN"
colnames(df)[13]  <- "ocean_proximity_NEAR_BAY"
colnames(df)[14]  <- "ocean_proximity_NEAR_OCEAN"
df <- na.omit(df)
str(df)
## Classes 'data.table' and 'data.frame':   20433 obs. of  14 variables:
##  $ longitude                 : num  -122 -122 -122 -122 -122 ...
##  $ latitude                  : num  37.9 37.9 37.9 37.9 37.9 ...
##  $ housing_median_age        : num  41 21 52 52 52 52 52 52 42 52 ...
##  $ total_rooms               : num  880 7099 1467 1274 1627 ...
##  $ total_bedrooms            : num  129 1106 190 235 280 ...
##  $ population                : num  322 2401 496 558 565 ...
##  $ households                : num  126 1138 177 219 259 ...
##  $ median_income             : num  8.33 8.3 7.26 5.64 3.85 ...
##  $ median_house_value        : num  452600 358500 352100 341300 342200 ...
##  $ ocean_proximity_1H_OCEAN  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ocean_proximity_INLAND    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ocean_proximity_ISLAND    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ocean_proximity_NEAR_BAY  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ ocean_proximity_NEAR_OCEAN: int  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>

DATA CLEANING AND TRANSFORMING

cdf=df #clean_data_frame
#cdf_lon_lat=df #clean_data_frame

Quitamos los valores nulos, los outlayers de MEDIAN_HOUSE_VALUE, la categoria inbalanceada de OCEAN_PROXIMITY y los datos espaciales LONGITUD y LATITUD

#QUITAMOS LOS NAN's
cdf <- na.omit(cdf)
#QUITAMOS 'ISLANDS'
cdf<-cdf[,-12] 
#QUITMOS LONGITUD Y LATITUD
#cdf <- cdf[,-1:-2]
#REMOVEMOS ALGUNOS OUTLAYERS
cdf<-cdf[cdf$median_house_value <500000, ] 
cdf<-cdf[cdf$median_income <15, ] 
cdf<-cdf[cdf$housing_median_age <49, ] 

# QUITAMOS LOS OUTLAYERS
#cdf<- cdf[-which(cdf$total_rooms%in% out_total_rooms),]
#cdf<- cdf[-which(cdf$total_bedrooms%in% out_total_bedrooms),]
#cdf<- cdf[-which(cdf$population%in% out_population),]
#cdf<- cdf[-which(cdf$households%in% out_households),]
#cdf<- cdf[-which(cdf$median_income%in% out_median_income),]
#cdf<- cdf[-which(cdf$median_house_value%in% out_median_value),]


str(cdf)
## Classes 'data.table' and 'data.frame':   18071 obs. of  13 variables:
##  $ longitude                 : num  -122 -122 -122 -122 -122 ...
##  $ latitude                  : num  37.9 37.9 37.8 37.9 37.9 ...
##  $ housing_median_age        : num  41 21 42 40 42 41 48 48 43 40 ...
##  $ total_rooms               : num  880 7099 2555 751 1639 ...
##  $ total_bedrooms            : num  129 1106 665 184 367 ...
##  $ population                : num  322 2401 1206 409 929 ...
##  $ households                : num  126 1138 595 166 366 ...
##  $ median_income             : num  8.33 8.3 2.08 1.36 1.71 ...
##  $ median_house_value        : num  452600 358500 226700 147500 159800 ...
##  $ ocean_proximity_1H_OCEAN  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ocean_proximity_INLAND    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ocean_proximity_NEAR_BAY  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ ocean_proximity_NEAR_OCEAN: int  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>

TRANSFORMAMOS ALGUNAS VARIABLES PARA NORMALIZAR SUS HSITOGRAMAS

cdf$rooms_per_household <- cdf$total_rooms/ cdf$households
cdf$bedrooms_per_room= cdf$total_bedrooms/ cdf$total_rooms
cdf$population_per_household=cdf$population/ cdf$households

cdf$log_total_rooms=log(cdf$total_rooms)
cdf$log_total_bedrooms=log(cdf$total_bedrooms)
cdf$log_population=log(cdf$population)
cdf$log_households=log(cdf$households)
cdf$log_median_income=log(cdf$median_income)
cdf$log_median_house_value=log(cdf$median_house_value)

cdf$log_rooms_per_household <- cdf$log_total_rooms/ cdf$log_households
cdf$log_bedrooms_per_room= cdf$log_total_bedrooms/ cdf$log_total_rooms
cdf$log_population_per_household=cdf$log_population/ cdf$log_households

cdf2=cdf
cdf2$median_house_value=NULL
cdf$log_median_house_value=NULL
str(cdf)
## Classes 'data.table' and 'data.frame':   18071 obs. of  24 variables:
##  $ longitude                   : num  -122 -122 -122 -122 -122 ...
##  $ latitude                    : num  37.9 37.9 37.8 37.9 37.9 ...
##  $ housing_median_age          : num  41 21 42 40 42 41 48 48 43 40 ...
##  $ total_rooms                 : num  880 7099 2555 751 1639 ...
##  $ total_bedrooms              : num  129 1106 665 184 367 ...
##  $ population                  : num  322 2401 1206 409 929 ...
##  $ households                  : num  126 1138 595 166 366 ...
##  $ median_income               : num  8.33 8.3 2.08 1.36 1.71 ...
##  $ median_house_value          : num  452600 358500 226700 147500 159800 ...
##  $ ocean_proximity_1H_OCEAN    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ocean_proximity_INLAND      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ocean_proximity_NEAR_BAY    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ ocean_proximity_NEAR_OCEAN  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rooms_per_household         : num  6.98 6.24 4.29 4.52 4.48 ...
##  $ bedrooms_per_room           : num  0.147 0.156 0.26 0.245 0.224 ...
##  $ population_per_household    : num  2.56 2.11 2.03 2.46 2.54 ...
##  $ log_total_rooms             : num  6.78 8.87 7.85 6.62 7.4 ...
##  $ log_total_bedrooms          : num  4.86 7.01 6.5 5.21 5.91 ...
##  $ log_population              : num  5.77 7.78 7.1 6.01 6.83 ...
##  $ log_households              : num  4.84 7.04 6.39 5.11 5.9 ...
##  $ log_median_income           : num  2.119 2.116 0.733 0.306 0.539 ...
##  $ log_rooms_per_household     : num  1.4 1.26 1.23 1.3 1.25 ...
##  $ log_bedrooms_per_room       : num  0.717 0.79 0.828 0.788 0.798 ...
##  $ log_population_per_household: num  1.19 1.11 1.11 1.18 1.16 ...
##  - attr(*, ".internal.selfref")=<externalptr>
corrmatrix = cor(cdf)
t(corrmatrix)
##                                  longitude     latitude housing_median_age
## longitude                     1.0000000000 -0.922364020        -0.02308773
## latitude                     -0.9223640199  1.000000000        -0.06172033
## housing_median_age           -0.0230877337 -0.061720327         1.00000000
## total_rooms                   0.0336844030 -0.026127082        -0.37476300
## total_bedrooms                0.0630945239 -0.063519194        -0.33130972
## population                    0.0888992366 -0.105798593        -0.27718377
## households                    0.0493879355 -0.069338040        -0.31045185
## median_income                -0.0186060804 -0.074272054        -0.19793210
## median_house_value           -0.0232645793 -0.172011621         0.01381800
## ocean_proximity_1H_OCEAN      0.2854194785 -0.428637043         0.11762475
## ocean_proximity_INLAND       -0.0922112705  0.386166117        -0.21969804
## ocean_proximity_NEAR_BAY     -0.4117854923  0.307068376         0.12337611
## ocean_proximity_NEAR_OCEAN    0.0421879553 -0.162919971         0.03724211
## rooms_per_household          -0.0384494147  0.130983264        -0.17876200
## bedrooms_per_room             0.1157156990 -0.143056445         0.14426166
## population_per_household      0.0004556728  0.002566498         0.02155349
## log_total_rooms               0.0209708958 -0.024666645        -0.32481705
## log_total_bedrooms            0.0576674469 -0.068843342        -0.27763584
## log_population                0.1030874510 -0.141221893        -0.21160782
## log_households                0.0532614785 -0.091996515        -0.24197414
## log_median_income            -0.0198907447 -0.075912691        -0.18403576
## log_rooms_per_household      -0.0700990514  0.151488923        -0.05157560
## log_bedrooms_per_room         0.1105947173 -0.134860118        -0.02612265
## log_population_per_household  0.0623846343 -0.046976566         0.10572349
##                                total_rooms total_bedrooms  population
## longitude                     0.0336844030    0.063094524  0.08889924
## latitude                     -0.0261270816   -0.063519194 -0.10579859
## housing_median_age           -0.3747630026   -0.331309723 -0.27718377
## total_rooms                   1.0000000000    0.935422796  0.86023006
## total_bedrooms                0.9354227956    1.000000000  0.88021200
## population                    0.8602300574    0.880212004  1.00000000
## households                    0.9224226021    0.978746198  0.91068245
## median_income                 0.2227087874    0.020802496  0.04015941
## median_house_value            0.1532913001    0.079970469  0.02227064
## ocean_proximity_1H_OCEAN     -0.0162807691    0.012381030  0.06951887
## ocean_proximity_INLAND        0.0263535919   -0.013435002 -0.03799835
## ocean_proximity_NEAR_BAY      0.0003676967   -0.003761993 -0.03433719
## ocean_proximity_NEAR_OCEAN   -0.0134742604    0.003929836 -0.02140617
## rooms_per_household           0.1397245138    0.013422080 -0.07445637
## bedrooms_per_room            -0.1875404824    0.076759152  0.03409933
## population_per_household     -0.0241525895   -0.027629035  0.06532818
## log_total_rooms               0.7940511233    0.771069090  0.69415683
## log_total_bedrooms            0.7506468980    0.817471436  0.72221616
## log_population                0.6868776974    0.725579093  0.79772220
## log_households                0.7314149724    0.789293297  0.73601806
## log_median_income             0.2322828171    0.042326951  0.05389188
## log_rooms_per_household      -0.1963873711   -0.327411813 -0.35015483
## log_bedrooms_per_room         0.2734459719    0.491888221  0.41129794
## log_population_per_household -0.2756666873   -0.317412823 -0.09722667
##                                households median_income median_house_value
## longitude                     0.049387935  -0.018606080        -0.02326458
## latitude                     -0.069338040  -0.074272054        -0.17201162
## housing_median_age           -0.310451853  -0.197932095         0.01381800
## total_rooms                   0.922422602   0.222708787         0.15329130
## total_bedrooms                0.978746198   0.020802496         0.07997047
## population                    0.910682455   0.040159411         0.02227064
## households                    1.000000000   0.045468585         0.09948996
## median_income                 0.045468585   1.000000000         0.66577485
## median_house_value            0.099489960   0.665774849         1.00000000
## ocean_proximity_1H_OCEAN      0.038132019   0.184800134         0.32103851
## ocean_proximity_INLAND       -0.047228557  -0.228050323        -0.50301677
## ocean_proximity_NEAR_BAY      0.005665539   0.067867661         0.11481952
## ocean_proximity_NEAR_OCEAN    0.006090457  -0.005749940         0.14500129
## rooms_per_household          -0.078929107   0.311129787         0.11363592
## bedrooms_per_room             0.058160385  -0.634558982        -0.23391359
## population_per_household     -0.026534027   0.025866833        -0.01991006
## log_total_rooms               0.769172585   0.234052911         0.17626147
## log_total_bedrooms            0.808594866   0.009043832         0.08548597
## log_population                0.759431749   0.037795960         0.02962732
## log_households                0.815401150   0.042968702         0.11003767
## log_median_income             0.067472361   0.958447829         0.62987265
## log_rooms_per_household      -0.384828841   0.256478728         0.05214824
## log_bedrooms_per_room         0.474746017  -0.478212118        -0.15192829
## log_population_per_household -0.313887439  -0.016494389        -0.15963422
##                              ocean_proximity_1H_OCEAN ocean_proximity_INLAND
## longitude                                 0.285419479            -0.09221127
## latitude                                 -0.428637043             0.38616612
## housing_median_age                        0.117624750            -0.21969804
## total_rooms                              -0.016280769             0.02635359
## total_bedrooms                            0.012381030            -0.01343500
## population                                0.069518865            -0.03799835
## households                                0.038132019            -0.04722856
## median_income                             0.184800134            -0.22805032
## median_house_value                        0.321038511            -0.50301677
## ocean_proximity_1H_OCEAN                  1.000000000            -0.65541248
## ocean_proximity_INLAND                   -0.655412478             1.00000000
## ocean_proximity_NEAR_BAY                 -0.267518416            -0.21501585
## ocean_proximity_NEAR_OCEAN               -0.341321517            -0.27433452
## rooms_per_household                      -0.126176671             0.18085036
## bedrooms_per_room                         0.104289188            -0.14222788
## population_per_household                 -0.002092085             0.01133422
## log_total_rooms                           0.009729217            -0.01091605
## log_total_bedrooms                        0.038781199            -0.05283844
## log_population                            0.116629610            -0.09485921
## log_households                            0.073411382            -0.09864180
## log_median_income                         0.185238287            -0.23232746
## log_rooms_per_household                  -0.144206718             0.19766611
## log_bedrooms_per_room                     0.094314045            -0.12904802
## log_population_per_household              0.033718204             0.04757230
##                              ocean_proximity_NEAR_BAY
## longitude                               -0.4117854923
## latitude                                 0.3070683763
## housing_median_age                       0.1233761099
## total_rooms                              0.0003676967
## total_bedrooms                          -0.0037619932
## population                              -0.0343371915
## households                               0.0056655389
## median_income                            0.0678676612
## median_house_value                       0.1148195192
## ocean_proximity_1H_OCEAN                -0.2675184160
## ocean_proximity_INLAND                  -0.2150158524
## ocean_proximity_NEAR_BAY                 1.0000000000
## ocean_proximity_NEAR_OCEAN              -0.1119745800
## rooms_per_household                     -0.0201352195
## bedrooms_per_room                       -0.0210359696
## population_per_household                -0.0120023879
## log_total_rooms                         -0.0007979923
## log_total_bedrooms                      -0.0080782466
## log_population                          -0.0378127369
## log_households                           0.0036637247
## log_median_income                        0.0633516305
## log_rooms_per_household                 -0.0113660622
## log_bedrooms_per_room                   -0.0232309601
## log_population_per_household            -0.0682139322
##                              ocean_proximity_NEAR_OCEAN rooms_per_household
## longitude                                   0.042187955        -0.038449415
## latitude                                   -0.162919971         0.130983264
## housing_median_age                          0.037242109        -0.178761998
## total_rooms                                -0.013474260         0.139724514
## total_bedrooms                              0.003929836         0.013422080
## population                                 -0.021406165        -0.074456368
## households                                  0.006090457        -0.078929107
## median_income                              -0.005749940         0.311129787
## median_house_value                          0.145001289         0.113635921
## ocean_proximity_1H_OCEAN                   -0.341321517        -0.126176671
## ocean_proximity_INLAND                     -0.274334523         0.180850360
## ocean_proximity_NEAR_BAY                   -0.111974580        -0.020135220
## ocean_proximity_NEAR_OCEAN                  1.000000000        -0.053486583
## rooms_per_household                        -0.053486583         1.000000000
## bedrooms_per_room                           0.064356659        -0.415270347
## population_per_household                   -0.003237739        -0.007216777
## log_total_rooms                             0.002008713         0.139160080
## log_total_bedrooms                          0.024397651         0.001579552
## log_population                             -0.007586525        -0.177676942
## log_households                              0.028601981        -0.162566947
## log_median_income                           0.003446060         0.295219340
## log_rooms_per_household                    -0.057864508         0.659298701
## log_bedrooms_per_room                       0.062433286        -0.298376578
## log_population_per_household               -0.062813422         0.038223326
##                              bedrooms_per_room population_per_household
## longitude                         0.1157156990             0.0004556728
## latitude                         -0.1430564446             0.0025664977
## housing_median_age                0.1442616560             0.0215534933
## total_rooms                      -0.1875404824            -0.0241525895
## total_bedrooms                    0.0767591524            -0.0276290346
## population                        0.0340993331             0.0653281812
## households                        0.0581603852            -0.0265340274
## median_income                    -0.6345589816             0.0258668333
## median_house_value               -0.2339135880            -0.0199100594
## ocean_proximity_1H_OCEAN          0.1042891883            -0.0020920846
## ocean_proximity_INLAND           -0.1422278798             0.0113342227
## ocean_proximity_NEAR_BAY         -0.0210359696            -0.0120023879
## ocean_proximity_NEAR_OCEAN        0.0643566593            -0.0032377391
## rooms_per_household              -0.4152703470            -0.0072167769
## bedrooms_per_room                 1.0000000000             0.0043603587
## population_per_household          0.0043603587             1.0000000000
## log_total_rooms                  -0.2477146086            -0.0835735056
## log_total_bedrooms                0.0617609405            -0.0846318913
## log_population                    0.0172628437             0.0417694650
## log_households                    0.0321968961            -0.0810461733
## log_median_income                -0.6489189054             0.0145874010
## log_rooms_per_household          -0.4207453003             0.0706099308
## log_bedrooms_per_room             0.7211657237            -0.0853377761
## log_population_per_household     -0.0005781063             0.5528646835
##                              log_total_rooms log_total_bedrooms log_population
## longitude                       0.0209708958        0.057667447    0.103087451
## latitude                       -0.0246666450       -0.068843342   -0.141221893
## housing_median_age             -0.3248170511       -0.277635842   -0.211607821
## total_rooms                     0.7940511233        0.750646898    0.686877697
## total_bedrooms                  0.7710690903        0.817471436    0.725579093
## population                      0.6941568286        0.722216158    0.797722200
## households                      0.7691725851        0.808594866    0.759431749
## median_income                   0.2340529114        0.009043832    0.037795960
## median_house_value              0.1762614681        0.085485972    0.029627317
## ocean_proximity_1H_OCEAN        0.0097292173        0.038781199    0.116629610
## ocean_proximity_INLAND         -0.0109160491       -0.052838437   -0.094859210
## ocean_proximity_NEAR_BAY       -0.0007979923       -0.008078247   -0.037812737
## ocean_proximity_NEAR_OCEAN      0.0020087127        0.024397651   -0.007586525
## rooms_per_household             0.1391600797        0.001579552   -0.177676942
## bedrooms_per_room              -0.2477146086        0.061760940    0.017262844
## population_per_household       -0.0835735056       -0.084631891    0.041769465
## log_total_rooms                 1.0000000000        0.949123739    0.863753047
## log_total_bedrooms              0.9491237386        1.000000000    0.895265226
## log_population                  0.8637530465        0.895265226    1.000000000
## log_households                  0.9326880798        0.972298862    0.933558108
## log_median_income               0.2671505080        0.045589068    0.069285155
## log_rooms_per_household        -0.3705381193       -0.520693264   -0.601721079
## log_bedrooms_per_room           0.4350723248        0.686033805    0.586311406
## log_population_per_household   -0.4906483419       -0.511370349   -0.230411960
##                              log_households log_median_income
## longitude                       0.053261479       -0.01989074
## latitude                       -0.091996515       -0.07591269
## housing_median_age             -0.241974142       -0.18403576
## total_rooms                     0.731414972        0.23228282
## total_bedrooms                  0.789293297        0.04232695
## population                      0.736018062        0.05389188
## households                      0.815401150        0.06747236
## median_income                   0.042968702        0.95844783
## median_house_value              0.110037671        0.62987265
## ocean_proximity_1H_OCEAN        0.073411382        0.18523829
## ocean_proximity_INLAND         -0.098641795       -0.23232746
## ocean_proximity_NEAR_BAY        0.003663725        0.06335163
## ocean_proximity_NEAR_OCEAN      0.028601981        0.00344606
## rooms_per_household            -0.162566947        0.29521934
## bedrooms_per_room               0.032196896       -0.64891891
## population_per_household       -0.081046173        0.01458740
## log_total_rooms                 0.932688080        0.26715051
## log_total_bedrooms              0.972298862        0.04558907
## log_population                  0.933558108        0.06928515
## log_households                  1.000000000        0.08102491
## log_median_income               0.081024906        1.00000000
## log_rooms_per_household        -0.630269747        0.22346051
## log_bedrooms_per_room           0.645308828       -0.45100734
## log_population_per_household   -0.507470925       -0.04566184
##                              log_rooms_per_household log_bedrooms_per_room
## longitude                                -0.07009905            0.11059472
## latitude                                  0.15148892           -0.13486012
## housing_median_age                       -0.05157560           -0.02612265
## total_rooms                              -0.19638737            0.27344597
## total_bedrooms                           -0.32741181            0.49188822
## population                               -0.35015483            0.41129794
## households                               -0.38482884            0.47474602
## median_income                             0.25647873           -0.47821212
## median_house_value                        0.05214824           -0.15192829
## ocean_proximity_1H_OCEAN                 -0.14420672            0.09431404
## ocean_proximity_INLAND                    0.19766611           -0.12904802
## ocean_proximity_NEAR_BAY                 -0.01136606           -0.02323096
## ocean_proximity_NEAR_OCEAN               -0.05786451            0.06243329
## rooms_per_household                       0.65929870           -0.29837658
## bedrooms_per_room                        -0.42074530            0.72116572
## population_per_household                  0.07060993           -0.08533778
## log_total_rooms                          -0.37053812            0.43507232
## log_total_bedrooms                       -0.52069326            0.68603380
## log_population                           -0.60172108            0.58631141
## log_households                           -0.63026975            0.64530883
## log_median_income                         0.22346051           -0.45100734
## log_rooms_per_household                   1.00000000           -0.69434825
## log_bedrooms_per_room                    -0.69434825            1.00000000
## log_population_per_household              0.46168538           -0.39208473
##                              log_population_per_household
## longitude                                    0.0623846343
## latitude                                    -0.0469765656
## housing_median_age                           0.1057234910
## total_rooms                                 -0.2756666873
## total_bedrooms                              -0.3174128230
## population                                  -0.0972266706
## households                                  -0.3138874388
## median_income                               -0.0164943895
## median_house_value                          -0.1596342228
## ocean_proximity_1H_OCEAN                     0.0337182043
## ocean_proximity_INLAND                       0.0475723024
## ocean_proximity_NEAR_BAY                    -0.0682139322
## ocean_proximity_NEAR_OCEAN                  -0.0628134216
## rooms_per_household                          0.0382233257
## bedrooms_per_room                           -0.0005781063
## population_per_household                     0.5528646835
## log_total_rooms                             -0.4906483419
## log_total_bedrooms                          -0.5113703493
## log_population                              -0.2304119597
## log_households                              -0.5074709249
## log_median_income                           -0.0456618362
## log_rooms_per_household                      0.4616853762
## log_bedrooms_per_room                       -0.3920847272
## log_population_per_household                 1.0000000000
corrmatrix = cor(cdf2)
t(corrmatrix)
##                                  longitude     latitude housing_median_age
## longitude                     1.0000000000 -0.922364020       -0.023087734
## latitude                     -0.9223640199  1.000000000       -0.061720327
## housing_median_age           -0.0230877337 -0.061720327        1.000000000
## total_rooms                   0.0336844030 -0.026127082       -0.374763003
## total_bedrooms                0.0630945239 -0.063519194       -0.331309723
## population                    0.0888992366 -0.105798593       -0.277183767
## households                    0.0493879355 -0.069338040       -0.310451853
## median_income                -0.0186060804 -0.074272054       -0.197932095
## ocean_proximity_1H_OCEAN      0.2854194785 -0.428637043        0.117624750
## ocean_proximity_INLAND       -0.0922112705  0.386166117       -0.219698042
## ocean_proximity_NEAR_BAY     -0.4117854923  0.307068376        0.123376110
## ocean_proximity_NEAR_OCEAN    0.0421879553 -0.162919971        0.037242109
## rooms_per_household          -0.0384494147  0.130983264       -0.178761998
## bedrooms_per_room             0.1157156990 -0.143056445        0.144261656
## population_per_household      0.0004556728  0.002566498        0.021553493
## log_total_rooms               0.0209708958 -0.024666645       -0.324817051
## log_total_bedrooms            0.0576674469 -0.068843342       -0.277635842
## log_population                0.1030874510 -0.141221893       -0.211607821
## log_households                0.0532614785 -0.091996515       -0.241974142
## log_median_income            -0.0198907447 -0.075912691       -0.184035763
## log_median_house_value        0.0028206135 -0.217282976       -0.002693287
## log_rooms_per_household      -0.0700990514  0.151488923       -0.051575601
## log_bedrooms_per_room         0.1105947173 -0.134860118       -0.026122647
## log_population_per_household  0.0623846343 -0.046976566        0.105723491
##                                total_rooms total_bedrooms  population
## longitude                     0.0336844030    0.063094524  0.08889924
## latitude                     -0.0261270816   -0.063519194 -0.10579859
## housing_median_age           -0.3747630026   -0.331309723 -0.27718377
## total_rooms                   1.0000000000    0.935422796  0.86023006
## total_bedrooms                0.9354227956    1.000000000  0.88021200
## population                    0.8602300574    0.880212004  1.00000000
## households                    0.9224226021    0.978746198  0.91068245
## median_income                 0.2227087874    0.020802496  0.04015941
## ocean_proximity_1H_OCEAN     -0.0162807691    0.012381030  0.06951887
## ocean_proximity_INLAND        0.0263535919   -0.013435002 -0.03799835
## ocean_proximity_NEAR_BAY      0.0003676967   -0.003761993 -0.03433719
## ocean_proximity_NEAR_OCEAN   -0.0134742604    0.003929836 -0.02140617
## rooms_per_household           0.1397245138    0.013422080 -0.07445637
## bedrooms_per_room            -0.1875404824    0.076759152  0.03409933
## population_per_household     -0.0241525895   -0.027629035  0.06532818
## log_total_rooms               0.7940511233    0.771069090  0.69415683
## log_total_bedrooms            0.7506468980    0.817471436  0.72221616
## log_population                0.6868776974    0.725579093  0.79772220
## log_households                0.7314149724    0.789293297  0.73601806
## log_median_income             0.2322828171    0.042326951  0.05389188
## log_median_house_value        0.1652561028    0.100893149  0.05226457
## log_rooms_per_household      -0.1963873711   -0.327411813 -0.35015483
## log_bedrooms_per_room         0.2734459719    0.491888221  0.41129794
## log_population_per_household -0.2756666873   -0.317412823 -0.09722667
##                                households median_income
## longitude                     0.049387935  -0.018606080
## latitude                     -0.069338040  -0.074272054
## housing_median_age           -0.310451853  -0.197932095
## total_rooms                   0.922422602   0.222708787
## total_bedrooms                0.978746198   0.020802496
## population                    0.910682455   0.040159411
## households                    1.000000000   0.045468585
## median_income                 0.045468585   1.000000000
## ocean_proximity_1H_OCEAN      0.038132019   0.184800134
## ocean_proximity_INLAND       -0.047228557  -0.228050323
## ocean_proximity_NEAR_BAY      0.005665539   0.067867661
## ocean_proximity_NEAR_OCEAN    0.006090457  -0.005749940
## rooms_per_household          -0.078929107   0.311129787
## bedrooms_per_room             0.058160385  -0.634558982
## population_per_household     -0.026534027   0.025866833
## log_total_rooms               0.769172585   0.234052911
## log_total_bedrooms            0.808594866   0.009043832
## log_population                0.759431749   0.037795960
## log_households                0.815401150   0.042968702
## log_median_income             0.067472361   0.958447829
## log_median_house_value        0.122602210   0.657736638
## log_rooms_per_household      -0.384828841   0.256478728
## log_bedrooms_per_room         0.474746017  -0.478212118
## log_population_per_household -0.313887439  -0.016494389
##                              ocean_proximity_1H_OCEAN ocean_proximity_INLAND
## longitude                                 0.285419479            -0.09221127
## latitude                                 -0.428637043             0.38616612
## housing_median_age                        0.117624750            -0.21969804
## total_rooms                              -0.016280769             0.02635359
## total_bedrooms                            0.012381030            -0.01343500
## population                                0.069518865            -0.03799835
## households                                0.038132019            -0.04722856
## median_income                             0.184800134            -0.22805032
## ocean_proximity_1H_OCEAN                  1.000000000            -0.65541248
## ocean_proximity_INLAND                   -0.655412478             1.00000000
## ocean_proximity_NEAR_BAY                 -0.267518416            -0.21501585
## ocean_proximity_NEAR_OCEAN               -0.341321517            -0.27433452
## rooms_per_household                      -0.126176671             0.18085036
## bedrooms_per_room                         0.104289188            -0.14222788
## population_per_household                 -0.002092085             0.01133422
## log_total_rooms                           0.009729217            -0.01091605
## log_total_bedrooms                        0.038781199            -0.05283844
## log_population                            0.116629610            -0.09485921
## log_households                            0.073411382            -0.09864180
## log_median_income                         0.185238287            -0.23232746
## log_median_house_value                    0.382798776            -0.56963529
## log_rooms_per_household                  -0.144206718             0.19766611
## log_bedrooms_per_room                     0.094314045            -0.12904802
## log_population_per_household              0.033718204             0.04757230
##                              ocean_proximity_NEAR_BAY
## longitude                               -0.4117854923
## latitude                                 0.3070683763
## housing_median_age                       0.1233761099
## total_rooms                              0.0003676967
## total_bedrooms                          -0.0037619932
## population                              -0.0343371915
## households                               0.0056655389
## median_income                            0.0678676612
## ocean_proximity_1H_OCEAN                -0.2675184160
## ocean_proximity_INLAND                  -0.2150158524
## ocean_proximity_NEAR_BAY                 1.0000000000
## ocean_proximity_NEAR_OCEAN              -0.1119745800
## rooms_per_household                     -0.0201352195
## bedrooms_per_room                       -0.0210359696
## population_per_household                -0.0120023879
## log_total_rooms                         -0.0007979923
## log_total_bedrooms                      -0.0080782466
## log_population                          -0.0378127369
## log_households                           0.0036637247
## log_median_income                        0.0633516305
## log_median_house_value                   0.1184683725
## log_rooms_per_household                 -0.0113660622
## log_bedrooms_per_room                   -0.0232309601
## log_population_per_household            -0.0682139322
##                              ocean_proximity_NEAR_OCEAN rooms_per_household
## longitude                                   0.042187955        -0.038449415
## latitude                                   -0.162919971         0.130983264
## housing_median_age                          0.037242109        -0.178761998
## total_rooms                                -0.013474260         0.139724514
## total_bedrooms                              0.003929836         0.013422080
## population                                 -0.021406165        -0.074456368
## households                                  0.006090457        -0.078929107
## median_income                              -0.005749940         0.311129787
## ocean_proximity_1H_OCEAN                   -0.341321517        -0.126176671
## ocean_proximity_INLAND                     -0.274334523         0.180850360
## ocean_proximity_NEAR_BAY                   -0.111974580        -0.020135220
## ocean_proximity_NEAR_OCEAN                  1.000000000        -0.053486583
## rooms_per_household                        -0.053486583         1.000000000
## bedrooms_per_room                           0.064356659        -0.415270347
## population_per_household                   -0.003237739        -0.007216777
## log_total_rooms                             0.002008713         0.139160080
## log_total_bedrooms                          0.024397651         0.001579552
## log_population                             -0.007586525        -0.177676942
## log_households                              0.028601981        -0.162566947
## log_median_income                           0.003446060         0.295219340
## log_median_house_value                      0.145005058         0.098044494
## log_rooms_per_household                    -0.057864508         0.659298701
## log_bedrooms_per_room                       0.062433286        -0.298376578
## log_population_per_household               -0.062813422         0.038223326
##                              bedrooms_per_room population_per_household
## longitude                         0.1157156990             0.0004556728
## latitude                         -0.1430564446             0.0025664977
## housing_median_age                0.1442616560             0.0215534933
## total_rooms                      -0.1875404824            -0.0241525895
## total_bedrooms                    0.0767591524            -0.0276290346
## population                        0.0340993331             0.0653281812
## households                        0.0581603852            -0.0265340274
## median_income                    -0.6345589816             0.0258668333
## ocean_proximity_1H_OCEAN          0.1042891883            -0.0020920846
## ocean_proximity_INLAND           -0.1422278798             0.0113342227
## ocean_proximity_NEAR_BAY         -0.0210359696            -0.0120023879
## ocean_proximity_NEAR_OCEAN        0.0643566593            -0.0032377391
## rooms_per_household              -0.4152703470            -0.0072167769
## bedrooms_per_room                 1.0000000000             0.0043603587
## population_per_household          0.0043603587             1.0000000000
## log_total_rooms                  -0.2477146086            -0.0835735056
## log_total_bedrooms                0.0617609405            -0.0846318913
## log_population                    0.0172628437             0.0417694650
## log_households                    0.0321968961            -0.0810461733
## log_median_income                -0.6489189054             0.0145874010
## log_median_house_value           -0.2182050354            -0.0188189580
## log_rooms_per_household          -0.4207453003             0.0706099308
## log_bedrooms_per_room             0.7211657237            -0.0853377761
## log_population_per_household     -0.0005781063             0.5528646835
##                              log_total_rooms log_total_bedrooms log_population
## longitude                       0.0209708958        0.057667447    0.103087451
## latitude                       -0.0246666450       -0.068843342   -0.141221893
## housing_median_age             -0.3248170511       -0.277635842   -0.211607821
## total_rooms                     0.7940511233        0.750646898    0.686877697
## total_bedrooms                  0.7710690903        0.817471436    0.725579093
## population                      0.6941568286        0.722216158    0.797722200
## households                      0.7691725851        0.808594866    0.759431749
## median_income                   0.2340529114        0.009043832    0.037795960
## ocean_proximity_1H_OCEAN        0.0097292173        0.038781199    0.116629610
## ocean_proximity_INLAND         -0.0109160491       -0.052838437   -0.094859210
## ocean_proximity_NEAR_BAY       -0.0007979923       -0.008078247   -0.037812737
## ocean_proximity_NEAR_OCEAN      0.0020087127        0.024397651   -0.007586525
## rooms_per_household             0.1391600797        0.001579552   -0.177676942
## bedrooms_per_room              -0.2477146086        0.061760940    0.017262844
## population_per_household       -0.0835735056       -0.084631891    0.041769465
## log_total_rooms                 1.0000000000        0.949123739    0.863753047
## log_total_bedrooms              0.9491237386        1.000000000    0.895265226
## log_population                  0.8637530465        0.895265226    1.000000000
## log_households                  0.9326880798        0.972298862    0.933558108
## log_median_income               0.2671505080        0.045589068    0.069285155
## log_median_house_value          0.1945087697        0.110101047    0.067248042
## log_rooms_per_household        -0.3705381193       -0.520693264   -0.601721079
## log_bedrooms_per_room           0.4350723248        0.686033805    0.586311406
## log_population_per_household   -0.4906483419       -0.511370349   -0.230411960
##                              log_households log_median_income
## longitude                       0.053261479       -0.01989074
## latitude                       -0.091996515       -0.07591269
## housing_median_age             -0.241974142       -0.18403576
## total_rooms                     0.731414972        0.23228282
## total_bedrooms                  0.789293297        0.04232695
## population                      0.736018062        0.05389188
## households                      0.815401150        0.06747236
## median_income                   0.042968702        0.95844783
## ocean_proximity_1H_OCEAN        0.073411382        0.18523829
## ocean_proximity_INLAND         -0.098641795       -0.23232746
## ocean_proximity_NEAR_BAY        0.003663725        0.06335163
## ocean_proximity_NEAR_OCEAN      0.028601981        0.00344606
## rooms_per_household            -0.162566947        0.29521934
## bedrooms_per_room               0.032196896       -0.64891891
## population_per_household       -0.081046173        0.01458740
## log_total_rooms                 0.932688080        0.26715051
## log_total_bedrooms              0.972298862        0.04558907
## log_population                  0.933558108        0.06928515
## log_households                  1.000000000        0.08102491
## log_median_income               0.081024906        1.00000000
## log_median_house_value          0.138879114        0.65700179
## log_rooms_per_household        -0.630269747        0.22346051
## log_bedrooms_per_room           0.645308828       -0.45100734
## log_population_per_household   -0.507470925       -0.04566184
##                              log_median_house_value log_rooms_per_household
## longitude                               0.002820613             -0.07009905
## latitude                               -0.217282976              0.15148892
## housing_median_age                     -0.002693287             -0.05157560
## total_rooms                             0.165256103             -0.19638737
## total_bedrooms                          0.100893149             -0.32741181
## population                              0.052264569             -0.35015483
## households                              0.122602210             -0.38482884
## median_income                           0.657736638              0.25647873
## ocean_proximity_1H_OCEAN                0.382798776             -0.14420672
## ocean_proximity_INLAND                 -0.569635289              0.19766611
## ocean_proximity_NEAR_BAY                0.118468373             -0.01136606
## ocean_proximity_NEAR_OCEAN              0.145005058             -0.05786451
## rooms_per_household                     0.098044494              0.65929870
## bedrooms_per_room                      -0.218205035             -0.42074530
## population_per_household               -0.018818958              0.07060993
## log_total_rooms                         0.194508770             -0.37053812
## log_total_bedrooms                      0.110101047             -0.52069326
## log_population                          0.067248042             -0.60172108
## log_households                          0.138879114             -0.63026975
## log_median_income                       0.657001793              0.22346051
## log_median_house_value                  1.000000000              0.02077848
## log_rooms_per_household                 0.020778480              1.00000000
## log_bedrooms_per_room                  -0.125049088             -0.69434825
## log_population_per_household           -0.156609388              0.46168538
##                              log_bedrooms_per_room log_population_per_household
## longitude                               0.11059472                 0.0623846343
## latitude                               -0.13486012                -0.0469765656
## housing_median_age                     -0.02612265                 0.1057234910
## total_rooms                             0.27344597                -0.2756666873
## total_bedrooms                          0.49188822                -0.3174128230
## population                              0.41129794                -0.0972266706
## households                              0.47474602                -0.3138874388
## median_income                          -0.47821212                -0.0164943895
## ocean_proximity_1H_OCEAN                0.09431404                 0.0337182043
## ocean_proximity_INLAND                 -0.12904802                 0.0475723024
## ocean_proximity_NEAR_BAY               -0.02323096                -0.0682139322
## ocean_proximity_NEAR_OCEAN              0.06243329                -0.0628134216
## rooms_per_household                    -0.29837658                 0.0382233257
## bedrooms_per_room                       0.72116572                -0.0005781063
## population_per_household               -0.08533778                 0.5528646835
## log_total_rooms                         0.43507232                -0.4906483419
## log_total_bedrooms                      0.68603380                -0.5113703493
## log_population                          0.58631141                -0.2304119597
## log_households                          0.64530883                -0.5074709249
## log_median_income                      -0.45100734                -0.0456618362
## log_median_house_value                 -0.12504909                -0.1566093882
## log_rooms_per_household                -0.69434825                 0.4616853762
## log_bedrooms_per_room                   1.00000000                -0.3920847272
## log_population_per_household           -0.39208473                 1.0000000000

CREEMOS UN 3 DATAFRAME ESTA VEZ ESCALANDO LOS DATOS

sdf=df #clean_data_frame
summary(sdf)
##    longitude         latitude     housing_median_age  total_rooms   
##  Min.   :-124.3   Min.   :32.54   Min.   : 1.00      Min.   :    2  
##  1st Qu.:-121.8   1st Qu.:33.93   1st Qu.:18.00      1st Qu.: 1450  
##  Median :-118.5   Median :34.26   Median :29.00      Median : 2127  
##  Mean   :-119.6   Mean   :35.63   Mean   :28.63      Mean   : 2636  
##  3rd Qu.:-118.0   3rd Qu.:37.72   3rd Qu.:37.00      3rd Qu.: 3143  
##  Max.   :-114.3   Max.   :41.95   Max.   :52.00      Max.   :39320  
##  total_bedrooms     population      households     median_income    
##  Min.   :   1.0   Min.   :    3   Min.   :   1.0   Min.   : 0.4999  
##  1st Qu.: 296.0   1st Qu.:  787   1st Qu.: 280.0   1st Qu.: 2.5637  
##  Median : 435.0   Median : 1166   Median : 409.0   Median : 3.5365  
##  Mean   : 537.9   Mean   : 1425   Mean   : 499.4   Mean   : 3.8712  
##  3rd Qu.: 647.0   3rd Qu.: 1722   3rd Qu.: 604.0   3rd Qu.: 4.7440  
##  Max.   :6445.0   Max.   :35682   Max.   :6082.0   Max.   :15.0001  
##  median_house_value ocean_proximity_1H_OCEAN ocean_proximity_INLAND
##  Min.   : 14999     Min.   :0.0000           Min.   :0.0000        
##  1st Qu.:119500     1st Qu.:0.0000           1st Qu.:0.0000        
##  Median :179700     Median :0.0000           Median :0.0000        
##  Mean   :206864     Mean   :0.4421           Mean   :0.3179        
##  3rd Qu.:264700     3rd Qu.:1.0000           3rd Qu.:1.0000        
##  Max.   :500001     Max.   :1.0000           Max.   :1.0000        
##  ocean_proximity_ISLAND ocean_proximity_NEAR_BAY ocean_proximity_NEAR_OCEAN
##  Min.   :0.0000000      Min.   :0.0000           Min.   :0.0000            
##  1st Qu.:0.0000000      1st Qu.:0.0000           1st Qu.:0.0000            
##  Median :0.0000000      Median :0.0000           Median :0.0000            
##  Mean   :0.0002447      Mean   :0.1111           Mean   :0.1286            
##  3rd Qu.:0.0000000      3rd Qu.:0.0000           3rd Qu.:0.0000            
##  Max.   :1.0000000      Max.   :1.0000           Max.   :1.0000
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
#QUITAMOS LOS NAN's



#QUITAMOS 'ISLANDS'
sdf<-sdf[,-12] 
#QUITMOS LONGITUD Y LATITUD
sdf <- sdf[,-1:-2]
#REMOVEMOS ALGUNOS OUTLAYERS
sdf<-sdf[sdf$median_house_value <500000, ] 
sdf<-sdf[sdf$median_income <15, ] 
sdf<-sdf[sdf$housing_median_age <49, ] 


sdf$rooms_per_household <- sdf$total_rooms/ sdf$households
sdf$bedrooms_per_room= sdf$total_bedrooms/ sdf$total_rooms
sdf$population_per_household=sdf$population/ sdf$households

sdf$median_house_value=rescale(sdf$median_house_value,to=c(1,15))
sdf$housing_median_age=rescale(sdf$housing_median_age,to=c(1,15))
sdf$total_rooms=rescale(sdf$total_rooms,to=c(1,15))
sdf$total_bedrooms=rescale(sdf$total_bedrooms,to=c(1,15))
sdf$population=rescale(sdf$population,to=c(1,15))
sdf$households=rescale(sdf$households,to=c(1,15))
sdf$rooms_per_household=rescale(sdf$rooms_per_household,to=c(1,15))
sdf$bedrooms_per_room=rescale(sdf$bedrooms_per_room,to=c(1,15))
sdf$population_per_household=rescale(sdf$population_per_household,to=c(1,15))


sdf$log_total_rooms=log(sdf$total_rooms)
sdf$log_total_bedrooms=log(sdf$total_bedrooms)
sdf$log_population=log(sdf$population)
sdf$log_households=log(sdf$households)
sdf$log_median_income=log(sdf$median_income)
sdf$log_median_house_value=
sdf$log_rooms_per_household <- log(sdf$rooms_per_household)
sdf$log_bedrooms_per_room= log(sdf$bedrooms_per_room)
sdf$log_population_per_household=log(sdf$population_per_household)

sdf2=sdf
sdf2$median_house_value=NULL
sdf$log_median_house_value=NULL

summary(sdf)
##  housing_median_age  total_rooms     total_bedrooms     population    
##  Min.   : 1.000     Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 5.766     1st Qu.: 1.520   1st Qu.: 1.650   1st Qu.: 1.319  
##  Median : 8.745     Median : 1.766   Median : 1.958   Median : 1.474  
##  Mean   : 8.627     Mean   : 1.955   Mean   : 2.192   Mean   : 1.579  
##  3rd Qu.:11.128     3rd Qu.: 2.139   3rd Qu.: 2.432   3rd Qu.: 1.700  
##  Max.   :15.000     Max.   :15.000   Max.   :15.000   Max.   :15.000  
##    households     median_income     median_house_value ocean_proximity_1H_OCEAN
##  Min.   : 1.000   Min.   : 0.4999   Min.   : 1.000     Min.   :0.0000          
##  1st Qu.: 1.652   1st Qu.: 2.5429   1st Qu.: 3.901     1st Qu.:0.0000          
##  Median : 1.956   Median : 3.4821   Median : 5.514     Median :0.0000          
##  Mean   : 2.172   Mean   : 3.7014   Mean   : 6.037     Mean   :0.4492          
##  3rd Qu.: 2.414   3rd Qu.: 4.6250   3rd Qu.: 7.584     3rd Qu.:1.0000          
##  Max.   :15.000   Max.   :13.1477   Max.   :15.000     Max.   :1.0000          
##  ocean_proximity_INLAND ocean_proximity_NEAR_BAY ocean_proximity_NEAR_OCEAN
##  Min.   :0.000          Min.   :0.00000          Min.   :0.000             
##  1st Qu.:0.000          1st Qu.:0.00000          1st Qu.:0.000             
##  Median :0.000          Median :0.00000          Median :0.000             
##  Mean   :0.345          Mean   :0.08068          Mean   :0.125             
##  3rd Qu.:1.000          3rd Qu.:0.00000          3rd Qu.:0.000             
##  Max.   :1.000          Max.   :1.00000          Max.   :1.000             
##  rooms_per_household bedrooms_per_room population_per_household
##  Min.   : 1.000      Min.   : 1.000    Min.   : 1.000          
##  1st Qu.: 1.382      1st Qu.: 2.195    1st Qu.: 1.020          
##  Median : 1.465      Median : 2.611    Median : 1.024          
##  Mean   : 1.484      Mean   : 2.769    Mean   : 1.027          
##  3rd Qu.: 1.548      3rd Qu.: 3.179    3rd Qu.: 1.030          
##  Max.   :15.000      Max.   :15.000    Max.   :15.000          
##  log_total_rooms  log_total_bedrooms log_population   log_households  
##  Min.   :0.0000   Min.   :0.0000     Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.4186   1st Qu.:0.5006     1st Qu.:0.2769   1st Qu.:0.5018  
##  Median :0.5685   Median :0.6721     Median :0.3880   Median :0.6707  
##  Mean   :0.6163   Mean   :0.7218     Mean   :0.4270   Mean   :0.7153  
##  3rd Qu.:0.7603   3rd Qu.:0.8887     3rd Qu.:0.5306   3rd Qu.:0.8812  
##  Max.   :2.7081   Max.   :2.7081     Max.   :2.7081   Max.   :2.7081  
##  log_median_income log_rooms_per_household log_bedrooms_per_room
##  Min.   :-0.6933   Min.   :0.0000          Min.   :0.0000       
##  1st Qu.: 0.9333   1st Qu.:0.3233          1st Qu.:0.7864       
##  Median : 1.2476   Median :0.3819          Median :0.9596       
##  Mean   : 1.2174   Mean   :0.3868          Mean   :0.9774       
##  3rd Qu.: 1.5315   3rd Qu.:0.4369          3rd Qu.:1.1566       
##  Max.   : 2.5762   Max.   :2.7081          Max.   :2.7081       
##  log_population_per_household
##  Min.   :0.00000             
##  1st Qu.:0.01986             
##  Median :0.02413             
##  Mean   :0.02581             
##  3rd Qu.:0.02921             
##  Max.   :2.70805
corrmatrix = cor(sdf)
t(corrmatrix)
##                              housing_median_age   total_rooms total_bedrooms
## housing_median_age                   1.00000000 -0.3747630026   -0.331309723
## total_rooms                         -0.37476300  1.0000000000    0.935422796
## total_bedrooms                      -0.33130972  0.9354227956    1.000000000
## population                          -0.27718377  0.8602300574    0.880212004
## households                          -0.31045185  0.9224226021    0.978746198
## median_income                       -0.19793210  0.2227087874    0.020802496
## median_house_value                   0.01381800  0.1532913001    0.079970469
## ocean_proximity_1H_OCEAN             0.11762475 -0.0162807691    0.012381030
## ocean_proximity_INLAND              -0.21969804  0.0263535919   -0.013435002
## ocean_proximity_NEAR_BAY             0.12337611  0.0003676967   -0.003761993
## ocean_proximity_NEAR_OCEAN           0.03724211 -0.0134742604    0.003929836
## rooms_per_household                 -0.17876200  0.1397245138    0.013422080
## bedrooms_per_room                    0.14426166 -0.1875404824    0.076759152
## population_per_household             0.02155349 -0.0241525895   -0.027629035
## log_total_rooms                     -0.39347049  0.9419308631    0.900431077
## log_total_bedrooms                  -0.33580108  0.8708651621    0.946099399
## log_population                      -0.28268819  0.8254358961    0.861862266
## log_households                      -0.30696984  0.8565500468    0.922279441
## log_median_income                   -0.18403576  0.2322828171    0.042326951
## log_rooms_per_household             -0.23622198  0.1979466718    0.008893265
## log_bedrooms_per_room                0.17686465 -0.2005556579    0.078948657
## log_population_per_household         0.03483904 -0.0522278774   -0.063176207
##                               population   households median_income
## housing_median_age           -0.27718377 -0.310451853  -0.197932095
## total_rooms                   0.86023006  0.922422602   0.222708787
## total_bedrooms                0.88021200  0.978746198   0.020802496
## population                    1.00000000  0.910682455   0.040159411
## households                    0.91068245  1.000000000   0.045468585
## median_income                 0.04015941  0.045468585   1.000000000
## median_house_value            0.02227064  0.099489960   0.665774849
## ocean_proximity_1H_OCEAN      0.06951887  0.038132019   0.184800134
## ocean_proximity_INLAND       -0.03799835 -0.047228557  -0.228050323
## ocean_proximity_NEAR_BAY     -0.03433719  0.005665539   0.067867661
## ocean_proximity_NEAR_OCEAN   -0.02140617  0.006090457  -0.005749940
## rooms_per_household          -0.07445637 -0.078929107   0.311129787
## bedrooms_per_room             0.03409933  0.058160385  -0.634558982
## population_per_household      0.06532818 -0.026534027   0.025866833
## log_total_rooms               0.81323363  0.892859889   0.250995266
## log_total_bedrooms            0.83073874  0.930785486   0.010404776
## log_population                0.95460000  0.894514840   0.036226741
## log_households                0.85452266  0.946890218   0.042210551
## log_median_income             0.05389188  0.067472361   0.958447829
## log_rooms_per_household      -0.07534127 -0.082167389   0.465248251
## log_bedrooms_per_room         0.02771288  0.057217926  -0.723887316
## log_population_per_household  0.10432185 -0.057916990   0.007957251
##                              median_house_value ocean_proximity_1H_OCEAN
## housing_median_age                   0.01381800              0.117624750
## total_rooms                          0.15329130             -0.016280769
## total_bedrooms                       0.07997047              0.012381030
## population                           0.02227064              0.069518865
## households                           0.09948996              0.038132019
## median_income                        0.66577485              0.184800134
## median_house_value                   1.00000000              0.321038511
## ocean_proximity_1H_OCEAN             0.32103851              1.000000000
## ocean_proximity_INLAND              -0.50301677             -0.655412478
## ocean_proximity_NEAR_BAY             0.11481952             -0.267518416
## ocean_proximity_NEAR_OCEAN           0.14500129             -0.341321517
## rooms_per_household                  0.11363592             -0.126176671
## bedrooms_per_room                   -0.23391359              0.104289188
## population_per_household            -0.01991006             -0.002092085
## log_total_rooms                      0.18386617             -0.010962608
## log_total_bedrooms                   0.09091698              0.021539760
## log_population                       0.02471486              0.090670654
## log_households                       0.11432395              0.053704989
## log_median_income                    0.62987265              0.185238287
## log_rooms_per_household              0.17234770             -0.158836967
## log_bedrooms_per_room               -0.30481258              0.084508064
## log_population_per_household        -0.06682306              0.023595154
##                              ocean_proximity_INLAND ocean_proximity_NEAR_BAY
## housing_median_age                     -0.219698042             0.1233761099
## total_rooms                             0.026353592             0.0003676967
## total_bedrooms                         -0.013435002            -0.0037619932
## population                             -0.037998355            -0.0343371915
## households                             -0.047228557             0.0056655389
## median_income                          -0.228050323             0.0678676612
## median_house_value                     -0.503016774             0.1148195192
## ocean_proximity_1H_OCEAN               -0.655412478            -0.2675184160
## ocean_proximity_INLAND                  1.000000000            -0.2150158524
## ocean_proximity_NEAR_BAY               -0.215015852             1.0000000000
## ocean_proximity_NEAR_OCEAN             -0.274334523            -0.1119745800
## rooms_per_household                     0.180850360            -0.0201352195
## bedrooms_per_room                      -0.142227880            -0.0210359696
## population_per_household                0.011334223            -0.0120023879
## log_total_rooms                         0.014764616             0.0018609286
## log_total_bedrooms                     -0.032381997            -0.0046413556
## log_population                         -0.058401005            -0.0390712783
## log_households                         -0.073495620             0.0057757582
## log_median_income                      -0.232327460             0.0633516305
## log_rooms_per_household                 0.223796236            -0.0179560078
## log_bedrooms_per_room                  -0.123799997            -0.0226607031
## log_population_per_household            0.009430231            -0.0362119402
##                              ocean_proximity_NEAR_OCEAN rooms_per_household
## housing_median_age                          0.037242109        -0.178761998
## total_rooms                                -0.013474260         0.139724514
## total_bedrooms                              0.003929836         0.013422080
## population                                 -0.021406165        -0.074456368
## households                                  0.006090457        -0.078929107
## median_income                              -0.005749940         0.311129787
## median_house_value                          0.145001289         0.113635921
## ocean_proximity_1H_OCEAN                   -0.341321517        -0.126176671
## ocean_proximity_INLAND                     -0.274334523         0.180850360
## ocean_proximity_NEAR_BAY                   -0.111974580        -0.020135220
## ocean_proximity_NEAR_OCEAN                  1.000000000        -0.053486583
## rooms_per_household                        -0.053486583         1.000000000
## bedrooms_per_room                           0.064356659        -0.415270347
## population_per_household                   -0.003237739        -0.007216777
## log_total_rooms                            -0.005971760         0.151481550
## log_total_bedrooms                          0.018127233         0.003540664
## log_population                             -0.019922289        -0.109742072
## log_households                              0.020386282        -0.118986508
## log_median_income                           0.003446060         0.295219340
## log_rooms_per_household                    -0.067851181         0.919151674
## log_bedrooms_per_room                       0.068966766        -0.436854929
## log_population_per_household               -0.019142021        -0.018714076
##                              bedrooms_per_room population_per_household
## housing_median_age                 0.144261656              0.021553493
## total_rooms                       -0.187540482             -0.024152590
## total_bedrooms                     0.076759152             -0.027629035
## population                         0.034099333              0.065328181
## households                         0.058160385             -0.026534027
## median_income                     -0.634558982              0.025866833
## median_house_value                -0.233913588             -0.019910059
## ocean_proximity_1H_OCEAN           0.104289188             -0.002092085
## ocean_proximity_INLAND            -0.142227880              0.011334223
## ocean_proximity_NEAR_BAY          -0.021035970             -0.012002388
## ocean_proximity_NEAR_OCEAN         0.064356659             -0.003237739
## rooms_per_household               -0.415270347             -0.007216777
## bedrooms_per_room                  1.000000000              0.004360359
## population_per_household           0.004360359              1.000000000
## log_total_rooms                   -0.222571415             -0.037566024
## log_total_bedrooms                 0.090352054             -0.040941451
## log_population                     0.046275437              0.058523351
## log_households                     0.064692998             -0.039394648
## log_median_income                 -0.648918905              0.014587401
## log_rooms_per_household           -0.634509189             -0.009666021
## log_bedrooms_per_room              0.966803048              0.003411033
## log_population_per_household       0.006328297              0.934938587
##                              log_total_rooms log_total_bedrooms log_population
## housing_median_age              -0.393470487       -0.335801084    -0.28268819
## total_rooms                      0.941930863        0.870865162     0.82543590
## total_bedrooms                   0.900431077        0.946099399     0.86186227
## population                       0.813233625        0.830738742     0.95460000
## households                       0.892859889        0.930785486     0.89451484
## median_income                    0.250995266        0.010404776     0.03622674
## median_house_value               0.183866169        0.090916985     0.02471486
## ocean_proximity_1H_OCEAN        -0.010962608        0.021539760     0.09067065
## ocean_proximity_INLAND           0.014764616       -0.032381997    -0.05840100
## ocean_proximity_NEAR_BAY         0.001860929       -0.004641356    -0.03907128
## ocean_proximity_NEAR_OCEAN      -0.005971760        0.018127233    -0.01992229
## rooms_per_household              0.151481550        0.003540664    -0.10974207
## bedrooms_per_room               -0.222571415        0.090352054     0.04627544
## population_per_household        -0.037566024       -0.040941451     0.05852335
## log_total_rooms                  1.000000000        0.937263502     0.85690868
## log_total_bedrooms               0.937263502        1.000000000     0.88865109
## log_population                   0.856908682        0.888651089     1.00000000
## log_households                   0.925040691        0.977514499     0.91828515
## log_median_income                0.270005509        0.038784076     0.05517852
## log_rooms_per_household          0.216975238       -0.005331079    -0.11229390
## log_bedrooms_per_room           -0.231886414        0.097919865     0.04262742
## log_population_per_household    -0.076519419       -0.084229164     0.10149631
##                              log_households log_median_income
## housing_median_age             -0.306969839       -0.18403576
## total_rooms                     0.856550047        0.23228282
## total_bedrooms                  0.922279441        0.04232695
## population                      0.854522659        0.05389188
## households                      0.946890218        0.06747236
## median_income                   0.042210551        0.95844783
## median_house_value              0.114323955        0.62987265
## ocean_proximity_1H_OCEAN        0.053704989        0.18523829
## ocean_proximity_INLAND         -0.073495620       -0.23232746
## ocean_proximity_NEAR_BAY        0.005775758        0.06335163
## ocean_proximity_NEAR_OCEAN      0.020386282        0.00344606
## rooms_per_household            -0.118986508        0.29521934
## bedrooms_per_room               0.064692998       -0.64891891
## population_per_household       -0.039394648        0.01458740
## log_total_rooms                 0.925040691        0.27000551
## log_total_bedrooms              0.977514499        0.03878408
## log_population                  0.918285150        0.05517852
## log_households                  1.000000000        0.07118328
## log_median_income               0.071183285        1.00000000
## log_rooms_per_household        -0.121511336        0.44635698
## log_bedrooms_per_room           0.068143022       -0.71121101
## log_population_per_household   -0.077586345       -0.00455519
##                              log_rooms_per_household log_bedrooms_per_room
## housing_median_age                      -0.236221982           0.176864653
## total_rooms                              0.197946672          -0.200555658
## total_bedrooms                           0.008893265           0.078948657
## population                              -0.075341266           0.027712883
## households                              -0.082167389           0.057217926
## median_income                            0.465248251          -0.723887316
## median_house_value                       0.172347704          -0.304812575
## ocean_proximity_1H_OCEAN                -0.158836967           0.084508064
## ocean_proximity_INLAND                   0.223796236          -0.123799997
## ocean_proximity_NEAR_BAY                -0.017956008          -0.022660703
## ocean_proximity_NEAR_OCEAN              -0.067851181           0.068966766
## rooms_per_household                      0.919151674          -0.436854929
## bedrooms_per_room                       -0.634509189           0.966803048
## population_per_household                -0.009666021           0.003411033
## log_total_rooms                          0.216975238          -0.231886414
## log_total_bedrooms                      -0.005331079           0.097919865
## log_population                          -0.112293897           0.042627419
## log_households                          -0.121511336           0.068143022
## log_median_income                        0.446356981          -0.711211014
## log_rooms_per_household                  1.000000000          -0.660985996
## log_bedrooms_per_room                   -0.660985996           1.000000000
## log_population_per_household            -0.021890595           0.001529054
##                              log_population_per_household
## housing_median_age                            0.034839036
## total_rooms                                  -0.052227877
## total_bedrooms                               -0.063176207
## population                                    0.104321849
## households                                   -0.057916990
## median_income                                 0.007957251
## median_house_value                           -0.066823064
## ocean_proximity_1H_OCEAN                      0.023595154
## ocean_proximity_INLAND                        0.009430231
## ocean_proximity_NEAR_BAY                     -0.036211940
## ocean_proximity_NEAR_OCEAN                   -0.019142021
## rooms_per_household                          -0.018714076
## bedrooms_per_room                             0.006328297
## population_per_household                      0.934938587
## log_total_rooms                              -0.076519419
## log_total_bedrooms                           -0.084229164
## log_population                                0.101496308
## log_households                               -0.077586345
## log_median_income                            -0.004555190
## log_rooms_per_household                      -0.021890595
## log_bedrooms_per_room                         0.001529054
## log_population_per_household                  1.000000000
corrmatrix = cor(sdf2)
t(corrmatrix)
##                              housing_median_age   total_rooms total_bedrooms
## housing_median_age                   1.00000000 -0.3747630026   -0.331309723
## total_rooms                         -0.37476300  1.0000000000    0.935422796
## total_bedrooms                      -0.33130972  0.9354227956    1.000000000
## population                          -0.27718377  0.8602300574    0.880212004
## households                          -0.31045185  0.9224226021    0.978746198
## median_income                       -0.19793210  0.2227087874    0.020802496
## ocean_proximity_1H_OCEAN             0.11762475 -0.0162807691    0.012381030
## ocean_proximity_INLAND              -0.21969804  0.0263535919   -0.013435002
## ocean_proximity_NEAR_BAY             0.12337611  0.0003676967   -0.003761993
## ocean_proximity_NEAR_OCEAN           0.03724211 -0.0134742604    0.003929836
## rooms_per_household                 -0.17876200  0.1397245138    0.013422080
## bedrooms_per_room                    0.14426166 -0.1875404824    0.076759152
## population_per_household             0.02155349 -0.0241525895   -0.027629035
## log_total_rooms                     -0.39347049  0.9419308631    0.900431077
## log_total_bedrooms                  -0.33580108  0.8708651621    0.946099399
## log_population                      -0.28268819  0.8254358961    0.861862266
## log_households                      -0.30696984  0.8565500468    0.922279441
## log_median_income                   -0.18403576  0.2322828171    0.042326951
## log_rooms_per_household             -0.23622198  0.1979466718    0.008893265
## log_median_house_value              -0.23622198  0.1979466718    0.008893265
## log_bedrooms_per_room                0.17686465 -0.2005556579    0.078948657
## log_population_per_household         0.03483904 -0.0522278774   -0.063176207
##                               population   households median_income
## housing_median_age           -0.27718377 -0.310451853  -0.197932095
## total_rooms                   0.86023006  0.922422602   0.222708787
## total_bedrooms                0.88021200  0.978746198   0.020802496
## population                    1.00000000  0.910682455   0.040159411
## households                    0.91068245  1.000000000   0.045468585
## median_income                 0.04015941  0.045468585   1.000000000
## ocean_proximity_1H_OCEAN      0.06951887  0.038132019   0.184800134
## ocean_proximity_INLAND       -0.03799835 -0.047228557  -0.228050323
## ocean_proximity_NEAR_BAY     -0.03433719  0.005665539   0.067867661
## ocean_proximity_NEAR_OCEAN   -0.02140617  0.006090457  -0.005749940
## rooms_per_household          -0.07445637 -0.078929107   0.311129787
## bedrooms_per_room             0.03409933  0.058160385  -0.634558982
## population_per_household      0.06532818 -0.026534027   0.025866833
## log_total_rooms               0.81323363  0.892859889   0.250995266
## log_total_bedrooms            0.83073874  0.930785486   0.010404776
## log_population                0.95460000  0.894514840   0.036226741
## log_households                0.85452266  0.946890218   0.042210551
## log_median_income             0.05389188  0.067472361   0.958447829
## log_rooms_per_household      -0.07534127 -0.082167389   0.465248251
## log_median_house_value       -0.07534127 -0.082167389   0.465248251
## log_bedrooms_per_room         0.02771288  0.057217926  -0.723887316
## log_population_per_household  0.10432185 -0.057916990   0.007957251
##                              ocean_proximity_1H_OCEAN ocean_proximity_INLAND
## housing_median_age                        0.117624750           -0.219698042
## total_rooms                              -0.016280769            0.026353592
## total_bedrooms                            0.012381030           -0.013435002
## population                                0.069518865           -0.037998355
## households                                0.038132019           -0.047228557
## median_income                             0.184800134           -0.228050323
## ocean_proximity_1H_OCEAN                  1.000000000           -0.655412478
## ocean_proximity_INLAND                   -0.655412478            1.000000000
## ocean_proximity_NEAR_BAY                 -0.267518416           -0.215015852
## ocean_proximity_NEAR_OCEAN               -0.341321517           -0.274334523
## rooms_per_household                      -0.126176671            0.180850360
## bedrooms_per_room                         0.104289188           -0.142227880
## population_per_household                 -0.002092085            0.011334223
## log_total_rooms                          -0.010962608            0.014764616
## log_total_bedrooms                        0.021539760           -0.032381997
## log_population                            0.090670654           -0.058401005
## log_households                            0.053704989           -0.073495620
## log_median_income                         0.185238287           -0.232327460
## log_rooms_per_household                  -0.158836967            0.223796236
## log_median_house_value                   -0.158836967            0.223796236
## log_bedrooms_per_room                     0.084508064           -0.123799997
## log_population_per_household              0.023595154            0.009430231
##                              ocean_proximity_NEAR_BAY
## housing_median_age                       0.1233761099
## total_rooms                              0.0003676967
## total_bedrooms                          -0.0037619932
## population                              -0.0343371915
## households                               0.0056655389
## median_income                            0.0678676612
## ocean_proximity_1H_OCEAN                -0.2675184160
## ocean_proximity_INLAND                  -0.2150158524
## ocean_proximity_NEAR_BAY                 1.0000000000
## ocean_proximity_NEAR_OCEAN              -0.1119745800
## rooms_per_household                     -0.0201352195
## bedrooms_per_room                       -0.0210359696
## population_per_household                -0.0120023879
## log_total_rooms                          0.0018609286
## log_total_bedrooms                      -0.0046413556
## log_population                          -0.0390712783
## log_households                           0.0057757582
## log_median_income                        0.0633516305
## log_rooms_per_household                 -0.0179560078
## log_median_house_value                  -0.0179560078
## log_bedrooms_per_room                   -0.0226607031
## log_population_per_household            -0.0362119402
##                              ocean_proximity_NEAR_OCEAN rooms_per_household
## housing_median_age                          0.037242109        -0.178761998
## total_rooms                                -0.013474260         0.139724514
## total_bedrooms                              0.003929836         0.013422080
## population                                 -0.021406165        -0.074456368
## households                                  0.006090457        -0.078929107
## median_income                              -0.005749940         0.311129787
## ocean_proximity_1H_OCEAN                   -0.341321517        -0.126176671
## ocean_proximity_INLAND                     -0.274334523         0.180850360
## ocean_proximity_NEAR_BAY                   -0.111974580        -0.020135220
## ocean_proximity_NEAR_OCEAN                  1.000000000        -0.053486583
## rooms_per_household                        -0.053486583         1.000000000
## bedrooms_per_room                           0.064356659        -0.415270347
## population_per_household                   -0.003237739        -0.007216777
## log_total_rooms                            -0.005971760         0.151481550
## log_total_bedrooms                          0.018127233         0.003540664
## log_population                             -0.019922289        -0.109742072
## log_households                              0.020386282        -0.118986508
## log_median_income                           0.003446060         0.295219340
## log_rooms_per_household                    -0.067851181         0.919151674
## log_median_house_value                     -0.067851181         0.919151674
## log_bedrooms_per_room                       0.068966766        -0.436854929
## log_population_per_household               -0.019142021        -0.018714076
##                              bedrooms_per_room population_per_household
## housing_median_age                 0.144261656              0.021553493
## total_rooms                       -0.187540482             -0.024152590
## total_bedrooms                     0.076759152             -0.027629035
## population                         0.034099333              0.065328181
## households                         0.058160385             -0.026534027
## median_income                     -0.634558982              0.025866833
## ocean_proximity_1H_OCEAN           0.104289188             -0.002092085
## ocean_proximity_INLAND            -0.142227880              0.011334223
## ocean_proximity_NEAR_BAY          -0.021035970             -0.012002388
## ocean_proximity_NEAR_OCEAN         0.064356659             -0.003237739
## rooms_per_household               -0.415270347             -0.007216777
## bedrooms_per_room                  1.000000000              0.004360359
## population_per_household           0.004360359              1.000000000
## log_total_rooms                   -0.222571415             -0.037566024
## log_total_bedrooms                 0.090352054             -0.040941451
## log_population                     0.046275437              0.058523351
## log_households                     0.064692998             -0.039394648
## log_median_income                 -0.648918905              0.014587401
## log_rooms_per_household           -0.634509189             -0.009666021
## log_median_house_value            -0.634509189             -0.009666021
## log_bedrooms_per_room              0.966803048              0.003411033
## log_population_per_household       0.006328297              0.934938587
##                              log_total_rooms log_total_bedrooms log_population
## housing_median_age              -0.393470487       -0.335801084    -0.28268819
## total_rooms                      0.941930863        0.870865162     0.82543590
## total_bedrooms                   0.900431077        0.946099399     0.86186227
## population                       0.813233625        0.830738742     0.95460000
## households                       0.892859889        0.930785486     0.89451484
## median_income                    0.250995266        0.010404776     0.03622674
## ocean_proximity_1H_OCEAN        -0.010962608        0.021539760     0.09067065
## ocean_proximity_INLAND           0.014764616       -0.032381997    -0.05840100
## ocean_proximity_NEAR_BAY         0.001860929       -0.004641356    -0.03907128
## ocean_proximity_NEAR_OCEAN      -0.005971760        0.018127233    -0.01992229
## rooms_per_household              0.151481550        0.003540664    -0.10974207
## bedrooms_per_room               -0.222571415        0.090352054     0.04627544
## population_per_household        -0.037566024       -0.040941451     0.05852335
## log_total_rooms                  1.000000000        0.937263502     0.85690868
## log_total_bedrooms               0.937263502        1.000000000     0.88865109
## log_population                   0.856908682        0.888651089     1.00000000
## log_households                   0.925040691        0.977514499     0.91828515
## log_median_income                0.270005509        0.038784076     0.05517852
## log_rooms_per_household          0.216975238       -0.005331079    -0.11229390
## log_median_house_value           0.216975238       -0.005331079    -0.11229390
## log_bedrooms_per_room           -0.231886414        0.097919865     0.04262742
## log_population_per_household    -0.076519419       -0.084229164     0.10149631
##                              log_households log_median_income
## housing_median_age             -0.306969839       -0.18403576
## total_rooms                     0.856550047        0.23228282
## total_bedrooms                  0.922279441        0.04232695
## population                      0.854522659        0.05389188
## households                      0.946890218        0.06747236
## median_income                   0.042210551        0.95844783
## ocean_proximity_1H_OCEAN        0.053704989        0.18523829
## ocean_proximity_INLAND         -0.073495620       -0.23232746
## ocean_proximity_NEAR_BAY        0.005775758        0.06335163
## ocean_proximity_NEAR_OCEAN      0.020386282        0.00344606
## rooms_per_household            -0.118986508        0.29521934
## bedrooms_per_room               0.064692998       -0.64891891
## population_per_household       -0.039394648        0.01458740
## log_total_rooms                 0.925040691        0.27000551
## log_total_bedrooms              0.977514499        0.03878408
## log_population                  0.918285150        0.05517852
## log_households                  1.000000000        0.07118328
## log_median_income               0.071183285        1.00000000
## log_rooms_per_household        -0.121511336        0.44635698
## log_median_house_value         -0.121511336        0.44635698
## log_bedrooms_per_room           0.068143022       -0.71121101
## log_population_per_household   -0.077586345       -0.00455519
##                              log_rooms_per_household log_median_house_value
## housing_median_age                      -0.236221982           -0.236221982
## total_rooms                              0.197946672            0.197946672
## total_bedrooms                           0.008893265            0.008893265
## population                              -0.075341266           -0.075341266
## households                              -0.082167389           -0.082167389
## median_income                            0.465248251            0.465248251
## ocean_proximity_1H_OCEAN                -0.158836967           -0.158836967
## ocean_proximity_INLAND                   0.223796236            0.223796236
## ocean_proximity_NEAR_BAY                -0.017956008           -0.017956008
## ocean_proximity_NEAR_OCEAN              -0.067851181           -0.067851181
## rooms_per_household                      0.919151674            0.919151674
## bedrooms_per_room                       -0.634509189           -0.634509189
## population_per_household                -0.009666021           -0.009666021
## log_total_rooms                          0.216975238            0.216975238
## log_total_bedrooms                      -0.005331079           -0.005331079
## log_population                          -0.112293897           -0.112293897
## log_households                          -0.121511336           -0.121511336
## log_median_income                        0.446356981            0.446356981
## log_rooms_per_household                  1.000000000            1.000000000
## log_median_house_value                   1.000000000            1.000000000
## log_bedrooms_per_room                   -0.660985996           -0.660985996
## log_population_per_household            -0.021890595           -0.021890595
##                              log_bedrooms_per_room log_population_per_household
## housing_median_age                     0.176864653                  0.034839036
## total_rooms                           -0.200555658                 -0.052227877
## total_bedrooms                         0.078948657                 -0.063176207
## population                             0.027712883                  0.104321849
## households                             0.057217926                 -0.057916990
## median_income                         -0.723887316                  0.007957251
## ocean_proximity_1H_OCEAN               0.084508064                  0.023595154
## ocean_proximity_INLAND                -0.123799997                  0.009430231
## ocean_proximity_NEAR_BAY              -0.022660703                 -0.036211940
## ocean_proximity_NEAR_OCEAN             0.068966766                 -0.019142021
## rooms_per_household                   -0.436854929                 -0.018714076
## bedrooms_per_room                      0.966803048                  0.006328297
## population_per_household               0.003411033                  0.934938587
## log_total_rooms                       -0.231886414                 -0.076519419
## log_total_bedrooms                     0.097919865                 -0.084229164
## log_population                         0.042627419                  0.101496308
## log_households                         0.068143022                 -0.077586345
## log_median_income                     -0.711211014                 -0.004555190
## log_rooms_per_household               -0.660985996                 -0.021890595
## log_median_house_value                -0.660985996                 -0.021890595
## log_bedrooms_per_room                  1.000000000                  0.001529054
## log_population_per_household           0.001529054                  1.000000000

Regresión Lineal Múltiple

Primer modelo: Ajustamos todas las variables ‘independientes’

m0<- lm(df$median_house_value ~ . , df)
m1 <- lm(df$median_house_value ~ (.)^2 ,df)
m2<- lm(cdf$median_house_value ~ . , cdf)
m3 <- lm(cdf2$log_median_house_value ~ . , cdf2)
m4<- lm(sdf$median_house_value ~ . , sdf)
m5 <- lm(sdf2$log_median_house_value ~ . , sdf2)

Observamos el resumen de los modelos

summary(m0)
## 
## Call:
## lm(formula = df$median_house_value ~ ., data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -556980  -42683  -10497   28765  779052 
## 
## Coefficients: (1 not defined because of singularities)
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -2.266e+06  8.837e+04 -25.640  < 2e-16 ***
## longitude                  -2.681e+04  1.020e+03 -26.296  < 2e-16 ***
## latitude                   -2.548e+04  1.005e+03 -25.363  < 2e-16 ***
## housing_median_age          1.073e+03  4.389e+01  24.439  < 2e-16 ***
## total_rooms                -6.193e+00  7.915e-01  -7.825 5.32e-15 ***
## total_bedrooms              1.006e+02  6.869e+00  14.640  < 2e-16 ***
## population                 -3.797e+01  1.076e+00 -35.282  < 2e-16 ***
## households                  4.962e+01  7.451e+00   6.659 2.83e-11 ***
## median_income               3.926e+04  3.380e+02 116.151  < 2e-16 ***
## ocean_proximity_1H_OCEAN   -4.278e+03  1.570e+03  -2.726 0.006421 ** 
## ocean_proximity_INLAND     -4.356e+04  2.250e+03 -19.363  < 2e-16 ***
## ocean_proximity_ISLAND      1.486e+05  3.075e+04   4.833 1.36e-06 ***
## ocean_proximity_NEAR_BAY   -8.232e+03  2.176e+03  -3.784 0.000155 ***
## ocean_proximity_NEAR_OCEAN         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 68660 on 20420 degrees of freedom
## Multiple R-squared:  0.6465, Adjusted R-squared:  0.6463 
## F-statistic:  3112 on 12 and 20420 DF,  p-value: < 2.2e-16
summary(m1)
## 
## Call:
## lm(formula = df$median_house_value ~ (.)^2, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -481870  -36549   -8853   25588  611554 
## 
## Coefficients: (23 not defined because of singularities)
##                                                       Estimate Std. Error
## (Intercept)                                         -7.454e+06  8.751e+05
## longitude                                           -5.244e+04  7.594e+03
## latitude                                             2.932e+05  2.875e+04
## housing_median_age                                  -7.782e+04  7.771e+03
## total_rooms                                          9.507e+02  1.775e+02
## total_bedrooms                                      -3.919e+03  1.113e+03
## population                                           3.584e+02  1.768e+02
## households                                          -1.856e+03  1.067e+03
## median_income                                       -7.990e+05  6.753e+04
## ocean_proximity_1H_OCEAN                             6.833e+05  3.123e+05
## ocean_proximity_INLAND                               1.228e+06  2.955e+05
## ocean_proximity_ISLAND                               7.949e+08  3.272e+08
## ocean_proximity_NEAR_BAY                            -2.363e+07  1.268e+06
## ocean_proximity_NEAR_OCEAN                                  NA         NA
## longitude:latitude                                   2.149e+03  2.177e+02
## longitude:housing_median_age                        -9.823e+02  8.939e+01
## longitude:total_rooms                                1.208e+01  2.089e+00
## longitude:total_bedrooms                            -5.506e+01  1.292e+01
## longitude:population                                 5.376e+00  2.081e+00
## longitude:households                                -2.007e+01  1.192e+01
## longitude:median_income                             -1.022e+04  7.959e+02
## longitude:ocean_proximity_1H_OCEAN                   5.504e+03  3.725e+03
## longitude:ocean_proximity_INLAND                     1.462e+04  3.437e+03
## longitude:ocean_proximity_ISLAND                     1.448e+07  6.581e+06
## longitude:ocean_proximity_NEAR_BAY                  -2.338e+05  1.130e+04
## longitude:ocean_proximity_NEAR_OCEAN                        NA         NA
## latitude:housing_median_age                         -1.125e+03  8.773e+01
## latitude:total_rooms                                 1.387e+01  2.146e+00
## latitude:total_bedrooms                             -7.087e+01  1.312e+01
## latitude:population                                  7.881e+00  2.212e+00
## latitude:households                                 -2.126e+01  1.117e+01
## latitude:median_income                              -1.116e+04  8.235e+02
## latitude:ocean_proximity_1H_OCEAN                   -2.669e+02  3.887e+03
## latitude:ocean_proximity_INLAND                      1.178e+04  3.419e+03
## latitude:ocean_proximity_ISLAND                      2.755e+07  1.368e+07
## latitude:ocean_proximity_NEAR_BAY                   -1.310e+05  8.775e+03
## latitude:ocean_proximity_NEAR_OCEAN                         NA         NA
## housing_median_age:total_rooms                      -3.606e-01  7.281e-02
## housing_median_age:total_bedrooms                    2.030e+00  7.375e-01
## housing_median_age:population                       -1.483e+00  9.685e-02
## housing_median_age:households                        5.172e+00  8.225e-01
## housing_median_age:median_income                     1.336e+02  2.467e+01
## housing_median_age:ocean_proximity_1H_OCEAN         -2.128e+02  1.340e+02
## housing_median_age:ocean_proximity_INLAND            7.225e+02  1.883e+02
## housing_median_age:ocean_proximity_ISLAND            1.094e+04  8.196e+03
## housing_median_age:ocean_proximity_NEAR_BAY         -3.630e+02  1.817e+02
## housing_median_age:ocean_proximity_NEAR_OCEAN               NA         NA
## total_rooms:total_bedrooms                          -1.748e-03  1.575e-03
## total_rooms:population                              -4.932e-03  6.780e-04
## total_rooms:households                               1.414e-02  2.379e-03
## total_rooms:median_income                            1.300e+00  3.065e-01
## total_rooms:ocean_proximity_1H_OCEAN                -3.189e+00  2.527e+00
## total_rooms:ocean_proximity_INLAND                  -1.975e+01  4.145e+00
## total_rooms:ocean_proximity_ISLAND                   5.826e+01  9.104e+01
## total_rooms:ocean_proximity_NEAR_BAY                 8.586e+00  3.937e+00
## total_rooms:ocean_proximity_NEAR_OCEAN                      NA         NA
## total_bedrooms:population                            2.138e-02  5.946e-03
## total_bedrooms:households                           -9.208e-02  8.707e-03
## total_bedrooms:median_income                         5.287e+00  4.013e+00
## total_bedrooms:ocean_proximity_1H_OCEAN             -2.270e+01  2.911e+01
## total_bedrooms:ocean_proximity_INLAND               -7.214e+00  3.510e+01
## total_bedrooms:ocean_proximity_ISLAND                       NA         NA
## total_bedrooms:ocean_proximity_NEAR_BAY             -7.650e+01  4.786e+01
## total_bedrooms:ocean_proximity_NEAR_OCEAN                   NA         NA
## population:households                                1.521e-02  5.303e-03
## population:median_income                            -2.170e+00  7.432e-01
## population:ocean_proximity_1H_OCEAN                 -1.754e+01  3.360e+00
## population:ocean_proximity_INLAND                    9.572e+00  4.998e+00
## population:ocean_proximity_ISLAND                           NA         NA
## population:ocean_proximity_NEAR_BAY                 -2.589e+01  5.796e+00
## population:ocean_proximity_NEAR_OCEAN                       NA         NA
## households:median_income                             8.180e+00  4.786e+00
## households:ocean_proximity_1H_OCEAN                  8.244e+01  3.265e+01
## households:ocean_proximity_INLAND                    9.368e+01  3.654e+01
## households:ocean_proximity_ISLAND                           NA         NA
## households:ocean_proximity_NEAR_BAY                  9.399e+01  5.387e+01
## households:ocean_proximity_NEAR_OCEAN                       NA         NA
## median_income:ocean_proximity_1H_OCEAN              -2.129e+03  9.682e+02
## median_income:ocean_proximity_INLAND                 8.789e+03  1.584e+03
## median_income:ocean_proximity_ISLAND                        NA         NA
## median_income:ocean_proximity_NEAR_BAY              -1.353e+03  1.340e+03
## median_income:ocean_proximity_NEAR_OCEAN                    NA         NA
## ocean_proximity_1H_OCEAN:ocean_proximity_INLAND             NA         NA
## ocean_proximity_1H_OCEAN:ocean_proximity_ISLAND             NA         NA
## ocean_proximity_1H_OCEAN:ocean_proximity_NEAR_BAY           NA         NA
## ocean_proximity_1H_OCEAN:ocean_proximity_NEAR_OCEAN         NA         NA
## ocean_proximity_INLAND:ocean_proximity_ISLAND               NA         NA
## ocean_proximity_INLAND:ocean_proximity_NEAR_BAY             NA         NA
## ocean_proximity_INLAND:ocean_proximity_NEAR_OCEAN           NA         NA
## ocean_proximity_ISLAND:ocean_proximity_NEAR_BAY             NA         NA
## ocean_proximity_ISLAND:ocean_proximity_NEAR_OCEAN           NA         NA
## ocean_proximity_NEAR_BAY:ocean_proximity_NEAR_OCEAN         NA         NA
##                                                     t value Pr(>|t|)    
## (Intercept)                                          -8.518  < 2e-16 ***
## longitude                                            -6.906 5.13e-12 ***
## latitude                                             10.198  < 2e-16 ***
## housing_median_age                                  -10.015  < 2e-16 ***
## total_rooms                                           5.355 8.63e-08 ***
## total_bedrooms                                       -3.522 0.000429 ***
## population                                            2.027 0.042646 *  
## households                                           -1.740 0.081887 .  
## median_income                                       -11.832  < 2e-16 ***
## ocean_proximity_1H_OCEAN                              2.188 0.028690 *  
## ocean_proximity_INLAND                                4.155 3.26e-05 ***
## ocean_proximity_ISLAND                                2.429 0.015141 *  
## ocean_proximity_NEAR_BAY                            -18.629  < 2e-16 ***
## ocean_proximity_NEAR_OCEAN                               NA       NA    
## longitude:latitude                                    9.873  < 2e-16 ***
## longitude:housing_median_age                        -10.989  < 2e-16 ***
## longitude:total_rooms                                 5.780 7.59e-09 ***
## longitude:total_bedrooms                             -4.263 2.03e-05 ***
## longitude:population                                  2.583 0.009793 ** 
## longitude:households                                 -1.684 0.092144 .  
## longitude:median_income                             -12.846  < 2e-16 ***
## longitude:ocean_proximity_1H_OCEAN                    1.478 0.139528    
## longitude:ocean_proximity_INLAND                      4.254 2.11e-05 ***
## longitude:ocean_proximity_ISLAND                      2.201 0.027755 *  
## longitude:ocean_proximity_NEAR_BAY                  -20.685  < 2e-16 ***
## longitude:ocean_proximity_NEAR_OCEAN                     NA       NA    
## latitude:housing_median_age                         -12.820  < 2e-16 ***
## latitude:total_rooms                                  6.461 1.06e-10 ***
## latitude:total_bedrooms                              -5.403 6.63e-08 ***
## latitude:population                                   3.564 0.000367 ***
## latitude:households                                  -1.904 0.056912 .  
## latitude:median_income                              -13.548  < 2e-16 ***
## latitude:ocean_proximity_1H_OCEAN                    -0.069 0.945267    
## latitude:ocean_proximity_INLAND                       3.445 0.000572 ***
## latitude:ocean_proximity_ISLAND                       2.014 0.044046 *  
## latitude:ocean_proximity_NEAR_BAY                   -14.926  < 2e-16 ***
## latitude:ocean_proximity_NEAR_OCEAN                      NA       NA    
## housing_median_age:total_rooms                       -4.952 7.41e-07 ***
## housing_median_age:total_bedrooms                     2.752 0.005925 ** 
## housing_median_age:population                       -15.317  < 2e-16 ***
## housing_median_age:households                         6.289 3.27e-10 ***
## housing_median_age:median_income                      5.415 6.21e-08 ***
## housing_median_age:ocean_proximity_1H_OCEAN          -1.588 0.112312    
## housing_median_age:ocean_proximity_INLAND             3.837 0.000125 ***
## housing_median_age:ocean_proximity_ISLAND             1.335 0.182033    
## housing_median_age:ocean_proximity_NEAR_BAY          -1.998 0.045709 *  
## housing_median_age:ocean_proximity_NEAR_OCEAN            NA       NA    
## total_rooms:total_bedrooms                           -1.110 0.267014    
## total_rooms:population                               -7.273 3.64e-13 ***
## total_rooms:households                                5.943 2.85e-09 ***
## total_rooms:median_income                             4.242 2.22e-05 ***
## total_rooms:ocean_proximity_1H_OCEAN                 -1.262 0.206949    
## total_rooms:ocean_proximity_INLAND                   -4.766 1.89e-06 ***
## total_rooms:ocean_proximity_ISLAND                    0.640 0.522242    
## total_rooms:ocean_proximity_NEAR_BAY                  2.181 0.029208 *  
## total_rooms:ocean_proximity_NEAR_OCEAN                   NA       NA    
## total_bedrooms:population                             3.595 0.000325 ***
## total_bedrooms:households                           -10.575  < 2e-16 ***
## total_bedrooms:median_income                          1.317 0.187688    
## total_bedrooms:ocean_proximity_1H_OCEAN              -0.780 0.435500    
## total_bedrooms:ocean_proximity_INLAND                -0.206 0.837155    
## total_bedrooms:ocean_proximity_ISLAND                    NA       NA    
## total_bedrooms:ocean_proximity_NEAR_BAY              -1.598 0.109988    
## total_bedrooms:ocean_proximity_NEAR_OCEAN                NA       NA    
## population:households                                 2.868 0.004131 ** 
## population:median_income                             -2.920 0.003509 ** 
## population:ocean_proximity_1H_OCEAN                  -5.219 1.81e-07 ***
## population:ocean_proximity_INLAND                     1.915 0.055494 .  
## population:ocean_proximity_ISLAND                        NA       NA    
## population:ocean_proximity_NEAR_BAY                  -4.467 7.96e-06 ***
## population:ocean_proximity_NEAR_OCEAN                    NA       NA    
## households:median_income                              1.709 0.087484 .  
## households:ocean_proximity_1H_OCEAN                   2.525 0.011586 *  
## households:ocean_proximity_INLAND                     2.564 0.010360 *  
## households:ocean_proximity_ISLAND                        NA       NA    
## households:ocean_proximity_NEAR_BAY                   1.745 0.081009 .  
## households:ocean_proximity_NEAR_OCEAN                    NA       NA    
## median_income:ocean_proximity_1H_OCEAN               -2.199 0.027868 *  
## median_income:ocean_proximity_INLAND                  5.548 2.93e-08 ***
## median_income:ocean_proximity_ISLAND                     NA       NA    
## median_income:ocean_proximity_NEAR_BAY               -1.010 0.312660    
## median_income:ocean_proximity_NEAR_OCEAN                 NA       NA    
## ocean_proximity_1H_OCEAN:ocean_proximity_INLAND          NA       NA    
## ocean_proximity_1H_OCEAN:ocean_proximity_ISLAND          NA       NA    
## ocean_proximity_1H_OCEAN:ocean_proximity_NEAR_BAY        NA       NA    
## ocean_proximity_1H_OCEAN:ocean_proximity_NEAR_OCEAN      NA       NA    
## ocean_proximity_INLAND:ocean_proximity_ISLAND            NA       NA    
## ocean_proximity_INLAND:ocean_proximity_NEAR_BAY          NA       NA    
## ocean_proximity_INLAND:ocean_proximity_NEAR_OCEAN        NA       NA    
## ocean_proximity_ISLAND:ocean_proximity_NEAR_BAY          NA       NA    
## ocean_proximity_ISLAND:ocean_proximity_NEAR_OCEAN        NA       NA    
## ocean_proximity_NEAR_BAY:ocean_proximity_NEAR_OCEAN      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 63010 on 20364 degrees of freedom
## Multiple R-squared:  0.7031, Adjusted R-squared:  0.7021 
## F-statistic: 709.1 on 68 and 20364 DF,  p-value: < 2.2e-16
summary(m2)
## 
## Call:
## lm(formula = cdf$median_house_value ~ ., data = cdf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -434031  -34216   -6896   26017  347654 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -1.595e+06  1.090e+05 -14.629  < 2e-16 ***
## longitude                    -2.565e+04  8.681e+02 -29.549  < 2e-16 ***
## latitude                     -2.404e+04  8.607e+02 -27.926  < 2e-16 ***
## housing_median_age            8.739e+02  4.313e+01  20.264  < 2e-16 ***
## total_rooms                   1.807e-01  9.392e-01   0.192 0.847425    
## total_bedrooms               -3.042e+01  9.033e+00  -3.368 0.000759 ***
## population                    3.891e-03  1.454e+00   0.003 0.997865    
## households                    2.544e+01  9.478e+00   2.684 0.007275 ** 
## median_income                 4.898e+04  1.075e+03  45.571  < 2e-16 ***
## ocean_proximity_1H_OCEAN     -1.550e+05  3.931e+04  -3.944 8.04e-05 ***
## ocean_proximity_INLAND       -1.910e+05  3.934e+04  -4.854 1.22e-06 ***
## ocean_proximity_NEAR_BAY     -1.740e+05  3.934e+04  -4.423 9.78e-06 ***
## ocean_proximity_NEAR_OCEAN   -1.558e+05  3.931e+04  -3.963 7.43e-05 ***
## rooms_per_household          -3.102e+02  3.997e+02  -0.776 0.437738    
## bedrooms_per_room             4.705e+05  4.234e+04  11.114  < 2e-16 ***
## population_per_household      8.032e+01  6.277e+01   1.280 0.200727    
## log_total_rooms               3.764e+03  1.226e+04   0.307 0.758794    
## log_total_bedrooms            9.310e+04  1.521e+04   6.120 9.55e-10 ***
## log_population               -9.360e+04  5.238e+03 -17.870  < 2e-16 ***
## log_households                3.034e+04  8.443e+03   3.594 0.000327 ***
## log_median_income            -3.517e+04  3.587e+03  -9.804  < 2e-16 ***
## log_rooms_per_household      -7.860e+04  2.113e+04  -3.720 0.000200 ***
## log_bedrooms_per_room        -7.995e+05  7.832e+04 -10.208  < 2e-16 ***
## log_population_per_household  8.899e+04  2.224e+04   4.002 6.30e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55540 on 18047 degrees of freedom
## Multiple R-squared:  0.6604, Adjusted R-squared:   0.66 
## F-statistic:  1526 on 23 and 18047 DF,  p-value: < 2.2e-16
summary(m3)
## 
## Call:
## lm(formula = cdf2$log_median_house_value ~ ., data = cdf2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.38989 -0.17200 -0.00884  0.16710  1.82186 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   7.647e-01  5.709e-01   1.340 0.180402    
## longitude                    -1.501e-01  4.546e-03 -33.027  < 2e-16 ***
## latitude                     -1.451e-01  4.507e-03 -32.205  < 2e-16 ***
## housing_median_age            2.618e-03  2.258e-04  11.596  < 2e-16 ***
## total_rooms                   1.879e-07  4.918e-06   0.038 0.969516    
## total_bedrooms               -7.879e-05  4.730e-05  -1.666 0.095754 .  
## population                    2.374e-07  7.615e-06   0.031 0.975132    
## households                    4.427e-05  4.963e-05   0.892 0.372432    
## median_income                 1.034e-01  5.628e-03  18.366  < 2e-16 ***
## ocean_proximity_1H_OCEAN     -6.002e-01  2.058e-01  -2.916 0.003545 ** 
## ocean_proximity_INLAND       -8.784e-01  2.060e-01  -4.264 2.02e-05 ***
## ocean_proximity_NEAR_BAY     -7.023e-01  2.060e-01  -3.410 0.000652 ***
## ocean_proximity_NEAR_OCEAN   -6.431e-01  2.058e-01  -3.124 0.001785 ** 
## rooms_per_household           4.496e-03  2.093e-03   2.148 0.031718 *  
## bedrooms_per_room             4.086e+00  2.217e-01  18.432  < 2e-16 ***
## population_per_household      1.234e-03  3.287e-04   3.754 0.000175 ***
## log_total_rooms               2.815e-01  6.418e-02   4.387 1.16e-05 ***
## log_total_bedrooms            9.910e-02  7.966e-02   1.244 0.213467    
## log_population               -3.884e-01  2.742e-02 -14.163  < 2e-16 ***
## log_households                2.042e-01  4.421e-02   4.619 3.89e-06 ***
## log_median_income             3.595e-01  1.878e-02  19.143  < 2e-16 ***
## log_rooms_per_household      -1.555e-01  1.106e-01  -1.406 0.159856    
## log_bedrooms_per_room        -4.693e+00  4.101e-01 -11.443  < 2e-16 ***
## log_population_per_household  8.890e-02  1.164e-01   0.764 0.445126    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2908 on 18047 degrees of freedom
## Multiple R-squared:  0.6959, Adjusted R-squared:  0.6955 
## F-statistic:  1796 on 23 and 18047 DF,  p-value: < 2.2e-16
summary(m4)
## 
## Call:
## lm(formula = sdf$median_house_value ~ ., data = sdf)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.1862  -1.0126  -0.2196   0.7430  11.5640 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   5.533772   1.240278   4.462 8.18e-06 ***
## housing_median_age            0.090280   0.004287  21.061  < 2e-16 ***
## total_rooms                   0.368137   0.139254   2.644  0.00821 ** 
## total_bedrooms               -0.932254   0.212730  -4.382 1.18e-05 ***
## population                    1.431798   0.164694   8.694  < 2e-16 ***
## households                    0.377677   0.212463   1.778  0.07548 .  
## median_income                 1.489768   0.032026  46.517  < 2e-16 ***
## ocean_proximity_1H_OCEAN     -5.332302   1.168395  -4.564 5.06e-06 ***
## ocean_proximity_INLAND       -7.067375   1.168498  -6.048 1.49e-09 ***
## ocean_proximity_NEAR_BAY     -5.588203   1.169057  -4.780 1.77e-06 ***
## ocean_proximity_NEAR_OCEAN   -5.065813   1.168684  -4.335 1.47e-05 ***
## rooms_per_household          -0.290859   0.180010  -1.616  0.10616    
## bedrooms_per_room             0.591574   0.065764   8.995  < 2e-16 ***
## population_per_household     -1.798775   0.339913  -5.292 1.22e-07 ***
## log_total_rooms              -2.683633   0.648571  -4.138 3.52e-05 ***
## log_total_bedrooms            4.309844   0.942340   4.574 4.83e-06 ***
## log_population               -9.460382   0.381341 -24.808  < 2e-16 ***
## log_households                3.993141   0.863544   4.624 3.79e-06 ***
## log_median_income            -1.031279   0.105985  -9.730  < 2e-16 ***
## log_rooms_per_household       1.747278   0.707550   2.469  0.01354 *  
## log_bedrooms_per_room        -0.944423   0.324317  -2.912  0.00360 ** 
## log_population_per_household  9.368323   1.472751   6.361 2.05e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.651 on 18049 degrees of freedom
## Multiple R-squared:  0.6411, Adjusted R-squared:  0.6407 
## F-statistic:  1535 on 21 and 18049 DF,  p-value: < 2.2e-16
summary(m5)
## Warning in summary.lm(m5): essentially perfect fit: summary may be unreliable
## 
## Call:
## lm(formula = sdf2$log_median_house_value ~ ., data = sdf2)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -6.007e-16 -3.000e-18  1.000e-18  2.800e-18  1.412e-14 
## 
## Coefficients:
##                                Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)                   6.433e-16  7.917e-17  8.126e+00 4.71e-16 ***
## housing_median_age           -1.102e-18  2.736e-19 -4.029e+00 5.63e-05 ***
## total_rooms                  -2.319e-16  8.889e-18 -2.609e+01  < 2e-16 ***
## total_bedrooms               -3.702e-16  1.358e-17 -2.726e+01  < 2e-16 ***
## population                    3.174e-17  1.051e-17  3.019e+00  0.00254 ** 
## households                    5.387e-16  1.356e-17  3.972e+01  < 2e-16 ***
## median_income                 4.931e-17  2.044e-18  2.412e+01  < 2e-16 ***
## ocean_proximity_1H_OCEAN      7.517e-17  7.458e-17  1.008e+00  0.31351    
## ocean_proximity_INLAND        9.439e-17  7.458e-17  1.266e+00  0.20569    
## ocean_proximity_NEAR_BAY      7.193e-17  7.462e-17  9.640e-01  0.33509    
## ocean_proximity_NEAR_OCEAN    7.274e-17  7.460e-17  9.750e-01  0.32954    
## rooms_per_household           1.277e-15  1.149e-17  1.111e+02  < 2e-16 ***
## bedrooms_per_room            -1.593e-16  4.198e-18 -3.794e+01  < 2e-16 ***
## population_per_household     -3.639e-17  2.170e-17 -1.677e+00  0.09350 .  
## log_total_rooms               1.901e-15  4.140e-17  4.593e+01  < 2e-16 ***
## log_total_bedrooms            2.441e-15  6.015e-17  4.059e+01  < 2e-16 ***
## log_population               -7.981e-18  2.434e-17 -3.280e-01  0.74301    
## log_households               -4.147e-15  5.512e-17 -7.525e+01  < 2e-16 ***
## log_median_income            -9.769e-17  6.765e-18 -1.444e+01  < 2e-16 ***
## log_rooms_per_household       1.000e+00  4.516e-17  2.214e+16  < 2e-16 ***
## log_bedrooms_per_room         1.764e-17  2.070e-17  8.520e-01  0.39423    
## log_population_per_household  2.912e-18  9.400e-17  3.100e-02  0.97529    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.054e-16 on 18049 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.005e+33 on 21 and 18049 DF,  p-value: < 2.2e-16
autoplot(m0)

autoplot(m1)
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 7 row(s) containing missing values (geom_path).

autoplot(m2)

autoplot(m3)

autoplot(m4)

autoplot(m5)

Observamos metricas de los modelos

m0_adjr2 = summary(m0)$adj.r.squared
m1_adjr2 = summary(m1)$adj.r.squared
m2_adjr2 = summary(m2)$adj.r.squared
m3_adjr2 = summary(m3)$adj.r.squared
m4_adjr2 = summary(m4)$adj.r.squared
m5_adjr2 = summary(m5)$adj.r.squared
## Warning in summary.lm(m5): essentially perfect fit: summary may be unreliable
beginning_mods_results = data.frame(
  "Total Predictors" =
    c("m0" = extractAIC(m0)[1],
      "m1" = extractAIC(m1)[1],
      "m2" = extractAIC(m2)[1],
      "m3" = extractAIC(m3)[1],
      "m4" = extractAIC(m4)[1],
      "m5" = extractAIC(m5)[1]
      
      ),
  
  "AIC" =
    c("m0" = extractAIC(m0)[2],
      "m1" = extractAIC(m1)[2],
      "m2" = extractAIC(m2)[2],
      "m3" = extractAIC(m3)[2],
      "m4" = extractAIC(m4)[2],
      "m5" = extractAIC(m5)[2]
      
      ),
  
  "Adj R-Squared" =
    c("m0" = m0_adjr2,
      "m1" = m1_adjr2,
      "m2" = m2_adjr2,
      "m3" = m3_adjr2,
      "m4" = m4_adjr2,
      "m5" = m5_adjr2
      )
  )

kable(beginning_mods_results, align = c("c", "r"))
Total.Predictors AIC Adj.R.Squared
m0 13 455132.64 0.6462561
m1 69 451679.54 0.7020749
m2 24 394869.82 0.6599973
m3 24 -44615.21 0.6955374
m4 22 18146.77 0.6406785
m5 22 -1329599.98 1.0000000

INICIO: ANÁLISIS ESPACIAL DE DATOS

ANALISIS ESPACIAL DE LOS DATOS

library(sp)
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(maptools)
## Checking rgeos availability: TRUE
## Please note that 'maptools' will be retired during 2023,
## plan transition at your earliest convenience;
## some functionality will be moved to 'sp'.
## 
## Attaching package: 'maptools'
## The following object is masked from 'package:Hmisc':
## 
##     label
latlong_df <- cdf[,c(1,2)]
str(latlong_df)
## Classes 'data.table' and 'data.frame':   18071 obs. of  2 variables:
##  $ longitude: num  -122 -122 -122 -122 -122 ...
##  $ latitude : num  37.9 37.9 37.8 37.9 37.9 ...
##  - attr(*, ".internal.selfref")=<externalptr>
counties <- maps::map('county', fill=TRUE, col="transparent", plot=FALSE)
IDs <- sapply(strsplit(counties$names, ":"), function(x) x[1])
counties_sp <- map2SpatialPolygons(counties, IDs=IDs,
                     proj4string=CRS("+proj=longlat +datum=WGS84"))

    # Convert pointsDF to a SpatialPoints object 
pointsSP <- SpatialPoints(latlong_df, 
                    proj4string=CRS("+proj=longlat +datum=WGS84"))

    # Use 'over' to get _indices_ of the Polygons object containing each point 
indices <- over(pointsSP, counties_sp)

    # Return the county names of the Polygons object containing each point
countyNames <- sapply(counties_sp@polygons, function(x) x@ID)
my_data= countyNames[indices]
my_data=sub("california,", "", my_data)

mdf=cdf
mdf$county_name=toTitleCase(my_data)
mdf$county_name=as.factor(mdf$county_name)
mdf <- na.omit(mdf)
str(mdf)
## Classes 'data.table' and 'data.frame':   16781 obs. of  25 variables:
##  $ longitude                   : num  -122 -122 -122 -122 -122 ...
##  $ latitude                    : num  37.9 37.9 37.8 37.9 37.9 ...
##  $ housing_median_age          : num  41 21 42 40 42 48 43 40 40 21 ...
##  $ total_rooms                 : num  880 7099 2555 751 1639 ...
##  $ total_bedrooms              : num  129 1106 665 184 367 ...
##  $ population                  : num  322 2401 1206 409 929 ...
##  $ households                  : num  126 1138 595 166 366 ...
##  $ median_income               : num  8.33 8.3 2.08 1.36 1.71 ...
##  $ median_house_value          : num  452600 358500 226700 147500 159800 ...
##  $ ocean_proximity_1H_OCEAN    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ocean_proximity_INLAND      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ocean_proximity_NEAR_BAY    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ ocean_proximity_NEAR_OCEAN  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rooms_per_household         : num  6.98 6.24 4.29 4.52 4.48 ...
##  $ bedrooms_per_room           : num  0.147 0.156 0.26 0.245 0.224 ...
##  $ population_per_household    : num  2.56 2.11 2.03 2.46 2.54 ...
##  $ log_total_rooms             : num  6.78 8.87 7.85 6.62 7.4 ...
##  $ log_total_bedrooms          : num  4.86 7.01 6.5 5.21 5.91 ...
##  $ log_population              : num  5.77 7.78 7.1 6.01 6.83 ...
##  $ log_households              : num  4.84 7.04 6.39 5.11 5.9 ...
##  $ log_median_income           : num  2.119 2.116 0.733 0.306 0.539 ...
##  $ log_rooms_per_household     : num  1.4 1.26 1.23 1.3 1.25 ...
##  $ log_bedrooms_per_room       : num  0.717 0.79 0.828 0.788 0.798 ...
##  $ log_population_per_household: num  1.19 1.11 1.11 1.18 1.16 ...
##  $ county_name                 : Factor w/ 60 levels "Alameda","Alpine",..: 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Segundo modelo: Ajustamos todas las variables

m6<- lm(mdf$median_house_value ~ . , mdf)
m7<- lm(mdf$median_house_value ~ . , mdf)
summary(m6)
## 
## Call:
## lm(formula = mdf$median_house_value ~ ., data = mdf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -375639  -27436   -5451   19908  341759 
## 
## Coefficients: (1 not defined because of singularities)
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -2.494e+06  2.329e+05 -10.710  < 2e-16 ***
## longitude                    -3.020e+04  1.823e+03 -16.565  < 2e-16 ***
## latitude                     -2.374e+04  2.758e+03  -8.606  < 2e-16 ***
## housing_median_age            8.873e+01  4.317e+01   2.055 0.039850 *  
## total_rooms                  -3.924e-01  8.394e-01  -0.468 0.640110    
## total_bedrooms               -1.863e+01  8.108e+00  -2.298 0.021554 *  
## population                    4.922e-01  1.306e+00   0.377 0.706374    
## households                    1.386e+01  8.524e+00   1.626 0.104060    
## median_income                 4.546e+04  1.001e+03  45.404  < 2e-16 ***
## ocean_proximity_1H_OCEAN     -1.357e+03  1.874e+03  -0.724 0.468904    
## ocean_proximity_INLAND       -7.554e+03  2.803e+03  -2.695 0.007038 ** 
## ocean_proximity_NEAR_BAY      5.209e+02  2.889e+03   0.180 0.856941    
## ocean_proximity_NEAR_OCEAN           NA         NA      NA       NA    
## rooms_per_household          -2.912e+02  3.628e+02  -0.803 0.422213    
## bedrooms_per_room             6.352e+05  3.844e+04  16.526  < 2e-16 ***
## population_per_household      1.263e+02  5.547e+01   2.278 0.022749 *  
## log_total_rooms               1.085e+05  1.133e+04   9.578  < 2e-16 ***
## log_total_bedrooms           -2.434e+04  1.410e+04  -1.727 0.084216 .  
## log_population               -9.570e+04  4.712e+03 -20.311  < 2e-16 ***
## log_households                4.069e+04  7.693e+03   5.289 1.24e-07 ***
## log_median_income            -6.093e+04  3.296e+03 -18.489  < 2e-16 ***
## log_rooms_per_household      -8.936e+04  1.899e+04  -4.705 2.56e-06 ***
## log_bedrooms_per_room        -7.350e+05  7.355e+04  -9.992  < 2e-16 ***
## log_population_per_household  9.648e+04  1.970e+04   4.897 9.81e-07 ***
## county_nameAlpine             1.587e+04  2.848e+04   0.557 0.577363    
## county_nameAmador             1.928e+03  1.000e+04   0.193 0.847182    
## county_nameArizona,la Paz     3.135e+04  3.805e+04   0.824 0.410024    
## county_nameButte             -8.619e+03  7.146e+03  -1.206 0.227746    
## county_nameCalaveras         -9.997e+03  9.456e+03  -1.057 0.290457    
## county_nameColusa            -4.024e+04  1.423e+04  -2.828 0.004685 ** 
## county_nameContra Costa      -9.207e+03  3.021e+03  -3.048 0.002309 ** 
## county_nameDel Norte         -1.317e+04  2.181e+04  -0.604 0.545746    
## county_nameEl Dorado          2.936e+04  6.620e+03   4.436 9.22e-06 ***
## county_nameFresno            -2.756e+04  5.614e+03  -4.909 9.25e-07 ***
## county_nameGlenn             -3.371e+04  1.110e+04  -3.036 0.002400 ** 
## county_nameHumboldt          -6.415e+04  1.043e+04  -6.151 7.86e-10 ***
## county_nameImperial           1.095e+04  1.659e+04   0.660 0.509434    
## county_nameInyo               3.831e+04  1.350e+04   2.837 0.004560 ** 
## county_nameKern              -3.863e+04  8.594e+03  -4.495 7.01e-06 ***
## county_nameKings             -3.279e+04  7.873e+03  -4.165 3.13e-05 ***
## county_nameLake              -5.977e+04  6.941e+03  -8.611  < 2e-16 ***
## county_nameLassen             3.284e+04  1.364e+04   2.407 0.016098 *  
## county_nameLos Angeles        6.213e+04  1.120e+04   5.549 2.92e-08 ***
## county_nameMadera            -1.791e+04  7.187e+03  -2.493 0.012692 *  
## county_nameMarin              7.092e+04  4.971e+03  14.268  < 2e-16 ***
## county_nameMariposa          -3.904e+03  1.223e+04  -0.319 0.749623    
## county_nameMendocino         -4.742e+04  8.066e+03  -5.879 4.20e-09 ***
## county_nameMerced            -1.808e+04  5.773e+03  -3.132 0.001741 ** 
## county_nameModoc              2.962e+04  2.052e+04   1.443 0.148940    
## county_nameMono               4.553e+04  1.310e+04   3.477 0.000508 ***
## county_nameMonterey           3.014e+03  5.615e+03   0.537 0.591378    
## county_nameNapa               4.055e+03  5.566e+03   0.729 0.466260    
## county_nameNevada             4.071e+04  7.411e+03   5.493 4.00e-08 ***
## county_nameNevada,douglas     1.366e+05  4.875e+04   2.802 0.005083 ** 
## county_nameOrange             4.284e+04  1.213e+04   3.532 0.000413 ***
## county_namePlacer             3.207e+04  6.390e+03   5.018 5.26e-07 ***
## county_namePlumas             1.109e+04  1.104e+04   1.004 0.315486    
## county_nameRiverside          1.808e+04  1.294e+04   1.397 0.162386    
## county_nameSacramento        -1.177e+04  4.292e+03  -2.742 0.006116 ** 
## county_nameSan Benito         5.224e+04  1.044e+04   5.003 5.71e-07 ***
## county_nameSan Bernardino     9.215e+03  1.197e+04   0.770 0.441557    
## county_nameSan Diego          1.606e+04  1.456e+04   1.103 0.270199    
## county_nameSan Francisco      6.972e+04  4.793e+03  14.547  < 2e-16 ***
## county_nameSan Joaquin       -1.198e+04  3.996e+03  -2.997 0.002730 ** 
## county_nameSan Luis Obispo    1.969e+04  8.437e+03   2.334 0.019596 *  
## county_nameSan Mateo          8.243e+04  3.943e+03  20.905  < 2e-16 ***
## county_nameSanta Barbara      7.590e+03  9.166e+03   0.828 0.407658    
## county_nameSanta Clara        4.402e+04  3.276e+03  13.438  < 2e-16 ***
## county_nameSanta Cruz         3.665e+04  5.314e+03   6.897 5.51e-12 ***
## county_nameShasta            -5.213e+03  9.269e+03  -0.562 0.573852    
## county_nameSierra             2.755e+03  1.943e+04   0.142 0.887247    
## county_nameSiskiyou          -1.015e+04  1.343e+04  -0.756 0.449655    
## county_nameSolano            -3.203e+04  4.831e+03  -6.630 3.47e-11 ***
## county_nameSonoma             3.300e+03  4.573e+03   0.722 0.470478    
## county_nameStanislaus        -6.158e+02  4.399e+03  -0.140 0.888682    
## county_nameSutter            -1.898e+04  8.002e+03  -2.372 0.017714 *  
## county_nameTehama            -2.682e+04  9.723e+03  -2.759 0.005811 ** 
## county_nameTrinity           -4.878e+04  1.245e+04  -3.918 8.96e-05 ***
## county_nameTulare            -1.488e+04  7.089e+03  -2.099 0.035826 *  
## county_nameTuolumne           1.539e+04  7.748e+03   1.986 0.047046 *  
## county_nameVentura            3.441e+04  1.052e+04   3.271 0.001072 ** 
## county_nameYolo               5.240e+03  6.316e+03   0.830 0.406748    
## county_nameYuba              -1.914e+04  8.304e+03  -2.305 0.021183 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48390 on 16699 degrees of freedom
## Multiple R-squared:  0.7257, Adjusted R-squared:  0.7243 
## F-statistic: 545.3 on 81 and 16699 DF,  p-value: < 2.2e-16
summary(m7)
## 
## Call:
## lm(formula = mdf$median_house_value ~ ., data = mdf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -375639  -27436   -5451   19908  341759 
## 
## Coefficients: (1 not defined because of singularities)
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -2.494e+06  2.329e+05 -10.710  < 2e-16 ***
## longitude                    -3.020e+04  1.823e+03 -16.565  < 2e-16 ***
## latitude                     -2.374e+04  2.758e+03  -8.606  < 2e-16 ***
## housing_median_age            8.873e+01  4.317e+01   2.055 0.039850 *  
## total_rooms                  -3.924e-01  8.394e-01  -0.468 0.640110    
## total_bedrooms               -1.863e+01  8.108e+00  -2.298 0.021554 *  
## population                    4.922e-01  1.306e+00   0.377 0.706374    
## households                    1.386e+01  8.524e+00   1.626 0.104060    
## median_income                 4.546e+04  1.001e+03  45.404  < 2e-16 ***
## ocean_proximity_1H_OCEAN     -1.357e+03  1.874e+03  -0.724 0.468904    
## ocean_proximity_INLAND       -7.554e+03  2.803e+03  -2.695 0.007038 ** 
## ocean_proximity_NEAR_BAY      5.209e+02  2.889e+03   0.180 0.856941    
## ocean_proximity_NEAR_OCEAN           NA         NA      NA       NA    
## rooms_per_household          -2.912e+02  3.628e+02  -0.803 0.422213    
## bedrooms_per_room             6.352e+05  3.844e+04  16.526  < 2e-16 ***
## population_per_household      1.263e+02  5.547e+01   2.278 0.022749 *  
## log_total_rooms               1.085e+05  1.133e+04   9.578  < 2e-16 ***
## log_total_bedrooms           -2.434e+04  1.410e+04  -1.727 0.084216 .  
## log_population               -9.570e+04  4.712e+03 -20.311  < 2e-16 ***
## log_households                4.069e+04  7.693e+03   5.289 1.24e-07 ***
## log_median_income            -6.093e+04  3.296e+03 -18.489  < 2e-16 ***
## log_rooms_per_household      -8.936e+04  1.899e+04  -4.705 2.56e-06 ***
## log_bedrooms_per_room        -7.350e+05  7.355e+04  -9.992  < 2e-16 ***
## log_population_per_household  9.648e+04  1.970e+04   4.897 9.81e-07 ***
## county_nameAlpine             1.587e+04  2.848e+04   0.557 0.577363    
## county_nameAmador             1.928e+03  1.000e+04   0.193 0.847182    
## county_nameArizona,la Paz     3.135e+04  3.805e+04   0.824 0.410024    
## county_nameButte             -8.619e+03  7.146e+03  -1.206 0.227746    
## county_nameCalaveras         -9.997e+03  9.456e+03  -1.057 0.290457    
## county_nameColusa            -4.024e+04  1.423e+04  -2.828 0.004685 ** 
## county_nameContra Costa      -9.207e+03  3.021e+03  -3.048 0.002309 ** 
## county_nameDel Norte         -1.317e+04  2.181e+04  -0.604 0.545746    
## county_nameEl Dorado          2.936e+04  6.620e+03   4.436 9.22e-06 ***
## county_nameFresno            -2.756e+04  5.614e+03  -4.909 9.25e-07 ***
## county_nameGlenn             -3.371e+04  1.110e+04  -3.036 0.002400 ** 
## county_nameHumboldt          -6.415e+04  1.043e+04  -6.151 7.86e-10 ***
## county_nameImperial           1.095e+04  1.659e+04   0.660 0.509434    
## county_nameInyo               3.831e+04  1.350e+04   2.837 0.004560 ** 
## county_nameKern              -3.863e+04  8.594e+03  -4.495 7.01e-06 ***
## county_nameKings             -3.279e+04  7.873e+03  -4.165 3.13e-05 ***
## county_nameLake              -5.977e+04  6.941e+03  -8.611  < 2e-16 ***
## county_nameLassen             3.284e+04  1.364e+04   2.407 0.016098 *  
## county_nameLos Angeles        6.213e+04  1.120e+04   5.549 2.92e-08 ***
## county_nameMadera            -1.791e+04  7.187e+03  -2.493 0.012692 *  
## county_nameMarin              7.092e+04  4.971e+03  14.268  < 2e-16 ***
## county_nameMariposa          -3.904e+03  1.223e+04  -0.319 0.749623    
## county_nameMendocino         -4.742e+04  8.066e+03  -5.879 4.20e-09 ***
## county_nameMerced            -1.808e+04  5.773e+03  -3.132 0.001741 ** 
## county_nameModoc              2.962e+04  2.052e+04   1.443 0.148940    
## county_nameMono               4.553e+04  1.310e+04   3.477 0.000508 ***
## county_nameMonterey           3.014e+03  5.615e+03   0.537 0.591378    
## county_nameNapa               4.055e+03  5.566e+03   0.729 0.466260    
## county_nameNevada             4.071e+04  7.411e+03   5.493 4.00e-08 ***
## county_nameNevada,douglas     1.366e+05  4.875e+04   2.802 0.005083 ** 
## county_nameOrange             4.284e+04  1.213e+04   3.532 0.000413 ***
## county_namePlacer             3.207e+04  6.390e+03   5.018 5.26e-07 ***
## county_namePlumas             1.109e+04  1.104e+04   1.004 0.315486    
## county_nameRiverside          1.808e+04  1.294e+04   1.397 0.162386    
## county_nameSacramento        -1.177e+04  4.292e+03  -2.742 0.006116 ** 
## county_nameSan Benito         5.224e+04  1.044e+04   5.003 5.71e-07 ***
## county_nameSan Bernardino     9.215e+03  1.197e+04   0.770 0.441557    
## county_nameSan Diego          1.606e+04  1.456e+04   1.103 0.270199    
## county_nameSan Francisco      6.972e+04  4.793e+03  14.547  < 2e-16 ***
## county_nameSan Joaquin       -1.198e+04  3.996e+03  -2.997 0.002730 ** 
## county_nameSan Luis Obispo    1.969e+04  8.437e+03   2.334 0.019596 *  
## county_nameSan Mateo          8.243e+04  3.943e+03  20.905  < 2e-16 ***
## county_nameSanta Barbara      7.590e+03  9.166e+03   0.828 0.407658    
## county_nameSanta Clara        4.402e+04  3.276e+03  13.438  < 2e-16 ***
## county_nameSanta Cruz         3.665e+04  5.314e+03   6.897 5.51e-12 ***
## county_nameShasta            -5.213e+03  9.269e+03  -0.562 0.573852    
## county_nameSierra             2.755e+03  1.943e+04   0.142 0.887247    
## county_nameSiskiyou          -1.015e+04  1.343e+04  -0.756 0.449655    
## county_nameSolano            -3.203e+04  4.831e+03  -6.630 3.47e-11 ***
## county_nameSonoma             3.300e+03  4.573e+03   0.722 0.470478    
## county_nameStanislaus        -6.158e+02  4.399e+03  -0.140 0.888682    
## county_nameSutter            -1.898e+04  8.002e+03  -2.372 0.017714 *  
## county_nameTehama            -2.682e+04  9.723e+03  -2.759 0.005811 ** 
## county_nameTrinity           -4.878e+04  1.245e+04  -3.918 8.96e-05 ***
## county_nameTulare            -1.488e+04  7.089e+03  -2.099 0.035826 *  
## county_nameTuolumne           1.539e+04  7.748e+03   1.986 0.047046 *  
## county_nameVentura            3.441e+04  1.052e+04   3.271 0.001072 ** 
## county_nameYolo               5.240e+03  6.316e+03   0.830 0.406748    
## county_nameYuba              -1.914e+04  8.304e+03  -2.305 0.021183 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48390 on 16699 degrees of freedom
## Multiple R-squared:  0.7257, Adjusted R-squared:  0.7243 
## F-statistic: 545.3 on 81 and 16699 DF,  p-value: < 2.2e-16
autoplot(m6)

autoplot(m7)

Observamos metricas de los modelos

m6_adjr2 = summary(m6)$adj.r.squared
m7_adjr2 = summary(m7)$adj.r.squared


beginning_mods_results = data.frame(
  "Total Predictors" =
    c("m6" = extractAIC(m6)[1],
      "m7" = extractAIC(m7)[1]
      
      ),
  
  "AIC" =
    c("m6" = extractAIC(m6)[2],
      "m7" = extractAIC(m7)[2]

      
      ),
  
  "Adj R-Squared" =
    c("m6" = m6_adjr2,
      "m7" = m7_adjr2

      )
  )

kable(beginning_mods_results, align = c("c", "r"))
Total.Predictors AIC Adj.R.Squared
m6 82 362116.5 0.7243287
m7 82 362116.5 0.7243287

MODELO FINAL

library(dplyr)
str(cdf)
## Classes 'data.table' and 'data.frame':   18071 obs. of  24 variables:
##  $ longitude                   : num  -122 -122 -122 -122 -122 ...
##  $ latitude                    : num  37.9 37.9 37.8 37.9 37.9 ...
##  $ housing_median_age          : num  41 21 42 40 42 41 48 48 43 40 ...
##  $ total_rooms                 : num  880 7099 2555 751 1639 ...
##  $ total_bedrooms              : num  129 1106 665 184 367 ...
##  $ population                  : num  322 2401 1206 409 929 ...
##  $ households                  : num  126 1138 595 166 366 ...
##  $ median_income               : num  8.33 8.3 2.08 1.36 1.71 ...
##  $ median_house_value          : num  452600 358500 226700 147500 159800 ...
##  $ ocean_proximity_1H_OCEAN    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ocean_proximity_INLAND      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ocean_proximity_NEAR_BAY    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ ocean_proximity_NEAR_OCEAN  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rooms_per_household         : num  6.98 6.24 4.29 4.52 4.48 ...
##  $ bedrooms_per_room           : num  0.147 0.156 0.26 0.245 0.224 ...
##  $ population_per_household    : num  2.56 2.11 2.03 2.46 2.54 ...
##  $ log_total_rooms             : num  6.78 8.87 7.85 6.62 7.4 ...
##  $ log_total_bedrooms          : num  4.86 7.01 6.5 5.21 5.91 ...
##  $ log_population              : num  5.77 7.78 7.1 6.01 6.83 ...
##  $ log_households              : num  4.84 7.04 6.39 5.11 5.9 ...
##  $ log_median_income           : num  2.119 2.116 0.733 0.306 0.539 ...
##  $ log_rooms_per_household     : num  1.4 1.26 1.23 1.3 1.25 ...
##  $ log_bedrooms_per_room       : num  0.717 0.79 0.828 0.788 0.798 ...
##  $ log_population_per_household: num  1.19 1.11 1.11 1.18 1.16 ...
##  - attr(*, ".internal.selfref")=<externalptr>
cor(cdf)
##                                  longitude     latitude housing_median_age
## longitude                     1.0000000000 -0.922364020        -0.02308773
## latitude                     -0.9223640199  1.000000000        -0.06172033
## housing_median_age           -0.0230877337 -0.061720327         1.00000000
## total_rooms                   0.0336844030 -0.026127082        -0.37476300
## total_bedrooms                0.0630945239 -0.063519194        -0.33130972
## population                    0.0888992366 -0.105798593        -0.27718377
## households                    0.0493879355 -0.069338040        -0.31045185
## median_income                -0.0186060804 -0.074272054        -0.19793210
## median_house_value           -0.0232645793 -0.172011621         0.01381800
## ocean_proximity_1H_OCEAN      0.2854194785 -0.428637043         0.11762475
## ocean_proximity_INLAND       -0.0922112705  0.386166117        -0.21969804
## ocean_proximity_NEAR_BAY     -0.4117854923  0.307068376         0.12337611
## ocean_proximity_NEAR_OCEAN    0.0421879553 -0.162919971         0.03724211
## rooms_per_household          -0.0384494147  0.130983264        -0.17876200
## bedrooms_per_room             0.1157156990 -0.143056445         0.14426166
## population_per_household      0.0004556728  0.002566498         0.02155349
## log_total_rooms               0.0209708958 -0.024666645        -0.32481705
## log_total_bedrooms            0.0576674469 -0.068843342        -0.27763584
## log_population                0.1030874510 -0.141221893        -0.21160782
## log_households                0.0532614785 -0.091996515        -0.24197414
## log_median_income            -0.0198907447 -0.075912691        -0.18403576
## log_rooms_per_household      -0.0700990514  0.151488923        -0.05157560
## log_bedrooms_per_room         0.1105947173 -0.134860118        -0.02612265
## log_population_per_household  0.0623846343 -0.046976566         0.10572349
##                                total_rooms total_bedrooms  population
## longitude                     0.0336844030    0.063094524  0.08889924
## latitude                     -0.0261270816   -0.063519194 -0.10579859
## housing_median_age           -0.3747630026   -0.331309723 -0.27718377
## total_rooms                   1.0000000000    0.935422796  0.86023006
## total_bedrooms                0.9354227956    1.000000000  0.88021200
## population                    0.8602300574    0.880212004  1.00000000
## households                    0.9224226021    0.978746198  0.91068245
## median_income                 0.2227087874    0.020802496  0.04015941
## median_house_value            0.1532913001    0.079970469  0.02227064
## ocean_proximity_1H_OCEAN     -0.0162807691    0.012381030  0.06951887
## ocean_proximity_INLAND        0.0263535919   -0.013435002 -0.03799835
## ocean_proximity_NEAR_BAY      0.0003676967   -0.003761993 -0.03433719
## ocean_proximity_NEAR_OCEAN   -0.0134742604    0.003929836 -0.02140617
## rooms_per_household           0.1397245138    0.013422080 -0.07445637
## bedrooms_per_room            -0.1875404824    0.076759152  0.03409933
## population_per_household     -0.0241525895   -0.027629035  0.06532818
## log_total_rooms               0.7940511233    0.771069090  0.69415683
## log_total_bedrooms            0.7506468980    0.817471436  0.72221616
## log_population                0.6868776974    0.725579093  0.79772220
## log_households                0.7314149724    0.789293297  0.73601806
## log_median_income             0.2322828171    0.042326951  0.05389188
## log_rooms_per_household      -0.1963873711   -0.327411813 -0.35015483
## log_bedrooms_per_room         0.2734459719    0.491888221  0.41129794
## log_population_per_household -0.2756666873   -0.317412823 -0.09722667
##                                households median_income median_house_value
## longitude                     0.049387935  -0.018606080        -0.02326458
## latitude                     -0.069338040  -0.074272054        -0.17201162
## housing_median_age           -0.310451853  -0.197932095         0.01381800
## total_rooms                   0.922422602   0.222708787         0.15329130
## total_bedrooms                0.978746198   0.020802496         0.07997047
## population                    0.910682455   0.040159411         0.02227064
## households                    1.000000000   0.045468585         0.09948996
## median_income                 0.045468585   1.000000000         0.66577485
## median_house_value            0.099489960   0.665774849         1.00000000
## ocean_proximity_1H_OCEAN      0.038132019   0.184800134         0.32103851
## ocean_proximity_INLAND       -0.047228557  -0.228050323        -0.50301677
## ocean_proximity_NEAR_BAY      0.005665539   0.067867661         0.11481952
## ocean_proximity_NEAR_OCEAN    0.006090457  -0.005749940         0.14500129
## rooms_per_household          -0.078929107   0.311129787         0.11363592
## bedrooms_per_room             0.058160385  -0.634558982        -0.23391359
## population_per_household     -0.026534027   0.025866833        -0.01991006
## log_total_rooms               0.769172585   0.234052911         0.17626147
## log_total_bedrooms            0.808594866   0.009043832         0.08548597
## log_population                0.759431749   0.037795960         0.02962732
## log_households                0.815401150   0.042968702         0.11003767
## log_median_income             0.067472361   0.958447829         0.62987265
## log_rooms_per_household      -0.384828841   0.256478728         0.05214824
## log_bedrooms_per_room         0.474746017  -0.478212118        -0.15192829
## log_population_per_household -0.313887439  -0.016494389        -0.15963422
##                              ocean_proximity_1H_OCEAN ocean_proximity_INLAND
## longitude                                 0.285419479            -0.09221127
## latitude                                 -0.428637043             0.38616612
## housing_median_age                        0.117624750            -0.21969804
## total_rooms                              -0.016280769             0.02635359
## total_bedrooms                            0.012381030            -0.01343500
## population                                0.069518865            -0.03799835
## households                                0.038132019            -0.04722856
## median_income                             0.184800134            -0.22805032
## median_house_value                        0.321038511            -0.50301677
## ocean_proximity_1H_OCEAN                  1.000000000            -0.65541248
## ocean_proximity_INLAND                   -0.655412478             1.00000000
## ocean_proximity_NEAR_BAY                 -0.267518416            -0.21501585
## ocean_proximity_NEAR_OCEAN               -0.341321517            -0.27433452
## rooms_per_household                      -0.126176671             0.18085036
## bedrooms_per_room                         0.104289188            -0.14222788
## population_per_household                 -0.002092085             0.01133422
## log_total_rooms                           0.009729217            -0.01091605
## log_total_bedrooms                        0.038781199            -0.05283844
## log_population                            0.116629610            -0.09485921
## log_households                            0.073411382            -0.09864180
## log_median_income                         0.185238287            -0.23232746
## log_rooms_per_household                  -0.144206718             0.19766611
## log_bedrooms_per_room                     0.094314045            -0.12904802
## log_population_per_household              0.033718204             0.04757230
##                              ocean_proximity_NEAR_BAY
## longitude                               -0.4117854923
## latitude                                 0.3070683763
## housing_median_age                       0.1233761099
## total_rooms                              0.0003676967
## total_bedrooms                          -0.0037619932
## population                              -0.0343371915
## households                               0.0056655389
## median_income                            0.0678676612
## median_house_value                       0.1148195192
## ocean_proximity_1H_OCEAN                -0.2675184160
## ocean_proximity_INLAND                  -0.2150158524
## ocean_proximity_NEAR_BAY                 1.0000000000
## ocean_proximity_NEAR_OCEAN              -0.1119745800
## rooms_per_household                     -0.0201352195
## bedrooms_per_room                       -0.0210359696
## population_per_household                -0.0120023879
## log_total_rooms                         -0.0007979923
## log_total_bedrooms                      -0.0080782466
## log_population                          -0.0378127369
## log_households                           0.0036637247
## log_median_income                        0.0633516305
## log_rooms_per_household                 -0.0113660622
## log_bedrooms_per_room                   -0.0232309601
## log_population_per_household            -0.0682139322
##                              ocean_proximity_NEAR_OCEAN rooms_per_household
## longitude                                   0.042187955        -0.038449415
## latitude                                   -0.162919971         0.130983264
## housing_median_age                          0.037242109        -0.178761998
## total_rooms                                -0.013474260         0.139724514
## total_bedrooms                              0.003929836         0.013422080
## population                                 -0.021406165        -0.074456368
## households                                  0.006090457        -0.078929107
## median_income                              -0.005749940         0.311129787
## median_house_value                          0.145001289         0.113635921
## ocean_proximity_1H_OCEAN                   -0.341321517        -0.126176671
## ocean_proximity_INLAND                     -0.274334523         0.180850360
## ocean_proximity_NEAR_BAY                   -0.111974580        -0.020135220
## ocean_proximity_NEAR_OCEAN                  1.000000000        -0.053486583
## rooms_per_household                        -0.053486583         1.000000000
## bedrooms_per_room                           0.064356659        -0.415270347
## population_per_household                   -0.003237739        -0.007216777
## log_total_rooms                             0.002008713         0.139160080
## log_total_bedrooms                          0.024397651         0.001579552
## log_population                             -0.007586525        -0.177676942
## log_households                              0.028601981        -0.162566947
## log_median_income                           0.003446060         0.295219340
## log_rooms_per_household                    -0.057864508         0.659298701
## log_bedrooms_per_room                       0.062433286        -0.298376578
## log_population_per_household               -0.062813422         0.038223326
##                              bedrooms_per_room population_per_household
## longitude                         0.1157156990             0.0004556728
## latitude                         -0.1430564446             0.0025664977
## housing_median_age                0.1442616560             0.0215534933
## total_rooms                      -0.1875404824            -0.0241525895
## total_bedrooms                    0.0767591524            -0.0276290346
## population                        0.0340993331             0.0653281812
## households                        0.0581603852            -0.0265340274
## median_income                    -0.6345589816             0.0258668333
## median_house_value               -0.2339135880            -0.0199100594
## ocean_proximity_1H_OCEAN          0.1042891883            -0.0020920846
## ocean_proximity_INLAND           -0.1422278798             0.0113342227
## ocean_proximity_NEAR_BAY         -0.0210359696            -0.0120023879
## ocean_proximity_NEAR_OCEAN        0.0643566593            -0.0032377391
## rooms_per_household              -0.4152703470            -0.0072167769
## bedrooms_per_room                 1.0000000000             0.0043603587
## population_per_household          0.0043603587             1.0000000000
## log_total_rooms                  -0.2477146086            -0.0835735056
## log_total_bedrooms                0.0617609405            -0.0846318913
## log_population                    0.0172628437             0.0417694650
## log_households                    0.0321968961            -0.0810461733
## log_median_income                -0.6489189054             0.0145874010
## log_rooms_per_household          -0.4207453003             0.0706099308
## log_bedrooms_per_room             0.7211657237            -0.0853377761
## log_population_per_household     -0.0005781063             0.5528646835
##                              log_total_rooms log_total_bedrooms log_population
## longitude                       0.0209708958        0.057667447    0.103087451
## latitude                       -0.0246666450       -0.068843342   -0.141221893
## housing_median_age             -0.3248170511       -0.277635842   -0.211607821
## total_rooms                     0.7940511233        0.750646898    0.686877697
## total_bedrooms                  0.7710690903        0.817471436    0.725579093
## population                      0.6941568286        0.722216158    0.797722200
## households                      0.7691725851        0.808594866    0.759431749
## median_income                   0.2340529114        0.009043832    0.037795960
## median_house_value              0.1762614681        0.085485972    0.029627317
## ocean_proximity_1H_OCEAN        0.0097292173        0.038781199    0.116629610
## ocean_proximity_INLAND         -0.0109160491       -0.052838437   -0.094859210
## ocean_proximity_NEAR_BAY       -0.0007979923       -0.008078247   -0.037812737
## ocean_proximity_NEAR_OCEAN      0.0020087127        0.024397651   -0.007586525
## rooms_per_household             0.1391600797        0.001579552   -0.177676942
## bedrooms_per_room              -0.2477146086        0.061760940    0.017262844
## population_per_household       -0.0835735056       -0.084631891    0.041769465
## log_total_rooms                 1.0000000000        0.949123739    0.863753047
## log_total_bedrooms              0.9491237386        1.000000000    0.895265226
## log_population                  0.8637530465        0.895265226    1.000000000
## log_households                  0.9326880798        0.972298862    0.933558108
## log_median_income               0.2671505080        0.045589068    0.069285155
## log_rooms_per_household        -0.3705381193       -0.520693264   -0.601721079
## log_bedrooms_per_room           0.4350723248        0.686033805    0.586311406
## log_population_per_household   -0.4906483419       -0.511370349   -0.230411960
##                              log_households log_median_income
## longitude                       0.053261479       -0.01989074
## latitude                       -0.091996515       -0.07591269
## housing_median_age             -0.241974142       -0.18403576
## total_rooms                     0.731414972        0.23228282
## total_bedrooms                  0.789293297        0.04232695
## population                      0.736018062        0.05389188
## households                      0.815401150        0.06747236
## median_income                   0.042968702        0.95844783
## median_house_value              0.110037671        0.62987265
## ocean_proximity_1H_OCEAN        0.073411382        0.18523829
## ocean_proximity_INLAND         -0.098641795       -0.23232746
## ocean_proximity_NEAR_BAY        0.003663725        0.06335163
## ocean_proximity_NEAR_OCEAN      0.028601981        0.00344606
## rooms_per_household            -0.162566947        0.29521934
## bedrooms_per_room               0.032196896       -0.64891891
## population_per_household       -0.081046173        0.01458740
## log_total_rooms                 0.932688080        0.26715051
## log_total_bedrooms              0.972298862        0.04558907
## log_population                  0.933558108        0.06928515
## log_households                  1.000000000        0.08102491
## log_median_income               0.081024906        1.00000000
## log_rooms_per_household        -0.630269747        0.22346051
## log_bedrooms_per_room           0.645308828       -0.45100734
## log_population_per_household   -0.507470925       -0.04566184
##                              log_rooms_per_household log_bedrooms_per_room
## longitude                                -0.07009905            0.11059472
## latitude                                  0.15148892           -0.13486012
## housing_median_age                       -0.05157560           -0.02612265
## total_rooms                              -0.19638737            0.27344597
## total_bedrooms                           -0.32741181            0.49188822
## population                               -0.35015483            0.41129794
## households                               -0.38482884            0.47474602
## median_income                             0.25647873           -0.47821212
## median_house_value                        0.05214824           -0.15192829
## ocean_proximity_1H_OCEAN                 -0.14420672            0.09431404
## ocean_proximity_INLAND                    0.19766611           -0.12904802
## ocean_proximity_NEAR_BAY                 -0.01136606           -0.02323096
## ocean_proximity_NEAR_OCEAN               -0.05786451            0.06243329
## rooms_per_household                       0.65929870           -0.29837658
## bedrooms_per_room                        -0.42074530            0.72116572
## population_per_household                  0.07060993           -0.08533778
## log_total_rooms                          -0.37053812            0.43507232
## log_total_bedrooms                       -0.52069326            0.68603380
## log_population                           -0.60172108            0.58631141
## log_households                           -0.63026975            0.64530883
## log_median_income                         0.22346051           -0.45100734
## log_rooms_per_household                   1.00000000           -0.69434825
## log_bedrooms_per_room                    -0.69434825            1.00000000
## log_population_per_household              0.46168538           -0.39208473
##                              log_population_per_household
## longitude                                    0.0623846343
## latitude                                    -0.0469765656
## housing_median_age                           0.1057234910
## total_rooms                                 -0.2756666873
## total_bedrooms                              -0.3174128230
## population                                  -0.0972266706
## households                                  -0.3138874388
## median_income                               -0.0164943895
## median_house_value                          -0.1596342228
## ocean_proximity_1H_OCEAN                     0.0337182043
## ocean_proximity_INLAND                       0.0475723024
## ocean_proximity_NEAR_BAY                    -0.0682139322
## ocean_proximity_NEAR_OCEAN                  -0.0628134216
## rooms_per_household                          0.0382233257
## bedrooms_per_room                           -0.0005781063
## population_per_household                     0.5528646835
## log_total_rooms                             -0.4906483419
## log_total_bedrooms                          -0.5113703493
## log_population                              -0.2304119597
## log_households                              -0.5074709249
## log_median_income                           -0.0456618362
## log_rooms_per_household                      0.4616853762
## log_bedrooms_per_room                       -0.3920847272
## log_population_per_household                 1.0000000000
x.train <- sample_frac(tbl = cdf, replace = FALSE, size = 0.80)
x.test <- anti_join(cdf, x.train)
## Joining, by = c("longitude", "latitude", "housing_median_age", "total_rooms",
## "total_bedrooms", "population", "households", "median_income",
## "median_house_value", "ocean_proximity_1H_OCEAN", "ocean_proximity_INLAND",
## "ocean_proximity_NEAR_BAY", "ocean_proximity_NEAR_OCEAN",
## "rooms_per_household", "bedrooms_per_room", "population_per_household",
## "log_total_rooms", "log_total_bedrooms", "log_population", "log_households",
## "log_median_income", "log_rooms_per_household", "log_bedrooms_per_room",
## "log_population_per_household")
summary(x.train)
##    longitude         latitude     housing_median_age  total_rooms   
##  Min.   :-124.3   Min.   :32.54   Min.   : 1.00      Min.   :    2  
##  1st Qu.:-121.5   1st Qu.:33.92   1st Qu.:17.00      1st Qu.: 1457  
##  Median :-118.5   Median :34.24   Median :27.00      Median : 2143  
##  Mean   :-119.5   Mean   :35.60   Mean   :26.64      Mean   : 2672  
##  3rd Qu.:-118.0   3rd Qu.:37.67   3rd Qu.:35.00      3rd Qu.: 3183  
##  Max.   :-114.3   Max.   :41.95   Max.   :48.00      Max.   :39320  
##  total_bedrooms     population      households     median_income    
##  Min.   :   2.0   Min.   :    3   Min.   :   2.0   Min.   : 0.4999  
##  1st Qu.: 300.0   1st Qu.:  815   1st Qu.: 284.0   1st Qu.: 2.5389  
##  Median : 442.0   Median : 1208   Median : 415.0   Median : 3.4712  
##  Mean   : 549.1   Mean   : 1473   Mean   : 509.4   Mean   : 3.6923  
##  3rd Qu.: 659.0   3rd Qu.: 1781   3rd Qu.: 614.0   3rd Qu.: 4.6118  
##  Max.   :6210.0   Max.   :35682   Max.   :5358.0   Max.   :13.1477  
##  median_house_value ocean_proximity_1H_OCEAN ocean_proximity_INLAND
##  Min.   : 14999     Min.   :0.0000           Min.   :0.0000        
##  1st Qu.:115000     1st Qu.:0.0000           1st Qu.:0.0000        
##  Median :171100     Median :0.0000           Median :0.0000        
##  Mean   :188957     Mean   :0.4459           Mean   :0.3474        
##  3rd Qu.:242100     3rd Qu.:1.0000           3rd Qu.:1.0000        
##  Max.   :499100     Max.   :1.0000           Max.   :1.0000        
##  ocean_proximity_NEAR_BAY ocean_proximity_NEAR_OCEAN rooms_per_household
##  Min.   :0.00000          Min.   :0.0000             Min.   :  0.8461   
##  1st Qu.:0.00000          1st Qu.:0.0000             1st Qu.:  4.4410   
##  Median :0.00000          Median :0.0000             Median :  5.2209   
##  Mean   :0.08024          Mean   :0.1263             Mean   :  5.3947   
##  3rd Qu.:0.00000          3rd Qu.:0.0000             3rd Qu.:  5.9895   
##  Max.   :1.00000          Max.   :1.0000             Max.   :132.5333   
##  bedrooms_per_room population_per_household log_total_rooms  
##  Min.   :0.1128    Min.   :   0.750         Min.   : 0.6931  
##  1st Qu.:0.1771    1st Qu.:   2.472         1st Qu.: 7.2841  
##  Median :0.2036    Median :   2.857         Median : 7.6700  
##  Mean   :0.2139    Mean   :   3.152         Mean   : 7.6446  
##  3rd Qu.:0.2401    3rd Qu.:   3.326         3rd Qu.: 8.0656  
##  Max.   :1.0000    Max.   :1243.333         Max.   :10.5795  
##  log_total_bedrooms log_population   log_households   log_median_income
##  Min.   :0.6931     Min.   : 1.099   Min.   :0.6931   Min.   :-0.6933  
##  1st Qu.:5.7038     1st Qu.: 6.703   1st Qu.:5.6490   1st Qu.: 0.9317  
##  Median :6.0913     Median : 7.097   Median :6.0283   Median : 1.2445  
##  Mean   :6.0736     Mean   : 7.061   Mean   :6.0017   Mean   : 1.2155  
##  3rd Qu.:6.4907     3rd Qu.: 7.485   3rd Qu.:6.4200   3rd Qu.: 1.5286  
##  Max.   :8.7339     Max.   :10.482   Max.   :8.5863   Max.   : 2.5762  
##  log_rooms_per_household log_bedrooms_per_room log_population_per_household
##  Min.   :0.9349          Min.   :0.3372        Min.   :0.7925              
##  1st Qu.:1.2410          1st Qu.:0.7727        1st Qu.:1.1478              
##  Median :1.2734          Median :0.7933        Median :1.1749              
##  Mean   :1.2803          Mean   :0.7930        Mean   :1.1815              
##  3rd Qu.:1.3058          3rd Qu.:0.8144        3rd Qu.:1.2057              
##  Max.   :5.6147          Max.   :1.0000        Max.   :4.9768
model <- lm(x.train$median_house_value ~ x.train$latitude + x.train$median_income + x.train$ocean_proximity_1H_OCEAN + x.train$ocean_proximity_INLAND + x.train$bedrooms_per_room + x.train$log_population_per_household ,x.train)
summary(model)
## 
## Call:
## lm(formula = x.train$median_house_value ~ x.train$latitude + 
##     x.train$median_income + x.train$ocean_proximity_1H_OCEAN + 
##     x.train$ocean_proximity_INLAND + x.train$bedrooms_per_room + 
##     x.train$log_population_per_household, data = x.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -373267  -36922   -9976   25038  538284 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           155174.7    13036.8  11.903  < 2e-16 ***
## x.train$latitude                         516.1      262.2   1.968   0.0491 *  
## x.train$median_income                  42512.3      455.5  93.332  < 2e-16 ***
## x.train$ocean_proximity_1H_OCEAN       -8525.7     1382.1  -6.169 7.08e-10 ***
## x.train$ocean_proximity_INLAND        -70061.2     1470.9 -47.631  < 2e-16 ***
## x.train$bedrooms_per_room             295385.9    12626.8  23.394  < 2e-16 ***
## x.train$log_population_per_household -149470.2     6087.8 -24.552  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 60090 on 14450 degrees of freedom
## Multiple R-squared:  0.602,  Adjusted R-squared:  0.6018 
## F-statistic:  3642 on 6 and 14450 DF,  p-value: < 2.2e-16
test_actual = x.test$median_house_value
# the predicted house values for the test set
test_predictions = predict(model, x.test)
## Warning: 'newdata' had 3614 rows but variables found have 14457 rows
# the RMSE

test_rmse = sqrt(mean((test_actual - test_predictions)^2))
## Warning in test_actual - test_predictions: longer object length is not a
## multiple of shorter object length
# the percentage error

test_rmse
## [1] 120726.2

TRY2

x.train <- sample_frac(tbl = df, replace = FALSE, size = 0.80)
x.test <- anti_join(df, x.train)
## Joining, by = c("longitude", "latitude", "housing_median_age", "total_rooms",
## "total_bedrooms", "population", "households", "median_income",
## "median_house_value", "ocean_proximity_1H_OCEAN", "ocean_proximity_INLAND",
## "ocean_proximity_ISLAND", "ocean_proximity_NEAR_BAY",
## "ocean_proximity_NEAR_OCEAN")
model <- lm(x.train$median_house_value ~ .,x.train)
summary(model)
## 
## Call:
## lm(formula = x.train$median_house_value ~ ., data = x.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -557344  -42673  -10637   28533  761915 
## 
## Coefficients: (1 not defined because of singularities)
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -2.234e+06  9.821e+04 -22.747  < 2e-16 ***
## longitude                  -2.639e+04  1.133e+03 -23.290  < 2e-16 ***
## latitude                   -2.498e+04  1.116e+03 -22.380  < 2e-16 ***
## housing_median_age          1.081e+03  4.883e+01  22.144  < 2e-16 ***
## total_rooms                -5.951e+00  8.879e-01  -6.702 2.12e-11 ***
## total_bedrooms              1.006e+02  7.620e+00  13.205  < 2e-16 ***
## population                 -3.719e+01  1.182e+00 -31.472  < 2e-16 ***
## households                  4.606e+01  8.236e+00   5.593 2.27e-08 ***
## median_income               3.926e+04  3.776e+02 103.993  < 2e-16 ***
## ocean_proximity_1H_OCEAN   -3.472e+03  1.738e+03  -1.998 0.045742 *  
## ocean_proximity_INLAND     -4.388e+04  2.496e+03 -17.577  < 2e-16 ***
## ocean_proximity_ISLAND      1.223e+05  3.959e+04   3.089 0.002009 ** 
## ocean_proximity_NEAR_BAY   -8.682e+03  2.425e+03  -3.579 0.000345 ***
## ocean_proximity_NEAR_OCEAN         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 68490 on 16333 degrees of freedom
## Multiple R-squared:  0.647,  Adjusted R-squared:  0.6467 
## F-statistic:  2494 on 12 and 16333 DF,  p-value: < 2.2e-16
test_actual = x.test$median_house_value
# the predicted house values for the test set
test_predictions = predict(m1, x.test)
## Warning in predict.lm(m1, x.test): prediction from a rank-deficient fit may be
## misleading
# the RMSE

test_rmse = sqrt(mean((test_actual - test_predictions)^2))
# the percentage error

test_rmse
## [1] 64675.25