Predicción lineal de casas de Melbourne

Regresión lineal múltiple con variables cuantitativas

Las librerías

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(reshape) # Para renombrar columnas
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
library(caret) # Para particiones
## Loading required package: lattice
library(corrplot) # Para correlaciones visuales
## corrplot 0.84 loaded

Los datos originales

En RUTA, reemplazar la ruta con su directorio de trabajo en donde se encuentre melb_datacomplete.csv

#En RUTA, reemplazar la ruta con su directorio de trabajo en donde se encuentre Sales.csv
ruta <- "D:/inspiron 1545/ESCUELA/ITD/OCTAVO/Analisis Inteligente de Datos/R/datos"
setwd(ruta)
datos <- read_csv("melb_datacomplete.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Suburb = col_character(),
##   Address = col_character(),
##   Type = col_character(),
##   Method = col_character(),
##   SellerG = col_character(),
##   Date = col_character(),
##   CouncilArea = col_character(),
##   Regionname = col_character()
## )
## See spec(...) for full column specifications.
head(datos)
## # A tibble: 6 x 21
##   Suburb Address Rooms Type   Price Method SellerG Date  Distance Postcode
##   <chr>  <chr>   <dbl> <chr>  <dbl> <chr>  <chr>   <chr>    <dbl>    <dbl>
## 1 Abbot~ 85 Tur~     2 h     1.48e6 S      Biggin  3/12~      2.5     3067
## 2 Abbot~ 25 Blo~     2 h     1.03e6 S      Biggin  4/02~      2.5     3067
## 3 Abbot~ 5 Char~     3 h     1.46e6 SP     Biggin  4/03~      2.5     3067
## 4 Abbot~ 40 Fed~     3 h     8.50e5 PI     Biggin  4/03~      2.5     3067
## 5 Abbot~ 55a Pa~     4 h     1.60e6 VB     Nelson  4/06~      2.5     3067
## 6 Abbot~ 129 Ch~     2 h     9.41e5 S      Jellis  7/05~      2.5     3067
## # ... with 11 more variables: Bedroom2 <dbl>, Bathroom <dbl>, Car <dbl>,
## #   Landsize <dbl>, BuildingArea <dbl>, YearBuilt <dbl>, CouncilArea <chr>,
## #   Lattitude <dbl>, Longtitude <dbl>, Regionname <chr>, Propertycount <dbl>
tail(datos)
## # A tibble: 6 x 21
##   Suburb Address Rooms Type   Price Method SellerG Date  Distance Postcode
##   <chr>  <chr>   <dbl> <chr>  <dbl> <chr>  <chr>   <chr>    <dbl>    <dbl>
## 1 Westm~ 9 Blac~     3 h     5.82e5 S      Red     26/0~     16.5     3049
## 2 Wheel~ 12 Str~     4 h     1.25e6 S      Barry   26/0~     16.7     3150
## 3 Willi~ 77 Mer~     3 h     1.03e6 SP     Willia~ 26/0~      6.8     3016
## 4 Willi~ 83 Pow~     3 h     1.17e6 S      Raine   26/0~      6.8     3016
## 5 Willi~ 96 Ver~     4 h     2.50e6 PI     Sweeney 26/0~      6.8     3016
## 6 Yarra~ 6 Agne~     4 h     1.28e6 SP     Village 26/0~      6.3     3013
## # ... with 11 more variables: Bedroom2 <dbl>, Bathroom <dbl>, Car <dbl>,
## #   Landsize <dbl>, BuildingArea <dbl>, YearBuilt <dbl>, CouncilArea <chr>,
## #   Lattitude <dbl>, Longtitude <dbl>, Regionname <chr>, Propertycount <dbl>

Describir los datos con str() y summary()

str(datos)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 13580 obs. of  21 variables:
##  $ Suburb       : chr  "Abbotsford" "Abbotsford" "Abbotsford" "Abbotsford" ...
##  $ Address      : chr  "85 Turner St" "25 Bloomburg St" "5 Charles St" "40 Federation La" ...
##  $ Rooms        : num  2 2 3 3 4 2 3 2 1 2 ...
##  $ Type         : chr  "h" "h" "h" "h" ...
##  $ Price        : num  1480000 1035000 1465000 850000 1600000 ...
##  $ Method       : chr  "S" "S" "SP" "PI" ...
##  $ SellerG      : chr  "Biggin" "Biggin" "Biggin" "Biggin" ...
##  $ Date         : chr  "3/12/2016" "4/02/2016" "4/03/2017" "4/03/2017" ...
##  $ Distance     : num  2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ Postcode     : num  3067 3067 3067 3067 3067 ...
##  $ Bedroom2     : num  2 2 3 3 3 2 4 2 1 3 ...
##  $ Bathroom     : num  1 1 2 2 1 1 2 1 1 1 ...
##  $ Car          : num  1 0 0 1 2 0 0 2 1 2 ...
##  $ Landsize     : num  202 156 134 94 120 181 245 256 0 220 ...
##  $ BuildingArea : num  NA 79 150 NA 142 NA 210 107 NA 75 ...
##  $ YearBuilt    : num  NA 1900 1900 NA 2014 ...
##  $ CouncilArea  : chr  "Yarra" "Yarra" "Yarra" "Yarra" ...
##  $ Lattitude    : num  -37.8 -37.8 -37.8 -37.8 -37.8 ...
##  $ Longtitude   : num  145 145 145 145 145 ...
##  $ Regionname   : chr  "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" ...
##  $ Propertycount: num  4019 4019 4019 4019 4019 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Suburb = col_character(),
##   ..   Address = col_character(),
##   ..   Rooms = col_double(),
##   ..   Type = col_character(),
##   ..   Price = col_double(),
##   ..   Method = col_character(),
##   ..   SellerG = col_character(),
##   ..   Date = col_character(),
##   ..   Distance = col_double(),
##   ..   Postcode = col_double(),
##   ..   Bedroom2 = col_double(),
##   ..   Bathroom = col_double(),
##   ..   Car = col_double(),
##   ..   Landsize = col_double(),
##   ..   BuildingArea = col_double(),
##   ..   YearBuilt = col_double(),
##   ..   CouncilArea = col_character(),
##   ..   Lattitude = col_double(),
##   ..   Longtitude = col_double(),
##   ..   Regionname = col_character(),
##   ..   Propertycount = col_double()
##   .. )
summary(datos)
##     Suburb            Address              Rooms            Type          
##  Length:13580       Length:13580       Min.   : 1.000   Length:13580      
##  Class :character   Class :character   1st Qu.: 2.000   Class :character  
##  Mode  :character   Mode  :character   Median : 3.000   Mode  :character  
##                                        Mean   : 2.938                     
##                                        3rd Qu.: 3.000                     
##                                        Max.   :10.000                     
##                                                                           
##      Price            Method            SellerG              Date          
##  Min.   :  85000   Length:13580       Length:13580       Length:13580      
##  1st Qu.: 650000   Class :character   Class :character   Class :character  
##  Median : 903000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1075684                                                           
##  3rd Qu.:1330000                                                           
##  Max.   :9000000                                                           
##                                                                            
##     Distance        Postcode       Bedroom2         Bathroom    
##  Min.   : 0.00   Min.   :3000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.: 6.10   1st Qu.:3044   1st Qu.: 2.000   1st Qu.:1.000  
##  Median : 9.20   Median :3084   Median : 3.000   Median :1.000  
##  Mean   :10.14   Mean   :3105   Mean   : 2.915   Mean   :1.534  
##  3rd Qu.:13.00   3rd Qu.:3148   3rd Qu.: 3.000   3rd Qu.:2.000  
##  Max.   :48.10   Max.   :3977   Max.   :20.000   Max.   :8.000  
##                                                                 
##       Car           Landsize         BuildingArea     YearBuilt   
##  Min.   : 0.00   Min.   :     0.0   Min.   :    0   Min.   :1196  
##  1st Qu.: 1.00   1st Qu.:   177.0   1st Qu.:   93   1st Qu.:1940  
##  Median : 2.00   Median :   440.0   Median :  126   Median :1970  
##  Mean   : 1.61   Mean   :   558.4   Mean   :  152   Mean   :1965  
##  3rd Qu.: 2.00   3rd Qu.:   651.0   3rd Qu.:  174   3rd Qu.:1999  
##  Max.   :10.00   Max.   :433014.0   Max.   :44515   Max.   :2018  
##  NA's   :62                         NA's   :6450    NA's   :5375  
##  CouncilArea          Lattitude        Longtitude     Regionname       
##  Length:13580       Min.   :-38.18   Min.   :144.4   Length:13580      
##  Class :character   1st Qu.:-37.86   1st Qu.:144.9   Class :character  
##  Mode  :character   Median :-37.80   Median :145.0   Mode  :character  
##                     Mean   :-37.81   Mean   :145.0                     
##                     3rd Qu.:-37.76   3rd Qu.:145.1                     
##                     Max.   :-37.41   Max.   :145.5                     
##                                                                        
##  Propertycount  
##  Min.   :  249  
##  1st Qu.: 4380  
##  Median : 6555  
##  Mean   : 7454  
##  3rd Qu.:10331  
##  Max.   :21650  
## 

Precio vs Cuartos + Distancia

modelo <- lm(Price ~ Rooms + Distance, datos)
modelo
## 
## Call:
## lm(formula = Price ~ Rooms + Distance, data = datos)
## 
## Coefficients:
## (Intercept)        Rooms     Distance  
##      277453       398697       -36807
summary(modelo)
## 
## Call:
## lm(formula = Price ~ Rooms + Distance, data = datos)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2919059  -309722   -93892   198144  8218424 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 277453.1    14787.5   18.76   <2e-16 ***
## Rooms       398696.7     4839.2   82.39   <2e-16 ***
## Distance    -36806.8      788.1  -46.70   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 515100 on 13577 degrees of freedom
## Multiple R-squared:  0.3509, Adjusted R-squared:  0.3508 
## F-statistic:  3670 on 2 and 13577 DF,  p-value: < 2.2e-16
pairs(datos[,c('Price','Rooms')])

Interpretación:

Un conjunto de datos únicamente con las variables numéricas del conjunto de datos original

datos.Num <- select(datos, Price, Rooms, Distance, Bedroom2, Bathroom, Car, Landsize, BuildingArea, YearBuilt, Propertycount)
head(datos.Num)
## # A tibble: 6 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.48e6     2      2.5        2        1     1      202           NA        NA
## 2 1.03e6     2      2.5        2        1     0      156           79      1900
## 3 1.46e6     3      2.5        3        2     0      134          150      1900
## 4 8.50e5     3      2.5        3        2     1       94           NA        NA
## 5 1.60e6     4      2.5        3        1     2      120          142      2014
## 6 9.41e5     2      2.5        2        1     0      181           NA        NA
## # ... with 1 more variable: Propertycount <dbl>
str(datos.Num)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 13580 obs. of  10 variables:
##  $ Price        : num  1480000 1035000 1465000 850000 1600000 ...
##  $ Rooms        : num  2 2 3 3 4 2 3 2 1 2 ...
##  $ Distance     : num  2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ Bedroom2     : num  2 2 3 3 3 2 4 2 1 3 ...
##  $ Bathroom     : num  1 1 2 2 1 1 2 1 1 1 ...
##  $ Car          : num  1 0 0 1 2 0 0 2 1 2 ...
##  $ Landsize     : num  202 156 134 94 120 181 245 256 0 220 ...
##  $ BuildingArea : num  NA 79 150 NA 142 NA 210 107 NA 75 ...
##  $ YearBuilt    : num  NA 1900 1900 NA 2014 ...
##  $ Propertycount: num  4019 4019 4019 4019 4019 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Suburb = col_character(),
##   ..   Address = col_character(),
##   ..   Rooms = col_double(),
##   ..   Type = col_character(),
##   ..   Price = col_double(),
##   ..   Method = col_character(),
##   ..   SellerG = col_character(),
##   ..   Date = col_character(),
##   ..   Distance = col_double(),
##   ..   Postcode = col_double(),
##   ..   Bedroom2 = col_double(),
##   ..   Bathroom = col_double(),
##   ..   Car = col_double(),
##   ..   Landsize = col_double(),
##   ..   BuildingArea = col_double(),
##   ..   YearBuilt = col_double(),
##   ..   CouncilArea = col_character(),
##   ..   Lattitude = col_double(),
##   ..   Longtitude = col_double(),
##   ..   Regionname = col_character(),
##   ..   Propertycount = col_double()
##   .. )

Depurar, limpiar los datos

mediana.BA <- median(datos.Num$BuildingArea, na.rm = TRUE)
mediana.YB <- median(datos.Num$YearBuilt, na.rm = TRUE)
mediana.C <- median(datos.Num$Car, na.rm = TRUE)

Actualizar mutate() los NA por las medianas

head(datos.Num, 10)
## # A tibble: 10 x 10
##     Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##     <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
##  1 1.48e6     2      2.5        2        1     1      202           NA        NA
##  2 1.03e6     2      2.5        2        1     0      156           79      1900
##  3 1.46e6     3      2.5        3        2     0      134          150      1900
##  4 8.50e5     3      2.5        3        2     1       94           NA        NA
##  5 1.60e6     4      2.5        3        1     2      120          142      2014
##  6 9.41e5     2      2.5        2        1     0      181           NA        NA
##  7 1.88e6     3      2.5        4        2     0      245          210      1910
##  8 1.64e6     2      2.5        2        1     2      256          107      1890
##  9 3.00e5     1      2.5        1        1     1        0           NA        NA
## 10 1.10e6     2      2.5        3        1     2      220           75      1900
## # ... with 1 more variable: Propertycount <dbl>
ddatos.Num <- datos.Num %>% mutate (BuildingArea == ifelse(is.na(BuildingArea), mediana.BA, BuildingArea))

datos.Num <- datos.Num %>%
  mutate (YearBuilt = ifelse(is.na(YearBuilt), mediana.YB, YearBuilt)) 

datos.Num <- datos.Num %>%
  mutate (Car = ifelse(is.na(Car), mediana.C, Car)) 


head(datos.Num, 10) # # Los primeros 10, YA NO se observan NAs
## # A tibble: 10 x 10
##     Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##     <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
##  1 1.48e6     2      2.5        2        1     1      202           NA      1970
##  2 1.03e6     2      2.5        2        1     0      156           79      1900
##  3 1.46e6     3      2.5        3        2     0      134          150      1900
##  4 8.50e5     3      2.5        3        2     1       94           NA      1970
##  5 1.60e6     4      2.5        3        1     2      120          142      2014
##  6 9.41e5     2      2.5        2        1     0      181           NA      1970
##  7 1.88e6     3      2.5        4        2     0      245          210      1910
##  8 1.64e6     2      2.5        2        1     2      256          107      1890
##  9 3.00e5     1      2.5        1        1     1        0           NA      1970
## 10 1.10e6     2      2.5        3        1     2      220           75      1900
## # ... with 1 more variable: Propertycount <dbl>

Correlaciones

correlaciones <- cor(datos.Num)
correlaciones
##                     Price       Rooms    Distance    Bedroom2    Bathroom
## Price          1.00000000  0.49663368 -0.16252184  0.47595103  0.46703818
## Rooms          0.49663368  1.00000000  0.29420252  0.94419027  0.59293408
## Distance      -0.16252184  0.29420252  1.00000000  0.29592676  0.12715513
## Bedroom2       0.47595103  0.94419027  0.29592676  1.00000000  0.58468549
## Bathroom       0.46703818  0.59293408  0.12715513  0.58468549  1.00000000
## Car            0.23910905  0.40693502  0.26059567  0.40386694  0.32101386
## Landsize       0.03750745  0.02567835  0.02500376  0.02564625  0.03713036
## BuildingArea           NA          NA          NA          NA          NA
## YearBuilt     -0.25938724 -0.05156167  0.19481506 -0.04133120  0.11395652
## Propertycount -0.04215261 -0.08153007 -0.05491034 -0.08135034 -0.05220075
##                       Car     Landsize BuildingArea    YearBuilt Propertycount
## Price          0.23910905  0.037507450           NA -0.259387242  -0.042152615
## Rooms          0.40693502  0.025678350           NA -0.051561667  -0.081530072
## Distance       0.26059567  0.025003758           NA  0.194815064  -0.054910338
## Bedroom2       0.40386694  0.025646248           NA -0.041331197  -0.081350337
## Bathroom       0.32101386  0.037130357           NA  0.113956517  -0.052200750
## Car            1.00000000  0.026779687           NA  0.078695800  -0.024344443
## Landsize       0.02677969  1.000000000           NA  0.008805811  -0.006853942
## BuildingArea           NA           NA            1           NA            NA
## YearBuilt      0.07869580  0.008805811           NA  1.000000000   0.004420750
## Propertycount -0.02434444 -0.006853942           NA  0.004420750   1.000000000
corrplot(correlaciones, method = "number")

Crear conjuntos de entrenamiento y conjuntos de validación

set.seed(2020) #Semilla
entrenamiento <- createDataPartition(datos.Num$Price, p=0.7, list = FALSE)
head(entrenamiento)
##      Resample1
## [1,]         1
## [2,]         3
## [3,]         4
## [4,]         5
## [5,]         7
## [6,]         9
nrow(entrenamiento)
## [1] 9508
# Los registros que no estén en entrenamiento
head(datos.Num[-entrenamiento,])
## # A tibble: 6 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.03e6     2      2.5        2        1     0      156           79      1900
## 2 9.41e5     2      2.5        2        1     0      181           NA      1970
## 3 1.64e6     2      2.5        2        1     2      256          107      1890
## 4 1.35e6     3      2.5        3        2     2      214          190      2005
## 5 7.50e5     2      2.5        2        2     1        0           94      2009
## 6 8.90e5     2      2.5        2        1     1      150           73      1985
## # ... with 1 more variable: Propertycount <dbl>
nrow(datos.Num[-entrenamiento,])
## [1] 4072
# Ver los primeros seis datos con sólo variables numéricas
head(datos.Num)
## # A tibble: 6 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.48e6     2      2.5        2        1     1      202           NA      1970
## 2 1.03e6     2      2.5        2        1     0      156           79      1900
## 3 1.46e6     3      2.5        3        2     0      134          150      1900
## 4 8.50e5     3      2.5        3        2     1       94           NA      1970
## 5 1.60e6     4      2.5        3        1     2      120          142      2014
## 6 9.41e5     2      2.5        2        1     0      181           NA      1970
## # ... with 1 more variable: Propertycount <dbl>
# Ahora determinar conjuntos de datos de entrenamiento y luego head()
datos.Entrenamiento <- datos.Num[entrenamiento,]
head(datos.Entrenamiento)
## # A tibble: 6 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.48e6     2      2.5        2        1     1      202           NA      1970
## 2 1.46e6     3      2.5        3        2     0      134          150      1900
## 3 8.50e5     3      2.5        3        2     1       94           NA      1970
## 4 1.60e6     4      2.5        3        1     2      120          142      2014
## 5 1.88e6     3      2.5        4        2     0      245          210      1910
## 6 3.00e5     1      2.5        1        1     1        0           NA      1970
## # ... with 1 more variable: Propertycount <dbl>
summary(datos.Entrenamiento)
##      Price             Rooms           Distance        Bedroom2     
##  Min.   :  85000   Min.   : 1.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 650000   1st Qu.: 2.000   1st Qu.: 6.10   1st Qu.: 2.000  
##  Median : 903000   Median : 3.000   Median : 9.20   Median : 3.000  
##  Mean   :1078063   Mean   : 2.937   Mean   :10.13   Mean   : 2.911  
##  3rd Qu.:1330000   3rd Qu.: 3.000   3rd Qu.:13.00   3rd Qu.: 3.000  
##  Max.   :9000000   Max.   :10.000   Max.   :47.40   Max.   :10.000  
##                                                                     
##     Bathroom          Car            Landsize         BuildingArea   
##  Min.   :0.000   Min.   : 0.000   Min.   :     0.0   Min.   :   0.0  
##  1st Qu.:1.000   1st Qu.: 1.000   1st Qu.:   178.0   1st Qu.:  93.0  
##  Median :1.000   Median : 2.000   Median :   443.5   Median : 126.0  
##  Mean   :1.529   Mean   : 1.613   Mean   :   579.6   Mean   : 146.8  
##  3rd Qu.:2.000   3rd Qu.: 2.000   3rd Qu.:   650.0   3rd Qu.: 173.0  
##  Max.   :8.000   Max.   :10.000   Max.   :433014.0   Max.   :6791.0  
##                                                      NA's   :4555    
##    YearBuilt    Propertycount  
##  Min.   :1830   Min.   :  389  
##  1st Qu.:1960   1st Qu.: 4386  
##  Median :1970   Median : 6567  
##  Mean   :1967   Mean   : 7453  
##  3rd Qu.:1975   3rd Qu.:10331  
##  Max.   :2018   Max.   :21650  
## 
# y conjunto de datos de validación y luego head()
datos.Validacion <- datos.Num[-entrenamiento,]
head(datos.Validacion)
## # A tibble: 6 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.03e6     2      2.5        2        1     0      156           79      1900
## 2 9.41e5     2      2.5        2        1     0      181           NA      1970
## 3 1.64e6     2      2.5        2        1     2      256          107      1890
## 4 1.35e6     3      2.5        3        2     2      214          190      2005
## 5 7.50e5     2      2.5        2        2     1        0           94      2009
## 6 8.90e5     2      2.5        2        1     1      150           73      1985
## # ... with 1 more variable: Propertycount <dbl>
summary(datos.Validacion)
##      Price             Rooms          Distance        Bedroom2     
##  Min.   : 170000   Min.   :1.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 649500   1st Qu.:2.000   1st Qu.: 6.20   1st Qu.: 2.000  
##  Median : 902500   Median :3.000   Median : 9.20   Median : 3.000  
##  Mean   :1070130   Mean   :2.941   Mean   :10.15   Mean   : 2.923  
##  3rd Qu.:1330000   3rd Qu.:4.000   3rd Qu.:13.00   3rd Qu.: 3.000  
##  Max.   :8000000   Max.   :8.000   Max.   :48.10   Max.   :20.000  
##                                                                    
##     Bathroom          Car           Landsize        BuildingArea    
##  Min.   :0.000   Min.   : 0.00   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:1.000   1st Qu.: 1.00   1st Qu.:  173.0   1st Qu.:   93.0  
##  Median :1.000   Median : 2.00   Median :  435.0   Median :  126.0  
##  Mean   :1.547   Mean   : 1.61   Mean   :  508.9   Mean   :  163.8  
##  3rd Qu.:2.000   3rd Qu.: 2.00   3rd Qu.:  653.0   3rd Qu.:  175.0  
##  Max.   :8.000   Max.   :10.00   Max.   :44500.0   Max.   :44515.0  
##                                                    NA's   :1895     
##    YearBuilt    Propertycount  
##  Min.   :1196   Min.   :  249  
##  1st Qu.:1960   1st Qu.: 4217  
##  Median :1970   Median : 6543  
##  Mean   :1967   Mean   : 7457  
##  3rd Qu.:1972   3rd Qu.:10331  
##  Max.   :2017   Max.   :21650  
## 

Modelo de regresión lineal múltiple

modelo <- lm(Price ~ ., datos.Entrenamiento)
modelo
## 
## Call:
## lm(formula = Price ~ ., data = datos.Entrenamiento)
## 
## Coefficients:
##   (Intercept)          Rooms       Distance       Bedroom2       Bathroom  
##     1.084e+07      1.477e+05     -2.915e+04      4.809e+04      3.099e+05  
##           Car       Landsize   BuildingArea      YearBuilt  Propertycount  
##     7.598e+04      1.540e+01      5.679e+02     -5.470e+03     -8.577e-01
summary(modelo)
## 
## Call:
## lm(formula = Price ~ ., data = datos.Entrenamiento)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3547662  -253049   -67234   172910  8385632 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.084e+07  4.108e+05  26.383  < 2e-16 ***
## Rooms          1.477e+05  2.671e+04   5.530 3.37e-08 ***
## Distance      -2.915e+04  1.288e+03 -22.628  < 2e-16 ***
## Bedroom2       4.809e+04  2.652e+04   1.813   0.0698 .  
## Bathroom       3.099e+05  1.292e+04  23.979  < 2e-16 ***
## Car            7.598e+04  8.236e+03   9.226  < 2e-16 ***
## Landsize       1.540e+01  7.142e+00   2.157   0.0311 *  
## BuildingArea   5.679e+02  5.228e+01  10.862  < 2e-16 ***
## YearBuilt     -5.470e+03  2.101e+02 -26.034  < 2e-16 ***
## Propertycount -8.577e-01  1.587e+00  -0.541   0.5888    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 481400 on 4943 degrees of freedom
##   (4555 observations deleted due to missingness)
## Multiple R-squared:  0.4952, Adjusted R-squared:  0.4942 
## F-statistic: 538.7 on 9 and 4943 DF,  p-value: < 2.2e-16

Interpretación

Probar con los datos de validación

modelo <- lm(Price ~ ., datos.Validacion)
modelo
## 
## Call:
## lm(formula = Price ~ ., data = datos.Validacion)
## 
## Coefficients:
##   (Intercept)          Rooms       Distance       Bedroom2       Bathroom  
##     9.516e+06      2.931e+05     -2.855e+04     -4.886e+04      2.878e+05  
##           Car       Landsize   BuildingArea      YearBuilt  Propertycount  
##     5.334e+04      4.081e+01     -2.439e+01     -4.793e+03     -3.723e+00
summary(modelo)
## 
## Call:
## lm(formula = Price ~ ., data = datos.Validacion)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3392886  -258841   -58561   179680  5428216 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.516e+06  5.323e+05  17.879  < 2e-16 ***
## Rooms          2.931e+05  3.055e+04   9.595  < 2e-16 ***
## Distance      -2.855e+04  1.830e+03 -15.598  < 2e-16 ***
## Bedroom2      -4.886e+04  3.115e+04  -1.569  0.11685    
## Bathroom       2.878e+05  1.838e+04  15.654  < 2e-16 ***
## Car            5.334e+04  1.206e+04   4.421 1.03e-05 ***
## Landsize       4.081e+01  1.389e+01   2.937  0.00335 ** 
## BuildingArea  -2.439e+01  1.732e+01  -1.408  0.15939    
## YearBuilt     -4.793e+03  2.703e+02 -17.730  < 2e-16 ***
## Propertycount -3.723e+00  2.328e+00  -1.599  0.10989    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 470100 on 2167 degrees of freedom
##   (1895 observations deleted due to missingness)
## Multiple R-squared:  0.5097, Adjusted R-squared:  0.5077 
## F-statistic: 250.3 on 9 and 2167 DF,  p-value: < 2.2e-16

¿Qué hay que hacer como científico de datos?

Vamos a eliminar las observaciones que tienen NA

datos.Num <- select(datos, Price, Rooms, Distance, Bedroom2, Bathroom, Car, Landsize, BuildingArea, YearBuilt, Propertycount)
head(datos.Num)
## # A tibble: 6 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.48e6     2      2.5        2        1     1      202           NA        NA
## 2 1.03e6     2      2.5        2        1     0      156           79      1900
## 3 1.46e6     3      2.5        3        2     0      134          150      1900
## 4 8.50e5     3      2.5        3        2     1       94           NA        NA
## 5 1.60e6     4      2.5        3        1     2      120          142      2014
## 6 9.41e5     2      2.5        2        1     0      181           NA        NA
## # ... with 1 more variable: Propertycount <dbl>
str(datos.Num)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 13580 obs. of  10 variables:
##  $ Price        : num  1480000 1035000 1465000 850000 1600000 ...
##  $ Rooms        : num  2 2 3 3 4 2 3 2 1 2 ...
##  $ Distance     : num  2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ Bedroom2     : num  2 2 3 3 3 2 4 2 1 3 ...
##  $ Bathroom     : num  1 1 2 2 1 1 2 1 1 1 ...
##  $ Car          : num  1 0 0 1 2 0 0 2 1 2 ...
##  $ Landsize     : num  202 156 134 94 120 181 245 256 0 220 ...
##  $ BuildingArea : num  NA 79 150 NA 142 NA 210 107 NA 75 ...
##  $ YearBuilt    : num  NA 1900 1900 NA 2014 ...
##  $ Propertycount: num  4019 4019 4019 4019 4019 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Suburb = col_character(),
##   ..   Address = col_character(),
##   ..   Rooms = col_double(),
##   ..   Type = col_character(),
##   ..   Price = col_double(),
##   ..   Method = col_character(),
##   ..   SellerG = col_character(),
##   ..   Date = col_character(),
##   ..   Distance = col_double(),
##   ..   Postcode = col_double(),
##   ..   Bedroom2 = col_double(),
##   ..   Bathroom = col_double(),
##   ..   Car = col_double(),
##   ..   Landsize = col_double(),
##   ..   BuildingArea = col_double(),
##   ..   YearBuilt = col_double(),
##   ..   CouncilArea = col_character(),
##   ..   Lattitude = col_double(),
##   ..   Longtitude = col_double(),
##   ..   Regionname = col_character(),
##   ..   Propertycount = col_double()
##   .. )
summary(datos.Num)
##      Price             Rooms           Distance        Bedroom2     
##  Min.   :  85000   Min.   : 1.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 650000   1st Qu.: 2.000   1st Qu.: 6.10   1st Qu.: 2.000  
##  Median : 903000   Median : 3.000   Median : 9.20   Median : 3.000  
##  Mean   :1075684   Mean   : 2.938   Mean   :10.14   Mean   : 2.915  
##  3rd Qu.:1330000   3rd Qu.: 3.000   3rd Qu.:13.00   3rd Qu.: 3.000  
##  Max.   :9000000   Max.   :10.000   Max.   :48.10   Max.   :20.000  
##                                                                     
##     Bathroom          Car           Landsize         BuildingArea  
##  Min.   :0.000   Min.   : 0.00   Min.   :     0.0   Min.   :    0  
##  1st Qu.:1.000   1st Qu.: 1.00   1st Qu.:   177.0   1st Qu.:   93  
##  Median :1.000   Median : 2.00   Median :   440.0   Median :  126  
##  Mean   :1.534   Mean   : 1.61   Mean   :   558.4   Mean   :  152  
##  3rd Qu.:2.000   3rd Qu.: 2.00   3rd Qu.:   651.0   3rd Qu.:  174  
##  Max.   :8.000   Max.   :10.00   Max.   :433014.0   Max.   :44515  
##                  NA's   :62                         NA's   :6450   
##    YearBuilt    Propertycount  
##  Min.   :1196   Min.   :  249  
##  1st Qu.:1940   1st Qu.: 4380  
##  Median :1970   Median : 6555  
##  Mean   :1965   Mean   : 7454  
##  3rd Qu.:1999   3rd Qu.:10331  
##  Max.   :2018   Max.   :21650  
##  NA's   :5375

Nuevamente datos de entrenamiento y datos de validación

set.seed(2020) #Semilla
entrenamiento <- createDataPartition(datos.Num$Price, p=0.7, list = FALSE)
head(entrenamiento)
##      Resample1
## [1,]         1
## [2,]         3
## [3,]         4
## [4,]         5
## [5,]         7
## [6,]         9
nrow(entrenamiento)
## [1] 9508
# Los registros que no están en entrenamiento
head(datos.Num[-entrenamiento,])
## # A tibble: 6 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.03e6     2      2.5        2        1     0      156           79      1900
## 2 9.41e5     2      2.5        2        1     0      181           NA        NA
## 3 1.64e6     2      2.5        2        1     2      256          107      1890
## 4 1.35e6     3      2.5        3        2     2      214          190      2005
## 5 7.50e5     2      2.5        2        2     1        0           94      2009
## 6 8.90e5     2      2.5        2        1     1      150           73      1985
## # ... with 1 more variable: Propertycount <dbl>
nrow(datos.Num[-entrenamiento,])
## [1] 4072
# Ver los primeros seis datos con sólo variables numéricas
head(datos.Num)
## # A tibble: 6 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.48e6     2      2.5        2        1     1      202           NA        NA
## 2 1.03e6     2      2.5        2        1     0      156           79      1900
## 3 1.46e6     3      2.5        3        2     0      134          150      1900
## 4 8.50e5     3      2.5        3        2     1       94           NA        NA
## 5 1.60e6     4      2.5        3        1     2      120          142      2014
## 6 9.41e5     2      2.5        2        1     0      181           NA        NA
## # ... with 1 more variable: Propertycount <dbl>
# Ahora a determinar conuntos de datos de entrenamiento y luego head()
datos.Entrenamiento <- datos.Num[entrenamiento,]
head(datos.Entrenamiento)
## # A tibble: 6 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.48e6     2      2.5        2        1     1      202           NA        NA
## 2 1.46e6     3      2.5        3        2     0      134          150      1900
## 3 8.50e5     3      2.5        3        2     1       94           NA        NA
## 4 1.60e6     4      2.5        3        1     2      120          142      2014
## 5 1.88e6     3      2.5        4        2     0      245          210      1910
## 6 3.00e5     1      2.5        1        1     1        0           NA        NA
## # ... with 1 more variable: Propertycount <dbl>
summary(datos.Entrenamiento)
##      Price             Rooms           Distance        Bedroom2     
##  Min.   :  85000   Min.   : 1.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 650000   1st Qu.: 2.000   1st Qu.: 6.10   1st Qu.: 2.000  
##  Median : 903000   Median : 3.000   Median : 9.20   Median : 3.000  
##  Mean   :1078063   Mean   : 2.937   Mean   :10.13   Mean   : 2.911  
##  3rd Qu.:1330000   3rd Qu.: 3.000   3rd Qu.:13.00   3rd Qu.: 3.000  
##  Max.   :9000000   Max.   :10.000   Max.   :47.40   Max.   :10.000  
##                                                                     
##     Bathroom          Car            Landsize         BuildingArea   
##  Min.   :0.000   Min.   : 0.000   Min.   :     0.0   Min.   :   0.0  
##  1st Qu.:1.000   1st Qu.: 1.000   1st Qu.:   178.0   1st Qu.:  93.0  
##  Median :1.000   Median : 2.000   Median :   443.5   Median : 126.0  
##  Mean   :1.529   Mean   : 1.611   Mean   :   579.6   Mean   : 146.8  
##  3rd Qu.:2.000   3rd Qu.: 2.000   3rd Qu.:   650.0   3rd Qu.: 173.0  
##  Max.   :8.000   Max.   :10.000   Max.   :433014.0   Max.   :6791.0  
##                  NA's   :41                          NA's   :4555    
##    YearBuilt    Propertycount  
##  Min.   :1830   Min.   :  389  
##  1st Qu.:1940   1st Qu.: 4386  
##  Median :1970   Median : 6567  
##  Mean   :1965   Mean   : 7453  
##  3rd Qu.:1999   3rd Qu.:10331  
##  Max.   :2018   Max.   :21650  
##  NA's   :3766
# y conjunto de datos de validación y luego head()
datos.Validacion <- datos.Num[-entrenamiento,]
head(datos.Validacion)
## # A tibble: 6 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.03e6     2      2.5        2        1     0      156           79      1900
## 2 9.41e5     2      2.5        2        1     0      181           NA        NA
## 3 1.64e6     2      2.5        2        1     2      256          107      1890
## 4 1.35e6     3      2.5        3        2     2      214          190      2005
## 5 7.50e5     2      2.5        2        2     1        0           94      2009
## 6 8.90e5     2      2.5        2        1     1      150           73      1985
## # ... with 1 more variable: Propertycount <dbl>
summary(datos.Validacion)
##      Price             Rooms          Distance        Bedroom2     
##  Min.   : 170000   Min.   :1.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 649500   1st Qu.:2.000   1st Qu.: 6.20   1st Qu.: 2.000  
##  Median : 902500   Median :3.000   Median : 9.20   Median : 3.000  
##  Mean   :1070130   Mean   :2.941   Mean   :10.15   Mean   : 2.923  
##  3rd Qu.:1330000   3rd Qu.:4.000   3rd Qu.:13.00   3rd Qu.: 3.000  
##  Max.   :8000000   Max.   :8.000   Max.   :48.10   Max.   :20.000  
##                                                                    
##     Bathroom          Car            Landsize        BuildingArea    
##  Min.   :0.000   Min.   : 0.000   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:1.000   1st Qu.: 1.000   1st Qu.:  173.0   1st Qu.:   93.0  
##  Median :1.000   Median : 2.000   Median :  435.0   Median :  126.0  
##  Mean   :1.547   Mean   : 1.608   Mean   :  508.9   Mean   :  163.8  
##  3rd Qu.:2.000   3rd Qu.: 2.000   3rd Qu.:  653.0   3rd Qu.:  175.0  
##  Max.   :8.000   Max.   :10.000   Max.   :44500.0   Max.   :44515.0  
##                  NA's   :21                         NA's   :1895     
##    YearBuilt    Propertycount  
##  Min.   :1196   Min.   :  249  
##  1st Qu.:1940   1st Qu.: 4217  
##  Median :1970   Median : 6543  
##  Mean   :1964   Mean   : 7457  
##  3rd Qu.:2000   3rd Qu.:10331  
##  Max.   :2017   Max.   :21650  
##  NA's   :1609

Nuevamente modelo de regresión lineal

modelo <- lm(Price ~ ., datos.Entrenamiento)
modelo
## 
## Call:
## lm(formula = Price ~ ., data = datos.Entrenamiento)
## 
## Coefficients:
##   (Intercept)          Rooms       Distance       Bedroom2       Bathroom  
##     1.086e+07      9.491e+04     -2.819e+04      5.064e+04      2.682e+05  
##           Car       Landsize   BuildingArea      YearBuilt  Propertycount  
##     6.637e+04      1.454e+01      1.819e+03     -5.459e+03     -4.434e-01
summary(modelo)
## 
## Call:
## lm(formula = Price ~ ., data = datos.Entrenamiento)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -4743841  -240296   -63013   167845  8381097 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.086e+07  4.016e+05  27.037  < 2e-16 ***
## Rooms          9.491e+04  2.678e+04   3.544 0.000399 ***
## Distance      -2.819e+04  1.268e+03 -22.227  < 2e-16 ***
## Bedroom2       5.064e+04  2.638e+04   1.919 0.055000 .  
## Bathroom       2.682e+05  1.327e+04  20.211  < 2e-16 ***
## Car            6.637e+04  8.157e+03   8.136 5.17e-16 ***
## Landsize       1.454e+01  6.995e+00   2.079 0.037691 *  
## BuildingArea   1.819e+03  9.344e+01  19.465  < 2e-16 ***
## YearBuilt     -5.459e+03  2.054e+02 -26.580  < 2e-16 ***
## Propertycount -4.434e-01  1.563e+00  -0.284 0.776609    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 465000 on 4732 degrees of freedom
##   (4766 observations deleted due to missingness)
## Multiple R-squared:  0.5248, Adjusted R-squared:  0.5239 
## F-statistic: 580.6 on 9 and 4732 DF,  p-value: < 2.2e-16