library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(reshape) # Para renombrar columnas
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
library(caret) # Para particiones
## Loading required package: lattice
library(corrplot) # Para correlaciones visuales
## corrplot 0.84 loaded
En RUTA, reemplazar la ruta con su directorio de trabajo en donde se encuentre melb_datacomplete.csv
#En RUTA, reemplazar la ruta con su directorio de trabajo en donde se encuentre Sales.csv
ruta <- "D:/inspiron 1545/ESCUELA/ITD/OCTAVO/Analisis Inteligente de Datos/R/datos"
setwd(ruta)
datos <- read_csv("melb_datacomplete.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## Suburb = col_character(),
## Address = col_character(),
## Type = col_character(),
## Method = col_character(),
## SellerG = col_character(),
## Date = col_character(),
## CouncilArea = col_character(),
## Regionname = col_character()
## )
## See spec(...) for full column specifications.
head(datos)
## # A tibble: 6 x 21
## Suburb Address Rooms Type Price Method SellerG Date Distance Postcode
## <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 Abbot~ 85 Tur~ 2 h 1.48e6 S Biggin 3/12~ 2.5 3067
## 2 Abbot~ 25 Blo~ 2 h 1.03e6 S Biggin 4/02~ 2.5 3067
## 3 Abbot~ 5 Char~ 3 h 1.46e6 SP Biggin 4/03~ 2.5 3067
## 4 Abbot~ 40 Fed~ 3 h 8.50e5 PI Biggin 4/03~ 2.5 3067
## 5 Abbot~ 55a Pa~ 4 h 1.60e6 VB Nelson 4/06~ 2.5 3067
## 6 Abbot~ 129 Ch~ 2 h 9.41e5 S Jellis 7/05~ 2.5 3067
## # ... with 11 more variables: Bedroom2 <dbl>, Bathroom <dbl>, Car <dbl>,
## # Landsize <dbl>, BuildingArea <dbl>, YearBuilt <dbl>, CouncilArea <chr>,
## # Lattitude <dbl>, Longtitude <dbl>, Regionname <chr>, Propertycount <dbl>
tail(datos)
## # A tibble: 6 x 21
## Suburb Address Rooms Type Price Method SellerG Date Distance Postcode
## <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 Westm~ 9 Blac~ 3 h 5.82e5 S Red 26/0~ 16.5 3049
## 2 Wheel~ 12 Str~ 4 h 1.25e6 S Barry 26/0~ 16.7 3150
## 3 Willi~ 77 Mer~ 3 h 1.03e6 SP Willia~ 26/0~ 6.8 3016
## 4 Willi~ 83 Pow~ 3 h 1.17e6 S Raine 26/0~ 6.8 3016
## 5 Willi~ 96 Ver~ 4 h 2.50e6 PI Sweeney 26/0~ 6.8 3016
## 6 Yarra~ 6 Agne~ 4 h 1.28e6 SP Village 26/0~ 6.3 3013
## # ... with 11 more variables: Bedroom2 <dbl>, Bathroom <dbl>, Car <dbl>,
## # Landsize <dbl>, BuildingArea <dbl>, YearBuilt <dbl>, CouncilArea <chr>,
## # Lattitude <dbl>, Longtitude <dbl>, Regionname <chr>, Propertycount <dbl>
str(datos)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 13580 obs. of 21 variables:
## $ Suburb : chr "Abbotsford" "Abbotsford" "Abbotsford" "Abbotsford" ...
## $ Address : chr "85 Turner St" "25 Bloomburg St" "5 Charles St" "40 Federation La" ...
## $ Rooms : num 2 2 3 3 4 2 3 2 1 2 ...
## $ Type : chr "h" "h" "h" "h" ...
## $ Price : num 1480000 1035000 1465000 850000 1600000 ...
## $ Method : chr "S" "S" "SP" "PI" ...
## $ SellerG : chr "Biggin" "Biggin" "Biggin" "Biggin" ...
## $ Date : chr "3/12/2016" "4/02/2016" "4/03/2017" "4/03/2017" ...
## $ Distance : num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ Postcode : num 3067 3067 3067 3067 3067 ...
## $ Bedroom2 : num 2 2 3 3 3 2 4 2 1 3 ...
## $ Bathroom : num 1 1 2 2 1 1 2 1 1 1 ...
## $ Car : num 1 0 0 1 2 0 0 2 1 2 ...
## $ Landsize : num 202 156 134 94 120 181 245 256 0 220 ...
## $ BuildingArea : num NA 79 150 NA 142 NA 210 107 NA 75 ...
## $ YearBuilt : num NA 1900 1900 NA 2014 ...
## $ CouncilArea : chr "Yarra" "Yarra" "Yarra" "Yarra" ...
## $ Lattitude : num -37.8 -37.8 -37.8 -37.8 -37.8 ...
## $ Longtitude : num 145 145 145 145 145 ...
## $ Regionname : chr "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" ...
## $ Propertycount: num 4019 4019 4019 4019 4019 ...
## - attr(*, "spec")=
## .. cols(
## .. Suburb = col_character(),
## .. Address = col_character(),
## .. Rooms = col_double(),
## .. Type = col_character(),
## .. Price = col_double(),
## .. Method = col_character(),
## .. SellerG = col_character(),
## .. Date = col_character(),
## .. Distance = col_double(),
## .. Postcode = col_double(),
## .. Bedroom2 = col_double(),
## .. Bathroom = col_double(),
## .. Car = col_double(),
## .. Landsize = col_double(),
## .. BuildingArea = col_double(),
## .. YearBuilt = col_double(),
## .. CouncilArea = col_character(),
## .. Lattitude = col_double(),
## .. Longtitude = col_double(),
## .. Regionname = col_character(),
## .. Propertycount = col_double()
## .. )
summary(datos)
## Suburb Address Rooms Type
## Length:13580 Length:13580 Min. : 1.000 Length:13580
## Class :character Class :character 1st Qu.: 2.000 Class :character
## Mode :character Mode :character Median : 3.000 Mode :character
## Mean : 2.938
## 3rd Qu.: 3.000
## Max. :10.000
##
## Price Method SellerG Date
## Min. : 85000 Length:13580 Length:13580 Length:13580
## 1st Qu.: 650000 Class :character Class :character Class :character
## Median : 903000 Mode :character Mode :character Mode :character
## Mean :1075684
## 3rd Qu.:1330000
## Max. :9000000
##
## Distance Postcode Bedroom2 Bathroom
## Min. : 0.00 Min. :3000 Min. : 0.000 Min. :0.000
## 1st Qu.: 6.10 1st Qu.:3044 1st Qu.: 2.000 1st Qu.:1.000
## Median : 9.20 Median :3084 Median : 3.000 Median :1.000
## Mean :10.14 Mean :3105 Mean : 2.915 Mean :1.534
## 3rd Qu.:13.00 3rd Qu.:3148 3rd Qu.: 3.000 3rd Qu.:2.000
## Max. :48.10 Max. :3977 Max. :20.000 Max. :8.000
##
## Car Landsize BuildingArea YearBuilt
## Min. : 0.00 Min. : 0.0 Min. : 0 Min. :1196
## 1st Qu.: 1.00 1st Qu.: 177.0 1st Qu.: 93 1st Qu.:1940
## Median : 2.00 Median : 440.0 Median : 126 Median :1970
## Mean : 1.61 Mean : 558.4 Mean : 152 Mean :1965
## 3rd Qu.: 2.00 3rd Qu.: 651.0 3rd Qu.: 174 3rd Qu.:1999
## Max. :10.00 Max. :433014.0 Max. :44515 Max. :2018
## NA's :62 NA's :6450 NA's :5375
## CouncilArea Lattitude Longtitude Regionname
## Length:13580 Min. :-38.18 Min. :144.4 Length:13580
## Class :character 1st Qu.:-37.86 1st Qu.:144.9 Class :character
## Mode :character Median :-37.80 Median :145.0 Mode :character
## Mean :-37.81 Mean :145.0
## 3rd Qu.:-37.76 3rd Qu.:145.1
## Max. :-37.41 Max. :145.5
##
## Propertycount
## Min. : 249
## 1st Qu.: 4380
## Median : 6555
## Mean : 7454
## 3rd Qu.:10331
## Max. :21650
##
modelo <- lm(Price ~ Rooms + Distance, datos)
modelo
##
## Call:
## lm(formula = Price ~ Rooms + Distance, data = datos)
##
## Coefficients:
## (Intercept) Rooms Distance
## 277453 398697 -36807
summary(modelo)
##
## Call:
## lm(formula = Price ~ Rooms + Distance, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2919059 -309722 -93892 198144 8218424
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 277453.1 14787.5 18.76 <2e-16 ***
## Rooms 398696.7 4839.2 82.39 <2e-16 ***
## Distance -36806.8 788.1 -46.70 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 515100 on 13577 degrees of freedom
## Multiple R-squared: 0.3509, Adjusted R-squared: 0.3508
## F-statistic: 3670 on 2 and 13577 DF, p-value: < 2.2e-16
pairs(datos[,c('Price','Rooms')])
datos.Num <- select(datos, Price, Rooms, Distance, Bedroom2, Bathroom, Car, Landsize, BuildingArea, YearBuilt, Propertycount)
head(datos.Num)
## # A tibble: 6 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.48e6 2 2.5 2 1 1 202 NA NA
## 2 1.03e6 2 2.5 2 1 0 156 79 1900
## 3 1.46e6 3 2.5 3 2 0 134 150 1900
## 4 8.50e5 3 2.5 3 2 1 94 NA NA
## 5 1.60e6 4 2.5 3 1 2 120 142 2014
## 6 9.41e5 2 2.5 2 1 0 181 NA NA
## # ... with 1 more variable: Propertycount <dbl>
str(datos.Num)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 13580 obs. of 10 variables:
## $ Price : num 1480000 1035000 1465000 850000 1600000 ...
## $ Rooms : num 2 2 3 3 4 2 3 2 1 2 ...
## $ Distance : num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ Bedroom2 : num 2 2 3 3 3 2 4 2 1 3 ...
## $ Bathroom : num 1 1 2 2 1 1 2 1 1 1 ...
## $ Car : num 1 0 0 1 2 0 0 2 1 2 ...
## $ Landsize : num 202 156 134 94 120 181 245 256 0 220 ...
## $ BuildingArea : num NA 79 150 NA 142 NA 210 107 NA 75 ...
## $ YearBuilt : num NA 1900 1900 NA 2014 ...
## $ Propertycount: num 4019 4019 4019 4019 4019 ...
## - attr(*, "spec")=
## .. cols(
## .. Suburb = col_character(),
## .. Address = col_character(),
## .. Rooms = col_double(),
## .. Type = col_character(),
## .. Price = col_double(),
## .. Method = col_character(),
## .. SellerG = col_character(),
## .. Date = col_character(),
## .. Distance = col_double(),
## .. Postcode = col_double(),
## .. Bedroom2 = col_double(),
## .. Bathroom = col_double(),
## .. Car = col_double(),
## .. Landsize = col_double(),
## .. BuildingArea = col_double(),
## .. YearBuilt = col_double(),
## .. CouncilArea = col_character(),
## .. Lattitude = col_double(),
## .. Longtitude = col_double(),
## .. Regionname = col_character(),
## .. Propertycount = col_double()
## .. )
mediana.BA <- median(datos.Num$BuildingArea, na.rm = TRUE)
mediana.YB <- median(datos.Num$YearBuilt, na.rm = TRUE)
mediana.C <- median(datos.Num$Car, na.rm = TRUE)
head(datos.Num, 10)
## # A tibble: 10 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.48e6 2 2.5 2 1 1 202 NA NA
## 2 1.03e6 2 2.5 2 1 0 156 79 1900
## 3 1.46e6 3 2.5 3 2 0 134 150 1900
## 4 8.50e5 3 2.5 3 2 1 94 NA NA
## 5 1.60e6 4 2.5 3 1 2 120 142 2014
## 6 9.41e5 2 2.5 2 1 0 181 NA NA
## 7 1.88e6 3 2.5 4 2 0 245 210 1910
## 8 1.64e6 2 2.5 2 1 2 256 107 1890
## 9 3.00e5 1 2.5 1 1 1 0 NA NA
## 10 1.10e6 2 2.5 3 1 2 220 75 1900
## # ... with 1 more variable: Propertycount <dbl>
ddatos.Num <- datos.Num %>% mutate (BuildingArea == ifelse(is.na(BuildingArea), mediana.BA, BuildingArea))
datos.Num <- datos.Num %>%
mutate (YearBuilt = ifelse(is.na(YearBuilt), mediana.YB, YearBuilt))
datos.Num <- datos.Num %>%
mutate (Car = ifelse(is.na(Car), mediana.C, Car))
head(datos.Num, 10) # # Los primeros 10, YA NO se observan NAs
## # A tibble: 10 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.48e6 2 2.5 2 1 1 202 NA 1970
## 2 1.03e6 2 2.5 2 1 0 156 79 1900
## 3 1.46e6 3 2.5 3 2 0 134 150 1900
## 4 8.50e5 3 2.5 3 2 1 94 NA 1970
## 5 1.60e6 4 2.5 3 1 2 120 142 2014
## 6 9.41e5 2 2.5 2 1 0 181 NA 1970
## 7 1.88e6 3 2.5 4 2 0 245 210 1910
## 8 1.64e6 2 2.5 2 1 2 256 107 1890
## 9 3.00e5 1 2.5 1 1 1 0 NA 1970
## 10 1.10e6 2 2.5 3 1 2 220 75 1900
## # ... with 1 more variable: Propertycount <dbl>
correlaciones <- cor(datos.Num)
correlaciones
## Price Rooms Distance Bedroom2 Bathroom
## Price 1.00000000 0.49663368 -0.16252184 0.47595103 0.46703818
## Rooms 0.49663368 1.00000000 0.29420252 0.94419027 0.59293408
## Distance -0.16252184 0.29420252 1.00000000 0.29592676 0.12715513
## Bedroom2 0.47595103 0.94419027 0.29592676 1.00000000 0.58468549
## Bathroom 0.46703818 0.59293408 0.12715513 0.58468549 1.00000000
## Car 0.23910905 0.40693502 0.26059567 0.40386694 0.32101386
## Landsize 0.03750745 0.02567835 0.02500376 0.02564625 0.03713036
## BuildingArea NA NA NA NA NA
## YearBuilt -0.25938724 -0.05156167 0.19481506 -0.04133120 0.11395652
## Propertycount -0.04215261 -0.08153007 -0.05491034 -0.08135034 -0.05220075
## Car Landsize BuildingArea YearBuilt Propertycount
## Price 0.23910905 0.037507450 NA -0.259387242 -0.042152615
## Rooms 0.40693502 0.025678350 NA -0.051561667 -0.081530072
## Distance 0.26059567 0.025003758 NA 0.194815064 -0.054910338
## Bedroom2 0.40386694 0.025646248 NA -0.041331197 -0.081350337
## Bathroom 0.32101386 0.037130357 NA 0.113956517 -0.052200750
## Car 1.00000000 0.026779687 NA 0.078695800 -0.024344443
## Landsize 0.02677969 1.000000000 NA 0.008805811 -0.006853942
## BuildingArea NA NA 1 NA NA
## YearBuilt 0.07869580 0.008805811 NA 1.000000000 0.004420750
## Propertycount -0.02434444 -0.006853942 NA 0.004420750 1.000000000
corrplot(correlaciones, method = "number")
set.seed(2020) #Semilla
entrenamiento <- createDataPartition(datos.Num$Price, p=0.7, list = FALSE)
head(entrenamiento)
## Resample1
## [1,] 1
## [2,] 3
## [3,] 4
## [4,] 5
## [5,] 7
## [6,] 9
nrow(entrenamiento)
## [1] 9508
# Los registros que no estén en entrenamiento
head(datos.Num[-entrenamiento,])
## # A tibble: 6 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.03e6 2 2.5 2 1 0 156 79 1900
## 2 9.41e5 2 2.5 2 1 0 181 NA 1970
## 3 1.64e6 2 2.5 2 1 2 256 107 1890
## 4 1.35e6 3 2.5 3 2 2 214 190 2005
## 5 7.50e5 2 2.5 2 2 1 0 94 2009
## 6 8.90e5 2 2.5 2 1 1 150 73 1985
## # ... with 1 more variable: Propertycount <dbl>
nrow(datos.Num[-entrenamiento,])
## [1] 4072
# Ver los primeros seis datos con sólo variables numéricas
head(datos.Num)
## # A tibble: 6 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.48e6 2 2.5 2 1 1 202 NA 1970
## 2 1.03e6 2 2.5 2 1 0 156 79 1900
## 3 1.46e6 3 2.5 3 2 0 134 150 1900
## 4 8.50e5 3 2.5 3 2 1 94 NA 1970
## 5 1.60e6 4 2.5 3 1 2 120 142 2014
## 6 9.41e5 2 2.5 2 1 0 181 NA 1970
## # ... with 1 more variable: Propertycount <dbl>
# Ahora determinar conjuntos de datos de entrenamiento y luego head()
datos.Entrenamiento <- datos.Num[entrenamiento,]
head(datos.Entrenamiento)
## # A tibble: 6 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.48e6 2 2.5 2 1 1 202 NA 1970
## 2 1.46e6 3 2.5 3 2 0 134 150 1900
## 3 8.50e5 3 2.5 3 2 1 94 NA 1970
## 4 1.60e6 4 2.5 3 1 2 120 142 2014
## 5 1.88e6 3 2.5 4 2 0 245 210 1910
## 6 3.00e5 1 2.5 1 1 1 0 NA 1970
## # ... with 1 more variable: Propertycount <dbl>
summary(datos.Entrenamiento)
## Price Rooms Distance Bedroom2
## Min. : 85000 Min. : 1.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 650000 1st Qu.: 2.000 1st Qu.: 6.10 1st Qu.: 2.000
## Median : 903000 Median : 3.000 Median : 9.20 Median : 3.000
## Mean :1078063 Mean : 2.937 Mean :10.13 Mean : 2.911
## 3rd Qu.:1330000 3rd Qu.: 3.000 3rd Qu.:13.00 3rd Qu.: 3.000
## Max. :9000000 Max. :10.000 Max. :47.40 Max. :10.000
##
## Bathroom Car Landsize BuildingArea
## Min. :0.000 Min. : 0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:1.000 1st Qu.: 1.000 1st Qu.: 178.0 1st Qu.: 93.0
## Median :1.000 Median : 2.000 Median : 443.5 Median : 126.0
## Mean :1.529 Mean : 1.613 Mean : 579.6 Mean : 146.8
## 3rd Qu.:2.000 3rd Qu.: 2.000 3rd Qu.: 650.0 3rd Qu.: 173.0
## Max. :8.000 Max. :10.000 Max. :433014.0 Max. :6791.0
## NA's :4555
## YearBuilt Propertycount
## Min. :1830 Min. : 389
## 1st Qu.:1960 1st Qu.: 4386
## Median :1970 Median : 6567
## Mean :1967 Mean : 7453
## 3rd Qu.:1975 3rd Qu.:10331
## Max. :2018 Max. :21650
##
# y conjunto de datos de validación y luego head()
datos.Validacion <- datos.Num[-entrenamiento,]
head(datos.Validacion)
## # A tibble: 6 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.03e6 2 2.5 2 1 0 156 79 1900
## 2 9.41e5 2 2.5 2 1 0 181 NA 1970
## 3 1.64e6 2 2.5 2 1 2 256 107 1890
## 4 1.35e6 3 2.5 3 2 2 214 190 2005
## 5 7.50e5 2 2.5 2 2 1 0 94 2009
## 6 8.90e5 2 2.5 2 1 1 150 73 1985
## # ... with 1 more variable: Propertycount <dbl>
summary(datos.Validacion)
## Price Rooms Distance Bedroom2
## Min. : 170000 Min. :1.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 649500 1st Qu.:2.000 1st Qu.: 6.20 1st Qu.: 2.000
## Median : 902500 Median :3.000 Median : 9.20 Median : 3.000
## Mean :1070130 Mean :2.941 Mean :10.15 Mean : 2.923
## 3rd Qu.:1330000 3rd Qu.:4.000 3rd Qu.:13.00 3rd Qu.: 3.000
## Max. :8000000 Max. :8.000 Max. :48.10 Max. :20.000
##
## Bathroom Car Landsize BuildingArea
## Min. :0.000 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.:1.000 1st Qu.: 1.00 1st Qu.: 173.0 1st Qu.: 93.0
## Median :1.000 Median : 2.00 Median : 435.0 Median : 126.0
## Mean :1.547 Mean : 1.61 Mean : 508.9 Mean : 163.8
## 3rd Qu.:2.000 3rd Qu.: 2.00 3rd Qu.: 653.0 3rd Qu.: 175.0
## Max. :8.000 Max. :10.00 Max. :44500.0 Max. :44515.0
## NA's :1895
## YearBuilt Propertycount
## Min. :1196 Min. : 249
## 1st Qu.:1960 1st Qu.: 4217
## Median :1970 Median : 6543
## Mean :1967 Mean : 7457
## 3rd Qu.:1972 3rd Qu.:10331
## Max. :2017 Max. :21650
##
modelo <- lm(Price ~ ., datos.Entrenamiento)
modelo
##
## Call:
## lm(formula = Price ~ ., data = datos.Entrenamiento)
##
## Coefficients:
## (Intercept) Rooms Distance Bedroom2 Bathroom
## 1.084e+07 1.477e+05 -2.915e+04 4.809e+04 3.099e+05
## Car Landsize BuildingArea YearBuilt Propertycount
## 7.598e+04 1.540e+01 5.679e+02 -5.470e+03 -8.577e-01
summary(modelo)
##
## Call:
## lm(formula = Price ~ ., data = datos.Entrenamiento)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3547662 -253049 -67234 172910 8385632
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.084e+07 4.108e+05 26.383 < 2e-16 ***
## Rooms 1.477e+05 2.671e+04 5.530 3.37e-08 ***
## Distance -2.915e+04 1.288e+03 -22.628 < 2e-16 ***
## Bedroom2 4.809e+04 2.652e+04 1.813 0.0698 .
## Bathroom 3.099e+05 1.292e+04 23.979 < 2e-16 ***
## Car 7.598e+04 8.236e+03 9.226 < 2e-16 ***
## Landsize 1.540e+01 7.142e+00 2.157 0.0311 *
## BuildingArea 5.679e+02 5.228e+01 10.862 < 2e-16 ***
## YearBuilt -5.470e+03 2.101e+02 -26.034 < 2e-16 ***
## Propertycount -8.577e-01 1.587e+00 -0.541 0.5888
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 481400 on 4943 degrees of freedom
## (4555 observations deleted due to missingness)
## Multiple R-squared: 0.4952, Adjusted R-squared: 0.4942
## F-statistic: 538.7 on 9 and 4943 DF, p-value: < 2.2e-16
modelo <- lm(Price ~ ., datos.Validacion)
modelo
##
## Call:
## lm(formula = Price ~ ., data = datos.Validacion)
##
## Coefficients:
## (Intercept) Rooms Distance Bedroom2 Bathroom
## 9.516e+06 2.931e+05 -2.855e+04 -4.886e+04 2.878e+05
## Car Landsize BuildingArea YearBuilt Propertycount
## 5.334e+04 4.081e+01 -2.439e+01 -4.793e+03 -3.723e+00
summary(modelo)
##
## Call:
## lm(formula = Price ~ ., data = datos.Validacion)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3392886 -258841 -58561 179680 5428216
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.516e+06 5.323e+05 17.879 < 2e-16 ***
## Rooms 2.931e+05 3.055e+04 9.595 < 2e-16 ***
## Distance -2.855e+04 1.830e+03 -15.598 < 2e-16 ***
## Bedroom2 -4.886e+04 3.115e+04 -1.569 0.11685
## Bathroom 2.878e+05 1.838e+04 15.654 < 2e-16 ***
## Car 5.334e+04 1.206e+04 4.421 1.03e-05 ***
## Landsize 4.081e+01 1.389e+01 2.937 0.00335 **
## BuildingArea -2.439e+01 1.732e+01 -1.408 0.15939
## YearBuilt -4.793e+03 2.703e+02 -17.730 < 2e-16 ***
## Propertycount -3.723e+00 2.328e+00 -1.599 0.10989
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 470100 on 2167 degrees of freedom
## (1895 observations deleted due to missingness)
## Multiple R-squared: 0.5097, Adjusted R-squared: 0.5077
## F-statistic: 250.3 on 9 and 2167 DF, p-value: < 2.2e-16
datos.Num <- select(datos, Price, Rooms, Distance, Bedroom2, Bathroom, Car, Landsize, BuildingArea, YearBuilt, Propertycount)
head(datos.Num)
## # A tibble: 6 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.48e6 2 2.5 2 1 1 202 NA NA
## 2 1.03e6 2 2.5 2 1 0 156 79 1900
## 3 1.46e6 3 2.5 3 2 0 134 150 1900
## 4 8.50e5 3 2.5 3 2 1 94 NA NA
## 5 1.60e6 4 2.5 3 1 2 120 142 2014
## 6 9.41e5 2 2.5 2 1 0 181 NA NA
## # ... with 1 more variable: Propertycount <dbl>
str(datos.Num)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 13580 obs. of 10 variables:
## $ Price : num 1480000 1035000 1465000 850000 1600000 ...
## $ Rooms : num 2 2 3 3 4 2 3 2 1 2 ...
## $ Distance : num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ Bedroom2 : num 2 2 3 3 3 2 4 2 1 3 ...
## $ Bathroom : num 1 1 2 2 1 1 2 1 1 1 ...
## $ Car : num 1 0 0 1 2 0 0 2 1 2 ...
## $ Landsize : num 202 156 134 94 120 181 245 256 0 220 ...
## $ BuildingArea : num NA 79 150 NA 142 NA 210 107 NA 75 ...
## $ YearBuilt : num NA 1900 1900 NA 2014 ...
## $ Propertycount: num 4019 4019 4019 4019 4019 ...
## - attr(*, "spec")=
## .. cols(
## .. Suburb = col_character(),
## .. Address = col_character(),
## .. Rooms = col_double(),
## .. Type = col_character(),
## .. Price = col_double(),
## .. Method = col_character(),
## .. SellerG = col_character(),
## .. Date = col_character(),
## .. Distance = col_double(),
## .. Postcode = col_double(),
## .. Bedroom2 = col_double(),
## .. Bathroom = col_double(),
## .. Car = col_double(),
## .. Landsize = col_double(),
## .. BuildingArea = col_double(),
## .. YearBuilt = col_double(),
## .. CouncilArea = col_character(),
## .. Lattitude = col_double(),
## .. Longtitude = col_double(),
## .. Regionname = col_character(),
## .. Propertycount = col_double()
## .. )
summary(datos.Num)
## Price Rooms Distance Bedroom2
## Min. : 85000 Min. : 1.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 650000 1st Qu.: 2.000 1st Qu.: 6.10 1st Qu.: 2.000
## Median : 903000 Median : 3.000 Median : 9.20 Median : 3.000
## Mean :1075684 Mean : 2.938 Mean :10.14 Mean : 2.915
## 3rd Qu.:1330000 3rd Qu.: 3.000 3rd Qu.:13.00 3rd Qu.: 3.000
## Max. :9000000 Max. :10.000 Max. :48.10 Max. :20.000
##
## Bathroom Car Landsize BuildingArea
## Min. :0.000 Min. : 0.00 Min. : 0.0 Min. : 0
## 1st Qu.:1.000 1st Qu.: 1.00 1st Qu.: 177.0 1st Qu.: 93
## Median :1.000 Median : 2.00 Median : 440.0 Median : 126
## Mean :1.534 Mean : 1.61 Mean : 558.4 Mean : 152
## 3rd Qu.:2.000 3rd Qu.: 2.00 3rd Qu.: 651.0 3rd Qu.: 174
## Max. :8.000 Max. :10.00 Max. :433014.0 Max. :44515
## NA's :62 NA's :6450
## YearBuilt Propertycount
## Min. :1196 Min. : 249
## 1st Qu.:1940 1st Qu.: 4380
## Median :1970 Median : 6555
## Mean :1965 Mean : 7454
## 3rd Qu.:1999 3rd Qu.:10331
## Max. :2018 Max. :21650
## NA's :5375
set.seed(2020) #Semilla
entrenamiento <- createDataPartition(datos.Num$Price, p=0.7, list = FALSE)
head(entrenamiento)
## Resample1
## [1,] 1
## [2,] 3
## [3,] 4
## [4,] 5
## [5,] 7
## [6,] 9
nrow(entrenamiento)
## [1] 9508
# Los registros que no están en entrenamiento
head(datos.Num[-entrenamiento,])
## # A tibble: 6 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.03e6 2 2.5 2 1 0 156 79 1900
## 2 9.41e5 2 2.5 2 1 0 181 NA NA
## 3 1.64e6 2 2.5 2 1 2 256 107 1890
## 4 1.35e6 3 2.5 3 2 2 214 190 2005
## 5 7.50e5 2 2.5 2 2 1 0 94 2009
## 6 8.90e5 2 2.5 2 1 1 150 73 1985
## # ... with 1 more variable: Propertycount <dbl>
nrow(datos.Num[-entrenamiento,])
## [1] 4072
# Ver los primeros seis datos con sólo variables numéricas
head(datos.Num)
## # A tibble: 6 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.48e6 2 2.5 2 1 1 202 NA NA
## 2 1.03e6 2 2.5 2 1 0 156 79 1900
## 3 1.46e6 3 2.5 3 2 0 134 150 1900
## 4 8.50e5 3 2.5 3 2 1 94 NA NA
## 5 1.60e6 4 2.5 3 1 2 120 142 2014
## 6 9.41e5 2 2.5 2 1 0 181 NA NA
## # ... with 1 more variable: Propertycount <dbl>
# Ahora a determinar conuntos de datos de entrenamiento y luego head()
datos.Entrenamiento <- datos.Num[entrenamiento,]
head(datos.Entrenamiento)
## # A tibble: 6 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.48e6 2 2.5 2 1 1 202 NA NA
## 2 1.46e6 3 2.5 3 2 0 134 150 1900
## 3 8.50e5 3 2.5 3 2 1 94 NA NA
## 4 1.60e6 4 2.5 3 1 2 120 142 2014
## 5 1.88e6 3 2.5 4 2 0 245 210 1910
## 6 3.00e5 1 2.5 1 1 1 0 NA NA
## # ... with 1 more variable: Propertycount <dbl>
summary(datos.Entrenamiento)
## Price Rooms Distance Bedroom2
## Min. : 85000 Min. : 1.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 650000 1st Qu.: 2.000 1st Qu.: 6.10 1st Qu.: 2.000
## Median : 903000 Median : 3.000 Median : 9.20 Median : 3.000
## Mean :1078063 Mean : 2.937 Mean :10.13 Mean : 2.911
## 3rd Qu.:1330000 3rd Qu.: 3.000 3rd Qu.:13.00 3rd Qu.: 3.000
## Max. :9000000 Max. :10.000 Max. :47.40 Max. :10.000
##
## Bathroom Car Landsize BuildingArea
## Min. :0.000 Min. : 0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:1.000 1st Qu.: 1.000 1st Qu.: 178.0 1st Qu.: 93.0
## Median :1.000 Median : 2.000 Median : 443.5 Median : 126.0
## Mean :1.529 Mean : 1.611 Mean : 579.6 Mean : 146.8
## 3rd Qu.:2.000 3rd Qu.: 2.000 3rd Qu.: 650.0 3rd Qu.: 173.0
## Max. :8.000 Max. :10.000 Max. :433014.0 Max. :6791.0
## NA's :41 NA's :4555
## YearBuilt Propertycount
## Min. :1830 Min. : 389
## 1st Qu.:1940 1st Qu.: 4386
## Median :1970 Median : 6567
## Mean :1965 Mean : 7453
## 3rd Qu.:1999 3rd Qu.:10331
## Max. :2018 Max. :21650
## NA's :3766
# y conjunto de datos de validación y luego head()
datos.Validacion <- datos.Num[-entrenamiento,]
head(datos.Validacion)
## # A tibble: 6 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.03e6 2 2.5 2 1 0 156 79 1900
## 2 9.41e5 2 2.5 2 1 0 181 NA NA
## 3 1.64e6 2 2.5 2 1 2 256 107 1890
## 4 1.35e6 3 2.5 3 2 2 214 190 2005
## 5 7.50e5 2 2.5 2 2 1 0 94 2009
## 6 8.90e5 2 2.5 2 1 1 150 73 1985
## # ... with 1 more variable: Propertycount <dbl>
summary(datos.Validacion)
## Price Rooms Distance Bedroom2
## Min. : 170000 Min. :1.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 649500 1st Qu.:2.000 1st Qu.: 6.20 1st Qu.: 2.000
## Median : 902500 Median :3.000 Median : 9.20 Median : 3.000
## Mean :1070130 Mean :2.941 Mean :10.15 Mean : 2.923
## 3rd Qu.:1330000 3rd Qu.:4.000 3rd Qu.:13.00 3rd Qu.: 3.000
## Max. :8000000 Max. :8.000 Max. :48.10 Max. :20.000
##
## Bathroom Car Landsize BuildingArea
## Min. :0.000 Min. : 0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:1.000 1st Qu.: 1.000 1st Qu.: 173.0 1st Qu.: 93.0
## Median :1.000 Median : 2.000 Median : 435.0 Median : 126.0
## Mean :1.547 Mean : 1.608 Mean : 508.9 Mean : 163.8
## 3rd Qu.:2.000 3rd Qu.: 2.000 3rd Qu.: 653.0 3rd Qu.: 175.0
## Max. :8.000 Max. :10.000 Max. :44500.0 Max. :44515.0
## NA's :21 NA's :1895
## YearBuilt Propertycount
## Min. :1196 Min. : 249
## 1st Qu.:1940 1st Qu.: 4217
## Median :1970 Median : 6543
## Mean :1964 Mean : 7457
## 3rd Qu.:2000 3rd Qu.:10331
## Max. :2017 Max. :21650
## NA's :1609
modelo <- lm(Price ~ ., datos.Entrenamiento)
modelo
##
## Call:
## lm(formula = Price ~ ., data = datos.Entrenamiento)
##
## Coefficients:
## (Intercept) Rooms Distance Bedroom2 Bathroom
## 1.086e+07 9.491e+04 -2.819e+04 5.064e+04 2.682e+05
## Car Landsize BuildingArea YearBuilt Propertycount
## 6.637e+04 1.454e+01 1.819e+03 -5.459e+03 -4.434e-01
summary(modelo)
##
## Call:
## lm(formula = Price ~ ., data = datos.Entrenamiento)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4743841 -240296 -63013 167845 8381097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.086e+07 4.016e+05 27.037 < 2e-16 ***
## Rooms 9.491e+04 2.678e+04 3.544 0.000399 ***
## Distance -2.819e+04 1.268e+03 -22.227 < 2e-16 ***
## Bedroom2 5.064e+04 2.638e+04 1.919 0.055000 .
## Bathroom 2.682e+05 1.327e+04 20.211 < 2e-16 ***
## Car 6.637e+04 8.157e+03 8.136 5.17e-16 ***
## Landsize 1.454e+01 6.995e+00 2.079 0.037691 *
## BuildingArea 1.819e+03 9.344e+01 19.465 < 2e-16 ***
## YearBuilt -5.459e+03 2.054e+02 -26.580 < 2e-16 ***
## Propertycount -4.434e-01 1.563e+00 -0.284 0.776609
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 465000 on 4732 degrees of freedom
## (4766 observations deleted due to missingness)
## Multiple R-squared: 0.5248, Adjusted R-squared: 0.5239
## F-statistic: 580.6 on 9 and 4732 DF, p-value: < 2.2e-16