###Regresión lineal múltiple para precios de casas ##Predicción lineal ##Regresión lineal múltiple con variables cuantitativas
#Las librerías
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(reshape) # Para renombrar columnas
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
library(caret) # Para particiones
## Loading required package: lattice
library(corrplot) # Para correlaciones visuales
## corrplot 0.84 loaded
##los datos originales
# Load the Melbourne housing data set. stringsAsFactors is set explicitly
# because its default changed to FALSE in R 4.0.0, while the str() and
# summary() output shown in this document lists the text columns as factors.
datos <- read.csv(
  "https://raw.githubusercontent.com/rpizarrog/FundamentosMachineLearning/master/datos/melb_data.csv",
  stringsAsFactors = TRUE
)
# Show the first rows of the raw data.
head(datos)
## Suburb Address Rooms Type Price Method SellerG Date
## 1 Abbotsford 85 Turner St 2 h 1480000 S Biggin 3/12/2016
## 2 Abbotsford 25 Bloomburg St 2 h 1035000 S Biggin 4/02/2016
## 3 Abbotsford 5 Charles St 3 h 1465000 SP Biggin 4/03/2017
## 4 Abbotsford 40 Federation La 3 h 850000 PI Biggin 4/03/2017
## 5 Abbotsford 55a Park St 4 h 1600000 VB Nelson 4/06/2016
## 6 Abbotsford 129 Charles St 2 h 941000 S Jellis 7/05/2016
## Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 2.5 3067 2 1 1 202 NA NA
## 2 2.5 3067 2 1 0 156 79 1900
## 3 2.5 3067 3 2 0 134 150 1900
## 4 2.5 3067 3 2 1 94 NA NA
## 5 2.5 3067 3 1 2 120 142 2014
## 6 2.5 3067 2 1 0 181 NA NA
## CouncilArea Lattitude Longtitude Regionname Propertycount
## 1 Yarra -37.7996 144.9984 Northern Metropolitan 4019
## 2 Yarra -37.8079 144.9934 Northern Metropolitan 4019
## 3 Yarra -37.8093 144.9944 Northern Metropolitan 4019
## 4 Yarra -37.7969 144.9969 Northern Metropolitan 4019
## 5 Yarra -37.8072 144.9941 Northern Metropolitan 4019
## 6 Yarra -37.8041 144.9953 Northern Metropolitan 4019
# Show the last rows of the raw data.
tail(datos)
## Suburb Address Rooms Type Price Method SellerG Date
## 13575 Westmeadows 9 Black St 3 h 582000 S Red 26/08/2017
## 13576 Wheelers Hill 12 Strada Cr 4 h 1245000 S Barry 26/08/2017
## 13577 Williamstown 77 Merrett Dr 3 h 1031000 SP Williams 26/08/2017
## 13578 Williamstown 83 Power St 3 h 1170000 S Raine 26/08/2017
## 13579 Williamstown 96 Verdon St 4 h 2500000 PI Sweeney 26/08/2017
## 13580 Yarraville 6 Agnes St 4 h 1285000 SP Village 26/08/2017
## Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 13575 16.5 3049 3 2 2 256 NA NA
## 13576 16.7 3150 4 2 2 652 NA 1981
## 13577 6.8 3016 3 2 2 333 133 1995
## 13578 6.8 3016 3 2 4 436 NA 1997
## 13579 6.8 3016 4 1 5 866 157 1920
## 13580 6.3 3013 4 1 1 362 112 1920
## CouncilArea Lattitude Longtitude Regionname Propertycount
## 13575 -37.67917 144.8939 Northern Metropolitan 2474
## 13576 -37.90562 145.1676 South-Eastern Metropolitan 7392
## 13577 -37.85927 144.8790 Western Metropolitan 6380
## 13578 -37.85274 144.8874 Western Metropolitan 6380
## 13579 -37.85908 144.8930 Western Metropolitan 6380
## 13580 -37.81188 144.8845 Western Metropolitan 6543
##Describir los datos str() y summary() Hay variables numéricas y categóricas Nos vamos a concentrar en las variables numéricas sólamente
# Inspect the structure of the raw data: column names and types.
str(datos)
## 'data.frame': 13580 obs. of 21 variables:
## $ Suburb : Factor w/ 314 levels "Abbotsford","Aberfeldie",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Address : Factor w/ 13378 levels "1 Adelle Ct",..: 12795 5944 9815 9005 10590 2196 2143 13336 11083 1091 ...
## $ Rooms : int 2 2 3 3 4 2 3 2 1 2 ...
## $ Type : Factor w/ 3 levels "h","t","u": 1 1 1 1 1 1 1 1 3 1 ...
## $ Price : num 1480000 1035000 1465000 850000 1600000 ...
## $ Method : Factor w/ 5 levels "PI","S","SA",..: 2 2 4 1 5 2 2 2 2 2 ...
## $ SellerG : Factor w/ 268 levels "@Realty","Abercromby's",..: 24 24 24 24 165 114 165 165 24 24 ...
## $ Date : Factor w/ 58 levels "1/07/2017","10/09/2016",..: 46 48 49 49 50 53 53 57 57 57 ...
## $ Distance : num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ Postcode : num 3067 3067 3067 3067 3067 ...
## $ Bedroom2 : num 2 2 3 3 3 2 4 2 1 3 ...
## $ Bathroom : num 1 1 2 2 1 1 2 1 1 1 ...
## $ Car : num 1 0 0 1 2 0 0 2 1 2 ...
## $ Landsize : num 202 156 134 94 120 181 245 256 0 220 ...
## $ BuildingArea : num NA 79 150 NA 142 NA 210 107 NA 75 ...
## $ YearBuilt : num NA 1900 1900 NA 2014 ...
## $ CouncilArea : Factor w/ 34 levels "","Banyule","Bayside",..: 33 33 33 33 33 33 33 33 33 33 ...
## $ Lattitude : num -37.8 -37.8 -37.8 -37.8 -37.8 ...
## $ Longtitude : num 145 145 145 145 145 ...
## $ Regionname : Factor w/ 8 levels "Eastern Metropolitan",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Propertycount: num 4019 4019 4019 4019 4019 ...
# Per-column summary statistics of the raw data, including NA counts.
summary(datos)
## Suburb Address Rooms Type
## Reservoir : 359 1/1 Clarendon St: 3 Min. : 1.000 h:9449
## Richmond : 260 13 Robinson St : 3 1st Qu.: 2.000 t:1114
## Bentleigh East: 249 14 Arthur St : 3 Median : 3.000 u:3017
## Preston : 239 2 Bruce St : 3 Mean : 2.938
## Brunswick : 222 28 Blair St : 3 3rd Qu.: 3.000
## Essendon : 220 36 Aberfeldie St: 3 Max. :10.000
## (Other) :12031 (Other) :13562
## Price Method SellerG Date
## Min. : 85000 PI:1564 Nelson :1565 27/05/2017: 473
## 1st Qu.: 650000 S :9022 Jellis :1316 3/06/2017 : 395
## Median : 903000 SA: 92 hockingstuart:1167 12/08/2017: 387
## Mean :1075684 SP:1703 Barry :1011 17/06/2017: 374
## 3rd Qu.:1330000 VB:1199 Ray : 701 27/11/2016: 362
## Max. :9000000 Marshall : 659 29/07/2017: 341
## (Other) :7161 (Other) :11248
## Distance Postcode Bedroom2 Bathroom
## Min. : 0.00 Min. :3000 Min. : 0.000 Min. :0.000
## 1st Qu.: 6.10 1st Qu.:3044 1st Qu.: 2.000 1st Qu.:1.000
## Median : 9.20 Median :3084 Median : 3.000 Median :1.000
## Mean :10.14 Mean :3105 Mean : 2.915 Mean :1.534
## 3rd Qu.:13.00 3rd Qu.:3148 3rd Qu.: 3.000 3rd Qu.:2.000
## Max. :48.10 Max. :3977 Max. :20.000 Max. :8.000
##
## Car Landsize BuildingArea YearBuilt
## Min. : 0.00 Min. : 0.0 Min. : 0 Min. :1196
## 1st Qu.: 1.00 1st Qu.: 177.0 1st Qu.: 93 1st Qu.:1940
## Median : 2.00 Median : 440.0 Median : 126 Median :1970
## Mean : 1.61 Mean : 558.4 Mean : 152 Mean :1965
## 3rd Qu.: 2.00 3rd Qu.: 651.0 3rd Qu.: 174 3rd Qu.:1999
## Max. :10.00 Max. :433014.0 Max. :44515 Max. :2018
## NA's :62 NA's :6450 NA's :5375
## CouncilArea Lattitude Longtitude
## :1369 Min. :-38.18 Min. :144.4
## Moreland :1163 1st Qu.:-37.86 1st Qu.:144.9
## Boroondara :1160 Median :-37.80 Median :145.0
## Moonee Valley: 997 Mean :-37.81 Mean :145.0
## Darebin : 934 3rd Qu.:-37.76 3rd Qu.:145.1
## Glen Eira : 848 Max. :-37.41 Max. :145.5
## (Other) :7109
## Regionname Propertycount
## Southern Metropolitan :4695 Min. : 249
## Northern Metropolitan :3890 1st Qu.: 4380
## Western Metropolitan :2948 Median : 6555
## Eastern Metropolitan :1471 Mean : 7454
## South-Eastern Metropolitan: 450 3rd Qu.:10331
## Eastern Victoria : 53 Max. :21650
## (Other) : 73
##Price Vs Rooms + Distance
# Fit a linear model of Price on Rooms and Distance over the full data set,
# then print the call and the fitted coefficients.
modelo <- lm(formula = Price ~ Rooms + Distance, data = datos)
print(modelo)
##
## Call:
## lm(formula = Price ~ Rooms + Distance, data = datos)
##
## Coefficients:
## (Intercept) Rooms Distance
## 277453 398697 -36807
# Detailed fit summary: coefficient table, residual standard error, R-squared.
summary(modelo)
##
## Call:
## lm(formula = Price ~ Rooms + Distance, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2919059 -309722 -93892 198144 8218424
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 277453.1 14787.5 18.76 <2e-16 ***
## Rooms 398696.7 4839.2 82.39 <2e-16 ***
## Distance -36806.8 788.1 -46.70 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 515100 on 13577 degrees of freedom
## Multiple R-squared: 0.3509, Adjusted R-squared: 0.3508
## F-statistic: 3670 on 2 and 13577 DF, p-value: < 2.2e-16
# Scatterplot matrix of the response and both predictors of the model fitted
# above; Distance is part of that model, so it is included in the plot.
pairs(datos[, c("Price", "Rooms", "Distance")])
###Interpretación:
Con el modelo lineal múltiple generado con dos variables (Rooms y Distance), se observa en summary() que las variables Rooms y Distance solo explican el 35% de la variabilidad del precio, además de que el error estándar residual es muy alto:
Multiple R-squared: 0.3509, Adjusted R-squared: 0.3508
Residual standard error: $ 515,100, por lo cual no convence.
Veremos que sucede con un modelo en donde participen todas las variables
Sin embargo un buen principio es identificar las correlaciones del conjunto de datos de aquellas variables que son numéricas o variables cuantitativas y tentativamente tomar las correlaciones cercanas a 1 o a -1
Variable dependiente: Price; es la que vamos a predecir
Variables independientes: Rooms + Distance + Bedroom2 + Bathroom + Car + Landsize + BuildingArea + YearBuilt + Propertycount
Propertycount significa … Número de propiedades que existen cerca de la casa en particular
¿Qué otra variable hay que entender?
##Un conjunto de datos únicamente con las variables numéricas del conjunto de datos original
Con select() de la librería dplyr, se determina el nuevo conjunto de datos
# Keep only the quantitative columns: the response (Price) followed by the
# numeric candidate predictors, then show the first rows.
datos.Num <- datos %>%
  select(
    Price, Rooms, Distance, Bedroom2, Bathroom,
    Car, Landsize, BuildingArea, YearBuilt, Propertycount
  )
head(datos.Num)
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000 2 2.5 2 1 1 202 NA NA
## 2 1035000 2 2.5 2 1 0 156 79 1900
## 3 1465000 3 2.5 3 2 0 134 150 1900
## 4 850000 3 2.5 3 2 1 94 NA NA
## 5 1600000 4 2.5 3 1 2 120 142 2014
## 6 941000 2 2.5 2 1 0 181 NA NA
## Propertycount
## 1 4019
## 2 4019
## 3 4019
## 4 4019
## 5 4019
## 6 4019
# Check the column types of the numeric-only data frame.
str(datos.Num)
## 'data.frame': 13580 obs. of 10 variables:
## $ Price : num 1480000 1035000 1465000 850000 1600000 ...
## $ Rooms : int 2 2 3 3 4 2 3 2 1 2 ...
## $ Distance : num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ Bedroom2 : num 2 2 3 3 3 2 4 2 1 3 ...
## $ Bathroom : num 1 1 2 2 1 1 2 1 1 1 ...
## $ Car : num 1 0 0 1 2 0 0 2 1 2 ...
## $ Landsize : num 202 156 134 94 120 181 245 256 0 220 ...
## $ BuildingArea : num NA 79 150 NA 142 NA 210 107 NA 75 ...
## $ YearBuilt : num NA 1900 1900 NA 2014 ...
## $ Propertycount: num 4019 4019 4019 4019 4019 ...
##Depurar, limpiar los datos
¿Hay algunos NA que pueden afectar al modelo?
Sí, en las variables BuildingArea, YearBuilt y Car
Primero encontrar los registros y columnas que tienen NA
Actualizar conforme a su mediana, ¿por qué?, decisión del analista, y la finalidad es que no afecten al modelo; qué mejor que tengan un valor (la mediana) a que no tengan nada
# Medians of the three columns that contain NAs, ignoring the missing values.
# The original author notes summary(<column>)[3] as an alternative.
mediana.BA <- with(datos.Num, median(BuildingArea, na.rm = TRUE))
mediana.YB <- with(datos.Num, median(YearBuilt, na.rm = TRUE))
mediana.C <- with(datos.Num, median(Car, na.rm = TRUE))
##Actualizar mutate() los Na por la medianas
head(datos.Num, 10) # The first 10 rows; NAs are visible
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000 2 2.5 2 1 1 202 NA NA
## 2 1035000 2 2.5 2 1 0 156 79 1900
## 3 1465000 3 2.5 3 2 0 134 150 1900
## 4 850000 3 2.5 3 2 1 94 NA NA
## 5 1600000 4 2.5 3 1 2 120 142 2014
## 6 941000 2 2.5 2 1 0 181 NA NA
## 7 1876000 3 2.5 4 2 0 245 210 1910
## 8 1636000 2 2.5 2 1 2 256 107 1890
## 9 300000 1 2.5 1 1 1 0 NA NA
## 10 1097000 2 2.5 3 1 2 220 75 1900
## Propertycount
## 1 4019
## 2 4019
## 3 4019
## 4 4019
## 5 4019
## 6 4019
## 7 4019
## 8 4019
## 9 4019
## 10 4019
# Replace the missing values of the three incomplete columns with their
# medians in one mutate(); coalesce() keeps existing values and fills only NAs.
datos.Num <- mutate(
  datos.Num,
  BuildingArea = coalesce(BuildingArea, mediana.BA),
  YearBuilt = coalesce(YearBuilt, mediana.YB),
  Car = coalesce(Car, mediana.C)
)
head(datos.Num, 10) # The first 10 rows; NAs are no longer visible
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000 2 2.5 2 1 1 202 126 1970
## 2 1035000 2 2.5 2 1 0 156 79 1900
## 3 1465000 3 2.5 3 2 0 134 150 1900
## 4 850000 3 2.5 3 2 1 94 126 1970
## 5 1600000 4 2.5 3 1 2 120 142 2014
## 6 941000 2 2.5 2 1 0 181 126 1970
## 7 1876000 3 2.5 4 2 0 245 210 1910
## 8 1636000 2 2.5 2 1 2 256 107 1890
## 9 300000 1 2.5 1 1 1 0 126 1970
## 10 1097000 2 2.5 3 1 2 220 75 1900
## Propertycount
## 1 4019
## 2 4019
## 3 4019
## 4 4019
## 5 4019
## 6 4019
## 7 4019
## 8 4019
## 9 4019
## 10 4019
##Correlaciones
# Pairwise correlation matrix of all columns of the numeric data frame,
# printed in full.
correlaciones <- datos.Num %>% cor()
print(correlaciones)
## Price Rooms Distance Bedroom2 Bathroom
## Price 1.00000000 0.49663368 -0.16252184 0.47595103 0.46703818
## Rooms 0.49663368 1.00000000 0.29420252 0.94419027 0.59293408
## Distance -0.16252184 0.29420252 1.00000000 0.29592676 0.12715513
## Bedroom2 0.47595103 0.94419027 0.29592676 1.00000000 0.58468549
## Bathroom 0.46703818 0.59293408 0.12715513 0.58468549 1.00000000
## Car 0.23910905 0.40693502 0.26059567 0.40386694 0.32101386
## Landsize 0.03750745 0.02567835 0.02500376 0.02564625 0.03713036
## BuildingArea 0.06976260 0.09275700 0.07396811 0.09034577 0.08771400
## YearBuilt -0.25938724 -0.05156167 0.19481506 -0.04133120 0.11395652
## Propertycount -0.04215261 -0.08153007 -0.05491034 -0.08135034 -0.05220075
## Car Landsize BuildingArea YearBuilt Propertycount
## Price 0.23910905 0.037507450 0.069762599 -0.259387242 -0.042152615
## Rooms 0.40693502 0.025678350 0.092757000 -0.051561667 -0.081530072
## Distance 0.26059567 0.025003758 0.073968113 0.194815064 -0.054910338
## Bedroom2 0.40386694 0.025646248 0.090345775 -0.041331197 -0.081350337
## Bathroom 0.32101386 0.037130357 0.087714000 0.113956517 -0.052200750
## Car 1.00000000 0.026779687 0.068271708 0.078695800 -0.024344443
## Landsize 0.02677969 1.000000000 0.094015130 0.008805811 -0.006853942
## BuildingArea 0.06827171 0.094015130 1.000000000 0.002358526 -0.020905312
## YearBuilt 0.07869580 0.008805811 0.002358526 1.000000000 0.004420750
## Propertycount -0.02434444 -0.006853942 -0.020905312 0.004420750 1.000000000
# Plot the correlation matrix, showing each coefficient as a number.
corrplot(corr = correlaciones, method = "number")
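*¿Qué les dicen las correlaciones?
*Las variables Price y Car tienen una correlación débil (0.24)
*No hay correlaciones de Price con las demás variables que sean importantes, ni cercanas a 1 (positivas) ni cercanas a -1 (negativas); la única correlación alta es entre los predictores Rooms y Bedroom2 (0.94). Aun así, seguimos en busca del modelo de regresión lineal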
##Crear conjuntos de entrenamiento y conjuntos de validación
*Ya se tienen los datos limpios, sin NAs
*Primero se determina el 70% de los registros para entrenamiento y el 30% restante para validación
*Se muestran con head y número de registros respectivamente
*Los datos de entrenamiento representan el 70%
*Los datos de validación representan el 30% restante
# Fix the random seed so the partition is reproducible, then draw the row
# indices of the 70% training sample, partitioning on Price.
set.seed(seed = 2020)
entrena <- createDataPartition(
  y = datos.Num$Price,
  p = 0.7,
  list = FALSE
)
head(entrena)
## Resample1
## [1,] 1
## [2,] 3
## [3,] 4
## [4,] 5
## [5,] 7
## [6,] 9
# Number of row indices selected for training.
nrow(entrena)
## [1] 9508
# The rows that are not in entrena
head(datos.Num[-entrena,])
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 2 1035000 2 2.5 2 1 0 156 79 1900
## 6 941000 2 2.5 2 1 0 181 126 1970
## 8 1636000 2 2.5 2 1 2 256 107 1890
## 12 1350000 3 2.5 3 2 2 214 190 2005
## 13 750000 2 2.5 2 2 1 0 94 2009
## 20 890000 2 2.5 2 1 1 150 73 1985
## Propertycount
## 2 4019
## 6 4019
## 8 4019
## 12 4019
## 13 4019
## 20 4019
# Number of rows left out of the training sample.
nrow(datos.Num[-entrena,])
## [1] 4072
# First rows of the complete numeric data set.
head(datos.Num)
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000 2 2.5 2 1 1 202 126 1970
## 2 1035000 2 2.5 2 1 0 156 79 1900
## 3 1465000 3 2.5 3 2 0 134 150 1900
## 4 850000 3 2.5 3 2 1 94 126 1970
## 5 1600000 4 2.5 3 1 2 120 142 2014
## 6 941000 2 2.5 2 1 0 181 126 1970
## Propertycount
## 1 4019
## 2 4019
## 3 4019
## 4 4019
## 5 4019
## 6 4019
# Build the training data set from the rows indexed by entrena, then head()
datos.Entrena <- datos.Num[entrena,]
head(datos.Entrena)
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000 2 2.5 2 1 1 202 126 1970
## 3 1465000 3 2.5 3 2 0 134 150 1900
## 4 850000 3 2.5 3 2 1 94 126 1970
## 5 1600000 4 2.5 3 1 2 120 142 2014
## 7 1876000 3 2.5 4 2 0 245 210 1910
## 9 300000 1 2.5 1 1 1 0 126 1970
## Propertycount
## 1 4019
## 3 4019
## 4 4019
## 5 4019
## 7 4019
## 9 4019
# Summary statistics of the training set.
summary(datos.Entrena)
## Price Rooms Distance Bedroom2
## Min. : 85000 Min. : 1.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 650000 1st Qu.: 2.000 1st Qu.: 6.10 1st Qu.: 2.000
## Median : 903000 Median : 3.000 Median : 9.20 Median : 3.000
## Mean :1078063 Mean : 2.937 Mean :10.13 Mean : 2.911
## 3rd Qu.:1330000 3rd Qu.: 3.000 3rd Qu.:13.00 3rd Qu.: 3.000
## Max. :9000000 Max. :10.000 Max. :47.40 Max. :10.000
## Bathroom Car Landsize BuildingArea
## Min. :0.000 Min. : 0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:1.000 1st Qu.: 1.000 1st Qu.: 178.0 1st Qu.: 123.0
## Median :1.000 Median : 2.000 Median : 443.5 Median : 126.0
## Mean :1.529 Mean : 1.613 Mean : 579.6 Mean : 136.8
## 3rd Qu.:2.000 3rd Qu.: 2.000 3rd Qu.: 650.0 3rd Qu.: 129.0
## Max. :8.000 Max. :10.000 Max. :433014.0 Max. :6791.0
## YearBuilt Propertycount
## Min. :1830 Min. : 389
## 1st Qu.:1960 1st Qu.: 4386
## Median :1970 Median : 6567
## Mean :1967 Mean : 7453
## 3rd Qu.:1975 3rd Qu.:10331
## Max. :2018 Max. :21650
# Build the validation data set from the remaining rows, then head()
datos.Valida <- datos.Num[-entrena,]
head(datos.Valida)
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 2 1035000 2 2.5 2 1 0 156 79 1900
## 6 941000 2 2.5 2 1 0 181 126 1970
## 8 1636000 2 2.5 2 1 2 256 107 1890
## 12 1350000 3 2.5 3 2 2 214 190 2005
## 13 750000 2 2.5 2 2 1 0 94 2009
## 20 890000 2 2.5 2 1 1 150 73 1985
## Propertycount
## 2 4019
## 6 4019
## 8 4019
## 12 4019
## 13 4019
## 20 4019
# Summary statistics of the validation set.
summary(datos.Valida)
## Price Rooms Distance Bedroom2
## Min. : 170000 Min. :1.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 649500 1st Qu.:2.000 1st Qu.: 6.20 1st Qu.: 2.000
## Median : 902500 Median :3.000 Median : 9.20 Median : 3.000
## Mean :1070130 Mean :2.941 Mean :10.15 Mean : 2.923
## 3rd Qu.:1330000 3rd Qu.:4.000 3rd Qu.:13.00 3rd Qu.: 3.000
## Max. :8000000 Max. :8.000 Max. :48.10 Max. :20.000
## Bathroom Car Landsize BuildingArea
## Min. :0.000 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.:1.000 1st Qu.: 1.00 1st Qu.: 173.0 1st Qu.: 120.0
## Median :1.000 Median : 2.00 Median : 435.0 Median : 126.0
## Mean :1.547 Mean : 1.61 Mean : 508.9 Mean : 146.2
## 3rd Qu.:2.000 3rd Qu.: 2.00 3rd Qu.: 653.0 3rd Qu.: 130.0
## Max. :8.000 Max. :10.00 Max. :44500.0 Max. :44515.0
## YearBuilt Propertycount
## Min. :1196 Min. : 249
## 1st Qu.:1960 1st Qu.: 4217
## Median :1970 Median : 6543
## Mean :1967 Mean : 7457
## 3rd Qu.:1972 3rd Qu.:10331
## Max. :2017 Max. :21650
#Modelo de regresion lineal multiple
*precio en funcion de todas las variables numericas del conjunto de datos de entrenamiento
# Fit Price against every other column of the training set, then print the
# call and the fitted coefficients.
modelo <- lm(formula = Price ~ ., data = datos.Entrena)
print(modelo)
##
## Call:
## lm(formula = Price ~ ., data = datos.Entrena)
##
## Coefficients:
## (Intercept) Rooms Distance Bedroom2 Bathroom
## 1.031e+07 1.889e+05 -3.116e+04 3.998e+04 2.527e+05
## Car Landsize BuildingArea YearBuilt Propertycount
## 6.403e+04 3.342e+00 5.647e+02 -5.159e+03 -1.143e+00
# Detailed fit summary: coefficient table, residual standard error, R-squared.
summary(modelo)
##
## Call:
## lm(formula = Price ~ ., data = datos.Entrena)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3627062 -274161 -81595 184748 8324301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.031e+07 3.590e+05 28.718 < 2e-16 ***
## Rooms 1.889e+05 1.819e+04 10.387 < 2e-16 ***
## Distance -3.116e+04 9.198e+02 -33.875 < 2e-16 ***
## Bedroom2 3.998e+04 1.805e+04 2.215 0.02675 *
## Bathroom 2.527e+05 9.235e+03 27.365 < 2e-16 ***
## Car 6.403e+04 5.722e+03 11.190 < 2e-16 ***
## Landsize 3.342e+00 1.043e+00 3.204 0.00136 **
## BuildingArea 5.647e+02 4.954e+01 11.399 < 2e-16 ***
## YearBuilt -5.159e+03 1.828e+02 -28.216 < 2e-16 ***
## Propertycount -1.143e+00 1.128e+00 -1.014 0.31072
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 478200 on 9498 degrees of freedom
## Multiple R-squared: 0.4437, Adjusted R-squared: 0.4432
## F-statistic: 841.9 on 9 and 9498 DF, p-value: < 2.2e-16
##Interpretación
La regresión lineal no es un buen modelo para el análisis en este caso
Error estándar residual: 478200 con 9498 grados de libertad (no se eliminaron observaciones, porque los NA ya se reemplazaron por la mediana) Multiple R-squared: 0.4437, Adjusted R-squared: 0.4432
###Probar con los datos de validación
# Evaluate the model trained on datos.Entrena against the held-out rows.
# The previous code refitted lm() on datos.Valida, which produced a second,
# unrelated model and overwrote `modelo`; that says nothing about how the
# trained model performs on data it has not seen.
predicciones <- predict(modelo, newdata = datos.Valida)
residuos <- datos.Valida$Price - predicciones

# Root mean squared error of the predictions, in the same units as Price.
rmse <- sqrt(mean(residuos^2))

# Out-of-sample R-squared: 1 - SSE / SST, with the validation mean of Price
# as the baseline predictor.
sst <- sum((datos.Valida$Price - mean(datos.Valida$Price))^2)
r2 <- 1 - sum(residuos^2) / sst

rmse
r2

# Compare the first observed prices with their predictions.
head(data.frame(real = datos.Valida$Price, prediccion = predicciones))

# NOTE: the printed output that used to follow belonged to the refitted model
# and was removed; re-knit the document to regenerate the output of this chunk.