#Cargar librerias
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(reshape) # Para renombrar columnas
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
library(caret) # Para particiones
## Loading required package: lattice
library(corrplot) # Para correlaciones visuales
## corrplot 0.84 loaded
# Read melb_data.csv directly from its GitHub URL into a data frame
datos <- read.csv(
  file = "https://raw.githubusercontent.com/rpizarrog/FundamentosMachineLearning/master/datos/melb_data.csv"
)
# Show the first rows of the data
datos %>% head()
## Suburb Address Rooms Type Price Method SellerG Date
## 1 Abbotsford 85 Turner St 2 h 1480000 S Biggin 3/12/2016
## 2 Abbotsford 25 Bloomburg St 2 h 1035000 S Biggin 4/02/2016
## 3 Abbotsford 5 Charles St 3 h 1465000 SP Biggin 4/03/2017
## 4 Abbotsford 40 Federation La 3 h 850000 PI Biggin 4/03/2017
## 5 Abbotsford 55a Park St 4 h 1600000 VB Nelson 4/06/2016
## 6 Abbotsford 129 Charles St 2 h 941000 S Jellis 7/05/2016
## Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 2.5 3067 2 1 1 202 NA NA
## 2 2.5 3067 2 1 0 156 79 1900
## 3 2.5 3067 3 2 0 134 150 1900
## 4 2.5 3067 3 2 1 94 NA NA
## 5 2.5 3067 3 1 2 120 142 2014
## 6 2.5 3067 2 1 0 181 NA NA
## CouncilArea Lattitude Longtitude Regionname Propertycount
## 1 Yarra -37.7996 144.9984 Northern Metropolitan 4019
## 2 Yarra -37.8079 144.9934 Northern Metropolitan 4019
## 3 Yarra -37.8093 144.9944 Northern Metropolitan 4019
## 4 Yarra -37.7969 144.9969 Northern Metropolitan 4019
## 5 Yarra -37.8072 144.9941 Northern Metropolitan 4019
## 6 Yarra -37.8041 144.9953 Northern Metropolitan 4019
# Show the last rows of the data
datos %>% tail()
## Suburb Address Rooms Type Price Method SellerG Date
## 13575 Westmeadows 9 Black St 3 h 582000 S Red 26/08/2017
## 13576 Wheelers Hill 12 Strada Cr 4 h 1245000 S Barry 26/08/2017
## 13577 Williamstown 77 Merrett Dr 3 h 1031000 SP Williams 26/08/2017
## 13578 Williamstown 83 Power St 3 h 1170000 S Raine 26/08/2017
## 13579 Williamstown 96 Verdon St 4 h 2500000 PI Sweeney 26/08/2017
## 13580 Yarraville 6 Agnes St 4 h 1285000 SP Village 26/08/2017
## Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 13575 16.5 3049 3 2 2 256 NA NA
## 13576 16.7 3150 4 2 2 652 NA 1981
## 13577 6.8 3016 3 2 2 333 133 1995
## 13578 6.8 3016 3 2 4 436 NA 1997
## 13579 6.8 3016 4 1 5 866 157 1920
## 13580 6.3 3013 4 1 1 362 112 1920
## CouncilArea Lattitude Longtitude Regionname Propertycount
## 13575 -37.67917 144.8939 Northern Metropolitan 2474
## 13576 -37.90562 145.1676 South-Eastern Metropolitan 7392
## 13577 -37.85927 144.8790 Western Metropolitan 6380
## 13578 -37.85274 144.8874 Western Metropolitan 6380
## 13579 -37.85908 144.8930 Western Metropolitan 6380
## 13580 -37.81188 144.8845 Western Metropolitan 6543
# Describe the data: column types and first values of every column
str(object = datos)
## 'data.frame': 13580 obs. of 21 variables:
## $ Suburb : Factor w/ 314 levels "Abbotsford","Aberfeldie",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Address : Factor w/ 13378 levels "1 Adelle Ct",..: 12795 5944 9815 9005 10590 2196 2143 13336 11083 1091 ...
## $ Rooms : int 2 2 3 3 4 2 3 2 1 2 ...
## $ Type : Factor w/ 3 levels "h","t","u": 1 1 1 1 1 1 1 1 3 1 ...
## $ Price : num 1480000 1035000 1465000 850000 1600000 ...
## $ Method : Factor w/ 5 levels "PI","S","SA",..: 2 2 4 1 5 2 2 2 2 2 ...
## $ SellerG : Factor w/ 268 levels "@Realty","Abercromby's",..: 24 24 24 24 165 114 165 165 24 24 ...
## $ Date : Factor w/ 58 levels "1/07/2017","10/09/2016",..: 46 48 49 49 50 53 53 57 57 57 ...
## $ Distance : num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ Postcode : num 3067 3067 3067 3067 3067 ...
## $ Bedroom2 : num 2 2 3 3 3 2 4 2 1 3 ...
## $ Bathroom : num 1 1 2 2 1 1 2 1 1 1 ...
## $ Car : num 1 0 0 1 2 0 0 2 1 2 ...
## $ Landsize : num 202 156 134 94 120 181 245 256 0 220 ...
## $ BuildingArea : num NA 79 150 NA 142 NA 210 107 NA 75 ...
## $ YearBuilt : num NA 1900 1900 NA 2014 ...
## $ CouncilArea : Factor w/ 34 levels "","Banyule","Bayside",..: 33 33 33 33 33 33 33 33 33 33 ...
## $ Lattitude : num -37.8 -37.8 -37.8 -37.8 -37.8 ...
## $ Longtitude : num 145 145 145 145 145 ...
## $ Regionname : Factor w/ 8 levels "Eastern Metropolitan",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Propertycount: num 4019 4019 4019 4019 4019 ...
# Per-column summary statistics of the full data set
summary(object = datos)
## Suburb Address Rooms Type
## Reservoir : 359 1/1 Clarendon St: 3 Min. : 1.000 h:9449
## Richmond : 260 13 Robinson St : 3 1st Qu.: 2.000 t:1114
## Bentleigh East: 249 14 Arthur St : 3 Median : 3.000 u:3017
## Preston : 239 2 Bruce St : 3 Mean : 2.938
## Brunswick : 222 28 Blair St : 3 3rd Qu.: 3.000
## Essendon : 220 36 Aberfeldie St: 3 Max. :10.000
## (Other) :12031 (Other) :13562
## Price Method SellerG Date
## Min. : 85000 PI:1564 Nelson :1565 27/05/2017: 473
## 1st Qu.: 650000 S :9022 Jellis :1316 3/06/2017 : 395
## Median : 903000 SA: 92 hockingstuart:1167 12/08/2017: 387
## Mean :1075684 SP:1703 Barry :1011 17/06/2017: 374
## 3rd Qu.:1330000 VB:1199 Ray : 701 27/11/2016: 362
## Max. :9000000 Marshall : 659 29/07/2017: 341
## (Other) :7161 (Other) :11248
## Distance Postcode Bedroom2 Bathroom
## Min. : 0.00 Min. :3000 Min. : 0.000 Min. :0.000
## 1st Qu.: 6.10 1st Qu.:3044 1st Qu.: 2.000 1st Qu.:1.000
## Median : 9.20 Median :3084 Median : 3.000 Median :1.000
## Mean :10.14 Mean :3105 Mean : 2.915 Mean :1.534
## 3rd Qu.:13.00 3rd Qu.:3148 3rd Qu.: 3.000 3rd Qu.:2.000
## Max. :48.10 Max. :3977 Max. :20.000 Max. :8.000
##
## Car Landsize BuildingArea YearBuilt
## Min. : 0.00 Min. : 0.0 Min. : 0 Min. :1196
## 1st Qu.: 1.00 1st Qu.: 177.0 1st Qu.: 93 1st Qu.:1940
## Median : 2.00 Median : 440.0 Median : 126 Median :1970
## Mean : 1.61 Mean : 558.4 Mean : 152 Mean :1965
## 3rd Qu.: 2.00 3rd Qu.: 651.0 3rd Qu.: 174 3rd Qu.:1999
## Max. :10.00 Max. :433014.0 Max. :44515 Max. :2018
## NA's :62 NA's :6450 NA's :5375
## CouncilArea Lattitude Longtitude
## :1369 Min. :-38.18 Min. :144.4
## Moreland :1163 1st Qu.:-37.86 1st Qu.:144.9
## Boroondara :1160 Median :-37.80 Median :145.0
## Moonee Valley: 997 Mean :-37.81 Mean :145.0
## Darebin : 934 3rd Qu.:-37.76 3rd Qu.:145.1
## Glen Eira : 848 Max. :-37.41 Max. :145.5
## (Other) :7109
## Regionname Propertycount
## Southern Metropolitan :4695 Min. : 249
## Northern Metropolitan :3890 1st Qu.: 4380
## Western Metropolitan :2948 Median : 6555
## Eastern Metropolitan :1471 Mean : 7454
## South-Eastern Metropolitan: 450 3rd Qu.:10331
## Eastern Victoria : 53 Max. :21650
## (Other) : 73
# Linear model: Price as a function of Rooms and Distance
modelo <- lm(formula = Price ~ Rooms + Distance, data = datos)
print(modelo)
##
## Call:
## lm(formula = Price ~ Rooms + Distance, data = datos)
##
## Coefficients:
## (Intercept) Rooms Distance
## 277453 398697 -36807
# Coefficient table, residual standard error and R-squared of the fit
summary(object = modelo)
##
## Call:
## lm(formula = Price ~ Rooms + Distance, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2919059 -309722 -93892 198144 8218424
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 277453.1 14787.5 18.76 <2e-16 ***
## Rooms 398696.7 4839.2 82.39 <2e-16 ***
## Distance -36806.8 788.1 -46.70 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 515100 on 13577 degrees of freedom
## Multiple R-squared: 0.3509, Adjusted R-squared: 0.3508
## F-statistic: 3670 on 2 and 13577 DF, p-value: < 2.2e-16
# Scatter-plot matrix of the response and BOTH predictors of the model
# Price ~ Rooms + Distance (Distance was previously left out of the plot)
pairs(datos[, c("Price", "Rooms", "Distance")])
#INTERPRETACION
#Con el modelo lineal múltiple generado con dos variables (Rooms y Distance), se observa en summary() que las variables Rooms y Distance solo explican el 35% de la variabilidad del precio, además de que el error estándar residual es muy alto:
#Multiple R-squared: 0.3509, Adjusted R-squared: 0.3508
#Residual standard error: $ 515,100, por lo cual no convence.
#Veremos que sucede con un modelo en donde participen todas las variables
#Sin embargo un buen principio es identificar las correlaciones del conjunto de datos de aquellas variables que son numéricas o variables cuantitativas y #tentativamente tomar las correlaciones cercanas a 1 o a -1
#Variable dependiente: Price; es la que vamos a predecir
#Variables independientes: Rooms + Distance + Bedroom2 + Bathroom + Car + Landsize + BuildingArea + YearBuilt + Propertycount
#Propertycount significa … Número de propiedades que existen cerca de la casa en particular
#¿Qué otra variable hay que entender?
# Working data set with a hand-picked subset of numeric columns:
# the response (Price) plus nine candidate predictors
datos.Num <- datos %>%
  select(
    Price, Rooms, Distance, Bedroom2, Bathroom, Car,
    Landsize, BuildingArea, YearBuilt, Propertycount
  )
datos.Num %>% head()
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000 2 2.5 2 1 1 202 NA NA
## 2 1035000 2 2.5 2 1 0 156 79 1900
## 3 1465000 3 2.5 3 2 0 134 150 1900
## 4 850000 3 2.5 3 2 1 94 NA NA
## 5 1600000 4 2.5 3 1 2 120 142 2014
## 6 941000 2 2.5 2 1 0 181 NA NA
## Propertycount
## 1 4019
## 2 4019
## 3 4019
## 4 4019
## 5 4019
## 6 4019
# Column types of the numeric-only data set
str(object = datos.Num)
## 'data.frame': 13580 obs. of 10 variables:
## $ Price : num 1480000 1035000 1465000 850000 1600000 ...
## $ Rooms : int 2 2 3 3 4 2 3 2 1 2 ...
## $ Distance : num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ Bedroom2 : num 2 2 3 3 3 2 4 2 1 3 ...
## $ Bathroom : num 1 1 2 2 1 1 2 1 1 1 ...
## $ Car : num 1 0 0 1 2 0 0 2 1 2 ...
## $ Landsize : num 202 156 134 94 120 181 245 256 0 220 ...
## $ BuildingArea : num NA 79 150 NA 142 NA 210 107 NA 75 ...
## $ YearBuilt : num NA 1900 1900 NA 2014 ...
## $ Propertycount: num 4019 4019 4019 4019 4019 ...
#Depurar, limpiar los datos
#* ¿Hay algunos NA que pueden afectar al modelo?
#* Sí, en las variables BuildingArea, YearBuilt y Car
#* Primero encontrar los registros y columnas que tienen NA
#* Actualizar conforme a su mediana; ¿por qué?, es decisión del analista, y la finalidad es que no afecten al modelo: es mejor que tengan un valor (la mediana) a que no tengan nada
# Medians of BuildingArea, YearBuilt and Car, ignoring missing values
# (summary(<column>)[3] is an alternative way to obtain the same figure).
# NOTE(review): the medians are taken over all rows, before the
# train/validation split made later in the script - confirm this is intended.
mediana.BA <- median(datos.Num[["BuildingArea"]], na.rm = TRUE)
mediana.YB <- median(datos.Num[["YearBuilt"]], na.rm = TRUE)
mediana.C <- median(datos.Num[["Car"]], na.rm = TRUE)
# The NAs are about to be replaced by these medians
datos.Num %>% head(n = 10) # first 10 rows, NAs still visible
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000 2 2.5 2 1 1 202 NA NA
## 2 1035000 2 2.5 2 1 0 156 79 1900
## 3 1465000 3 2.5 3 2 0 134 150 1900
## 4 850000 3 2.5 3 2 1 94 NA NA
## 5 1600000 4 2.5 3 1 2 120 142 2014
## 6 941000 2 2.5 2 1 0 181 NA NA
## 7 1876000 3 2.5 4 2 0 245 210 1910
## 8 1636000 2 2.5 2 1 2 256 107 1890
## 9 300000 1 2.5 1 1 1 0 NA NA
## 10 1097000 2 2.5 3 1 2 220 75 1900
## Propertycount
## 1 4019
## 2 4019
## 3 4019
## 4 4019
## 5 4019
## 6 4019
## 7 4019
## 8 4019
## 9 4019
## 10 4019
# Replace the NAs of the three columns by their medians in a single mutate()
datos.Num <- datos.Num %>%
  mutate(
    BuildingArea = replace(BuildingArea, is.na(BuildingArea), mediana.BA),
    YearBuilt = replace(YearBuilt, is.na(YearBuilt), mediana.YB),
    Car = replace(Car, is.na(Car), mediana.C)
  )
datos.Num %>% head(n = 10) # first 10 rows, no NAs left
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000 2 2.5 2 1 1 202 126 1970
## 2 1035000 2 2.5 2 1 0 156 79 1900
## 3 1465000 3 2.5 3 2 0 134 150 1900
## 4 850000 3 2.5 3 2 1 94 126 1970
## 5 1600000 4 2.5 3 1 2 120 142 2014
## 6 941000 2 2.5 2 1 0 181 126 1970
## 7 1876000 3 2.5 4 2 0 245 210 1910
## 8 1636000 2 2.5 2 1 2 256 107 1890
## 9 300000 1 2.5 1 1 1 0 126 1970
## 10 1097000 2 2.5 3 1 2 220 75 1900
## Propertycount
## 1 4019
## 2 4019
## 3 4019
## 4 4019
## 5 4019
## 6 4019
## 7 4019
## 8 4019
## 9 4019
## 10 4019
# Correlation matrix of all columns of datos.Num
correlaciones <- cor(x = datos.Num)
print(correlaciones)
## Price Rooms Distance Bedroom2 Bathroom
## Price 1.00000000 0.49663368 -0.16252184 0.47595103 0.46703818
## Rooms 0.49663368 1.00000000 0.29420252 0.94419027 0.59293408
## Distance -0.16252184 0.29420252 1.00000000 0.29592676 0.12715513
## Bedroom2 0.47595103 0.94419027 0.29592676 1.00000000 0.58468549
## Bathroom 0.46703818 0.59293408 0.12715513 0.58468549 1.00000000
## Car 0.23910905 0.40693502 0.26059567 0.40386694 0.32101386
## Landsize 0.03750745 0.02567835 0.02500376 0.02564625 0.03713036
## BuildingArea 0.06976260 0.09275700 0.07396811 0.09034577 0.08771400
## YearBuilt -0.25938724 -0.05156167 0.19481506 -0.04133120 0.11395652
## Propertycount -0.04215261 -0.08153007 -0.05491034 -0.08135034 -0.05220075
## Car Landsize BuildingArea YearBuilt Propertycount
## Price 0.23910905 0.037507450 0.069762599 -0.259387242 -0.042152615
## Rooms 0.40693502 0.025678350 0.092757000 -0.051561667 -0.081530072
## Distance 0.26059567 0.025003758 0.073968113 0.194815064 -0.054910338
## Bedroom2 0.40386694 0.025646248 0.090345775 -0.041331197 -0.081350337
## Bathroom 0.32101386 0.037130357 0.087714000 0.113956517 -0.052200750
## Car 1.00000000 0.026779687 0.068271708 0.078695800 -0.024344443
## Landsize 0.02677969 1.000000000 0.094015130 0.008805811 -0.006853942
## BuildingArea 0.06827171 0.094015130 1.000000000 0.002358526 -0.020905312
## YearBuilt 0.07869580 0.008805811 0.002358526 1.000000000 0.004420750
## Propertycount -0.02434444 -0.006853942 -0.020905312 0.004420750 1.000000000
# Draw the correlation matrix, showing each coefficient as a number
corrplot(corr = correlaciones, method = "number")
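#* ¿Qué les dicen las correlaciones?
#* Las variables Price y Car tienen una correlación débil (0.24)
#* Con respecto a Price no hay correlaciones importantes, ni cercanas a 1 (positivas) ni cercanas a -1 (negativas); la mayor es con Rooms (0.50). Rooms y Bedroom2 sí están fuertemente correlacionadas entre sí (0.94), pero seguimos en busca del modelo de regresión lineal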
#Crear conjuntos de entrenamiento y conjuntos de validación
#* Ya se tienen los datos limpios, sin NAs
#* Primero determinar el 70% de los registros para entrenamiento y el 30% restante para validación
#* Se muestran con head y número de registros respectivamente
#* Los datos de entrenamiento representan el 70%
#* Los datos de validación representan el 30% restante
set.seed(seed = 2020) # fixed seed so the partition can be reproduced
# Row indices of the training set: 70% of the rows, partitioned on Price
# and returned as a matrix (list = FALSE)
entrena <- createDataPartition(y = datos.Num$Price, p = 0.7, list = FALSE)
entrena %>% head()
## Resample1
## [1,] 1
## [2,] 3
## [3,] 4
## [4,] 5
## [5,] 7
## [6,] 9
dim(entrena)[1]
## [1] 9508
# Rows that are NOT in entrena
datos.Num[-entrena, ] %>% head()
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 2 1035000 2 2.5 2 1 0 156 79 1900
## 6 941000 2 2.5 2 1 0 181 126 1970
## 8 1636000 2 2.5 2 1 2 256 107 1890
## 12 1350000 3 2.5 3 2 2 214 190 2005
## 13 750000 2 2.5 2 2 1 0 94 2009
## 20 890000 2 2.5 2 1 1 150 73 1985
## Propertycount
## 2 4019
## 6 4019
## 8 4019
## 12 4019
## 13 4019
## 20 4019
# Number of rows left outside the training indices
dim(datos.Num[-entrena, ])[1]
## [1] 4072
# First six rows of the numeric-only data
datos.Num %>% head()
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000 2 2.5 2 1 1 202 126 1970
## 2 1035000 2 2.5 2 1 0 156 79 1900
## 3 1465000 3 2.5 3 2 0 134 150 1900
## 4 850000 3 2.5 3 2 1 94 126 1970
## 5 1600000 4 2.5 3 1 2 120 142 2014
## 6 941000 2 2.5 2 1 0 181 126 1970
## Propertycount
## 1 4019
## 2 4019
## 3 4019
## 4 4019
## 5 4019
## 6 4019
# Training set: the rows of datos.Num selected by entrena, then a preview
datos.Entrena <- datos.Num[entrena, ]
datos.Entrena %>% head()
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000 2 2.5 2 1 1 202 126 1970
## 3 1465000 3 2.5 3 2 0 134 150 1900
## 4 850000 3 2.5 3 2 1 94 126 1970
## 5 1600000 4 2.5 3 1 2 120 142 2014
## 7 1876000 3 2.5 4 2 0 245 210 1910
## 9 300000 1 2.5 1 1 1 0 126 1970
## Propertycount
## 1 4019
## 3 4019
## 4 4019
## 5 4019
## 7 4019
## 9 4019
# Summary statistics of the training set
summary(object = datos.Entrena)
## Price Rooms Distance Bedroom2
## Min. : 85000 Min. : 1.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 650000 1st Qu.: 2.000 1st Qu.: 6.10 1st Qu.: 2.000
## Median : 903000 Median : 3.000 Median : 9.20 Median : 3.000
## Mean :1078063 Mean : 2.937 Mean :10.13 Mean : 2.911
## 3rd Qu.:1330000 3rd Qu.: 3.000 3rd Qu.:13.00 3rd Qu.: 3.000
## Max. :9000000 Max. :10.000 Max. :47.40 Max. :10.000
## Bathroom Car Landsize BuildingArea
## Min. :0.000 Min. : 0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:1.000 1st Qu.: 1.000 1st Qu.: 178.0 1st Qu.: 123.0
## Median :1.000 Median : 2.000 Median : 443.5 Median : 126.0
## Mean :1.529 Mean : 1.613 Mean : 579.6 Mean : 136.8
## 3rd Qu.:2.000 3rd Qu.: 2.000 3rd Qu.: 650.0 3rd Qu.: 129.0
## Max. :8.000 Max. :10.000 Max. :433014.0 Max. :6791.0
## YearBuilt Propertycount
## Min. :1830 Min. : 389
## 1st Qu.:1960 1st Qu.: 4386
## Median :1970 Median : 6567
## Mean :1967 Mean : 7453
## 3rd Qu.:1975 3rd Qu.:10331
## Max. :2018 Max. :21650
# Validation set: every row of datos.Num not selected by entrena, then a preview
datos.Valida <- datos.Num[-entrena, ]
datos.Valida %>% head()
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 2 1035000 2 2.5 2 1 0 156 79 1900
## 6 941000 2 2.5 2 1 0 181 126 1970
## 8 1636000 2 2.5 2 1 2 256 107 1890
## 12 1350000 3 2.5 3 2 2 214 190 2005
## 13 750000 2 2.5 2 2 1 0 94 2009
## 20 890000 2 2.5 2 1 1 150 73 1985
## Propertycount
## 2 4019
## 6 4019
## 8 4019
## 12 4019
## 13 4019
## 20 4019
# Summary statistics of the validation set
summary(object = datos.Valida)
## Price Rooms Distance Bedroom2
## Min. : 170000 Min. :1.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 649500 1st Qu.:2.000 1st Qu.: 6.20 1st Qu.: 2.000
## Median : 902500 Median :3.000 Median : 9.20 Median : 3.000
## Mean :1070130 Mean :2.941 Mean :10.15 Mean : 2.923
## 3rd Qu.:1330000 3rd Qu.:4.000 3rd Qu.:13.00 3rd Qu.: 3.000
## Max. :8000000 Max. :8.000 Max. :48.10 Max. :20.000
## Bathroom Car Landsize BuildingArea
## Min. :0.000 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.:1.000 1st Qu.: 1.00 1st Qu.: 173.0 1st Qu.: 120.0
## Median :1.000 Median : 2.00 Median : 435.0 Median : 126.0
## Mean :1.547 Mean : 1.61 Mean : 508.9 Mean : 146.2
## 3rd Qu.:2.000 3rd Qu.: 2.00 3rd Qu.: 653.0 3rd Qu.: 130.0
## Max. :8.000 Max. :10.00 Max. :44500.0 Max. :44515.0
## YearBuilt Propertycount
## Min. :1196 Min. : 249
## 1st Qu.:1960 1st Qu.: 4217
## Median :1970 Median : 6543
## Mean :1967 Mean : 7457
## 3rd Qu.:1972 3rd Qu.:10331
## Max. :2017 Max. :21650
# Multiple linear regression model
# * Price as a function of every other column of the training set
modelo <- lm(formula = Price ~ ., data = datos.Entrena)
print(modelo)
##
## Call:
## lm(formula = Price ~ ., data = datos.Entrena)
##
## Coefficients:
## (Intercept) Rooms Distance Bedroom2 Bathroom
## 1.031e+07 1.889e+05 -3.116e+04 3.998e+04 2.527e+05
## Car Landsize BuildingArea YearBuilt Propertycount
## 6.403e+04 3.342e+00 5.647e+02 -5.159e+03 -1.143e+00
# Coefficient table, residual standard error and R-squared of the fit
summary(object = modelo)
##
## Call:
## lm(formula = Price ~ ., data = datos.Entrena)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3627062 -274161 -81595 184748 8324301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.031e+07 3.590e+05 28.718 < 2e-16 ***
## Rooms 1.889e+05 1.819e+04 10.387 < 2e-16 ***
## Distance -3.116e+04 9.198e+02 -33.875 < 2e-16 ***
## Bedroom2 3.998e+04 1.805e+04 2.215 0.02675 *
## Bathroom 2.527e+05 9.235e+03 27.365 < 2e-16 ***
## Car 6.403e+04 5.722e+03 11.190 < 2e-16 ***
## Landsize 3.342e+00 1.043e+00 3.204 0.00136 **
## BuildingArea 5.647e+02 4.954e+01 11.399 < 2e-16 ***
## YearBuilt -5.159e+03 1.828e+02 -28.216 < 2e-16 ***
## Propertycount -1.143e+00 1.128e+00 -1.014 0.31072
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 478200 on 9498 degrees of freedom
## Multiple R-squared: 0.4437, Adjusted R-squared: 0.4432
## F-statistic: 841.9 on 9 and 9498 DF, p-value: < 2.2e-16
#Interpretación
#* No es un buen modelo
#* Residual standard error: 478200 on 9498 degrees of freedom; Multiple R-squared: 0.4437, Adjusted R-squared: 0.4432
#* La regresión lineal no es el modelo adecuado para predecir con estos datos
# Test the model with the validation data.
# The model fitted on the training set (modelo) is used to predict the
# held-out rows; it must NOT be refitted on datos.Valida, because a model
# fitted on the validation rows says nothing about how the trained model
# generalises. modelo is therefore left untouched here.
predicciones <- predict(modelo, newdata = datos.Valida)
head(predicciones)

# Prediction errors on the validation set
residuos.Valida <- datos.Valida$Price - predicciones

# Root mean squared error, in the same units as Price
rmse.Valida <- sqrt(mean(residuos.Valida^2))

# Out-of-sample R-squared: 1 - SSE / SST, with SST measured around the
# mean of the validation prices
sst.Valida <- sum((datos.Valida$Price - mean(datos.Valida$Price))^2)
r2.Valida <- 1 - sum(residuos.Valida^2) / sst.Valida

c(RMSE = rmse.Valida, R2 = r2.Valida)
# NOTE(review): re-knit the document to regenerate the printed results of
# this section.
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Summary statistics of the cars data set
summary(object = cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE
parameter was added to the code chunk to prevent printing of the R code that generated the plot.