#Cargar librerias 
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(reshape) # Para renombrar columnas
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
library(caret) # Para particiones
## Loading required package: lattice
library(corrplot) # Para correlaciones visuales
## corrplot 0.84 loaded
# Load the Melbourne housing data set from the course repository.
# stringsAsFactors is set explicitly: R >= 4.0.0 reads text columns as
# character by default, while the str()/summary() output shown below
# (Factor columns with per-level counts) corresponds to factor columns.
datos <- read.csv(
  "https://raw.githubusercontent.com/rpizarrog/FundamentosMachineLearning/master/datos/melb_data.csv",
  stringsAsFactors = TRUE
)
# First six rows
head(datos)
##       Suburb          Address Rooms Type   Price Method SellerG      Date
## 1 Abbotsford     85 Turner St     2    h 1480000      S  Biggin 3/12/2016
## 2 Abbotsford  25 Bloomburg St     2    h 1035000      S  Biggin 4/02/2016
## 3 Abbotsford     5 Charles St     3    h 1465000     SP  Biggin 4/03/2017
## 4 Abbotsford 40 Federation La     3    h  850000     PI  Biggin 4/03/2017
## 5 Abbotsford      55a Park St     4    h 1600000     VB  Nelson 4/06/2016
## 6 Abbotsford   129 Charles St     2    h  941000      S  Jellis 7/05/2016
##   Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1      2.5     3067        2        1   1      202           NA        NA
## 2      2.5     3067        2        1   0      156           79      1900
## 3      2.5     3067        3        2   0      134          150      1900
## 4      2.5     3067        3        2   1       94           NA        NA
## 5      2.5     3067        3        1   2      120          142      2014
## 6      2.5     3067        2        1   0      181           NA        NA
##   CouncilArea Lattitude Longtitude            Regionname Propertycount
## 1       Yarra  -37.7996   144.9984 Northern Metropolitan          4019
## 2       Yarra  -37.8079   144.9934 Northern Metropolitan          4019
## 3       Yarra  -37.8093   144.9944 Northern Metropolitan          4019
## 4       Yarra  -37.7969   144.9969 Northern Metropolitan          4019
## 5       Yarra  -37.8072   144.9941 Northern Metropolitan          4019
## 6       Yarra  -37.8041   144.9953 Northern Metropolitan          4019
# Last six rows of the raw data
tail(datos, n = 6L)
##              Suburb       Address Rooms Type   Price Method  SellerG       Date
## 13575   Westmeadows    9 Black St     3    h  582000      S      Red 26/08/2017
## 13576 Wheelers Hill  12 Strada Cr     4    h 1245000      S    Barry 26/08/2017
## 13577  Williamstown 77 Merrett Dr     3    h 1031000     SP Williams 26/08/2017
## 13578  Williamstown   83 Power St     3    h 1170000      S    Raine 26/08/2017
## 13579  Williamstown  96 Verdon St     4    h 2500000     PI  Sweeney 26/08/2017
## 13580    Yarraville    6 Agnes St     4    h 1285000     SP  Village 26/08/2017
##       Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 13575     16.5     3049        3        2   2      256           NA        NA
## 13576     16.7     3150        4        2   2      652           NA      1981
## 13577      6.8     3016        3        2   2      333          133      1995
## 13578      6.8     3016        3        2   4      436           NA      1997
## 13579      6.8     3016        4        1   5      866          157      1920
## 13580      6.3     3013        4        1   1      362          112      1920
##       CouncilArea Lattitude Longtitude                 Regionname Propertycount
## 13575             -37.67917   144.8939      Northern Metropolitan          2474
## 13576             -37.90562   145.1676 South-Eastern Metropolitan          7392
## 13577             -37.85927   144.8790       Western Metropolitan          6380
## 13578             -37.85274   144.8874       Western Metropolitan          6380
## 13579             -37.85908   144.8930       Western Metropolitan          6380
## 13580             -37.81188   144.8845       Western Metropolitan          6543
# Describe the data with str() and summary()
str(object = datos)
## 'data.frame':    13580 obs. of  21 variables:
##  $ Suburb       : Factor w/ 314 levels "Abbotsford","Aberfeldie",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Address      : Factor w/ 13378 levels "1 Adelle Ct",..: 12795 5944 9815 9005 10590 2196 2143 13336 11083 1091 ...
##  $ Rooms        : int  2 2 3 3 4 2 3 2 1 2 ...
##  $ Type         : Factor w/ 3 levels "h","t","u": 1 1 1 1 1 1 1 1 3 1 ...
##  $ Price        : num  1480000 1035000 1465000 850000 1600000 ...
##  $ Method       : Factor w/ 5 levels "PI","S","SA",..: 2 2 4 1 5 2 2 2 2 2 ...
##  $ SellerG      : Factor w/ 268 levels "@Realty","Abercromby's",..: 24 24 24 24 165 114 165 165 24 24 ...
##  $ Date         : Factor w/ 58 levels "1/07/2017","10/09/2016",..: 46 48 49 49 50 53 53 57 57 57 ...
##  $ Distance     : num  2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ Postcode     : num  3067 3067 3067 3067 3067 ...
##  $ Bedroom2     : num  2 2 3 3 3 2 4 2 1 3 ...
##  $ Bathroom     : num  1 1 2 2 1 1 2 1 1 1 ...
##  $ Car          : num  1 0 0 1 2 0 0 2 1 2 ...
##  $ Landsize     : num  202 156 134 94 120 181 245 256 0 220 ...
##  $ BuildingArea : num  NA 79 150 NA 142 NA 210 107 NA 75 ...
##  $ YearBuilt    : num  NA 1900 1900 NA 2014 ...
##  $ CouncilArea  : Factor w/ 34 levels "","Banyule","Bayside",..: 33 33 33 33 33 33 33 33 33 33 ...
##  $ Lattitude    : num  -37.8 -37.8 -37.8 -37.8 -37.8 ...
##  $ Longtitude   : num  145 145 145 145 145 ...
##  $ Regionname   : Factor w/ 8 levels "Eastern Metropolitan",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Propertycount: num  4019 4019 4019 4019 4019 ...
# Per-column summary statistics (level counts for factor columns)
summary(object = datos)
##             Suburb                  Address          Rooms        Type    
##  Reservoir     :  359   1/1 Clarendon St:    3   Min.   : 1.000   h:9449  
##  Richmond      :  260   13 Robinson St  :    3   1st Qu.: 2.000   t:1114  
##  Bentleigh East:  249   14 Arthur St    :    3   Median : 3.000   u:3017  
##  Preston       :  239   2 Bruce St      :    3   Mean   : 2.938           
##  Brunswick     :  222   28 Blair St     :    3   3rd Qu.: 3.000           
##  Essendon      :  220   36 Aberfeldie St:    3   Max.   :10.000           
##  (Other)       :12031   (Other)         :13562                            
##      Price         Method             SellerG             Date      
##  Min.   :  85000   PI:1564   Nelson       :1565   27/05/2017:  473  
##  1st Qu.: 650000   S :9022   Jellis       :1316   3/06/2017 :  395  
##  Median : 903000   SA:  92   hockingstuart:1167   12/08/2017:  387  
##  Mean   :1075684   SP:1703   Barry        :1011   17/06/2017:  374  
##  3rd Qu.:1330000   VB:1199   Ray          : 701   27/11/2016:  362  
##  Max.   :9000000             Marshall     : 659   29/07/2017:  341  
##                              (Other)      :7161   (Other)   :11248  
##     Distance        Postcode       Bedroom2         Bathroom    
##  Min.   : 0.00   Min.   :3000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.: 6.10   1st Qu.:3044   1st Qu.: 2.000   1st Qu.:1.000  
##  Median : 9.20   Median :3084   Median : 3.000   Median :1.000  
##  Mean   :10.14   Mean   :3105   Mean   : 2.915   Mean   :1.534  
##  3rd Qu.:13.00   3rd Qu.:3148   3rd Qu.: 3.000   3rd Qu.:2.000  
##  Max.   :48.10   Max.   :3977   Max.   :20.000   Max.   :8.000  
##                                                                 
##       Car           Landsize         BuildingArea     YearBuilt   
##  Min.   : 0.00   Min.   :     0.0   Min.   :    0   Min.   :1196  
##  1st Qu.: 1.00   1st Qu.:   177.0   1st Qu.:   93   1st Qu.:1940  
##  Median : 2.00   Median :   440.0   Median :  126   Median :1970  
##  Mean   : 1.61   Mean   :   558.4   Mean   :  152   Mean   :1965  
##  3rd Qu.: 2.00   3rd Qu.:   651.0   3rd Qu.:  174   3rd Qu.:1999  
##  Max.   :10.00   Max.   :433014.0   Max.   :44515   Max.   :2018  
##  NA's   :62                         NA's   :6450    NA's   :5375  
##         CouncilArea     Lattitude        Longtitude   
##               :1369   Min.   :-38.18   Min.   :144.4  
##  Moreland     :1163   1st Qu.:-37.86   1st Qu.:144.9  
##  Boroondara   :1160   Median :-37.80   Median :145.0  
##  Moonee Valley: 997   Mean   :-37.81   Mean   :145.0  
##  Darebin      : 934   3rd Qu.:-37.76   3rd Qu.:145.1  
##  Glen Eira    : 848   Max.   :-37.41   Max.   :145.5  
##  (Other)      :7109                                   
##                       Regionname   Propertycount  
##  Southern Metropolitan     :4695   Min.   :  249  
##  Northern Metropolitan     :3890   1st Qu.: 4380  
##  Western Metropolitan      :2948   Median : 6555  
##  Eastern Metropolitan      :1471   Mean   : 7454  
##  South-Eastern Metropolitan: 450   3rd Qu.:10331  
##  Eastern Victoria          :  53   Max.   :21650  
##  (Other)                   :  73
# Price as a function of Rooms + Distance
modelo <- lm(formula = Price ~ Rooms + Distance, data = datos)
print(modelo)
## 
## Call:
## lm(formula = Price ~ Rooms + Distance, data = datos)
## 
## Coefficients:
## (Intercept)        Rooms     Distance  
##      277453       398697       -36807
# Coefficient table, residual standard error and R-squared of the fit
summary(object = modelo)
## 
## Call:
## lm(formula = Price ~ Rooms + Distance, data = datos)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2919059  -309722   -93892   198144  8218424 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 277453.1    14787.5   18.76   <2e-16 ***
## Rooms       398696.7     4839.2   82.39   <2e-16 ***
## Distance    -36806.8      788.1  -46.70   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 515100 on 13577 degrees of freedom
## Multiple R-squared:  0.3509, Adjusted R-squared:  0.3508 
## F-statistic:  3670 on 2 and 13577 DF,  p-value: < 2.2e-16
# Plot: scatterplot matrix of the response against both predictors used in
# the Price ~ Rooms + Distance model, so each relationship can be inspected.
pairs(datos[, c("Price", "Rooms", "Distance")])

#INTERPRETACION
#Con el modelo lineal múltiple generado con dos variables (Rooms y Distance), se observa en summary() que Rooms y Distance solo explican el 35% de la variabilidad del precio, además de que el error estándar residual es muy alto:
#Multiple R-squared: 0.3509, Adjusted R-squared: 0.3508
#Residual standard error: $ 515,100, por lo cual no convence.
#Veremos que sucede con un modelo en donde participen todas las variables
#Sin embargo, un buen principio es identificar las correlaciones entre las variables numéricas (cuantitativas) del conjunto de datos y, tentativamente, tomar las correlaciones cercanas a 1 o a -1
#Variable dependiente: Price; es la que vamos a predecir
#Variables independientes: Rooms + Distance + Bedroom2 + Bathroom + Car + Landsize + BuildingArea + YearBuilt + Propertycount
#Propertycount significa … Número de propiedades que existen en el suburbio (Suburb) donde está la casa
#¿Qué otra variable hay que entender?

# Data set holding only the numeric variables of the original data set
datos.Num <- datos %>%
  select(
    Price, Rooms, Distance, Bedroom2, Bathroom,
    Car, Landsize, BuildingArea, YearBuilt, Propertycount
  )
head(datos.Num, n = 6L)
##     Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000     2      2.5        2        1   1      202           NA        NA
## 2 1035000     2      2.5        2        1   0      156           79      1900
## 3 1465000     3      2.5        3        2   0      134          150      1900
## 4  850000     3      2.5        3        2   1       94           NA        NA
## 5 1600000     4      2.5        3        1   2      120          142      2014
## 6  941000     2      2.5        2        1   0      181           NA        NA
##   Propertycount
## 1          4019
## 2          4019
## 3          4019
## 4          4019
## 5          4019
## 6          4019
# Structure of the numeric-only data set
str(object = datos.Num)
## 'data.frame':    13580 obs. of  10 variables:
##  $ Price        : num  1480000 1035000 1465000 850000 1600000 ...
##  $ Rooms        : int  2 2 3 3 4 2 3 2 1 2 ...
##  $ Distance     : num  2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ Bedroom2     : num  2 2 3 3 3 2 4 2 1 3 ...
##  $ Bathroom     : num  1 1 2 2 1 1 2 1 1 1 ...
##  $ Car          : num  1 0 0 1 2 0 0 2 1 2 ...
##  $ Landsize     : num  202 156 134 94 120 181 245 256 0 220 ...
##  $ BuildingArea : num  NA 79 150 NA 142 NA 210 107 NA 75 ...
##  $ YearBuilt    : num  NA 1900 1900 NA 2014 ...
##  $ Propertycount: num  4019 4019 4019 4019 4019 ...
#Depurar, limpiar los datos
#* ¿Hay algunos NA que pueden afectar al modelo?
#* Sí, en las variables BuildingArea, YearBuilt y Car
#* Primero encontrar los registros y columnas que tienen NA
#* Actualizar conforme a su mediana. ¿Por qué? Es decisión del analista; la finalidad es que los NA no afecten al modelo: es mejor que tengan un valor (la mediana) a que no tengan nada

# Medians used to fill in missing values; NAs are ignored when computing them.
# The original author notes summary(<column>)[3] as an alternative.
# NOTE(review): these medians are computed on the full data set, before the
# train/validation partition created further down.
mediana.BA <- median(datos.Num[["BuildingArea"]], na.rm = TRUE)
mediana.YB <- median(datos.Num[["YearBuilt"]], na.rm = TRUE)
mediana.C <- median(datos.Num[["Car"]], na.rm = TRUE)

# Replace the NAs with the medians using mutate()
head(datos.Num, n = 10L) # first 10 rows; NAs are visible
##      Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1  1480000     2      2.5        2        1   1      202           NA        NA
## 2  1035000     2      2.5        2        1   0      156           79      1900
## 3  1465000     3      2.5        3        2   0      134          150      1900
## 4   850000     3      2.5        3        2   1       94           NA        NA
## 5  1600000     4      2.5        3        1   2      120          142      2014
## 6   941000     2      2.5        2        1   0      181           NA        NA
## 7  1876000     3      2.5        4        2   0      245          210      1910
## 8  1636000     2      2.5        2        1   2      256          107      1890
## 9   300000     1      2.5        1        1   1        0           NA        NA
## 10 1097000     2      2.5        3        1   2      220           75      1900
##    Propertycount
## 1           4019
## 2           4019
## 3           4019
## 4           4019
## 5           4019
## 6           4019
## 7           4019
## 8           4019
## 9           4019
## 10          4019
# Fill every NA in BuildingArea, YearBuilt and Car with the median of its
# column; non-missing values are left as they are.
datos.Num <- mutate(
  datos.Num,
  BuildingArea = replace(BuildingArea, is.na(BuildingArea), mediana.BA),
  YearBuilt = replace(YearBuilt, is.na(YearBuilt), mediana.YB),
  Car = replace(Car, is.na(Car), mediana.C)
)


head(datos.Num, n = 10L) # first 10 rows; NAs are no longer present
##      Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1  1480000     2      2.5        2        1   1      202          126      1970
## 2  1035000     2      2.5        2        1   0      156           79      1900
## 3  1465000     3      2.5        3        2   0      134          150      1900
## 4   850000     3      2.5        3        2   1       94          126      1970
## 5  1600000     4      2.5        3        1   2      120          142      2014
## 6   941000     2      2.5        2        1   0      181          126      1970
## 7  1876000     3      2.5        4        2   0      245          210      1910
## 8  1636000     2      2.5        2        1   2      256          107      1890
## 9   300000     1      2.5        1        1   1        0          126      1970
## 10 1097000     2      2.5        3        1   2      220           75      1900
##    Propertycount
## 1           4019
## 2           4019
## 3           4019
## 4           4019
## 5           4019
## 6           4019
## 7           4019
## 8           4019
## 9           4019
## 10          4019
# Correlations: pairwise correlation matrix of all numeric columns
correlaciones <- cor(x = datos.Num)
print(correlaciones)
##                     Price       Rooms    Distance    Bedroom2    Bathroom
## Price          1.00000000  0.49663368 -0.16252184  0.47595103  0.46703818
## Rooms          0.49663368  1.00000000  0.29420252  0.94419027  0.59293408
## Distance      -0.16252184  0.29420252  1.00000000  0.29592676  0.12715513
## Bedroom2       0.47595103  0.94419027  0.29592676  1.00000000  0.58468549
## Bathroom       0.46703818  0.59293408  0.12715513  0.58468549  1.00000000
## Car            0.23910905  0.40693502  0.26059567  0.40386694  0.32101386
## Landsize       0.03750745  0.02567835  0.02500376  0.02564625  0.03713036
## BuildingArea   0.06976260  0.09275700  0.07396811  0.09034577  0.08771400
## YearBuilt     -0.25938724 -0.05156167  0.19481506 -0.04133120  0.11395652
## Propertycount -0.04215261 -0.08153007 -0.05491034 -0.08135034 -0.05220075
##                       Car     Landsize BuildingArea    YearBuilt Propertycount
## Price          0.23910905  0.037507450  0.069762599 -0.259387242  -0.042152615
## Rooms          0.40693502  0.025678350  0.092757000 -0.051561667  -0.081530072
## Distance       0.26059567  0.025003758  0.073968113  0.194815064  -0.054910338
## Bedroom2       0.40386694  0.025646248  0.090345775 -0.041331197  -0.081350337
## Bathroom       0.32101386  0.037130357  0.087714000  0.113956517  -0.052200750
## Car            1.00000000  0.026779687  0.068271708  0.078695800  -0.024344443
## Landsize       0.02677969  1.000000000  0.094015130  0.008805811  -0.006853942
## BuildingArea   0.06827171  0.094015130  1.000000000  0.002358526  -0.020905312
## YearBuilt      0.07869580  0.008805811  0.002358526  1.000000000   0.004420750
## Propertycount -0.02434444 -0.006853942 -0.020905312  0.004420750   1.000000000
# Draw the correlation matrix with the coefficients printed as numbers
corrplot(corr = correlaciones, method = "number")

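#* ¿Qué les dicen las correlaciones?
#* Las variables Price y Car tienen una correlación positiva débil (0.24); Landsize, BuildingArea y Propertycount prácticamente no tienen correlación con Price
#* Con respecto a Price no hay correlaciones importantes ni cercanas a 1 (positivas) ni cercanas a -1 (negativas); la mayor es con Rooms (0.50). Entre las variables independientes, Rooms y Bedroom2 están muy correlacionadas (0.94). Aun así, seguimos en busca del modelo de regresión lineal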

#Crear conjuntos de entrenamiento y conjuntos de validación
#* Ya se tienen los datos limpios, sin NAs
#* Primero se determina el 70% de los registros para entrenamiento y el 30% restante para validación
#* Se muestran con head y número de registros respectivamente
#* Los datos de entrenamiento representan el 70%
#* Los datos de validación representan el 30% restante

set.seed(seed = 2020) # seed for the random partition
# Row indices of the training partition (p = 0.7 of the rows), returned as a
# matrix rather than a list because list = FALSE.
entrena <- createDataPartition(y = datos.Num[["Price"]], p = 0.7, list = FALSE)
head(entrena, n = 6L)
##      Resample1
## [1,]         1
## [2,]         3
## [3,]         4
## [4,]         5
## [5,]         7
## [6,]         9
nrow(entrena)
## [1] 9508
# The rows that are not in entrena
head(datos.Num[-entrena, ], n = 6L)
##      Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 2  1035000     2      2.5        2        1   0      156           79      1900
## 6   941000     2      2.5        2        1   0      181          126      1970
## 8  1636000     2      2.5        2        1   2      256          107      1890
## 12 1350000     3      2.5        3        2   2      214          190      2005
## 13  750000     2      2.5        2        2   1        0           94      2009
## 20  890000     2      2.5        2        1   1      150           73      1985
##    Propertycount
## 2           4019
## 6           4019
## 8           4019
## 12          4019
## 13          4019
## 20          4019
# Number of rows left out of the training partition
nrow(datos.Num[-entrena, ])
## [1] 4072
# First six rows of the numeric-only data set
head(datos.Num, n = 6L)
##     Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000     2      2.5        2        1   1      202          126      1970
## 2 1035000     2      2.5        2        1   0      156           79      1900
## 3 1465000     3      2.5        3        2   0      134          150      1900
## 4  850000     3      2.5        3        2   1       94          126      1970
## 5 1600000     4      2.5        3        1   2      120          142      2014
## 6  941000     2      2.5        2        1   0      181          126      1970
##   Propertycount
## 1          4019
## 2          4019
## 3          4019
## 4          4019
## 5          4019
## 6          4019
# Build the training data set (rows indexed by entrena), then show its head
datos.Entrena <- datos.Num[entrena, ]
head(datos.Entrena, n = 6L)
##     Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000     2      2.5        2        1   1      202          126      1970
## 3 1465000     3      2.5        3        2   0      134          150      1900
## 4  850000     3      2.5        3        2   1       94          126      1970
## 5 1600000     4      2.5        3        1   2      120          142      2014
## 7 1876000     3      2.5        4        2   0      245          210      1910
## 9  300000     1      2.5        1        1   1        0          126      1970
##   Propertycount
## 1          4019
## 3          4019
## 4          4019
## 5          4019
## 7          4019
## 9          4019
# Summary statistics of the training data set
summary(object = datos.Entrena)
##      Price             Rooms           Distance        Bedroom2     
##  Min.   :  85000   Min.   : 1.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 650000   1st Qu.: 2.000   1st Qu.: 6.10   1st Qu.: 2.000  
##  Median : 903000   Median : 3.000   Median : 9.20   Median : 3.000  
##  Mean   :1078063   Mean   : 2.937   Mean   :10.13   Mean   : 2.911  
##  3rd Qu.:1330000   3rd Qu.: 3.000   3rd Qu.:13.00   3rd Qu.: 3.000  
##  Max.   :9000000   Max.   :10.000   Max.   :47.40   Max.   :10.000  
##     Bathroom          Car            Landsize         BuildingArea   
##  Min.   :0.000   Min.   : 0.000   Min.   :     0.0   Min.   :   0.0  
##  1st Qu.:1.000   1st Qu.: 1.000   1st Qu.:   178.0   1st Qu.: 123.0  
##  Median :1.000   Median : 2.000   Median :   443.5   Median : 126.0  
##  Mean   :1.529   Mean   : 1.613   Mean   :   579.6   Mean   : 136.8  
##  3rd Qu.:2.000   3rd Qu.: 2.000   3rd Qu.:   650.0   3rd Qu.: 129.0  
##  Max.   :8.000   Max.   :10.000   Max.   :433014.0   Max.   :6791.0  
##    YearBuilt    Propertycount  
##  Min.   :1830   Min.   :  389  
##  1st Qu.:1960   1st Qu.: 4386  
##  Median :1970   Median : 6567  
##  Mean   :1967   Mean   : 7453  
##  3rd Qu.:1975   3rd Qu.:10331  
##  Max.   :2018   Max.   :21650
# Build the validation data set (rows not indexed by entrena), then show its head
datos.Valida <- datos.Num[-entrena, ]
head(datos.Valida, n = 6L)
##      Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 2  1035000     2      2.5        2        1   0      156           79      1900
## 6   941000     2      2.5        2        1   0      181          126      1970
## 8  1636000     2      2.5        2        1   2      256          107      1890
## 12 1350000     3      2.5        3        2   2      214          190      2005
## 13  750000     2      2.5        2        2   1        0           94      2009
## 20  890000     2      2.5        2        1   1      150           73      1985
##    Propertycount
## 2           4019
## 6           4019
## 8           4019
## 12          4019
## 13          4019
## 20          4019
# Summary statistics of the validation data set
summary(object = datos.Valida)
##      Price             Rooms          Distance        Bedroom2     
##  Min.   : 170000   Min.   :1.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 649500   1st Qu.:2.000   1st Qu.: 6.20   1st Qu.: 2.000  
##  Median : 902500   Median :3.000   Median : 9.20   Median : 3.000  
##  Mean   :1070130   Mean   :2.941   Mean   :10.15   Mean   : 2.923  
##  3rd Qu.:1330000   3rd Qu.:4.000   3rd Qu.:13.00   3rd Qu.: 3.000  
##  Max.   :8000000   Max.   :8.000   Max.   :48.10   Max.   :20.000  
##     Bathroom          Car           Landsize        BuildingArea    
##  Min.   :0.000   Min.   : 0.00   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:1.000   1st Qu.: 1.00   1st Qu.:  173.0   1st Qu.:  120.0  
##  Median :1.000   Median : 2.00   Median :  435.0   Median :  126.0  
##  Mean   :1.547   Mean   : 1.61   Mean   :  508.9   Mean   :  146.2  
##  3rd Qu.:2.000   3rd Qu.: 2.00   3rd Qu.:  653.0   3rd Qu.:  130.0  
##  Max.   :8.000   Max.   :10.00   Max.   :44500.0   Max.   :44515.0  
##    YearBuilt    Propertycount  
##  Min.   :1196   Min.   :  249  
##  1st Qu.:1960   1st Qu.: 4217  
##  Median :1970   Median : 6543  
##  Mean   :1967   Mean   : 7457  
##  3rd Qu.:1972   3rd Qu.:10331  
##  Max.   :2017   Max.   :21650
# Multiple linear regression model
# * Price as a function of all the other numeric variables of the training
#   data set
modelo <- lm(formula = Price ~ ., data = datos.Entrena)
print(modelo)
## 
## Call:
## lm(formula = Price ~ ., data = datos.Entrena)
## 
## Coefficients:
##   (Intercept)          Rooms       Distance       Bedroom2       Bathroom  
##     1.031e+07      1.889e+05     -3.116e+04      3.998e+04      2.527e+05  
##           Car       Landsize   BuildingArea      YearBuilt  Propertycount  
##     6.403e+04      3.342e+00      5.647e+02     -5.159e+03     -1.143e+00
# Coefficient table, residual standard error and R-squared of the fit
summary(object = modelo)
## 
## Call:
## lm(formula = Price ~ ., data = datos.Entrena)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3627062  -274161   -81595   184748  8324301 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.031e+07  3.590e+05  28.718  < 2e-16 ***
## Rooms          1.889e+05  1.819e+04  10.387  < 2e-16 ***
## Distance      -3.116e+04  9.198e+02 -33.875  < 2e-16 ***
## Bedroom2       3.998e+04  1.805e+04   2.215  0.02675 *  
## Bathroom       2.527e+05  9.235e+03  27.365  < 2e-16 ***
## Car            6.403e+04  5.722e+03  11.190  < 2e-16 ***
## Landsize       3.342e+00  1.043e+00   3.204  0.00136 ** 
## BuildingArea   5.647e+02  4.954e+01  11.399  < 2e-16 ***
## YearBuilt     -5.159e+03  1.828e+02 -28.216  < 2e-16 ***
## Propertycount -1.143e+00  1.128e+00  -1.014  0.31072    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 478200 on 9498 degrees of freedom
## Multiple R-squared:  0.4437, Adjusted R-squared:  0.4432 
## F-statistic: 841.9 on 9 and 9498 DF,  p-value: < 2.2e-16
#Interpretación
#* No es un buen modelo
#* Residual standard error: 478200 on 9498 degrees of freedom; Multiple R-squared: 0.4437, Adjusted R-squared: 0.4432
#* La regresión lineal no es el modelo adecuado para predecir con estos datos
# Test the model with the validation data.
# `modelo` (fitted on datos.Entrena) predicts the validation rows and the
# predictions are compared with the observed prices. postResample() from
# caret reports RMSE, R-squared and MAE of those predictions.
predicciones <- predict(modelo, newdata = datos.Valida)
postResample(pred = predicciones, obs = datos.Valida$Price)

# Separate fit on the validation rows, kept only for comparison with the
# training fit. It is stored under its own name so the trained model in
# `modelo` is not overwritten.
modelo.Valida <- lm(Price ~ ., datos.Valida)
modelo.Valida
## 
## Call:
## lm(formula = Price ~ ., data = datos.Valida)
## 
## Coefficients:
##   (Intercept)          Rooms       Distance       Bedroom2       Bathroom  
##     9.056e+06      2.323e+05     -3.189e+04      2.120e+04      2.502e+05  
##           Car       Landsize   BuildingArea      YearBuilt  Propertycount  
##     4.811e+04      2.836e+01     -7.488e+00     -4.511e+03     -1.789e+00
summary(modelo.Valida)
## 
## Call:
## lm(formula = Price ~ ., data = datos.Valida)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3209068  -265850   -74331   190050  5434862 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.056e+06  4.884e+05  18.542  < 2e-16 ***
## Rooms          2.323e+05  1.950e+04  11.913  < 2e-16 ***
## Distance      -3.189e+04  1.369e+03 -23.293  < 2e-16 ***
## Bedroom2       2.120e+04  1.846e+04   1.148 0.250936    
## Bathroom       2.502e+05  1.344e+04  18.618  < 2e-16 ***
## Car            4.811e+04  8.610e+03   5.588 2.45e-08 ***
## Landsize       2.836e+01  7.860e+00   3.609 0.000312 ***
## BuildingArea  -7.488e+00  1.307e+01  -0.573 0.566616    
## YearBuilt     -4.511e+03  2.480e+02 -18.187  < 2e-16 ***
## Propertycount -1.789e+00  1.681e+00  -1.065 0.287092    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 469900 on 4062 degrees of freedom
## Multiple R-squared:  0.4546, Adjusted R-squared:  0.4533 
## F-statistic: 376.1 on 9 and 4062 DF,  p-value: < 2.2e-16

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.