TP 1: Analisis exploratorio e Introduccion a Regresión lineal

Datos provistos por Properati Argentina

ALUMNO: Carlos Arana

Preparacion de los datos (I)

Leer el archivo ar_properties.csv y mostrar su estructura
Quedarse con aquellos registros que:

i.   Pertenecen a Argentina y Capital Federal
ii.  Cuyo precio esta en dolares (USD)
iii. El tipo de propiedad sea: Departamento, PH o Casa
iv.  El tipo de operacion sea Venta

library(tidyr)
library(tidyverse)

## -- Attaching packages ------------------------------------------------------------ tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v readr   1.3.1     v stringr 1.4.0
## v ggplot2 3.2.1     v forcats 0.4.0

## -- Conflicts --------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
library(ggplot2)


#############################
# Pto a)
#############################

pr = read_csv('C:/Users/charly/Desktop/FCEyN- MAESTRIA/Materia - ENFOQUE ESTADISTICO DEL APRENDIZAJE/trabajos_practicos/TP-1/ar_properties.csv')

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   start_date = col_date(format = ""),
##   end_date = col_date(format = ""),
##   created_on = col_date(format = ""),
##   lat = col_double(),
##   lon = col_double(),
##   l6 = col_logical(),
##   rooms = col_double(),
##   bedrooms = col_double(),
##   bathrooms = col_double(),
##   surface_total = col_double(),
##   surface_covered = col_double(),
##   price = col_double()
## )

## See spec(...) for full column specifications.

glimpse(pr)

## Observations: 388,891
## Variables: 24
## $ id              <chr> "S0we3z3V2JpHUJreqQ2t/w==", "kMxcmAS8NvrynGBVb...
## $ ad_type         <chr> "Propiedad", "Propiedad", "Propiedad", "Propie...
## $ start_date      <date> 2019-04-14, 2019-04-14, 2019-04-14, 2019-04-1...
## $ end_date        <date> 2019-06-14, 2019-04-16, 9999-12-31, 9999-12-3...
## $ created_on      <date> 2019-04-14, 2019-04-14, 2019-04-14, 2019-04-1...
## $ lat             <dbl> -34.94331, -34.63181, NA, -34.65471, -34.65495...
## $ lon             <dbl> -54.92966, -58.42060, NA, -58.79089, -58.78712...
## $ l1              <chr> "Uruguay", "Argentina", "Argentina", "Argentin...
## $ l2              <chr> "Maldonado", "Capital Federal", "Bs.As. G.B.A....
## $ l3              <chr> "Punta del Este", "Boedo", NA, "Moreno", "More...
## $ l4              <chr> NA, NA, NA, "Moreno", "Moreno", NA, "Ituzaingó...
## $ l5              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ l6              <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ rooms           <dbl> 2, NA, 2, 2, 2, 4, NA, 6, NA, NA, NA, NA, NA, ...
## $ bedrooms        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ bathrooms       <dbl> 1, NA, 1, 2, 3, 1, 3, 3, NA, NA, NA, NA, NA, N...
## $ surface_total   <dbl> 45, NA, 200, 460, 660, NA, 70, NA, 1300, 405, ...
## $ surface_covered <dbl> 40, NA, NA, 100, 148, 89, 122, NA, NA, NA, NA,...
## $ price           <dbl> 13000, 0, NA, NA, NA, NA, NA, NA, 0, NA, 0, NA...
## $ currency        <chr> "UYU", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ price_period    <chr> "Mensual", "Mensual", NA, "Mensual", "Mensual"...
## $ title           <chr> "Departamento - Roosevelt", "PH - Boedo", "Itu...
## $ property_type   <chr> "Departamento", "PH", "Casa", "Casa", "Casa", ...
## $ operation_type  <chr> "Alquiler", "Venta", "Alquiler", "Venta", "Ven...

#############################
# Pto b)
#############################

pr<- pr %>% 
  filter (l1=="Argentina" , l2=="Capital Federal", currency=="USD", 
          property_type=="Departamento" | property_type=="PH" | property_type=="Casa",
          operation_type=="Venta") %>%
  select(id, l3, rooms, bedrooms, bathrooms, surface_total, surface_covered, price, property_type)

glimpse(pr)

## Observations: 61,905
## Variables: 9
## $ id              <chr> "oyj+f764ALCYodIqBvWAww==", "HdjpKrqdwYfH9YU1D...
## $ l3              <chr> "Barracas", "Boedo", "Palermo", "Belgrano", "V...
## $ rooms           <dbl> NA, 6, NA, 3, NA, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ bedrooms        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ bathrooms       <dbl> NA, 2, 2, 4, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ surface_total   <dbl> 300, 178, 240, 157, 140, 95, 44, 40, 49, 40, 4...
## $ surface_covered <dbl> 180, 240, 157, NA, 110, 69, 38, 37, 44, 37, 37...
## $ price           <dbl> 320000, 500000, 350000, 470000, 155000, 199900...
## $ property_type   <chr> "PH", "Casa", "Casa", "Casa", "Casa", "Casa", ...

Analisis exploratorios (I)

Obtener la cantidad de valores unicos y de valores faltantes (NAs) para cada una de estas variables
Obtener la matriz de correlacion para las variables numericas. Pista: usen ‘complete.obs’ para poder omitir los valores faltantes.

#############################
# Pto a)
#############################

#  con RBase

Variable=c("id","l3", "rooms", "bedrooms", "bathrooms", "surface_total",        "surface_covered", "price", "property_type")

Cant_Valores_unicos=c(length(unique(pr$id)),length(unique(pr$l3)),
                      length(unique(pr$rooms)),length(unique(pr$bedrooms)),
                      length(unique(pr$bathrooms)),length(unique(pr$surface_total)),
                      length(unique(pr$surface_covered)), length(unique(pr$price)),                                   length(unique(pr$property_type)))

Cant_NA=c(sum(is.na(pr$id)),sum(is.na(pr$l3)),sum(is.na(pr$rooms)),sum(is.na(pr$bedrooms)),
sum(is.na(pr$bathrooms)),sum(is.na(pr$surface_total)),sum(is.na(pr$surface_covered)),sum(is.na(pr$price)),sum(is.na(pr$property_type)))

data.frame(Variable=Variable, Val_Unicos=Cant_Valores_unicos, Cant_NA=Cant_NA)

#  con Tidyverse

Variable=c("id","l3", "rooms", "bedrooms", "bathrooms", "surface_total",        "surface_covered", "price", "property_type")

Cant_Valores_unicos=unlist(map(.x=pr, function(x) length(unique(x))))

Cant_NA=unlist(map(.x=pr, function(x) sum(is.na(x))))


data.frame(Variable=Variable, Val_Unicos=Cant_Valores_unicos, Cant_NA=Cant_NA)

#############################
# Pto b)
#############################
install.packages("corrr", dependencies = TRUE, repos = 'http://cran.rstudio.com/')

## Installing package into 'C:/Users/charly/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)

## also installing the dependencies 'htmlwidgets', 'praise', 'blob', 'memoise', 'config', 'forge', 'r2d3', 'rappdirs', 'rprojroot', 'rex', 'testthat', 'RSQLite', 'sparklyr', 'covr'

## package 'htmlwidgets' successfully unpacked and MD5 sums checked
## package 'praise' successfully unpacked and MD5 sums checked
## package 'blob' successfully unpacked and MD5 sums checked
## package 'memoise' successfully unpacked and MD5 sums checked
## package 'config' successfully unpacked and MD5 sums checked
## package 'forge' successfully unpacked and MD5 sums checked
## package 'r2d3' successfully unpacked and MD5 sums checked
## package 'rappdirs' successfully unpacked and MD5 sums checked
## package 'rprojroot' successfully unpacked and MD5 sums checked
## package 'rex' successfully unpacked and MD5 sums checked
## package 'testthat' successfully unpacked and MD5 sums checked
## package 'RSQLite' successfully unpacked and MD5 sums checked
## package 'sparklyr' successfully unpacked and MD5 sums checked
## package 'covr' successfully unpacked and MD5 sums checked
## package 'corrr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\charly\AppData\Local\Temp\RtmpuOsXyK\downloaded_packages

library(corrr)
library(tidyverse)

var_numericas = colnames(pr)[unlist(map(.x=pr, .f=is.numeric))]

pr %>% select(var_numericas)%>% correlate(use = "complete.obs")

## 
## Correlation method: 'pearson'
## Missing treated using: 'complete.obs'

Preparacion de los datos (II)

En el punto 2 deberian haber encontrado que la variable bedrooms presenta una alta proporción de valores faltantes y que presenta una fuerte correlacion con la variable rooms. Por lo tanto, vamos a eliminarla. Eliminar todos los registros que presentan valores faltantes

pr <- select(pr, -bedrooms)

pr <- na.omit(pr)

glimpse(pr)

## Observations: 51,210
## Variables: 8
## $ id              <chr> "HdjpKrqdwYfH9YU1DKjltg==", "AfdcsqUSelai1ofCA...
## $ l3              <chr> "Boedo", "Velez Sarsfield", "Nuñez", "Almagro"...
## $ rooms           <dbl> 6, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ bathrooms       <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ surface_total   <dbl> 178, 95, 44, 40, 49, 40, 40, 40, 49, 40, 23, 4...
## $ surface_covered <dbl> 240, 69, 38, 37, 44, 37, 37, 37, 44, 37, 23, 3...
## $ price           <dbl> 500000, 199900, 147000, 92294, 115000, 77000, ...
## $ property_type   <chr> "Casa", "Casa", "Departamento", "Departamento"...

Analisis exploratorios (II)

Obtener estadisticas descriptivas para la variable precio (cuartiles, promedio, minimo y maximo) y realizar un histograma de la variable
Obtener estadisticas descriptivas para la variable precio (cuartiles, promedio, minimo y maximo) por cada tipo de propiedad.
Realizar un grafico de boxplot de la variable precio por tipo de propiedad
Realizar un correlagrama usando GGAlly

#############################
# Pto a)
#############################

summary(pr$price)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6000  119000  170000  251577  270000 6000000

#############################
# Pto b)
#############################

pr %>% group_by(property_type) %>% summarize(Q1=quantile(price,probs = 0.25))

pr %>% group_by(property_type) %>% summarize(Q3=quantile(price,probs = 0.75))

pr %>% group_by(property_type) %>% summarize(media=mean(price))

pr %>% group_by(property_type) %>% summarize(min=min(price))

pr %>% group_by(property_type) %>% summarize(max=max(price))

#############################
# Pto c)
#############################

ggplot(pr, aes(x = property_type, y = price, group = property_type, fill = property_type )) +
geom_boxplot()

#############################
# Pto d)
#############################

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

## 
## Attaching package: 'GGally'

## The following object is masked from 'package:dplyr':
## 
##     nasa

pr %>% select(-c(id, l3)) %>% ggpairs(mapping = aes(color = property_type))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Outliers
1. Eliminar los outliers de la variable precio con algún criterio que elijan: puede ser por valores de corte, eliminar el x% a izquierda y derecha,etc.

# Como se puede observar este hoistograma no nos aprta mucha información visulamente por la importante presencia de valorews extremos , poir eso empezaré a seleccionar los valores de precio en función de un análisisa bo0xplot

#  veo como queda el boxplot
ggplot(pr, aes( y = price)) +
geom_boxplot()

#  quedan muichísima sobservaciones afuera

#  Cálculo de Q1 y Q3
Q1 =quantile(pr$price,probs = 0.25)
Q3=quantile(pr$price,probs = 0.75)
IQR=Q3-Q1


#  armno un variavble que multiplique al IQR, y le asigno el valor por default de Tukey, 1.5
mult_IQR=1.5



# para el claculo de loslimites  chuequeo que no sean mayores a los minimos y max respect
L_inf = ifelse(Q1-mult_IQR*IQR > min(pr$price),Q1-mult_IQR*IQR, min(pr$price))
L_sup = ifelse(Q3+mult_IQR*IQR < max(pr$price),Q3+mult_IQR*IQR,max(pr$price)) 



#  veo el histograma de la variable original
ggplot(pr, aes(x=price))+
  geom_histogram(binwidth = 100000) + geom_vline(aes(xintercept=L_inf),color="blue", linetype="dashed", size=1) + geom_vline(aes(xintercept=L_sup),color="blue", linetype="dashed", size=1)

# veo quñe cuantil es
sum(pr$price>L_sup)/nrow(pr)

## [1] 0.08314782

# como se pobservar el histograma es bastante  sesgado a derecha, por lo que voy a armar un nuevo dataset que exlcuya valores extremso selecionando u intervalo mayor al de Tukey
mult_IQR=4
L_sup_2 = ifelse(Q3+mult_IQR*IQR<max(pr$price),Q3+mult_IQR*IQR, max(pr$price)) 

pr_bplot <- pr %>% filter(price>L_inf, price<L_sup_2)

#  veo el histograma de la variable sin outliers (los superioires a L_sup_2)
ggplot(pr_bplot, aes(x=price))+
  geom_histogram(binwidth = 10000) + geom_vline(aes(xintercept=L_inf),color="blue", linetype="dashed", size=1) + geom_vline(aes(xintercept=L_sup),color="blue", linetype="dashed", size=1)

# veo quñe cuantil es
sum(pr$price>L_sup_2)/nrow(pr)

## [1] 0.02956454

Analisis exploratorios (III)
1. Repetir los 4 análisis exploratorios realizados en el punto 4 y realizar unos breves comentarios sobre los cambios que encontraron

#############################
# Pto a)
#############################

summary(pr_bplot$price)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7500  117000  166000  212879  260000  870000

# Relaizao el histograma con todos los valores de la variable precio
ggplot(pr_bplot, aes(x=price))+
  geom_histogram(binwidth = 10000)

#############################
# Pto b)
#############################

pr_bplot %>% group_by(property_type) %>% summarize(Q1=quantile(price,probs = 0.25))

pr_bplot %>% group_by(property_type) %>% summarize(Q3=quantile(price,probs = 0.75))

pr_bplot %>% group_by(property_type) %>% summarize(media=mean(price))

pr_bplot %>% group_by(property_type) %>% summarize(min=min(price))

pr_bplot %>% group_by(property_type) %>% summarize(max=max(price))

#############################
# Pto c)
#############################

ggplot(pr, aes(x = property_type, y = price, group = property_type, fill = property_type )) +
geom_boxplot()

7)Modelo lineal

a) Realizar un modelo lineal simple para explicar el precio en función de las habitaciones (rooms) y otro modelo que explique el precio en función de la superficie total (surface_total)
b)Usar la función summary() para obtener informacion de ambos modelos. Explicar los valores de los coeficientes estimados.
c)¿Cuál modelo usarían para predecir el precio? ¿Por qué?

#############################
# Pto a)
#############################
glimpse(pr_bplot)

## Observations: 49,693
## Variables: 8
## $ id              <chr> "HdjpKrqdwYfH9YU1DKjltg==", "AfdcsqUSelai1ofCA...
## $ l3              <chr> "Boedo", "Velez Sarsfield", "Nuñez", "Almagro"...
## $ rooms           <dbl> 6, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ bathrooms       <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ surface_total   <dbl> 178, 95, 44, 40, 49, 40, 40, 40, 49, 40, 23, 4...
## $ surface_covered <dbl> 240, 69, 38, 37, 44, 37, 37, 37, 44, 37, 23, 3...
## $ price           <dbl> 500000, 199900, 147000, 92294, 115000, 77000, ...
## $ property_type   <chr> "Casa", "Casa", "Departamento", "Departamento"...

lm_rooms = lm(price ~ rooms, data=pr_bplot)
lm_surface = lm(price ~ surface_total  ,data=pr_bplot)

#############################
# Pto b)
#############################

# Info modelos lm_rooms
summary(lm_rooms)

## 
## Call:
## lm(formula = price ~ rooms, data = pr_bplot)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1551259   -64478   -17378    30622   705499 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  31919.6     1162.8   27.45   <2e-16 ***
## rooms        66229.3      383.2  172.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 112800 on 49691 degrees of freedom
## Multiple R-squared:  0.3754, Adjusted R-squared:  0.3754 
## F-statistic: 2.987e+04 on 1 and 49691 DF,  p-value: < 2.2e-16

# Info modelos lm_rooms
summary(lm_surface)

## 
## Call:
## lm(formula = price ~ surface_total, data = pr_bplot)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1213783   -95429   -46651    47021   656828 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.120e+05  6.435e+02  329.38   <2e-16 ***
## surface_total 9.653e+00  7.744e-01   12.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 142500 on 49691 degrees of freedom
## Multiple R-squared:  0.003117,   Adjusted R-squared:  0.003097 
## F-statistic: 155.4 on 1 and 49691 DF,  p-value: < 2.2e-16

#############################
# Pto c)
#############################

# Elegiría el modelo ajustado utilizando la variable explicativa "rooms". Ambos modelos tienen una estimación de coeficiente Beta_1 rechaza la hipótesis nula de que su valor es 0 (lo que indicaría que no hay relación lineal )entre las varoables) con un p_value extremadamente bajo, peroporcentaje de la el modelo ajustado con "rooms" posee un R2 mayor, indicando una mejor calidad del ajuste, medida en términos cuanto "explica" la variación en la cantidad de habitaciones la variabilidad lineal del precio.