Datos provistos por Properati Argentina
ALUMNO: Carlos Arana
i. Pertenecen a Argentina y Capital Federal
ii. Cuyo precio esta en dolares (USD)
iii. El tipo de propiedad sea: Departamento, PH o Casa
iv. El tipo de operacion sea Venta
library(tidyr)
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v readr 1.3.1 v stringr 1.4.0
## v ggplot2 3.2.1 v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
#############################
# Pto a)
#############################
pr = read_csv('C:/Users/charly/Desktop/FCEyN- MAESTRIA/Materia - ENFOQUE ESTADISTICO DEL APRENDIZAJE/trabajos_practicos/TP-1/ar_properties.csv')
## Parsed with column specification:
## cols(
## .default = col_character(),
## start_date = col_date(format = ""),
## end_date = col_date(format = ""),
## created_on = col_date(format = ""),
## lat = col_double(),
## lon = col_double(),
## l6 = col_logical(),
## rooms = col_double(),
## bedrooms = col_double(),
## bathrooms = col_double(),
## surface_total = col_double(),
## surface_covered = col_double(),
## price = col_double()
## )
## See spec(...) for full column specifications.
glimpse(pr)
## Observations: 388,891
## Variables: 24
## $ id <chr> "S0we3z3V2JpHUJreqQ2t/w==", "kMxcmAS8NvrynGBVb...
## $ ad_type <chr> "Propiedad", "Propiedad", "Propiedad", "Propie...
## $ start_date <date> 2019-04-14, 2019-04-14, 2019-04-14, 2019-04-1...
## $ end_date <date> 2019-06-14, 2019-04-16, 9999-12-31, 9999-12-3...
## $ created_on <date> 2019-04-14, 2019-04-14, 2019-04-14, 2019-04-1...
## $ lat <dbl> -34.94331, -34.63181, NA, -34.65471, -34.65495...
## $ lon <dbl> -54.92966, -58.42060, NA, -58.79089, -58.78712...
## $ l1 <chr> "Uruguay", "Argentina", "Argentina", "Argentin...
## $ l2 <chr> "Maldonado", "Capital Federal", "Bs.As. G.B.A....
## $ l3 <chr> "Punta del Este", "Boedo", NA, "Moreno", "More...
## $ l4 <chr> NA, NA, NA, "Moreno", "Moreno", NA, "Ituzaingó...
## $ l5 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ l6 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ rooms <dbl> 2, NA, 2, 2, 2, 4, NA, 6, NA, NA, NA, NA, NA, ...
## $ bedrooms <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ bathrooms <dbl> 1, NA, 1, 2, 3, 1, 3, 3, NA, NA, NA, NA, NA, N...
## $ surface_total <dbl> 45, NA, 200, 460, 660, NA, 70, NA, 1300, 405, ...
## $ surface_covered <dbl> 40, NA, NA, 100, 148, 89, 122, NA, NA, NA, NA,...
## $ price <dbl> 13000, 0, NA, NA, NA, NA, NA, NA, 0, NA, 0, NA...
## $ currency <chr> "UYU", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ price_period <chr> "Mensual", "Mensual", NA, "Mensual", "Mensual"...
## $ title <chr> "Departamento - Roosevelt", "PH - Boedo", "Itu...
## $ property_type <chr> "Departamento", "PH", "Casa", "Casa", "Casa", ...
## $ operation_type <chr> "Alquiler", "Venta", "Alquiler", "Venta", "Ven...
#############################
# Pto b)
#############################
pr<- pr %>%
filter (l1=="Argentina" , l2=="Capital Federal", currency=="USD",
property_type=="Departamento" | property_type=="PH" | property_type=="Casa",
operation_type=="Venta") %>%
select(id, l3, rooms, bedrooms, bathrooms, surface_total, surface_covered, price, property_type)
glimpse(pr)
## Observations: 61,905
## Variables: 9
## $ id <chr> "oyj+f764ALCYodIqBvWAww==", "HdjpKrqdwYfH9YU1D...
## $ l3 <chr> "Barracas", "Boedo", "Palermo", "Belgrano", "V...
## $ rooms <dbl> NA, 6, NA, 3, NA, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ bedrooms <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ bathrooms <dbl> NA, 2, 2, 4, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ surface_total <dbl> 300, 178, 240, 157, 140, 95, 44, 40, 49, 40, 4...
## $ surface_covered <dbl> 180, 240, 157, NA, 110, 69, 38, 37, 44, 37, 37...
## $ price <dbl> 320000, 500000, 350000, 470000, 155000, 199900...
## $ property_type <chr> "PH", "Casa", "Casa", "Casa", "Casa", "Casa", ...
#############################
# Pto a)
#############################
# con RBase
Variable=c("id","l3", "rooms", "bedrooms", "bathrooms", "surface_total", "surface_covered", "price", "property_type")
Cant_Valores_unicos=c(length(unique(pr$id)),length(unique(pr$l3)),
length(unique(pr$rooms)),length(unique(pr$bedrooms)),
length(unique(pr$bathrooms)),length(unique(pr$surface_total)),
length(unique(pr$surface_covered)), length(unique(pr$price)), length(unique(pr$property_type)))
Cant_NA=c(sum(is.na(pr$id)),sum(is.na(pr$l3)),sum(is.na(pr$rooms)),sum(is.na(pr$bedrooms)),
sum(is.na(pr$bathrooms)),sum(is.na(pr$surface_total)),sum(is.na(pr$surface_covered)),sum(is.na(pr$price)),sum(is.na(pr$property_type)))
data.frame(Variable=Variable, Val_Unicos=Cant_Valores_unicos, Cant_NA=Cant_NA)
# con Tidyverse
Variable=c("id","l3", "rooms", "bedrooms", "bathrooms", "surface_total", "surface_covered", "price", "property_type")
Cant_Valores_unicos=unlist(map(.x=pr, function(x) length(unique(x))))
Cant_NA=unlist(map(.x=pr, function(x) sum(is.na(x))))
data.frame(Variable=Variable, Val_Unicos=Cant_Valores_unicos, Cant_NA=Cant_NA)
#############################
# Pto b)
#############################
install.packages("corrr", dependencies = TRUE, repos = 'http://cran.rstudio.com/')
## Installing package into 'C:/Users/charly/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## also installing the dependencies 'htmlwidgets', 'praise', 'blob', 'memoise', 'config', 'forge', 'r2d3', 'rappdirs', 'rprojroot', 'rex', 'testthat', 'RSQLite', 'sparklyr', 'covr'
## package 'htmlwidgets' successfully unpacked and MD5 sums checked
## package 'praise' successfully unpacked and MD5 sums checked
## package 'blob' successfully unpacked and MD5 sums checked
## package 'memoise' successfully unpacked and MD5 sums checked
## package 'config' successfully unpacked and MD5 sums checked
## package 'forge' successfully unpacked and MD5 sums checked
## package 'r2d3' successfully unpacked and MD5 sums checked
## package 'rappdirs' successfully unpacked and MD5 sums checked
## package 'rprojroot' successfully unpacked and MD5 sums checked
## package 'rex' successfully unpacked and MD5 sums checked
## package 'testthat' successfully unpacked and MD5 sums checked
## package 'RSQLite' successfully unpacked and MD5 sums checked
## package 'sparklyr' successfully unpacked and MD5 sums checked
## package 'covr' successfully unpacked and MD5 sums checked
## package 'corrr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\charly\AppData\Local\Temp\RtmpuOsXyK\downloaded_packages
library(corrr)
library(tidyverse)
var_numericas = colnames(pr)[unlist(map(.x=pr, .f=is.numeric))]
pr %>% select(var_numericas)%>% correlate(use = "complete.obs")
##
## Correlation method: 'pearson'
## Missing treated using: 'complete.obs'
Preparacion de los datos (II)
En el punto 2 deberian haber encontrado que la variable bedrooms presenta una alta proporción de valores faltantes y que presenta una fuerte correlacion con la variable rooms. Por lo tanto, vamos a eliminarla. Eliminar todos los registros que presentan valores faltantes
pr <- select(pr, -bedrooms)
pr <- na.omit(pr)
glimpse(pr)
## Observations: 51,210
## Variables: 8
## $ id <chr> "HdjpKrqdwYfH9YU1DKjltg==", "AfdcsqUSelai1ofCA...
## $ l3 <chr> "Boedo", "Velez Sarsfield", "Nuñez", "Almagro"...
## $ rooms <dbl> 6, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ bathrooms <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ surface_total <dbl> 178, 95, 44, 40, 49, 40, 40, 40, 49, 40, 23, 4...
## $ surface_covered <dbl> 240, 69, 38, 37, 44, 37, 37, 37, 44, 37, 23, 3...
## $ price <dbl> 500000, 199900, 147000, 92294, 115000, 77000, ...
## $ property_type <chr> "Casa", "Casa", "Departamento", "Departamento"...
#############################
# Pto a)
#############################
summary(pr$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6000 119000 170000 251577 270000 6000000
#############################
# Pto b)
#############################
pr %>% group_by(property_type) %>% summarize(Q1=quantile(price,probs = 0.25))
pr %>% group_by(property_type) %>% summarize(Q3=quantile(price,probs = 0.75))
pr %>% group_by(property_type) %>% summarize(media=mean(price))
pr %>% group_by(property_type) %>% summarize(min=min(price))
pr %>% group_by(property_type) %>% summarize(max=max(price))
#############################
# Pto c)
#############################
ggplot(pr, aes(x = property_type, y = price, group = property_type, fill = property_type )) +
geom_boxplot()
#############################
# Pto d)
#############################
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
pr %>% select(-c(id, l3)) %>% ggpairs(mapping = aes(color = property_type))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Outliers
# Como se puede observar este hoistograma no nos aprta mucha información visulamente por la importante presencia de valorews extremos , poir eso empezaré a seleccionar los valores de precio en función de un análisisa bo0xplot
# veo como queda el boxplot
ggplot(pr, aes( y = price)) +
geom_boxplot()
# quedan muichísima sobservaciones afuera
# Cálculo de Q1 y Q3
Q1 =quantile(pr$price,probs = 0.25)
Q3=quantile(pr$price,probs = 0.75)
IQR=Q3-Q1
# armno un variavble que multiplique al IQR, y le asigno el valor por default de Tukey, 1.5
mult_IQR=1.5
# para el claculo de loslimites chuequeo que no sean mayores a los minimos y max respect
L_inf = ifelse(Q1-mult_IQR*IQR > min(pr$price),Q1-mult_IQR*IQR, min(pr$price))
L_sup = ifelse(Q3+mult_IQR*IQR < max(pr$price),Q3+mult_IQR*IQR,max(pr$price))
# veo el histograma de la variable original
ggplot(pr, aes(x=price))+
geom_histogram(binwidth = 100000) + geom_vline(aes(xintercept=L_inf),color="blue", linetype="dashed", size=1) + geom_vline(aes(xintercept=L_sup),color="blue", linetype="dashed", size=1)
# veo quñe cuantil es
sum(pr$price>L_sup)/nrow(pr)
## [1] 0.08314782
# como se pobservar el histograma es bastante sesgado a derecha, por lo que voy a armar un nuevo dataset que exlcuya valores extremso selecionando u intervalo mayor al de Tukey
mult_IQR=4
L_sup_2 = ifelse(Q3+mult_IQR*IQR<max(pr$price),Q3+mult_IQR*IQR, max(pr$price))
pr_bplot <- pr %>% filter(price>L_inf, price<L_sup_2)
# veo el histograma de la variable sin outliers (los superioires a L_sup_2)
ggplot(pr_bplot, aes(x=price))+
geom_histogram(binwidth = 10000) + geom_vline(aes(xintercept=L_inf),color="blue", linetype="dashed", size=1) + geom_vline(aes(xintercept=L_sup),color="blue", linetype="dashed", size=1)
# veo quñe cuantil es
sum(pr$price>L_sup_2)/nrow(pr)
## [1] 0.02956454
Analisis exploratorios (III)
#############################
# Pto a)
#############################
summary(pr_bplot$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7500 117000 166000 212879 260000 870000
# Relaizao el histograma con todos los valores de la variable precio
ggplot(pr_bplot, aes(x=price))+
geom_histogram(binwidth = 10000)
#############################
# Pto b)
#############################
pr_bplot %>% group_by(property_type) %>% summarize(Q1=quantile(price,probs = 0.25))
pr_bplot %>% group_by(property_type) %>% summarize(Q3=quantile(price,probs = 0.75))
pr_bplot %>% group_by(property_type) %>% summarize(media=mean(price))
pr_bplot %>% group_by(property_type) %>% summarize(min=min(price))
pr_bplot %>% group_by(property_type) %>% summarize(max=max(price))
#############################
# Pto c)
#############################
ggplot(pr, aes(x = property_type, y = price, group = property_type, fill = property_type )) +
geom_boxplot()
7)Modelo lineal
a) Realizar un modelo lineal simple para explicar el precio en función de las habitaciones (rooms) y otro modelo que explique el precio en función de la superficie total (surface_total)
b)Usar la función summary() para obtener informacion de ambos modelos. Explicar los valores de los coeficientes estimados.
c)¿Cuál modelo usarían para predecir el precio? ¿Por qué?
#############################
# Pto a)
#############################
glimpse(pr_bplot)
## Observations: 49,693
## Variables: 8
## $ id <chr> "HdjpKrqdwYfH9YU1DKjltg==", "AfdcsqUSelai1ofCA...
## $ l3 <chr> "Boedo", "Velez Sarsfield", "Nuñez", "Almagro"...
## $ rooms <dbl> 6, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ bathrooms <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ surface_total <dbl> 178, 95, 44, 40, 49, 40, 40, 40, 49, 40, 23, 4...
## $ surface_covered <dbl> 240, 69, 38, 37, 44, 37, 37, 37, 44, 37, 23, 3...
## $ price <dbl> 500000, 199900, 147000, 92294, 115000, 77000, ...
## $ property_type <chr> "Casa", "Casa", "Departamento", "Departamento"...
lm_rooms = lm(price ~ rooms, data=pr_bplot)
lm_surface = lm(price ~ surface_total ,data=pr_bplot)
#############################
# Pto b)
#############################
# Info modelos lm_rooms
summary(lm_rooms)
##
## Call:
## lm(formula = price ~ rooms, data = pr_bplot)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1551259 -64478 -17378 30622 705499
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31919.6 1162.8 27.45 <2e-16 ***
## rooms 66229.3 383.2 172.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 112800 on 49691 degrees of freedom
## Multiple R-squared: 0.3754, Adjusted R-squared: 0.3754
## F-statistic: 2.987e+04 on 1 and 49691 DF, p-value: < 2.2e-16
# Info modelos lm_rooms
summary(lm_surface)
##
## Call:
## lm(formula = price ~ surface_total, data = pr_bplot)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1213783 -95429 -46651 47021 656828
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.120e+05 6.435e+02 329.38 <2e-16 ***
## surface_total 9.653e+00 7.744e-01 12.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 142500 on 49691 degrees of freedom
## Multiple R-squared: 0.003117, Adjusted R-squared: 0.003097
## F-statistic: 155.4 on 1 and 49691 DF, p-value: < 2.2e-16
#############################
# Pto c)
#############################
# Elegiría el modelo ajustado utilizando la variable explicativa "rooms". Ambos modelos tienen una estimación de coeficiente Beta_1 rechaza la hipótesis nula de que su valor es 0 (lo que indicaría que no hay relación lineal )entre las varoables) con un p_value extremadamente bajo, peroporcentaje de la el modelo ajustado con "rooms" posee un R2 mayor, indicando una mejor calidad del ajuste, medida en términos cuanto "explica" la variación en la cantidad de habitaciones la variabilidad lineal del precio.