Libraries reviewed in the workshop: data cleaning and data quality techniques.
We will use the Rolling Sales Data, which contains sales records with variables such as neighborhood, building class, square footage, and more.
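The following is a minimal setup sketch for reproducing the steps below; the packages listed are the ones whose functions appear in this section (solitude, FactoMineR and factoextra are loaded later in the document), and the file name brooklyn_sales.csv is only an assumption about where the data lives.
# Packages used in this section
library(tidyverse)     # dplyr, tidyr, ggplot2
library(DataExplorer)  # introduce(), plot_missing()
library(GGally)        # ggpairs()
library(janitor)       # clean_names()
library(visdat)        # vis_dat(), vis_miss(), vis_cor()
library(naniar)        # gg_miss_upset()
# Hypothetical file name; point this at wherever the rolling sales extract is stored
brooklyn <- read_csv("brooklyn_sales.csv")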
tab1 <- introduce(brooklyn) %>%
  select(-total_missing_values, -complete_rows, -memory_usage) %>%
  select(Rows = rows,
         Columns = columns,
         `Character Columns` = discrete_columns,
         `Numeric Columns` = continuous_columns,
         `Null Columns` = all_missing_columns,
         `Available Observations` = total_observations)
Table 1: Variable Summary
| Rows | Columns | Character Columns | Numeric Columns | Null Columns | Available Observations |
|---|---|---|---|---|---|
| 19244 | 21 | 10 | 10 | 1 | 404124 |
We conclude that one column consists entirely of null values, and that there are 10 character and 10 numeric columns, respectively.
Figure 1: Percentage of missing values
tab2 <- plot_missing(brooklyn)
The variable with 100% missing values is EASE_MENT; we suggest dropping it from the analysis.
Figure 2: Scatter plot matrix by variable
brooklyn %>% select(where(is.numeric)) %>% ggpairs()
From this matrix we can observe the correlation between LOT and BLOCK (flagged with *** in the plot).
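As a quick numeric check of that relationship, the correlation can also be computed directly; a sketch, assuming the raw columns are already named BLOCK and LOT (as they appear after clean_names below) and ignoring missing values:
# Pairwise Pearson correlation between BLOCK and LOT
brooklyn %>%
  summarise(cor_block_lot = cor(BLOCK, LOT, use = "pairwise.complete.obs"))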
# Clean up column names
ABC = brooklyn %>% clean_names(case = "all_caps")
# Glimpse of the final dataset
ABC %>% glimpse()
## Rows: 19,244
## Columns: 21
## $ BOROUGH <chr> "3", "3", "3", "3", "3", "3", "3", "3",~
## $ NEIGHBORHOOD <chr> "BATH BEACH", "BATH BEACH", "BATH BEACH~
## $ BUILDING_CLASS_CATEGORY <chr> "01 ONE FAMILY DWELLINGS", "01 ONE FAMI~
## $ TAX_CLASS_AT_PRESENT <chr> "1", "1", "1", "1", "1", "1", "1", "1",~
## $ BLOCK <dbl> 6359, 6360, 6362, 6367, 6371, 6380, 639~
## $ LOT <dbl> 70, 56, 23, 24, 60, 73, 115, 13, 40, 38~
## $ EASE_MENT <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ BUILDING_CLASS_AT_PRESENT <chr> "S1", "A5", "A9", "A9", "A9", "S1", "A5~
## $ ADDRESS <chr> "8684 15TH AVENUE", "30 BAY 10TH STREET~
## $ APARTMENT_NUMBER <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ ZIP_CODE <dbl> 11228, 11228, 11228, 11214, 11214, 1121~
## $ RESIDENTIAL_UNITS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ COMMERCIAL_UNITS <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ~
## $ TOTAL_UNITS <dbl> 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, ~
## $ LAND_SQUARE_FEET <dbl> 1933, 1547, 2280, 1571, 2417, 1740, 144~
## $ GROSS_SQUARE_FEET <dbl> 4080, 1428, 1052, 1456, 2106, 2124, 128~
## $ YEAR_BUILT <dbl> 1930, 1930, 1901, 1935, 1930, 1960, 194~
## $ TAX_CLASS_AT_TIME_OF_SALE <chr> "1", "1", "1", "1", "1", "1", "1", "1",~
## $ BUILDING_CLASS_AT_TIME_OF_SALE <chr> "S1", "A5", "A9", "A9", "A9", "S1", "A5~
## $ SALE_PRICE <dbl> 1300000, 75000, 0, 830000, 1188000, 990~
## $ SALE_DATE <dttm> 2020-04-28, 2020-11-30, 2020-11-04, 20~
# Description of variable types
ABC %>% vis_dat(warn_large_data = F)
# Missing data
ABC %>% vis_miss(warn_large_data = F)
# Remove fully missing variables
ABC = ABC %>%
select(-EASE_MENT)
# Correlations between variables
ABC %>%
select(where(is.numeric)) %>% vis_cor()
# Patterns in missing data
ABC %>% gg_miss_upset()
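Alongside the upset plot, a per-variable numeric summary can quantify how much is missing in each column; a small sketch, assuming naniar is loaded (it provides gg_miss_upset above):
# Count and percentage of missing values per variable, most missing first
ABC %>% miss_var_summary()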
# Reshape columns to rows (wide to long)
ABC %>%
  group_by(NEIGHBORHOOD) %>%
  summarise(TOTAL_UNITS = sum(TOTAL_UNITS, na.rm = T),
            RESIDENTIAL_UNITS = sum(RESIDENTIAL_UNITS, na.rm = T),
            COMMERCIAL_UNITS = sum(COMMERCIAL_UNITS, na.rm = T)) %>%
  head()
## # A tibble: 6 x 4
## NEIGHBORHOOD TOTAL_UNITS RESIDENTIAL_UNITS COMMERCIAL_UNITS
## <chr> <dbl> <dbl> <dbl>
## 1 BATH BEACH 529 490 39
## 2 BAY RIDGE 1059 940 119
## 3 BEDFORD STUYVESANT 3049 2936 113
## 4 BENSONHURST 1028 976 52
## 5 BERGEN BEACH 280 262 18
## 6 BOERUM HILL 334 300 34
ABC %>%
  group_by(NEIGHBORHOOD) %>%
  summarise(TOTAL_UNITS = sum(TOTAL_UNITS, na.rm = T),
            RESIDENTIAL_UNITS = sum(RESIDENTIAL_UNITS, na.rm = T),
            COMMERCIAL_UNITS = sum(COMMERCIAL_UNITS, na.rm = T)) %>%
  pivot_longer(cols = -c(NEIGHBORHOOD), names_to = "UNIT_TYPE", values_to = "NUMBER_UNITS") %>%
  head()
## # A tibble: 6 x 3
## NEIGHBORHOOD UNIT_TYPE NUMBER_UNITS
## <chr> <chr> <dbl>
## 1 BATH BEACH TOTAL_UNITS 529
## 2 BATH BEACH RESIDENTIAL_UNITS 490
## 3 BATH BEACH COMMERCIAL_UNITS 39
## 4 BAY RIDGE TOTAL_UNITS 1059
## 5 BAY RIDGE RESIDENTIAL_UNITS 940
## 6 BAY RIDGE COMMERCIAL_UNITS 119
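One payoff of the long format is that it plugs straight into ggplot2. The sketch below compares unit counts per neighborhood and unit type; showing only the first six neighborhoods is an arbitrary choice for readability.
# Bar chart of unit counts by neighborhood and unit type, built from the long table
ABC %>%
  group_by(NEIGHBORHOOD) %>%
  summarise(TOTAL_UNITS = sum(TOTAL_UNITS, na.rm = TRUE),
            RESIDENTIAL_UNITS = sum(RESIDENTIAL_UNITS, na.rm = TRUE),
            COMMERCIAL_UNITS = sum(COMMERCIAL_UNITS, na.rm = TRUE)) %>%
  pivot_longer(cols = -NEIGHBORHOOD, names_to = "UNIT_TYPE", values_to = "NUMBER_UNITS") %>%
  slice_head(n = 18) %>%   # first 6 neighborhoods x 3 unit types
  ggplot(aes(x = NEIGHBORHOOD, y = NUMBER_UNITS, fill = UNIT_TYPE)) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_bw()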
# Reshape rows to columns (long to wide)
ABC %>%
  group_by(NEIGHBORHOOD) %>%
  summarise(TOTAL_UNITS = sum(TOTAL_UNITS, na.rm = T),
            RESIDENTIAL_UNITS = sum(RESIDENTIAL_UNITS, na.rm = T),
            COMMERCIAL_UNITS = sum(COMMERCIAL_UNITS, na.rm = T)) %>%
  pivot_longer(cols = -c(NEIGHBORHOOD), names_to = "UNIT_TYPE", values_to = "NUMBER_UNITS") %>%
  pivot_wider(names_from = c(UNIT_TYPE), values_from = c(NUMBER_UNITS), values_fill = 0) %>%
  head()
## # A tibble: 6 x 4
## NEIGHBORHOOD TOTAL_UNITS RESIDENTIAL_UNITS COMMERCIAL_UNITS
## <chr> <dbl> <dbl> <dbl>
## 1 BATH BEACH 529 490 39
## 2 BAY RIDGE 1059 940 119
## 3 BEDFORD STUYVESANT 3049 2936 113
## 4 BENSONHURST 1028 976 52
## 5 BERGEN BEACH 280 262 18
## 6 BOERUM HILL 334 300 34
The isolation forest is an anomaly detection method introduced in the paper "Isolation-Based Anomaly Detection".
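As a brief reminder of how that method scores points (this is the standard isolation forest definition, not part of the workshop code): writing E(h(x)) for the average depth at which observation x is isolated across the trees and c(n) for the average path length of an unsuccessful search in a binary search tree built on n points, the anomaly score is

$$ s(x, n) = 2^{-E(h(x))/c(n)}, \qquad c(n) = 2H(n - 1) - \frac{2(n - 1)}{n}, $$

where H(i) is the i-th harmonic number. Scores near 1 indicate likely anomalies and scores well below 0.5 indicate normal points; this is the anomaly_score column returned by solitude below.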
library(solitude) # Isolation forest
# Isolation forest model
isoforest = isolationForest$new(sample_size = as.integer(nrow(ABC)/2),
                                num_trees = 500,
                                replace = TRUE,
                                seed = 123)
isoforest$fit(dataset = ABC %>% select(SALE_PRICE, GROSS_SQUARE_FEET) %>% na.omit())
## INFO [20:07:02.053] dataset has duplicated rows
## INFO [20:07:02.091] Building Isolation Forest ...
## INFO [20:07:03.947] done
## INFO [20:07:03.948] Computing depth of terminal nodes ...
## INFO [20:07:05.181] done
## INFO [20:07:06.875] Completed growing isolation forest
predicciones = isoforest$predict(data = ABC %>% select(SALE_PRICE, GROSS_SQUARE_FEET) %>% na.omit())
head(predicciones)
## id average_depth anomaly_score
## 1: 1 13.976 0.5748599
## 2: 2 13.834 0.5781026
## 3: 3 14.000 0.5743136
## 4: 4 13.998 0.5743591
## 5: 5 14.000 0.5743136
## 6: 6 13.998 0.5743591
ggplot(data = predicciones, aes(x = average_depth)) +
  geom_histogram(color = "gray40") +
  geom_vline(
    xintercept = quantile(predicciones$average_depth, seq(0, 1, 0.1)),
    color = "red",
    linetype = "dashed") +
  labs(
    title = "Distribution of the isolation forest average depths",
    subtitle = "Quantiles marked in red") +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
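A common next step is to flag the observations with the highest anomaly_score and inspect them alongside the original variables. The sketch below treats the top 5% of scores as outliers; the 0.95 cutoff and the object names iso_data and threshold are assumptions, not part of the workshop code.
# Re-build the data that was fed to the model so rows align with the predictions
iso_data <- ABC %>% select(SALE_PRICE, GROSS_SQUARE_FEET) %>% na.omit()
# Flag the top 5% of anomaly scores as outliers
threshold <- quantile(predicciones$anomaly_score, 0.95)
iso_data %>%
  mutate(anomaly_score = predicciones$anomaly_score) %>%
  filter(anomaly_score > threshold) %>%
  arrange(desc(anomaly_score)) %>%
  head()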
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# PCA on all numeric variables, standardized
pca_res = PCA(ABC %>% select(where(is.numeric)), scale.unit = T)
pca_res %>% fviz_screeplot()
pca_res %>% fviz_pca_biplot()
# Repeat the PCA excluding observation 43798, treated as an outlier
pca_res = PCA(ABC %>%
                slice(-43798) %>%
                select(where(is.numeric)), scale.unit = T)
pca_res %>% fviz_screeplot()
pca_res %>% fviz_pca_biplot()
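To complement the biplot, factoextra can also report the variance explained numerically and show which variables drive the first component; a short optional sketch:
# Variance explained by each component, and variable contributions to the first axis
get_eigenvalue(pca_res)
fviz_contrib(pca_res, choice = "var", axes = 1)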