Contenido:
library(readxl) # leer tablas desde excel
library(tidyverse) # para manejo de bases de datos
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(janitor) # para limpiar bases de datos
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(visdat) # crea visualizaciones de datos exploratorios preliminares
library(naniar) # visualizacion y tratamiento de datos perdidos
library(GGally) # matriz de graficos ggplot para analisis exploratorio
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Carga de datos ----------------------------------------------------------
# Dirección: https://www1.nyc.gov/site/finance/taxes/property-rolling-sales-data.page
# Carga de datos
brooklyn = read_excel("data/rollingsales_brooklyn.xls", skip = 4)
# Vistazo breve -----------------------------------------------------------
brooklyn %>% glimpse()
## Rows: 19,244
## Columns: 21
## $ BOROUGH <chr> "3", "3", "3", "3", "3", "3", "3", "3~
## $ NEIGHBORHOOD <chr> "BATH BEACH", "BATH BEACH", "BATH BEA~
## $ `BUILDING CLASS CATEGORY` <chr> "01 ONE FAMILY DWELLINGS", "01 ONE FA~
## $ `TAX CLASS AT PRESENT` <chr> "1", "1", "1", "1", "1", "1", "1", "1~
## $ BLOCK <dbl> 6359, 6360, 6362, 6367, 6371, 6380, 6~
## $ LOT <dbl> 70, 56, 23, 24, 60, 73, 115, 13, 40, ~
## $ `EASE-MENT` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ `BUILDING CLASS AT PRESENT` <chr> "S1", "A5", "A9", "A9", "A9", "S1", "~
## $ ADDRESS <chr> "8684 15TH AVENUE", "30 BAY 10TH STRE~
## $ `APARTMENT NUMBER` <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ `ZIP CODE` <dbl> 11228, 11228, 11228, 11214, 11214, 11~
## $ `RESIDENTIAL UNITS` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ `COMMERCIAL UNITS` <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0~
## $ `TOTAL UNITS` <dbl> 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1~
## $ `LAND SQUARE FEET` <dbl> 1933, 1547, 2280, 1571, 2417, 1740, 1~
## $ `GROSS SQUARE FEET` <dbl> 4080, 1428, 1052, 1456, 2106, 2124, 1~
## $ `YEAR BUILT` <dbl> 1930, 1930, 1901, 1935, 1930, 1960, 1~
## $ `TAX CLASS AT TIME OF SALE` <chr> "1", "1", "1", "1", "1", "1", "1", "1~
## $ `BUILDING CLASS AT TIME OF SALE` <chr> "S1", "A5", "A9", "A9", "A9", "S1", "~
## $ `SALE PRICE` <dbl> 1300000, 75000, 0, 830000, 1188000, 9~
## $ `SALE DATE` <dttm> 2020-04-28, 2020-11-30, 2020-11-04, ~
brooklyn %>% head()
## # A tibble: 6 x 21
## BOROUGH NEIGHBORHOOD `BUILDING CLASS~ `TAX CLASS AT P~ BLOCK LOT `EASE-MENT`
## <chr> <chr> <chr> <chr> <dbl> <dbl> <lgl>
## 1 3 BATH BEACH 01 ONE FAMILY D~ 1 6359 70 NA
## 2 3 BATH BEACH 01 ONE FAMILY D~ 1 6360 56 NA
## 3 3 BATH BEACH 01 ONE FAMILY D~ 1 6362 23 NA
## 4 3 BATH BEACH 01 ONE FAMILY D~ 1 6367 24 NA
## 5 3 BATH BEACH 01 ONE FAMILY D~ 1 6371 60 NA
## 6 3 BATH BEACH 01 ONE FAMILY D~ 1 6380 73 NA
## # ... with 14 more variables: BUILDING CLASS AT PRESENT <chr>, ADDRESS <chr>,
## # APARTMENT NUMBER <chr>, ZIP CODE <dbl>, RESIDENTIAL UNITS <dbl>,
## # COMMERCIAL UNITS <dbl>, TOTAL UNITS <dbl>, LAND SQUARE FEET <dbl>,
## # GROSS SQUARE FEET <dbl>, YEAR BUILT <dbl>, TAX CLASS AT TIME OF SALE <chr>,
## # BUILDING CLASS AT TIME OF SALE <chr>, SALE PRICE <dbl>, SALE DATE <dttm>
# Filtrado y selección de variables ---------------------------------------
brooklyn %>% select(BOROUGH, NEIGHBORHOOD) %>% head()
## # A tibble: 6 x 2
## BOROUGH NEIGHBORHOOD
## <chr> <chr>
## 1 3 BATH BEACH
## 2 3 BATH BEACH
## 3 3 BATH BEACH
## 4 3 BATH BEACH
## 5 3 BATH BEACH
## 6 3 BATH BEACH
brooklyn %>% filter(LOT==70) %>% head()
## # A tibble: 6 x 21
## BOROUGH NEIGHBORHOOD `BUILDING CLASS~ `TAX CLASS AT P~ BLOCK LOT `EASE-MENT`
## <chr> <chr> <chr> <chr> <dbl> <dbl> <lgl>
## 1 3 BATH BEACH 01 ONE FAMILY D~ 1 6359 70 NA
## 2 3 BATH BEACH 02 TWO FAMILY D~ 1 6373 70 NA
## 3 3 BAY RIDGE 01 ONE FAMILY D~ 1 5930 70 NA
## 4 3 BAY RIDGE 01 ONE FAMILY D~ 1 5963 70 NA
## 5 3 BAY RIDGE 03 THREE FAMILY~ 1 5971 70 NA
## 6 3 BAY RIDGE 03 THREE FAMILY~ 1 5971 70 NA
## # ... with 14 more variables: BUILDING CLASS AT PRESENT <chr>, ADDRESS <chr>,
## # APARTMENT NUMBER <chr>, ZIP CODE <dbl>, RESIDENTIAL UNITS <dbl>,
## # COMMERCIAL UNITS <dbl>, TOTAL UNITS <dbl>, LAND SQUARE FEET <dbl>,
## # GROSS SQUARE FEET <dbl>, YEAR BUILT <dbl>, TAX CLASS AT TIME OF SALE <chr>,
## # BUILDING CLASS AT TIME OF SALE <chr>, SALE PRICE <dbl>, SALE DATE <dttm>
# Mutación de variables ---------------------------------------------------
x <- 1:50
# Esta función le permite vectorizar múltiples declaraciones if y else if. Es un equivalente R de la sentencia CASE WHEN de SQL.
case_when(
x %% 35 == 0 ~ "fizz buzz",
x %% 5 == 0 ~ "fizz",
x %% 7 == 0 ~ "buzz",
TRUE ~ as.character(x)
)
## [1] "1" "2" "3" "4" "fizz" "6"
## [7] "buzz" "8" "9" "fizz" "11" "12"
## [13] "13" "buzz" "fizz" "16" "17" "18"
## [19] "19" "fizz" "buzz" "22" "23" "24"
## [25] "fizz" "26" "27" "buzz" "29" "fizz"
## [31] "31" "32" "33" "34" "fizz buzz" "36"
## [37] "37" "38" "39" "fizz" "41" "buzz"
## [43] "43" "44" "fizz" "46" "47" "48"
## [49] "buzz" "fizz"
case_when(
x %% 35 == 0 ~ "fizz buzz",
x %% 5 == 0 ~ "fizz",
x %% 7 == 0 ~ "buzz",
TRUE ~ "0" #Debe ser del mismo tipo de dato, TRUE para conservar los datos que no cumplen ninguna condicion
)
## [1] "0" "0" "0" "0" "fizz" "0"
## [7] "buzz" "0" "0" "fizz" "0" "0"
## [13] "0" "buzz" "fizz" "0" "0" "0"
## [19] "0" "fizz" "buzz" "0" "0" "0"
## [25] "fizz" "0" "0" "buzz" "0" "fizz"
## [31] "0" "0" "0" "0" "fizz buzz" "0"
## [37] "0" "0" "0" "fizz" "0" "buzz"
## [43] "0" "0" "fizz" "0" "0" "0"
## [49] "buzz" "fizz"
case_when(
x %% 35 == 0 ~ "fizz buzz",
x %% 5 == 0 ~ "fizz",
x %% 7 == 0 ~ "buzz",
FALSE ~ "0" # con FALSE y tendremos NA donde no se cumple la condicion
)
## [1] NA NA NA NA "fizz" NA
## [7] "buzz" NA NA "fizz" NA NA
## [13] NA "buzz" "fizz" NA NA NA
## [19] NA "fizz" "buzz" NA NA NA
## [25] "fizz" NA NA "buzz" NA "fizz"
## [31] NA NA NA NA "fizz buzz" NA
## [37] NA NA NA "fizz" NA "buzz"
## [43] NA NA "fizz" NA NA NA
## [49] "buzz" "fizz"
brooklyn %>%
mutate(`BUILDING CLASS CATEGORY NEW` = case_when(`BUILDING CLASS CATEGORY`=="01 ONE FAMILY DWELLINGS" ~ "SI",
T ~ "OTROS")) %>%
select(`BUILDING CLASS CATEGORY`, `BUILDING CLASS CATEGORY NEW`)
## # A tibble: 19,244 x 2
## `BUILDING CLASS CATEGORY` `BUILDING CLASS CATEGORY NEW`
## <chr> <chr>
## 1 01 ONE FAMILY DWELLINGS SI
## 2 01 ONE FAMILY DWELLINGS SI
## 3 01 ONE FAMILY DWELLINGS SI
## 4 01 ONE FAMILY DWELLINGS SI
## 5 01 ONE FAMILY DWELLINGS SI
## 6 01 ONE FAMILY DWELLINGS SI
## 7 01 ONE FAMILY DWELLINGS SI
## 8 01 ONE FAMILY DWELLINGS SI
## 9 01 ONE FAMILY DWELLINGS SI
## 10 01 ONE FAMILY DWELLINGS SI
## # ... with 19,234 more rows
# Agrupación y sumarizado -------------------------------------------------
brooklyn %>%
group_by(NEIGHBORHOOD) %>%
summarise(`TOTAL UNITS` = sum(`TOTAL UNITS`, na.rm = T)) %>%
head()
## # A tibble: 6 x 2
## NEIGHBORHOOD `TOTAL UNITS`
## <chr> <dbl>
## 1 BATH BEACH 529
## 2 BAY RIDGE 1059
## 3 BEDFORD STUYVESANT 3049
## 4 BENSONHURST 1028
## 5 BERGEN BEACH 280
## 6 BOERUM HILL 334
# Slicing para seleccionar filas segun su posicion -----------------------------------------------------------------
# para obtener la categoria o fila con valor maximo
brooklyn %>%
group_by(NEIGHBORHOOD) %>%
summarise(`TOTAL UNITS` = sum(`TOTAL UNITS`, na.rm = T)) %>%
slice_max(order_by = `TOTAL UNITS`, n=1)
## # A tibble: 1 x 2
## NEIGHBORHOOD `TOTAL UNITS`
## <chr> <dbl>
## 1 DOWNTOWN-FULTON MALL 47772
# para obtener la categoria o fila con valor minimo
brooklyn %>%
group_by(NEIGHBORHOOD) %>%
summarise(`TOTAL UNITS` = sum(`TOTAL UNITS`, na.rm = T)) %>%
slice_min(order_by = `TOTAL UNITS`, n=1)
## # A tibble: 1 x 2
## NEIGHBORHOOD `TOTAL UNITS`
## <chr> <dbl>
## 1 NAVY YARD 28
# para obtener una muestra aleatoria de tamanio n
brooklyn %>%
group_by(NEIGHBORHOOD) %>%
summarise(`TOTAL UNITS` = sum(`TOTAL UNITS`, na.rm = T)) %>%
slice_sample(n=3)
## # A tibble: 3 x 2
## NEIGHBORHOOD `TOTAL UNITS`
## <chr> <dbl>
## 1 FLATBUSH-NORTH 806
## 2 BENSONHURST 1028
## 3 PROSPECT HEIGHTS 418
# para obtener los n primeras observaciones de la tabla
brooklyn %>%
group_by(NEIGHBORHOOD) %>%
summarise(`TOTAL UNITS` = sum(`TOTAL UNITS`, na.rm = T)) %>%
slice_head(n=3)
## # A tibble: 3 x 2
## NEIGHBORHOOD `TOTAL UNITS`
## <chr> <dbl>
## 1 BATH BEACH 529
## 2 BAY RIDGE 1059
## 3 BEDFORD STUYVESANT 3049
# para obtener las n ultimas observaciones de la tabla
brooklyn %>%
group_by(NEIGHBORHOOD) %>%
summarise(`TOTAL UNITS` = sum(`TOTAL UNITS`, na.rm = T)) %>%
slice_tail(n=3)
## # A tibble: 3 x 2
## NEIGHBORHOOD `TOTAL UNITS`
## <chr> <dbl>
## 1 WILLIAMSBURG-SOUTH 330
## 2 WINDSOR TERRACE 227
## 3 WYCKOFF HEIGHTS 351
# Joining -----------------------------------------------------------------
sint_1 = brooklyn %>%
group_by(NEIGHBORHOOD) %>%
summarise(`TOTAL UNITS`=sum(`TOTAL UNITS`, na.rm = T))
sint_1
## # A tibble: 60 x 2
## NEIGHBORHOOD `TOTAL UNITS`
## <chr> <dbl>
## 1 BATH BEACH 529
## 2 BAY RIDGE 1059
## 3 BEDFORD STUYVESANT 3049
## 4 BENSONHURST 1028
## 5 BERGEN BEACH 280
## 6 BOERUM HILL 334
## 7 BOROUGH PARK 2745
## 8 BRIGHTON BEACH 324
## 9 BROOKLYN HEIGHTS 346
## 10 BROWNSVILLE 384
## # ... with 50 more rows
sint_2 = brooklyn %>%
group_by(NEIGHBORHOOD) %>%
summarise(`COMMERCIAL UNITS`=sum(`COMMERCIAL UNITS`, na.rm = T))
sint_2
## # A tibble: 60 x 2
## NEIGHBORHOOD `COMMERCIAL UNITS`
## <chr> <dbl>
## 1 BATH BEACH 39
## 2 BAY RIDGE 119
## 3 BEDFORD STUYVESANT 113
## 4 BENSONHURST 52
## 5 BERGEN BEACH 18
## 6 BOERUM HILL 34
## 7 BOROUGH PARK 210
## 8 BRIGHTON BEACH 63
## 9 BROOKLYN HEIGHTS 21
## 10 BROWNSVILLE 25
## # ... with 50 more rows
sint_1 %>%
inner_join(sint_2 %>% slice_sample(n=2))
## Joining, by = "NEIGHBORHOOD"
## # A tibble: 2 x 3
## NEIGHBORHOOD `TOTAL UNITS` `COMMERCIAL UNITS`
## <chr> <dbl> <dbl>
## 1 FORT GREENE 508 27
## 2 SUNSET PARK 1185 79
sint_1 %>%
left_join(sint_2 %>% slice_head(n=2))
## Joining, by = "NEIGHBORHOOD"
## # A tibble: 60 x 3
## NEIGHBORHOOD `TOTAL UNITS` `COMMERCIAL UNITS`
## <chr> <dbl> <dbl>
## 1 BATH BEACH 529 39
## 2 BAY RIDGE 1059 119
## 3 BEDFORD STUYVESANT 3049 NA
## 4 BENSONHURST 1028 NA
## 5 BERGEN BEACH 280 NA
## 6 BOERUM HILL 334 NA
## 7 BOROUGH PARK 2745 NA
## 8 BRIGHTON BEACH 324 NA
## 9 BROOKLYN HEIGHTS 346 NA
## 10 BROWNSVILLE 384 NA
## # ... with 50 more rows
sint_1 %>%
right_join(sint_2 %>% slice_head(n=2))
## Joining, by = "NEIGHBORHOOD"
## # A tibble: 2 x 3
## NEIGHBORHOOD `TOTAL UNITS` `COMMERCIAL UNITS`
## <chr> <dbl> <dbl>
## 1 BATH BEACH 529 39
## 2 BAY RIDGE 1059 119
sint_1 %>%
full_join(sint_2 %>% slice_head(n=2))
## Joining, by = "NEIGHBORHOOD"
## # A tibble: 60 x 3
## NEIGHBORHOOD `TOTAL UNITS` `COMMERCIAL UNITS`
## <chr> <dbl> <dbl>
## 1 BATH BEACH 529 39
## 2 BAY RIDGE 1059 119
## 3 BEDFORD STUYVESANT 3049 NA
## 4 BENSONHURST 1028 NA
## 5 BERGEN BEACH 280 NA
## 6 BOERUM HILL 334 NA
## 7 BOROUGH PARK 2745 NA
## 8 BRIGHTON BEACH 324 NA
## 9 BROOKLYN HEIGHTS 346 NA
## 10 BROWNSVILLE 384 NA
## # ... with 50 more rows
# Exploración gráfica -----------------------------------------------------
brooklyn %>% select(where(is.numeric)) %>% ggpairs()
****
# Unión for filas ---------------------------------------------------------
# Carga del resto de datasets
manhattan = read_excel("data/rollingsales_manhattan.xls", skip=4)
manhattan %>% glimpse()
## Rows: 12,190
## Columns: 21
## $ BOROUGH <chr> "1", "1", "1", "1", "1", "1", "1", "1~
## $ NEIGHBORHOOD <chr> "ALPHABET CITY", "ALPHABET CITY", "AL~
## $ `BUILDING CLASS CATEGORY` <chr> "01 ONE FAMILY DWELLINGS", "02 TWO FA~
## $ `TAX CLASS AT PRESENT` <chr> "1", "1", "1", "1", "2B", "2B", "2", ~
## $ BLOCK <dbl> 374, 377, 377, 377, 372, 374, 376, 37~
## $ LOT <dbl> 46, 72, 66, 66, 10, 1, 28, 54, 6, 7, ~
## $ `EASE-MENT` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ `BUILDING CLASS AT PRESENT` <chr> "A4", "B9", "C0", "C0", "C7", "C7", "~
## $ ADDRESS <chr> "347 EAST 4TH STREET", "231 EAST 7TH"~
## $ `APARTMENT NUMBER` <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ `ZIP CODE` <dbl> 10009, 10009, 10009, 10009, 10009, 10~
## $ `RESIDENTIAL UNITS` <dbl> 1, 2, 3, 3, 9, 8, 24, 20, 8, 8, 10, 8~
## $ `COMMERCIAL UNITS` <dbl> 0, 0, 0, 0, 1, 2, 0, 0, 1, 1, 1, 2, 2~
## $ `TOTAL UNITS` <dbl> 1, 2, 3, 3, 10, 10, 24, 20, 9, 9, 11,~
## $ `LAND SQUARE FEET` <dbl> 2116, 1718, 2381, 2381, 2021, 1501, 2~
## $ `GROSS SQUARE FEET` <dbl> 4400, 5154, 3084, 3084, 6445, 6929, 1~
## $ `YEAR BUILT` <dbl> 1900, 1901, 1899, 1899, 1900, 1900, 1~
## $ `TAX CLASS AT TIME OF SALE` <chr> "1", "1", "1", "1", "2", "2", "2", "2~
## $ `BUILDING CLASS AT TIME OF SALE` <chr> "A4", "B9", "C0", "C0", "C7", "C7", "~
## $ `SALE PRICE` <dbl> 2385000, 0, 0, 4350000, 0, 3672530, 1~
## $ `SALE DATE` <dttm> 2021-02-09, 2020-10-02, 2020-10-31, ~
bronx = read_excel("data/rollingsales_bronx.xls", skip=4)
bronx %>% glimpse()
## Rows: 6,139
## Columns: 21
## $ BOROUGH <chr> "2", "2", "2", "2", "2", "2", "2", "2~
## $ NEIGHBORHOOD <chr> "BATHGATE", "BATHGATE", "BATHGATE", "~
## $ `BUILDING CLASS CATEGORY` <chr> "01 ONE FAMILY DWELLINGS", "01 ONE FA~
## $ `TAX CLASS AT PRESENT` <chr> "1", "1", "1", "1", "1", "1", "1", "1~
## $ BLOCK <dbl> 3028, 3028, 3029, 3030, 3045, 3046, 2~
## $ LOT <dbl> 23, 24, 36, 67, 12, 52, 118, 131, 114~
## $ `EASE-MENT` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ `BUILDING CLASS AT PRESENT` <chr> "A1", "A1", "A5", "A1", "A1", "B3", "~
## $ ADDRESS <chr> "408 EAST 179TH STREET", "410 EAST 17~
## $ `APARTMENT NUMBER` <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ `ZIP CODE` <dbl> 10457, 10457, 10457, 10457, 10457, 10~
## $ `RESIDENTIAL UNITS` <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2~
## $ `COMMERCIAL UNITS` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ `TOTAL UNITS` <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2~
## $ `LAND SQUARE FEET` <dbl> 1842, 1842, 1568, 1646, 2340, 2329, 2~
## $ `GROSS SQUARE FEET` <dbl> 2048, 2048, 1526, 1495, 1516, 1431, 2~
## $ `YEAR BUILT` <dbl> 1901, 1901, 1899, 1899, 1910, 1901, 1~
## $ `TAX CLASS AT TIME OF SALE` <chr> "1", "1", "1", "1", "1", "1", "1", "1~
## $ `BUILDING CLASS AT TIME OF SALE` <chr> "A1", "A1", "A5", "A1", "A1", "A1", "~
## $ `SALE PRICE` <dbl> 0, 600000, 0, 475000, 289000, 526000,~
## $ `SALE DATE` <dttm> 2021-01-20, 2021-01-15, 2020-07-09, ~
queens = read_excel("data/rollingsales_queens.xls", skip=4)
queens %>% glimpse()
## Rows: 20,945
## Columns: 21
## $ BOROUGH <chr> "4", "4", "4", "4", "4", "4", "4", "4~
## $ NEIGHBORHOOD <chr> "AIRPORT JFK", "AIRPORT LA GUARDIA", ~
## $ `BUILDING CLASS CATEGORY` <chr> "39 TRANSPORTATION FACILITIES", "01 O~
## $ `TAX CLASS AT PRESENT` <chr> "4", "1", "1", "1", "1", "1", "1", "1~
## $ BLOCK <dbl> 14260, 949, 976, 15830, 15830, 15831,~
## $ LOT <dbl> 80, 41, 12, 20, 27, 27, 51, 32, 59, 4~
## $ `EASE-MENT` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ `BUILDING CLASS AT PRESENT` <chr> "V7", "A5", "A5", "A5", "A5", "A5", "~
## $ ADDRESS <chr> "N/A ROCKAWAY BLVD", "19-62 81ST STRE~
## $ `APARTMENT NUMBER` <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ `ZIP CODE` <dbl> 11434, 11370, 11370, 11691, 11691, 11~
## $ `RESIDENTIAL UNITS` <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ `COMMERCIAL UNITS` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ `TOTAL UNITS` <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ `LAND SQUARE FEET` <dbl> 268092, 1800, 1800, 2469, 2325, 2470,~
## $ `GROSS SQUARE FEET` <dbl> 0, 1224, 1224, 1476, 1350, 1332, 1224~
## $ `YEAR BUILT` <dbl> NA, 1945, 1950, 2005, 2005, 2006, 192~
## $ `TAX CLASS AT TIME OF SALE` <chr> "4", "1", "1", "1", "1", "1", "1", "1~
## $ `BUILDING CLASS AT TIME OF SALE` <chr> "T1", "A5", "A5", "A5", "A5", "A5", "~
## $ `SALE PRICE` <dbl> 4121000, 584569, 800000, 300000, 3600~
## $ `SALE DATE` <dttm> 2020-07-16, 2020-08-28, 2021-01-11, ~
statenisland = read_excel("data/rollingsales_statenisland.xls", skip=4)
statenisland %>% glimpse()
## Rows: 6,483
## Columns: 21
## $ BOROUGH <chr> "5", "5", "5", "5", "5", "5", "5", "5~
## $ NEIGHBORHOOD <chr> "ANNADALE", "ANNADALE", "ANNADALE", "~
## $ `BUILDING CLASS CATEGORY` <chr> "01 ONE FAMILY DWELLINGS", "01 ONE FA~
## $ `TAX CLASS AT PRESENT` <chr> "1", "1", "1", "1", "1", "1", "1", "1~
## $ BLOCK <dbl> 5404, 5407, 5425, 6205, 6205, 6205, 6~
## $ LOT <dbl> 46, 6, 31, 46, 83, 115, 124, 145, 16,~
## $ `EASE-MENT` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ `BUILDING CLASS AT PRESENT` <chr> "A1", "A1", "A1", "A5", "A1", "A5", "~
## $ ADDRESS <chr> "119 LENZIE STREET", "118 ELMBANK STR~
## $ `APARTMENT NUMBER` <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ `ZIP CODE` <dbl> 10312, 10312, 10312, 10312, 10312, 10~
## $ `RESIDENTIAL UNITS` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ `COMMERCIAL UNITS` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ `TOTAL UNITS` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ `LAND SQUARE FEET` <dbl> 8000, 6200, 2500, 2243, 3989, 2475, 2~
## $ `GROSS SQUARE FEET` <dbl> 3660, 2580, 1222, 1373, 2300, 2495, 2~
## $ `YEAR BUILT` <dbl> 1998, 1975, 1925, 1986, 2004, 1986, 1~
## $ `TAX CLASS AT TIME OF SALE` <chr> "1", "1", "1", "1", "1", "1", "1", "1~
## $ `BUILDING CLASS AT TIME OF SALE` <chr> "A1", "A1", "A2", "A5", "A1", "A5", "~
## $ `SALE PRICE` <dbl> 315000, 450000, 525000, 455000, 72000~
## $ `SALE DATE` <dttm> 2020-10-02, 2020-06-24, 2020-07-02, ~
# Unión de datos
NYC_propiedades_venta = bind_rows(brooklyn,
manhattan,
queens,
statenisland,
brooklyn)
NYC_propiedades_venta %>% glimpse()
## Rows: 78,106
## Columns: 21
## $ BOROUGH <chr> "3", "3", "3", "3", "3", "3", "3", "3~
## $ NEIGHBORHOOD <chr> "BATH BEACH", "BATH BEACH", "BATH BEA~
## $ `BUILDING CLASS CATEGORY` <chr> "01 ONE FAMILY DWELLINGS", "01 ONE FA~
## $ `TAX CLASS AT PRESENT` <chr> "1", "1", "1", "1", "1", "1", "1", "1~
## $ BLOCK <dbl> 6359, 6360, 6362, 6367, 6371, 6380, 6~
## $ LOT <dbl> 70, 56, 23, 24, 60, 73, 115, 13, 40, ~
## $ `EASE-MENT` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ `BUILDING CLASS AT PRESENT` <chr> "S1", "A5", "A9", "A9", "A9", "S1", "~
## $ ADDRESS <chr> "8684 15TH AVENUE", "30 BAY 10TH STRE~
## $ `APARTMENT NUMBER` <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ `ZIP CODE` <dbl> 11228, 11228, 11228, 11214, 11214, 11~
## $ `RESIDENTIAL UNITS` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ `COMMERCIAL UNITS` <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0~
## $ `TOTAL UNITS` <dbl> 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1~
## $ `LAND SQUARE FEET` <dbl> 1933, 1547, 2280, 1571, 2417, 1740, 1~
## $ `GROSS SQUARE FEET` <dbl> 4080, 1428, 1052, 1456, 2106, 2124, 1~
## $ `YEAR BUILT` <dbl> 1930, 1930, 1901, 1935, 1930, 1960, 1~
## $ `TAX CLASS AT TIME OF SALE` <chr> "1", "1", "1", "1", "1", "1", "1", "1~
## $ `BUILDING CLASS AT TIME OF SALE` <chr> "S1", "A5", "A9", "A9", "A9", "S1", "~
## $ `SALE PRICE` <dbl> 1300000, 75000, 0, 830000, 1188000, 9~
## $ `SALE DATE` <dttm> 2020-04-28, 2020-11-30, 2020-11-04, ~
# Limpieza de nombres
NYC_propiedades_venta = NYC_propiedades_venta %>% clean_names(case = "all_caps")
# Vistazo a la base final
NYC_propiedades_venta %>% glimpse()
## Rows: 78,106
## Columns: 21
## $ BOROUGH <chr> "3", "3", "3", "3", "3", "3", "3", "3",~
## $ NEIGHBORHOOD <chr> "BATH BEACH", "BATH BEACH", "BATH BEACH~
## $ BUILDING_CLASS_CATEGORY <chr> "01 ONE FAMILY DWELLINGS", "01 ONE FAMI~
## $ TAX_CLASS_AT_PRESENT <chr> "1", "1", "1", "1", "1", "1", "1", "1",~
## $ BLOCK <dbl> 6359, 6360, 6362, 6367, 6371, 6380, 639~
## $ LOT <dbl> 70, 56, 23, 24, 60, 73, 115, 13, 40, 38~
## $ EASE_MENT <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ BUILDING_CLASS_AT_PRESENT <chr> "S1", "A5", "A9", "A9", "A9", "S1", "A5~
## $ ADDRESS <chr> "8684 15TH AVENUE", "30 BAY 10TH STREET~
## $ APARTMENT_NUMBER <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ ZIP_CODE <dbl> 11228, 11228, 11228, 11214, 11214, 1121~
## $ RESIDENTIAL_UNITS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ COMMERCIAL_UNITS <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ~
## $ TOTAL_UNITS <dbl> 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, ~
## $ LAND_SQUARE_FEET <dbl> 1933, 1547, 2280, 1571, 2417, 1740, 144~
## $ GROSS_SQUARE_FEET <dbl> 4080, 1428, 1052, 1456, 2106, 2124, 128~
## $ YEAR_BUILT <dbl> 1930, 1930, 1901, 1935, 1930, 1960, 194~
## $ TAX_CLASS_AT_TIME_OF_SALE <chr> "1", "1", "1", "1", "1", "1", "1", "1",~
## $ BUILDING_CLASS_AT_TIME_OF_SALE <chr> "S1", "A5", "A9", "A9", "A9", "S1", "A5~
## $ SALE_PRICE <dbl> 1300000, 75000, 0, 830000, 1188000, 990~
## $ SALE_DATE <dttm> 2020-04-28, 2020-11-30, 2020-11-04, 20~
# Descripción del tipo de variables
NYC_propiedades_venta %>% vis_dat(warn_large_data = F)
# Datos perdidos
NYC_propiedades_venta %>% vis_miss(warn_large_data = F)
# Eliminación de variables perdidas
NYC_propiedades_venta = NYC_propiedades_venta %>%
select(-EASE_MENT)
# Correlaciones entre variables
NYC_propiedades_venta %>%
select(where(is.numeric)) %>% vis_cor()
# Patrones en datos perdidos
NYC_propiedades_venta %>% gg_miss_upset()
# Transformación columnas a filas
NYC_propiedades_venta %>%
group_by(NEIGHBORHOOD) %>%
summarise(TOTAL_UNITS = sum(TOTAL_UNITS, na.rm = T),
RESIDENTIAL_UNITS = sum(RESIDENTIAL_UNITS, na.rm = T),
COMMERCIAL_UNITS = sum(COMMERCIAL_UNITS, na.rm = T)) %>%
head()
## # A tibble: 6 x 4
## NEIGHBORHOOD TOTAL_UNITS RESIDENTIAL_UNITS COMMERCIAL_UNITS
## <chr> <dbl> <dbl> <dbl>
## 1 AIRPORT JFK 0 0 0
## 2 AIRPORT LA GUARDIA 2 2 0
## 3 ALPHABET CITY 346 325 21
## 4 ANNADALE 192 190 2
## 5 ARDEN HEIGHTS 280 279 1
## 6 ARROCHAR 41 39 2
NYC_propiedades_venta %>%
group_by(NEIGHBORHOOD) %>%
summarise(TOTAL_UNITS = sum(TOTAL_UNITS, na.rm = T),
RESIDENTIAL_UNITS = sum(RESIDENTIAL_UNITS, na.rm = T),
COMMERCIAL_UNITS = sum(COMMERCIAL_UNITS, na.rm = T)) %>%
pivot_longer(cols = -c(NEIGHBORHOOD), names_to = "UNIT_TYPE", values_to = "NUMBER_UNITS") %>%
head()
## # A tibble: 6 x 3
## NEIGHBORHOOD UNIT_TYPE NUMBER_UNITS
## <chr> <chr> <dbl>
## 1 AIRPORT JFK TOTAL_UNITS 0
## 2 AIRPORT JFK RESIDENTIAL_UNITS 0
## 3 AIRPORT JFK COMMERCIAL_UNITS 0
## 4 AIRPORT LA GUARDIA TOTAL_UNITS 2
## 5 AIRPORT LA GUARDIA RESIDENTIAL_UNITS 2
## 6 AIRPORT LA GUARDIA COMMERCIAL_UNITS 0
# Transformación filas a columnas
NYC_propiedades_venta %>%
group_by(NEIGHBORHOOD) %>%
summarise(TOTAL_UNITS = sum(TOTAL_UNITS, na.rm = T),
RESIDENTIAL_UNITS = sum(RESIDENTIAL_UNITS, na.rm = T),
COMMERCIAL_UNITS = sum(COMMERCIAL_UNITS, na.rm = T)) %>%
pivot_longer(cols = -c(NEIGHBORHOOD), names_to = "UNIT_TYPE", values_to = "NUMBER_UNITS") %>%
pivot_wider(names_from = c(UNIT_TYPE), values_from = c(NUMBER_UNITS), values_fill = 0) %>%
head()
## # A tibble: 6 x 4
## NEIGHBORHOOD TOTAL_UNITS RESIDENTIAL_UNITS COMMERCIAL_UNITS
## <chr> <dbl> <dbl> <dbl>
## 1 AIRPORT JFK 0 0 0
## 2 AIRPORT LA GUARDIA 2 2 0
## 3 ALPHABET CITY 346 325 21
## 4 ANNADALE 192 190 2
## 5 ARDEN HEIGHTS 280 279 1
## 6 ARROCHAR 41 39 2
Existen los siguientes tipos de analítica y de aprendizaje:
**** Campos de aplicación de la detección de anomalías:
****
****
Más información aquí
library(qicharts) # Isolation forest
## qicharts will no longer be maintained. Please consider moving to qicharts2: https://anhoej.github.io/qicharts2/.
# fijamos la semilla
set.seed(7)
# creamos un vector aleatorio con valores que siguen distribucion normal
y <- rnorm(24)
# graficamos
qic(y, chart = 'i')
# Introducimos un punto atípico en el punto 18
y[18] <- 5
# graficamos
qic(y, chart = 'i')
# fijamos parametros
m.beds <- 300
m.stay <- 4
m.days <- m.beds * 7
m.discharges <- m.days / m.stay
p.pu <- 0.08
# simulacion datos
discharges <- rpois(24, lambda = m.discharges)
patientdays <- round(rnorm(24, mean = m.days, sd = 100))
n.pu <- rpois(24, lambda = m.discharges * p.pu * 1.5)
n.pat.pu <- rbinom(24, size = discharges, prob = p.pu)
week <- seq(as.Date('2014-1-1'),
length.out = 24,
by = 'week')
# consolidamoa las tablas en una sola
d <- data.frame(week, discharges, patientdays,n.pu, n.pat.pu)
d
## week discharges patientdays n.pu n.pat.pu
## 1 2014-01-01 554 2271 58 47
## 2 2014-01-08 529 2172 68 39
## 3 2014-01-15 542 2148 70 31
## 4 2014-01-22 538 1943 71 29
## 5 2014-01-29 502 2132 66 37
## 6 2014-02-05 511 2117 63 42
## 7 2014-02-12 541 2010 69 49
## 8 2014-02-19 527 2108 61 47
## 9 2014-02-26 523 2116 69 37
## 10 2014-03-05 515 2154 67 51
## 11 2014-03-12 530 2170 67 41
## 12 2014-03-19 532 2132 62 42
## 13 2014-03-26 528 2211 79 53
## 14 2014-04-02 521 2177 58 47
## 15 2014-04-09 552 2215 67 51
## 16 2014-04-16 508 2226 64 35
## 17 2014-04-23 492 2170 69 35
## 18 2014-04-30 516 2143 58 28
## 19 2014-05-07 470 2008 67 41
## 20 2014-05-14 538 2038 61 51
## 21 2014-05-21 527 2013 73 51
## 22 2014-05-28 546 1936 60 41
## 23 2014-06-04 518 1967 67 43
## 24 2014-06-11 518 2011 62 28
# conteo del numero de atipicos
qic(n.pu,
x = week,
data = d,
chart = 'c',
main = 'Úlceras por presión adquiridas en el hospital (gráfico C)',
ylab = 'Número',
xlab = 'Semana')
# conteo de la tasa de atipicos
qic(n.pu,
n = patientdays,
x = week,
data = d,
chart = 'u',
multiply = 1000,
main = 'Úlceras por presión adquiridas en el hospital (gráfico U)',
ylab = 'Recuento por 1000 días de pacientes',
xlab = 'Semana')
# proporcion de atipicos
qic(n.pat.pu,
n = discharges,
x = week,
data = d,
chart = 'p',
multiply = 100,
main = 'Úlceras por presión adquiridas en el hospital (gráfico P)',
ylab = '% Pacientes',
xlab = 'Semana')
Isolation forest.- El bosque de aislamiento es un método de detección de anomalías. Más información aquí
library(solitude) # Isolation forest
# Modelo isolation forest
isoforest = isolationForest$new(sample_size = as.integer(nrow(NYC_propiedades_venta)/2),
num_trees = 500,
replace = TRUE,
seed = 123)
isoforest$fit(dataset = NYC_propiedades_venta %>% select(SALE_PRICE, GROSS_SQUARE_FEET) %>% na.omit())
## INFO [07:53:34.883] dataset has duplicated rows
## INFO [07:53:34.941] Building Isolation Forest ...
## INFO [07:53:40.716] done
## INFO [07:53:40.717] Computing depth of terminal nodes ...
## INFO [07:53:42.340] done
## INFO [07:53:48.635] Completed growing isolation forest
predicciones = isoforest$predict(data = NYC_propiedades_venta %>% select(SALE_PRICE, GROSS_SQUARE_FEET) %>% na.omit())
head(predicciones)
## id average_depth anomaly_score
## 1: 1 16.000 0.5790718
## 2: 2 15.922 0.5806161
## 3: 3 15.982 0.5794278
## 4: 4 16.000 0.5790718
## 5: 5 16.000 0.5790718
## 6: 6 16.000 0.5790718
ggplot(data = predicciones, aes(x = average_depth)) +
geom_histogram(color = "gray40") +
geom_vline(
xintercept = quantile(predicciones$average_depth, seq(0, 1, 0.1)),
color = "red",
linetype = "dashed") +
labs(
title = "Distribución de las distancias medias del Isolation Forest",
subtitle = "Cuantiles marcados en rojo" ) +
theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Reducción de dimensión (columnas).
FactoMineR es un paquete R dedicado al análisis de datos exploratorios multivariados. Más información aquí
El paquete factoextra tiene métodos flexibles y fáciles de usar para extraer rápidamente, en un formato de datos estándar legible por humanos. Produce una elegante visualización de datos basada en ggplot2 con menos escritura.
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
pca_res = PCA(NYC_propiedades_venta %>% select(where(is.numeric)), scale.unit = T)
pca_res %>% fviz_screeplot()
pca_res %>% fviz_pca_biplot()
pca_res = PCA(NYC_propiedades_venta %>%
slice(-43798) %>%
select(where(is.numeric)), scale.unit = T)
pca_res %>% fviz_screeplot()
pca_res %>% fviz_pca_biplot()
NYC_propiedades_venta_imputed = NYC_propiedades_venta %>%
select(where(is.numeric)) %>%
impute_mean_all()
# Datos perdidos
NYC_propiedades_venta_imputed %>% vis_miss(warn_large_data = F)
NYC_propiedades_venta_imputed = NYC_propiedades_venta %>%
select(where(is.numeric)) %>%
impute_median_all()
# Datos perdidos
NYC_propiedades_venta_imputed %>% vis_miss(warn_large_data = F)