## R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

When you click the **Knit** button, a document will be generated that includes both content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Print summary statistics for the `cars` data set; the lines starting with
# `##` below are the echoed output of the call (columns `speed` and `dist`).
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

## Including Plots

You can also embed plots, for example:

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# Packages used by the analysis. Each library() call sits on its own line:
# a call written after a `#` on the same line is part of the comment and the
# package is never attached.
library(readxl)    # read tables from Excel workbooks
library(tidyverse) # data-frame manipulation and plotting
library(janitor)   # clean_names()
library(visdat)    # preliminary exploratory data visualisations
library(naniar)    # visualisation and treatment of missing data
library(GGally)    # ggpairs()

# Load the Brooklyn rolling-sales workbook. `skip = 4` ignores the first four
# rows of the sheet (presumably a title banner above the header row -- confirm
# against the file).
brooklyn <- read_excel("data/rollingsales_brooklyn.xls", skip = 4)

# Column names, types and a preview of the values.
brooklyn %>% glimpse()

# First rows of the table.
brooklyn %>% head()

# select(): keep only the BOROUGH and NEIGHBORHOOD columns.
brooklyn %>%
  select(BOROUGH, NEIGHBORHOOD) %>%
  head()

# filter(): keep only the rows whose LOT equals 70.
brooklyn %>%
  filter(LOT == 70) %>%
  head()

# case_when() demo on the integers 1..50. Conditions are tried top to bottom
# and the first one that holds decides the result for each element.
x <- 1:50

# Fallback branch keeps the original value, converted to character so that it
# has the same type as the other right-hand sides.
case_when(
  x %% 35 == 0 ~ "fizz buzz",
  x %% 5 == 0 ~ "fizz",
  x %% 7 == 0 ~ "buzz",
  TRUE ~ as.character(x)
)

# Every right-hand side must have the same type. The TRUE condition catches
# the elements that satisfy none of the previous conditions.
case_when(
  x %% 35 == 0 ~ "fizz buzz",
  x %% 5 == 0 ~ "fizz",
  x %% 7 == 0 ~ "buzz",
  TRUE ~ "0"
)

# With FALSE as the last condition the branch never matches, so elements that
# satisfy no condition come back as NA.
case_when(
  x %% 35 == 0 ~ "fizz buzz",
  x %% 5 == 0 ~ "fizz",
  x %% 7 == 0 ~ "buzz",
  FALSE ~ "0"
)

# Total units per neighbourhood. `TOTAL UNITS` contains a space, so it is a
# non-syntactic name and must be wrapped in back-ticks. The summary is computed
# once and reused by every slice_*() example below.
total_units_by_neighborhood <- brooklyn %>%
  group_by(NEIGHBORHOOD) %>%
  summarise(`TOTAL UNITS` = sum(`TOTAL UNITS`, na.rm = TRUE))

# First rows of the summary.
total_units_by_neighborhood %>% head()

# Neighbourhood with the largest total.
total_units_by_neighborhood %>% slice_max(order_by = `TOTAL UNITS`, n = 1)

# Neighbourhood with the smallest total.
total_units_by_neighborhood %>% slice_min(order_by = `TOTAL UNITS`, n = 1)

# Three rows drawn at random.
total_units_by_neighborhood %>% slice_sample(n = 3)

# First three rows.
total_units_by_neighborhood %>% slice_head(n = 3)

# Last three rows.
total_units_by_neighborhood %>% slice_tail(n = 3)

# Two per-neighbourhood summaries used to illustrate the join verbs. The
# summed columns have spaces in their names, hence the back-ticks.
sint_1 <- brooklyn %>%
  group_by(NEIGHBORHOOD) %>%
  summarise(`TOTAL UNITS` = sum(`TOTAL UNITS`, na.rm = TRUE))

sint_1

sint_2 <- brooklyn %>%
  group_by(NEIGHBORHOOD) %>%
  summarise(`COMMERCIAL UNITS` = sum(`COMMERCIAL UNITS`, na.rm = TRUE))

sint_2

# NEIGHBORHOOD is the only column the two tables share; naming it in `by`
# makes the join key explicit.

# inner_join(): only neighbourhoods present in both tables. The right-hand
# side is a random sample of two rows, so the two calls can return different
# neighbourhoods.
sint_1 %>% inner_join(sint_2 %>% slice_sample(n = 2), by = "NEIGHBORHOOD")

sint_1 %>% inner_join(sint_2 %>% slice_sample(n = 2), by = "NEIGHBORHOOD")

# left_join(): every row of sint_1; NA where sint_2 has no match.
sint_1 %>% left_join(sint_2 %>% slice_head(n = 2), by = "NEIGHBORHOOD")

# right_join(): every row of the two-row right-hand table.
sint_1 %>% right_join(sint_2 %>% slice_head(n = 2), by = "NEIGHBORHOOD")

# full_join(): every neighbourhood found in either table.
sint_1 %>% full_join(sint_2 %>% slice_head(n = 2), by = "NEIGHBORHOOD")

# Pairwise plot matrix restricted to the numeric columns of `brooklyn`.
ggpairs(select(brooklyn, where(is.numeric)))

# Load the remaining borough workbooks with the same `skip = 4` used for
# Brooklyn, and glimpse each one.
manhattan <- read_excel("data/rollingsales_manhattan.xls", skip = 4)
manhattan %>% glimpse()

queens <- read_excel("data/rollingsales_queens.xls", skip = 4)
queens %>% glimpse()

statenisland <- read_excel("data/rollingsales_statenisland.xls", skip = 4)
statenisland %>% glimpse()

# Stack the borough tables into one. Each table is listed exactly once:
# passing a table twice would duplicate all of its rows and inflate every
# downstream total.
# NOTE(review): no Bronx workbook is loaded in this script -- confirm whether
# it should be part of the combined table.
NYC_propiedades_venta <- bind_rows(brooklyn, manhattan, queens, statenisland)

NYC_propiedades_venta %>% glimpse()

# Standardise the column names to ALL_CAPS with underscores (janitor).
NYC_propiedades_venta <- NYC_propiedades_venta %>%
  clean_names(case = "all_caps")

# Look at the final combined table.
NYC_propiedades_venta %>% glimpse()

# Column types and missing cells at a glance (visdat).
NYC_propiedades_venta %>% vis_dat(warn_large_data = FALSE)

# Missing values per column (visdat).
NYC_propiedades_venta %>% vis_miss(warn_large_data = FALSE)

# Drop the EASE_MENT column (presumably because it carries no usable values --
# confirm against the vis_miss() output above).
NYC_propiedades_venta <- NYC_propiedades_venta %>% select(-EASE_MENT)

# Correlation plot of the numeric columns.
NYC_propiedades_venta %>%
  select(where(is.numeric)) %>%
  vis_cor()

# Upset plot of the combinations of columns that are missing together (naniar).
NYC_propiedades_venta %>% gg_miss_upset()

# Unit counts per neighbourhood, computed once and reused by the reshaping
# examples below.
units_by_neighborhood <- NYC_propiedades_venta %>%
  group_by(NEIGHBORHOOD) %>%
  summarise(
    TOTAL_UNITS = sum(TOTAL_UNITS, na.rm = TRUE),
    RESIDENTIAL_UNITS = sum(RESIDENTIAL_UNITS, na.rm = TRUE),
    COMMERCIAL_UNITS = sum(COMMERCIAL_UNITS, na.rm = TRUE)
  )

units_by_neighborhood %>% head()

# Wide -> long: one row per (NEIGHBORHOOD, UNIT_TYPE) pair.
units_long <- units_by_neighborhood %>%
  pivot_longer(
    cols = -c(NEIGHBORHOOD),
    names_to = "UNIT_TYPE",
    values_to = "NUMBER_UNITS"
  )

units_long %>% head()

# Long -> wide again: one column per unit type, missing combinations set to 0.
units_long %>%
  pivot_wider(
    names_from = c(UNIT_TYPE),
    values_from = c(NUMBER_UNITS),
    values_fill = 0
  ) %>%
  head()

library(qicharts)

# I chart (chart = "i") of 24 standard-normal draws; the seed makes the draws
# reproducible.
set.seed(7)
y <- rnorm(24)
qic(y, chart = "i")

# Overwrite the 18th observation with 5.
y[18] <- 5

# Plot the modified series again.
qic(y, chart = "i")

# Parameters of the simulated hospital data.
m.beds <- 300                       # beds
m.stay <- 4                         # divisor used to turn patient days into discharges
m.days <- m.beds * 7                # patient days per week
m.discharges <- m.days / m.stay     # expected discharges per week
p.pu <- 0.08                        # probability used for the pressure-ulcer counts

# 24 weekly observations. No seed is set here, so the draws continue from the
# current state of the random number generator.
discharges <- rpois(24, lambda = m.discharges)
patientdays <- round(rnorm(24, mean = m.days, sd = 100))
n.pu <- rpois(24, lambda = m.discharges * p.pu * 1.5)
n.pat.pu <- rbinom(24, size = discharges, prob = p.pu)
week <- seq(as.Date("2014-1-1"), length.out = 24, by = "week")

# Collect the series in one data frame (one row per week) and print it.
d <- data.frame(week, discharges, patientdays, n.pu, n.pat.pu)
d

# C chart (chart = "c") of the weekly n.pu counts from `d`.
qic(
  n.pu,
  x = week,
  data = d,
  chart = "c",
  main = "Úlceras por presión adquiridas en el hospital (gráfico C)",
  ylab = "Número",
  xlab = "Semana"
)

# P chart (chart = "p"): n.pat.pu over discharges, multiplied by 100 so the
# axis reads as a percentage.
qic(
  n.pat.pu,
  n = discharges,
  x = week,
  data = d,
  chart = "p",
  multiply = 100,
  main = "Úlceras por presión adquiridas en el hospital (gráfico P)",
  ylab = "% Pacientes",
  xlab = "Semana"
)

library(solitude) # isolation forest

# Rows with both SALE_PRICE and GROSS_SQUARE_FEET present; the same table is
# used to fit the model and to score it.
iso_input <- NYC_propiedades_venta %>%
  select(SALE_PRICE, GROSS_SQUARE_FEET) %>%
  na.omit()

# isolationForest is an R6 generator: build the object with $new(), then call
# its $fit() and $predict() methods.
# NOTE(review): sample_size is derived from the full table while the model is
# fitted on the NA-free subset -- confirm it does not exceed nrow(iso_input).
isoforest <- isolationForest$new(
  sample_size = as.integer(nrow(NYC_propiedades_venta) / 2),
  num_trees = 500,
  replace = TRUE,
  seed = 123
)
isoforest$fit(dataset = iso_input)

predicciones <- isoforest$predict(data = iso_input)
head(predicciones)

# Histogram of the average depth per observation, with the deciles drawn as
# dashed red lines.
ggplot(data = predicciones, aes(x = average_depth)) +
  geom_histogram(color = "gray40") +
  geom_vline(
    xintercept = quantile(predicciones$average_depth, seq(0, 1, 0.1)),
    color = "red",
    linetype = "dashed"
  ) +
  labs(
    title = "Distribución de las distancias medias del Isolation Forest",
    subtitle = "Cuantiles marcados en rojo"
  ) +
  theme_bw()

library(FactoMineR)
library(factoextra)

# PCA on the numeric columns, scaled to unit variance.
pca_res <- PCA(
  NYC_propiedades_venta %>% select(where(is.numeric)),
  scale.unit = TRUE
)

pca_res %>% fviz_screeplot()

pca_res %>% fviz_pca_biplot()

# Same PCA after dropping row 43798.
# NOTE(review): presumably an outlier spotted in the biplot above; the index
# is hard-coded and depends on the row order of the combined table -- confirm.
pca_res <- PCA(
  NYC_propiedades_venta %>%
    slice(-43798) %>%
    select(where(is.numeric)),
  scale.unit = TRUE
)

pca_res %>% fviz_screeplot()

pca_res %>% fviz_pca_biplot()

# Mean imputation of every numeric column (naniar).
NYC_propiedades_venta_imputed <- NYC_propiedades_venta %>%
  select(where(is.numeric)) %>%
  impute_mean_all()

# Missing data after mean imputation.
NYC_propiedades_venta_imputed %>% vis_miss(warn_large_data = FALSE)

# Median imputation of every numeric column; this overwrites the
# mean-imputed table created above.
NYC_propiedades_venta_imputed <- NYC_propiedades_venta %>%
  select(where(is.numeric)) %>%
  impute_median_all()

# Missing data after median imputation.
NYC_propiedades_venta_imputed %>% vis_miss(warn_large_data = FALSE)