Pregunta de investigación
¿Cómo puede este análisis ayudar al trabajo policial?
Estadísticas descriptivas
# Descriptive overview of every column in the dataset
summary(object = chicagoCrime2020)
## ...1 id case_number
## Min. : 24279 Min. : 24889 Length:91318
## 1st Qu.:1781055 1st Qu.:11974960 Class :character
## Median :3564740 Median :12010619 Mode :character
## Mean :3574290 Mean :11973896
## 3rd Qu.:5376446 3rd Qu.:12046081
## Max. :7135304 Max. :12085764
##
## date block iucr
## Min. :2020-01-01 00:00:00.00 Length:91318 Length:91318
## 1st Qu.:2020-02-05 18:49:15.00 Class :character Class :character
## Median :2020-03-14 12:07:30.00 Mode :character Mode :character
## Mean :2020-03-20 17:37:40.25
## 3rd Qu.:2020-05-05 15:33:45.00
## Max. :2020-06-16 00:00:00.00
##
## primary_type description location_description arrest
## Length:91318 Length:91318 Length:91318 Mode :logical
## Class :character Class :character Class :character FALSE:75086
## Mode :character Mode :character Mode :character TRUE :16232
##
##
##
##
## domestic beat district ward community_area
## Mode :logical Min. : 111 Min. : 1.00 Min. : 1.00 Min. : 1.00
## FALSE:74227 1st Qu.: 612 1st Qu.: 6.00 1st Qu.:10.00 1st Qu.:23.00
## TRUE :17091 Median :1021 Median :10.00 Median :23.00 Median :32.00
## Mean :1136 Mean :11.13 Mean :22.86 Mean :37.61
## 3rd Qu.:1654 3rd Qu.:16.00 3rd Qu.:34.00 3rd Qu.:56.00
## Max. :2535 Max. :31.00 Max. :50.00 Max. :77.00
## NA's :2
## fbi_code x_coordinate y_coordinate year
## Length:91318 Min. :1092647 Min. :1813897 Min. :2020
## Class :character 1st Qu.:1152959 1st Qu.:1857892 1st Qu.:2020
## Mode :character Median :1166541 Median :1890260 Median :2020
## Mean :1164941 Mean :1884924 Mean :2020
## 3rd Qu.:1176618 3rd Qu.:1907957 3rd Qu.:2020
## Max. :1205112 Max. :1951507 Max. :2020
## NA's :751 NA's :751
## updated_on latitude longitude
## Min. :2020-01-08 15:49:48.00 Min. :41.64 Min. :-87.93
## 1st Qu.:2020-02-15 15:48:50.00 1st Qu.:41.77 1st Qu.:-87.71
## Median :2020-03-25 15:45:43.00 Median :41.85 Median :-87.66
## Mean :2020-03-30 21:16:08.47 Mean :41.84 Mean :-87.67
## 3rd Qu.:2020-05-15 15:50:24.00 3rd Qu.:41.90 3rd Qu.:-87.63
## Max. :2020-06-23 15:47:15.00 Max. :42.02 Max. :-87.52
## NA's :751 NA's :751
## location hash point_date
## Length:91318 Min. :2020-01-01 00:00:00.00 Mode:logical
## Class :character 1st Qu.:2020-02-05 18:49:15.00 NA's:91318
## Mode :character Median :2020-03-14 12:07:30.00
## Mean :2020-03-20 17:37:40.25
## 3rd Qu.:2020-05-05 15:33:45.00
## Max. :2020-06-16 00:00:00.00
##
## geom
## Mode:logical
## NA's:91318
##
##
##
##
##
Limpieza de datos
# Drop the first column (it appears as "...1" in the summary output)
chicagoCrime2020 <- select(chicagoCrime2020, -1)
# Drop "point_date" and "geom", which hold no information
chicagoCrime2020 <- chicagoCrime2020 %>%
  select(-any_of(c("point_date", "geom")))
# Data cleaning: count the missing values left in the dataset
chicagoCrime2020 %>% is.na() %>% sum()
## [1] 4216
# Remove rows containing NA values, then remove duplicated records
chicagoCrime2020 <- distinct(na.omit(chicagoCrime2020))
# Split "date" into a calendar day ("day") and a clock time ("Time"), placed
# where "date" used to be.
# Both pieces are built with explicit format() patterns rather than with
# separate(date, sep = " "): separate() depends on the implicit as.character()
# conversion of the date-time column, and since R 4.3 that conversion omits
# the time part for values at exactly midnight, which would leave "Time" as NA
# for those records.
# Assumes "date" is a date-time (POSIXct) column, as the summary output shows.
chicagoCrime2020 <- chicagoCrime2020 %>%
  mutate(
    day = format(date, "%Y-%m-%d"),
    Time = format(date, "%H:%M:%S"),
    .after = date
  ) %>%
  select(-date)
str(chicagoCrime2020$day)
## chr [1:90149] "2020-01-01" "2020-01-02" "2020-01-02" "2020-01-02" ...
# Convert "day" from text to Date
chicagoCrime2020$day <- as.Date(chicagoCrime2020$day, format = "%Y-%m-%d")
# Split "block" by position: the first 6 characters become "block_num" and
# the remainder stays in "block"
chicagoCrime2020 <- separate(
  chicagoCrime2020,
  col = block,
  into = c("block_num", "block"),
  sep = 6
)
# Count missing values again after the transformations
chicagoCrime2020 %>% is.na() %>% sum()
## [1] 0
Análisis
¿En qué mes se cometen más delitos?
# Tally the cases falling in each calendar month (keyed as "YYYY-MM")
casos_mes <- chicagoCrime2020 %>%
  group_by(month = format(day, format = "%Y-%m")) %>%
  summarise(total_cases = n())
# Bar chart: one bar per month, shaded and labelled with its case count
ggplot(
  data = casos_mes,
  mapping = aes(x = month, y = total_cases, fill = total_cases)
) +
  geom_col() +
  geom_text(
    aes(label = total_cases),
    vjust = -0.5,
    color = "black",
    size = 3.0
  ) +
  scale_fill_gradient(low = "grey", high = "seagreen") +
  labs(
    title = "Número de Casos por Mes",
    x = "Mes",
    y = "Total de Casos"
  ) +
  theme_minimal()

¿Qué día de la semana presenta mayor ocurrencia de delitos?
# Day-of-week analysis, drawn as a radar chart.
#
# Weekday names come from a fixed Spanish lookup indexed by the numeric
# weekday (as.POSIXlt()$wday: 0 = Sunday ... 6 = Saturday). Base format() has
# no `locale` argument, so format(day, "%A") yields names in the session's
# LC_TIME locale; on a non-Spanish session they would not match the Spanish
# ordering used below.
dias_semana_es <- c(
  "domingo", "lunes", "martes", "miércoles", "jueves", "viernes", "sábado"
)
chicagoCrime2020 <- chicagoCrime2020 %>%
  mutate(dia_semana = dias_semana_es[as.POSIXlt(day)$wday + 1L])
# Cases per weekday, ordered Sunday to Saturday
casos_dia <- chicagoCrime2020 %>%
  group_by(dia_semana) %>%
  summarise(total_casos = n()) %>%
  arrange(match(dia_semana, dias_semana_es))
# Store the weekday as an ordered factor with that same ordering
casos_dia$dia_semana <- factor(
  casos_dia$dia_semana,
  levels = dias_semana_es,
  ordered = TRUE
)
# radarchart() reads one column per axis, with the axis maximum in row 1, the
# axis minimum in row 2 and the observed values from row 3 onwards.
eje_max <- 14000
eje_min <- 11000
segmentos <- 4
data <- as.data.frame(t(casos_dia$total_casos))
colnames(data) <- as.character(casos_dia$dia_semana)
data <- rbind(rep(eje_max, ncol(data)), rep(eje_min, ncol(data)), data)
# The centre-axis labels are spread evenly from the minimum to the maximum
# (segmentos + 1 rings) so that the outer ring is labelled with the real
# axis maximum.
radarchart(
  data,
  axistype = 1,
  seg = segmentos,
  pcol = "blue",
  pfcol = rgb(0.2, 0.5, 0.5, 0.5),
  plwd = 2,
  cglcol = "grey",
  cglty = 1,
  axislabcol = "black",
  caxislabels = seq(eje_min, eje_max, length.out = segmentos + 1),
  cglwd = 0.8
)

¿Qué hora del día presenta mayor ocurrencia de delitos?
str(chicagoCrime2020$Time)
## chr [1:90149] "04:00:00" "01:33:00" "14:00:00" "23:00:00" "03:55:00" ...
# Parse the clock time. as.POSIXct() attaches the current date to a time-only
# string; the zone is fixed to UTC so that no time of day can fall inside a
# daylight-saving gap of the local zone (such a time would parse as NA).
chicagoCrime2020$Time <- as.POSIXct(
  chicagoCrime2020$Time,
  format = "%H:%M:%S",
  tz = "UTC"
)
# Bucket every record into the one-hour interval it falls in
chicagoCrime2020 <- chicagoCrime2020 %>%
  mutate(hour_range = cut(as.POSIXlt(Time), breaks = "1 hour"))
# Occurrences per one-hour interval
resumen_horas <- chicagoCrime2020 %>%
  group_by(hour_range) %>%
  summarise(total_ocurrencias = n())
# Bar chart of occurrences per hour.
# Axis labels are read straight from the level text ("YYYY-MM-DD HH:MM...")
# instead of re-parsing it as a local date-time; a level printed without a
# time part is taken to be midnight.
ggplot(
  resumen_horas,
  aes(x = hour_range, y = total_ocurrencias, fill = total_ocurrencias)
) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = total_ocurrencias),
    vjust = -0.5,
    color = "black",
    size = 3.0
  ) +
  labs(
    title = "Ocurrencias de delitos por hora",
    x = "Horas del día",
    y = "Número de Ocurrencias"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_x_discrete(
    labels = function(x) ifelse(nchar(x) > 10, substr(x, 12, 16), "00:00")
  ) +
  scale_fill_gradient(low = "grey", high = "seagreen") +
  theme(legend.position = "right")

¿Cuál es el delito más cometido en la ciudad?
# Count the cases recorded for each primary crime type
casos_tipo <- summarise(
  group_by(chicagoCrime2020, primary_type),
  total_cases = n()
)
# Horizontal bar chart, crime types sorted by their case count
ggplot(
  data = casos_tipo,
  mapping = aes(
    x = total_cases,
    y = reorder(primary_type, total_cases),
    fill = total_cases
  )
) +
  geom_col() +
  geom_text(
    aes(label = total_cases),
    vjust = 0.5,
    color = "black",
    size = 3.0
  ) +
  scale_fill_gradient(low = "grey", high = "seagreen") +
  labs(
    title = "Número de casos por tipo de crimen",
    x = "Total de Casos",
    y = "Tipo"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Monthly trend of "BATTERY" incidents
# Day / crime-type pairs; the per-type trend charts all filter this table
datos_linea <- chicagoCrime2020[, c("day", "primary_type")]
# Keep only BATTERY records and tag each with its month number.
# "day" is already a Date at this point, so it is not converted again.
datos_battery <- datos_linea %>%
  filter(primary_type == "BATTERY") %>%
  mutate(month = month(day))
# Incidents per month
resumen_mes2 <- datos_battery %>%
  group_by(month) %>%
  summarise(total_battery = n())
# Line chart with a point and a count label on every month; the y axis starts
# at 0 and leaves a little headroom above the highest count
ggplot(resumen_mes2, aes(x = month, y = total_battery)) +
  geom_line(arrow = arrow(), color = "seagreen", lwd = 1, linetype = 1) +
  geom_point(size = 2, shape = 21, fill = "white", colour = "darkolivegreen") +
  geom_text(aes(label = total_battery), vjust = -0.5, hjust = 0.5, size = 3) +
  labs(
    title = "Tendencia de BATTERY por mes",
    x = "Mes",
    y = "Número de incidentes"
  ) +
  theme_minimal() +
  scale_y_continuous(limits = c(0, max(resumen_mes2$total_battery) + 10))

# Monthly trend of "THEFT" incidents
# Keep only THEFT records and tag each with its month number
datos_thefts <- datos_linea[datos_linea$primary_type == "THEFT", ] %>%
  mutate(day = as.Date(day), month = month(day))
# Thefts per month
resumen_mes <- summarise(group_by(datos_thefts, month), total_thefts = n())
# Line chart with a point and a count label on every month
ggplot(data = resumen_mes, mapping = aes(x = month, y = total_thefts)) +
  geom_line(arrow = arrow(), color = "seagreen", lwd = 1, linetype = 1) +
  geom_point(size = 2, shape = 21, fill = "white", colour = "darkolivegreen") +
  geom_text(aes(label = total_thefts), vjust = -0.5, hjust = 0.5, size = 3) +
  scale_y_continuous(limits = c(0, max(resumen_mes$total_thefts) + 10)) +
  labs(
    title = "Tendencia de Robo por mes",
    x = "Mes",
    y = "Número de Robos"
  ) +
  theme_minimal()

# Monthly trend of "CRIMINAL DAMAGE" incidents
# Keep only CRIMINAL DAMAGE records and tag each with its month number
datos_damage <- mutate(
  datos_linea[datos_linea$primary_type == "CRIMINAL DAMAGE", ],
  day = as.Date(day),
  month = month(day)
)
# Incidents per month
resumen_mes3 <- datos_damage %>%
  group_by(month) %>%
  summarise(total_damage = n())
# Line chart with a point and a count label on every month
ggplot(data = resumen_mes3, mapping = aes(x = month, y = total_damage)) +
  geom_line(arrow = arrow(), color = "seagreen", lwd = 1, linetype = 1) +
  geom_point(size = 2, shape = 21, fill = "white", colour = "darkolivegreen") +
  geom_text(aes(label = total_damage), vjust = -0.5, hjust = 0.5, size = 3) +
  scale_y_continuous(limits = c(0, max(resumen_mes3$total_damage) + 10)) +
  labs(
    title = "Tendencia de CRIMINAL DAMAGE por mes",
    x = "Mes",
    y = "Número de incidentes"
  ) +
  theme_minimal()

¿Cuántos de los delitos cometidos en la ciudad terminan en arresto?
# Count the cases for each value of the "arrest" flag, largest count first
arrestos <- chicagoCrime2020 %>%
  group_by(arrest) %>%
  summarise(total_cases = n()) %>%
  arrange(desc(total_cases))
# Bar chart, bars ordered from the largest to the smallest count
ggplot(
  data = arrestos,
  mapping = aes(
    x = reorder(arrest, -total_cases),
    y = total_cases,
    fill = total_cases
  )
) +
  geom_col() +
  geom_text(
    aes(label = total_cases),
    vjust = -0.5,
    color = "black",
    size = 3.0
  ) +
  scale_fill_gradient(low = "grey", high = "seagreen") +
  labs(
    title = "Número de casos con arresto",
    x = "Arresto efectivo",
    y = "Total de Casos"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

¿En qué lugares de Chicago se presentan los delitos?
# Count the cases per incident location type, largest count first
casos_lugar <- chicagoCrime2020 %>%
  group_by(location_description) %>%
  summarise(total_cases = n()) %>%
  arrange(desc(total_cases))
# Keep the 15 location types with the most cases
lugar_top15 <- head(casos_lugar, 15)
# Horizontal bar chart, location types sorted by their case count
ggplot(
  data = lugar_top15,
  mapping = aes(
    x = total_cases,
    y = reorder(location_description, total_cases),
    fill = total_cases
  )
) +
  geom_col() +
  geom_text(
    aes(label = total_cases),
    vjust = 0.5,
    color = "black",
    size = 2.5
  ) +
  scale_fill_gradient(low = "grey", high = "seagreen") +
  labs(
    title = "Número de casos por lugar de incidencia",
    x = "Total de Casos",
    y = "Lugar"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

¿En qué áreas de Chicago se necesita más la presencia policial?
# Count the cases per block, largest count first
casos_bloque <- summarise(
  group_by(chicagoCrime2020, block),
  total_cases = n()
)
casos_bloque <- arrange(casos_bloque, desc(total_cases))
# Keep the 15 blocks with the most cases
bloque_top15 <- head(casos_bloque, 15)
# Horizontal bar chart, blocks sorted by their case count
ggplot(
  data = bloque_top15,
  mapping = aes(
    x = total_cases,
    y = reorder(block, total_cases),
    fill = total_cases
  )
) +
  geom_col() +
  geom_text(
    aes(label = total_cases),
    vjust = 0.5,
    color = "black",
    size = 3.0
  ) +
  scale_fill_gradient(low = "grey", high = "seagreen") +
  labs(
    title = "Número de casos por bloque",
    x = "Total de Casos",
    y = "Bloque"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1))

¿En qué distritos de Chicago se necesita más la presencia policial?
# Cross-tabulate district against crime type and reshape the table into long
# form (district, primary_type, Freq) for plotting
datos_heatmap <- select(chicagoCrime2020, district, primary_type)
frecuencia_combinaciones <- table(datos_heatmap)
df_heatmap <- as.data.frame(frecuencia_combinaciones)
# Heatmap: one tile per district / crime-type pair, shaded by its frequency
ggplot(
  data = df_heatmap,
  mapping = aes(x = district, y = primary_type, fill = Freq)
) +
  geom_tile() +
  scale_fill_gradient(low = "azure2", high = "blue") +
  theme_minimal() +
  labs(
    title = "Heatmap de Frecuencia de Combinaciones",
    x = "Distrito",
    y = "Tipo de delito",
    fill = "Cantidad de casos"
  )

Distribución de arrestos por tipo de delito
# Treemap of crime types, each split by whether an arrest was made.
# The tile area has to be the number of cases in each (primary_type, arrest)
# pair. Using a per-location_description row count as vSize would make the
# areas add up location frequencies instead, so the sizes come from a summary
# with one row per pair. The summary lives in its own table, leaving the main
# dataset unchanged.
arrestos_por_tipo <- chicagoCrime2020 %>%
  count(primary_type, arrest, name = "total_cases")
treemap(
  arrestos_por_tipo,
  index = c("primary_type", "arrest"),
  vSize = "total_cases",
  title = "Distribucion de arrestos por delito",
  border.col = "black",
  palette = "PRGn",
  type = "index"
)

Wordcloud de la descripción del delito
# Count the cases for each crime description
casos_descrip <- summarise(
  group_by(chicagoCrime2020, description),
  total_cases = n()
)
# Word cloud built from the description / count table
wordcloud2(casos_descrip)