Estadistica1_Parcial1_Castaño-Romero-Suaza

library(readr)
library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(ggthemes)
library(gmodels)
library(descr)

## 
## Attaching package: 'descr'

## The following object is masked from 'package:gmodels':
## 
##     CrossTable

Obtener una base de datos por municipios que contenga la cantidad de delitos por temática, el porcentaje de mujeres víctimas y el porcentaje de delitos en zona rural.

# Load the crime dataset from the CSV file into `De`.
De <- read_csv("Delitos en Colombia 2.csv")

## Rows: 569127 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (18): TEMÁTICA, FECHA, DEPARTAMENTO, MUNICIPIO, DIA, BARRIO, ZONA, CLAS...
## dbl   (1): CODIGO DANE
## time  (1): HORA
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Recode victim sex and zone as 0/1 indicators so that a group mean is a share.
# `%in%` is used instead of `==` because `==` propagates NA: a single missing
# SEXO/ZONA value would otherwise turn the whole group's percentage into NA.
# Missing values are therefore coded 0, like any other non-matching label.
De$SEXO <- ifelse(De$SEXO %in% "FEMENINO", 1, 0)
De$ZONA <- ifelse(De$ZONA %in% "RURAL", 1, 0)

# One row per municipality and theme with:
#   conteo - number of records,
#   FEM    - percentage of records whose victim is female,
#   RURAL  - percentage of records located in a rural zone.
# The result stays grouped by MUNICIPIO (default summarise() behaviour).
df <- De %>%
  select(MUNICIPIO, TEMÁTICA, ZONA, SEXO) %>%
  group_by(MUNICIPIO, TEMÁTICA) %>%
  summarise(
    conteo = n(),
    FEM = mean(SEXO) * 100,
    RURAL = mean(ZONA) * 100
  )

## `summarise()` has grouped output by 'MUNICIPIO'. You can override using the
## `.groups` argument.

# Drop the FEM and RURAL columns (positions 4 and 5) and spread the themes
# into columns, leaving one row per municipality with one count per theme.
df1 <- pivot_wider(
  df[, -c(4, 5)],
  names_from = TEMÁTICA,
  values_from = conteo
)

1. Explique si la cantidad de homicidios tiene una correlación alta con la cantidad de robos.

# Keep only municipalities that have both a homicide count and a
# theft-from-persons count, then report their Pearson correlation.
df2 <- filter(df1, complete.cases(HOMICIDIOS, `HURTO A PERSONAS`))

round(cor(df2$HOMICIDIOS, df2$`HURTO A PERSONAS`), 2)

## [1] 0.85

El coeficiente de correlación es 0.85. Esto indica que las variables analizadas (homicidios y hurto a personas) están altamente relacionadas, puesto que cuanto más cerca de 1 está el coeficiente, mayor es la relación lineal entre ellas.

2. Realice un diagrama de barras de los diez municipios con mayor cantidad de delitos por temática y explique.

# Read the crime dataset again into `HF`.
HF <- read_csv("Delitos en Colombia 2.csv")

## Rows: 569127 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (18): TEMÁTICA, FECHA, DEPARTAMENTO, MUNICIPIO, DIA, BARRIO, ZONA, CLAS...
## dbl   (1): CODIGO DANE
## time  (1): HORA
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Total number of records per municipality, keeping the ten largest.
# slice_max() replaces the superseded top_n() and names the ranking column
# explicitly instead of letting it be guessed; ties are kept, as before.
hf <- HF %>%
  count(MUNICIPIO, name = "conteo") %>%
  slice_max(conteo, n = 10)

top <- hf$MUNICIPIO

# Counts per theme (FEM/RURAL columns dropped) for those ten municipalities.
hj <- df[, -c(4, 5)] %>%
  filter(MUNICIPIO %in% top)

# The bars are stacked, so their visible length is the municipality total.
# reorder() defaults to the mean of `conteo`, which can rank municipalities
# differently from their totals; order by the sum instead.
ggplot(
  hj,
  aes(
    fill = TEMÁTICA,
    y = conteo,
    x = reorder(MUNICIPIO, conteo, FUN = sum)
  )
) +
  geom_bar(position = "stack", stat = "identity") +
  labs(y = "Frecuencia") +
  coord_flip() +
  ggtitle("Top 10 municipios con más delitos")

Con el diagrama de barras podemos concluir que los delitos analizados se presentan con mayor frecuencia en los principales municipios del país, los cuales conforman el top 10 de municipios con más delitos. Los delitos son: violencia intrafamiliar, lesiones personales, hurto a residencias, hurto a personas, homicidios, delitos sexuales y amenazas. Bogotá, al ser la capital del país, es el municipio que presenta mayor frecuencia en los delitos mencionados anteriormente.

3. Realice histogramas de la razón de homicidios de hombres sobre homicidios de mujeres.

 # Read the crime dataset again into `CF`.
 CF <- read_csv("Delitos en Colombia 2.csv")

## Rows: 569127 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (18): TEMÁTICA, FECHA, DEPARTAMENTO, MUNICIPIO, DIA, BARRIO, ZONA, CLAS...
## dbl   (1): CODIGO DANE
## time  (1): HORA
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Number of homicide records per municipality and victim sex.
CF <- CF %>%
  select(MUNICIPIO, SEXO, TEMÁTICA) %>%
  filter(TEMÁTICA == "HOMICIDIOS") %>%
  group_by(MUNICIPIO, SEXO, TEMÁTICA) %>%
  summarise(conteo = n())

## `summarise()` has grouped output by 'MUNICIPIO', 'SEXO'. You can override using
## the `.groups` argument.

# Spread the sexes into columns (one row per municipality) and keep only the
# male and female counts. Columns are selected by name rather than by dropping
# "column 5" twice, so the result does not depend on how many other SEXO
# labels exist or on the order in which they appear.
CF <- CF %>%
  pivot_wider(names_from = SEXO, values_from = conteo) %>%
  select(MUNICIPIO, TEMÁTICA, MASCULINO, FEMENINO)

# Ratio of male to female homicides. Municipalities lacking either count get
# NA here and are removed by the complete-case filter below.
CF$RAZÓN <- CF$MASCULINO / CF$FEMENINO
CF <- CF[complete.cases(CF), ]

# Sanity check: confirm that no missing values remain in CF.
any(is.na(CF))

## [1] FALSE

# List any rows that are still incomplete (none expected after the filter).
CF[!complete.cases(CF),]

## # A tibble: 0 × 5
## # Groups:   MUNICIPIO [0]
## # … with 5 variables: MUNICIPIO <chr>, TEMÁTICA <chr>, MASCULINO <int>,
## #   FEMENINO <int>, RAZÓN <dbl>

# Print the table of male-to-female homicide ratios per municipality.
CF

## # A tibble: 539 × 5
## # Groups:   MUNICIPIO [539]
##    MUNICIPIO       TEMÁTICA   MASCULINO FEMENINO RAZÓN
##    <chr>           <chr>          <int>    <int> <dbl>
##  1 ABREGO          HOMICIDIOS        28        6  4.67
##  2 ACACÍAS         HOMICIDIOS        51        8  6.38
##  3 ACANDÍ          HOMICIDIOS        14        1 14   
##  4 ACEVEDO         HOMICIDIOS        19        1 19   
##  5 AGUACHICA       HOMICIDIOS        55        3 18.3 
##  6 AGUADAS         HOMICIDIOS        17        1 17   
##  7 AGUAZUL         HOMICIDIOS        30        7  4.29
##  8 AGUSTÍN CODAZZI HOMICIDIOS        38        9  4.22
##  9 AIPE            HOMICIDIOS        14        1 14   
## 10 ALBÁN           HOMICIDIOS         3        2  1.5 
## # … with 529 more rows

# Distribution of the male-to-female homicide ratio across municipalities.
# Each observation in CF is one municipality, so the bar heights count
# municipalities, not crimes; the y-axis label reflects that.
hist(
  x = CF$RAZÓN,
  main = "Histograma de razón",
  xlab = "Razón",
  ylab = "# Municipios",
  col = "purple"
)

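Con la gráfica anterior podemos interpretar que, a medida que aumenta la razón de homicidios de hombres sobre homicidios de mujeres, disminuye el número de municipios que presentan dicha razón; es decir, la mayoría de los municipios se concentra en los valores bajos de la razón.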

Ahora debemos obtener una base de datos para Bogotá

# Work on a copy of the data and keep, for Bogotá only, the municipality and
# the weapon used in each record.
IF <- De

If <- IF %>%
  filter(MUNICIPIO == "BOGOTÁ D.C. (CT)") %>%
  select("MUNICIPIO", "ARMA EMPLEADA")

1. Reclasifique el tipo de arma empleada en las siguientes categorías: Sin empleo de armas, arma de fuego, arma blanca y otra.

# Reclassify the weapon used into four classes and count records per class.
#
# The class membership below is the one already used in this report. Counting
# by set membership (instead of summing hand-picked columns of a wide table)
# guarantees that every record lands in exactly one class: any label not
# listed here, including a missing one, falls into "Otros". The blade labels
# "ARMA BLANCA" and "ARMA BLANCA / CORTOPUNZANTE", which appear elsewhere in
# this report, are counted as blades.
armas_blancas <- c(
  "ARMA BLANCA", "ARMA BLANCA / CORTOPUNZANTE", "ARMAS BLANCAS",
  "CORTOPUNZANTES", "CORTANTES", "CUCHILLA", "JERINGA", "LLAVE MAESTRA",
  "PUNZANTES"
)
armas_fuego <- "ARMA DE FUEGO"
sin_arma <- c(
  "REDES SOCIALES", "SIN EMPLEO DE ARMAS", "ESCOPOLAMINA", "VENENO",
  "SUSTANCIAS TOXICAS", "QUIMICOS", "GASOLINA", "GASES",
  "LLAMADA TELEFONICA", "COMBUSTIBLE"
)

# One row per municipality with the four class totals. The result is grouped
# by MUNICIPIO again so later steps see the same grouping as before.
If <- If %>%
  group_by(MUNICIPIO) %>%
  summarise(
    Arma_Blanca = sum(`ARMA EMPLEADA` %in% armas_blancas),
    Arma_Fuego = sum(`ARMA EMPLEADA` %in% armas_fuego),
    Sin_Arma = sum(`ARMA EMPLEADA` %in% sin_arma),
    Otros = n() - Arma_Blanca - Arma_Fuego - Sin_Arma
  ) %>%
  group_by(MUNICIPIO)

2. Calcule la proporción de delitos cometidos en Bogotá, por cada tipo de arma empleada. Para eso, utilice la clasificación obtenida en el punto anterior.

# Keep only the four weapon-class totals.
If <- select(If, "Arma_Blanca", "Arma_Fuego", "Sin_Arma", "Otros")

## Adding missing grouping variables: `MUNICIPIO`

# Grand total of records across the four classes.
If$TOTAL <- with(If, sum(Arma_Blanca, Arma_Fuego, Sin_Arma, Otros))

# Share of each weapon class, expressed as a percentage of the total.
ProporcionArma_Blanca <- If$Arma_Blanca / If$TOTAL * 100
ProporcionArma_Fuego <- If$Arma_Fuego / If$TOTAL * 100
ProporcionSin_Arma <- If$Sin_Arma / If$TOTAL * 100
ProporcionOtro <- If$Otros / If$TOTAL * 100

# Print the percentage of Bogotá records that falls in each weapon class.
ProporcionArma_Blanca

## [1] 1.428426

ProporcionArma_Fuego

## [1] 13.39079

ProporcionSin_Arma

## [1] 41.4006

ProporcionOtro

## [1] 43.78018

La mayoría de los delitos que se presentan son cometidos con otro tipo de armas, es decir, armas que no corresponden a armas de fuego ni a armas blancas y que tampoco son delitos cometidos sin arma; ejemplos de estas son: bicicleta, carro bomba, carta extorsiva, moto, entre otros.

3. Realice una tabla cruzada que contemple las variables sexo y el tipo de delito. ¿Cuál es la proporción de víctimas por tipo para cada sexo?

# Cross table of victim sex (rows) by crime theme (columns).
#
# De$SEXO was overwritten with 0/1 codes earlier in this script, so filtering
# De on the labels "FEMENINO"/"MASCULINO" matches nothing and leaves the table
# empty. The male label cannot be recovered from the 0/1 code (0 covers every
# non-female value), so the labelled data is read from the CSV again.
LF <- read_csv("Delitos en Colombia 2.csv", show_col_types = FALSE)

# Count records per sex and theme for the two sexes of interest, one column
# per theme. Absent sex/theme combinations are filled with 0 so that the row
# totals are never NA.
Uf <- LF %>%
  filter(SEXO %in% c("FEMENINO", "MASCULINO")) %>%
  count(SEXO, TEMÁTICA, name = "conteo") %>%
  pivot_wider(
    names_from = `TEMÁTICA`,
    values_from = conteo,
    values_fill = 0L
  )

# Row total over every theme column (all columns except SEXO), without
# assuming a fixed number of themes.
Uf$TOTAL <- rowSums(Uf[, setdiff(names(Uf), "SEXO")])

# Share of each crime theme within each sex: theme count divided by the
# row total of that sex in Uf.
PropAmenza <- Uf$AMENAZA / Uf$TOTAL
PropDelitosSexuales <- Uf$`DELITOS SEXUALES` / Uf$TOTAL
PropLesionesPersonales <- Uf$`LESIONES PERSONALES` / Uf$TOTAL
PropAbigeato <- Uf$ABIGEATO / Uf$TOTAL
PropCabezaDeGanado <- Uf$`CABEZA DE GANADO` / Uf$TOTAL
PropExtorsion <- Uf$EXTORCIÓN / Uf$TOTAL
PropHomicidios <- Uf$HOMICIDIOS / Uf$TOTAL
PropHurtoPersonas <- Uf$`HURTO A PERSONAS` / Uf$TOTAL
PropHurtoResidencias <- Uf$`HURTO A RESIDENCIAS` / Uf$TOTAL
PropPirateriaTerrestre <- Uf$`PIRATERIA TERRESTRE` / Uf$TOTAL
PropSecuestro <- Uf$SECUESTRO / Uf$TOTAL
PropViolenciaIntrafamiliar <- Uf$`VIOLENCIA INTRAFAMILIAR` / Uf$TOTAL

De la tabla anterior podemos analizar que los hombres son las principales víctimas en todos los delitos, a excepción de los delitos sexuales y la violencia intrafamiliar.

4. Realice diagramas de caja donde se evidencie la comparación de la distribución de las edades y el tipo de arma empleada en los delitos.

# Read the crime dataset again into `OF`.
OF <- read_csv("Delitos en Colombia 2.csv")

## Rows: 569127 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (18): TEMÁTICA, FECHA, DEPARTAMENTO, MUNICIPIO, DIA, BARRIO, ZONA, CLAS...
## dbl   (1): CODIGO DANE
## time  (1): HORA
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Bogotá records with the weapon used and the age.
Of <- OF %>%
  select(MUNICIPIO, `ARMA EMPLEADA`, EDAD) %>%
  filter(MUNICIPIO == "BOGOTÁ D.C. (CT)")

# Weapon classes, matching the reclassification used for the Bogotá
# proportions in this report. Previously the "other" group also contained
# "SIN EMPLEO DE ARMAS" and the blade labels (so those records were counted in
# two groups), and "POLVORA(FUEGOS PIROTECNICOS)" was split into strings that
# never match a label.
armas_blancas <- c(
  "ARMA BLANCA", "ARMA BLANCA / CORTOPUNZANTE", "ARMAS BLANCAS",
  "CORTOPUNZANTES", "CORTANTES", "CUCHILLA", "JERINGA", "LLAVE MAESTRA",
  "PUNZANTES"
)
sin_arma <- c(
  "REDES SOCIALES", "SIN EMPLEO DE ARMAS", "ESCOPOLAMINA", "VENENO",
  "SUSTANCIAS TOXICAS", "QUIMICOS", "GASOLINA", "GASES",
  "LLAMADA TELEFONICA", "COMBUSTIBLE"
)

# ggg: no weapon, ggg1: blade, ggg2: firearm, ggg4: everything else
# (any label not listed above, including a missing one).
ggg <- filter(Of, `ARMA EMPLEADA` %in% sin_arma)
ggg1 <- filter(Of, `ARMA EMPLEADA` %in% armas_blancas)
ggg2 <- filter(Of, `ARMA EMPLEADA` == "ARMA DE FUEGO")
ggg4 <- filter(
  Of,
  !`ARMA EMPLEADA` %in% c(sin_arma, armas_blancas, "ARMA DE FUEGO")
)

# Number of Bogotá records per original weapon label (for inspection).
Off <- Of %>%
  count(`ARMA EMPLEADA`, name = "jeje")


# EDAD is read as character, so each weapon group's ages are inspected and
# then converted to numeric before plotting. Values that cannot be parsed as
# numbers become NA, which is what the coercion warnings below report.

# Group without weapon (ggg).
class(ggg$EDAD)

## [1] "character"

is.character(ggg$EDAD)

## [1] TRUE

ggg$EDAD = as.numeric(ggg$EDAD)

## Warning: NAs introduced by coercion

# Blade group (ggg1).
class(ggg1$EDAD)

## [1] "character"

is.character(ggg1$EDAD)

## [1] TRUE

ggg1$EDAD = as.numeric(ggg1$EDAD)

## Warning: NAs introduced by coercion

# Firearm group (ggg2).
class(ggg2$EDAD)

## [1] "character"

is.character(ggg2$EDAD)

## [1] TRUE

ggg2$EDAD = as.numeric(ggg2$EDAD)

## Warning: NAs introduced by coercion

# Other weapons group (ggg4).
class(ggg4$EDAD)

## [1] "character"

is.character(ggg4$EDAD)

## [1] TRUE

ggg4$EDAD = as.numeric(ggg4$EDAD)

## Warning: NAs introduced by coercion

# Side-by-side box plots of age for each weapon group.
boxplot(
  list(
    SinArma = ggg$EDAD,
    Ablanca = ggg1$EDAD,
    Afuego = ggg2$EDAD,
    Otros = ggg4$EDAD
  ),
  col = "rosybrown1",
  xlab = "Armas empleadas",
  ylab = "Edad"
)

De la gráfica anterior podemos deducir que las edades de las víctimas se concentran en el intervalo entre 25 y 50 años; sin embargo, se puede evidenciar que existen datos atípicos en cada uno de los grupos analizados.