This analysis uses a dataset of crime incidents reported in Los Angeles from 2020 to the present (last updated August 23, 2024). Some limitations are worth noting:
The data reflect incidents recorded in the system that is being retired (i.e., reported before March 7, 2024). Because records were transcribed manually, some entries may be inaccurate. Location details are anonymized.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Read the crime CSV export into a base data.frame and inspect its structure
# (column names, types and the first few values of each column).
la_crime_data <- read.csv(file = "Crime_Data_from_2020_to_Present.csv")
str(object = la_crime_data)
## 'data.frame': 970760 obs. of 28 variables:
## $ DR_NO : int 190326475 200106753 200320258 200907217 220614831 231808869 230110144 220314085 231309864 211904005 ...
## $ Date.Rptd : chr "03/01/2020 12:00:00 AM" "02/09/2020 12:00:00 AM" "11/11/2020 12:00:00 AM" "05/10/2023 12:00:00 AM" ...
## $ DATE.OCC : chr "03/01/2020 12:00:00 AM" "02/08/2020 12:00:00 AM" "11/04/2020 12:00:00 AM" "03/10/2020 12:00:00 AM" ...
## $ TIME.OCC : int 2130 1800 1700 2037 1200 2300 900 1110 1400 1220 ...
## $ AREA : int 7 1 3 9 6 18 1 3 13 19 ...
## $ AREA.NAME : chr "Wilshire" "Central" "Southwest" "Van Nuys" ...
## $ Rpt.Dist.No : int 784 182 356 964 666 1826 182 303 1375 1974 ...
## $ Part.1.2 : int 1 1 1 1 2 2 2 2 2 2 ...
## $ Crm.Cd : int 510 330 480 343 354 354 354 354 354 624 ...
## $ Crm.Cd.Desc : chr "VEHICLE - STOLEN" "BURGLARY FROM VEHICLE" "BIKE - STOLEN" "SHOPLIFTING-GRAND THEFT ($950.01 & OVER)" ...
## $ Mocodes : chr "" "1822 1402 0344" "0344 1251" "0325 1501" ...
## $ Vict.Age : int 0 47 19 19 28 41 25 27 24 26 ...
## $ Vict.Sex : chr "M" "M" "X" "M" ...
## $ Vict.Descent : chr "O" "O" "X" "O" ...
## $ Premis.Cd : int 101 128 502 405 102 501 502 248 750 502 ...
## $ Premis.Desc : chr "STREET" "BUS STOP/LAYOVER (ALSO QUERY 124)" "MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)" "CLOTHING STORE" ...
## $ Weapon.Used.Cd: int NA NA NA NA NA NA NA NA NA 400 ...
## $ Weapon.Desc : chr "" "" "" "" ...
## $ Status : chr "AA" "IC" "IC" "IC" ...
## $ Status.Desc : chr "Adult Arrest" "Invest Cont" "Invest Cont" "Invest Cont" ...
## $ Crm.Cd.1 : int 510 330 480 343 354 354 354 354 354 624 ...
## $ Crm.Cd.2 : int 998 998 NA NA NA NA NA NA NA NA ...
## $ Crm.Cd.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Crm.Cd.4 : int NA NA NA NA NA NA NA NA NA NA ...
## $ LOCATION : chr "1900 S LONGWOOD AV" "1000 S FLOWER ST" "1400 W 37TH ST" "14000 RIVERSIDE DR" ...
## $ Cross.Street : chr "" "" "" "" ...
## $ LAT : num 34 34 34 34.2 34.1 ...
## $ LON : num -118 -118 -118 -118 -118 ...
# Preview the first rows of the raw data to sanity-check how the columns were
# parsed (dates are still character strings at this point).
head(la_crime_data)
## DR_NO Date.Rptd DATE.OCC TIME.OCC AREA
## 1 190326475 03/01/2020 12:00:00 AM 03/01/2020 12:00:00 AM 2130 7
## 2 200106753 02/09/2020 12:00:00 AM 02/08/2020 12:00:00 AM 1800 1
## 3 200320258 11/11/2020 12:00:00 AM 11/04/2020 12:00:00 AM 1700 3
## 4 200907217 05/10/2023 12:00:00 AM 03/10/2020 12:00:00 AM 2037 9
## 5 220614831 08/18/2022 12:00:00 AM 08/17/2020 12:00:00 AM 1200 6
## 6 231808869 04/04/2023 12:00:00 AM 12/01/2020 12:00:00 AM 2300 18
## AREA.NAME Rpt.Dist.No Part.1.2 Crm.Cd
## 1 Wilshire 784 1 510
## 2 Central 182 1 330
## 3 Southwest 356 1 480
## 4 Van Nuys 964 1 343
## 5 Hollywood 666 2 354
## 6 Southeast 1826 2 354
## Crm.Cd.Desc Mocodes Vict.Age
## 1 VEHICLE - STOLEN 0
## 2 BURGLARY FROM VEHICLE 1822 1402 0344 47
## 3 BIKE - STOLEN 0344 1251 19
## 4 SHOPLIFTING-GRAND THEFT ($950.01 & OVER) 0325 1501 19
## 5 THEFT OF IDENTITY 1822 1501 0930 2004 28
## 6 THEFT OF IDENTITY 1822 0100 0930 0929 41
## Vict.Sex Vict.Descent Premis.Cd Premis.Desc
## 1 M O 101 STREET
## 2 M O 128 BUS STOP/LAYOVER (ALSO QUERY 124)
## 3 X X 502 MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)
## 4 M O 405 CLOTHING STORE
## 5 M H 102 SIDEWALK
## 6 M H 501 SINGLE FAMILY DWELLING
## Weapon.Used.Cd Weapon.Desc Status Status.Desc Crm.Cd.1 Crm.Cd.2 Crm.Cd.3
## 1 NA AA Adult Arrest 510 998 NA
## 2 NA IC Invest Cont 330 998 NA
## 3 NA IC Invest Cont 480 NA NA
## 4 NA IC Invest Cont 343 NA NA
## 5 NA IC Invest Cont 354 NA NA
## 6 NA IC Invest Cont 354 NA NA
## Crm.Cd.4 LOCATION Cross.Street LAT
## 1 NA 1900 S LONGWOOD AV 34.0375
## 2 NA 1000 S FLOWER ST 34.0444
## 3 NA 1400 W 37TH ST 34.0210
## 4 NA 14000 RIVERSIDE DR 34.1576
## 5 NA 1900 TRANSIENT 34.0944
## 6 NA 9900 COMPTON AV 33.9467
## LON
## 1 -118.3506
## 2 -118.2628
## 3 -118.3002
## 4 -118.4387
## 5 -118.3277
## 6 -118.2463
# Column-wise summary of the full dataset.
# NOTE(review): the output below shows Vict.Age going down to -4 with a first
# quartile of 0, and LAT/LON values of exactly 0 -- these look like placeholder
# or invalid entries; confirm before relying on those columns.
summary(la_crime_data)
## DR_NO Date.Rptd DATE.OCC TIME.OCC
## Min. : 817 Length:970760 Length:970760 Min. : 1
## 1st Qu.:210604478 Class :character Class :character 1st Qu.: 900
## Median :220806008 Mode :character Mode :character Median :1420
## Mean :219483130 Mean :1339
## 3rd Qu.:230817672 3rd Qu.:1900
## Max. :249918669 Max. :2359
##
## AREA AREA.NAME Rpt.Dist.No Part.1.2
## Min. : 1.00 Length:970760 Min. : 101 Min. :1.000
## 1st Qu.: 6.00 Class :character 1st Qu.: 615 1st Qu.:1.000
## Median :11.00 Mode :character Median :1141 Median :1.000
## Mean :10.71 Mean :1117 Mean :1.406
## 3rd Qu.:16.00 3rd Qu.:1617 3rd Qu.:2.000
## Max. :21.00 Max. :2199 Max. :2.000
##
## Crm.Cd Crm.Cd.Desc Mocodes Vict.Age
## Min. :110.0 Length:970760 Length:970760 Min. : -4.00
## 1st Qu.:331.0 Class :character Class :character 1st Qu.: 0.00
## Median :442.0 Mode :character Mode :character Median : 30.00
## Mean :500.7 Mean : 29.21
## 3rd Qu.:626.0 3rd Qu.: 44.00
## Max. :956.0 Max. :120.00
##
## Vict.Sex Vict.Descent Premis.Cd Premis.Desc
## Length:970760 Length:970760 Min. :101.0 Length:970760
## Class :character Class :character 1st Qu.:101.0 Class :character
## Mode :character Mode :character Median :203.0 Mode :character
## Mean :306.1
## 3rd Qu.:501.0
## Max. :976.0
## NA's :14
## Weapon.Used.Cd Weapon.Desc Status Status.Desc
## Min. :101.0 Length:970760 Length:970760 Length:970760
## 1st Qu.:311.0 Class :character Class :character Class :character
## Median :400.0 Mode :character Mode :character Mode :character
## Mean :363.8
## 3rd Qu.:400.0
## Max. :516.0
## NA's :645153
## Crm.Cd.1 Crm.Cd.2 Crm.Cd.3 Crm.Cd.4
## Min. :110.0 Min. :210.0 Min. :310.0 Min. :821.0
## 1st Qu.:331.0 1st Qu.:998.0 1st Qu.:998.0 1st Qu.:998.0
## Median :442.0 Median :998.0 Median :998.0 Median :998.0
## Mean :500.5 Mean :958.1 Mean :984.2 Mean :991.2
## 3rd Qu.:626.0 3rd Qu.:998.0 3rd Qu.:998.0 3rd Qu.:998.0
## Max. :956.0 Max. :999.0 Max. :999.0 Max. :999.0
## NA's :11 NA's :902042 NA's :968456 NA's :970696
## LOCATION Cross.Street LAT LON
## Length:970760 Length:970760 Min. : 0.00 Min. :-118.7
## Class :character Class :character 1st Qu.:34.01 1st Qu.:-118.4
## Mode :character Mode :character Median :34.06 Median :-118.3
## Mean :33.99 Mean :-118.1
## 3rd Qu.:34.16 3rd Qu.:-118.3
## Max. :34.33 Max. : 0.0
##
3) Sampling
# Draw a simple random 10% sample (without replacement) of the full dataset.
# The seed is fixed so that the sample -- and every count and test statistic
# derived from it further down -- is reproducible from run to run.
# NOTE: the echoed results in this document come from an earlier, unseeded run,
# so sample-dependent numbers will differ slightly after re-running.
set.seed(2024)
population_size <- nrow(la_crime_data)
cat("The population size of the crime dataset is", population_size)
## The population size of the crime dataset is 970760
sample_size <- round(0.1 * population_size)
la_crime_data_sample <- sample_n(la_crime_data, sample_size, replace = FALSE)
# Re-derive the size from the sample itself so the reported number is the one
# that was actually drawn.
sample_size <- nrow(la_crime_data_sample)
cat(" and the sample size is", sample_size)
## and the sample size is 97076
4) Data Cleaning
# Recode the single-letter victim sex and descent codes into readable labels.
# Any value not listed explicitly (including blanks and NA) falls through to
# "Other".
# NOTE(review): several descent codes may therefore be merged into "Other" --
# verify the mapping against the dataset's data dictionary.
la_crime_data_sample <- la_crime_data_sample %>%
  mutate(
    Vict.Sex = case_when(
      Vict.Sex == "M" ~ "Male",
      Vict.Sex == "F" ~ "Female",
      .default = "Other"
    ),
    Vict.Descent = case_when(
      Vict.Descent == "O" ~ "Other",
      Vict.Descent == "X" ~ "Unknown",
      Vict.Descent == "H" ~ "Hispanic/Latin/Mexican",
      Vict.Descent == "B" ~ "Black",
      Vict.Descent == "W" ~ "White",
      Vict.Descent == "C" ~ "Chinese",
      Vict.Descent == "A" ~ "Asian",
      Vict.Descent == "J" ~ "Japanese",
      Vict.Descent == "F" ~ "Filipino",
      .default = "Other"
    )
  )
# Quick summaries of the recoded columns and the crime code in the sample.
# NOTE(review): summary() on a character column reports only length, class and
# mode (see the echoed output), so it says nothing about category frequencies;
# table() or count() would. Crm.Cd is a categorical code, so its mean and
# quartiles are not meaningful either.
summary(la_crime_data_sample[["Vict.Sex"]])
## Length Class Mode
## 97076 character character
summary(la_crime_data_sample[["Vict.Descent"]])
## Length Class Mode
## 97076 character character
summary(la_crime_data_sample[["Crm.Cd.Desc"]])
## Length Class Mode
## 97076 character character
summary(la_crime_data_sample[["Crm.Cd"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 110.0 331.0 442.0 501.3 626.0 956.0
# Tally the sampled incidents per crime description: one row per description,
# with the count stored in `num_incidents`.
crime_data_by_desc <- la_crime_data_sample %>%
  as_tibble() %>%
  count(Crm.Cd.Desc, name = "num_incidents")
print(crime_data_by_desc)
## # A tibble: 130 × 2
## Crm.Cd.Desc num_incidents
## <chr> <int>
## 1 ARSON 244
## 2 ASSAULT WITH DEADLY WEAPON ON POLICE OFFICER 95
## 3 ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT 5391
## 4 ATTEMPTED ROBBERY 489
## 5 BATTERY - SIMPLE ASSAULT 7477
## 6 BATTERY ON A FIREFIGHTER 19
## 7 BATTERY POLICE (SIMPLE) 264
## 8 BATTERY WITH SEXUAL CONTACT 392
## 9 BEASTIALITY, CRIME AGAINST NATURE SEXUAL ASSLT WITH ANIM 1
## 10 BIKE - ATTEMPTED STOLEN 1
## # ℹ 120 more rows
Crime with highest number of incidents
# Pick the single most frequent crime type in the sample and report it.
# If several types share the maximum, only the first one in table order is kept.
highest_incident <- slice(crime_data_by_desc, which.max(num_incidents))
most_frequent_crime <- highest_incident[["Crm.Cd.Desc"]]
number_of_incidents <- highest_incident[["num_incidents"]]
cat("The most frequent crime type is", most_frequent_crime, "with", number_of_incidents, "incidents.")
## The most frequent crime type is VEHICLE - STOLEN with 10507 incidents.
Crime with lowest number of incidents
# Find the crime type(s) with the fewest incidents in the sample.
# Several types can share the minimum count, so ties are printed as a table,
# while a unique minimum is reported in a single sentence.
lowest_incident_threshold <- min(crime_data_by_desc[["num_incidents"]])
lowest_incident_crimes <- filter(
  crime_data_by_desc,
  num_incidents == lowest_incident_threshold
)
if (nrow(lowest_incident_crimes) <= 1) {
  lowest_frequent_crime <- lowest_incident_crimes[["Crm.Cd.Desc"]]
  number_of_incidents <- lowest_incident_crimes[["num_incidents"]]
  cat("The crime type(s) with the lowest number of incidents is", lowest_frequent_crime, "with", number_of_incidents, "incident(s).")
} else {
  cat("Multiple crime types have the lowest number of incidents:", lowest_incident_threshold, "\n")
  print(select(lowest_incident_crimes, Crm.Cd.Desc, num_incidents))
}
## Multiple crime types have the lowest number of incidents: 1
## # A tibble: 12 × 2
## Crm.Cd.Desc num_incidents
## <chr> <int>
## 1 BEASTIALITY, CRIME AGAINST NATURE SEXUAL ASSLT WITH ANIM 1
## 2 BIKE - ATTEMPTED STOLEN 1
## 3 BLOCKING DOOR INDUCTION CENTER 1
## 4 DISHONEST EMPLOYEE - PETTY THEFT 1
## 5 DRUGS, TO A MINOR 1
## 6 FAILURE TO DISPERSE 1
## 7 FIREARMS EMERGENCY PROTECTIVE ORDER (FIREARMS EPO) 1
## 8 GRAND THEFT / AUTO REPAIR 1
## 9 MANSLAUGHTER, NEGLIGENT 1
## 10 REPLICA FIREARMS(SALE,DISPLAY,MANUFACTURE OR DISTRIBUTE) 1
## 11 TELEPHONE PROPERTY - DAMAGE 1
## 12 THEFT, COIN MACHINE - ATTEMPT 1
# Bar chart of how many sampled incidents fall into each victim-sex category.
ggplot(data = la_crime_data_sample, mapping = aes(x = Vict.Sex)) +
  geom_bar(stat = "count") +
  labs(title = "Distribution of Victim's Sex")
# Bar chart of incident counts per crime description.
# The sample contains about 130 distinct descriptions, many of them long, so
# labels placed on the x axis overlap and cannot be read. The categories are
# therefore drawn on the y axis (horizontal bars) and ordered by frequency,
# with the most common crime type at the top.
ggplot(
  la_crime_data_sample,
  aes(y = fct_rev(fct_infreq(Crm.Cd.Desc)))
) +
  geom_bar() +
  labs(
    title = "Distribution of Crime Descriptions",
    x = "count",
    y = "Crm.Cd.Desc"
  )
# Mosaic plot of victim sex against crime description, built from the cleaned
# sample.
mosaicplot(
  table(la_crime_data_sample$Vict.Sex, la_crime_data_sample$Crm.Cd.Desc),
  main = "Association between Victim's Sex and Crime Description"
)
# Heatmap of crime-type counts by victim descent.
# This is built from the cleaned sample rather than the raw full dataset, so the
# rows carry the readable descent labels and the plot describes the same data
# as the rest of the analysis.
heatmap(
  table(la_crime_data_sample$Vict.Descent, la_crime_data_sample$Crm.Cd.Desc),
  main = "Crime Types by Victim Descent"
)
# Cross-tabulate the sampled incidents for the chi-squared tests:
# victim sex x crime description, and victim descent x crime code.
# NOTE(review): the descent table is keyed on Crm.Cd while the sex table uses
# Crm.Cd.Desc; the two only agree if codes and descriptions map one-to-one --
# confirm.
contingency_table_sex <- table(
  la_crime_data_sample[["Vict.Sex"]],
  la_crime_data_sample[["Crm.Cd.Desc"]]
)
contingency_table_descent <- table(
  la_crime_data_sample[["Vict.Descent"]],
  la_crime_data_sample[["Crm.Cd"]]
)
# Pearson chi-squared test of independence between victim sex and crime type.
# NOTE(review): the warning echoed below is raised when expected cell counts are
# small; many crime types have very few incidents in the sample, so consider
# pooling rare categories or using simulate.p.value = TRUE before relying on
# the p-value.
chi_square_sex <- chisq.test(contingency_table_sex)
## Warning in chisq.test(contingency_table_sex): Chi-squared approximation may be
## incorrect
print(chi_square_sex)
##
## Pearson's Chi-squared test
##
## data: contingency_table_sex
## X-squared = 64085, df = 258, p-value < 2.2e-16
# Pearson chi-squared test of independence between victim descent and crime
# type.
# NOTE(review): the warning echoed below is raised when expected cell counts are
# small; with this many sparse cells, consider pooling rare categories or using
# simulate.p.value = TRUE before relying on the p-value.
chi_square_descent <- chisq.test(contingency_table_descent)
## Warning in chisq.test(contingency_table_descent): Chi-squared approximation may
## be incorrect
print(chi_square_descent)
##
## Pearson's Chi-squared test
##
## data: contingency_table_descent
## X-squared = 82692, df = 1032, p-value < 2.2e-16
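1) The chi-squared test gives p < 2.2e-16, so we reject the null hypothesis of independence: there is a statistically significant association between victim's sex and crime type.
2) The chi-squared test gives p < 2.2e-16, so we reject the null hypothesis of independence: there is a statistically significant association between victim's descent and crime type.
Note: R warned that the chi-squared approximation may be incorrect for both tests, so these results should be interpreted with some caution.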