Load libraries
# Load libraries
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
Read the CSV
# Step 1: Read the CSV
library(readr)
library(stringr)
# `Crm Cd 4` is declared as a double instead of being left to type guessing:
# when guessed it was inferred as logical, so any real code in that column
# cannot be parsed and is presumably what triggers the parsing-issues warning.
# Every other column is still guessed as before.
# If a warning is still raised, run `problems(crime)` to see the affected rows.
crime <- read_csv(
  "/Users/choeshi/ITEC4220/crime.csv",
  col_types = cols(`Crm Cd 4` = col_double())
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 1004991 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): Date Rptd, DATE OCC, TIME OCC, AREA, AREA NAME, Rpt Dist No, Crm C...
## dbl (11): DR_NO, Part 1-2, Crm Cd, Vict Age, Premis Cd, Weapon Used Cd, Crm ...
## lgl (1): Crm Cd 4
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Step 2: Parse the date and time columns
# Both date columns go through the same month-day-year (with time) parser.
# `TIME OCC` is left-padded with zeros to four characters, split into its
# first two and last two characters, and parsed as an hours/minutes Period.
crime <- mutate(
  crime,
  across(c(`Date Rptd`, `DATE OCC`), mdy_hms),
  `TIME OCC` = str_pad(`TIME OCC`, width = 4, pad = "0"),
  `TIME OCC` = hm(
    paste(substr(`TIME OCC`, 1, 2), substr(`TIME OCC`, 3, 4), sep = ":")
  )
)

# Step 3: Inspect the resulting column types
glimpse(crime)
## Rows: 1,004,991
## Columns: 28
## $ DR_NO <dbl> 211507896, 201516622, 240913563, 210704711, 201418201…
## $ `Date Rptd` <dttm> 2021-04-11, 2020-10-21, 2024-12-10, 2020-12-24, 2020…
## $ `DATE OCC` <dttm> 2020-11-07, 2020-10-18, 2020-10-30, 2020-12-24, 2020…
## $ `TIME OCC` <Period> 8H 45M 0S, 18H 45M 0S, 12H 40M 0S, 13H 10M 0S, 18H…
## $ AREA <chr> "15", "15", "09", "07", "14", "04", "03", "11", "17",…
## $ `AREA NAME` <chr> "N Hollywood", "N Hollywood", "Van Nuys", "Wilshire",…
## $ `Rpt Dist No` <chr> "1502", "1521", "0933", "0782", "1454", "0429", "0396…
## $ `Part 1-2` <dbl> 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2,…
## $ `Crm Cd` <dbl> 354, 230, 354, 331, 420, 354, 354, 812, 354, 354, 812…
## $ `Crm Cd Desc` <chr> "THEFT OF IDENTITY", "ASSAULT WITH DEADLY WEAPON, AGG…
## $ Mocodes <chr> "0377", "0416 0334 2004 1822 1414 0305 0319 0400", "0…
## $ `Vict Age` <dbl> 31, 32, 30, 47, 63, 35, 21, 14, 43, 57, 13, 34, 0, 0,…
## $ `Vict Sex` <chr> "M", "M", "M", "F", "M", "M", "F", "F", "M", "M", "M"…
## $ `Vict Descent` <chr> "H", "H", "W", "A", "H", "B", "B", "H", "W", "W", "H"…
## $ `Premis Cd` <dbl> 501, 102, 501, 101, 103, 502, 501, 121, 501, 501, 501…
## $ `Premis Desc` <chr> "SINGLE FAMILY DWELLING", "SIDEWALK", "SINGLE FAMILY …
## $ `Weapon Used Cd` <dbl> NA, 200, NA, NA, NA, NA, NA, 500, NA, NA, 400, NA, NA…
## $ `Weapon Desc` <chr> NA, "KNIFE WITH BLADE 6INCHES OR LESS", NA, NA, NA, N…
## $ Status <chr> "IC", "IC", "IC", "IC", "IC", "IC", "IC", "AO", "IC",…
## $ `Status Desc` <chr> "Invest Cont", "Invest Cont", "Invest Cont", "Invest …
## $ `Crm Cd 1` <dbl> 354, 230, 354, 331, 420, 354, 354, 812, 354, 354, 812…
## $ `Crm Cd 2` <dbl> NA, NA, NA, NA, NA, NA, NA, 860, NA, NA, 860, NA, NA,…
## $ `Crm Cd 3` <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ `Crm Cd 4` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ LOCATION <chr> "7800 BEEMAN AV", "ATOLL …
## $ `Cross Street` <chr> NA, "N GAULT", NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ LAT <dbl> 34.2124, 34.1993, 34.1847, 34.0339, 33.9813, 34.0830,…
## $ LON <dbl> -118.4092, -118.4203, -118.4509, -118.3747, -118.4350…
Extracting crimes for 2024 and creating a scatter plot. The scatter plot shows the total number of crimes for each area code as red points, and the blue dashed line shows the average number of crimes per area. The area with the highest crime count in 2024 is Area 01 (Central LA); the area with the lowest crime count is Area 16 (Foothill).
library(readr)
library(lubridate)
# Add the calendar year of the occurrence date and of the report date.
crime <- mutate(
  crime,
  year_occ = year(`DATE OCC`),
  year_rptd = year(`Date Rptd`)
)

# Keep only the records whose occurrence year is 2024.
crime2024 <- filter(crime, year_occ == 2024)
# Scatter plot of the 2024 crime count per area
library(ggplot2)

# One row per AREA holding the number of 2024 records in that area.
crimeRate2024 <- count(crime2024, AREA, name = "total_crimes")

# Mean of the per-area counts; drawn below as a horizontal reference line.
mean_crime <- mean(crimeRate2024$total_crimes, na.rm = TRUE)

ggplot(crimeRate2024, aes(x = AREA, y = total_crimes)) +
  geom_point(colour = "red", size = 3) +
  geom_hline(yintercept = mean_crime, linetype = "dashed", colour = "blue") +
  labs(
    title = "Crime count in 2024 by Area Code",
    x = "Area Code",
    y = "Number of Crimes"
  )
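2. This code calculates the total crimes per year, the total crimes per area per year, and the average number of crimes per month for each year, enabling me to analyze trends and changes in crime over time. The line plot shows that the average monthly crime count increased in 2022, remained roughly stable in 2023, and then decreased in 2024. Note that 2025 contains only 97 records, so its value reflects incomplete data rather than a real drop in crime.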
# Total crimes per year: one row per occurrence year with its record count
library(dplyr)
total_crime_per_year <- count(crime, year_occ, name = "Totalcrime")
print(total_crime_per_year)
## # A tibble: 6 × 2
## year_occ Totalcrime
## <dbl> <int>
## 1 2020 199847
## 2 2021 209876
## 3 2022 235259
## 4 2023 232345
## 5 2024 127567
## 6 2025 97
# Total crimes per area per year: one row per (year, area) combination
total_crime_by_area <- count(crime, year_occ, AREA, name = "CrimePerArea")
print(total_crime_by_area)
## # A tibble: 125 × 3
## year_occ AREA CrimePerArea
## <dbl> <chr> <int>
## 1 2020 01 11604
## 2 2020 02 9025
## 3 2020 03 11180
## 4 2020 04 7807
## 5 2020 05 8875
## 6 2020 06 10174
## 7 2020 07 9292
## 8 2020 08 9311
## 9 2020 09 8764
## 10 2020 10 8095
## # ℹ 115 more rows
# Average crime per month per year
# Step 1: count the records in each (year, month) pair.
# Step 2: average those monthly counts within each year. Only months that
# have at least one record form a group, so a year with fewer than twelve
# months of data is averaged over the months that are present.
# The previously computed `Year` column was never used (grouping is done on
# `year_occ`), so it is no longer created.
average_crime_per_month <- crime %>%
  mutate(Month = month(`DATE OCC`)) %>%
  group_by(year_occ, Month) %>%
  summarise(CrimesPerMonth = n(), .groups = "drop") %>%
  group_by(year_occ) %>%
  summarise(AverageCrimePerMonth = mean(CrimesPerMonth, na.rm = TRUE))
print(average_crime_per_month)
## # A tibble: 6 × 2
## year_occ AverageCrimePerMonth
## <dbl> <dbl>
## 1 2020 16654.
## 2 2021 17490.
## 3 2022 19605.
## 4 2023 19362.
## 5 2024 10631.
## 6 2025 19.4
library(ggplot2)
# Line plot of the average monthly crime count for each year.
# Line thickness is set with `linewidth`: using `size` for lines has been
# deprecated since ggplot2 3.4.0. `size` remains correct for the points.
ggplot(average_crime_per_month, aes(x = year_occ, y = AverageCrimePerMonth)) +
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "red", size = 2) +
  labs(
    title = "Average Crime Per Month",
    x = "Year",
    y = "Average Crimes Per Month"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
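3. Finding out if there is a correlation between total crimes per year
and average crimes per month. The correlation coefficient is 1, which
means there is a perfect positive linear relationship between the two.
This is expected by construction: the average crimes per month is the
yearly total divided by the number of months with data, so the two
measures are not independent of each other.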
# Combine the yearly totals with the yearly monthly averages, matched on year
crime_summary <- left_join(
  total_crime_per_year,
  average_crime_per_month,
  by = "year_occ"
)
print(crime_summary)
## # A tibble: 6 × 3
## year_occ Totalcrime AverageCrimePerMonth
## <dbl> <int> <dbl>
## 1 2020 199847 16654.
## 2 2021 209876 17490.
## 3 2022 235259 19605.
## 4 2023 232345 19362.
## 5 2024 127567 10631.
## 6 2025 97 19.4
# Correlation test between the yearly totals and the yearly monthly averages
cor_test <- cor.test(
  x = crime_summary$Totalcrime,
  y = crime_summary$AverageCrimePerMonth
)
print(cor_test)
##
## Pearson's product-moment correlation
##
## data: crime_summary$Totalcrime and crime_summary$AverageCrimePerMonth
## t = 7616.4, df = 4, p-value = 1.783e-15
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9999997 1.0000000
## sample estimates:
## cor
## 1
# Preview the first six rows of the crime table
head(crime, n = 6)
## # A tibble: 6 × 30
## DR_NO `Date Rptd` `DATE OCC` `TIME OCC` AREA `AREA NAME`
## <dbl> <dttm> <dttm> <Period> <chr> <chr>
## 1 211507896 2021-04-11 00:00:00 2020-11-07 00:00:00 8H 45M 0S 15 N Hollywood
## 2 201516622 2020-10-21 00:00:00 2020-10-18 00:00:00 18H 45M 0S 15 N Hollywood
## 3 240913563 2024-12-10 00:00:00 2020-10-30 00:00:00 12H 40M 0S 09 Van Nuys
## 4 210704711 2020-12-24 00:00:00 2020-12-24 00:00:00 13H 10M 0S 07 Wilshire
## 5 201418201 2020-10-03 00:00:00 2020-09-29 00:00:00 18H 30M 0S 14 Pacific
## 6 240412063 2024-12-11 00:00:00 2020-11-11 00:00:00 12H 10M 0S 04 Hollenbeck
## # ℹ 24 more variables: `Rpt Dist No` <chr>, `Part 1-2` <dbl>, `Crm Cd` <dbl>,
## # `Crm Cd Desc` <chr>, Mocodes <chr>, `Vict Age` <dbl>, `Vict Sex` <chr>,
## # `Vict Descent` <chr>, `Premis Cd` <dbl>, `Premis Desc` <chr>,
## # `Weapon Used Cd` <dbl>, `Weapon Desc` <chr>, Status <chr>,
## # `Status Desc` <chr>, `Crm Cd 1` <dbl>, `Crm Cd 2` <dbl>, `Crm Cd 3` <dbl>,
## # `Crm Cd 4` <lgl>, LOCATION <chr>, `Cross Street` <chr>, LAT <dbl>,
## # LON <dbl>, year_occ <dbl>, year_rptd <dbl>
# Histogram of victim ages, with an explicit title and x-axis label so it
# matches the labelled ggplot figures in this report.
# NOTE(review): ages of 0 appear in the data; presumably a placeholder for an
# unknown age - confirm before interpreting the leftmost bar.
hist(
  crime$`Vict Age`,
  main = "Distribution of Victim Age",
  xlab = "Victim Age"
)
5. Question: Is there a significant difference in the number of crimes
between Area 1 and Area 3? The results showed that Area 1 had a higher
average yearly crime count (about 11,612) than Area 3 (about 9,574).
However, the p-value (0.55) indicates that the difference is not
statistically significant. Therefore, the observed difference in crime
levels between these two areas is not statistically meaningful.
# Subset containing only Area 1 ("01") and Area 3 ("03"); AREA becomes a
# factor so it can serve as the grouping variable of the t-test.
crime_subset <- filter(crime, AREA %in% c("01", "03"))
crime_subset$AREA <- factor(crime_subset$AREA)

# Number of crimes per area per year: one observation per (area, year).
# NOTE(review): every year present in `year_occ` contributes an observation,
# including years with very few records - confirm that this is intended.
crime_counts <- count(crime_subset, AREA, year_occ, name = "TotalCrimes")

# Two-sample t-test comparing the yearly counts of Area 1 vs Area 3
t.test_result <- t.test(TotalCrimes ~ AREA, data = crime_counts)
print(t.test_result)
##
## Welch Two Sample t-test
##
## data: TotalCrimes by AREA
## t = 0.61315, df = 9.4664, p-value = 0.5542
## alternative hypothesis: true difference in means between group 01 and group 03 is not equal to 0
## 95 percent confidence interval:
## -5425.367 9501.701
## sample estimates:
## mean in group 01 mean in group 03
## 11611.67 9573.50