Load libraries

# Load libraries
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Read the CSV

# Step 1: Read the CSV
library(readr)
library(stringr)

# Read the raw crime records.
# `Crm Cd 4` is pinned to double: its leading rows are all NA, so readr's type
# guessing would otherwise pick logical for it, and any later numeric code in
# that column would fail to parse and be replaced by NA (presumably the source
# of the "parsing issues" warning -- confirm with problems(crime)).
# Every other column is still type-guessed.
crime <- read_csv(
  "/Users/choeshi/ITEC4220/crime.csv",
  col_types = cols(`Crm Cd 4` = col_double())
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 1004991 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): Date Rptd, DATE OCC, TIME OCC, AREA, AREA NAME, Rpt Dist No, Crm C...
## dbl (11): DR_NO, Part 1-2, Crm Cd, Vict Age, Premis Cd, Weapon Used Cd, Crm ...
## lgl  (1): Crm Cd 4
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Step 2: Parse the date/time columns
# - `Date Rptd` and `DATE OCC` are parsed as month/day/year date-times.
# - `TIME OCC` is left-padded with zeros to four characters, rewritten as
#   "HH:MM", and converted to a lubridate Period.
crime <- crime %>%
  mutate(across(c(`Date Rptd`, `DATE OCC`), mdy_hms)) %>%
  mutate(
    `TIME OCC` = str_pad(`TIME OCC`, width = 4, side = "left", pad = "0"),
    `TIME OCC` = hm(paste0(
      str_sub(`TIME OCC`, 1, 2), ":", str_sub(`TIME OCC`, 3, 4)
    ))
  )

# Step 3: Inspect the resulting column types
crime %>% glimpse()
## Rows: 1,004,991
## Columns: 28
## $ DR_NO            <dbl> 211507896, 201516622, 240913563, 210704711, 201418201…
## $ `Date Rptd`      <dttm> 2021-04-11, 2020-10-21, 2024-12-10, 2020-12-24, 2020…
## $ `DATE OCC`       <dttm> 2020-11-07, 2020-10-18, 2020-10-30, 2020-12-24, 2020…
## $ `TIME OCC`       <Period> 8H 45M 0S, 18H 45M 0S, 12H 40M 0S, 13H 10M 0S, 18H…
## $ AREA             <chr> "15", "15", "09", "07", "14", "04", "03", "11", "17",…
## $ `AREA NAME`      <chr> "N Hollywood", "N Hollywood", "Van Nuys", "Wilshire",…
## $ `Rpt Dist No`    <chr> "1502", "1521", "0933", "0782", "1454", "0429", "0396…
## $ `Part 1-2`       <dbl> 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2,…
## $ `Crm Cd`         <dbl> 354, 230, 354, 331, 420, 354, 354, 812, 354, 354, 812…
## $ `Crm Cd Desc`    <chr> "THEFT OF IDENTITY", "ASSAULT WITH DEADLY WEAPON, AGG…
## $ Mocodes          <chr> "0377", "0416 0334 2004 1822 1414 0305 0319 0400", "0…
## $ `Vict Age`       <dbl> 31, 32, 30, 47, 63, 35, 21, 14, 43, 57, 13, 34, 0, 0,…
## $ `Vict Sex`       <chr> "M", "M", "M", "F", "M", "M", "F", "F", "M", "M", "M"…
## $ `Vict Descent`   <chr> "H", "H", "W", "A", "H", "B", "B", "H", "W", "W", "H"…
## $ `Premis Cd`      <dbl> 501, 102, 501, 101, 103, 502, 501, 121, 501, 501, 501…
## $ `Premis Desc`    <chr> "SINGLE FAMILY DWELLING", "SIDEWALK", "SINGLE FAMILY …
## $ `Weapon Used Cd` <dbl> NA, 200, NA, NA, NA, NA, NA, 500, NA, NA, 400, NA, NA…
## $ `Weapon Desc`    <chr> NA, "KNIFE WITH BLADE 6INCHES OR LESS", NA, NA, NA, N…
## $ Status           <chr> "IC", "IC", "IC", "IC", "IC", "IC", "IC", "AO", "IC",…
## $ `Status Desc`    <chr> "Invest Cont", "Invest Cont", "Invest Cont", "Invest …
## $ `Crm Cd 1`       <dbl> 354, 230, 354, 331, 420, 354, 354, 812, 354, 354, 812…
## $ `Crm Cd 2`       <dbl> NA, NA, NA, NA, NA, NA, NA, 860, NA, NA, 860, NA, NA,…
## $ `Crm Cd 3`       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ `Crm Cd 4`       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ LOCATION         <chr> "7800    BEEMAN                       AV", "ATOLL    …
## $ `Cross Street`   <chr> NA, "N  GAULT", NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ LAT              <dbl> 34.2124, 34.1993, 34.1847, 34.0339, 33.9813, 34.0830,…
## $ LON              <dbl> -118.4092, -118.4203, -118.4509, -118.3747, -118.4350…

Extracting the crimes that occurred in 2024 and creating a scatter plot. The red points show the total number of crimes for each area code, and the blue dashed line shows the average number of crimes across all areas. The area with the highest crime count in 2024 is area 01 (Central LA); the area with the lowest crime count is area 16 (Foothill).

library(readr)
library(lubridate)
library(ggplot2)

# Tag each record with the year it occurred and the year it was reported.
crime <- crime %>%
  mutate(
    year_occ = year(`DATE OCC`),
    year_rptd = year(`Date Rptd`)
  )

# Keep only the crimes that occurred in 2024.
crime2024 <- filter(crime, year_occ == 2024)

# Number of 2024 crimes for each area code.
crimeRate2024 <- count(crime2024, AREA, name = "total_crimes")

# Mean of the per-area counts; drawn below as the dashed reference line.
mean_crime <- mean(crimeRate2024$total_crimes, na.rm = TRUE)

# Scatter plot: one red point per area, blue dashed line at the mean count.
ggplot(crimeRate2024, aes(x = AREA, y = total_crimes)) +
  geom_point(color = "red", size = 3) +
  geom_hline(yintercept = mean_crime, linetype = "dashed", color = "blue") +
  labs(
    title = "Crime count in 2024 by Area Code",
    x = "Area Code",
    y = "Number of Crimes"
  )

2. This code calculates total crimes per year, total crimes per area, and the average crimes per month, enabling me to analyze trends and changes in crime over time. The line plot shows that the average monthly crime count increased in 2022, remained stable in 2023, and then decreased in 2024.

# Total crimes per year: one row per year of occurrence, with the record count.
library(dplyr)
total_crime_per_year <- count(crime, year_occ, name = "Totalcrime")
print(total_crime_per_year)
## # A tibble: 6 × 2
##   year_occ Totalcrime
##      <dbl>      <int>
## 1     2020     199847
## 2     2021     209876
## 3     2022     235259
## 4     2023     232345
## 5     2024     127567
## 6     2025         97
# Total crimes per area per year: one row per (year, area) combination.
total_crime_by_area <- count(crime, year_occ, AREA, name = "CrimePerArea")
print(total_crime_by_area)
## # A tibble: 125 × 3
##    year_occ AREA  CrimePerArea
##       <dbl> <chr>        <int>
##  1     2020 01           11604
##  2     2020 02            9025
##  3     2020 03           11180
##  4     2020 04            7807
##  5     2020 05            8875
##  6     2020 06           10174
##  7     2020 07            9292
##  8     2020 08            9311
##  9     2020 09            8764
## 10     2020 10            8095
## # ℹ 115 more rows
# Average crimes per month, per year
# 1. Count the crimes in each (year, month) of occurrence.
# 2. Average those monthly counts within each year.
# Only months with at least one record produce a row, so a year with partial
# data is averaged over the months present rather than over 12.
average_crime_per_month <- crime %>%
  mutate(Month = month(`DATE OCC`)) %>%
  group_by(year_occ, Month) %>%
  summarise(CrimesPerMonth = n(), .groups = "drop") %>%
  group_by(year_occ) %>%
  summarise(AverageCrimePerMonth = mean(CrimesPerMonth), .groups = "drop")
print(average_crime_per_month)
## # A tibble: 6 × 2
##   year_occ AverageCrimePerMonth
##      <dbl>                <dbl>
## 1     2020              16654. 
## 2     2021              17490. 
## 3     2022              19605. 
## 4     2023              19362. 
## 5     2024              10631. 
## 6     2025                 19.4
library(ggplot2)
# Line plot of the yearly average monthly crime count (blue line, red points).
# `linewidth` replaces `size` for line thickness: `size` on line geoms has been
# deprecated since ggplot2 3.4.0 and raises a warning.
ggplot(average_crime_per_month, aes(x = year_occ, y = AverageCrimePerMonth)) +
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "red", size = 2) +
  labs(
    title = "Average Crime Per Month",
    x = "Year",
    y = "Average Crimes Per Month"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

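3. Finding out if there is a correlation between total crimes per year and average crimes per month. The correlation coefficient is 1, which means there is a perfect positive linear relationship between total crimes per year and average crimes per month. Note that this result is expected by construction: the monthly average is the yearly total divided by the number of months with recorded crimes, so the two measures carry essentially the same information.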

# Combine the yearly totals with the yearly monthly averages, matching rows on
# the year of occurrence (every row of the totals table is kept).
crime_summary <- left_join(
  total_crime_per_year,
  average_crime_per_month,
  by = "year_occ"
)
print(crime_summary)
## # A tibble: 6 × 3
##   year_occ Totalcrime AverageCrimePerMonth
##      <dbl>      <int>                <dbl>
## 1     2020     199847              16654. 
## 2     2021     209876              17490. 
## 3     2022     235259              19605. 
## 4     2023     232345              19362. 
## 5     2024     127567              10631. 
## 6     2025         97                 19.4
# Finding correlation
# Pearson correlation test between the yearly totals and the yearly average
# monthly counts held in crime_summary.
# NOTE(review): AverageCrimePerMonth is the yearly total divided by the number
# of months with data, so a correlation of ~1 is expected by construction --
# confirm this test answers the intended question.
cor_test <- cor.test(crime_summary$Totalcrime, crime_summary$AverageCrimePerMonth)
print(cor_test)
## 
##  Pearson's product-moment correlation
## 
## data:  crime_summary$Totalcrime and crime_summary$AverageCrimePerMonth
## t = 7616.4, df = 4, p-value = 1.783e-15
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9999997 1.0000000
## sample estimates:
## cor 
##   1
4. The histogram below shows the distribution of victim ages. The most frequent age recorded is 0, which occurs because, in certain crime categories such as vehicle theft or vandalism, the data set records the age of the property rather than the age of a person. Apart from that, the most common victim ages fall between the late 20s and early 30s.
# Preview the first six rows, including the derived year columns.
crime %>% head(n = 6)
## # A tibble: 6 × 30
##       DR_NO `Date Rptd`         `DATE OCC`          `TIME OCC` AREA  `AREA NAME`
##       <dbl> <dttm>              <dttm>              <Period>   <chr> <chr>      
## 1 211507896 2021-04-11 00:00:00 2020-11-07 00:00:00 8H 45M 0S  15    N Hollywood
## 2 201516622 2020-10-21 00:00:00 2020-10-18 00:00:00 18H 45M 0S 15    N Hollywood
## 3 240913563 2024-12-10 00:00:00 2020-10-30 00:00:00 12H 40M 0S 09    Van Nuys   
## 4 210704711 2020-12-24 00:00:00 2020-12-24 00:00:00 13H 10M 0S 07    Wilshire   
## 5 201418201 2020-10-03 00:00:00 2020-09-29 00:00:00 18H 30M 0S 14    Pacific    
## 6 240412063 2024-12-11 00:00:00 2020-11-11 00:00:00 12H 10M 0S 04    Hollenbeck 
## # ℹ 24 more variables: `Rpt Dist No` <chr>, `Part 1-2` <dbl>, `Crm Cd` <dbl>,
## #   `Crm Cd Desc` <chr>, Mocodes <chr>, `Vict Age` <dbl>, `Vict Sex` <chr>,
## #   `Vict Descent` <chr>, `Premis Cd` <dbl>, `Premis Desc` <chr>,
## #   `Weapon Used Cd` <dbl>, `Weapon Desc` <chr>, Status <chr>,
## #   `Status Desc` <chr>, `Crm Cd 1` <dbl>, `Crm Cd 2` <dbl>, `Crm Cd 3` <dbl>,
## #   `Crm Cd 4` <lgl>, LOCATION <chr>, `Cross Street` <chr>, LAT <dbl>,
## #   LON <dbl>, year_occ <dbl>, year_rptd <dbl>
# Histogram of victim age across all records.
# An explicit title and axis label replace the default ones, which would show
# the raw expression crime$`Vict Age`.
# NOTE(review): records with age 0 are included -- confirm what 0 encodes.
hist(
  crime$`Vict Age`,
  main = "Distribution of Victim Age",
  xlab = "Victim age"
)

5. Question: Is there a significant difference in the number of crimes between Area 1 and Area 3? The results showed that Area 1 had a slightly higher average yearly crime count (11,611) compared to Area 3 (9,573). However, the p-value (0.55) indicates that the difference is not statistically significant. Therefore, the difference in crime levels between these two areas is not statistically meaningful.

# Subset to Area 01 and Area 03 only, with AREA as a two-level factor.
crime_subset <- crime %>%
  filter(AREA == "01" | AREA == "03") %>%
  mutate(AREA = factor(AREA))

# Yearly crime count for each of the two areas.
crime_counts <- count(crime_subset, AREA, year_occ, name = "TotalCrimes")

# Welch two-sample t-test comparing the yearly counts of Area 01 vs Area 03.
# NOTE(review): no year filter is applied, so years with only partial data
# are included in each group's yearly counts -- confirm this is intended.
t.test_result <- t.test(TotalCrimes ~ AREA, data = crime_counts)
print(t.test_result)
## 
##  Welch Two Sample t-test
## 
## data:  TotalCrimes by AREA
## t = 0.61315, df = 9.4664, p-value = 0.5542
## alternative hypothesis: true difference in means between group 01 and group 03 is not equal to 0
## 95 percent confidence interval:
##  -5425.367  9501.701
## sample estimates:
## mean in group 01 mean in group 03 
##         11611.67          9573.50