library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(crimedata)
## Warning: package 'crimedata' was built under R version 4.4.3
# Request crime records for Seattle via the crimedata package.
# NOTE(review): the return value is not assigned, so at top level it is only
# auto-printed. The `Seattle` table used below is read from an Excel workbook
# instead -- presumably an export of this data; confirm, or assign the result.
get_crime_data(cities = "Seattle")
read the Excel workbooks, then subset the household data frame to keep all rows for specific columns
library(readr)
library(readxl)
# Load the crime records and the 2010 census-tract household statistics from
# their Excel workbooks.
Seattle <- read_excel(
  "C:/Users/ku26/OneDrive - Drexel University/Documents/310Wi26/Seattle/Seattle.xlsx"
)
Seattle_households <- read_excel(
  "C:/Users/ku26/OneDrive - Drexel University/Documents/310Wi26/Seattle/2010_Census_Tract_Seattle_-_Household_Statistics.xlsx"
)

# Keep every row, but only the tract identifier and the population and
# household count columns.
Seattle_2010_households <- Seattle_households %>%
  select(
    GEOID10,
    Total_Population,
    Total_Households,
    Family_households,
    Nonfamily_households
  )
rename the columns
# Give the household columns short snake_case names. The tract identifier is
# renamed to `census_block`, the column name used as the join key further down.
Seattle_2010_households <- rename(
  Seattle_2010_households,
  census_block = GEOID10,
  total_pop = Total_Population,
  total_households = Total_Households,
  total_family = Family_households,
  total_nonfam = Nonfamily_households
)
delete one complete row if needed
# not applicable here: Seattle_2010_households <- Seattle_2010_households[-1, ]
remove last four digits from the entries in one column
# Build the tract-level join key from the crime data's census_block.
#
# Keep the FIRST 11 characters rather than dropping the last four. For a
# 15-digit block ID this is the same tract prefix as before, but an ID that is
# already 11 digits long is left untouched. Dropping four more digits from an
# 11-digit ID leaves a 7-digit key that matches none of the 11-digit tract IDs
# in the household table, so the full join pairs no rows at all.
# NOTE(review): assumes census_block holds standard Census GEOIDs
# (state 2 + county 3 + tract 6 [+ block 4]) -- confirm against Seattle.xlsx.
Seattle <- Seattle %>%
  mutate(
    # Numeric IDs are converted with format() so that large values can never
    # be rendered in scientific notation before the substring is taken.
    census_block = if (is.numeric(census_block)) {
      format(census_block, scientific = FALSE, trim = TRUE)
    } else {
      as.character(census_block)
    },
    # Back to numeric so the key has the same type as the household table's key.
    census_block = as.numeric(str_sub(census_block, start = 1, end = 11))
  )
merge on census_block
# Full join on the shared key: every household row and every crime row is
# kept, whether or not it finds a match in the other table.
Seattle_crime_households <- Seattle_2010_households %>%
  full_join(Seattle, by = "census_block")

# Inspect the column types of the crime table.
str(Seattle)
## tibble [690 × 12] (S3: tbl_df/tbl/data.frame)
## $ uid : num [1:690] 27344461 27344544 27344564 27345105 27345138 ...
## $ city_name : chr [1:690] "Seattle" "Seattle" "Seattle" "Seattle" ...
## $ offense_code : chr [1:690] "90D" "22U" "23F" "13C" ...
## $ offense_type : chr [1:690] "driving under the influence" "burglary/breaking & entering" "theft from motor vehicle (except theft of motor vehicle parts or accessories)" "intimidation" ...
## $ offense_group : chr [1:690] "driving under the influence" "burglary/breaking & entering" "larceny/theft offenses" "assault offenses" ...
## $ offense_against: chr [1:690] "society" "property" "property" "persons" ...
## $ date_single : POSIXct[1:690], format: "2022-01-01 11:48:00" "2022-01-01 21:32:00" ...
## $ date_start : POSIXct[1:690], format: "2022-01-01 11:00:00" "2022-01-01 21:30:00" ...
## $ date_end : POSIXct[1:690], format: "2022-01-01 12:36:00" "2022-01-01 21:35:00" ...
## $ longitude : num [1:690] -122 -122 -122 -122 -122 ...
## $ latitude : num [1:690] 47.6 47.6 47.7 47.6 47.6 ...
## $ census_block : num [1:690] 5303300 5303300 5303300 5303300 5303300 ...
define breaks for total population
# Population bins: five 1000-wide intervals from 0 to 5000, plus an open-ended
# top bin, with one label per bin.
breaks <- c(seq(0, 5000, by = 1000), Inf)
labels <- c(
  "Small", "Kindof Small", "Medium", "kindof Medium", "Large", "Larger"
)

# Coerce total_pop to numeric, then bin it. include.lowest = TRUE puts a
# population of exactly 0 into the first bin.
Seattle_crime_households <- Seattle_crime_households %>%
  mutate(
    total_pop = as.numeric(total_pop),
    population_category = cut(
      total_pop,
      breaks = breaks,
      labels = labels,
      include.lowest = TRUE
    )
  )

# Per-column overview, including NA counts.
summary(Seattle_crime_households)
## census_block total_pop total_households total_family
## Min. :5.303e+06 Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.:5.303e+06 1st Qu.:3500 1st Qu.:1481 1st Qu.: 617.5
## Median :5.303e+06 Median :4476 Median :2047 Median : 899.0
## Mean :8.683e+09 Mean :4509 Mean :2100 Mean : 901.4
## 3rd Qu.:5.303e+06 3rd Qu.:5612 3rd Qu.:2629 3rd Qu.:1117.5
## Max. :5.303e+10 Max. :7789 Max. :4878 Max. :2136.0
## NA's :690 NA's :690 NA's :690
## total_nonfam uid city_name offense_code
## Min. : 0 Min. :27344461 Length:825 Length:825
## 1st Qu.: 617 1st Qu.:27360594 Class :character Class :character
## Median :1019 Median :27378578 Mode :character Mode :character
## Mean :1199 Mean :27378709
## 3rd Qu.:1582 3rd Qu.:27396509
## Max. :3992 Max. :27413244
## NA's :690 NA's :135
## offense_type offense_group offense_against
## Length:825 Length:825 Length:825
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## date_single date_start
## Min. :2022-01-01 11:48:00.00 Min. :2021-09-13 16:00:00.00
## 1st Qu.:2022-03-22 11:59:00.00 1st Qu.:2022-03-20 09:57:30.00
## Median :2022-06-26 13:00:00.00 Median :2022-06-24 14:45:00.00
## Mean :2022-06-28 00:37:50.98 Mean :2022-06-26 10:15:58.47
## 3rd Qu.:2022-09-28 05:37:00.00 3rd Qu.:2022-09-27 18:33:45.00
## Max. :2022-12-31 19:08:00.00 Max. :2022-12-31 19:08:00.00
## NA's :160 NA's :197
## date_end longitude latitude
## Min. :2022-01-01 12:36:00.00 Min. :-122.4 Min. :47.50
## 1st Qu.:2022-03-23 00:09:00.00 1st Qu.:-122.3 1st Qu.:47.59
## Median :2022-06-26 07:00:30.00 Median :-122.3 Median :47.62
## Mean :2022-06-29 17:52:35.69 Mean :-122.3 Mean :47.62
## 3rd Qu.:2022-09-30 19:22:30.00 3rd Qu.:-122.3 3rd Qu.:47.66
## Max. :2023-07-09 16:08:00.00 Max. :-122.3 Max. :47.73
## NA's :171 NA's :135 NA's :135
## population_category
## Small : 3
## Kindof Small : 2
## Medium : 18
## kindof Medium: 26
## Large : 38
## Larger : 48
## NA's :690
create a visualization of offense targets (offense_against) by population category
# Dodged bar chart of row counts per population category, one bar per
# offense_against value. Rows with NA values are still included here.
ggplot(
  Seattle_crime_households,
  aes(x = population_category, fill = offense_against)
) +
  geom_bar(position = "dodge")

remove rows that contain NA values
# drop_na() with no columns named removes every row that has an NA in ANY
# column, then the same dodged bar chart is drawn from what is left.
clean_crime <- drop_na(Seattle_crime_households)

ggplot(
  clean_crime,
  aes(x = population_category, fill = offense_against)
) +
  geom_bar(position = "dodge")

note what happens
try this
# Drop only the rows whose offense_against is NA; NAs in other columns are
# kept. Then redraw the dodged bar chart.
clean_crime_1 <- drop_na(Seattle_crime_households, offense_against)

ggplot(
  clean_crime_1,
  aes(x = population_category, fill = offense_against)
) +
  geom_bar(position = "dodge")

now this
# Starting from clean_crime_1, also drop the rows whose population_category is
# NA, so both plotted variables are complete. Then redraw the chart.
clean_crime_2 <- drop_na(clean_crime_1, population_category)

ggplot(
  clean_crime_2,
  aes(x = population_category, fill = offense_against)
) +
  geom_bar(position = "dodge")
