Hi, I’m Tyler Goad. I’m an MBA student at the
University of Cincinnati.
I currently live in Columbus, OH with my dog Max. In my free time I like
to take him to the park, go to the gym, and play golf on the
weekends.
I have some experience using R for data wrangling,
summaries, and creating reports with R Markdown. I’m becoming more
comfortable with the tidyverse packages
(dplyr, ggplot2, readr) and have
used R for simple data analysis and visualization. I like how R Markdown
lets me combine text, code, and output all in one place.
Outside of R, I’ve worked with:
- Python (pandas, matplotlib, NumPy)
- SQL (basic queries and joins for pulling data)
- Excel (Solver, pivot tables, Power Query, financial
modeling)
- Tableau (interactive dashboards)
- Alteryx (workflows for variance commentary, data
cleaning)
- Databricks/Spark (basic Python + SQL in a big data
environment)
This section imports the blood_transfusion.csv file and
answers the lab questions about its structure and contents.
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the blood transfusion CSV into a tibble. If the file is not in the
# working directory, change the path (e.g., "data/blood_transfusion.csv").
df <- read_csv(file = "blood_transfusion.csv")
## Rows: 748 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Class
## dbl (4): Recency, Frequency, Monetary, Time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Count NA cells column by column, then total them (0 = nothing missing)
sum(colSums(is.na(df)))
## [1] 0
# Table size, reported as rows then columns
dim(df)
## [1] 748 5
# Preview the first ten observations
head(df, n = 10)
## # A tibble: 10 × 5
## Recency Frequency Monetary Time Class
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2 50 12500 98 donated
## 2 0 13 3250 28 donated
## 3 1 16 4000 35 donated
## 4 2 20 5000 45 donated
## 5 1 24 6000 77 not donated
## 6 4 4 1000 4 not donated
## 7 2 7 1750 14 donated
## 8 1 12 3000 35 not donated
## 9 2 9 2250 22 donated
## 10 5 46 11500 98 donated
# Preview the final ten observations
tail(df, n = 10)
## # A tibble: 10 × 5
## Recency Frequency Monetary Time Class
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 23 1 250 23 not donated
## 2 23 4 1000 52 not donated
## 3 23 1 250 23 not donated
## 4 23 7 1750 88 not donated
## 5 16 3 750 86 not donated
## 6 23 2 500 38 not donated
## 7 21 2 500 52 not donated
## 8 23 3 750 62 not donated
## 9 39 1 250 39 not donated
## 10 72 1 250 72 not donated
# Class labels only: first ten observations, then last ten
head(df[["Class"]], n = 10)
## [1] "donated" "donated" "donated" "donated" "not donated"
## [6] "not donated" "donated" "not donated" "donated" "donated"
tail(df[["Class"]], n = 10)
## [1] "not donated" "not donated" "not donated" "not donated" "not donated"
## [6] "not donated" "not donated" "not donated" "not donated" "not donated"
# Monetary value in row 100 (single-bracket indexing keeps it a 1 x 1 tibble)
df[100, "Monetary"]
## # A tibble: 1 × 1
## Monetary
## <dbl>
## 1 1750
# Average of the Monetary column (NA entries ignored), kept for reuse below
monetary_mean <- mean(df[["Monetary"]], na.rm = TRUE)
monetary_mean
## [1] 1378.676
# Flag rows whose Monetary value exceeds the average, then count those rows
above_avg <- df[["Monetary"]] > monetary_mean
nrow(df[above_avg, ])
## [1] 267
library(readr)
library(dplyr)
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Read the Cincinnati PDI crime-incident export into a tibble. Change the path
# if needed, e.g., "data/PDI__Police_Data_Initiative__Crime_Incidents.csv".
crime <- read_csv(file = "PDI__Police_Data_Initiative__Crime_Incidents.csv")
## Rows: 15155 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (32): INSTANCEID, INCIDENT_NO, DATE_REPORTED, DATE_FROM, DATE_TO, CLSD, ...
## dbl (8): UCR, HOUR_FROM, HOUR_TO, LONGITUDE_X, LATITUDE_X, TOTALNUMBERVICTI...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Table size, reported as rows then columns
dim(crime)
## [1] 15155 40
# Does the table contain at least one NA anywhere?
anyNA(crime)
## [1] TRUE
# Number of NA cells in each column
colSums(is.na(crime))
## INSTANCEID INCIDENT_NO
## 0 0
## DATE_REPORTED DATE_FROM
## 0 2
## DATE_TO CLSD
## 9 545
## UCR DST
## 10 0
## BEAT OFFENSE
## 28 10
## LOCATION THEFT_CODE
## 2 10167
## FLOOR SIDE
## 14127 14120
## OPENING HATE_BIAS
## 14508 0
## DAYOFWEEK RPT_AREA
## 423 239
## CPD_NEIGHBORHOOD WEAPONS
## 249 5
## DATE_OF_CLEARANCE HOUR_FROM
## 2613 2
## HOUR_TO ADDRESS_X
## 9 148
## LONGITUDE_X LATITUDE_X
## 1714 1714
## VICTIM_AGE VICTIM_RACE
## 0 2192
## VICTIM_ETHNICITY VICTIM_GENDER
## 2192 2192
## SUSPECT_AGE SUSPECT_RACE
## 0 7082
## SUSPECT_ETHNICITY SUSPECT_GENDER
## 7082 7082
## TOTALNUMBERVICTIMS TOTALSUSPECTS
## 33 7082
## UCR_GROUP ZIP
## 10 1
## COMMUNITY_COUNCIL_NEIGHBORHOOD SNA_NEIGHBORHOOD
## 0 0
# ---- Robust DATE_REPORTED parsing and range ----
# Look at a few raw values first to see how the timestamps are written
head(crime[["DATE_REPORTED"]], n = 5)
## [1] "1/1/2022 0:09" "1/1/2022 0:09" "1/1/2022 0:09" "1/1/2022 0:09"
## [5] "1/1/2022 0:09"
# Candidate layouts handed to parse_date_time(): month-first and year-first
# variants, with and without a time component
date_orders <- c(
  "mdy HMS", "mdy HM", "mdy IMS p", "mdy",
  "Ymd HMS", "Ymd HM", "Ymd"
)
parsed_date <- parse_date_time(
  crime[["DATE_REPORTED"]],
  orders = date_orders,
  tz = "America/New_York",
  exact = FALSE
)
# Earliest and latest report timestamps
range(parsed_date, na.rm = TRUE)
## [1] "2022-01-01 00:09:00 EST" "2022-06-26 03:44:00 EDT"
sum(is.na(parsed_date)) # how many didn't parse
## [1] 0
# ---- Most common suspect age (or age range) ----
# Pick whichever suspect-age column this extract has; NA if neither is present.
age_var <- if ("SUSPECT_AGE" %in% names(crime)) {
  "SUSPECT_AGE"
} else if ("SUSPECT_AGE_RANGE" %in% names(crime)) {
  "SUSPECT_AGE_RANGE"
} else {
  NA_character_
}
if (!is.na(age_var)) {
  # Frequency of each age value, most common first (NA values are not counted).
  # head() returns at most 10 entries; indexing with [1:10] would pad a
  # shorter table with a spurious <NA> entry.
  head(sort(table(crime[[age_var]], useNA = "no"), decreasing = TRUE), 10)
} else {
  "No suspect age column found."
}
##
## UNKNOWN 18-25 31-40 26-30 41-50 UNDER 18 51-60 61-70
## 9003 1778 1525 1126 659 629 298 121
## OVER 70
## 16
# ---- Incidents per ZIP (sorted) + quick quality check ----
# Count incidents per ZIP value; as.character() makes the table names plain
# strings whatever type the column was read as.
zip_tbl <- table(as.character(crime[["ZIP"]]))
# The 15 ZIPs with the most incidents. head() is used instead of [1:15] so a
# table with fewer entries is not padded with NA.
head(sort(zip_tbl, decreasing = TRUE), 15)
##
## 45202 45205 45211 45238 45229 45219 45225 45214 45237 45223 45206 45220 45232
## 2049 1110 1094 956 913 863 811 774 699 653 616 477 477
## 45224 45209
## 429 380
# ZIP values that are not exactly five digits. Show at most 20 of them;
# head() avoids the NA padding that [1:20] adds when fewer than 20 exist.
head(names(zip_tbl)[!grepl("^\\d{5}$", names(zip_tbl))], 20)
## [1] "4523" "5239"
# ---- Day with most incidents + its proportion ----
# Incident counts per day of week (rows with a missing DAYOFWEEK are excluded,
# so the share below is relative to incidents with a recorded day)
dow_tbl <- table(crime[["DAYOFWEEK"]])
# Day with the highest count
head(sort(dow_tbl, decreasing = TRUE), 1)
## SATURDAY
## 2272
# That day's share of the counted incidents
head(sort(prop.table(dow_tbl), decreasing = TRUE), 1)
## SATURDAY
## 0.1542221
# ---- Explore 3 relevant columns that exist in this file ----
# Columns of interest; only those actually present in `crime` are kept
cand_cols <- c(
  "OFFENSE",
  "VICTIM_AGE",
  "CPD_NEIGHBORHOOD",
  "COMMUNITY_COUNCIL_NEIGHBORHOOD"
)
cols_to_check <- cand_cols[cand_cols %in% names(crime)]
cols_to_check
## [1] "OFFENSE" "VICTIM_AGE"
## [3] "CPD_NEIGHBORHOOD" "COMMUNITY_COUNCIL_NEIGHBORHOOD"
# Number of NA cells in each of the selected columns
colSums(is.na(crime[cols_to_check]))
## OFFENSE VICTIM_AGE
## 10 0
## CPD_NEIGHBORHOOD COMMUNITY_COUNCIL_NEIGHBORHOOD
## 249 0
# Quick summaries, returned as one list element per selected column:
# - Non-numeric columns: up to the 10 most frequent values. In this file that
#   is every selected column, including VICTIM_AGE, which holds age-range
#   labels rather than numbers.
# - Numeric columns: summary() plus the values boxplot.stats() flags as
#   outliers.
lapply(crime[cols_to_check], function(col) {
  if (is.numeric(col)) {
    list(summary = summary(col), outliers = boxplot.stats(col)$out)
  } else {
    # head() caps the result at 10 without padding; [1:10] would append
    # <NA> entries for a column with fewer than 10 distinct values.
    head(sort(table(col), decreasing = TRUE), 10)
  }
})
## $OFFENSE
## col
## THEFT CRIMINAL DAMAGING/ENDANGERING
## 4988 2248
## ASSAULT DOMESTIC VIOLENCE
## 1668 916
## FELONIOUS ASSAULT AGGRAVATED MENACING
## 639 591
## BURGLARY BREAKING AND ENTERING
## 514 451
## AGGRAVATED ROBBERY MENACING
## 399 356
##
## $VICTIM_AGE
## col
## 31-40 18-25 UNKNOWN 41-50 26-30 51-60
## 2978 2782 2283 1838 1736 1335
## 61-70 UNDER 18 OVER 70 ADULT (18+)
## 879 744 432 108
##
## $CPD_NEIGHBORHOOD
## col
## WESTWOOD OVER-THE-RHINE C. B. D. / RIVERFRONT
## 1683 904 845
## WEST PRICE HILL EAST PRICE HILL AVONDALE
## 782 764 664
## WEST END WALNUT HILLS ROSELAWN
## 499 490 436
## COLLEGE HILL
## 418
##
## $COMMUNITY_COUNCIL_NEIGHBORHOOD
## col
## N/A WESTWOOD OTR WEST PRICE HILL EAST PRICE HILL
## 1639 1604 810 758 711
## AVONDALE DOWNTOWN WEST END WALNUT HILLS COLLEGE HILL
## 600 556 531 436 388