Hello, my name is Sanghyun Hyun. I currently work at Technovation Partners, a Korean consulting firm specializing in R&D planning. My dream is to become a more skilled consultant by learning data analysis and applying it to my work.
2017–2024: Sungkyunkwan University, B.A. in Korean Language and Literature
2022–2024: M.S. in Data Marketing
I am a complete beginner in R.
Although R feels difficult at first, I am eager to learn step by step.
Python: data analysis, regression analysis, visualization
# 1) Load the blood-transfusion CSV into `blood`
#    (show_col_types = FALSE keeps readr from printing its column-spec message)
blood <- readr::read_csv(
  file = "blood_transfusion.csv",
  show_col_types = FALSE
)
# 2) Size of the table and the type of every column
c(nrow(blood), ncol(blood)) # rows, columns
## [1] 748 5
utils::str(blood) # compact per-column type overview
## spc_tbl_ [748 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Recency : num [1:748] 2 0 1 2 1 4 2 1 2 5 ...
## $ Frequency: num [1:748] 50 13 16 20 24 4 7 12 9 46 ...
## $ Monetary : num [1:748] 12500 3250 4000 5000 6000 1000 1750 3000 2250 11500 ...
## $ Time : num [1:748] 98 28 35 45 77 4 14 35 22 98 ...
## $ Class : chr [1:748] "donated" "donated" "donated" "donated" ...
## - attr(*, "spec")=
## .. cols(
## .. Recency = col_double(),
## .. Frequency = col_double(),
## .. Monetary = col_double(),
## .. Time = col_double(),
## .. Class = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# 3) Missing values: per-column counts and the overall total
col_na_blood <- colSums(is.na(blood))
total_na_blood <- sum(is.na(blood))
print(total_na_blood)
## [1] 0
print(col_na_blood)
## Recency Frequency Monetary Time Class
## 0 0 0 0 0
# 4) Top of the data: the first 10 rows, then only their Class labels
head(blood, n = 10)
## # A tibble: 10 × 5
## Recency Frequency Monetary Time Class
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2 50 12500 98 donated
## 2 0 13 3250 28 donated
## 3 1 16 4000 35 donated
## 4 2 20 5000 45 donated
## 5 1 24 6000 77 not donated
## 6 4 4 1000 4 not donated
## 7 2 7 1750 14 donated
## 8 1 12 3000 35 not donated
## 9 2 9 2250 22 donated
## 10 5 46 11500 98 donated
head(blood$Class, n = 10)
## [1] "donated" "donated" "donated" "donated" "not donated"
## [6] "not donated" "donated" "not donated" "donated" "donated"
# 5) Bottom of the data: the last 10 rows, then only their Class labels
tail(blood, n = 10)
## # A tibble: 10 × 5
## Recency Frequency Monetary Time Class
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 23 1 250 23 not donated
## 2 23 4 1000 52 not donated
## 3 23 1 250 23 not donated
## 4 23 7 1750 88 not donated
## 5 16 3 750 86 not donated
## 6 23 2 500 38 not donated
## 7 21 2 500 52 not donated
## 8 23 3 750 62 not donated
## 9 39 1 250 39 not donated
## 10 72 1 250 72 not donated
tail(blood$Class, n = 10)
## [1] "not donated" "not donated" "not donated" "not donated" "not donated"
## [6] "not donated" "not donated" "not donated" "not donated" "not donated"
# 6) Monetary for row 100: take the row first, then keep only that column
#    (the result is still a 1 × 1 tibble, not a bare number)
blood[100, ]["Monetary"]
## # A tibble: 1 × 1
## Monetary
## <dbl>
## 1 1750
# 7) Average of the Monetary column (the column has no missing values,
#    so na.rm is not required)
monetary_mean <- mean(blood$Monetary)
print(monetary_mean)
## [1] 1378.676
# 8) Rows whose Monetary exceeds the mean: build a logical mask, subset the
#    tibble with it, and report the subset's row count
above_avg_idx <- monetary_mean < blood$Monetary
dim(blood[above_avg_idx, ])[1]
## [1] 267
# 1) Load the crime-incidents CSV into `crime`
crime <- readr::read_csv(
  file = "PDI__Police_Data_Initiative__Crime_Incidents.csv",
  show_col_types = FALSE
)
# 2) Dimensions (rows, columns)
c(nrow(crime), ncol(crime))
## [1] 15155 40
# 3) Missing values: per-column counts and the overall total
col_na_crime <- colSums(is.na(crime))
total_na_crime <- sum(is.na(crime))
print(total_na_crime)
## [1] 95592
print(col_na_crime)
## INSTANCEID INCIDENT_NO
## 0 0
## DATE_REPORTED DATE_FROM
## 0 2
## DATE_TO CLSD
## 9 545
## UCR DST
## 10 0
## BEAT OFFENSE
## 28 10
## LOCATION THEFT_CODE
## 2 10167
## FLOOR SIDE
## 14127 14120
## OPENING HATE_BIAS
## 14508 0
## DAYOFWEEK RPT_AREA
## 423 239
## CPD_NEIGHBORHOOD WEAPONS
## 249 5
## DATE_OF_CLEARANCE HOUR_FROM
## 2613 2
## HOUR_TO ADDRESS_X
## 9 148
## LONGITUDE_X LATITUDE_X
## 1714 1714
## VICTIM_AGE VICTIM_RACE
## 0 2192
## VICTIM_ETHNICITY VICTIM_GENDER
## 2192 2192
## SUSPECT_AGE SUSPECT_RACE
## 0 7082
## SUSPECT_ETHNICITY SUSPECT_GENDER
## 7082 7082
## TOTALNUMBERVICTIMS TOTALSUSPECTS
## 33 7082
## UCR_GROUP ZIP
## 10 1
## COMMUNITY_COUNCIL_NEIGHBORHOOD SNA_NEIGHBORHOOD
## 0 0
# 4) Date range for DATE_REPORTED
# The first 10 characters of each value hold the calendar date in
# month/day/year order, so the format has to be stated explicitly. Without it
# as.Date() falls back to "%Y/%m/%d", reading the month as the year, the day as
# the month and the leading "20" of the year as the day. That is what produced
# the previously recorded range "0001-01-20" to "0006-12-20"; rows whose
# day-of-month exceeds 12 cannot parse that way and silently become NA.
date_vec <- as.Date(
  substr(crime[["DATE_REPORTED"]], 1, 10),
  format = "%m/%d/%Y"
)
# Surface any value that does not match the expected layout instead of letting
# range(na.rm = TRUE) hide it.
if (anyNA(date_vec[!is.na(crime[["DATE_REPORTED"]])])) {
  warning(
    "Some DATE_REPORTED values did not parse as month/day/year",
    call. = FALSE
  )
}
range(date_vec, na.rm = TRUE)
# (output not re-recorded here; re-run to obtain the corrected date range)
# 5) Most frequent SUSPECT_AGE bracket once the "UNKNOWN" label is set aside
age_tab <- table(crime$SUSPECT_AGE)
known_age_tab <- age_tab[!(names(age_tab) %in% "UNKNOWN")] # drop UNKNOWN
# which.max() picks the first largest count; single-element subsetting keeps
# the bracket name attached to the count when printed
known_age_tab[which.max(known_age_tab)]
## 18-25
## 1778
# 6) Incident counts per ZIP code, ordered from most to fewest incidents
zip_tab <- table(crime$ZIP)
zip_tab <- sort(zip_tab, decreasing = TRUE)
head(zip_tab, n = 10) # ten busiest ZIP codes
##
## 45202 45205 45211 45238 45229 45219 45225 45214 45237 45223
## 2049 1110 1094 956 913 863 811 774 699 653
# (Optional) ZIP sanity check: how many characters does each ZIP have?
# Missing ZIPs are tallied in their own <NA> bucket.
zip_chr <- as.character(crime$ZIP)
table(nchar(zip_chr, type = "chars"), useNA = "ifany")
##
## 4 5 <NA>
## 3 15151 1
# 7) Share of incidents by day of the week
#    (table() leaves out rows where DAYOFWEEK is missing, so the shares are
#    relative to incidents with a recorded day)
day_tab <- table(crime$DAYOFWEEK)
day_prop <- prop.table(day_tab)
day_tab[which.max(day_tab)] # busiest day and its count
## SATURDAY
## 2272
max(day_prop) # that day's share of incidents
## [1] 0.1542221
print(day_prop) # shares for all seven days
##
## FRIDAY MONDAY SATURDAY SUNDAY THURSDAY TUESDAY WEDNESDAY
## 0.1369807 0.1438365 0.1542221 0.1448547 0.1363019 0.1432935 0.1405105
# 8) Example exploratory starts
# Ten most frequent offense types. head() is used rather than [1:10] so that a
# column with fewer than ten distinct values yields a shorter table instead of
# one padded with NA entries; it also matches how the top ZIP codes are taken.
head(sort(table(crime[["OFFENSE"]]), decreasing = TRUE), 10)
##
## THEFT CRIMINAL DAMAGING/ENDANGERING
## 4988 2248
## ASSAULT DOMESTIC VIOLENCE
## 1668 916
## FELONIOUS ASSAULT AGGRAVATED MENACING
## 639 591
## BURGLARY BREAKING AND ENTERING
## 514 451
## AGGRAVATED ROBBERY MENACING
## 399 356
# Five-number summary (plus mean and NA count) of HOUR_FROM.
# NOTE(review): the 0–2359 span suggests HHMM clock codes rather than a
# continuous quantity; if so, the mean and quartiles are not real times. Confirm.
with(crime, summary(HOUR_FROM))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 120 230 780 1535 2359 2