Introduction to Me!

Sanghyun

Hello, my name is Sanghyun Hyun. I currently work at Technovation Partners, a Korean consulting firm specializing in R&D planning. My goal is to become a more capable consultant by learning data analysis and applying it to my work.


Academic Background

  • 2017–2024: Sungkyunkwan University, B.A. in Korean Language and Literature

  • 2022–2024: M.S. in Data Marketing


Professional Background

  • Oct 2023 – Present: Technovation Partners — R&D Planning Consultant

Experience with R

I am a complete beginner in R.

Although R feels difficult at first, I am eager to learn step by step.

I want to apply R to real consulting projects in the near future.

Experience with Other Analytic Software

Python: data analysis, regression analysis, visualization


Part 2 — Lab Work (Week 2)

2-1) blood_transfusion.csv

# 1) Read the blood-transfusion CSV into a tibble
#    (show_col_types = FALSE silences readr's column-specification message)
blood <- readr::read_csv(
  file = "blood_transfusion.csv",
  show_col_types = FALSE
)

# 2) Size and structure of the data
c(nrow(blood), ncol(blood))   # number of rows, then number of columns
## [1] 748   5
utils::str(blood)             # type and leading values of every column
## spc_tbl_ [748 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Recency  : num [1:748] 2 0 1 2 1 4 2 1 2 5 ...
##  $ Frequency: num [1:748] 50 13 16 20 24 4 7 12 9 46 ...
##  $ Monetary : num [1:748] 12500 3250 4000 5000 6000 1000 1750 3000 2250 11500 ...
##  $ Time     : num [1:748] 98 28 35 45 77 4 14 35 22 98 ...
##  $ Class    : chr [1:748] "donated" "donated" "donated" "donated" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Recency = col_double(),
##   ..   Frequency = col_double(),
##   ..   Monetary = col_double(),
##   ..   Time = col_double(),
##   ..   Class = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# 3) Missing values: NA count per column and across the whole table
col_na_blood <- colSums(is.na(blood))
total_na_blood <- sum(is.na(blood))
print(total_na_blood)   # grand total of NA cells
## [1] 0
print(col_na_blood)     # NA cells per column
##   Recency Frequency  Monetary      Time     Class 
##         0         0         0         0         0
# 4) Top of the table: first 10 rows, then just their Class labels
head(blood, n = 10L)
## # A tibble: 10 × 5
##    Recency Frequency Monetary  Time Class      
##      <dbl>     <dbl>    <dbl> <dbl> <chr>      
##  1       2        50    12500    98 donated    
##  2       0        13     3250    28 donated    
##  3       1        16     4000    35 donated    
##  4       2        20     5000    45 donated    
##  5       1        24     6000    77 not donated
##  6       4         4     1000     4 not donated
##  7       2         7     1750    14 donated    
##  8       1        12     3000    35 not donated
##  9       2         9     2250    22 donated    
## 10       5        46    11500    98 donated
head(blood$Class, n = 10L)
##  [1] "donated"     "donated"     "donated"     "donated"     "not donated"
##  [6] "not donated" "donated"     "not donated" "donated"     "donated"
# 5) Bottom of the table: last 10 rows, then just their Class labels
tail(blood, n = 10L)
## # A tibble: 10 × 5
##    Recency Frequency Monetary  Time Class      
##      <dbl>     <dbl>    <dbl> <dbl> <chr>      
##  1      23         1      250    23 not donated
##  2      23         4     1000    52 not donated
##  3      23         1      250    23 not donated
##  4      23         7     1750    88 not donated
##  5      16         3      750    86 not donated
##  6      23         2      500    38 not donated
##  7      21         2      500    52 not donated
##  8      23         3      750    62 not donated
##  9      39         1      250    39 not donated
## 10      72         1      250    72 not donated
tail(blood$Class, n = 10L)
##  [1] "not donated" "not donated" "not donated" "not donated" "not donated"
##  [6] "not donated" "not donated" "not donated" "not donated" "not donated"
# 6) Monetary in row 100 (row/column indexing on a tibble returns a 1 x 1 tibble)
blood[100L, "Monetary"]
## # A tibble: 1 × 1
##   Monetary
##      <dbl>
## 1     1750
# 7) Average of the Monetary column
monetary_mean <- mean(blood$Monetary)
print(monetary_mean)
## [1] 1378.676
# 8) Rows with above-average Monetary: build a logical mask, subset, count rows
above_avg_idx <- blood$Monetary > monetary_mean
dim(blood[above_avg_idx, ])[1L]
## [1] 267

2-2) PDI__Police_Data_Initiative__Crime_Incidents.csv

# 1) Read the crime-incident CSV into a tibble
#    (show_col_types = FALSE silences readr's column-specification message)
crime <- readr::read_csv(
  file = "PDI__Police_Data_Initiative__Crime_Incidents.csv",
  show_col_types = FALSE
)

# 2) Number of rows, then number of columns
c(nrow(crime), ncol(crime))
## [1] 15155    40
# 3) Missing values: NA count per column and across the whole table
col_na_crime <- colSums(is.na(crime))
total_na_crime <- sum(is.na(crime))
print(total_na_crime)   # grand total of NA cells
## [1] 95592
print(col_na_crime)     # NA cells per column
##                     INSTANCEID                    INCIDENT_NO 
##                              0                              0 
##                  DATE_REPORTED                      DATE_FROM 
##                              0                              2 
##                        DATE_TO                           CLSD 
##                              9                            545 
##                            UCR                            DST 
##                             10                              0 
##                           BEAT                        OFFENSE 
##                             28                             10 
##                       LOCATION                     THEFT_CODE 
##                              2                          10167 
##                          FLOOR                           SIDE 
##                          14127                          14120 
##                        OPENING                      HATE_BIAS 
##                          14508                              0 
##                      DAYOFWEEK                       RPT_AREA 
##                            423                            239 
##               CPD_NEIGHBORHOOD                        WEAPONS 
##                            249                              5 
##              DATE_OF_CLEARANCE                      HOUR_FROM 
##                           2613                              2 
##                        HOUR_TO                      ADDRESS_X 
##                              9                            148 
##                    LONGITUDE_X                     LATITUDE_X 
##                           1714                           1714 
##                     VICTIM_AGE                    VICTIM_RACE 
##                              0                           2192 
##               VICTIM_ETHNICITY                  VICTIM_GENDER 
##                           2192                           2192 
##                    SUSPECT_AGE                   SUSPECT_RACE 
##                              0                           7082 
##              SUSPECT_ETHNICITY                 SUSPECT_GENDER 
##                           7082                           7082 
##             TOTALNUMBERVICTIMS                  TOTALSUSPECTS 
##                             33                           7082 
##                      UCR_GROUP                            ZIP 
##                             10                              1 
## COMMUNITY_COUNCIL_NEIGHBORHOOD               SNA_NEIGHBORHOOD 
##                              0                              0
# 4) Date range for DATE_REPORTED
# DATE_REPORTED is stored as text. Calling as.Date() without `format` makes R
# guess year/month/day, which mis-reads the leading month as a year and turns
# every row whose "month" slot exceeds 12 into NA; na.rm = TRUE then hides the
# damage. The format is therefore stated explicitly. as.Date() ignores any
# characters after the part matched by `format`, so a trailing time-of-day is
# harmless.
# NOTE(review): the text appears to be month/day/year (inferred from how the
# earlier mis-parse came out) -- confirm against the raw CSV.
date_vec <- as.Date(as.character(crime[["DATE_REPORTED"]]), format = "%m/%d/%Y")

# Surface parse failures instead of letting na.rm = TRUE silently drop them.
n_unparsed <- sum(is.na(date_vec) & !is.na(crime[["DATE_REPORTED"]]))
if (n_unparsed > 0) {
  warning(
    n_unparsed, " DATE_REPORTED value(s) could not be parsed as %m/%d/%Y",
    call. = FALSE
  )
}
range(date_vec, na.rm = TRUE)
## (re-run to refresh this output; the previously shown range
##  "0001-01-20" "0006-12-20" was an artifact of reading month/day/year
##  text as year/month/day)
# 5) Most frequent SUSPECT_AGE bracket among records where the age is known
age_tab <- table(crime$SUSPECT_AGE)
# Drop the "UNKNOWN" placeholder level before ranking
known_age_tab <- age_tab[!(names(age_tab) %in% "UNKNOWN")]
sort(known_age_tab, decreasing = TRUE)[1L]
## 18-25 
##  1778
# 6) Incident counts by ZIP code, ordered from most to fewest incidents
zip_tab <- sort(table(crime$ZIP), decreasing = TRUE)
head(zip_tab, n = 10L)   # the ten busiest ZIP codes
## 
## 45202 45205 45211 45238 45229 45219 45225 45214 45237 45223 
##  2049  1110  1094   956   913   863   811   774   699   653
# (Optional) ZIP sanity check: tabulate how many characters each value has,
# keeping NA as its own category so missing ZIPs stay visible
zip_chr <- as.character(crime$ZIP)
table(nchar(zip_chr), useNA = "ifany")
## 
##     4     5  <NA> 
##     3 15151     1
# 7) Share of incidents by day of week
# table() leaves out NA by default, so the shares are relative to the
# incidents that have a recorded DAYOFWEEK.
day_tab <- table(crime$DAYOFWEEK)
day_prop <- prop.table(day_tab)
day_tab[which.max(day_tab)]   # busiest day and its incident count
## SATURDAY 
##     2272
max(day_prop)                 # that day's share of incidents
## [1] 0.1542221
print(day_prop)               # share for every day
## 
##    FRIDAY    MONDAY  SATURDAY    SUNDAY  THURSDAY   TUESDAY WEDNESDAY 
## 0.1369807 0.1438365 0.1542221 0.1448547 0.1363019 0.1432935 0.1405105
# 8) Starting points for further exploration
# Ten most frequent offense types
sort(table(crime$OFFENSE), decreasing = TRUE)[seq_len(10L)]
## 
##                         THEFT CRIMINAL DAMAGING/ENDANGERING 
##                          4988                          2248 
##                       ASSAULT             DOMESTIC VIOLENCE 
##                          1668                           916 
##             FELONIOUS ASSAULT           AGGRAVATED MENACING 
##                           639                           591 
##                      BURGLARY         BREAKING AND ENTERING 
##                           514                           451 
##            AGGRAVATED ROBBERY                      MENACING 
##                           399                           356
# Distribution of HOUR_FROM (quartiles, mean, NA count)
# NOTE(review): a maximum of 2359 suggests HHMM clock coding rather than a
# plain hour count -- confirm before interpreting the mean or quartiles.
summary(crime$HOUR_FROM)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0     120     230     780    1535    2359       2