PaulCase

# ==== LIBRARIES ====
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readxl)
library(writexl)
library(geosphere)    # distHaversine
library(jsonlite)

## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten

library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

# ==== CONFIG ====
setwd("D:/PaulCase")
input_path <- "D:/PaulCase/Senior Data Anlyst_Case Study Dataset.xlsx"
top_n <- 20
bssid_reuse_threshold <- 5         # BSSID xuất hiện > 5 lần
jump_km_threshold <- 100           # Nhảy vị trí > 100 km
#roundtrip_q <- 0.95                # Outlier ngưỡng 95%
config_q <- 0.95
options(scipen = 999) # tắt scientific notation

# ==== HELPERS ====
parse_list_num <- function(x) {
  if (length(x) == 0 || is.na(x)) return(numeric(0))
  if (is.list(x)) return(suppressWarnings(as.numeric(unlist(x))))
  if (is.character(x)) {
    y <- tryCatch(fromJSON(x), error = function(e) NULL)
    if (is.null(y)) return(numeric(0))
    return(suppressWarnings(as.numeric(y)))
  }
  out <- suppressWarnings(as.numeric(x))
  out[is.na(out)] <- numeric(0)
  out
}

extract_last_numeric  <- function(x){ v <- parse_list_num(x); if (length(v)==0) NA_real_ else tail(v,1) }
extract_first_numeric <- function(x){ v <- parse_list_num(x); if (length(v)==0) NA_real_ else v[1] }

as_bool <- function(x) {
  if (is.logical(x)) return(x)
  if (is.numeric(x)) return(x != 0)
  if (is.character(x)) {
    x2 <- tolower(trimws(x))
    return(x2 %in% c("1","true","t","yes","y"))
  }
  rep(FALSE, length(x))
}

normalize_ctry <- function(x) {
  if (is.na(x)) return(NA_character_)
  if (is.character(x) && grepl("^\\s*\\[", x)) {
    v <- tryCatch(fromJSON(x), error = function(e) NULL)
    if (is.null(v) || length(v) == 0) return(NA_character_)
    return(as.character(v[[1]]))
  }
  as.character(x)
}

# ==== LOAD ====
ios_df     <- read_xlsx(input_path, sheet = "ios")     %>% clean_names()
android_df <- read_xlsx(input_path, sheet = "android") %>% clean_names()

# ==== MAP ANDROID -> iOS naming ====
android_df <- android_df %>%
  rename(
    rooted = is_rooted,
    proxy  = is_proxy,
    location_history_is_simulated_by_software = mocked_location
  )

# ==== HARMONIZE TYPES (trước khi merge) ====
bool_like_cols <- c(
  "rooted",
  "proxy",
  "location_history_is_simulated_by_software",
  "is_spoofing",
  "adb_is_active",
  "gc_authorized"
)

json_like_cols <- c(
  "last_known_location_lat", "last_known_location_lon",
  "location_history_lat", "location_history_lon",
  "cell_tower_mobile_country_code", "wifiaps_mac_address"
)

# Ép các cột “cờ” & “list/JSON” về character để 2 bảng cùng kiểu
for (c in bool_like_cols) {
  if (c %in% names(ios_df))     ios_df[[c]]     <- as.character(ios_df[[c]])
  if (c %in% names(android_df)) android_df[[c]] <- as.character(android_df[[c]])
}
for (c in json_like_cols) {
  if (c %in% names(ios_df))     ios_df[[c]]     <- as.character(ios_df[[c]])
  if (c %in% names(android_df)) android_df[[c]] <- as.character(android_df[[c]])
}

# ==== PREP iOS: chuẩn hoá list & gắn source ====
ios_df <- ios_df %>%
  mutate(
    last_known_location_lat_val = map_dbl(last_known_location_lat, extract_last_numeric),
    last_known_location_lon_val = map_dbl(last_known_location_lon, extract_last_numeric),
    location_history_lat_first  = map_dbl(location_history_lat,  extract_first_numeric),
    location_history_lon_first  = map_dbl(location_history_lon,  extract_first_numeric),
    source = "iOS"
  )

# ==== PREP ANDROID: dùng lat/lon hiện tại thay cho last_known ====
android_df <- android_df %>%
  mutate(
    last_known_location_lat_val = suppressWarnings(as.numeric(lat)),
    last_known_location_lon_val = suppressWarnings(as.numeric(lon)),
    location_history_lat_first  = suppressWarnings(as.numeric(lat)),
    location_history_lon_first  = suppressWarnings(as.numeric(lon)),
    source = "Android"
  )

# ==== MERGE ====
all_cols <- union(names(ios_df), names(android_df))
ios_df     <- ios_df[, intersect(all_cols, names(ios_df))]
android_df <- android_df[, intersect(all_cols, names(android_df))]

# Bổ sung cột còn thiếu bằng NA để bind_rows không mất cột
missing_ios     <- setdiff(all_cols, names(ios_df))
missing_android <- setdiff(all_cols, names(android_df))
for (c in missing_ios)     ios_df[[c]]     <- NA
for (c in missing_android) android_df[[c]] <- NA

merged_df <- bind_rows(ios_df[, all_cols], android_df[, all_cols])

# ==== NORMALIZE TYPES ====
merged_df <- merged_df %>%
  mutate(
    lat = suppressWarnings(as.numeric(lat)),
    lon = suppressWarnings(as.numeric(lon)),
    engine_roundtrip = suppressWarnings(as.numeric(engine_roundtrip)),
    config_duration  = suppressWarnings(as.numeric(ifelse("config_duration" %in% names(.), config_duration, NA))),
    rooted         = as_bool(ifelse("rooted" %in% names(.), rooted, FALSE)),
    adb_is_active  = as_bool(ifelse("adb_is_active" %in% names(.), adb_is_active, FALSE)),
    is_spoofing    = as_bool(ifelse("is_spoofing" %in% names(.), is_spoofing, FALSE)),
    proxy          = as_bool(ifelse("proxy" %in% names(.), proxy, FALSE))
  )

# ==== DISTANCE JUMP (km) ====
merged_df$distance_jump_km <- pmap_dbl(
  list(merged_df$last_known_location_lon_val, merged_df$last_known_location_lat_val,
       merged_df$lon, merged_df$lat),
  function(lon1, lat1, lon2, lat2){
    if (any(is.na(c(lon1, lat1, lon2, lat2)))) return(NA_real_)
    geosphere::distHaversine(c(lon1, lat1), c(lon2, lat2)) / 1000
  }
)

# ==== COUNTRY MISMATCH ====
merged_df <- merged_df %>%
  mutate(
    cell_ctry = if ("cell_tower_mobile_country_code" %in% names(.))
                  vapply(cell_tower_mobile_country_code, normalize_ctry, FUN.VALUE = character(1))
                else NA_character_,
    wifi_ctry = if ("connected_wifi_country" %in% names(.))
                  as.character(connected_wifi_country) else NA_character_
  )

# ==== BSSID REUSE ====
merged_df <- merged_df %>%
  mutate(bssid = if ("connected_wifi_bssid" %in% names(.)) as.character(connected_wifi_bssid) else NA_character_) %>%
  group_by(bssid) %>%
  mutate(bssid_count = if (all(is.na(bssid))) NA_integer_ else n()) %>%
  ungroup()

# ==== FLAGS ====
# (two-sided 95% for engine_roundtrip)

# Ngưỡng phân vị hai phía cho roundtrip
rt_q_low  <- suppressWarnings(stats::quantile(merged_df$engine_roundtrip, 0.025, na.rm = TRUE))
rt_q_high <- suppressWarnings(stats::quantile(merged_df$engine_roundtrip, 0.975, na.rm = TRUE))

# Cờ roundtrip thấp/cao theo p2.5/p97.5
merged_df <- merged_df %>%
  mutate(
    flag_roundtrip_low  = as.integer(!is.na(engine_roundtrip) & engine_roundtrip <  rt_q_low),
    flag_roundtrip_high = as.integer(!is.na(engine_roundtrip) & engine_roundtrip >  rt_q_high)
  )

# Ngưỡng p95 cho config_duration
q95_config <- suppressWarnings(stats::quantile(merged_df$config_duration, config_q, na.rm = TRUE))

# Các cờ còn lại
merged_df <- merged_df %>%
  mutate(
    flag_root                 = as.integer(rooted | adb_is_active),
    flag_spoofing             = as.integer(is_spoofing | as_bool(ifelse("location_history_is_simulated_by_software" %in% names(.),
                                                                        location_history_is_simulated_by_software, FALSE))),
    flag_location_jump        = as.integer(!is.na(distance_jump_km) & distance_jump_km > jump_km_threshold),
    flag_country_mismatch     = as.integer(!is.na(cell_ctry) & !is.na(wifi_ctry) & cell_ctry != wifi_ctry),
    flag_proxy                = as.integer(proxy),
    flag_bssid_reuse          = as.integer(!is.na(bssid) & bssid != "" & !is.na(bssid_count) & bssid_count > bssid_reuse_threshold),
    flag_config_duration_outlier = as.integer(!is.na(config_duration) & !is.na(q95_config) & config_duration > q95_config)
  )

# ==== RISK: compute & attach all risk columns to merged_df ====

# 1) Weights (đang dùng theo mức độ nghiêm trọng)
weights <- c(
  root = 3,
  spoofing = 3,
  location_jump = 3,
  country_mismatch = 3,
  proxy = 2,
  bssid_reuse = 1,
  roundtrip_low = 1,
  roundtrip_high = 1,
  config_duration_outlier = 1
)

# 2) Reason builders (sửa nhẹ: mapping rõ ràng để sắp xếp theo weights)
reason_tags_builder <- function(fr, fs, flj, fcm, fp, fbr, frl, frh, fco) {
  tags <- c()
  if (!is.na(fr)  && fr  == 1) tags <- c(tags, "root/jailbreak_or_adb")
  if (!is.na(fs)  && fs  == 1) tags <- c(tags, "mock_location_or_spoofing")
  if (!is.na(flj) && flj == 1) tags <- c(tags, "location_jump")
  if (!is.na(fcm) && fcm == 1) tags <- c(tags, "country_mismatch")
  if (!is.na(fp)  && fp  == 1) tags <- c(tags, "vpn_or_proxy")
  if (!is.na(fbr) && fbr == 1) tags <- c(tags, "bssid_reuse")
  if (!is.na(frl) && frl == 1) tags <- c(tags, "roundtrip_low")
  if (!is.na(frh) && frh == 1) tags <- c(tags, "roundtrip_high")
  if (!is.na(fco) && fco == 1) tags <- c(tags, "config_duration_outlier")
  if (length(tags) == 0) return(NA_character_)

  # map tag -> key trong weights để sắp theo criticality
 key_map <- c(
    "root/jailbreak_or_adb"       = "root",
    "mock_location_or_spoofing"   = "spoofing",
    "location_jump"               = "location_jump",
    "country_mismatch"            = "country_mismatch",
    "vpn_or_proxy"                = "proxy",
    "bssid_reuse"                 = "bssid_reuse",
    "roundtrip_low"               = "roundtrip_low",
    "roundtrip_high"              = "roundtrip_high",
    "config_duration_outlier"     = "config_duration_outlier"
  )
  # tên vector phải là character; dùng match an toàn
  keys <- unname(key_map[match(tags, names(key_map))])
  ord <- order(unname(weights[keys]), decreasing = TRUE)
  paste(tags[ord], collapse = ", ")
}

reason_pretty_builder <- function(fr, fs, flj, fcm, fp, fbr, frl, frh, fco,
                                  dist_km, cell_ctry, wifi_ctry,
                                  bssid_count, eround, qlow, qhigh,
                                  cfgdur, q95c) {
  items <- list()
  if (!is.na(fr) && fr == 1)   items <- append(items, list(list(w=weights["root"],               txt="Device is rooted/jailbroken or ADB enabled")))
  if (!is.na(fs) && fs == 1)   items <- append(items, list(list(w=weights["spoofing"],           txt="Mock location / spoofing detected")))
  if (!is.na(flj) && flj == 1) items <- append(items, list(list(w=weights["location_jump"],      txt=sprintf("Location jump ≈ %.0f km (> %d km)", dist_km, jump_km_threshold))))
  if (!is.na(fcm) && fcm == 1) items <- append(items, list(list(w=weights["country_mismatch"],   txt=sprintf("Country mismatch: cell=%s vs wifi=%s", dplyr::coalesce(cell_ctry,"NA"), dplyr::coalesce(wifi_ctry,"NA")))))
  if (!is.na(fp) && fp == 1)   items <- append(items, list(list(w=weights["proxy"],              txt="Using VPN/Proxy")))
  if (!is.na(fbr) && fbr == 1) items <- append(items, list(list(w=weights["bssid_reuse"],        txt=sprintf("Wi-Fi BSSID reused %s times", ifelse(is.na(bssid_count),"NA", bssid_count)))))
  if (!is.na(frl) && frl == 1) items <- append(items, list(list(w=weights["roundtrip_low"],      txt=sprintf("engine_roundtrip = %.0f < p2.5 ≈ %.0f", eround, qlow))))
  if (!is.na(frh) && frh == 1) items <- append(items, list(list(w=weights["roundtrip_high"],     txt=sprintf("engine_roundtrip = %.0f > p97.5 ≈ %.0f", eround, qhigh))))
  if (!is.na(fco) && fco == 1) items <- append(items, list(list(w=weights["config_duration_outlier"], txt=sprintf("config_duration = %.0f > p95 ≈ %.0f", cfgdur, q95c))))
  if (length(items) == 0) return(NA_character_)
  ord <- order(sapply(items, `[[`, "w"), decreasing = TRUE)
  paste(sapply(items[ord], `[[`, "txt"), collapse = " | ")
}

# 3) Danh sách cột flag để tính tổng nhanh
flag_cols <- c(
  "flag_root","flag_spoofing","flag_location_jump","flag_country_mismatch",
  "flag_proxy","flag_bssid_reuse","flag_roundtrip_low","flag_roundtrip_high",
  "flag_config_duration_outlier"
)

# 4) Tính & gắn trực tiếp vào merged_df (risk_flag_count, risk_score, risk_level, reason_tags, reason_pretty)
merged_df <- merged_df %>%
  mutate(
    # unweighted count (nếu sau này bạn muốn “mọi trọng số bằng nhau” thì dùng cột này)
    risk_flag_count = rowSums(across(all_of(flag_cols), ~ dplyr::coalesce(as.integer(.), 0))),
    # weighted score (đang dùng weights ở trên)
    risk_score =
      dplyr::coalesce(flag_root,0L)               * weights["root"] +
      dplyr::coalesce(flag_spoofing,0L)           * weights["spoofing"] +
      dplyr::coalesce(flag_location_jump,0L)      * weights["location_jump"] +
      dplyr::coalesce(flag_country_mismatch,0L)   * weights["country_mismatch"] +
      dplyr::coalesce(flag_proxy,0L)              * weights["proxy"] +
      dplyr::coalesce(flag_bssid_reuse,0L)        * weights["bssid_reuse"] +
      dplyr::coalesce(flag_roundtrip_low,0L)      * weights["roundtrip_low"] +
      dplyr::coalesce(flag_roundtrip_high,0L)     * weights["roundtrip_high"] +
      dplyr::coalesce(flag_config_duration_outlier,0L) * weights["config_duration_outlier"],
    risk_level = cut(
      risk_score,
      breaks = c(-1, 2, 5, 8, Inf),
      labels = c("Low", "Medium", "High", "Very High"),
      right = TRUE
    ),
    reason_tags = pmap_chr(
      list(flag_root, flag_spoofing, flag_location_jump, flag_country_mismatch,
           flag_proxy, flag_bssid_reuse, flag_roundtrip_low, flag_roundtrip_high, flag_config_duration_outlier),
      ~ reason_tags_builder(..1, ..2, ..3, ..4, ..5, ..6, ..7, ..8, ..9)
    ),
    reason_pretty = pmap_chr(
      list(flag_root, flag_spoofing, flag_location_jump, flag_country_mismatch,
           flag_proxy, flag_bssid_reuse, flag_roundtrip_low, flag_roundtrip_high, flag_config_duration_outlier,
           distance_jump_km, cell_ctry, wifi_ctry, bssid_count,
           engine_roundtrip, rt_q_low, rt_q_high,
           config_duration, q95_config),
      ~ reason_pretty_builder(..1, ..2, ..3, ..4, ..5, ..6, ..7, ..8, ..9,
                              ..10, ..11, ..12, ..13,
                              ..14, ..15, ..16,
                              ..17, ..18)
    )
  )

# ==== EXPORT ====
top_risk <- merged_df %>%
  filter(risk_score > 0) %>%   # chỉ lấy giao dịch có risk > 0
  arrange(desc(risk_score)) %>%
  mutate(distance_jump_km = as.integer(round(distance_jump_km, 0))) %>%  
  select(any_of(c(
    "gc_transaction","solution","source",
    "risk_score","risk_level","reason_tags","reason_pretty",
    "distance_jump_km","cell_ctry","wifi_ctry","bssid_count",
    "flag_root","flag_spoofing","flag_location_jump","flag_country_mismatch",
    "flag_proxy","flag_bssid_reuse","flag_roundtrip_outlier","flag_config_duration_outlier"
  )))

write_xlsx(top_risk, "top_risk_transactions.xlsx")
write_xlsx(merged_df, "full_transactions_with_risk.xlsx")

cat("DONE\n- top_risk_transactions.xlsx\n- full_transactions_with_risk.xlsx\n")

## DONE
## - top_risk_transactions.xlsx
## - full_transactions_with_risk.xlsx

library(dplyr)
library(ggplot2)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

theme_clean <- theme_minimal(base_size = 20) +
  theme(
    panel.grid = element_blank(),
    axis.text.y = element_text(size = 16),
    axis.text.x = element_text(size = 16),
    plot.margin = margin(5, 40, 5, 5) # khoảng trống bên phải
  )

## 1. Number of Transactions by Risk Score
df_risk <- merged_df %>%
  group_by(risk_score) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(risk_score)

max_ct1 <- max(df_risk$count, na.rm = TRUE)

ggplot(df_risk, aes(x = risk_score, y = count)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = comma(count)), vjust = -0.3, size = 8) +
  scale_y_continuous(limits = c(0, max_ct1 * 1.15), labels = comma) +
  labs(
    title = "Number of Transactions by Risk Score",
    x = "Risk Score",
    y = "Number of Transactions"
  ) +
  theme_clean

## 2. Total Risk Score by Reason
df_sum_risk <- merged_df %>%
  filter(!is.na(reason_tags)) %>%
  group_by(reason_tags) %>%
  summarise(total_risk = sum(risk_score, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_risk))

max_ct2 <- max(df_sum_risk$total_risk, na.rm = TRUE)

ggplot(df_sum_risk, aes(x = reorder(reason_tags, total_risk), y = total_risk)) +
  geom_col(fill = "darkorange") +
  geom_text(aes(label = comma(total_risk)), hjust = -0.1, size = 8) +
  coord_flip(clip = "off") +
  scale_y_continuous(limits = c(0, max_ct2 * 1.15), labels = comma) +
  labs(
    title = "Total Risk Score by Reason",
    x = "Reason",
    y = "Total Risk Score"
  ) +
  theme_clean

## 3. Number of Transactions by Reason
df_reason <- merged_df %>%
  filter(!is.na(reason_tags)) %>%
  group_by(reason_tags) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(desc(count))

max_ct3 <- max(df_reason$count, na.rm = TRUE)

ggplot(df_reason, aes(x = reorder(reason_tags, count), y = count)) +
  geom_col(fill = "forestgreen") +
  geom_text(aes(label = comma(count)), hjust = -0.1, size = 8) +
  coord_flip(clip = "off") +
  scale_y_continuous(limits = c(0, max_ct3 * 1.15), labels = comma) +
  labs(
    title = "Number of Transactions by Reason",
    x = "Reason",
    y = "Number of Transactions"
  ) +
  theme_clean

PaulCase

Paulus

2025-08-15