# ==== LIBRARIES ====
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(writexl)
library(geosphere) # distHaversine
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# ==== CONFIG ====
setwd("D:/PaulCase")
input_path <- "D:/PaulCase/Senior Data Anlyst_Case Study Dataset.xlsx"
top_n <- 20
bssid_reuse_threshold <- 5 # BSSID xuất hiện > 5 lần
jump_km_threshold <- 100 # Nhảy vị trí > 100 km
#roundtrip_q <- 0.95 # Outlier ngưỡng 95%
config_q <- 0.95
options(scipen = 999) # tắt scientific notation
# ==== HELPERS ====
parse_list_num <- function(x) {
if (length(x) == 0 || is.na(x)) return(numeric(0))
if (is.list(x)) return(suppressWarnings(as.numeric(unlist(x))))
if (is.character(x)) {
y <- tryCatch(fromJSON(x), error = function(e) NULL)
if (is.null(y)) return(numeric(0))
return(suppressWarnings(as.numeric(y)))
}
out <- suppressWarnings(as.numeric(x))
out[is.na(out)] <- numeric(0)
out
}
extract_last_numeric <- function(x){ v <- parse_list_num(x); if (length(v)==0) NA_real_ else tail(v,1) }
extract_first_numeric <- function(x){ v <- parse_list_num(x); if (length(v)==0) NA_real_ else v[1] }
as_bool <- function(x) {
if (is.logical(x)) return(x)
if (is.numeric(x)) return(x != 0)
if (is.character(x)) {
x2 <- tolower(trimws(x))
return(x2 %in% c("1","true","t","yes","y"))
}
rep(FALSE, length(x))
}
normalize_ctry <- function(x) {
if (is.na(x)) return(NA_character_)
if (is.character(x) && grepl("^\\s*\\[", x)) {
v <- tryCatch(fromJSON(x), error = function(e) NULL)
if (is.null(v) || length(v) == 0) return(NA_character_)
return(as.character(v[[1]]))
}
as.character(x)
}
# ==== LOAD ====
ios_df <- read_xlsx(input_path, sheet = "ios") %>% clean_names()
android_df <- read_xlsx(input_path, sheet = "android") %>% clean_names()
# ==== MAP ANDROID -> iOS naming ====
android_df <- android_df %>%
rename(
rooted = is_rooted,
proxy = is_proxy,
location_history_is_simulated_by_software = mocked_location
)
# ==== HARMONIZE TYPES (trước khi merge) ====
bool_like_cols <- c(
"rooted",
"proxy",
"location_history_is_simulated_by_software",
"is_spoofing",
"adb_is_active",
"gc_authorized"
)
json_like_cols <- c(
"last_known_location_lat", "last_known_location_lon",
"location_history_lat", "location_history_lon",
"cell_tower_mobile_country_code", "wifiaps_mac_address"
)
# Ép các cột “cờ” & “list/JSON” về character để 2 bảng cùng kiểu
for (c in bool_like_cols) {
if (c %in% names(ios_df)) ios_df[[c]] <- as.character(ios_df[[c]])
if (c %in% names(android_df)) android_df[[c]] <- as.character(android_df[[c]])
}
for (c in json_like_cols) {
if (c %in% names(ios_df)) ios_df[[c]] <- as.character(ios_df[[c]])
if (c %in% names(android_df)) android_df[[c]] <- as.character(android_df[[c]])
}
# ==== PREP iOS: chuẩn hoá list & gắn source ====
ios_df <- ios_df %>%
mutate(
last_known_location_lat_val = map_dbl(last_known_location_lat, extract_last_numeric),
last_known_location_lon_val = map_dbl(last_known_location_lon, extract_last_numeric),
location_history_lat_first = map_dbl(location_history_lat, extract_first_numeric),
location_history_lon_first = map_dbl(location_history_lon, extract_first_numeric),
source = "iOS"
)
# ==== PREP ANDROID: dùng lat/lon hiện tại thay cho last_known ====
android_df <- android_df %>%
mutate(
last_known_location_lat_val = suppressWarnings(as.numeric(lat)),
last_known_location_lon_val = suppressWarnings(as.numeric(lon)),
location_history_lat_first = suppressWarnings(as.numeric(lat)),
location_history_lon_first = suppressWarnings(as.numeric(lon)),
source = "Android"
)
# ==== MERGE ====
all_cols <- union(names(ios_df), names(android_df))
ios_df <- ios_df[, intersect(all_cols, names(ios_df))]
android_df <- android_df[, intersect(all_cols, names(android_df))]
# Bổ sung cột còn thiếu bằng NA để bind_rows không mất cột
missing_ios <- setdiff(all_cols, names(ios_df))
missing_android <- setdiff(all_cols, names(android_df))
for (c in missing_ios) ios_df[[c]] <- NA
for (c in missing_android) android_df[[c]] <- NA
merged_df <- bind_rows(ios_df[, all_cols], android_df[, all_cols])
# ==== NORMALIZE TYPES ====
merged_df <- merged_df %>%
mutate(
lat = suppressWarnings(as.numeric(lat)),
lon = suppressWarnings(as.numeric(lon)),
engine_roundtrip = suppressWarnings(as.numeric(engine_roundtrip)),
config_duration = suppressWarnings(as.numeric(ifelse("config_duration" %in% names(.), config_duration, NA))),
rooted = as_bool(ifelse("rooted" %in% names(.), rooted, FALSE)),
adb_is_active = as_bool(ifelse("adb_is_active" %in% names(.), adb_is_active, FALSE)),
is_spoofing = as_bool(ifelse("is_spoofing" %in% names(.), is_spoofing, FALSE)),
proxy = as_bool(ifelse("proxy" %in% names(.), proxy, FALSE))
)
# ==== DISTANCE JUMP (km) ====
merged_df$distance_jump_km <- pmap_dbl(
list(merged_df$last_known_location_lon_val, merged_df$last_known_location_lat_val,
merged_df$lon, merged_df$lat),
function(lon1, lat1, lon2, lat2){
if (any(is.na(c(lon1, lat1, lon2, lat2)))) return(NA_real_)
geosphere::distHaversine(c(lon1, lat1), c(lon2, lat2)) / 1000
}
)
# ==== COUNTRY MISMATCH ====
merged_df <- merged_df %>%
mutate(
cell_ctry = if ("cell_tower_mobile_country_code" %in% names(.))
vapply(cell_tower_mobile_country_code, normalize_ctry, FUN.VALUE = character(1))
else NA_character_,
wifi_ctry = if ("connected_wifi_country" %in% names(.))
as.character(connected_wifi_country) else NA_character_
)
# ==== BSSID REUSE ====
merged_df <- merged_df %>%
mutate(bssid = if ("connected_wifi_bssid" %in% names(.)) as.character(connected_wifi_bssid) else NA_character_) %>%
group_by(bssid) %>%
mutate(bssid_count = if (all(is.na(bssid))) NA_integer_ else n()) %>%
ungroup()
# ==== FLAGS ====
# (two-sided 95% for engine_roundtrip)
# Ngưỡng phân vị hai phía cho roundtrip
rt_q_low <- suppressWarnings(stats::quantile(merged_df$engine_roundtrip, 0.025, na.rm = TRUE))
rt_q_high <- suppressWarnings(stats::quantile(merged_df$engine_roundtrip, 0.975, na.rm = TRUE))
# Cờ roundtrip thấp/cao theo p2.5/p97.5
merged_df <- merged_df %>%
mutate(
flag_roundtrip_low = as.integer(!is.na(engine_roundtrip) & engine_roundtrip < rt_q_low),
flag_roundtrip_high = as.integer(!is.na(engine_roundtrip) & engine_roundtrip > rt_q_high)
)
# Ngưỡng p95 cho config_duration
q95_config <- suppressWarnings(stats::quantile(merged_df$config_duration, config_q, na.rm = TRUE))
# Các cờ còn lại
merged_df <- merged_df %>%
mutate(
flag_root = as.integer(rooted | adb_is_active),
flag_spoofing = as.integer(is_spoofing | as_bool(ifelse("location_history_is_simulated_by_software" %in% names(.),
location_history_is_simulated_by_software, FALSE))),
flag_location_jump = as.integer(!is.na(distance_jump_km) & distance_jump_km > jump_km_threshold),
flag_country_mismatch = as.integer(!is.na(cell_ctry) & !is.na(wifi_ctry) & cell_ctry != wifi_ctry),
flag_proxy = as.integer(proxy),
flag_bssid_reuse = as.integer(!is.na(bssid) & bssid != "" & !is.na(bssid_count) & bssid_count > bssid_reuse_threshold),
flag_config_duration_outlier = as.integer(!is.na(config_duration) & !is.na(q95_config) & config_duration > q95_config)
)
# ==== RISK: compute & attach all risk columns to merged_df ====
# 1) Weights (đang dùng theo mức độ nghiêm trọng)
weights <- c(
root = 3,
spoofing = 3,
location_jump = 3,
country_mismatch = 3,
proxy = 2,
bssid_reuse = 1,
roundtrip_low = 1,
roundtrip_high = 1,
config_duration_outlier = 1
)
# 2) Reason builders (sửa nhẹ: mapping rõ ràng để sắp xếp theo weights)
reason_tags_builder <- function(fr, fs, flj, fcm, fp, fbr, frl, frh, fco) {
tags <- c()
if (!is.na(fr) && fr == 1) tags <- c(tags, "root/jailbreak_or_adb")
if (!is.na(fs) && fs == 1) tags <- c(tags, "mock_location_or_spoofing")
if (!is.na(flj) && flj == 1) tags <- c(tags, "location_jump")
if (!is.na(fcm) && fcm == 1) tags <- c(tags, "country_mismatch")
if (!is.na(fp) && fp == 1) tags <- c(tags, "vpn_or_proxy")
if (!is.na(fbr) && fbr == 1) tags <- c(tags, "bssid_reuse")
if (!is.na(frl) && frl == 1) tags <- c(tags, "roundtrip_low")
if (!is.na(frh) && frh == 1) tags <- c(tags, "roundtrip_high")
if (!is.na(fco) && fco == 1) tags <- c(tags, "config_duration_outlier")
if (length(tags) == 0) return(NA_character_)
# map tag -> key trong weights để sắp theo criticality
key_map <- c(
"root/jailbreak_or_adb" = "root",
"mock_location_or_spoofing" = "spoofing",
"location_jump" = "location_jump",
"country_mismatch" = "country_mismatch",
"vpn_or_proxy" = "proxy",
"bssid_reuse" = "bssid_reuse",
"roundtrip_low" = "roundtrip_low",
"roundtrip_high" = "roundtrip_high",
"config_duration_outlier" = "config_duration_outlier"
)
# tên vector phải là character; dùng match an toàn
keys <- unname(key_map[match(tags, names(key_map))])
ord <- order(unname(weights[keys]), decreasing = TRUE)
paste(tags[ord], collapse = ", ")
}
reason_pretty_builder <- function(fr, fs, flj, fcm, fp, fbr, frl, frh, fco,
dist_km, cell_ctry, wifi_ctry,
bssid_count, eround, qlow, qhigh,
cfgdur, q95c) {
items <- list()
if (!is.na(fr) && fr == 1) items <- append(items, list(list(w=weights["root"], txt="Device is rooted/jailbroken or ADB enabled")))
if (!is.na(fs) && fs == 1) items <- append(items, list(list(w=weights["spoofing"], txt="Mock location / spoofing detected")))
if (!is.na(flj) && flj == 1) items <- append(items, list(list(w=weights["location_jump"], txt=sprintf("Location jump ≈ %.0f km (> %d km)", dist_km, jump_km_threshold))))
if (!is.na(fcm) && fcm == 1) items <- append(items, list(list(w=weights["country_mismatch"], txt=sprintf("Country mismatch: cell=%s vs wifi=%s", dplyr::coalesce(cell_ctry,"NA"), dplyr::coalesce(wifi_ctry,"NA")))))
if (!is.na(fp) && fp == 1) items <- append(items, list(list(w=weights["proxy"], txt="Using VPN/Proxy")))
if (!is.na(fbr) && fbr == 1) items <- append(items, list(list(w=weights["bssid_reuse"], txt=sprintf("Wi-Fi BSSID reused %s times", ifelse(is.na(bssid_count),"NA", bssid_count)))))
if (!is.na(frl) && frl == 1) items <- append(items, list(list(w=weights["roundtrip_low"], txt=sprintf("engine_roundtrip = %.0f < p2.5 ≈ %.0f", eround, qlow))))
if (!is.na(frh) && frh == 1) items <- append(items, list(list(w=weights["roundtrip_high"], txt=sprintf("engine_roundtrip = %.0f > p97.5 ≈ %.0f", eround, qhigh))))
if (!is.na(fco) && fco == 1) items <- append(items, list(list(w=weights["config_duration_outlier"], txt=sprintf("config_duration = %.0f > p95 ≈ %.0f", cfgdur, q95c))))
if (length(items) == 0) return(NA_character_)
ord <- order(sapply(items, `[[`, "w"), decreasing = TRUE)
paste(sapply(items[ord], `[[`, "txt"), collapse = " | ")
}
# 3) Danh sách cột flag để tính tổng nhanh
flag_cols <- c(
"flag_root","flag_spoofing","flag_location_jump","flag_country_mismatch",
"flag_proxy","flag_bssid_reuse","flag_roundtrip_low","flag_roundtrip_high",
"flag_config_duration_outlier"
)
# 4) Tính & gắn trực tiếp vào merged_df (risk_flag_count, risk_score, risk_level, reason_tags, reason_pretty)
merged_df <- merged_df %>%
mutate(
# unweighted count (nếu sau này bạn muốn “mọi trọng số bằng nhau” thì dùng cột này)
risk_flag_count = rowSums(across(all_of(flag_cols), ~ dplyr::coalesce(as.integer(.), 0))),
# weighted score (đang dùng weights ở trên)
risk_score =
dplyr::coalesce(flag_root,0L) * weights["root"] +
dplyr::coalesce(flag_spoofing,0L) * weights["spoofing"] +
dplyr::coalesce(flag_location_jump,0L) * weights["location_jump"] +
dplyr::coalesce(flag_country_mismatch,0L) * weights["country_mismatch"] +
dplyr::coalesce(flag_proxy,0L) * weights["proxy"] +
dplyr::coalesce(flag_bssid_reuse,0L) * weights["bssid_reuse"] +
dplyr::coalesce(flag_roundtrip_low,0L) * weights["roundtrip_low"] +
dplyr::coalesce(flag_roundtrip_high,0L) * weights["roundtrip_high"] +
dplyr::coalesce(flag_config_duration_outlier,0L) * weights["config_duration_outlier"],
risk_level = cut(
risk_score,
breaks = c(-1, 2, 5, 8, Inf),
labels = c("Low", "Medium", "High", "Very High"),
right = TRUE
),
reason_tags = pmap_chr(
list(flag_root, flag_spoofing, flag_location_jump, flag_country_mismatch,
flag_proxy, flag_bssid_reuse, flag_roundtrip_low, flag_roundtrip_high, flag_config_duration_outlier),
~ reason_tags_builder(..1, ..2, ..3, ..4, ..5, ..6, ..7, ..8, ..9)
),
reason_pretty = pmap_chr(
list(flag_root, flag_spoofing, flag_location_jump, flag_country_mismatch,
flag_proxy, flag_bssid_reuse, flag_roundtrip_low, flag_roundtrip_high, flag_config_duration_outlier,
distance_jump_km, cell_ctry, wifi_ctry, bssid_count,
engine_roundtrip, rt_q_low, rt_q_high,
config_duration, q95_config),
~ reason_pretty_builder(..1, ..2, ..3, ..4, ..5, ..6, ..7, ..8, ..9,
..10, ..11, ..12, ..13,
..14, ..15, ..16,
..17, ..18)
)
)
# ==== EXPORT ====
top_risk <- merged_df %>%
filter(risk_score > 0) %>% # chỉ lấy giao dịch có risk > 0
arrange(desc(risk_score)) %>%
mutate(distance_jump_km = as.integer(round(distance_jump_km, 0))) %>%
select(any_of(c(
"gc_transaction","solution","source",
"risk_score","risk_level","reason_tags","reason_pretty",
"distance_jump_km","cell_ctry","wifi_ctry","bssid_count",
"flag_root","flag_spoofing","flag_location_jump","flag_country_mismatch",
"flag_proxy","flag_bssid_reuse","flag_roundtrip_outlier","flag_config_duration_outlier"
)))
write_xlsx(top_risk, "top_risk_transactions.xlsx")
write_xlsx(merged_df, "full_transactions_with_risk.xlsx")
cat("DONE\n- top_risk_transactions.xlsx\n- full_transactions_with_risk.xlsx\n")
## DONE
## - top_risk_transactions.xlsx
## - full_transactions_with_risk.xlsx
library(dplyr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
theme_clean <- theme_minimal(base_size = 20) +
theme(
panel.grid = element_blank(),
axis.text.y = element_text(size = 16),
axis.text.x = element_text(size = 16),
plot.margin = margin(5, 40, 5, 5) # khoảng trống bên phải
)
## 1. Number of Transactions by Risk Score
df_risk <- merged_df %>%
group_by(risk_score) %>%
summarise(count = n(), .groups = "drop") %>%
arrange(risk_score)
max_ct1 <- max(df_risk$count, na.rm = TRUE)
ggplot(df_risk, aes(x = risk_score, y = count)) +
geom_col(fill = "steelblue") +
geom_text(aes(label = comma(count)), vjust = -0.3, size = 8) +
scale_y_continuous(limits = c(0, max_ct1 * 1.15), labels = comma) +
labs(
title = "Number of Transactions by Risk Score",
x = "Risk Score",
y = "Number of Transactions"
) +
theme_clean

## 2. Total Risk Score by Reason
df_sum_risk <- merged_df %>%
filter(!is.na(reason_tags)) %>%
group_by(reason_tags) %>%
summarise(total_risk = sum(risk_score, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(total_risk))
max_ct2 <- max(df_sum_risk$total_risk, na.rm = TRUE)
ggplot(df_sum_risk, aes(x = reorder(reason_tags, total_risk), y = total_risk)) +
geom_col(fill = "darkorange") +
geom_text(aes(label = comma(total_risk)), hjust = -0.1, size = 8) +
coord_flip(clip = "off") +
scale_y_continuous(limits = c(0, max_ct2 * 1.15), labels = comma) +
labs(
title = "Total Risk Score by Reason",
x = "Reason",
y = "Total Risk Score"
) +
theme_clean

## 3. Number of Transactions by Reason
df_reason <- merged_df %>%
filter(!is.na(reason_tags)) %>%
group_by(reason_tags) %>%
summarise(count = n(), .groups = "drop") %>%
arrange(desc(count))
max_ct3 <- max(df_reason$count, na.rm = TRUE)
ggplot(df_reason, aes(x = reorder(reason_tags, count), y = count)) +
geom_col(fill = "forestgreen") +
geom_text(aes(label = comma(count)), hjust = -0.1, size = 8) +
coord_flip(clip = "off") +
scale_y_continuous(limits = c(0, max_ct3 * 1.15), labels = comma) +
labs(
title = "Number of Transactions by Reason",
x = "Reason",
y = "Number of Transactions"
) +
theme_clean
