Background for this project

This project aims to review each step of data analysis with R, from data cleaning through analysis to visualization.

Data Exploration & Cleaning

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)

# Load the raw transactions CSV into a tibble; readr guesses the column types.
# NOTE(review): this is an absolute, machine-specific path - confirm it before
# running the analysis on another machine.
meta_trans.df <- read_csv(
  file = "/Users/hosungkim/Desktop/Data Analysis/Portfolio/Case Study/Metaverse_transactions/metaverse_transactions_dataset.csv"
)
## Rows: 78600 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): sending_address, receiving_address, transaction_type, location_reg...
## dbl  (6): hour_of_day, amount, ip_prefix, login_frequency, session_duration,...
## dttm (1): timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows to sanity-check the import.
head(meta_trans.df, n = 6L)
## # A tibble: 6 × 14
##   timestamp           hour_of_day sending_address       receiving_address amount
##   <dttm>                    <dbl> <chr>                 <chr>              <dbl>
## 1 2022-04-11 12:47:27          12 0x9d32d0bf2c00f41ce7… 0x39f82e1c09bc6d… 797.  
## 2 2022-06-14 19:12:46          19 0xd6e251c23cbf52dbd4… 0x51e8fbe24f124e…   0.01
## 3 2022-01-18 16:26:59          16 0x2e0925b922fed01f6a… 0x52c7911879f783… 778.  
## 4 2022-06-15 09:20:04           9 0x93efefc25fcaf31d76… 0x8ac3b7bd531b3a… 301.  
## 5 2022-02-18 14:35:30          14 0xad3b8de45d63f5cce2… 0x6fdc047c239161… 776.  
## 6 2022-04-05 19:05:44          19 0xa99b9a7f5c5dd37429… 0x5a78c88c5fc1e9… 590.  
## # ℹ 9 more variables: transaction_type <chr>, location_region <chr>,
## #   ip_prefix <dbl>, login_frequency <dbl>, session_duration <dbl>,
## #   purchase_pattern <chr>, age_group <chr>, risk_score <dbl>, anomaly <chr>

Checking if Timestamp and Hour of Day matches the time value

Extracting hour from Timestamp
# Pull the hour of day out of each timestamp. read_csv() already parsed
# `timestamp` as a date-time, so no format string is needed, and
# lubridate::hour() avoids the non-standard "%k" strftime conversion.
# Converted to double so it has the same type as `hour_of_day`.
hour_in_Timestamp <- as.numeric(lubridate::hour(meta_trans.df$timestamp))

# Put the derived hour next to the recorded hour_of_day so the two columns can
# be compared directly as numbers (no round-trip through strings).
hour_data <- data.frame(
  column1 = hour_in_Timestamp,
  column2 = meta_trans.df$hour_of_day
)
head(hour_data)
##   column1 column2
## 1      12      12
## 2      19      19
## 3      16      16
## 4       9       9
## 5      14      14
## 6      19      19
Verifying if two values are identical
# Exact comparison of the hour derived from the timestamp (column1) against
# the recorded hour_of_day (column2).
identical(hour_data[["column1"]], hour_data[["column2"]])
## [1] TRUE

Difference between Phishing and Scamming

Phishing is a type of online scam that targets consumers by sending them an e-mail that appears to be from a well-known source - an internet service provider, a bank, or a mortgage company, for example (Federal Trade Commission). As the metaverse provides various unprecedented ways to connect with users, cybercrimes are rising and becoming more sophisticated than before. One of the most common forms of metaverse phishing is brand impersonation. For example, users might receive fake messages or emails directing them to fraudulent websites that mimic popular virtual marketplaces or platforms. In this data, “scam” covers the various other deceptive practices, excluding phishing, that aim to exploit users for financial gain or other malicious purposes.

# List the distinct transaction types present in the data.
unique(meta_trans.df[["transaction_type"]])
## [1] "transfer" "purchase" "sale"     "phishing" "scam"

Anomaly & Risk_score

My first impression of anomaly and risk_score is that they look like the same indicator for identifying the likelihood of scamming or phishing.

# Map the anomaly labels onto the codes 1, 2, 3 in the order
# low_risk, moderate_risk, high_risk.
encoded_anomaly <- meta_trans.df$anomaly %>%
  factor(levels = c("low_risk", "moderate_risk", "high_risk")) %>%
  as.numeric()

# Correlation (cor() default method) between risk score and the encoded level.
correlation <- cor(x = meta_trans.df$risk_score, y = encoded_anomaly)
print(correlation)
## [1] 0.8648303
# Scatter of risk score against the encoded anomaly level with a linear fit.
# The encoded vector is attached as a column first so that aes() refers to
# bare column names of the plot data rather than `meta_trans.df$...` and a
# free-floating vector, which ggplot2 cannot track through facets or layers.
meta_trans.df %>%
  mutate(anomaly_code = encoded_anomaly) %>%
  ggplot(aes(x = risk_score, y = anomaly_code)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "risk score", y = "anomaly", title = "risk score vs anomaly")
## `geom_smooth()` using formula = 'y ~ x'

Duplicates

# TRUE if any row exactly repeats an earlier row, FALSE otherwise. any()
# collapses the per-row flags into a single logical, whereas
# unique(duplicated(...)) returns one or two values depending on the data.
any(duplicated(meta_trans.df))
## [1] FALSE

Data Analysis

Assumption 1

New users would be more prone to scamming and phishing than established and veteran users.
# Store anomaly as a factor with an explicit level order
# (low_risk, moderate_risk, high_risk) so tables and plots follow that order.
meta_trans.df <- meta_trans.df %>%
  mutate(
    anomaly = factor(anomaly, levels = c("low_risk", "moderate_risk", "high_risk"))
  )

# Number of transactions per (anomaly, age_group) pair; combinations that never
# occur are added with a count of 0.
counts <- meta_trans.df %>%
  group_by(anomaly, age_group) %>%
  summarise(count = n(), .groups = "drop") %>%
  complete(anomaly, age_group, fill = list(count = 0))

counts
## # A tibble: 9 × 3
##   anomaly       age_group   count
##   <fct>         <chr>       <int>
## 1 low_risk      established 26033
## 2 low_risk      new         19256
## 3 low_risk      veteran     18205
## 4 moderate_risk established     0
## 5 moderate_risk new           394
## 6 moderate_risk veteran      8217
## 7 high_risk     established     0
## 8 high_risk     new          6495
## 9 high_risk     veteran         0
# Grouped bar chart: for each age group, one bar per risk type showing the
# precomputed count.
ggplot(data = counts, mapping = aes(x = age_group, y = count, fill = anomaly)) +
  geom_col(position = "dodge", colour = "black") +
  labs(
    title = "Risk Type Distribution by Age Group",
    x = "Age Group",
    y = "Count of Risk Types",
    fill = "Risk Type"
  ) +
  scale_fill_manual(
    name = "Risk Type",
    values = c(
      "low_risk" = "#1f77b4",
      "moderate_risk" = "#ff7f0e",
      "high_risk" = "#d62728"
    ),
    labels = c("Low Risk", "Moderate Risk", "High Risk")
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),   # centre the title
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 10)
  )

Result: high_risk transactions occurred only in the “new” age group.

Assumption 2

Transaction amount wouldn’t be a clue to prevent scamming or phishing.
# Average transaction amount within each anomaly level.
mean_amount <- meta_trans.df %>%
  group_by(anomaly) %>%
  summarise(mean_amount = mean(amount), .groups = "drop")

# Bar chart of the per-level average amount, one coloured bar per level.
ggplot(data = mean_amount, mapping = aes(x = anomaly, y = mean_amount, fill = anomaly)) +
  geom_col() +
  labs(
    title = "Average Amount by Anomalies",
    x = "Anomaly",
    y = "Average Transaction Amount",
    fill = "Anomaly Level"
  ) +
  scale_fill_manual(
    values = c("low_risk" = "blue", "moderate_risk" = "orange", "high_risk" = "red")
  ) +
  theme_minimal() +
  # Tilt the x-axis labels.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Overlaid density curves of transaction amount, one per anomaly level, with a
# dashed vertical line at each level's mean amount.
ggplot(meta_trans.df, aes(x = amount, fill = anomaly)) +
  geom_density(alpha = 0.5) +
  geom_vline(
    data = mean_amount,
    aes(xintercept = mean_amount, color = anomaly),
    linetype = "dashed",
    # `size` for line geoms is deprecated since ggplot2 3.4.0; use `linewidth`.
    linewidth = 0.5
  ) +
  labs(
    title = "Distribution of Amount by Anomaly Level",
    x = "Amount",
    y = "Density",
    fill = "Anomaly Level"
  ) +
  scale_fill_manual(values = c("low_risk" = "blue", "moderate_risk" = "orange", "high_risk" = "red")) +
  scale_color_manual(values = c("low_risk" = "blue", "moderate_risk" = "orange", "high_risk" = "red")) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Result: Transaction amounts show no clear pattern across anomaly levels.

Assumption 3

Session duration for scamming or phishing transactions might be relatively shorter than for other transactions. As official websites use two-factor authentication (2FA) to counter phishing attacks, a visitor’s stay on a credible website might be longer than on a malicious one. Moreover, a long session duration reflects high-quality content in blog posts and product pages, all of which increases user engagement.
# Average session duration within each anomaly level.
mean_session_duration <- meta_trans.df %>%
  group_by(anomaly) %>%
  summarise(mean_duration = mean(session_duration), .groups = "drop")

# Fill/line colour assigned to each anomaly level.
anomaly_colors <- c(
  "low_risk" = "blue",
  "moderate_risk" = "orange",
  "high_risk" = "red"
)

# Bar chart of the per-level average session duration.
ggplot(
  data = mean_session_duration,
  mapping = aes(x = anomaly, y = mean_duration, fill = anomaly)
) +
  geom_col() +
  labs(
    title = "Average Session Duration by Anomaly Level",
    x = "Anomaly Level",
    y = "Average Session Duration (seconds)",
    fill = "Anomaly Level"
  ) +
  scale_fill_manual(values = anomaly_colors) +
  theme_minimal() +
  # Tilt the x-axis labels.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Histogram of session durations per anomaly level, scaled to density.
ggplot(meta_trans.df, aes(x = session_duration, fill = anomaly)) +
  geom_histogram(
    # after_stat() replaces the `..density..` notation deprecated in
    # ggplot2 3.4.0.
    aes(y = after_stat(density)),
    bins = 30,
    alpha = 0.5,
    # Overlay the semi-transparent groups instead of stacking them (the
    # default), so each group's bar height is its own density.
    position = "identity"
  ) +
  labs(
    title = "Distribution of Session Durations by Anomaly Level",
    x = "Session Duration (seconds)",
    y = "Density",
    fill = "Anomaly Level"
  ) +
  # Reuse the palette defined alongside mean_session_duration.
  scale_fill_manual(values = anomaly_colors) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Overlaid density curves of session duration, one per anomaly level, with a
# dashed vertical line at each level's mean duration.
ggplot(meta_trans.df, aes(x = session_duration, fill = anomaly)) +
  geom_density(alpha = 0.5) +
  geom_vline(
    data = mean_session_duration,
    aes(xintercept = mean_duration, color = anomaly),
    linetype = "dashed",
    # `size` for line geoms is deprecated since ggplot2 3.4.0; use `linewidth`.
    linewidth = 0.5
  ) +
  labs(
    title = "Distribution of Session Duration by Anomaly Level",
    x = "Session Duration (seconds)",
    y = "Density",
    fill = "Anomaly Level"
  ) +
  # Reuse the palette defined alongside mean_session_duration for both scales.
  scale_fill_manual(values = anomaly_colors) +
  scale_color_manual(values = anomaly_colors) +
  theme_minimal()

Result: Session duration is noticeably shorter in high_risk transactions.