This project reviews each step of a data analysis workflow in R, from data cleaning through analysis to visualization.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
# Load the raw metaverse transactions CSV into a tibble; column types are
# left for read_csv() to guess.
# NOTE(review): the path is absolute and specific to one machine -- consider a
# project-relative path so the script runs elsewhere.
meta_trans.df <- read_csv(
  file = "/Users/hosungkim/Desktop/Data Analysis/Portfolio/Case Study/Metaverse_transactions/metaverse_transactions_dataset.csv"
)
## Rows: 78600 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): sending_address, receiving_address, transaction_type, location_reg...
## dbl (6): hour_of_day, amount, ip_prefix, login_frequency, session_duration,...
## dttm (1): timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows to confirm the columns were read as expected.
head(meta_trans.df, n = 6L)
## # A tibble: 6 × 14
## timestamp hour_of_day sending_address receiving_address amount
## <dttm> <dbl> <chr> <chr> <dbl>
## 1 2022-04-11 12:47:27 12 0x9d32d0bf2c00f41ce7… 0x39f82e1c09bc6d… 797.
## 2 2022-06-14 19:12:46 19 0xd6e251c23cbf52dbd4… 0x51e8fbe24f124e… 0.01
## 3 2022-01-18 16:26:59 16 0x2e0925b922fed01f6a… 0x52c7911879f783… 778.
## 4 2022-06-15 09:20:04 9 0x93efefc25fcaf31d76… 0x8ac3b7bd531b3a… 301.
## 5 2022-02-18 14:35:30 14 0xad3b8de45d63f5cce2… 0x6fdc047c239161… 776.
## 6 2022-04-05 19:05:44 19 0xa99b9a7f5c5dd37429… 0x5a78c88c5fc1e9… 590.
## # ℹ 9 more variables: transaction_type <chr>, location_region <chr>,
## # ip_prefix <dbl>, login_frequency <dbl>, session_duration <dbl>,
## # purchase_pattern <chr>, age_group <chr>, risk_score <dbl>, anomaly <chr>
# Sanity check: does the stored `hour_of_day` column agree with the hour
# embedded in `timestamp`?
#
# `timestamp` is already a date-time column, so it needs no re-parsing
# (as.POSIXct() ignores `format` for input that is already POSIXct).
# "%H" is used rather than "%k" because "%k" is a platform-specific extension
# that is not available everywhere. The as.integer() round-trip strips the
# leading zero so that "09" becomes "9" and matches `hour_of_day`.
hour_in_Timestamp <- as.character(
  as.integer(format(meta_trans.df$timestamp, format = "%H"))
)
# Both columns are kept as character vectors so they can be compared exactly.
hour_data <- data.frame(
  column1 = hour_in_Timestamp,
  column2 = trimws(meta_trans.df$hour_of_day)
)
head(hour_data)
## column1 column2
## 1 12 12
## 2 19 19
## 3 16 16
## 4 9 9
## 5 14 14
## 6 19 19
# TRUE only if the hour derived from the timestamp matches `hour_of_day`
# in every row.
identical(hour_data[["column1"]], hour_data[["column2"]])
## [1] TRUE
Phishing is a type of online scam that targets consumers by sending them an e-mail that appears to be from a well-known source - an internet service provider, a bank, or a mortgage company, for example (Federal Trade Commission). As the metaverse provides various unprecedented ways to connect with users, cybercrimes are rising and becoming more sophisticated than before. One of the most common forms of metaverse phishing is brand impersonation. For example, users might receive fake messages or emails directing them to fraudulent websites that mimic popular virtual marketplaces or platforms. In this dataset, “scam” covers the various deceptive practices, other than phishing, that aim to exploit users for financial gain or other malicious purposes.
# List the distinct transaction types, in order of first appearance.
unique(meta_trans.df[["transaction_type"]])
## [1] "transfer" "purchase" "sale" "phishing" "scam"
My first impression of anomaly and risk_score is that they look like the same indicator for identifying the likelihood of scamming or phishing.
# Map the ordered risk labels onto integer codes:
# low_risk = 1, moderate_risk = 2, high_risk = 3.
encoded_anomaly <- meta_trans.df$anomaly %>%
  factor(levels = c("low_risk", "moderate_risk", "high_risk")) %>%
  as.numeric()
# Correlation (cor() default method) between the numeric risk score and the
# encoded anomaly level.
correlation <- cor(x = meta_trans.df$risk_score, y = encoded_anomaly)
print(correlation)
## [1] 0.8648303
# Scatter plot of risk score against the encoded anomaly level, with a linear
# fit overlaid.
# `risk_score` is referenced by its bare column name so that it is resolved
# through the plot's `data` argument; extracting it with `meta_trans.df$...`
# inside aes() bypasses that and is discouraged by ggplot2.
# `encoded_anomaly` is not a column of `meta_trans.df`; it is picked up from the
# calling environment, where it was computed row-for-row from the same data.
ggplot(meta_trans.df, aes(x = risk_score, y = encoded_anomaly)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "risk score", y = "anomaly", title = "risk score vs anomaly")
## `geom_smooth()` using formula = 'y ~ x'
# Check for fully duplicated rows. any() always yields a single TRUE/FALSE,
# whereas unique(duplicated(...)) returns two values (FALSE and TRUE) as soon as
# at least one duplicate exists.
any(duplicated(meta_trans.df))
## [1] FALSE
# Store `anomaly` as a factor with an explicit low -> moderate -> high level
# order so that tables and plots list the risk levels in that order.
meta_trans.df[["anomaly"]] <- factor(
  meta_trans.df[["anomaly"]],
  levels = c("low_risk", "moderate_risk", "high_risk")
)
# Count rows per (anomaly, age_group) pair, then add the combinations that
# never occur with a count of 0 so every pair is represented.
counts <- complete(
  count(meta_trans.df, anomaly, age_group, name = "count"),
  anomaly, age_group,
  fill = list(count = 0)
)
print(counts)
## # A tibble: 9 × 3
## anomaly age_group count
## <fct> <chr> <int>
## 1 low_risk established 26033
## 2 low_risk new 19256
## 3 low_risk veteran 18205
## 4 moderate_risk established 0
## 5 moderate_risk new 394
## 6 moderate_risk veteran 8217
## 7 high_risk established 0
## 8 high_risk new 6495
## 9 high_risk veteran 0
# Grouped bar chart: one bar per risk level within each age group, with bar
# heights taken directly from the precomputed `count` column.
counts %>%
  ggplot(aes(x = age_group, y = count, fill = anomaly)) +
  geom_col(position = "dodge", color = "black") +
  scale_fill_manual(
    values = c(
      "low_risk" = "#1f77b4",
      "moderate_risk" = "#ff7f0e",
      "high_risk" = "#d62728"
    ),
    name = "Risk Type",
    labels = c("Low Risk", "Moderate Risk", "High Risk")
  ) +
  labs(
    title = "Risk Type Distribution by Age Group",
    x = "Age Group",
    y = "Count of Risk Types",
    fill = "Risk Type"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 10)
  )
# Average transaction amount within each anomaly level (one row per level).
mean_amount <- summarise(
  group_by(meta_trans.df, anomaly),
  mean_amount = mean(amount)
)
# Bar chart of those averages, one colored bar per anomaly level.
mean_amount %>%
  ggplot(aes(x = anomaly, y = mean_amount, fill = anomaly)) +
  geom_col() +
  scale_fill_manual(
    values = c("low_risk" = "blue", "moderate_risk" = "orange", "high_risk" = "red")
  ) +
  labs(
    title = "Average Amount by Anomalies",
    x = "Anomaly",
    y = "Average Transaction Amount",
    fill = "Anomaly Level"
  ) +
  theme_minimal() +
  # Tilt the x-axis labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Density of transaction amounts per anomaly level, with a dashed vertical line
# at each level's mean amount (taken from `mean_amount`).
ggplot(meta_trans.df, aes(x = amount, fill = anomaly)) +
  geom_density(alpha = 0.5) +
  # `linewidth` replaces `size` for line geoms; `size` is deprecated for lines
  # since ggplot2 3.4.0.
  geom_vline(
    data = mean_amount,
    aes(xintercept = mean_amount, color = anomaly),
    linetype = "dashed",
    linewidth = 0.5
  ) +
  labs(
    title = "Distribution of Amount by Anomaly Level",
    x = "Amount",
    y = "Density",
    fill = "Anomaly Level"
  ) +
  scale_fill_manual(values = c("low_risk" = "blue", "moderate_risk" = "orange", "high_risk" = "red")) +
  scale_color_manual(values = c("low_risk" = "blue", "moderate_risk" = "orange", "high_risk" = "red")) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Average session duration within each anomaly level (one row per level).
mean_session_duration <- summarise(
  group_by(meta_trans.df, anomaly),
  mean_duration = mean(session_duration)
)
# Fill color assigned to each anomaly level.
anomaly_colors <- c(
  "low_risk" = "blue",
  "moderate_risk" = "orange",
  "high_risk" = "red"
)
# Bar chart of the average session duration, one bar per anomaly level.
mean_session_duration %>%
  ggplot(aes(x = anomaly, y = mean_duration, fill = anomaly)) +
  geom_col() +
  scale_fill_manual(values = anomaly_colors) +
  labs(
    title = "Average Session Duration by Anomaly Level",
    x = "Anomaly Level",
    y = "Average Session Duration (seconds)",
    fill = "Anomaly Level"
  ) +
  theme_minimal() +
  # Tilt the x-axis labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Histogram of session durations per anomaly level, scaled to density rather
# than raw counts.
# after_stat(density) replaces the `..density..` dot-dot notation, which is
# deprecated since ggplot2 3.4.0.
# NOTE(review): no `position` is given, so the per-level bars use the geom's
# default placement -- confirm whether overlaid bars (position = "identity")
# were intended, given the alpha of 0.5.
ggplot(meta_trans.df, aes(x = session_duration, fill = anomaly)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, alpha = 0.5) +
  labs(
    title = "Distribution of Session Durations by Anomaly Level",
    x = "Session Duration (seconds)",
    y = "Density",
    fill = "Anomaly Level"
  ) +
  scale_fill_manual(values = c("low_risk" = "blue", "moderate_risk" = "orange", "high_risk" = "red")) + # Specify fill colors
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Density of session durations per anomaly level, with a dashed vertical line
# at each level's mean duration (taken from `mean_session_duration`).
ggplot(meta_trans.df, aes(x = session_duration, fill = anomaly)) +
  geom_density(alpha = 0.5) +
  # `linewidth` replaces `size` for line geoms; `size` is deprecated for lines
  # since ggplot2 3.4.0.
  geom_vline(
    data = mean_session_duration,
    aes(xintercept = mean_duration, color = anomaly),
    linetype = "dashed",
    linewidth = 0.5
  ) +
  labs(
    title = "Distribution of Session Duration by Anomaly Level",
    x = "Session Duration (seconds)",
    y = "Density",
    fill = "Anomaly Level"
  ) +
  scale_fill_manual(values = c("low_risk" = "blue", "moderate_risk" = "orange", "high_risk" = "red")) +
  scale_color_manual(values = c("low_risk" = "blue", "moderate_risk" = "orange", "high_risk" = "red")) +
  theme_minimal()