This R-Codline has been published to RPubs link
This document covers the R-Codeline as referenced in the Power Point presentation for this CAS study. Please read/consult the presention regards the intense discussion and furhter input to this topic.
Review and Analyze Banking Operational Risk Data supplied by ORX (ORX News)
ORX News Homepage
The ORX News team searches financial media sources worldwide to identify operational risk loss events as they appear. They’re a dedicated team of multilingual researchers working to provide global coverage of operational risk loss events. The service provides detailed and timely reporting of operational risk loss events in the financial services industry, including the banking, insurance and asset management sectors.
ORX News also covers significant cyber events from all industry sectors. Our subscribers use the service to raise awareness of losses, inform their scenario programmes and challenge assumptions.
The service is available to ORX members and non-members.
library(readxl) # XLS File manipulation
library(magrittr) # Contains pipe %>%
library(readxl) # XLS read-write
library(ggplot2) # Graphics and Visualization
library(hexbin) # Needed by ggplot2
library(gridExtra) # Display data in grid
library(grid) # Display data in grid
Libraries used for text mining (tm)
library(tm)
library(stringi)
library(proxy)
library(wordcloud)
library(SnowballC)
library(RColorBrewer)
Libraries used for Decision Trees (CART)
library(rpart)
library(rpart.plot)
library(rattle)
library(magrittr) # pipe
Remove all object currently in the R-Studio session.
rm(list=ls())
ORX_path <-"/Users/tstump/Dropbox/GitHub/CAS_BigData_Analysis/ORX_News/"
ORX_filename <- "HWZ_CAS_BigData_ORX_News_TStump.xlsx"
ORX_file=paste(ORX_path,ORX_filename,sep="")
ORX <- read_excel(ORX_file)
Subset ORX data.frame to active loss numbers, i.e extract valid loss data
ORX_loss_w_0 <- ORX[!is.na(as.numeric(as.character(ORX$`Loss Amount (USD)`))),] # only numericals
## Warning in `[.tbl_df`(ORX, !is.na(as.numeric(as.character(ORX$`Loss Amount
## (USD)`))), : NAs introduced by coercion
ORX_loss <- ORX_loss_w_0[which(ORX_loss_w_0$`Loss Amount (USD)`!=0),] # only non-0 values
Extract only meaningfull columns from ORX.News that make sense for furhter analytics.
ORX_clean <- data.frame(
ORX_loss$`Story Reference Number`,
ORX_loss$Headline,
round(as.numeric(ORX_loss$`Loss Amount (USD)`),0),
ORX_loss$`ORX Standard`,
ORX_loss$`Business Line Level 1 Name`,
ORX_loss$`Product Level 1 Name`,
ORX_loss$`Process Level 1 Name`,
ORX_loss$`Event Type Level 1 Name`,
ORX_loss$`Scenario Category Name`,
ORX_loss$`Basel Event Type Level 1 Name`,
ORX_loss$`Cause 1 Level 1 Name`,
ORX_loss$`Region Name`)
ORX_clean.col_names = c("Ref_ID",
"Headline",
"Loss_USD",
"ORX_Std",
"Business",
"Product",
"Process",
"Event",
"Scenario",
"Basel_II",
"Cause",
"Region")
colnames(ORX_clean) <- ORX_clean.col_names
ORX_NA_removed <- ORX_clean[complete.cases(ORX_clean),]
Define a value 0.0 to 1.0 in the quantile (e.g. 0.9 = 90%). For this exercise, we apply the full data-set, i.e. no outliners will be removed.
loss_quantile <- quantile(ORX_NA_removed$Loss_USD,1,names=FALSE) # 100%, no data reduction
ORX_select <- subset(ORX_NA_removed, Loss_USD < loss_quantile)
str(ORX_select)
## 'data.frame': 4884 obs. of 12 variables:
## $ Ref_ID : num 1 6 7 9 11 12 13 15 16 17 ...
## $ Headline: Factor w/ 5490 levels "‘Hacktivists’ cost PayPal an estimated GBP 3.5 million",..: 1571 2512 4667 3190 3047 5026 947 3182 2850 983 ...
## $ Loss_USD: num 1200000 10539300 1235000 18122644 5172000 ...
## $ ORX_Std : Factor w/ 2 levels "Banking","Insurance": 1 1 1 1 1 1 1 1 1 1 ...
## $ Business: Factor w/ 12 levels "Agency Services",..: 6 4 10 1 10 12 5 2 12 10 ...
## $ Product : Factor w/ 12 levels "Brokerage","Capital Raising",..: 10 6 4 12 10 7 2 8 7 11 ...
## $ Process : Factor w/ 17 levels "Capture and Document Transactions",..: 7 1 17 2 1 2 5 2 16 5 ...
## $ Event : Factor w/ 7 levels "Clients, Products & Business Practices",..: 2 5 4 1 5 5 1 1 1 5 ...
## $ Scenario: Factor w/ 31 levels "1st Party Fraud",..: 30 15 1 24 15 25 17 17 8 17 ...
## $ Basel_II: Factor w/ 7 levels "Business disruption and system failures",..: 4 7 6 2 7 7 2 2 2 7 ...
## $ Cause : Factor w/ 5 levels "External","Governance & Structure",..: 5 4 1 5 4 4 5 5 4 4 ...
## $ Region : Factor w/ 6 levels "Africa","Asia / Pacific",..: 5 2 2 5 6 6 6 5 5 5 ...
summary(ORX_select$Loss_USD)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.434e+03 2.248e+06 7.100e+06 1.501e+08 3.596e+07 3.146e+10
sd(ORX_select$Loss_USD) # Standard Deviation
## [1] 955265423
boxplot(ORX_select$Loss_USD)
By applying the grid.table command, data can be displayed in a table (grid) format.
ORX_head_10 <- head(ORX_clean[,c(1,3,5)],10)
ORX_head_10
grid.table(ORX_head_10)
tt1 <- ttheme_default(core=list(fg_params=list(hjust=0, x=0.1)),
rowhead=list(fg_params=list(hjust=0, x=0)))
grid.table(ORX_head_10, theme=tt1)
tt2 <- ttheme_minimal(
core=list(bg_params = list(fill = blues9[1:4], col=NA),
fg_params=list(fontface=3)),
colhead=list(fg_params=list(col="navyblue", fontface=4L)),
rowhead=list(fg_params=list(col="orange", fontface=3L)))
grid.table(ORX_head_10, theme=tt2)
Tried various options:
ggplot(ORX_select, aes(x=Loss_USD,y=Business)) +
geom_jitter(colour="darkblue") +
labs(title="Loss (USD) vs Business",
subtitle="Operational Risk Losses by Business Line",
x="Loss [USD]", y="Business",
caption="ORX News (https://managingrisktogether.orx.org/orx-news)")
ggplot(ORX_select, aes(x=Loss_USD,y=Product)) +
geom_jitter(colour="darkblue") +
labs(title="Loss (USD) vs Product",
subtitle="Operational Risk Losses by Product",
x="Loss [USD]", y="Product",
caption="ORX News (https://managingrisktogether.orx.org/orx-news)")
ggplot(ORX_select, aes(x=Loss_USD,y=Process)) +
geom_jitter(colour="darkblue") +
labs(title="Loss (USD) vs Process",
subtitle="Operational Risk Losses by Process",
x="Loss [USD]", y="Process",
caption="ORX News (https://managingrisktogether.orx.org/orx-news)")
ggplot(ORX_select, aes(x=Loss_USD,y=Event)) +
geom_jitter(colour="darkblue") +
labs(title="Loss (USD) vs Event",
subtitle="Operational Risk Losses by Event",
x="Loss [USD]", y="Event",
caption="ORX News (https://managingrisktogether.orx.org/orx-news)")
ggplot(ORX_select, aes(x=Loss_USD,y=Scenario)) +
geom_jitter(colour="darkblue") +
labs(title="Loss (USD) vs Scenario",
subtitle="Operational Risk Losses by Scenario",
x="Loss [USD]", y="Scenario",
caption="ORX News (https://managingrisktogether.orx.org/orx-news)")
ggplot(ORX_select, aes(x=Loss_USD,y=Basel_II)) +
geom_jitter(colour="darkblue") +
labs(title="Loss (USD) vs Basel_II",
subtitle="Operational Risk Losses by Basel_II",
x="Loss [USD]", y="Basel_II",
caption="ORX News (https://managingrisktogether.orx.org/orx-news)")
ggplot(ORX_select, aes(x=Loss_USD,y=Cause)) +
geom_jitter(colour="darkblue") +
labs(title="Loss (USD) vs Cause",
subtitle="Operational Risk Losses by Cause",
x="Loss [USD]", y="Cause",
caption="ORX News (https://managingrisktogether.orx.org/orx-news)")
** Anylysis of unstructured data ** It has been estimated that over 70% of potentially useable business information is unstructured, often in the form of text data.
** Text Mining**
Text mining provides a collection of techniques that allow us to derive actionable insights from these data.
** Exercise**
In this presentation we will process the Operational Risk Incidents “Headlines”, as provided by ORX News (See Appendix for the full Codeline)
Extract the ORX News ”Headlines” for further processing. The dataset contains of 7144 text lines. By flattening the text, the various text lines will be transferred into a continuous character string.
summary(ORX$Headline)
## Length Class Mode
## 7144 character character
t_headline <- ORX$`Headline`
t_headline[1:5]
## [1] "Countrywide pays USD 1.2 million in severance pay settlement"
## [2] "Nationwide Mutual Insurance pays USD 7.2 million in death benefits settlement"
## [3] "Regions Financial hit by cyberattack"
## [4] "AFLAC loses USD 4 million in disability insurance fraud scheme"
## [5] "Forethought to pay USD 25 million for failing to pay death benefits to life insurance beneficiaries"
t_headline_flat <- stri_flatten(t_headline, col=" ")
substr(t_headline_flat,1,500)
## [1] "Countrywide pays USD 1.2 million in severance pay settlement Nationwide Mutual Insurance pays USD 7.2 million in death benefits settlement Regions Financial hit by cyberattack AFLAC loses USD 4 million in disability insurance fraud scheme Forethought to pay USD 25 million for failing to pay death benefits to life insurance beneficiaries Four arrested for fraudulently obtaining INR 554.7 million from Vijaya Bank State Bank of India loses INR 65 million in loan fraud scheme Nationwide Mutual pays "
t_headline_stem <- stemDocument(t_headline_flat)
substr(t_headline_stem,1,500)
## [1] "Countrywid pay USD 1.2 million in sever pay settlement Nationwid Mutual Insuranc pay USD 7.2 million in death benefit settlement Region Financi hit by cyberattack AFLAC lose USD 4 million in disabl insur fraud scheme Forethought to pay USD 25 million for fail to pay death benefit to life insur beneficiari Four arrest for fraudul obtain INR 554.7 million from Vijaya Bank State Bank of India lose INR 65 million in loan fraud scheme Nationwid Mutual pay USD 26 million to settl Harleysvill merger cl"
The text gets massively distracted:
Do a stem-completion > does not work at all > out-of-memory and invalid expression error. Spent approx one hour in the internet for fixing this problem, no soultion found. So I stopped further investiation and do not StemCompetion
Command would be: t_headline_complete <- stemCompletion(t_headline_stem,Corpus(VectorSource(t_headline_flat)))
Text will be brought into a “Corpus” data structure and prepared for graphical representation.
docs <- Corpus(VectorSource(t_headline_flat))
docs2 <- tm_map(docs, function(x) stri_replace_all_regex(x, "<.+?>", " "))
docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(x, "\t", " "))
docs4 <- tm_map(docs3, PlainTextDocument)
docs5 <- tm_map(docs4, stripWhitespace)
docs6 <- tm_map(docs5, removePunctuation)
docs7 <- tm_map(docs6, tolower)
docs8 <- tm_map(docs7, removeWords, c(stopwords("english"),
"banking","risk","operational", "finance",
"event","million","usd","bank","eur",
"billion", "banks","financial","banco","pays",
"pay","data","inr","former"))
docs9 <- tm_map(docs8, stemDocument)
## Warning in tm_map.SimpleCorpus(docs8, stemDocument): transformation drops
## documents
Error in stemComplete > out of memory > disabled this command: docs10 <- tm_map(docs9, stemCompletion, t_headline_flat)
Text will be broken-up in words and word-counts (data.frame) and displayed in word-cloud and barplot (next slide)
dtm1 <- TermDocumentMatrix(docs8)
m1 <- as.matrix(dtm1)
v1 <- sort(rowSums(m1),decreasing=TRUE)
d1 <- data.frame(word = names(v1),freq=v1)
head(d1, 10)
set.seed(1234)
wordcloud(words = d1$word, freq = d1$freq, min.freq = 5,
max.words=50, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"), scale = c(3, 1))
barplot(d1[1:10,]$freq, las = 2, names.arg = d1[1:10,]$word,
col ="lightblue", main ="Most frequent words",
ylab = "Word frequencies")
Word Stemming Still another useful preprocessing step involves word stemming. Word stemming reduces words to unify across documents. For example, the stem of “computational”, “computers” and “computation” is “comput”.
Example
By applying Word stemming towards the ORX Headlines, the word count get’s an additional shift. E.g. some words are getting a better (“fined” [3] to “fine” [1]) or a worse ranking (“fraud” [1] to “fraud” [3]).A word may even kicked-out of the top-10 list (“Insurance”).
dtm <- TermDocumentMatrix(docs9)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 5,
max.words=50, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"), scale = c(3, 1))
barplot(d[1:10,]$freq, las = 2, names.arg = d[1:10,]$word,
col ="lightblue", main ="Most frequent words",
ylab = "Word frequencies")
Running a CART on a sub-set of ORX loss data
CART Decision Trees are an important type of algorithm for predictive modeling machine learning. Classification and Regression Trees (CART) is a term to refer to Decision Tree algorithms that can be used for classification or regression predictive modeling problems.
Binary Tree The representation for the CART model is a binary tree. Each root node represents a single input variable (x) and a split point on that variable (assuming the variable is numeric). The leaf nodes of the tree contain an output variable (y) which is used to make a prediction.
Subsetting is needed to get the regression tree in a right shape (too much data)
Remove outliners of Loss_USD
Define a value 0.0 to 1.0 in the quantile (e.g. 0.75 = 75%)
par(mfrow=c(1,2)) # Display two box-blots side-by-side
boxplot(ORX_NA_removed$Loss_USD)
str(ORX_NA_removed)
## 'data.frame': 4885 obs. of 12 variables:
## $ Ref_ID : num 1 6 7 9 11 12 13 15 16 17 ...
## $ Headline: Factor w/ 5490 levels "‘Hacktivists’ cost PayPal an estimated GBP 3.5 million",..: 1571 2512 4667 3190 3047 5026 947 3182 2850 983 ...
## $ Loss_USD: num 1200000 10539300 1235000 18122644 5172000 ...
## $ ORX_Std : Factor w/ 2 levels "Banking","Insurance": 1 1 1 1 1 1 1 1 1 1 ...
## $ Business: Factor w/ 12 levels "Agency Services",..: 6 4 10 1 10 12 5 2 12 10 ...
## $ Product : Factor w/ 12 levels "Brokerage","Capital Raising",..: 10 6 4 12 10 7 2 8 7 11 ...
## $ Process : Factor w/ 17 levels "Capture and Document Transactions",..: 7 1 17 2 1 2 5 2 16 5 ...
## $ Event : Factor w/ 7 levels "Clients, Products & Business Practices",..: 2 5 4 1 5 5 1 1 1 5 ...
## $ Scenario: Factor w/ 31 levels "1st Party Fraud",..: 30 15 1 24 15 25 17 17 8 17 ...
## $ Basel_II: Factor w/ 7 levels "Business disruption and system failures",..: 4 7 6 2 7 7 2 2 2 7 ...
## $ Cause : Factor w/ 5 levels "External","Governance & Structure",..: 5 4 1 5 4 4 5 5 4 4 ...
## $ Region : Factor w/ 6 levels "Africa","Asia / Pacific",..: 5 2 2 5 6 6 6 5 5 5 ...
loss_quantile <- quantile(ORX_NA_removed$Loss_USD,0.75,names=FALSE)
ORX_RM_Outliners <- subset(ORX_NA_removed, Loss_USD < loss_quantile)
boxplot(ORX_RM_Outliners$Loss_USD)
str(ORX_RM_Outliners)
## 'data.frame': 3663 obs. of 12 variables:
## $ Ref_ID : num 1 6 7 9 11 13 17 18 20 21 ...
## $ Headline: Factor w/ 5490 levels "‘Hacktivists’ cost PayPal an estimated GBP 3.5 million",..: 1571 2512 4667 3190 3047 947 983 3567 4699 567 ...
## $ Loss_USD: num 1200000 10539300 1235000 18122644 5172000 ...
## $ ORX_Std : Factor w/ 2 levels "Banking","Insurance": 1 1 1 1 1 1 1 1 1 1 ...
## $ Business: Factor w/ 12 levels "Agency Services",..: 6 4 10 1 10 5 10 2 4 10 ...
## $ Product : Factor w/ 12 levels "Brokerage","Capital Raising",..: 10 6 4 12 10 2 11 6 4 6 ...
## $ Process : Factor w/ 17 levels "Capture and Document Transactions",..: 7 1 17 2 1 5 5 2 2 16 ...
## $ Event : Factor w/ 7 levels "Clients, Products & Business Practices",..: 2 5 4 1 5 1 5 1 1 7 ...
## $ Scenario: Factor w/ 31 levels "1st Party Fraud",..: 30 15 1 24 15 17 17 17 17 23 ...
## $ Basel_II: Factor w/ 7 levels "Business disruption and system failures",..: 4 7 6 2 7 2 7 2 2 1 ...
## $ Cause : Factor w/ 5 levels "External","Governance & Structure",..: 5 4 1 5 4 5 4 4 4 3 ...
## $ Region : Factor w/ 6 levels "Africa","Asia / Pacific",..: 5 2 2 5 6 6 5 5 5 5 ...
Sub-set of columns from ORX_NA_removed, covering:
ORX_CART_all <- ORX_RM_Outliners[,c(3,5,6,8,11)]
Currently we don’t sample, ie. we take full data-set. But the codeline below may be applied (1 to be replace with 0.x value) to sample data.
n_Daten <- nrow(ORX_CART_all)
sample_selection <- sample(nrow(ORX_CART_all), 1 * n_Daten)
ORX_CART_sample <- ORX_CART_all[sample_selection,]
Furthermore, we reduce the data set to certain Business, Products, etc., that may be of interest for our business.
ORX_CART_Business <- ORX_CART_sample [ ORX_CART_sample$Business == "Retail Banking"
| ORX_CART_sample$Business == "Commercial Banking"
| ORX_CART_sample$Business == "Private Banking",]
ORX_CART_Product <- ORX_CART_Business [ ORX_CART_Business$Product == "Commercial Credit"
| ORX_CART_Business$Product == "Deposits"
| ORX_CART_Business$Product == "Retail Credit",]
ORX_CART_Event <- ORX_CART_Product [ ORX_CART_Product$Event == "External Fraud"
| ORX_CART_Product$Event == "Internal Fraud"
| ORX_CART_Product$Event == "Clients, Products & Business Practices",]
ORX_CART_Cause <- ORX_CART_Event [ ORX_CART_Event$Cause == "External"
| ORX_CART_Event$Cause == "People / Staff"
| ORX_CART_Event$Cause == "Processes",]
ORX_CART <- ORX_CART_Cause
summary(ORX_CART)
## Loss_USD Business
## Min. : 30000 Retail Banking :901
## 1st Qu.: 1783816 Commercial Banking:611
## Median : 4067369 Private Banking : 91
## Mean : 7539607 Agency Services : 0
## 3rd Qu.:10187250 Asset Management : 0
## Max. :35794000 Clearing : 0
## (Other) : 0
## Product
## Retail Credit :658
## Commercial Credit :520
## Deposits :425
## Brokerage : 0
## Capital Raising : 0
## Cash Management, Payments & Settlements: 0
## (Other) : 0
## Event
## Clients, Products & Business Practices :508
## Employee Practices & Workplace Safety : 0
## Execution, Delivery & Process Management: 0
## External Fraud :564
## Internal Fraud :531
## Natural Disasters & Public Safety : 0
## Technology & Infrastructure Failure : 0
## Cause
## External :680
## Governance & Structure : 0
## Internal Systems Failures: 0
## People / Staff :467
## Processes :456
##
##
Recursive Partitioning And Regression Trees. Fit a rpart model.
Establish the recursive partitioning of the regression tree.
fit <- rpart(Loss_USD ~ Business + Product + Event + Cause, method="anova", data = ORX_CART)
fit
## n= 1603
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1603 1.065068e+17 7539607
## 2) Product=Deposits 425 1.599335e+16 4883696 *
## 3) Product=Commercial Credit,Retail Credit 1178 8.643398e+16 8497809
## 6) Product=Retail Credit 658 4.473826e+16 7748525
## 12) Event=External Fraud,Internal Fraud 351 1.619624e+16 6028030 *
## 13) Event=Clients, Products & Business Practices 307 2.631511e+16 9715606 *
## 7) Product=Commercial Credit 520 4.085885e+16 9445942 *
fancyRpartPlot(fit,main="Regression Tree for ORX Loss-USD",sub="Sub-Set of ORX-News data - External OpRisk Incidents")