Load Data

#remove(list=ls())

require(Amelia)
## Loading required package: Amelia
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
require(car)
## Loading required package: car
## Loading required package: carData
require(caret)
## Loading required package: caret
## Loading required package: ggplot2
## Loading required package: lattice
require(DMwR2)
## Loading required package: DMwR2
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
require(ggplot2)
require(ggcorrplot)
## Loading required package: ggcorrplot
require(psych)
## Loading required package: psych
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:car':
## 
##     logit
require(magrittr)
## Loading required package: magrittr
require(MASS)
## Loading required package: MASS
require(randomForest)
## Loading required package: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:ggplot2':
## 
##     margin
require(stargazer)
## Loading required package: stargazer
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
require(tidyverse)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%()           masks ggplot2::%+%()
## ✖ psych::alpha()         masks ggplot2::alpha()
## ✖ dplyr::combine()       masks randomForest::combine()
## ✖ tidyr::extract()       masks magrittr::extract()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ purrr::lift()          masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ dplyr::recode()        masks car::recode()
## ✖ dplyr::select()        masks MASS::select()
## ✖ purrr::set_names()     masks magrittr::set_names()
## ✖ purrr::some()          masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
require(UBL)
## Loading required package: UBL
## Loading required package: MBA
## Loading required package: gstat
## Loading required package: automap
## Loading required package: sp
## 
## Attaching package: 'UBL'
## 
## The following object is masked from 'package:psych':
## 
##     phi
require(visdat)
## Loading required package: visdat
breach <- read.csv("C:/Users/lfult/Desktop/breach anthony/breach.csv", stringsAsFactors = T)
stock <- read.csv("C:/Users/lfult/Desktop/breach anthony/stock.csv", stringsAsFactors = T)

Correlation Function

corfunction=function(d){
  mycorr=cor(d)                          # correlation matrix of the numeric columns
  p.mat=ggcorrplot::cor_pmat(d)          # matrix of correlation p-values
  myplot=ggcorrplot(mycorr, hc.order=TRUE, type="lower",
                    colors=c("red", "white", "green"), tl.cex=8,
                    tl.col="black", lab=TRUE, lab_size=2, p.mat=p.mat,
                    insig="pch", pch=4)   # mark insignificant correlations with an X
  print(myplot)
}
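corfunction is defined here but never invoked below; a minimal usage sketch (hypothetical, meant for the numeric columns of the newdf frame built later in the script) would be:

#corfunction(newdf[sapply(newdf, is.numeric)])  # hypothetical call: correlation plot of the numeric modeling columns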

EDA

Assign Breach = 1

breach$breach=1

Make Year

# extract the four-digit year from the disclosure date string (assumed to start at position 7)
breach$year=as.numeric(sapply(breach$BREACH_DISCLOSURE_DATE, function(x) substr(x,7,11)))
stock$year=stock$fyear
stock$fyear=NULL

NYSE Only

breach=breach[breach$MARKET=="NYSE",]

Key

breach$key=paste(breach$BEST_EDGAR_TICKER, breach$year)

Remove Duplicates

breach=breach[!duplicated(breach[c('key')]), ]

Filter Stock Data / Assign Key

stock=stock[stock$exchg==11,]
stock$key=paste(stock$tic,stock$year)

Merge

df=merge(breach, stock, by=c('key'), all=T)
df$breach[is.na(df$breach)]=0  # firm-years with no matching breach record are coded as non-breaches
df=df[,-c(2:300)]              # drop columns 2:300 of the merged frame (fields not needed below)
df$year=as.factor(ifelse(is.na(df$year.x),df$year.y, df$year.x))

Recode & Delete Some Extra Stuff

df$year.x=df$year.y=NULL
df$breach=as.factor(df$breach)


# drop identifier, industry, and accounting fields not used in the models
df$gvkey=df$datadate=df$consol=df$popsrc=df$datafmt=df$tic=df$cusip=
df$conm=df$curcd=df$county=df$ein=df$spcindcd=df$spcseccd=df$act=df$ch=df$ao=df$at=df$naics=NULL
df=df[!is.na(df$prcc_f),]  # keep only firm-years with a closing price
df$sic=as.factor(df$sic)

Look at Missing

missmap(df)
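visdat is attached above but unused; a vis_miss() view (a sketch, and potentially slow on ~57,000 rows) gives a complementary per-column missingness summary:

vis_miss(df, warn_large_data = FALSE)  # complementary missingness view from visdat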

Describe

describe(df)
##          vars     n     mean       sd  median  trimmed      mad min    max
## key*        1 57091 24386.77 14309.93 24293.0 24346.44 18532.50   1  49247
## breach*     2 57091     1.01     0.09     1.0     1.00     0.00   1      2
## indfmt*     3 57091     1.86     0.34     2.0     1.95     0.00   1      2
## fyr         4 57091    10.62     2.85    12.0    11.38     0.00   1     12
## exchg       5 57091    11.00     0.00    11.0    11.00     0.00  11     11
## costat*     6 57091     1.29     0.45     1.0     1.23     0.00   1      2
## prcc_f      7 57091    67.74  1726.43    22.3    27.15    18.44   0 141600
## idbflag*    8 57091     1.79     0.41     2.0     1.86     0.00   1      2
## sic*        9 57091   239.73   104.50   292.0   254.21    43.00   1    374
## state*     10 57091    28.58    19.54    26.0    28.39    25.20   1     63
## year*      11 57091    12.36     6.26    12.0    12.37     7.41   1     23
##           range  skew kurtosis    se
## key*      49246  0.02    -1.21 59.89
## breach*       1 10.67   111.76  0.00
## indfmt*       1 -2.11     2.43  0.00
## fyr          11 -2.11     3.31  0.01
## exchg         0   NaN      NaN  0.00
## costat*       1  0.94    -1.11  0.00
## prcc_f   141600 60.05  3791.89  7.23
## idbflag*      1 -1.40    -0.05  0.00
## sic*        373 -0.99    -0.38  0.44
## state*       62  0.06    -1.42  0.08
## year*        22 -0.01    -1.19  0.03

Some Plots

ggplot(df, aes(x=breach, y=prcc_f)) + geom_boxplot()

temp=as.data.frame(table(df$breach,df$year))
colnames(temp)=c('Breach', 'Year', 'Frequency')
temp2=temp[temp$Breach==1,]
ggplot(temp, aes(x=Year, y=Frequency, fill=Breach))+geom_bar(stat='identity')+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

ggplot(temp2, aes(x=Year, y=Frequency, group=1))+
  geom_line(color='red')+geom_point()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  ggtitle("Breaches by Year")+
  theme(plot.title = element_text(hjust = 0.5))
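Because breaches are a tiny share of firm-years, the stacked bars are dominated by non-breaches; a sketch of the breach rate per year, built from the temp counts above, may be more informative:

# share of firm-years with a breach, by year
rates=temp %>% group_by(Year) %>% summarise(Rate=Frequency[Breach=='1']/sum(Frequency))
ggplot(rates, aes(x=Year, y=Rate, group=1))+
  geom_line(color='red')+geom_point()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  ggtitle("Breach Rate by Year")+
  theme(plot.title = element_text(hjust = 0.5))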

Remove Key

newdf=df[, -1]
newdf$sic=NULL
newdf$fyr=NULL #fiscal year-end month, not needed
newdf$exchg=NULL
newdf$year=as.numeric(newdf$year)

Showing That a Linear Model (LM) Is a Poor Fit

mylm=lm(prcc_f~breach, data=newdf)
#mystep=stepAIC(mylm)  # run once for variable selection; the selected predictors appear in finallm below

finallm=lm(prcc_f ~ indfmt + idbflag + state + year, data=newdf)
summary(mylm)
## 
## Call:
## lm(formula = prcc_f ~ breach, data = newdf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##    -91    -55    -45    -25 141532 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   67.528      7.257   9.306   <2e-16 ***
## breach1       25.337     78.409   0.323    0.747    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1726 on 57089 degrees of freedom
## Multiple R-squared:  1.829e-06,  Adjusted R-squared:  -1.569e-05 
## F-statistic: 0.1044 on 1 and 57089 DF,  p-value: 0.7466
summary(finallm)
## 
## Call:
## lm(formula = prcc_f ~ indfmt + idbflag + state + year, data = newdf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -7864    -39      8     35 133726 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  184.231     30.154   6.110    1e-09 ***
## indfmtINDL   -76.421     20.406  -3.745 0.000180 ***
## idbflagD    -191.560     19.929  -9.612  < 2e-16 ***
## stateAB      122.459    211.791   0.578 0.563125    
## stateAL      122.314    104.264   1.173 0.240751    
## stateAR      126.535    135.463   0.934 0.350262    
## stateAZ      125.436     73.114   1.716 0.086236 .  
## stateBC       59.912    129.868   0.461 0.644566    
## stateCA      120.636     33.704   3.579 0.000345 ***
## stateCO       97.465     55.109   1.769 0.076967 .  
## stateCT      121.785     50.888   2.393 0.016706 *  
## stateDC      117.322    115.422   1.016 0.309411    
## stateDE       97.182    119.889   0.811 0.417599    
## stateFL      115.739     43.416   2.666 0.007682 ** 
## stateGA      109.600     46.450   2.360 0.018302 *  
## stateHI      130.151    129.478   1.005 0.314807    
## stateIA      118.936    118.722   1.002 0.316440    
## stateID      150.626    184.639   0.816 0.414626    
## stateIL      120.632     34.373   3.510 0.000449 ***
## stateIN      100.640     75.414   1.335 0.182041    
## stateKS      131.847    123.101   1.071 0.284152    
## stateKY      147.942    113.142   1.308 0.191023    
## stateLA      101.865    126.372   0.806 0.420204    
## stateMA      123.232     41.258   2.987 0.002820 ** 
## stateMD      104.538     62.701   1.667 0.095472 .  
## stateME      141.281    191.817   0.737 0.461405    
## stateMI      117.289     64.004   1.833 0.066880 .  
## stateMN      110.404     63.243   1.746 0.080869 .  
## stateMO      112.345     65.839   1.706 0.087948 .  
## stateMS      125.711    171.928   0.731 0.464670    
## stateMT       92.821    200.640   0.463 0.643634    
## stateNC      126.017     55.237   2.281 0.022529 *  
## stateND      114.098    202.503   0.563 0.573140    
## stateNE     7787.813    113.017  68.909  < 2e-16 ***
## stateNH      131.465    142.991   0.919 0.357895    
## stateNJ      122.701     52.954   2.317 0.020500 *  
## stateNM      143.248    354.185   0.404 0.685888    
## stateNV       93.654     87.920   1.065 0.286783    
## stateNY      107.734     29.052   3.708 0.000209 ***
## stateOH      130.529     48.158   2.710 0.006721 ** 
## stateOK      129.395     86.918   1.489 0.136571    
## stateON      100.077     82.418   1.214 0.224653    
## stateOR      137.515    120.315   1.143 0.253060    
## statePA      119.988     42.544   2.820 0.004799 ** 
## statePR       74.971    140.182   0.535 0.592782    
## stateQC      110.455    176.113   0.627 0.530542    
## stateRI       66.331    172.102   0.385 0.699930    
## stateSC      135.579    137.492   0.986 0.324095    
## stateSD      149.452    277.305   0.539 0.589928    
## stateSK       25.082    313.807   0.080 0.936296    
## stateTN      132.966     56.369   2.359 0.018334 *  
## stateTX      121.590     32.162   3.781 0.000157 ***
## stateUT      149.797    108.427   1.382 0.167116    
## stateVA      173.476     50.750   3.418 0.000631 ***
## stateVI      143.262    586.472   0.244 0.807017    
## stateVT      108.716    308.830   0.352 0.724821    
## stateWA      124.874     91.647   1.363 0.173028    
## stateWI      141.197     65.021   2.172 0.029894 *  
## stateWV      111.685    626.905   0.178 0.858603    
## year          -2.685      1.112  -2.413 0.015810 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1657 on 57031 degrees of freedom
## Multiple R-squared:  0.07941,    Adjusted R-squared:  0.07846 
## F-statistic: 83.38 on 59 and 57031 DF,  p-value: < 2.2e-16
c(AIC(mylm), AIC(finallm))
## [1] 1013113 1008505
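The AIC comparison favors finallm, but given the residual range reported above, standard diagnostics make the poor linear fit visible; a quick sketch:

# residual diagnostic plots for the 'better' linear model
par(mfrow=c(2,2))
plot(finallm)
par(mfrow=c(1,1))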

Classification (with Training, Test, and SMOTE)

Overclassification as 1’s

set.seed(1234)

mys=sample(1:nrow(df), .2*nrow(df))  # hold out 20% of rows as a test set

newdf$breach=as.factor(newdf$breach)
newdf$costat=as.numeric(newdf$costat)    # recode factors as numeric so SMOTE can compute distances
newdf$idbflag=as.numeric(newdf$idbflag)
newdf$indfmt=as.numeric(newdf$indfmt)
newdf$state=NULL                         # drop state for the classification models


train=newdf[-mys,]
test=newdf[mys,]
train=SmoteClassif(breach~., train, C.perc=list('0'=1,'1'=225), k=2, repl=T)  # keep all non-breaches, oversample breaches 225x
#train$state=as.factor(train$state)
#test$state=as.factor(test$state)
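A quick check of the class balance before and after SMOTE:

table(newdf$breach[-mys])  # original training rows
table(train$breach)        # after SmoteClassif oversampling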

myglm=glm(breach~., train, family='binomial')
summary(myglm)
## 
## Call:
## glm(formula = breach ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6780  -0.3457   0.4340   0.6458   2.1930  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.044e+00  7.083e-02  14.737   <2e-16 ***
## indfmt      -6.089e-02  2.489e-02  -2.446   0.0144 *  
## costat      -2.400e+00  3.506e-02 -68.468   <2e-16 ***
## prcc_f       6.933e-06  5.959e-06   1.163   0.2447    
## idbflag     -1.224e+00  1.861e-02 -65.779   <2e-16 ***
## year         2.694e-01  1.785e-03 150.933   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 171364  on 133928  degrees of freedom
## Residual deviance: 110830  on 133923  degrees of freedom
## AIC: 110842
## 
## Number of Fisher Scoring iterations: 5
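The logit-scale coefficients are hard to read directly; exponentiating gives odds ratios with Wald confidence intervals (a sketch):

# odds ratios and 95% Wald CIs for the logistic model
exp(cbind(OR=coef(myglm), confint.default(myglm)))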
mypred=round(predict(myglm, test, type='response'),0)  # round predicted probabilities to 0/1 (a 0.5 cutoff)
table(mypred, test$breach)
##       
## mypred    0    1
##      0 7030    3
##      1 4293   92
confusionMatrix(as.factor(mypred), test$breach, positive='1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7030    3
##          1 4293   92
##                                           
##                Accuracy : 0.6238          
##                  95% CI : (0.6148, 0.6326)
##     No Information Rate : 0.9917          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0252          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.968421        
##             Specificity : 0.620860        
##          Pos Pred Value : 0.020981        
##          Neg Pred Value : 0.999573        
##              Prevalence : 0.008320        
##          Detection Rate : 0.008057        
##    Detection Prevalence : 0.384043        
##       Balanced Accuracy : 0.794641        
##                                           
##        'Positive' Class : 1               
## 
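Rounding the predicted probabilities amounts to a fixed 0.5 cutoff; a short sketch sweeping the cutoff shows the sensitivity/specificity trade-off on the test set:

# sensitivity and specificity at several probability cutoffs
probs=predict(myglm, test, type='response')
sapply(c(0.3, 0.5, 0.7, 0.9), function(t){
  pred=factor(as.numeric(probs > t), levels=c(0, 1))
  cm=table(pred, test$breach)
  c(cutoff=t, sensitivity=cm[2,2]/sum(cm[,2]), specificity=cm[1,1]/sum(cm[,1]))
})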

Quick Forest

This model needs hyperparameter tuning and adjustment. Gradient boosting, SVM, and other classifiers also need to be investigated.

myrf= randomForest(breach~.,data=train,ntree=100,mtry=3)

mypred=predict(myrf, test, type='response')
table(mypred, test$breach)
##       
## mypred     0     1
##      0 10475    61
##      1   848    34
confusionMatrix(as.factor(mypred), test$breach, positive='1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10475    61
##          1   848    34
##                                           
##                Accuracy : 0.9204          
##                  95% CI : (0.9153, 0.9253)
##     No Information Rate : 0.9917          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0554          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.357895        
##             Specificity : 0.925108        
##          Pos Pred Value : 0.038549        
##          Neg Pred Value : 0.994210        
##              Prevalence : 0.008320        
##          Detection Rate : 0.002978        
##    Detection Prevalence : 0.077246        
##       Balanced Accuracy : 0.641501        
##                                           
##        'Positive' Class : 1               
##
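As a starting point for the tuning noted above, variable importance and a simple mtry sweep can be sketched with the packages already loaded (the grid values are illustrative):

# variable importance for the fitted forest
varImpPlot(myrf)

# illustrative mtry sweep, scored on the held-out test set
for(m in 1:4){
  fit=randomForest(breach~., data=train, ntree=100, mtry=m)
  pred=predict(fit, test)
  cat("mtry =", m, " test Kappa =",
      round(confusionMatrix(pred, test$breach, positive='1')$overall['Kappa'], 4), "\n")
}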