#remove(list=ls())
require(Amelia)
## Loading required package: Amelia
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
require(car)
## Loading required package: car
## Loading required package: carData
require(caret)
## Loading required package: caret
## Loading required package: ggplot2
## Loading required package: lattice
require(DMwR2)
## Loading required package: DMwR2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
require(ggplot2)
require(ggcorrplot)
## Loading required package: ggcorrplot
require(psych)
## Loading required package: psych
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:car':
##
## logit
require(magrittr)
## Loading required package: magrittr
require(MASS)
## Loading required package: MASS
require(randomForest)
## Loading required package: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
require(stargazer)
## Loading required package: stargazer
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
require(tidyverse)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ dplyr::combine() masks randomForest::combine()
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ purrr::some() masks car::some()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
require(UBL)
## Loading required package: UBL
## Loading required package: MBA
## Loading required package: gstat
## Loading required package: automap
## Loading required package: sp
##
## Attaching package: 'UBL'
##
## The following object is masked from 'package:psych':
##
## phi
require(visdat)
## Loading required package: visdat
breach <- read.csv("C:/Users/lfult/Desktop/breach anthony/breach.csv", stringsAsFactors = T)
stock <- read.csv("C:/Users/lfult/Desktop/breach anthony/stock.csv", stringsAsFactors = T)
corfunction=function(d){
mycorr=cor(d[, 1:ncol(d)]); p.mat=ggcorrplot::cor_pmat(d[,1:ncol(d)])
myplot=ggcorrplot(mycorr, hc.order=TRUE,type="lower",
colors=c("red", "white","green"),tl.cex = 8,
tl.col = "black", lab=TRUE, lab_size=2, p.mat=p.mat,
insig="pch", pch=4)
print(myplot)}
breach$breach=1
breach$year=as.numeric(sapply(breach$BREACH_DISCLOSURE_DATE, function(x) substr(x,7,11)))
stock$year=stock$fyear
stock$fyear=NULL
breach=breach[breach$MARKET=="NYSE",]
breach$key=paste(breach$BEST_EDGAR_TICKER, breach$year)
breach=breach[!duplicated(breach[c('key')]), ]
stock=stock[stock$exchg==11,]
stock$key=paste(stock$tic,stock$year)
df=merge(breach, stock, by=c('key'), all=T)
df$breach[is.na(df$breach)]=0
df=df[,-c(2:300)]
df$year=as.factor(ifelse(is.na(df$year.x),df$year.y, df$year.x))
df$year.x=df$year.y=NULL
df$breach=as.factor(df$breach)
df$gvkey=df$datadate=df$consol=df$popsrc=df$datafmt=df$tic=df$cusip=
df$conm=df$curcd=df$county=df$ein=df$spcindcd=df$spcseccd=df$act=df$ch=df$ao=df$at=df$naics=NULL
df=df[!is.na(df$prcc_f),]
df$sic=as.factor(df$sic)
missmap(df)
describe(df)
## vars n mean sd median trimmed mad min max
## key* 1 57091 24386.77 14309.93 24293.0 24346.44 18532.50 1 49247
## breach* 2 57091 1.01 0.09 1.0 1.00 0.00 1 2
## indfmt* 3 57091 1.86 0.34 2.0 1.95 0.00 1 2
## fyr 4 57091 10.62 2.85 12.0 11.38 0.00 1 12
## exchg 5 57091 11.00 0.00 11.0 11.00 0.00 11 11
## costat* 6 57091 1.29 0.45 1.0 1.23 0.00 1 2
## prcc_f 7 57091 67.74 1726.43 22.3 27.15 18.44 0 141600
## idbflag* 8 57091 1.79 0.41 2.0 1.86 0.00 1 2
## sic* 9 57091 239.73 104.50 292.0 254.21 43.00 1 374
## state* 10 57091 28.58 19.54 26.0 28.39 25.20 1 63
## year* 11 57091 12.36 6.26 12.0 12.37 7.41 1 23
## range skew kurtosis se
## key* 49246 0.02 -1.21 59.89
## breach* 1 10.67 111.76 0.00
## indfmt* 1 -2.11 2.43 0.00
## fyr 11 -2.11 3.31 0.01
## exchg 0 NaN NaN 0.00
## costat* 1 0.94 -1.11 0.00
## prcc_f 141600 60.05 3791.89 7.23
## idbflag* 1 -1.40 -0.05 0.00
## sic* 373 -0.99 -0.38 0.44
## state* 62 0.06 -1.42 0.08
## year* 22 -0.01 -1.19 0.03
ggplot(df, aes(x=breach, y=prcc_f)) + geom_boxplot()
temp=as.data.frame(table(df$breach,df$year))
colnames(temp)=c('Breach', 'Year', 'Frequency')
temp2=temp[temp$Breach==1,]
ggplot(temp, aes(x=Year, y=Frequency, fill=Breach))+geom_bar(stat='identity')+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
ggplot(temp2, aes(x=Year, y=Frequency, group=1))+
geom_line(color='red')+geom_point()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
ggtitle("Breaches by Year")+
theme(plot.title = element_text(hjust = 0.5))
newdf=df[, -1]
newdf$sic=NULL
newdf$fyr=NULL #same as year
newdf$exchg=NULL
newdf$year=as.numeric(newdf$year)
mylm=lm(prcc_f~breach, data=newdf)
#mystep=stepAIC(mylm) #I ran this to procure variables that should be in the dataset.
finallm=lm(prcc_f ~ indfmt + idbflag + state + year, data=newdf)
summary(mylm)
##
## Call:
## lm(formula = prcc_f ~ breach, data = newdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -91 -55 -45 -25 141532
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67.528 7.257 9.306 <2e-16 ***
## breach1 25.337 78.409 0.323 0.747
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1726 on 57089 degrees of freedom
## Multiple R-squared: 1.829e-06, Adjusted R-squared: -1.569e-05
## F-statistic: 0.1044 on 1 and 57089 DF, p-value: 0.7466
summary(finallm)
##
## Call:
## lm(formula = prcc_f ~ indfmt + idbflag + state + year, data = newdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7864 -39 8 35 133726
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 184.231 30.154 6.110 1e-09 ***
## indfmtINDL -76.421 20.406 -3.745 0.000180 ***
## idbflagD -191.560 19.929 -9.612 < 2e-16 ***
## stateAB 122.459 211.791 0.578 0.563125
## stateAL 122.314 104.264 1.173 0.240751
## stateAR 126.535 135.463 0.934 0.350262
## stateAZ 125.436 73.114 1.716 0.086236 .
## stateBC 59.912 129.868 0.461 0.644566
## stateCA 120.636 33.704 3.579 0.000345 ***
## stateCO 97.465 55.109 1.769 0.076967 .
## stateCT 121.785 50.888 2.393 0.016706 *
## stateDC 117.322 115.422 1.016 0.309411
## stateDE 97.182 119.889 0.811 0.417599
## stateFL 115.739 43.416 2.666 0.007682 **
## stateGA 109.600 46.450 2.360 0.018302 *
## stateHI 130.151 129.478 1.005 0.314807
## stateIA 118.936 118.722 1.002 0.316440
## stateID 150.626 184.639 0.816 0.414626
## stateIL 120.632 34.373 3.510 0.000449 ***
## stateIN 100.640 75.414 1.335 0.182041
## stateKS 131.847 123.101 1.071 0.284152
## stateKY 147.942 113.142 1.308 0.191023
## stateLA 101.865 126.372 0.806 0.420204
## stateMA 123.232 41.258 2.987 0.002820 **
## stateMD 104.538 62.701 1.667 0.095472 .
## stateME 141.281 191.817 0.737 0.461405
## stateMI 117.289 64.004 1.833 0.066880 .
## stateMN 110.404 63.243 1.746 0.080869 .
## stateMO 112.345 65.839 1.706 0.087948 .
## stateMS 125.711 171.928 0.731 0.464670
## stateMT 92.821 200.640 0.463 0.643634
## stateNC 126.017 55.237 2.281 0.022529 *
## stateND 114.098 202.503 0.563 0.573140
## stateNE 7787.813 113.017 68.909 < 2e-16 ***
## stateNH 131.465 142.991 0.919 0.357895
## stateNJ 122.701 52.954 2.317 0.020500 *
## stateNM 143.248 354.185 0.404 0.685888
## stateNV 93.654 87.920 1.065 0.286783
## stateNY 107.734 29.052 3.708 0.000209 ***
## stateOH 130.529 48.158 2.710 0.006721 **
## stateOK 129.395 86.918 1.489 0.136571
## stateON 100.077 82.418 1.214 0.224653
## stateOR 137.515 120.315 1.143 0.253060
## statePA 119.988 42.544 2.820 0.004799 **
## statePR 74.971 140.182 0.535 0.592782
## stateQC 110.455 176.113 0.627 0.530542
## stateRI 66.331 172.102 0.385 0.699930
## stateSC 135.579 137.492 0.986 0.324095
## stateSD 149.452 277.305 0.539 0.589928
## stateSK 25.082 313.807 0.080 0.936296
## stateTN 132.966 56.369 2.359 0.018334 *
## stateTX 121.590 32.162 3.781 0.000157 ***
## stateUT 149.797 108.427 1.382 0.167116
## stateVA 173.476 50.750 3.418 0.000631 ***
## stateVI 143.262 586.472 0.244 0.807017
## stateVT 108.716 308.830 0.352 0.724821
## stateWA 124.874 91.647 1.363 0.173028
## stateWI 141.197 65.021 2.172 0.029894 *
## stateWV 111.685 626.905 0.178 0.858603
## year -2.685 1.112 -2.413 0.015810 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1657 on 57031 degrees of freedom
## Multiple R-squared: 0.07941, Adjusted R-squared: 0.07846
## F-statistic: 83.38 on 59 and 57031 DF, p-value: < 2.2e-16
c(AIC(mylm), AIC(finallm))
## [1] 1013113 1008505
Overclassification as 1’s
set.seed(1234)
mys=sample(0:nrow(df), .2*nrow(df))
newdf$breach=as.factor(newdf$breach)
newdf$costat=as.numeric(newdf$costat)
newdf$idbflag=as.numeric(newdf$idbflag)
newdf$indfmt=as.numeric(newdf$indfmt)
newdf$state=NULL
train=newdf[-mys,]
test=newdf[mys,]
train=SmoteClassif(breach~.,train, C.perc = list('0'=1,'1'=225), k=2, repl=T)
#train$state=as.factor(train$state)
#test$state=as.factor(test$state)
myglm=glm(breach~., train, family='binomial')
summary(myglm)
##
## Call:
## glm(formula = breach ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6780 -0.3457 0.4340 0.6458 2.1930
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.044e+00 7.083e-02 14.737 <2e-16 ***
## indfmt -6.089e-02 2.489e-02 -2.446 0.0144 *
## costat -2.400e+00 3.506e-02 -68.468 <2e-16 ***
## prcc_f 6.933e-06 5.959e-06 1.163 0.2447
## idbflag -1.224e+00 1.861e-02 -65.779 <2e-16 ***
## year 2.694e-01 1.785e-03 150.933 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 171364 on 133928 degrees of freedom
## Residual deviance: 110830 on 133923 degrees of freedom
## AIC: 110842
##
## Number of Fisher Scoring iterations: 5
mypred=round(predict(myglm, test, type='response'),0)
table(mypred, test$breach)
##
## mypred 0 1
## 0 7030 3
## 1 4293 92
confusionMatrix(as.factor(mypred), test$breach, positive='1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7030 3
## 1 4293 92
##
## Accuracy : 0.6238
## 95% CI : (0.6148, 0.6326)
## No Information Rate : 0.9917
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0252
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.968421
## Specificity : 0.620860
## Pos Pred Value : 0.020981
## Neg Pred Value : 0.999573
## Prevalence : 0.008320
## Detection Rate : 0.008057
## Detection Prevalence : 0.384043
## Balanced Accuracy : 0.794641
##
## 'Positive' Class : 1
##
Needs hyperparameter tuning and adjustment Also, need to investigate gradient boosting, SVM, etc.
myrf= randomForest(breach~.,data=train,ntree=100,mtry=3)
mypred=predict(myrf, test, type='response')
table(mypred, test$breach)
##
## mypred 0 1
## 0 10475 61
## 1 848 34
confusionMatrix(as.factor(mypred), test$breach, positive='1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 10475 61
## 1 848 34
##
## Accuracy : 0.9204
## 95% CI : (0.9153, 0.9253)
## No Information Rate : 0.9917
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0554
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.357895
## Specificity : 0.925108
## Pos Pred Value : 0.038549
## Neg Pred Value : 0.994210
## Prevalence : 0.008320
## Detection Rate : 0.002978
## Detection Prevalence : 0.077246
## Balanced Accuracy : 0.641501
##
## 'Positive' Class : 1
##