Cyber Security Threat analysis in businesses (2009 - 2015).

Mean number of individuals affected in each attack

# Average number of individuals affected per breach, shown with thousands separators.
# The message text ends in a space and paste() adds its default separator as well,
# which is why two spaces precede the number in the printed result.
mean_affected <- mean(breach_data$Individuals.Affected)
print(paste(
  "Mean number of Individuals being affected in a business cybersecurity attack is ",
  format(mean_affected, big.mark = ",")
))
## [1] "Mean number of Individuals being affected in a business cybersecurity attack is  35,778.58"

Rename the columns

# Replace the raw headers with short, consistent column names.
# The names are assigned positionally, so their order must match the
# existing column order of breach_data.
names(breach_data) <- c(
  "Id", "Name", "State", "Entity_Type", "Num_Affected",
  "Breach_Date", "Breach_Type", "Breach_Location",
  "Associate_Present", "Breach_Desc"
)
#summary(breach_data)

Subset the data frame with State and Num_Affected columns

Display the states, ordered by number of associates affected

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Create Sub-set data: keep only the state and the affected-count columns
breach_byState_subdata <- breach_data[, c("State", "Num_Affected"), drop = FALSE]

# Sum the affected counts per state; the totals land in a column named "x",
# which is what the ordering step below sorts on
breach_byState_sum_subdata <- aggregate(
  x = breach_byState_subdata[["Num_Affected"]],
  by = list(breach_byState_subdata[["State"]]),
  FUN = sum
)
View(breach_byState_sum_subdata)

# Sort ascending by total (largest state ends up last), then give the
# two columns readable names before printing
ascending_rows <- order(breach_byState_sum_subdata[["x"]])
breach_byState_sum_subdata1 <- breach_byState_sum_subdata[ascending_rows, ]
names(breach_byState_sum_subdata1) <- c("State", "Total_Associates_Affected")
breach_byState_sum_subdata1
##    State Total_Associates_Affected
## 48    VT                       550
## 12    HI                       674
## 9     DE                      1883
## 22    ME                      1920
## 1     AK                      8500
## 43    SD                      9120
## 30    NE                     11943
## 29    ND                     12650
## 8     DC                     13905
## 14    ID                     14962
## 3     AR                     19383
## 51    WV                     21543
## 26    MS                     27640
## 17    KS                     30656
## 41    RI                     31613
## 33    NM                     34804
## 13    IA                     35584
## 52    WY                     37565
## 19    LA                     63521
## 34    NV                     67077
## 38    OR                     69856
## 25    MO                     92330
## 18    KY                    102340
## 50    WI                    115831
## 24    MN                    126519
## 49    WA                    165956
## 23    MI                    172541
## 6     CO                    173881
## 20    MA                    184939
## 7     CT                    210293
## 36    OH                    220591
## 4     AZ                    234183
## 31    NH                    239339
## 37    OK                    249348
## 28    NC                    282591
## 21    MD                    325570
## 16    IN                    523629
## 11    GA                    562231
## 42    SC                    700385
## 46    UT                    835276
## 2     AL                   1072221
## 27    MT                   1105360
## 39    PA                   1219266
## 40    PR                   1234508
## 5     CA                   2422097
## 35    NY                   2758702
## 10    FL                   2931504
## 32    NJ                   3035497
## 45    TX                   3492300
## 15    IL                   4602939
## 47    VA                   5148257
## 44    TN                   6125371

Based on the above analysis, the state of Tennessee had the highest number of people affected.

Let’s drill down into the details for Tennessee breaches

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## v readr   2.0.0
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.5
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Subset data for TN: keep only the columns used in the drill-down
breach_TN_subdata <- subset(
  breach_data,
  State == "TN",
  select = c("Entity_Type", "Num_Affected", "Breach_Date",
             "Breach_Type", "Breach_Location", "Associate_Present")
)

# Extract the calendar year of each breach with lubridate's year().
# A previous as.Date(..., format = "%m/%d/%Y") call was dropped here: its
# result was never assigned, and that format does not match the stored dates
# (every value came back NA), so it contributed nothing to Breach_Year.
Breach_Year <- year(breach_TN_subdata$Breach_Date)

# Replicate the data frame so the TN subset itself is left unmodified
breach_TN_newdata <- breach_TN_subdata

# Attach the per-row breach year as a new column (Breach_Year)
breach_TN_newdata[["Breach_Year"]] <- Breach_Year

# Keep only the columns used by the graphics below -- TN
breach_TN_newdata1 <- breach_TN_newdata[
  ,
  c("Entity_Type", "Num_Affected", "Breach_Year", "Breach_Type", "Breach_Location"),
  drop = FALSE
]

View(breach_TN_newdata1)
breach_TN_newdata1
##              Entity_Type Num_Affected Breach_Year
## 12    Business Associate         6400        2009
## 39           Health Plan         3900        2010
## 95   Healthcare Provider         1745        2010
## 106  Healthcare Provider         1537        2010
## 141  Healthcare Provider         1711        2010
## 180          Health Plan      1023209        2010
## 198  Healthcare Provider         8200        2010
## 230  Healthcare Provider          500        2011
## 304  Healthcare Provider          810        2011
## 305  Healthcare Provider          705        2011
## 357  Healthcare Provider          731        2011
## 394          Health Plan         1770        2011
## 404  Healthcare Provider         2185        2011
## 501          Health Plan         1102        2012
## 553  Healthcare Provider        27799        2012
## 684  Healthcare Provider         1180        2013
## 686   Business Associate          539        2013
## 741  Healthcare Provider         5690        2013
## 749   Business Associate        32000        2013
## 758  Healthcare Provider         4268        2013
## 771  Healthcare Provider         6932        2013
## 780   Business Associate         4330        2013
## 841   Business Associate         2777        2013
## 861  Healthcare Provider         9602        2014
## 939  Healthcare Provider         1144        2014
## 1033 Healthcare Provider        28300        2014
## 1041  Business Associate      4500000        2014
## 1050         Health Plan         1717        2014
## 1055  Business Associate          566        2014
## 1068  Business Associate          800        2014
## 1077 Healthcare Provider       307528        2014
## 1132         Health Plan        79000        2015
## 1143 Healthcare Provider        56694        2015
##                         Breach_Type                          Breach_Location
## 12                            Theft                                   Laptop
## 39                            Theft                              Paper/Films
## 95                             Loss                                   Laptop
## 106                           Theft            Desktop Computer, Paper/Films
## 141                            Loss  Other, Other Portable Electronic Device
## 180                           Theft                                    Other
## 198               Improper Disposal                              Paper/Films
## 230                           Theft                         Desktop Computer
## 304                           Theft                                   Laptop
## 305                           Theft                                   Laptop
## 357                           Theft                              Paper/Films
## 394  Unauthorized Access/Disclosure                              Paper/Films
## 404                           Theft Laptop, Other Portable Electronic Device
## 501                            Loss                              Paper/Films
## 553                           Theft                                   Laptop
## 684  Unauthorized Access/Disclosure                                    Email
## 686                           Other                         Desktop Computer
## 741                           Theft                                   Laptop
## 749                           Theft                           Network Server
## 758                            Loss                                    Other
## 771                           Theft                                   Laptop
## 780  Unauthorized Access/Disclosure                              Paper/Films
## 841                           Theft                                   Laptop
## 861                           Theft                                   Laptop
## 939  Unauthorized Access/Disclosure                              Paper/Films
## 1033              Improper Disposal                                    Other
## 1041                          Theft                           Network Server
## 1050                          Other                                    Other
## 1055                          Theft                              Paper/Films
## 1068                          Theft                              Paper/Films
## 1077 Unauthorized Access/Disclosure                           Network Server
## 1132 Unauthorized Access/Disclosure                                    Other
## 1143                          Theft         Other Portable Electronic Device

Plot the chart by Year/Num_Affected

# Scatter plot of breach year vs. number affected, written to a PNG file

# Plain vectors pulled from the TN graphics subset; the ggplot calls further
# down refer to these by name, so the names must stay as they are.
# x: year, y: number affected, z: entity type, t: breach type, loc: location
# (note: `t` shares its name with base R's t() function)
x <- breach_TN_newdata1[["Breach_Year"]]
y <- breach_TN_newdata1[["Num_Affected"]]
z <- breach_TN_newdata1[["Entity_Type"]]
t <- breach_TN_newdata1[["Breach_Type"]]
loc <- breach_TN_newdata1[["Breach_Location"]]

# Open the PNG graphics device; everything drawn until dev.off() goes to it
png(filename = "scatter_Year_plot.png")

plot(
  x = breach_TN_newdata[["Breach_Year"]],
  y = breach_TN_newdata[["Num_Affected"]],
  main = "Year vs Num",
  xlab = "Year",
  ylab = "Num Affected",
  xlim = c(2009, 2015),
  ylim = c(300, 100000)  # the y-axis is restricted to this range
)

# Overlay a lowess smoother of number affected against year
lines(lowess(x, y), col = "blue")

# Save the file by closing the device.
dev.off()
## png 
##   2
library(ggforce)
## Warning: package 'ggforce' was built under R version 4.0.5
# install.packages("ggplot2")
library(ggplot2)
# install.packages("ggbeeswarm")
library(ggbeeswarm)
## Warning: package 'ggbeeswarm' was built under R version 4.0.5
# Beeswarm of number affected (y) against breach year (x)
ggplot(breach_TN_newdata1, aes(x = x, y = y)) +
  geom_beeswarm()

# Beeswarm of number affected (y) against entity type (z)
ggplot(breach_TN_newdata1, aes(x = z, y = y)) +
  geom_beeswarm()

# Beeswarm of number affected (y) against breach type (t)
ggplot(breach_TN_newdata1, aes(x = t, y = y)) +
  geom_beeswarm()

# Beeswarm of number affected by breach location, drawn horizontally.
# The categorical variable (loc) is mapped to x so the swarm is grouped on it,
# and coord_flip() is chained with `+` so it is actually applied to this plot.
# Previously coord_flip() was a separate statement: it never affected the plot
# and only printed the coord object, while the plot itself raised a ggbeeswarm
# warning about which axis to group on.
ggplot(data = breach_TN_newdata1) +
  aes(x = loc, y = y) +
  geom_beeswarm() +
  coord_flip()
library(ggplot2)
# Shared mapping for all box plots below: number affected against breach year,
# with a constant group (group = 1) used for every row
tn_box_mapping <- aes(x = Breach_Year, y = Num_Affected, group = 1)

# Basic box plot
p <- ggplot(breach_TN_newdata1, tn_box_mapping) +
  geom_boxplot()
p

# Rotate the box plot
p + coord_flip()

# Box plot with notches explicitly switched off (notch = FALSE)
ggplot(breach_TN_newdata1, tn_box_mapping) +
  geom_boxplot(notch = FALSE)

# Change outlier, color, shape and size
ggplot(breach_TN_newdata1, tn_box_mapping) +
  geom_boxplot(
    outlier.colour = "red",
    outlier.shape = 8,
    outlier.size = 4
  )

##Based on the data and graphical analysis, Tennessee had the highest number of people affected during 2009 - 2015. We can also conclude that most of the cyberattacks were categorized as data thefts, which occurred mostly at Healthcare Provider businesses. Special attention should be given to Network Server security: when it is compromised, a huge population is affected, and the impact is highly skewed compared to other device threats.