# Report the mean number of individuals affected per breach record.
# paste() joins its arguments with a single space by default, so the label
# must not carry a trailing space of its own (that produced "is  35,778.58"
# with two spaces). format(big.mark = ",") adds the thousands separator.
print(paste(
  "Mean number of Individuals being affected in a business cybersecurity attack is",
  format(mean(breach_data$Individuals.Affected), big.mark = ",")
))
## [1] "Mean number of Individuals being affected in a business cybersecurity attack is 35,778.58"
# Give the data frame short, consistent column names. The names are assigned
# purely by position, so the frame is expected to have exactly these ten
# columns in this order.
names(breach_data) <- c(
  "Id",
  "Name",
  "State",
  "Entity_Type",
  "Num_Affected",
  "Breach_Date",
  "Breach_Type",
  "Breach_Location",
  "Associate_Present",
  "Breach_Desc"
)
#summary(breach_data)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Total number of affected individuals per state, sorted ascending ----

# Keep only the two columns needed for the per-state totals.
breach_byState_subdata <- subset(
  breach_data,
  select = c("State", "Num_Affected")
)

# Sum Num_Affected within each state. With the `by = list(...)` form,
# aggregate() names the grouping column `Group.1` and the summed column `x`.
breach_byState_sum_subdata <- aggregate(
  breach_byState_subdata$Num_Affected,
  by = list(breach_byState_subdata$State),
  FUN = sum
)

# View() opens a data viewer, which only makes sense when someone is at the
# console; skip it when the script is run non-interactively (e.g. Rscript).
if (interactive()) {
  View(breach_byState_sum_subdata)
}

# Order the states from the smallest to the largest total, then give the
# columns descriptive names (by position: state first, total second).
breach_byState_sum_subdata1 <-
  breach_byState_sum_subdata[order(breach_byState_sum_subdata$x), ]
colnames(breach_byState_sum_subdata1) <- c("State", "Total_Associates_Affected")
breach_byState_sum_subdata1
## State Total_Associates_Affected
## 48 VT 550
## 12 HI 674
## 9 DE 1883
## 22 ME 1920
## 1 AK 8500
## 43 SD 9120
## 30 NE 11943
## 29 ND 12650
## 8 DC 13905
## 14 ID 14962
## 3 AR 19383
## 51 WV 21543
## 26 MS 27640
## 17 KS 30656
## 41 RI 31613
## 33 NM 34804
## 13 IA 35584
## 52 WY 37565
## 19 LA 63521
## 34 NV 67077
## 38 OR 69856
## 25 MO 92330
## 18 KY 102340
## 50 WI 115831
## 24 MN 126519
## 49 WA 165956
## 23 MI 172541
## 6 CO 173881
## 20 MA 184939
## 7 CT 210293
## 36 OH 220591
## 4 AZ 234183
## 31 NH 239339
## 37 OK 249348
## 28 NC 282591
## 21 MD 325570
## 16 IN 523629
## 11 GA 562231
## 42 SC 700385
## 46 UT 835276
## 2 AL 1072221
## 27 MT 1105360
## 39 PA 1219266
## 40 PR 1234508
## 5 CA 2422097
## 35 NY 2758702
## 10 FL 2931504
## 32 NJ 3035497
## 45 TX 3492300
## 15 IL 4602939
## 47 VA 5148257
## 44 TN 6125371
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v stringr 1.4.0
## v tidyr 1.1.3 v forcats 0.5.1
## v readr 2.0.0
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.5
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Subset the Tennessee (TN) records, keeping only the columns used below.
breach_TN_subdata <- subset(
  breach_data,
  State == "TN",
  select = c(
    "Entity_Type", "Num_Affected", "Breach_Date",
    "Breach_Type", "Breach_Location", "Associate_Present"
  )
)

# Derive the breach year directly from Breach_Date with lubridate::year().
# A former `as.Date(Breach_Date, format = "%m/%d/%Y")` call was removed here:
# its result was never assigned, and that format does not match the stored
# dates (it returned NA for every row), so it only printed a vector of NAs.
Breach_Year <- year(breach_TN_subdata$Breach_Date)
if (anyNA(Breach_Year)) {
  warning(
    "Breach_Year could not be derived for ", sum(is.na(Breach_Year)),
    " TN record(s); check the Breach_Date values.",
    call. = FALSE
  )
}

# Work on a copy so the TN subset itself stays unchanged, and attach the
# derived year as a new column.
breach_TN_newdata <- breach_TN_subdata
breach_TN_newdata$Breach_Year <- Breach_Year

# Columns used by the graphics below (the year replaces the full date).
breach_TN_newdata1 <- subset(
  breach_TN_newdata,
  select = c(
    "Entity_Type", "Num_Affected", "Breach_Year",
    "Breach_Type", "Breach_Location"
  )
)

# View() opens a data viewer; only do that when running interactively.
if (interactive()) {
  View(breach_TN_newdata1)
}
breach_TN_newdata1
## Entity_Type Num_Affected Breach_Year
## 12 Business Associate 6400 2009
## 39 Health Plan 3900 2010
## 95 Healthcare Provider 1745 2010
## 106 Healthcare Provider 1537 2010
## 141 Healthcare Provider 1711 2010
## 180 Health Plan 1023209 2010
## 198 Healthcare Provider 8200 2010
## 230 Healthcare Provider 500 2011
## 304 Healthcare Provider 810 2011
## 305 Healthcare Provider 705 2011
## 357 Healthcare Provider 731 2011
## 394 Health Plan 1770 2011
## 404 Healthcare Provider 2185 2011
## 501 Health Plan 1102 2012
## 553 Healthcare Provider 27799 2012
## 684 Healthcare Provider 1180 2013
## 686 Business Associate 539 2013
## 741 Healthcare Provider 5690 2013
## 749 Business Associate 32000 2013
## 758 Healthcare Provider 4268 2013
## 771 Healthcare Provider 6932 2013
## 780 Business Associate 4330 2013
## 841 Business Associate 2777 2013
## 861 Healthcare Provider 9602 2014
## 939 Healthcare Provider 1144 2014
## 1033 Healthcare Provider 28300 2014
## 1041 Business Associate 4500000 2014
## 1050 Health Plan 1717 2014
## 1055 Business Associate 566 2014
## 1068 Business Associate 800 2014
## 1077 Healthcare Provider 307528 2014
## 1132 Health Plan 79000 2015
## 1143 Healthcare Provider 56694 2015
## Breach_Type Breach_Location
## 12 Theft Laptop
## 39 Theft Paper/Films
## 95 Loss Laptop
## 106 Theft Desktop Computer, Paper/Films
## 141 Loss Other, Other Portable Electronic Device
## 180 Theft Other
## 198 Improper Disposal Paper/Films
## 230 Theft Desktop Computer
## 304 Theft Laptop
## 305 Theft Laptop
## 357 Theft Paper/Films
## 394 Unauthorized Access/Disclosure Paper/Films
## 404 Theft Laptop, Other Portable Electronic Device
## 501 Loss Paper/Films
## 553 Theft Laptop
## 684 Unauthorized Access/Disclosure Email
## 686 Other Desktop Computer
## 741 Theft Laptop
## 749 Theft Network Server
## 758 Loss Other
## 771 Theft Laptop
## 780 Unauthorized Access/Disclosure Paper/Films
## 841 Theft Laptop
## 861 Theft Laptop
## 939 Unauthorized Access/Disclosure Paper/Films
## 1033 Improper Disposal Other
## 1041 Theft Network Server
## 1050 Other Other
## 1055 Theft Paper/Films
## 1068 Theft Paper/Films
## 1077 Unauthorized Access/Disclosure Network Server
## 1132 Unauthorized Access/Disclosure Other
## 1143 Theft Other Portable Electronic Device
# Scatter plot of breach year against number affected, saved as a PNG ----

# Plain vectors pulled out of the TN graphics data. They feed the lowess
# smoother below and are also read by later plotting code.
x <- breach_TN_newdata1[["Breach_Year"]]
y <- breach_TN_newdata1[["Num_Affected"]]
z <- breach_TN_newdata1[["Entity_Type"]]
t <- breach_TN_newdata1[["Breach_Type"]]
loc <- breach_TN_newdata1[["Breach_Location"]]

# Open the PNG device; everything drawn until dev.off() goes into this file.
png(filename = "scatter_Year_plot.png")

# Draw the points. The y-axis is limited to 300-100000, so records with a
# larger Num_Affected fall outside the plotting region and are not shown.
with(
  breach_TN_newdata,
  plot(
    x = Breach_Year,
    y = Num_Affected,
    main = "Year vs Num",
    xlab = "Year",
    ylab = "Num Affected",
    xlim = c(2009, 2015),
    ylim = c(300, 100000)
  )
)

# Overlay a lowess trend line computed from all TN records.
lines(lowess(x, y), col = "blue")

# Close the device so the file is written.
dev.off()
## png
## 2
library(ggforce)
## Warning: package 'ggforce' was built under R version 4.0.5
# install.packages("ggplot2")
library(ggplot2)
# install.packages("ggbeeswarm")
library(ggbeeswarm)
## Warning: package 'ggbeeswarm' was built under R version 4.0.5
# Beeswarm plots of Num_Affected for the TN records ----
# The aesthetics refer to columns of breach_TN_newdata1 rather than to
# free-standing global vectors, so each plot depends only on its `data`
# argument and the axes are labelled with the column names.

# Number affected by breach year.
ggplot(breach_TN_newdata1, aes(x = Breach_Year, y = Num_Affected)) +
  geom_beeswarm()

# Number affected by entity type.
ggplot(breach_TN_newdata1, aes(x = Entity_Type, y = Num_Affected)) +
  geom_beeswarm()

# Number affected by breach type.
ggplot(breach_TN_newdata1, aes(x = Breach_Type, y = Num_Affected)) +
  geom_beeswarm()

# Number affected by breach location, drawn horizontally. The categorical
# variable stays on x (the axis geom_beeswarm() groups on) and coord_flip()
# turns the plot on its side. coord_flip() must be joined with `+`: as a
# separate statement it is never applied to the plot and merely prints the
# coordinate object.
ggplot(breach_TN_newdata1, aes(x = Breach_Location, y = Num_Affected)) +
  geom_beeswarm() +
  coord_flip()
library(ggplot2)
# Box plots of Num_Affected for the TN records ----
# `group = 1` places every record in one group, so each plot shows a single
# box summarising all years together.

# Plain box plot, kept in `p` so it can be reused.
p <- ggplot(
  data = breach_TN_newdata1,
  mapping = aes(x = Breach_Year, y = Num_Affected, group = 1)
) +
  geom_boxplot()
p

# Same plot turned on its side.
p + coord_flip()

# Box plot with the notch option explicitly switched off (un-notched box).
ggplot(
  data = breach_TN_newdata1,
  mapping = aes(x = Breach_Year, y = Num_Affected, group = 1)
) +
  geom_boxplot(notch = FALSE)

# Highlight outliers: red, star-shaped (shape 8), size 4.
ggplot(
  data = breach_TN_newdata1,
  mapping = aes(x = Breach_Year, y = Num_Affected, group = 1)
) +
  geom_boxplot(
    outlier.colour = "red",
    outlier.shape = 8,
    outlier.size = 4
  )
## Based on the data and graphical analysis, Tennessee had the highest total number of people affected during 2009 - 2015. We can also conclude that most of the breaches were categorized as data theft, and that they occurred mostly at Healthcare Provider businesses. Special attention should be given to Network Server security: when a network server is compromised, a very large population is affected, and the number affected is highly skewed compared to breaches involving other devices.