# Load the HHS cybersecurity breach report from GitHub.
# Text columns must be factors: the script later recodes Breach.Type through
# levels(). R >= 4.0 no longer converts strings to factors by default, so the
# conversion is requested explicitly.
cyberBreachesReport <- read.csv(
  "https://raw.githubusercontent.com/AlainKuiete/SummerBridge/master/HHSCyberSecurityBreaches.csv",
  stringsAsFactors = TRUE
)
# Per-column overview of the raw import.
summary(cyberBreachesReport)
## X
## Min. : 1.0
## 1st Qu.: 288.5
## Median : 576.0
## Mean : 576.0
## 3rd Qu.: 863.5
## Max. :1151.0
##
## Name.of.Covered.Entity
## StayWell Health Management, LLC : 6
## University of California, San Francisco : 5
## Clearpoint Design, Inc. : 4
## UnitedHealth Group health plan single affiliated covered entity: 4
## Walgreen Co. : 4
## Cook County Health & Hospitals System : 3
## (Other) :1125
## State Covered.Entity.Type Individuals.Affected
## CA :128 Business Associate :272 Min. : 500
## TX :100 Health Plan :108 1st Qu.: 1000
## NY : 72 Healthcare Clearing House: 4 Median : 2365
## FL : 69 Healthcare Provider :767 Mean : 35779
## IL : 57 3rd Qu.: 7350
## PA : 45 Max. :4900000
## (Other):680
## Breach.Submission.Date Type.of.Breach
## 2014-04-25: 7 Theft :577
## 2010-07-30: 6 Unauthorized Access/Disclosure:183
## 2012-11-29: 6 Other : 89
## 2013-12-06: 6 Loss : 79
## 2009-11-20: 5 Hacking/IT Incident : 77
## 2010-07-23: 5 Improper Disposal : 42
## (Other) :1116 (Other) :104
## Location.of.Breached.Information
## Paper/Films :254
## Laptop :222
## Other :132
## Network Server :127
## Desktop Computer :108
## Other Portable Electronic Device: 68
## (Other) :240
## Business.Associate.Present
## Mode :logical
## FALSE:879
## TRUE :272
##
##
##
##
## Web.Description
## \\N :892
## : 50
## A bag containing a compact disk - read only memory (CD-ROM) was stolen from the vehicle of a physician associated with the covered entity (CE). The CD-ROM involved in the breach contained names, dates of birth, social security numbers, medical histories, and the treatment information of approximately 2,046 individuals. Following the breach, the CE filed a police report and provided breach notification to affected individuals, HHS, and the media. The CE sanctioned and retrained the physician whose bag was stolen and implemented organization wide improvements to its compliance with the Privacy and Security Rules. As a result of OCR\032\032\032\032\032\032\032\032\032s investigation the covered entity posted substitute notification of the breach in the local paper and confirmed that corrective actions steps were taken. \n\\\n\\\n\\: 2
## The covered entity (CE), Long Island Consultation Center, misplaced an unencrypted portable device that contained the electronic protected health information (ePHI) of 800 individuals. The ePHI included names, dates of birth, diagnoses, and other treatment information. Upon discovery of the breach, the CE conducted a search for the portable device. The CE provided breach notification to HHS, the media, and affected individuals. As a result of OCR\032\032\032\032\032\032\032\032\032s investigation, the CE improved physical security. The CE also developed and implemented a policy and procedure prohibiting use of portable media for storing ePHI and trained staff on its new policy. : 2
## The covered entity (CE), Samaritan Regional Health System, mismatched names and addresses in a mailing to former patients of a recently deceased physician. The protected health information (PHI) included the names and addresses of approximately 2,203 individuals. The CE provided breach notification to affected individuals, the media, and HHS, and posted substitute notice on its website. Following the breach, the CE re-trained staff on proper address validation techniques and implemented new audit procedures for mailings. OCR obtained assurances that the CE implemented the corrective action listed above. : 2
## Two laptop computers containing the electronic protected health information (ePHI) of approximately 5,450 individuals were stolen from the CE. The ePHI included patient names, dates of birth, and social security numbers. The CE provided breach notification to all affected individuals, HHS, and the media. As a result of OCR\032\032\032\032\032\032\032\032\032s investigation, the CE installed encryption software and increased physical security. : 2
## (Other) :201
# Show the structure (column names, types, leading values) of the raw import.
str(cyberBreachesReport)
## 'data.frame': 1151 obs. of 10 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name.of.Covered.Entity : Factor w/ 1055 levels "24 ON Physicians, PC/In Compass Health,Inc.",..: 114 566 29 355 473 234 565 447 524 176 ...
## $ State : Factor w/ 52 levels "AK","AL","AR",..: 45 25 1 8 5 5 5 5 5 5 ...
## $ Covered.Entity.Type : Factor w/ 4 levels "Business Associate",..: 4 4 4 2 4 4 4 4 4 4 ...
## $ Individuals.Affected : int 1000 1000 501 3800 5257 857 6145 952 5166 5900 ...
## $ Breach.Submission.Date : Factor w/ 753 levels "2009-10-21","2009-10-28",..: 1 2 3 4 5 5 5 5 5 6 ...
## $ Type.of.Breach : Factor w/ 29 levels "Hacking/IT Incident",..: 24 24 24 12 24 24 24 24 24 24 ...
## $ Location.of.Breached.Information: Factor w/ 47 levels "Desktop Computer",..: 47 40 45 34 1 1 1 1 1 34 ...
## $ Business.Associate.Present : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Web.Description : Factor w/ 207 levels ""," \n\\The covered entity (CE), Medco Health Solutions, mailed letters with incorrect addresses after a programmi"| __truncated__,..: 9 112 5 40 56 53 55 57 54 35 ...
# Working copy of the report: keep the eight analysis columns (the row
# counter X and the free-text Web.Description are left out) under shorter
# names.
BreachesReport <- setNames(
  cyberBreachesReport[c(
    "Name.of.Covered.Entity",
    "State",
    "Covered.Entity.Type",
    "Individuals.Affected",
    "Breach.Submission.Date",
    "Type.of.Breach",
    "Location.of.Breached.Information",
    "Business.Associate.Present"
  )],
  c(
    "Name", "State", "Business.Type", "Num.people",
    "Date.Submit", "Breach.Type", "Info.Location", "Is.Present"
  )
)
# Per-column overview of the renamed subset.
summary(BreachesReport)
## Name
## StayWell Health Management, LLC : 6
## University of California, San Francisco : 5
## Clearpoint Design, Inc. : 4
## UnitedHealth Group health plan single affiliated covered entity: 4
## Walgreen Co. : 4
## Cook County Health & Hospitals System : 3
## (Other) :1125
## State Business.Type Num.people
## CA :128 Business Associate :272 Min. : 500
## TX :100 Health Plan :108 1st Qu.: 1000
## NY : 72 Healthcare Clearing House: 4 Median : 2365
## FL : 69 Healthcare Provider :767 Mean : 35779
## IL : 57 3rd Qu.: 7350
## PA : 45 Max. :4900000
## (Other):680
## Date.Submit Breach.Type
## 2014-04-25: 7 Theft :577
## 2010-07-30: 6 Unauthorized Access/Disclosure:183
## 2012-11-29: 6 Other : 89
## 2013-12-06: 6 Loss : 79
## 2009-11-20: 5 Hacking/IT Incident : 77
## 2010-07-23: 5 Improper Disposal : 42
## (Other) :1116 (Other) :104
## Info.Location Is.Present
## Paper/Films :254 Mode :logical
## Laptop :222 FALSE:879
## Other :132 TRUE :272
## Network Server :127
## Desktop Computer :108
## Other Portable Electronic Device: 68
## (Other) :240
# Show the structure of the renamed eight-column subset.
str(BreachesReport)
## 'data.frame': 1151 obs. of 8 variables:
## $ Name : Factor w/ 1055 levels "24 ON Physicians, PC/In Compass Health,Inc.",..: 114 566 29 355 473 234 565 447 524 176 ...
## $ State : Factor w/ 52 levels "AK","AL","AR",..: 45 25 1 8 5 5 5 5 5 5 ...
## $ Business.Type: Factor w/ 4 levels "Business Associate",..: 4 4 4 2 4 4 4 4 4 4 ...
## $ Num.people : int 1000 1000 501 3800 5257 857 6145 952 5166 5900 ...
## $ Date.Submit : Factor w/ 753 levels "2009-10-21","2009-10-28",..: 1 2 3 4 5 5 5 5 5 6 ...
## $ Breach.Type : Factor w/ 29 levels "Hacking/IT Incident",..: 24 24 24 12 24 24 24 24 24 24 ...
## $ Info.Location: Factor w/ 47 levels "Desktop Computer",..: 47 40 45 34 1 1 1 1 1 34 ...
## $ Is.Present : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
# Preview the first rows of the working data frame.
head(BreachesReport)
## Name State
## 1 Brooke Army Medical Center TX
## 2 Mid America Kidney Stone Association, LLC MO
## 3 Alaska Department of Health and Social Services AK
## 4 Health Services for Children with Special Needs, Inc. DC
## 5 L. Douglas Carlson, M.D. CA
## 6 David I. Cohen, MD CA
## Business.Type Num.people Date.Submit Breach.Type
## 1 Healthcare Provider 1000 2009-10-21 Theft
## 2 Healthcare Provider 1000 2009-10-28 Theft
## 3 Healthcare Provider 501 2009-10-30 Theft
## 4 Health Plan 3800 2009-11-17 Loss
## 5 Healthcare Provider 5257 2009-11-20 Theft
## 6 Healthcare Provider 857 2009-11-20 Theft
## Info.Location Is.Present
## 1 Paper/Films FALSE
## 2 Network Server FALSE
## 3 Other, Other Portable Electronic Device FALSE
## 4 Laptop FALSE
## 5 Desktop Computer FALSE
## 6 Desktop Computer FALSE
# Preview the last rows of the working data frame.
tail(BreachesReport)
## Name State
## 1146 Senior Health Partners, a Healthfirst company NY
## 1147 Tomas, Arturo IL
## 1148 Pathway to Hope FL
## 1149 Hunt Regional Medical Partners TX
## 1150 Marketing Clique TX
## 1151 Raymond Mark Turner, M.D. NV
## Business.Type Num.people Date.Submit
## 1146 Health Plan 2772 2015-02-06
## 1147 Business Associate 680 2015-02-09
## 1148 Healthcare Provider 600 2015-02-12
## 1149 Healthcare Provider 3000 2015-02-18
## 1150 Health Plan 8700 2015-02-20
## 1151 Healthcare Provider 2153 2015-02-26
## Breach.Type
## 1146 Theft
## 1147 Loss
## 1148 Unauthorized Access/Disclosure
## 1149 Unauthorized Access/Disclosure
## 1150 Unauthorized Access/Disclosure
## 1151 Theft
## Info.Location Is.Present
## 1146 Laptop, Other Portable Electronic Device FALSE
## 1147 Paper/Films TRUE
## 1148 Email FALSE
## 1149 Other FALSE
## 1150 Other FALSE
## 1151 Laptop FALSE
# Centre and range of the number of people affected per breach.
mean(BreachesReport[["Num.people"]])
## [1] 35778.58
median(BreachesReport[["Num.people"]])
## [1] 2365
max(BreachesReport[["Num.people"]])
## [1] 4900000
min(BreachesReport[["Num.people"]])
## [1] 500
#BreachesReport$num.people
#BreachesReport$Breach.Type
# Collapse the 29 raw Breach.Type levels into 7 broad categories.
# The replacement vector is matched to the existing levels purely by
# position, so fail loudly if the column is not a factor or does not have
# exactly 29 levels; otherwise breach types would be mislabelled silently.
stopifnot(
  is.factor(BreachesReport$Breach.Type),
  nlevels(BreachesReport$Breach.Type) == 29L
)
levels(BreachesReport$Breach.Type) <- c(
  rep("Hacking", 6), rep("Neglect", 5), rep("Loss", 7),
  "Other", "Theft", "Theft", "UAA/D", "Unknown",
  rep("Theft", 3), "UAA/D", "UAA/D", "Unknown"
)
# Confirm the merged level set.
levels(BreachesReport$Breach.Type)
## [1] "Hacking" "Neglect" "Loss" "Other" "Theft" "UAA/D" "Unknown"
#BreachesReport$Date.Submit
# Extract the four-digit submission year from each Date.Submit value.
Years <- format(as.Date(as.character(BreachesReport$Date.Submit)),
                format = "%Y")
head(Years)
## [1] "2009" "2009" "2009" "2009" "2009" "2009"
# Store the year on the data frame as a factor so it can act as a grouping
# variable in the plots.
BreachesReport$Years <- factor(Years)
head(BreachesReport$Num.people)
## [1] 1000 1000 501 3800 5257 857
#NYBReport <- data.frame()
#BreachesReport$Region <- Region
library(ggplot2)
# Scatter plot: people affected per covered entity, coloured by breach type.
bridgePlot1 <- ggplot(
  BreachesReport,
  aes(Name, Num.people, colour = Breach.Type)
) +
  geom_point()
bridgePlot1
# Scatter plot: people affected over time, coloured by entity type and sized
# by the number of people affected.
# Date.Submit is stored as a factor with hundreds of levels; plotting it
# directly gives a discrete axis with one unreadable tick per distinct date
# and equal spacing between unequal time gaps. Converting to Date yields a
# continuous time axis. The axis title is kept as the column name.
bridgePlot2 <- ggplot(data = BreachesReport,
                      aes(x = as.Date(Date.Submit), y = Num.people,
                          color = Business.Type, size = Num.people)) +
  geom_point() +
  xlab("Date.Submit")
bridgePlot2
# Same time scatter plot, restricted to breaches affecting at most 5000
# people; ylim() removes the larger breaches from the plot.
# Date.Submit is stored as a factor, so it is converted to Date to obtain a
# continuous time axis instead of one discrete tick per distinct date. The
# axis title is kept as the column name.
bridgePlot24 <- ggplot(data = BreachesReport,
                       aes(x = as.Date(Date.Submit), y = Num.people,
                           color = Business.Type, size = Num.people)) +
  geom_point() +
  ylim(0, 5000) +
  xlab("Date.Submit")
bridgePlot24
## Warning: Removed 369 rows containing missing values (geom_point).
# Histogram of people affected, bars filled by breach type and outlined in
# black.
bridgeplot3 <- ggplot(BreachesReport, aes(Num.people))
bridgeplot3 +
  geom_histogram(mapping = aes(fill = Breach.Type), colour = "Black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##### Comment 4
# Histogram of people affected by breach type, zoomed to 0..1,000,000 on x.
bridgeplot33 <- ggplot(data = BreachesReport, aes(x = Num.people))
bridgeplot37 <- bridgeplot33 +
  geom_histogram(color = "Black", aes(fill = Breach.Type))
# coord_cartesian() expects a length-2 (min, max) limit; `0:1000000` built a
# vector of 1,000,001 integers instead.
bridgeplot39 <- bridgeplot37 + coord_cartesian(xlim = c(0, 1000000))
bridgeplot39
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#bridgeplot35 <- bridgeplot3 + geom_histogram(color = “Black”, # aes(fill = Region))
#In which years do the most breaches occur?
# People affected per year, coloured by entity type: jittered points with
# semi-transparent box plots drawn on top.
bridgePlot4 <- ggplot(data = BreachesReport,
                      aes(x = Years, y = Num.people,
                          color = Business.Type)) +
  geom_jitter()
bridgePlot41 <- bridgePlot4 + geom_boxplot(size = 1, alpha = 0.1)
# Zoom with coord_cartesian() rather than ylim(): ylim() discards every
# breach above 5000 before the box-plot statistics are computed, so the boxes
# would summarise only the truncated data. coord_cartesian() keeps all rows
# in the statistics and only restricts the visible range.
bridgePlot42 <- bridgePlot41 + coord_cartesian(ylim = c(0, 5000))
bridgePlot42
# People affected per year, coloured by year: jittered points with
# semi-transparent box plots drawn on top.
bridgePlot44 <- ggplot(data = BreachesReport,
                       aes(x = Years, y = Num.people,
                           color = Years)) +
  geom_jitter()
bridgePlot46 <- bridgePlot44 + geom_boxplot(size = 1, alpha = 0.1)
# Zoom with coord_cartesian() rather than ylim(): ylim() discards every
# breach above 5000 before the box-plot statistics are computed, so the boxes
# would summarise only the truncated data. coord_cartesian() keeps all rows
# in the statistics and only restricts the visible range.
bridgePlot48 <- bridgePlot46 + coord_cartesian(ylim = c(0, 5000))
bridgePlot48
#Breaches report by year (Scatter points)
# Scatter plot of people affected against row position, coloured by year and
# limited to breaches affecting at most 5000 people.
# The row index is derived from the data instead of the hard-coded 1:1151, so
# the plot keeps working if the number of rows changes.
brs <- ggplot(data = BreachesReport, aes(x = seq_along(Num.people),
                                         y = Num.people))
brs + geom_point(aes(color = Years)) + ylim(0, 5000)
## Warning: Removed 369 rows containing missing values (geom_point).
#Breach type by location of the breached information (using factors)
# Point plot of breach type against the location of the breached information,
# coloured by entity type and sized by the number of people affected.
reportPlot2 <- ggplot(
  BreachesReport,
  aes(
    Info.Location,
    Breach.Type,
    colour = Business.Type,
    size = Num.people
  )
) +
  geom_point()
reportPlot2
#Facets
# Base histogram of people affected, bars filled by year and outlined in
# black; reused by both faceted views below.
sBF1 <- ggplot(BreachesReport, aes(Num.people)) +
  geom_histogram(mapping = aes(fill = Years), colour = "Black")
# Facet grid: entity type in rows, breach type in columns, free axis scales.
sBF2 <- sBF1 +
  facet_grid(Business.Type ~ Breach.Type, scales = "free")
sBF2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Facet grid: year in rows, entity type in columns, free axis scales.
sBF3 <- sBF1 +
  facet_grid(Years ~ Business.Type, scales = "free")
sBF3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##Cybersecurity breaches deeply affected small businesses in health care
Comment 1
For most breaches, the number of people affected is between 0 and 5,000.
#Spread across the companies (scatter points) ## Deep views
Comment 3
Uniform distribution of people affected
Most cybersecurity breaches were due to theft