# Load the HHS cybersecurity breach report straight from GitHub.
# stringsAsFactors is set explicitly: since R 4.0.0 read.csv() keeps strings
# as character by default, but this script relies on factor columns (it
# reassigns levels() of the breach type and summarises columns by level).
cyberBreachesReport <- read.csv(
  "https://raw.githubusercontent.com/AlainKuiete/SummerBridge/master/HHSCyberSecurityBreaches.csv",
  stringsAsFactors = TRUE
)
# Per-column overview of the raw import.
summary(cyberBreachesReport)
## X
## Min. : 1.0
## 1st Qu.: 288.5
## Median : 576.0
## Mean : 576.0
## 3rd Qu.: 863.5
## Max. :1151.0
##
## Name.of.Covered.Entity
## StayWell Health Management, LLC : 6
## University of California, San Francisco : 5
## Clearpoint Design, Inc. : 4
## UnitedHealth Group health plan single affiliated covered entity: 4
## Walgreen Co. : 4
## Cook County Health & Hospitals System : 3
## (Other) :1125
## State Covered.Entity.Type Individuals.Affected
## CA :128 Business Associate :272 Min. : 500
## TX :100 Health Plan :108 1st Qu.: 1000
## NY : 72 Healthcare Clearing House: 4 Median : 2365
## FL : 69 Healthcare Provider :767 Mean : 35779
## IL : 57 3rd Qu.: 7350
## PA : 45 Max. :4900000
## (Other):680
## Breach.Submission.Date Type.of.Breach
## 2014-04-25: 7 Theft :577
## 2010-07-30: 6 Unauthorized Access/Disclosure:183
## 2012-11-29: 6 Other : 89
## 2013-12-06: 6 Loss : 79
## 2009-11-20: 5 Hacking/IT Incident : 77
## 2010-07-23: 5 Improper Disposal : 42
## (Other) :1116 (Other) :104
## Location.of.Breached.Information
## Paper/Films :254
## Laptop :222
## Other :132
## Network Server :127
## Desktop Computer :108
## Other Portable Electronic Device: 68
## (Other) :240
## Business.Associate.Present
## Mode :logical
## FALSE:879
## TRUE :272
##
##
##
##
## Web.Description
## \\N :892
## : 50
## A bag containing a compact disk - read only memory (CD-ROM) was stolen from the vehicle of a physician associated with the covered entity (CE). The CD-ROM involved in the breach contained names, dates of birth, social security numbers, medical histories, and the treatment information of approximately 2,046 individuals. Following the breach, the CE filed a police report and provided breach notification to affected individuals, HHS, and the media. The CE sanctioned and retrained the physician whose bag was stolen and implemented organization wide improvements to its compliance with the Privacy and Security Rules. As a result of OCR\032\032\032\032\032\032\032\032\032s investigation the covered entity posted substitute notification of the breach in the local paper and confirmed that corrective actions steps were taken. \n\\\n\\\n\\: 2
## The covered entity (CE), Long Island Consultation Center, misplaced an unencrypted portable device that contained the electronic protected health information (ePHI) of 800 individuals. The ePHI included names, dates of birth, diagnoses, and other treatment information. Upon discovery of the breach, the CE conducted a search for the portable device. The CE provided breach notification to HHS, the media, and affected individuals. As a result of OCR\032\032\032\032\032\032\032\032\032s investigation, the CE improved physical security. The CE also developed and implemented a policy and procedure prohibiting use of portable media for storing ePHI and trained staff on its new policy. : 2
## The covered entity (CE), Samaritan Regional Health System, mismatched names and addresses in a mailing to former patients of a recently deceased physician. The protected health information (PHI) included the names and addresses of approximately 2,203 individuals. The CE provided breach notification to affected individuals, the media, and HHS, and posted substitute notice on its website. Following the breach, the CE re-trained staff on proper address validation techniques and implemented new audit procedures for mailings. OCR obtained assurances that the CE implemented the corrective action listed above. : 2
## Two laptop computers containing the electronic protected health information (ePHI) of approximately 5,450 individuals were stolen from the CE. The ePHI included patient names, dates of birth, and social security numbers. The CE provided breach notification to all affected individuals, HHS, and the media. As a result of OCR\032\032\032\032\032\032\032\032\032s investigation, the CE installed encryption software and increased physical security. : 2
## (Other) :201
# Show the structure (column types and leading values) of the raw import.
str(cyberBreachesReport)
## 'data.frame': 1151 obs. of 10 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name.of.Covered.Entity : Factor w/ 1055 levels "24 ON Physicians, PC/In Compass Health,Inc.",..: 114 566 29 355 473 234 565 447 524 176 ...
## $ State : Factor w/ 52 levels "AK","AL","AR",..: 45 25 1 8 5 5 5 5 5 5 ...
## $ Covered.Entity.Type : Factor w/ 4 levels "Business Associate",..: 4 4 4 2 4 4 4 4 4 4 ...
## $ Individuals.Affected : int 1000 1000 501 3800 5257 857 6145 952 5166 5900 ...
## $ Breach.Submission.Date : Factor w/ 753 levels "2009-10-21","2009-10-28",..: 1 2 3 4 5 5 5 5 5 6 ...
## $ Type.of.Breach : Factor w/ 29 levels "Hacking/IT Incident",..: 24 24 24 12 24 24 24 24 24 24 ...
## $ Location.of.Breached.Information: Factor w/ 47 levels "Desktop Computer",..: 47 40 45 34 1 1 1 1 1 34 ...
## $ Business.Associate.Present : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Web.Description : Factor w/ 207 levels ""," \n\\The covered entity (CE), Medco Health Solutions, mailed letters with incorrect addresses after a programmi"| __truncated__,..: 9 112 5 40 56 53 55 57 54 35 ...
# Working copy of the report: keep eight of the raw columns under shorter
# names (the row index X and the free-text Web.Description are left out).
BreachesReport <- setNames(
  cyberBreachesReport[c(
    "Name.of.Covered.Entity",
    "State",
    "Covered.Entity.Type",
    "Individuals.Affected",
    "Breach.Submission.Date",
    "Type.of.Breach",
    "Location.of.Breached.Information",
    "Business.Associate.Present"
  )],
  c(
    "Name", "State", "Business.Type", "Num.people",
    "Date.Submit", "Breach.Type", "Info.Location", "Is.Present"
  )
)
summary(BreachesReport)
## Name
## StayWell Health Management, LLC : 6
## University of California, San Francisco : 5
## Clearpoint Design, Inc. : 4
## UnitedHealth Group health plan single affiliated covered entity: 4
## Walgreen Co. : 4
## Cook County Health & Hospitals System : 3
## (Other) :1125
## State Business.Type Num.people
## CA :128 Business Associate :272 Min. : 500
## TX :100 Health Plan :108 1st Qu.: 1000
## NY : 72 Healthcare Clearing House: 4 Median : 2365
## FL : 69 Healthcare Provider :767 Mean : 35779
## IL : 57 3rd Qu.: 7350
## PA : 45 Max. :4900000
## (Other):680
## Date.Submit Breach.Type
## 2014-04-25: 7 Theft :577
## 2010-07-30: 6 Unauthorized Access/Disclosure:183
## 2012-11-29: 6 Other : 89
## 2013-12-06: 6 Loss : 79
## 2009-11-20: 5 Hacking/IT Incident : 77
## 2010-07-23: 5 Improper Disposal : 42
## (Other) :1116 (Other) :104
## Info.Location Is.Present
## Paper/Films :254 Mode :logical
## Laptop :222 FALSE:879
## Other :132 TRUE :272
## Network Server :127
## Desktop Computer :108
## Other Portable Electronic Device: 68
## (Other) :240
# Check that the renamed columns kept their types.
str(BreachesReport)
## 'data.frame': 1151 obs. of 8 variables:
## $ Name : Factor w/ 1055 levels "24 ON Physicians, PC/In Compass Health,Inc.",..: 114 566 29 355 473 234 565 447 524 176 ...
## $ State : Factor w/ 52 levels "AK","AL","AR",..: 45 25 1 8 5 5 5 5 5 5 ...
## $ Business.Type: Factor w/ 4 levels "Business Associate",..: 4 4 4 2 4 4 4 4 4 4 ...
## $ Num.people : int 1000 1000 501 3800 5257 857 6145 952 5166 5900 ...
## $ Date.Submit : Factor w/ 753 levels "2009-10-21","2009-10-28",..: 1 2 3 4 5 5 5 5 5 6 ...
## $ Breach.Type : Factor w/ 29 levels "Hacking/IT Incident",..: 24 24 24 12 24 24 24 24 24 24 ...
## $ Info.Location: Factor w/ 47 levels "Desktop Computer",..: 47 40 45 34 1 1 1 1 1 34 ...
## $ Is.Present : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
# Peek at the first six rows of the working data frame.
head(BreachesReport)
## Name State
## 1 Brooke Army Medical Center TX
## 2 Mid America Kidney Stone Association, LLC MO
## 3 Alaska Department of Health and Social Services AK
## 4 Health Services for Children with Special Needs, Inc. DC
## 5 L. Douglas Carlson, M.D. CA
## 6 David I. Cohen, MD CA
## Business.Type Num.people Date.Submit Breach.Type
## 1 Healthcare Provider 1000 2009-10-21 Theft
## 2 Healthcare Provider 1000 2009-10-28 Theft
## 3 Healthcare Provider 501 2009-10-30 Theft
## 4 Health Plan 3800 2009-11-17 Loss
## 5 Healthcare Provider 5257 2009-11-20 Theft
## 6 Healthcare Provider 857 2009-11-20 Theft
## Info.Location Is.Present
## 1 Paper/Films FALSE
## 2 Network Server FALSE
## 3 Other, Other Portable Electronic Device FALSE
## 4 Laptop FALSE
## 5 Desktop Computer FALSE
## 6 Desktop Computer FALSE
# Peek at the last six rows of the working data frame.
tail(BreachesReport)
## Name State
## 1146 Senior Health Partners, a Healthfirst company NY
## 1147 Tomas, Arturo IL
## 1148 Pathway to Hope FL
## 1149 Hunt Regional Medical Partners TX
## 1150 Marketing Clique TX
## 1151 Raymond Mark Turner, M.D. NV
## Business.Type Num.people Date.Submit
## 1146 Health Plan 2772 2015-02-06
## 1147 Business Associate 680 2015-02-09
## 1148 Healthcare Provider 600 2015-02-12
## 1149 Healthcare Provider 3000 2015-02-18
## 1150 Health Plan 8700 2015-02-20
## 1151 Healthcare Provider 2153 2015-02-26
## Breach.Type
## 1146 Theft
## 1147 Loss
## 1148 Unauthorized Access/Disclosure
## 1149 Unauthorized Access/Disclosure
## 1150 Unauthorized Access/Disclosure
## 1151 Theft
## Info.Location Is.Present
## 1146 Laptop, Other Portable Electronic Device FALSE
## 1147 Paper/Films TRUE
## 1148 Email FALSE
## 1149 Other FALSE
## 1150 Other FALSE
## 1151 Laptop FALSE
# Basic summary statistics for the number of individuals affected per breach.
mean(BreachesReport[["Num.people"]])
## [1] 35778.58
median(BreachesReport[["Num.people"]])
## [1] 2365
max(BreachesReport[["Num.people"]])
## [1] 4900000
min(BreachesReport[["Num.people"]])
## [1] 500
# Collapse the 29 raw breach-type levels into seven broad categories.
# The mapping is positional: entry k of the replacement vector renames the
# k-th existing level, so it is only valid for a factor with exactly 29
# levels in the expected order. Fail loudly on anything else (a character
# column, or a refreshed CSV with a different level set) instead of silently
# mislabelling rows.
stopifnot(
  is.factor(BreachesReport$Breach.Type),
  nlevels(BreachesReport$Breach.Type) == 29L
)
levels(BreachesReport$Breach.Type) <- rep(
  c(
    "Hacking", # levels  1-6
    "Neglect", # levels  7-11
    "Loss",    # levels 12-18
    "Other",   # level  19
    "Theft",   # levels 20-21
    "UAA/D",   # level  22
    "Unknown", # level  23
    "Theft",   # levels 24-26
    "UAA/D",   # levels 27-28
    "Unknown"  # level  29
  ),
  times = c(6L, 5L, 7L, 1L, 2L, 1L, 1L, 3L, 2L, 1L)
)
# Duplicate names merge, leaving the seven categories in first-seen order.
levels(BreachesReport$Breach.Type)
## [1] "Hacking" "Neglect" "Loss" "Other" "Theft" "UAA/D" "Unknown"
# Derive the submission year and month (as strings) from the YYYY-MM-DD
# submission date, then store both on the report as factor columns.
Years <- format(as.Date(BreachesReport[["Date.Submit"]]), format = "%Y")
head(Years)
## [1] "2009" "2009" "2009" "2009" "2009" "2009"
Month <- format(as.Date(BreachesReport[["Date.Submit"]]), format = "%m")
head(Month)
## [1] "10" "10" "10" "11" "11" "11"
BreachesReport[["Years"]] <- factor(Years)
BreachesReport[["Months"]] <- factor(Month)
head(BreachesReport[["Num.people"]])
## [1] 1000 1000 501 3800 5257 857
# Assign every breach to a U.S. region using the built-in `state.abb` and
# `state.region` datasets.

# State abbreviations belonging to each region.
Northeast <- state.abb[state.region == "Northeast"]
South <- state.abb[state.region == "South"]
North_Central <- state.abb[state.region == "North Central"]
West <- state.abb[state.region == "West"]

# Vectorised lookup: find each row's state in `state.abb` and take the region
# at the same position. This avoids growing a vector inside a loop and is
# well-defined for an empty State column (1:length(x) would iterate over
# c(1, 0) in that case).
Region <- as.character(state.region)[
  match(as.character(BreachesReport$State), state.abb)
]
# Codes that are not among the states in `state.abb` (this data set also
# contains DC and PR) are deliberately filed under "South".
Region[is.na(Region)] <- "South"
str(Region)
## chr [1:1151] "South" "North Central" "West" "South" "West" "West" ...
BreachesReport$Region <- factor(Region)
levels(BreachesReport$Region)
## [1] "North Central" "Northeast" "South" "West"
# Slim data frame used for the aggregations: time, place, breach type,
# business type and the number of people affected.
sBR <- with(
  BreachesReport,
  data.frame(
    Years = Years,
    Months = Months,
    State = State,
    Regions = Region,
    Breach.Type = Breach.Type,
    Business.Type = Business.Type,
    Num.People = Num.people
  )
)
summary(sBR)
## Years Months State Regions Breach.Type
## 2009: 18 04 :112 CA :128 North Central:240 Hacking: 94
## 2010:197 01 :106 TX :100 Northeast :200 Neglect: 51
## 2011:192 10 :104 NY : 72 South :442 Loss :105
## 2012:192 11 :104 FL : 69 West :269 Other : 89
## 2013:249 07 : 98 IL : 57 Theft :609
## 2014:278 03 : 95 PA : 45 UAA/D :191
## 2015: 25 (Other):532 (Other):680 Unknown: 12
## Business.Type Num.People
## Business Associate :272 Min. : 500
## Health Plan :108 1st Qu.: 1000
## Healthcare Clearing House: 4 Median : 2365
## Healthcare Provider :767 Mean : 35779
## 3rd Qu.: 7350
## Max. :4900000
##
# Confirm the column types of the slim analysis data frame.
str(sBR)
## 'data.frame': 1151 obs. of 7 variables:
## $ Years : Factor w/ 7 levels "2009","2010",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Months : Factor w/ 12 levels "01","02","03",..: 10 10 10 11 11 11 11 11 11 11 ...
## $ State : Factor w/ 52 levels "AK","AL","AR",..: 45 25 1 8 5 5 5 5 5 5 ...
## $ Regions : Factor w/ 4 levels "North Central",..: 3 1 4 3 4 4 4 4 4 4 ...
## $ Breach.Type : Factor w/ 7 levels "Hacking","Neglect",..: 5 5 5 3 5 5 5 5 5 5 ...
## $ Business.Type: Factor w/ 4 levels "Business Associate",..: 4 4 4 2 4 4 4 4 4 4 ...
## $ Num.People : int 1000 1000 501 3800 5257 857 6145 952 5166 5900 ...
# Total number of people affected per region.
aggregate(Num.People ~ Regions, data = sBR, FUN = sum, na.rm = TRUE)
## Regions Num.People
## 1 North Central 5954333
## 2 Northeast 7682119
## 3 South 22374501
## 4 West 5170191
# The same totals broken down by region and year.
agg <- aggregate(
  Num.People ~ Regions + Years,
  data = sBR,
  FUN = sum,
  na.rm = TRUE
)
agg
## Regions Years Num.People
## 1 North Central 2009 11646
## 2 Northeast 2009 943
## 3 South 2009 96200
## 4 West 2009 25984
## 5 North Central 2010 499141
## 6 Northeast 2010 1554352
## 7 South 2010 3125946
## 8 West 2010 344737
## 9 North Central 2011 208174
## 10 Northeast 2011 4082787
## 11 South 2011 8103854
## 12 West 2011 730857
## 13 North Central 2012 223012
## 14 Northeast 2012 232673
## 15 South 2012 1061385
## 16 West 2012 1233290
## 17 North Central 2013 4493638
## 18 Northeast 2013 215732
## 19 South 2013 925386
## 20 West 2013 1233840
## 21 North Central 2014 465152
## 22 Northeast 2014 1573643
## 23 South 2014 8878150
## 24 West 2014 1586245
## 25 North Central 2015 53570
## 26 Northeast 2015 21989
## 27 South 2015 183580
## 28 West 2015 15238
# Total number of people affected per submission year.
agg2 <- aggregate(
  Num.People ~ Years,
  data = sBR,
  FUN = sum,
  na.rm = TRUE
)
agg2
## Years Num.People
## 1 2009 134773
## 2 2010 5524176
## 3 2011 13125672
## 4 2012 2750360
## 5 2013 6868596
## 6 2014 12503190
## 7 2015 274377
# Total number of people affected per state, labelled with the state's region.
# The grouping variable is sBR's own `Regions` column. A bare `Region` is not
# a column of sBR, so the formula would be resolved against a like-named
# global vector instead, which only works while that vector happens to be
# row-aligned with sBR.
agg3 <- aggregate(
  Num.People ~ State + Regions,
  data = sBR,
  FUN = sum,
  na.rm = TRUE
)
# Keep the result column name `Region` that downstream code refers to.
names(agg3)[names(agg3) == "Regions"] <- "Region"
agg3
## State Region Num.People
## 1 IA North Central 35584
## 2 IL North Central 4602939
## 3 IN North Central 523629
## 4 KS North Central 30656
## 5 MI North Central 172541
## 6 MN North Central 126519
## 7 MO North Central 92330
## 8 ND North Central 12650
## 9 NE North Central 11943
## 10 OH North Central 220591
## 11 SD North Central 9120
## 12 WI North Central 115831
## 13 CT Northeast 210293
## 14 MA Northeast 184939
## 15 ME Northeast 1920
## 16 NH Northeast 239339
## 17 NJ Northeast 3035497
## 18 NY Northeast 2758702
## 19 PA Northeast 1219266
## 20 RI Northeast 31613
## 21 VT Northeast 550
## 22 AL South 1072221
## 23 AR South 19383
## 24 DC South 13905
## 25 DE South 1883
## 26 FL South 2931504
## 27 GA South 562231
## 28 KY South 102340
## 29 LA South 63521
## 30 MD South 325570
## 31 MS South 27640
## 32 NC South 282591
## 33 OK South 249348
## 34 PR South 1234508
## 35 SC South 700385
## 36 TN South 6125371
## 37 TX South 3492300
## 38 VA South 5148257
## 39 WV South 21543
## 40 AK West 8500
## 41 AZ West 234183
## 42 CA West 2422097
## 43 CO West 173881
## 44 HI West 674
## 45 ID West 14962
## 46 MT West 1105360
## 47 NM West 34804
## 48 NV West 67077
## 49 OR West 69856
## 50 UT West 835276
## 51 WA West 165956
## 52 WY West 37565
# Total number of people affected per calendar month within each year.
agg4 <- aggregate(
  Num.People ~ Months + Years,
  data = sBR,
  FUN = sum,
  na.rm = TRUE
)
agg4
## Months Years Num.People
## 1 10 2009 2501
## 2 11 2009 35420
## 3 12 2009 96852
## 4 01 2010 73508
## 5 02 2010 28172
## 6 03 2010 68952
## 7 04 2010 738181
## 8 05 2010 62873
## 9 06 2010 1478161
## 10 07 2010 1068905
## 11 08 2010 133983
## 12 09 2010 129580
## 13 10 2010 34046
## 14 11 2010 1652724
## 15 12 2010 55091
## 16 01 2011 457669
## 17 02 2011 1786533
## 18 03 2011 563888
## 19 04 2011 2260574
## 20 05 2011 530673
## 21 06 2011 135744
## 22 07 2011 71321
## 23 08 2011 151907
## 24 09 2011 34942
## 25 10 2011 1127477
## 26 11 2011 5934332
## 27 12 2011 70612
## 28 01 2012 89323
## 29 02 2012 61575
## 30 03 2012 68340
## 31 04 2012 1379883
## 32 05 2012 150759
## 33 06 2012 33913
## 34 07 2012 55022
## 35 08 2012 243726
## 36 09 2012 103897
## 37 10 2012 247868
## 38 11 2012 97952
## 39 12 2012 218102
## 40 01 2013 105568
## 41 02 2013 207002
## 42 03 2013 103214
## 43 04 2013 88285
## 44 05 2013 300651
## 45 06 2013 46713
## 46 07 2013 537510
## 47 08 2013 4134829
## 48 09 2013 90631
## 49 10 2013 840429
## 50 11 2013 212471
## 51 12 2013 201293
## 52 01 2014 1440600
## 53 02 2014 497492
## 54 03 2014 160855
## 55 04 2014 440315
## 56 05 2014 475704
## 57 06 2014 252873
## 58 07 2014 1276229
## 59 08 2014 4815065
## 60 09 2014 2153087
## 61 10 2014 451324
## 62 11 2014 325714
## 63 12 2014 213932
## 64 01 2015 175778
## 65 02 2015 98599
library(ggplot2)
library(scales)
#### People affected by State
# Base plot on the row-level data: state on the x axis, individuals affected
# on the y axis. Each intermediate plot is printed to show the build-up.
gp <- ggplot(data = BreachesReport,
             aes(x = State, y = Num.people))
gp
# Lines are grouped by region and coloured by submission year.
gp1 <- gp + geom_line(aes(color = Years, group = Region))
gp1
# Title the colour legend after the variable actually mapped to colour
# (Years); labelling it "Region" would misdescribe the colours.
gp2 <- gp1 + scale_color_discrete(name = "Years")
gp2
# Show y-axis values with thousands separators.
gp3 <- gp2 + scale_y_continuous(labels = comma)
gp3
# State-level totals (agg3): one line per region, built up and printed step
# by step.
gp01 <- ggplot(agg3, mapping = aes(x = State, y = Num.People))
gp01
gp10 <- gp01 + geom_line(mapping = aes(colour = Region, group = Region))
gp10
gp20 <- gp10 + scale_colour_discrete(name = "Region")
gp20
# Thousands separators on the y axis.
gp30 <- gp20 + scale_y_continuous(labels = comma)
gp30
gp40 <- gp30 + labs(
  title = "Cybersecurity Breaches",
  x = "State",
  y = "Number of People affected "
)
gp40
# Monthly totals (agg4): one line per year, built up and printed step by step.
gp02 <- ggplot(agg4, mapping = aes(x = Months, y = Num.People))
gp02
gp11 <- gp02 + geom_line(mapping = aes(colour = Years, group = Years))
gp11
gp21 <- gp11 + scale_colour_discrete(name = "Years")
gp21
# Thousands separators on the y axis.
gp31 <- gp21 + scale_y_continuous(labels = comma_format())
gp31
gp41 <- gp31 + labs(
  title = "Cybersecurity Breaches in Health Care",
  x = "Months",
  y = "Number of People affected "
)
gp41