Study of Cybersecurity Breaches in Health Care

State and Region Where People Were Affected

People affected by Months and Years

Importing data

# Download the HHS breach report from GitHub.
# Later steps relabel factor levels and the printed structure shows Factor
# columns, so text columns must be read as factors. That was the implicit
# default before R 4.0.0; asking for it explicitly keeps the analysis working
# on R >= 4.0.0, where read.csv() otherwise returns character columns.
cyberBreachesReport <- read.csv(
  "https://raw.githubusercontent.com/AlainKuiete/SummerBridge/master/HHSCyberSecurityBreaches.csv",
  stringsAsFactors = TRUE
)

Exploring data

summary(cyberBreachesReport)
##        X         
##  Min.   :   1.0  
##  1st Qu.: 288.5  
##  Median : 576.0  
##  Mean   : 576.0  
##  3rd Qu.: 863.5  
##  Max.   :1151.0  
##                  
##                                                      Name.of.Covered.Entity
##  StayWell Health Management, LLC                                :   6      
##  University of California, San Francisco                        :   5      
##  Clearpoint Design, Inc.                                        :   4      
##  UnitedHealth Group health plan single affiliated covered entity:   4      
##  Walgreen Co.                                                   :   4      
##  Cook County Health & Hospitals System                          :   3      
##  (Other)                                                        :1125      
##      State                    Covered.Entity.Type Individuals.Affected
##  CA     :128   Business Associate       :272      Min.   :    500     
##  TX     :100   Health Plan              :108      1st Qu.:   1000     
##  NY     : 72   Healthcare Clearing House:  4      Median :   2365     
##  FL     : 69   Healthcare Provider      :767      Mean   :  35779     
##  IL     : 57                                      3rd Qu.:   7350     
##  PA     : 45                                      Max.   :4900000     
##  (Other):680                                                          
##  Breach.Submission.Date                        Type.of.Breach
##  2014-04-25:   7        Theft                         :577   
##  2010-07-30:   6        Unauthorized Access/Disclosure:183   
##  2012-11-29:   6        Other                         : 89   
##  2013-12-06:   6        Loss                          : 79   
##  2009-11-20:   5        Hacking/IT Incident           : 77   
##  2010-07-23:   5        Improper Disposal             : 42   
##  (Other)   :1116        (Other)                       :104   
##                  Location.of.Breached.Information
##  Paper/Films                     :254            
##  Laptop                          :222            
##  Other                           :132            
##  Network Server                  :127            
##  Desktop Computer                :108            
##  Other Portable Electronic Device: 68            
##  (Other)                         :240            
##  Business.Associate.Present
##  Mode :logical             
##  FALSE:879                 
##  TRUE :272                 
##                            
##                            
##                            
##                            
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     Web.Description
##  \\N                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         :892  
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              : 50  
##  A bag containing a compact disk - read only memory (CD-ROM) was stolen from the vehicle of a physician associated with the covered entity (CE).  The CD-ROM involved in the breach contained names, dates of birth, social security numbers, medical histories, and the treatment information of approximately 2,046 individuals.  Following the breach, the CE filed a police report and provided breach notification to affected individuals, HHS, and the media.  The CE sanctioned and retrained the physician whose bag was stolen and implemented organization wide improvements to its compliance with the Privacy and Security Rules.  As a result of OCR\032\032\032\032\032\032\032\032\032s investigation the covered entity posted substitute notification of the breach in the local paper and confirmed that corrective actions steps were taken. \n\\\n\\\n\\:  2  
##  The covered entity (CE), Long Island Consultation Center, misplaced an unencrypted portable device that contained the electronic protected health information (ePHI) of 800 individuals.  The ePHI included names, dates of birth, diagnoses, and other treatment information.  Upon discovery of the breach, the CE conducted a search for the portable device.  The CE provided breach notification to HHS, the media, and affected individuals.  As a result of OCR\032\032\032\032\032\032\032\032\032s investigation, the CE improved physical security.   The CE also developed and implemented a policy and procedure prohibiting use of portable media for storing ePHI and trained staff on its new policy.                                                                                                                                                        :  2  
##  The covered entity (CE), Samaritan Regional Health System, mismatched names and addresses in a mailing to former patients of a recently deceased physician.  The protected health information (PHI) included the names and addresses of approximately 2,203 individuals.  The CE provided breach notification to affected individuals, the media, and HHS, and posted substitute notice on its website.  Following the breach, the CE re-trained staff on proper address validation techniques and implemented new audit procedures for mailings.  OCR obtained assurances that the CE implemented the corrective action listed above.                                                                                                                                                                                                                                      :  2  
##  Two laptop computers containing the electronic protected health information (ePHI) of approximately 5,450 individuals were stolen from the CE.  The ePHI included patient names, dates of birth, and social security numbers.  The CE provided breach notification to all affected individuals, HHS, and the media.  As a result of OCR\032\032\032\032\032\032\032\032\032s investigation, the CE installed encryption software and increased physical security.                                                                                                                                                                                                                                                                                                                                                                                                           :  2  
##  (Other)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     :201
str(cyberBreachesReport)
## 'data.frame':    1151 obs. of  10 variables:
##  $ X                               : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Name.of.Covered.Entity          : Factor w/ 1055 levels "24 ON Physicians, PC/In Compass Health,Inc.",..: 114 566 29 355 473 234 565 447 524 176 ...
##  $ State                           : Factor w/ 52 levels "AK","AL","AR",..: 45 25 1 8 5 5 5 5 5 5 ...
##  $ Covered.Entity.Type             : Factor w/ 4 levels "Business Associate",..: 4 4 4 2 4 4 4 4 4 4 ...
##  $ Individuals.Affected            : int  1000 1000 501 3800 5257 857 6145 952 5166 5900 ...
##  $ Breach.Submission.Date          : Factor w/ 753 levels "2009-10-21","2009-10-28",..: 1 2 3 4 5 5 5 5 5 6 ...
##  $ Type.of.Breach                  : Factor w/ 29 levels "Hacking/IT Incident",..: 24 24 24 12 24 24 24 24 24 24 ...
##  $ Location.of.Breached.Information: Factor w/ 47 levels "Desktop Computer",..: 47 40 45 34 1 1 1 1 1 34 ...
##  $ Business.Associate.Present      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Web.Description                 : Factor w/ 207 levels ""," \n\\The covered entity (CE), Medco Health Solutions, mailed letters with incorrect addresses after a programmi"| __truncated__,..: 9 112 5 40 56 53 55 57 54 35 ...
# Working copy of the raw report: keeps eight of its columns under shorter
# names (the row index X and the free-text Web.Description are left out).
BreachesReport <- with(
  cyberBreachesReport,
  data.frame(
    Name = Name.of.Covered.Entity,
    State = State,
    Business.Type = Covered.Entity.Type,
    Num.people = Individuals.Affected,
    Date.Submit = Breach.Submission.Date,
    Breach.Type = Type.of.Breach,
    Info.Location = Location.of.Breached.Information,
    Is.Present = Business.Associate.Present
  )
)


summary(BreachesReport)
##                                                               Name     
##  StayWell Health Management, LLC                                :   6  
##  University of California, San Francisco                        :   5  
##  Clearpoint Design, Inc.                                        :   4  
##  UnitedHealth Group health plan single affiliated covered entity:   4  
##  Walgreen Co.                                                   :   4  
##  Cook County Health & Hospitals System                          :   3  
##  (Other)                                                        :1125  
##      State                       Business.Type   Num.people     
##  CA     :128   Business Associate       :272   Min.   :    500  
##  TX     :100   Health Plan              :108   1st Qu.:   1000  
##  NY     : 72   Healthcare Clearing House:  4   Median :   2365  
##  FL     : 69   Healthcare Provider      :767   Mean   :  35779  
##  IL     : 57                                   3rd Qu.:   7350  
##  PA     : 45                                   Max.   :4900000  
##  (Other):680                                                    
##      Date.Submit                           Breach.Type 
##  2014-04-25:   7   Theft                         :577  
##  2010-07-30:   6   Unauthorized Access/Disclosure:183  
##  2012-11-29:   6   Other                         : 89  
##  2013-12-06:   6   Loss                          : 79  
##  2009-11-20:   5   Hacking/IT Incident           : 77  
##  2010-07-23:   5   Improper Disposal             : 42  
##  (Other)   :1116   (Other)                       :104  
##                           Info.Location Is.Present     
##  Paper/Films                     :254   Mode :logical  
##  Laptop                          :222   FALSE:879      
##  Other                           :132   TRUE :272      
##  Network Server                  :127                  
##  Desktop Computer                :108                  
##  Other Portable Electronic Device: 68                  
##  (Other)                         :240
str(BreachesReport)
## 'data.frame':    1151 obs. of  8 variables:
##  $ Name         : Factor w/ 1055 levels "24 ON Physicians, PC/In Compass Health,Inc.",..: 114 566 29 355 473 234 565 447 524 176 ...
##  $ State        : Factor w/ 52 levels "AK","AL","AR",..: 45 25 1 8 5 5 5 5 5 5 ...
##  $ Business.Type: Factor w/ 4 levels "Business Associate",..: 4 4 4 2 4 4 4 4 4 4 ...
##  $ Num.people   : int  1000 1000 501 3800 5257 857 6145 952 5166 5900 ...
##  $ Date.Submit  : Factor w/ 753 levels "2009-10-21","2009-10-28",..: 1 2 3 4 5 5 5 5 5 6 ...
##  $ Breach.Type  : Factor w/ 29 levels "Hacking/IT Incident",..: 24 24 24 12 24 24 24 24 24 24 ...
##  $ Info.Location: Factor w/ 47 levels "Desktop Computer",..: 47 40 45 34 1 1 1 1 1 34 ...
##  $ Is.Present   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
head(BreachesReport)
##                                                    Name State
## 1                            Brooke Army Medical Center    TX
## 2             Mid America Kidney Stone Association, LLC    MO
## 3       Alaska Department of Health and Social Services    AK
## 4 Health Services for Children with Special Needs, Inc.    DC
## 5                              L. Douglas Carlson, M.D.    CA
## 6                                    David I. Cohen, MD    CA
##         Business.Type Num.people Date.Submit Breach.Type
## 1 Healthcare Provider       1000  2009-10-21       Theft
## 2 Healthcare Provider       1000  2009-10-28       Theft
## 3 Healthcare Provider        501  2009-10-30       Theft
## 4         Health Plan       3800  2009-11-17        Loss
## 5 Healthcare Provider       5257  2009-11-20       Theft
## 6 Healthcare Provider        857  2009-11-20       Theft
##                             Info.Location Is.Present
## 1                             Paper/Films      FALSE
## 2                          Network Server      FALSE
## 3 Other, Other Portable Electronic Device      FALSE
## 4                                  Laptop      FALSE
## 5                        Desktop Computer      FALSE
## 6                        Desktop Computer      FALSE
tail(BreachesReport)
##                                               Name State
## 1146 Senior Health Partners, a Healthfirst company    NY
## 1147                                 Tomas, Arturo    IL
## 1148                               Pathway to Hope    FL
## 1149                Hunt Regional Medical Partners    TX
## 1150                              Marketing Clique    TX
## 1151                     Raymond Mark Turner, M.D.    NV
##            Business.Type Num.people Date.Submit
## 1146         Health Plan       2772  2015-02-06
## 1147  Business Associate        680  2015-02-09
## 1148 Healthcare Provider        600  2015-02-12
## 1149 Healthcare Provider       3000  2015-02-18
## 1150         Health Plan       8700  2015-02-20
## 1151 Healthcare Provider       2153  2015-02-26
##                         Breach.Type
## 1146                          Theft
## 1147                           Loss
## 1148 Unauthorized Access/Disclosure
## 1149 Unauthorized Access/Disclosure
## 1150 Unauthorized Access/Disclosure
## 1151                          Theft
##                                 Info.Location Is.Present
## 1146 Laptop, Other Portable Electronic Device      FALSE
## 1147                              Paper/Films       TRUE
## 1148                                    Email      FALSE
## 1149                                    Other      FALSE
## 1150                                    Other      FALSE
## 1151                                   Laptop      FALSE
mean(BreachesReport$Num.people)
## [1] 35778.58
median(BreachesReport$Num.people)
## [1] 2365
max(BreachesReport$Num.people)
## [1] 4900000
min(BreachesReport$Num.people)
## [1] 500

Transforming my data

# Collapse the 29 raw Breach.Type levels into 7 broader categories.
# `levels<-` relabels by position: the i-th entry below replaces the i-th
# existing level, and levels that receive the same label are merged.
# NOTE(review): the mapping assumes the levels are in the order produced when
# the CSV was read (presumably alphabetical) -- re-check it against
# levels(BreachesReport$Breach.Type) whenever the data changes.
breach_categories <- c(
  "Hacking", "Hacking", "Hacking", "Hacking", "Hacking", "Hacking",
  "Neglect", "Neglect", "Neglect", "Neglect", "Neglect",
  "Loss", "Loss", "Loss", "Loss", "Loss", "Loss", "Loss",
  "Other",
  "Theft", "Theft",
  "UAA/D",
  "Unknown",
  "Theft", "Theft", "Theft",
  "UAA/D", "UAA/D",
  "Unknown"
)

# Fail loudly rather than mislabel: a positional relabelling is only
# meaningful when the column is a factor with exactly one level per label.
stopifnot(
  is.factor(BreachesReport$Breach.Type),
  nlevels(BreachesReport$Breach.Type) == length(breach_categories)
)

levels(BreachesReport$Breach.Type) <- breach_categories


levels(BreachesReport$Breach.Type)
## [1] "Hacking" "Neglect" "Loss"    "Other"   "Theft"   "UAA/D"   "Unknown"
Inserting and factoring columns Years and Months
# Four-digit year of each submission date, as text.
Years <- format(as.Date(BreachesReport$Date.Submit), format = "%Y")
head(Years)
## [1] "2009" "2009" "2009" "2009" "2009" "2009"
# Two-digit month of each submission date, as text.
Month <- format(as.Date(BreachesReport$Date.Submit), format = "%m")
head(Month)
## [1] "10" "10" "10" "11" "11" "11"
# Store both on the report as factor columns.
BreachesReport[["Years"]] <- factor(Years)
BreachesReport[["Months"]] <- factor(Month)
head(BreachesReport$Num.people)
## [1] 1000 1000  501 3800 5257  857

Creating The Region Column

# State abbreviations belonging to each region, taken from the built-in
# `state.abb` and `state.region` datasets (parallel vectors, one entry per
# state). Logical subsetting picks every state of a region in one step.
Northeast <- state.abb[state.region == "Northeast"]
South <- state.abb[state.region == "South"]
North_Central <- state.abb[state.region == "North Central"]
West <- state.abb[state.region == "West"]

# Region of every breach record, found by looking up the record's state
# abbreviation in the built-in `state.abb` / `state.region` datasets.
# match() gives the position of each code in `state.abb` (NA when absent),
# which then indexes the region labels -- one vectorised step, no loop.
Region <- as.character(state.region)[
  match(as.character(BreachesReport$State), state.abb)
]
# Codes that are not in `state.abb` (DC and PR in this data set) are
# counted as "South".
Region[is.na(Region)] <- "South"

str(Region)
##  chr [1:1151] "South" "North Central" "West" "South" "West" "West" ...
# Store the region labels on the report as a factor column.
BreachesReport[["Region"]] <- factor(Region)
levels(BreachesReport$Region)
## [1] "North Central" "Northeast"     "South"         "West"

Creating a New Dataframe

# Slimmed-down frame used for aggregation and plotting: time, place, breach
# and business type, and the head count. Note the two renames on the way in:
# Region -> Regions and Num.people -> Num.People.
sBR <- with(
  BreachesReport,
  data.frame(
    Years = Years,
    Months = Months,
    State = State,
    Regions = Region,
    Breach.Type = Breach.Type,
    Business.Type = Business.Type,
    Num.People = Num.people
  )
)

summary(sBR)
##   Years         Months        State              Regions     Breach.Type 
##  2009: 18   04     :112   CA     :128   North Central:240   Hacking: 94  
##  2010:197   01     :106   TX     :100   Northeast    :200   Neglect: 51  
##  2011:192   10     :104   NY     : 72   South        :442   Loss   :105  
##  2012:192   11     :104   FL     : 69   West         :269   Other  : 89  
##  2013:249   07     : 98   IL     : 57                       Theft  :609  
##  2014:278   03     : 95   PA     : 45                       UAA/D  :191  
##  2015: 25   (Other):532   (Other):680                       Unknown: 12  
##                    Business.Type   Num.People     
##  Business Associate       :272   Min.   :    500  
##  Health Plan              :108   1st Qu.:   1000  
##  Healthcare Clearing House:  4   Median :   2365  
##  Healthcare Provider      :767   Mean   :  35779  
##                                  3rd Qu.:   7350  
##                                  Max.   :4900000  
## 
str(sBR)
## 'data.frame':    1151 obs. of  7 variables:
##  $ Years        : Factor w/ 7 levels "2009","2010",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Months       : Factor w/ 12 levels "01","02","03",..: 10 10 10 11 11 11 11 11 11 11 ...
##  $ State        : Factor w/ 52 levels "AK","AL","AR",..: 45 25 1 8 5 5 5 5 5 5 ...
##  $ Regions      : Factor w/ 4 levels "North Central",..: 3 1 4 3 4 4 4 4 4 4 ...
##  $ Breach.Type  : Factor w/ 7 levels "Hacking","Neglect",..: 5 5 5 3 5 5 5 5 5 5 ...
##  $ Business.Type: Factor w/ 4 levels "Business Associate",..: 4 4 4 2 4 4 4 4 4 4 ...
##  $ Num.People   : int  1000 1000 501 3800 5257 857 6145 952 5166 5900 ...

Using Aggregation

Aggregation by Region

# Total number of people affected in each region.
aggregate(Num.People ~ Regions, data = sBR, FUN = sum, na.rm = TRUE)
##         Regions Num.People
## 1 North Central    5954333
## 2     Northeast    7682119
## 3         South   22374501
## 4          West    5170191

Aggregation by Region and Year

# Total number of people affected for every region/year combination.
# aggregate() already returns a data frame, so it is stored directly.
agg <- aggregate(
  Num.People ~ Regions + Years,
  data = sBR,
  FUN = sum,
  na.rm = TRUE
)
agg
##          Regions Years Num.People
## 1  North Central  2009      11646
## 2      Northeast  2009        943
## 3          South  2009      96200
## 4           West  2009      25984
## 5  North Central  2010     499141
## 6      Northeast  2010    1554352
## 7          South  2010    3125946
## 8           West  2010     344737
## 9  North Central  2011     208174
## 10     Northeast  2011    4082787
## 11         South  2011    8103854
## 12          West  2011     730857
## 13 North Central  2012     223012
## 14     Northeast  2012     232673
## 15         South  2012    1061385
## 16          West  2012    1233290
## 17 North Central  2013    4493638
## 18     Northeast  2013     215732
## 19         South  2013     925386
## 20          West  2013    1233840
## 21 North Central  2014     465152
## 22     Northeast  2014    1573643
## 23         South  2014    8878150
## 24          West  2014    1586245
## 25 North Central  2015      53570
## 26     Northeast  2015      21989
## 27         South  2015     183580
## 28          West  2015      15238
# Total number of people affected in each year.
# aggregate() already returns a data frame, so it is stored directly.
agg2 <- aggregate(
  Num.People ~ Years,
  data = sBR,
  FUN = sum,
  na.rm = TRUE
)
agg2
##   Years Num.People
## 1  2009     134773
## 2  2010    5524176
## 3  2011   13125672
## 4  2012    2750360
## 5  2013    6868596
## 6  2014   12503190
## 7  2015     274377

Aggregation by State and Region

# Total number of people affected per state, with the state's region kept
# alongside. Grouping uses the `Regions` column of `sBR` itself, so the
# result depends only on `sBR` and not on a like-named vector that happens
# to exist in the global environment.
agg3 <- aggregate(
  Num.People ~ State + Regions,
  data = sBR,
  FUN = sum,
  na.rm = TRUE
)
# Expose the grouping column as `Region`, the name the state/region plot
# reads from `agg3`.
names(agg3)[names(agg3) == "Regions"] <- "Region"
agg3
##    State        Region Num.People
## 1     IA North Central      35584
## 2     IL North Central    4602939
## 3     IN North Central     523629
## 4     KS North Central      30656
## 5     MI North Central     172541
## 6     MN North Central     126519
## 7     MO North Central      92330
## 8     ND North Central      12650
## 9     NE North Central      11943
## 10    OH North Central     220591
## 11    SD North Central       9120
## 12    WI North Central     115831
## 13    CT     Northeast     210293
## 14    MA     Northeast     184939
## 15    ME     Northeast       1920
## 16    NH     Northeast     239339
## 17    NJ     Northeast    3035497
## 18    NY     Northeast    2758702
## 19    PA     Northeast    1219266
## 20    RI     Northeast      31613
## 21    VT     Northeast        550
## 22    AL         South    1072221
## 23    AR         South      19383
## 24    DC         South      13905
## 25    DE         South       1883
## 26    FL         South    2931504
## 27    GA         South     562231
## 28    KY         South     102340
## 29    LA         South      63521
## 30    MD         South     325570
## 31    MS         South      27640
## 32    NC         South     282591
## 33    OK         South     249348
## 34    PR         South    1234508
## 35    SC         South     700385
## 36    TN         South    6125371
## 37    TX         South    3492300
## 38    VA         South    5148257
## 39    WV         South      21543
## 40    AK          West       8500
## 41    AZ          West     234183
## 42    CA          West    2422097
## 43    CO          West     173881
## 44    HI          West        674
## 45    ID          West      14962
## 46    MT          West    1105360
## 47    NM          West      34804
## 48    NV          West      67077
## 49    OR          West      69856
## 50    UT          West     835276
## 51    WA          West     165956
## 52    WY          West      37565

Aggregation by Month and Year

# Total number of people affected for every month/year combination.
# aggregate() already returns a data frame, so it is stored directly.
agg4 <- aggregate(
  Num.People ~ Months + Years,
  data = sBR,
  FUN = sum,
  na.rm = TRUE
)
agg4
##    Months Years Num.People
## 1      10  2009       2501
## 2      11  2009      35420
## 3      12  2009      96852
## 4      01  2010      73508
## 5      02  2010      28172
## 6      03  2010      68952
## 7      04  2010     738181
## 8      05  2010      62873
## 9      06  2010    1478161
## 10     07  2010    1068905
## 11     08  2010     133983
## 12     09  2010     129580
## 13     10  2010      34046
## 14     11  2010    1652724
## 15     12  2010      55091
## 16     01  2011     457669
## 17     02  2011    1786533
## 18     03  2011     563888
## 19     04  2011    2260574
## 20     05  2011     530673
## 21     06  2011     135744
## 22     07  2011      71321
## 23     08  2011     151907
## 24     09  2011      34942
## 25     10  2011    1127477
## 26     11  2011    5934332
## 27     12  2011      70612
## 28     01  2012      89323
## 29     02  2012      61575
## 30     03  2012      68340
## 31     04  2012    1379883
## 32     05  2012     150759
## 33     06  2012      33913
## 34     07  2012      55022
## 35     08  2012     243726
## 36     09  2012     103897
## 37     10  2012     247868
## 38     11  2012      97952
## 39     12  2012     218102
## 40     01  2013     105568
## 41     02  2013     207002
## 42     03  2013     103214
## 43     04  2013      88285
## 44     05  2013     300651
## 45     06  2013      46713
## 46     07  2013     537510
## 47     08  2013    4134829
## 48     09  2013      90631
## 49     10  2013     840429
## 50     11  2013     212471
## 51     12  2013     201293
## 52     01  2014    1440600
## 53     02  2014     497492
## 54     03  2014     160855
## 55     04  2014     440315
## 56     05  2014     475704
## 57     06  2014     252873
## 58     07  2014    1276229
## 59     08  2014    4815065
## 60     09  2014    2153087
## 61     10  2014     451324
## 62     11  2014     325714
## 63     12  2014     213932
## 64     01  2015     175778
## 65     02  2015      98599

Plotting

library(ggplot2)
library(scales)

Plotting with lines

#### People affected by State
# Base plot: one x position per state, people affected on the y axis.
gp <- ggplot(data = BreachesReport,
             aes(x = State, y = Num.people))
gp

# Join the records of each region with a line, coloured by submission year.
gp1 <- gp + geom_line(aes(color = Years, group = Region))
gp1

# Title the colour legend after the variable that is actually mapped to
# colour (Years), so the legend does not claim to show regions.
gp2 <- gp1 + scale_color_discrete(name = "Years")
gp2

# Print y-axis values with thousands separators.
gp3 <- gp2 + scale_y_continuous(labels = comma)
gp3

People affected by State and Region

# Base plot on the per-state totals: state on the x axis, people affected on
# the y axis. Each intermediate object below is printed after it is built.
gp01 <- ggplot(data = agg3, 
             aes(x = State, y = Num.People))
gp01

# One line per region, coloured by region (reads the Region column of agg3).
gp10 <- gp01 + geom_line(aes(color = Region, group = Region))
gp10

# Title the colour legend.
gp20 <- gp10 + scale_color_discrete(name = "Region")
gp20

# Print y-axis values with thousands separators.
gp30 <- gp20 + scale_y_continuous(labels = comma)
gp30

# Add the plot title and axis labels.
gp40 <- gp30 + labs(title = "Cybersecurity Breaches", 
                  x = "State", y = "Number of People affected ")
gp40

People affected by Month and Year

# Base plot on the per-month totals: month on the x axis, people affected on
# the y axis. Each intermediate object below is printed after it is built.
gp02 <- ggplot(data = agg4,
               aes(x = Months, y = Num.People))
gp02

# One line per year, coloured by year.
gp11 <- gp02 + geom_line(aes(color = Years, group = Years))
gp11

# Title the colour legend.
gp21 <- gp11 + scale_color_discrete(name = "Years")
gp21

# Print y-axis values with thousands separators, using the same `comma`
# labeller as the other plots in this report.
gp31 <- gp21 + scale_y_continuous(labels = comma)
gp31

# Add the plot title and axis labels.
gp41 <- gp31 + labs(title = "Cybersecurity Breaches in Health Care",
                    x = "Months", y = "Number of People affected ")
gp41