Breach Type
(DISC) - Sensitive information posted publicly on a website, mishandled or sent to the wrong party via email, fax or mail.
(HACK) - Electronic entry by an outside party, malware and spyware.
(CARD) - Fraud involving debit and credit cards that is not accomplished via hacking. For example, skimming devices at point-of-service terminals.
(INSD) - Someone with legitimate access intentionally breaches information - such as an employee or contractor.
(PHYS) - Lost, discarded or stolen non-electronic records, such as paper documents.
(PORT) - Lost, discarded or stolen laptop, PDA, smartphone, portable memory device, CD, hard drive, data tape, etc.
(STAT) - Lost, discarded or stolen stationary electronic device such as a computer or server not designed for mobility.
(UNKN) - Unknown.
Org Class
BSO : Businesses => Other
BSF : Businesses => Financial and Insurance Services
BSR : Businesses => Retail/Merchant
EDU : Educational Institutions
GOV : Government and Military
MED : Healthcare => Medical Providers
NGO : Nonprofit Organizations
(Note: I substituted some of the Org Class values in the source data to make the graphs easier to read.)
# Load the breach records; strings stay as plain character vectors so that
# each column is converted explicitly below.
breaches <- read.csv("~/data/prc-2013.csv", stringsAsFactors = FALSE)
# Parse Date using the day / abbreviated-month / two-digit-year format and
# turn the categorical columns into factors, all in one pass.
breaches <- transform(
  breaches,
  Date = as.Date(Date, format = "%d-%b-%y"),
  Org.Class = factor(Org.Class),
  Loss.Type = factor(Loss.Type),
  State = factor(State),
  Year = factor(Year)
)
# Show the distinct State values present in the data.
levels(breaches$State)
## [1] "" "Alabama" "Alaska"
## [4] "Arizona" "Arkansas" "California"
## [7] "Colorado" "Connecticut" "Delaware"
## [10] "District Of Columbia" "Florida" "Georgia"
## [13] "Hawaii" "Idaho" "Illinois"
## [16] "Indiana" "Iowa" "Kansas"
## [19] "Kentucky" "London City of" "Louisiana"
## [22] "Maine" "Maryland" "Massachusetts"
## [25] "Michigan" "Minnesota" "Mississippi"
## [28] "Missouri" "Montana" "Nebraska"
## [31] "Nevada" "New Hampshire" "New Jersey"
## [34] "New Mexico" "New York" "Noord Holland"
## [37] "North Carolina" "North Dakota" "Ohio"
## [40] "Oklahoma" "Oregon" "Pennsylvania"
## [43] "Puerto Rico" "Rhode Island" "South Carolina"
## [46] "South Dakota" "Tennessee" "Texas"
## [49] "Utah" "Vermont" "Virginia"
## [52] "Washington" "West Virginia" "Wisconsin"
## [55] "Wyoming"
That's a bit of a mess: there is a blank entry (missing data) and two non-US entries ("London City of" and "Noord Holland"). Let's just look at the US.
# Keep only US records: drop rows whose State is blank (missing) or is one of
# the two non-US entries seen in the levels listing.
breaches.us <- breaches[
  !breaches$State %in% c("", "London City of", "Noord Holland"),
]
# Subsetting keeps the excluded values as empty factor levels; re-factor so
# later per-State summaries do not carry them along.
breaches.us$State <- factor(breaches.us$State)

# Tabulate breaches per year by organisation class, by loss type, and by both.
# NOTE(review): count() is not base R -- presumably plyr::count, which yields
# the `freq` column used below; confirm the package is loaded earlier.
by.year.org <- count(breaches.us, c("Year", "Org.Class"))
by.year.type <- count(breaches.us, c("Year", "Loss.Type"))
by.both <- count(breaches.us, c("Year", "Org.Class", "Loss.Type"))

# Neither GOV nor NGO has CARD breaches in the data, so add zero-count rows
# for those two combinations across every year; this keeps the facet grid
# rectangular.
yrs <- range(as.numeric(levels(by.both$Year)))
yrs <- as.character(seq(yrs[1], yrs[2]))
by.both <- rbind(
  by.both,
  data.frame(
    Year = rep(yrs, times = 2),
    Org.Class = rep(c("GOV", "NGO"), each = length(yrs)),
    Loss.Type = "CARD",
    freq = 0
  )
)
# rbind() appends any year that was absent from the counted data as a new
# factor level at the END, which would put it out of order on the x axis.
# Re-level chronologically; union() keeps any unexpected pre-existing level
# instead of turning it into NA.
by.both$Year <- factor(by.both$Year, levels = union(yrs, levels(by.both$Year)))
# Breaches per year, one panel per (organisation class, loss type) pair.
# Bars use the precomputed `freq` values as-is and are coloured by
# organisation class; the legend is hidden and the x-axis labels are
# rotated 90 degrees.
gg <- ggplot(by.both, aes(x = Year, y = freq)) +
  geom_bar(aes(fill = Org.Class), stat = "identity") +
  facet_wrap(~Org.Class + Loss.Type) +
  labs(x = "", y = "# Breaches") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 1)
  )
gg
# Breaches per year, one panel per loss type. Bars use the precomputed
# `freq` values as-is and are coloured by loss type; the legend is hidden
# and the x-axis labels are rotated 90 degrees.
gg <- ggplot(by.year.type, aes(x = Year, y = freq)) +
  geom_bar(aes(fill = Loss.Type), stat = "identity") +
  facet_wrap(~Loss.Type) +
  labs(x = "", y = "# Breaches") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 1)
  )
gg
# Breaches per year, one panel per organisation class. Bars use the
# precomputed `freq` values as-is and are coloured by organisation class;
# the legend is hidden and the x-axis labels are rotated 90 degrees.
gg <- ggplot(by.year.org, aes(x = Year, y = freq)) +
  geom_bar(aes(fill = Org.Class), stat = "identity") +
  facet_wrap(~Org.Class) +
  labs(x = "", y = "# Breaches") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 1)
  )
gg