# It seems that not many scripts have looked at suicide and attempted suicide in this data, so I did an exploratory
# analysis and some visualization of it. There are some pretty interesting patterns in this data.
library(lubridate)
library(ggplot2)
library(ggmap)
# Load the raw incident records. Strings are kept as character (not factor)
# so the description text can be pattern-matched below.
train <- read.csv("train.csv", header = TRUE, stringsAsFactors = FALSE)

# Keep only the rows categorised as SUICIDE, and only columns 1, 3, 4, 8, 9.
# The rest of the script treats the first retained column as the timestamp,
# the second as the free-text description, and uses the columns named X / Y
# as plot coordinates.
suicide <- train[which(train$Category == "SUICIDE"), c(1, 3, 4, 8, 9)]

# Label every case as "Attempted" or "Succeeded" depending on whether its
# description mentions "ATTEMPTED".
# A logical mask from grepl() is used rather than negative grep() indices:
# x[-grep(...)] selects *nothing* when grep() finds no match, which would
# silently leave every row as NA instead of marking them all "Succeeded".
is_attempted <- grepl("ATTEMPTED", suicide[, 2])
suicide[, "type"] <- ifelse(is_attempted, "Attempted", "Succeeded")
# Build the time variables.
# Column 1 is parsed in place into a UTC date-time, then the year, month,
# minute and hour are extracted from it (in that column order) as numbers.
suicide[[1]] <- parse_date_time(
  suicide[[1]],
  orders = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
suicide[c("year", "month", "minute", "hour")] <- lapply(
  c("%Y", "%m", "%M", "%H"),
  function(fmt) as.numeric(format(suicide[[1]], fmt))
)
# Q0: Do most people kill themselves by jumping off the bridge?
# A: Nope, or it might be the sampling error, the bridge area data might not be fully collected.
# Fetch a black-and-white base map centred on San Francisco at zoom level 12.
map <- get_map(
  location = "sanfrancisco",
  zoom = 12,
  source = "osm",
  color = "bw"
)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=sanfrancisco&zoom=12&size=640x640&scale=2&maptype=terrain&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=sanfrancisco&sensor=false
# Overlay one point per case at its X / Y coordinates, coloured by outcome.
ggmap(map) +
  geom_point(
    data = suicide,
    mapping = aes(x = X, y = Y, colour = as.factor(type))
  ) +
  scale_colour_manual(name = "Type", values = c("#11c2d7", "#9f0303")) +
  ggtitle("Suicide Location")

# Q1: Is there a pattern of suicide cases during a day cycle?
# A: well, it seems that most deadly suicidal attempts occurred/were reported at
# noon (12:00), but unsuccessful attempts peaked during night time.
# Density of the hour of day at which cases were recorded, one curve per
# outcome type.
ggplot(suicide) +
  geom_density(aes(x = hour, color = as.factor(type))) +
  # `hour` is extracted with the "%H" format, which runs from 00 to 23, so
  # the axis is ticked at 0..23. (Breaks of 1:24 would drop the midnight
  # tick and request an hour 24 that never occurs in the data.)
  scale_x_continuous(breaks = 0:23) +
  theme_bw() +
  # Drop the panel frame and draw plain black axis lines instead.
  theme(
    panel.border = element_blank(),
    axis.line = element_line(color = "black")
  ) +
  ggtitle("Suicide by Hours in a Day") +
  scale_colour_manual(name = "Type", values = c("#11c2d7", "#9f0303"))

# Q2: Is there a pattern of suicide cases during an hour cycle?
# A: I think there is a data collection bias here. Most cases were recorded at the 0- or 30-minute mark.
# Count of cases by the minute of the hour at which they were recorded,
# with the two outcome types drawn as side-by-side bars.
ggplot(suicide) +
  geom_bar(position = "dodge", aes(x = minute, fill = as.factor(type))) +
  theme_bw() +
  # Drop the panel frame and draw plain black axis lines instead.
  theme(
    panel.border = element_blank(),
    axis.line = element_line(color = "black")
  ) +
  ggtitle("suicide by minutes in an hour") +
  # The bars are mapped to `fill`, so the palette and legend title must be
  # set on the fill scale; a colour scale would be ignored here and the
  # bars would fall back to the default palette.
  scale_fill_manual(name = "Type", values = c("#11c2d7", "#9f0303"))

# Q3: What about patterns for months in a year, aka, a seasonal effect?
# A: Attempted suicides occurred mostly in May and November, which is kind of interesting.
# Maybe it is the seasonal transition that made people blue?
# Count of cases per calendar month (month treated as a discrete category),
# shown as one panel per outcome type laid out side by side.
ggplot(suicide, aes(x = as.factor(month))) +
  geom_bar() +
  facet_grid(. ~ type)
