這個分析檢查了2012-2014年美國的槍殺率,數據來源是CDC(美國疾病管制與預防中心)。
這裡可以看到數據。
guns <- data.table::fread("/Users/yuzhe/Downloads/guns.csv", header = T, drop = 1)
head(guns)
## year month intent police sex age race
## 1: 2012 01 Suicide 0 M 34 Asian/Pacific Islander
## 2: 2012 01 Suicide 0 F 21 White
## 3: 2012 01 Suicide 0 M 60 White
## 4: 2012 02 Suicide 0 M 64 White
## 5: 2012 02 Suicide 0 M 31 White
## 6: 2012 02 Suicide 0 M 17 Native American/Native Alaskan
## hispanic place education
## 1: 100 Home 4
## 2: 100 Street 3
## 3: 100 Other specified 4
## 4: 100 Home 4
## 5: 100 Other specified 2
## 6: 100 Home 1
guns <- na.omit(guns)
# The purpose of *10000 and the *100 are to convert 2012, 01, 01 into 20120101 for readability
guns$year <- as.numeric(guns$year)
guns$month <- as.numeric(guns$month)
guns$date <- (guns$year*10000 + guns$month*100 + 1) %>% anydate()
monthly_rates <- aggregate(x = guns$date, by = list(month = guns$date), length)
ggplot(monthly_rates[1:12, ], aes(x = month, y = x, color = "steelblue")) +
geom_line() +
ylim(0, 3500) +
labs(title = "2012 Gun Deaths Line Chart") +
xlab("Monthly Gun Death Count in the US, 2014") +
ylab("Gun Deaths Count")
ggplot(monthly_rates[13:24, ], aes(x = month, y = x, color = "steelblue")) +
geom_line() +
ylim(0, 3500) +
labs(title = "2013 Gun Deaths Line Chart") +
xlab("Monthly Gun Death Count in the US, 2013") +
ylab("Gun Deaths Count")
ggplot(monthly_rates[25:36, ], aes(x = month, y = x, color = "steelblue")) +
geom_line() +
ylim(0, 3500) +
labs(title = "2014 Gun Deaths Line Chart") +
xlab("Monthly Gun Death Count in the US, 2014") +
ylab("Gun Deaths Count")
intent_sex <- aggregate(guns$sex, by = list(intent = guns$intent, sex = guns$sex), length)
ggplot(intent_sex, aes(x = intent, y = x, fill = sex)) +
geom_bar(stat = "identity", position = "stack") +
ylab("Count") +
ggtitle("Gender distribution\nGun Deaths US: 2012-2014")
guns$education = as.factor(guns$education)
intent_edu <- aggregate(guns$education, by = list(intent = guns$intent, education = guns$education), length)
ggplot(intent_edu, aes(x = intent, y = x, fill = education)) +
geom_bar(stat = "identity", position = "stack") +
coord_flip() +
xlab("Count") +
ylab("Intent") +
ggtitle("Education distribution\nin Gun Deaths US: 2012-2014")
intent_place <- aggregate(guns$intent, by = list(place = guns$place, intent = guns$intent), length)
ggplot(intent_place, aes(x = intent, y = x, fill = place)) +
geom_bar(stat = "identity", position = "stack") +
coord_flip() +
xlab("Count") +
ylab("Intent") +
ggtitle("Location distribution\nin Gun Deaths US: 2012-2014")
將Trade/service area和Industrial/construction視為Street,並把剩下除了Home和Street設為Other。
#These are too many categories and it's hard to arrive to conclusions
# let's merge 'street' with 'trade/service area' and the rest to 'Other'
guns[which(guns$place == "Trade/service area" | guns$place == "Industrial/construction"), "place"] = "Street"
guns[which(guns$place != "Street" & guns$place != "Home"), "place"] = "Other"
intent_place <- aggregate(guns$intent, by = list(place = guns$place, intent = guns$intent), length)
ggplot(intent_place, aes(x = intent, y = x, fill = place)) +
geom_bar(stat = "identity", position = "stack") +
coord_flip() +
xlab("Count") +
ylab("Intent") +
ggtitle("Location distribution\nin Gun Deaths US: 2012-2014")
place_died <- guns %>%
group_by(place, intent) %>%
tally() %>%
group_by(place) %>%
mutate(pct = n / sum(n))
ggplot(place_died, aes(factor(place), pct, fill = intent)) +
geom_bar(stat = "identity", color = "grey40") +
xlab("Place") +
ylab("Percentage")+
labs(fill = "Intent")
ggplot(intent_sex, aes(x = sex, y = x, fill = intent)) +
geom_bar(stat = "identity", position = "dodge") +
xlab("Gender") +
ylab("Freqency") +
ggtitle("Gender Distribution by Intent")
ggplot(intent_edu, aes(x = education, y = x, fill = intent)) +
geom_bar(stat = "identity", position = "dodge") +
ggtitle("Education Distribution by Intent")
ggplot(guns, aes(x = age)) +
geom_histogram(breaks = seq(0, 100, 10), fill="lightblue", color="lightyellow")
suicide = guns[guns$intent == 'Suicide', ]
homicide = guns[guns$intent == 'Homicide', ]
p1 <- ggplot(suicide, aes(x = age))+
geom_histogram(breaks = seq(0, 100, 10), color="lightyellow", fill="lightskyblue2") +
ggtitle("Suicide gun deaths\nAge Distribution")
p2 <- ggplot(homicide, aes(x = age))+
geom_histogram(breaks = seq(0, 100, 10), color="lightyellow", fill="lightskyblue2") +
ggtitle("Homicide gun deaths\nAge Distribution")
multiplot(p1, p2, cols = 2)
guns_box <- guns[which(guns$intent != "Undetermined"),]
qplot(factor(intent), age, data = guns_box, geom = "boxplot", xlab = "Intent")
guns_box <- guns[which(guns$intent != "Undetermined"), ]
qplot(intent, age, fill = sex, data = guns_box, geom = "boxplot", xlab = "Intent")