require(dplyr)
require(Hmisc)
require(ggplot2)
require(reshape2)
guns <- data.table::fread("/Users/yuzhe/Downloads/guns.csv", header = T, drop = 1)
head(guns)
## year month intent police sex age race
## 1: 2012 01 Suicide 0 M 34 Asian/Pacific Islander
## 2: 2012 01 Suicide 0 F 21 White
## 3: 2012 01 Suicide 0 M 60 White
## 4: 2012 02 Suicide 0 M 64 White
## 5: 2012 02 Suicide 0 M 31 White
## 6: 2012 02 Suicide 0 M 17 Native American/Native Alaskan
## hispanic place education
## 1: 100 Home 4
## 2: 100 Street 3
## 3: 100 Other specified 4
## 4: 100 Home 4
## 5: 100 Other specified 2
## 6: 100 Home 1
guns <- na.omit(guns)
我們有將近101,000筆數據(槍殺死亡事件)和10個欄位(屬性)
以下將介紹各個欄位代表的意義:
str(guns)
## Classes 'data.table' and 'data.frame': 99343 obs. of 10 variables:
## $ year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ month : chr "01" "01" "01" "02" ...
## $ intent : chr "Suicide" "Suicide" "Suicide" "Suicide" ...
## $ police : int 0 0 0 0 0 0 0 0 0 0 ...
## $ sex : chr "M" "F" "M" "M" ...
## $ age : int 34 21 60 64 31 17 48 41 50 30 ...
## $ race : chr "Asian/Pacific Islander" "White" "White" "White" ...
## $ hispanic : int 100 100 100 100 100 100 100 100 100 100 ...
## $ place : chr "Home" "Street" "Other specified" "Home" ...
## $ education: int 4 3 4 4 2 1 2 2 3 3 ...
## - attr(*, ".internal.selfref")=<externalptr>
Incompeteness/ Completeness(不完整性/完整性):為了瞭解哪些欄位(columns)是有用或無用的,去檢查有多少數據為NaN是重要的。
# In order to count NA values in guns
sapply(guns, function(x) sum(is.na(x)))
## year month intent police sex age race
## 0 0 0 0 0 0 0
## hispanic place education
## 0 0 0
# In order to see the percentage of valid data:
sapply(guns, function(x) (sum(!is.na(x))/length(x))*100)
## year month intent police sex age race
## 100 100 100 100 100 100 100
## hispanic place education
## 100 100 100
看來大多數列至少有98.6%的值,這意味著這數據接近完成(不像現實世界的數據)。
這意味著我們可能可以刪除所有具有NaN值的行,但仍然不能忽略我們的潛在見解。
讓我們嘗試用真實價值替換盡可能多的NaN,然後刪除我們無法填充的列。
# Organizing the data by a column value: first by the year, then by month:
head(guns[order(guns$year, guns$month), ], 10)
## year month intent police sex age race
## 1: 2012 01 Suicide 0 M 34 Asian/Pacific Islander
## 2: 2012 01 Suicide 0 F 21 White
## 3: 2012 01 Suicide 0 M 60 White
## 4: 2012 01 Suicide 0 M 21 Native American/Native Alaskan
## 5: 2012 01 Suicide 0 F 59 White
## 6: 2012 01 Suicide 0 F 30 White
## 7: 2012 01 Homicide 0 M 58 Black
## 8: 2012 01 Suicide 0 M 78 White
## 9: 2012 01 Suicide 0 M 60 White
## 10: 2012 01 Accidental 0 M 61 White
## hispanic place education
## 1: 100 Home 4
## 2: 100 Street 3
## 3: 100 Other specified 4
## 4: 100 Home 2
## 5: 100 Home 2
## 6: 100 Other unspecified 4
## 7: 100 Home 1
## 8: 100 Home 4
## 9: 100 Other unspecified 1
## 10: 100 Home 2
對我來說,最有趣的是intent這個欄位,也是我想預測的。
intent_table <- table(guns$intent);intent_table
##
## Accidental Homicide Suicide Undetermined
## 1625 33750 63162 806
有趣的是,在美國自殺並不是最常見的槍殺方式,但在2012-2014年卻佔了槍殺方式的2/3的數量。
Hmisc::describe(guns[, c("age", "education")])
## guns[, c("age", "education")]
##
## 2 Variables 99343 Observations
## ---------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 99343 0 104 1 43.98 22.24 18 21
## .25 .50 .75 .90 .95
## 27 42 58 72 80
##
## lowest : 0 1 2 3 4, highest: 99 100 101 102 107
## ---------------------------------------------------------------------------
## education
## n missing distinct Info Mean Gmd
## 99343 0 5 0.901 2.299 1.07
##
## Value 1 2 3 4 5
## Frequency 21448 42258 21430 12879 1328
## Proportion 0.216 0.425 0.216 0.130 0.013
## ---------------------------------------------------------------------------
清洗 education欄位:注意到最低年齡是0歲,讓我們來看看有多少槍事件導致16歲以下的兒童死亡:
nrow(guns[which(guns$age < 16), ])
## [1] 1791
在education欄位裡轉換NaN值:因為只有11個NaN的列,大可把它刪除,但為了練習,我們試著來填充真實的數據。 如果這個比例較大(比如說大於5%),並且你正在使用機器學習,則你可能需要找出一種以某種方式填寫訊息的方法。 請注意,在education很多這些孩子有NaN或值等於5.0(=不可用)。 假設所有16歲以下的孩子都受過教育1.0(=不到高中),並在對應欄位填寫這些數據:
將資料視覺化之後更容易看出,但首先我們先了解數據中性別的分佈狀況。
table(guns$sex)
##
## F M
## 14364 84979
# We can use this column to split the percentage of Male/Female death cases
Question: 比較每年的死亡人數,是上升還是下降?
table(guns$year)
##
## 2012 2013 2014
## 33072 33146 33125
Answer:似乎它保持不變。通過查看這些值,這一變化看起來並不顯著(<0.3%)。
Question: 哪幾個月死亡人數比其他的多得多?
guns[order(guns$year) & order(guns$month), ] %>%
head()
## year month intent police sex age race
## 1: 2012 01 Suicide 0 M 34 Asian/Pacific Islander
## 2: 2012 01 Suicide 0 F 21 White
## 3: 2012 01 Suicide 0 M 60 White
## 4: 2012 02 Suicide 0 M 64 White
## 5: 2012 02 Suicide 0 M 31 White
## 6: 2012 02 Suicide 0 M 17 Native American/Native Alaskan
## hispanic place education
## 1: 100 Home 4
## 2: 100 Street 3
## 3: 100 Other specified 4
## 4: 100 Home 4
## 5: 100 Other specified 2
## 6: 100 Home 1
data object指得是年、月、日。我們不能在沒有日期的情況下創建datetime object,但為了實際用途,我們把兩個日期的值(年、月)組合再一起並把日期時間設置為1。
require(anytime)
## Loading required package: anytime
# The purpose of *10000 and the *100 are to convert 2012, 01, 01 into 20120101 for readability
guns$year <- as.numeric(guns$year)
guns$month <- as.numeric(guns$month)
guns$date <- (guns$year*10000 + guns$month*100 + 1) %>%
anydate()
monthly_rates <- aggregate(x = guns$date, by = list(month = guns$date), length);monthly_rates
## month x
## 1 2012-01-01 2720
## 2 2012-02-01 2321
## 3 2012-03-01 2703
## 4 2012-04-01 2759
## 5 2012-05-01 2967
## 6 2012-06-01 2775
## 7 2012-07-01 2965
## 8 2012-08-01 2909
## 9 2012-09-01 2812
## 10 2012-10-01 2701
## 11 2012-11-01 2689
## 12 2012-12-01 2751
## 13 2013-01-01 2816
## 14 2013-02-01 2347
## 15 2013-03-01 2820
## 16 2013-04-01 2754
## 17 2013-05-01 2755
## 18 2013-06-01 2876
## 19 2013-07-01 3041
## 20 2013-08-01 2816
## 21 2013-09-01 2705
## 22 2013-10-01 2762
## 23 2013-11-01 2722
## 24 2013-12-01 2732
## 25 2014-01-01 2617
## 26 2014-02-01 2338
## 27 2014-03-01 2644
## 28 2014-04-01 2815
## 29 2014-05-01 2816
## 30 2014-06-01 2881
## 31 2014-07-01 2841
## 32 2014-08-01 2918
## 33 2014-09-01 2881
## 34 2014-10-01 2833
## 35 2014-11-01 2719
## 36 2014-12-01 2822
Question: 有多少比例的警察涉及槍殺案件?
prop.table(table(guns$police))
##
## 0 1
## 0.9998087434 0.0001912566
有98%的比例警察沒有涉及槍殺案,所以基本上這欄位變數可以不考慮了。 因為這種分布不額外給我們資訊。
guns <- guns[, -("police")]
head(guns)
## year month intent sex age race hispanic
## 1: 2012 1 Suicide M 34 Asian/Pacific Islander 100
## 2: 2012 1 Suicide F 21 White 100
## 3: 2012 1 Suicide M 60 White 100
## 4: 2012 2 Suicide M 64 White 100
## 5: 2012 2 Suicide M 31 White 100
## 6: 2012 2 Suicide M 17 Native American/Native Alaskan 100
## place education date
## 1: Home 4 2012-01-01
## 2: Street 3 2012-01-01
## 3: Other specified 4 2012-01-01
## 4: Home 4 2012-02-01
## 5: Other specified 2 2012-02-01
## 6: Home 1 2012-02-01
我們可以快速在我們資料視覺化後找到一種模式。
但是,我們快速處理符號值(如數字和文字)的能力非常差。
資料視覺化著重於從表格視覺數據轉換數據。
透過視覺畫的圖形,我們可能會找到有一個良好的相關性,再來轉到分析的預測部分。
當相鄰點之間存在邏輯關聯時,折線圖的效果最佳。
在處理時間時,折線圖視覺化最適合的選擇。
因為rows具有自然排序,每個rows都反映了有關上一行之後發生的事件之訊息。
為了強調折線圖的直觀表示如何幫助我們輕鬆觀察趨勢,讓我們以2012年到2014年底的36個數據點作為折線圖。
為了創建2012年的失業(unemployment)數據折線圖,我們需要:
X軸: 2012-01-01~2014-12-01
Y軸: y軸的範圍從2357到3079(對應於最小和最大死亡事件值)我們不必指定值。我們傳入x值列表作為第一個參數,y值列表作為plot()的第二個參數。
讓我們開始畫起年度的圖表:
ggplot(monthly_rates[1:12, ], aes(x = month, y = x, color = "steelblue")) +
geom_line() +
labs(title = "2012 Gun Deaths Line Chart") +
xlab("Monthly Gun Death Count in the US, 2014") +
ylab("Gun Deaths Count")
這折線圖的原點並非為0,所以要畫一個原點為0的圖。
#2012
ggplot(monthly_rates[1:12, ], aes(x = month, y = x, color = "steelblue")) +
geom_line() +
ylim(0, 3500) +
labs(title = "2012 Gun Deaths Line Chart") +
xlab("Monthly Gun Death Count in the US, 2014") +
ylab("Gun Deaths Count")
#2013
ggplot(monthly_rates[13:24, ], aes(x = month, y = x, color = "steelblue")) +
geom_line() +
ylim(0, 3500) +
labs(title = "2013 Gun Deaths Line Chart") +
xlab("Monthly Gun Death Count in the US, 2013") +
ylab("Gun Deaths Count")
#2014
ggplot(monthly_rates[25:36, ], aes(x = month, y = x, color = "steelblue")) +
geom_line() +
ylim(0, 3500) +
labs(title = "2014 Gun Deaths Line Chart") +
xlab("Monthly Gun Death Count in the US, 2014") +
ylab("Gun Deaths Count")
When we need visualization that scales graphical objects to the quantitative values we’re interested in comparing - we can use a bar plot.
Let’s look at the ‘intent’ division with inner gender [‘sex’] division.
intent_sex <- aggregate(guns$sex, by = list(intent = guns$intent, sex = guns$sex), length)
ggplot(intent_sex, aes(x = intent, y = x, fill = sex)) +
geom_bar(stat = "identity", position = "stack")
男性發生意外的數量是遠大於女性的,並從這張圖去我們可推斷,從Accidental中去學習是很困難的,因為發生Accidental得數量很少。
We can look at a similar split to get a sence of the education of the victims:
guns$education = as.factor(guns$education)
intent_edu <- aggregate(guns$education, by = list(intent = guns$intent, education = guns$education), length)
ggplot(intent_edu, aes(x = intent, y = x, fill = education)) +
geom_bar(stat = "identity", position = "stack")
但這看起來太擁擠了,換成水平的來看!
ggplot(intent_edu, aes(x = intent, y = x, fill = education)) +
geom_bar(stat = "identity", position = "stack") +
coord_flip() +
xlab("Count") +
ylab("Intent") +
ggtitle("Education distribution\nin Gun Deaths US: 2012-2014")
####重畫
education <- guns %>%
na.omit() %>%
group_by(education, intent) %>%
tally() %>%
group_by(education) %>%
mutate(pct = n / sum(n))
ggplot(education, aes(factor(education), pct, fill = intent))+
geom_bar(stat = "identity", color = "grey40") +
labs(fill = "Intent")
將資料視覺化之後,我們很容易看到受過大學教育的人,Suicide(自殺)的比例是遠高於Homicide(他殺)的。 The dark blues are more than a third of the Suicide cases, but only about a quarter of the Homicide cases. This is indication that ‘education’ could be a helpful variable for our ‘intent’ prediction.
Again, the Accidental bar does not seem to give us any additional info. It may be useless for our prediction, and we’re better off going with binary: ‘Suicide’ vs. ‘Homicide’.
接下來畫畫看發生事件地區的分佈:
intent_place <- aggregate(guns$intent, by = list(place = guns$place, intent = guns$intent), length)
ggplot(intent_place, aes(x = intent, y = x, fill = place)) +
geom_bar(stat = "identity", position = "stack") +
coord_flip() +
xlab("Count") +
ylab("Intent") +
ggtitle("Location distribution\nin Gun Deaths US: 2012-2014")
We can see that most cases of Suicide happen at home. This means that location could be an important variable when we want to predict intent. The Accidental column, again, seems too dense to give us any valuble info.
It’s almost impossible to conclude things about this visualization since there are a lot of values (i.e colors), and this distribution isn’t that useful for us. Let’s make it better by merging some of the values.
temp_place <- table(guns$place) %>% data.frame()
temp_place[order(temp_place$Freq, decreasing = T), ]
## Var1 Freq
## 2 Home 60443
## 4 Other specified 13739
## 9 Street 11148
## 5 Other unspecified 8857
## 10 Trade/service area 3439
## 7 School/instiution 670
## 1 Farm 468
## 3 Industrial/construction 248
## 6 Residential institution 203
## 8 Sports 128
我們不確定Other specified和Other unspecified能給我們任何資訊。
我們可以把這整個欄位刪去或是選擇Home、Street和other做為我們的3個值。
#These are too many categories and it's hard to arrive to conclusions
# let's merge 'street' with 'trade/service area' and the rest to 'Other'
guns[which(guns$place == "Trade/service area" | guns$place == "Industrial/construction"), "place"] = "Street"
guns[which(guns$place != "Street" & guns$place != "Home"), "place"] = "Other"
intent_place <- aggregate(guns$intent, by = list(place = guns$place, intent = guns$intent), length)
ggplot(intent_place, aes(x = intent, y = x, fill = place)) +
geom_bar(stat = "identity", position = "stack") +
coord_flip() +
xlab("Count") +
ylab("Intent") +
ggtitle("Location distribution\nin Gun Deaths US: 2012-2014")
place_died <- guns %>%
group_by(place, intent) %>%
tally() %>%
group_by(place) %>%
mutate(pct = n / sum(n))
ggplot(place_died, aes(factor(place), pct, fill = intent)) +
geom_bar(stat = "identity", color = "grey40") +
xlab("Place") +
ylab("Percentage")+
labs(fill = "Intent")
這看起來不會很密集,且更容易了解我們的data。不是很意外Suicide(自殺)在街上的人比較少,大多是人在家裡。
而Homicide(他殺)則在三者分布相當平均。
ggplot(intent_sex, aes(x = sex, y = x, fill = intent)) +
geom_bar(stat = "identity", position = "dodge") +
xlab("Gender") +
ylab("Freqency") +
ggtitle("Gender Distribution by Intent")
ggplot(intent_edu, aes(x = education, y = x, fill = intent)) +
geom_bar(stat = "identity", position = "dodge") +
ggtitle("Education Distribution by Intent")
Let’s use a stacked barplot to look at the percentage intent by place of death.
ggplot(guns, aes(x = age)) +
geom_histogram(breaks = seq(0, 100, 10))
ggplot objects can be passed in …, or to plotlist (as a list of ggplot objects)
If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), then plot 1 will go in the upper left, 2 will go in the upper right, and 3 will go all the way across the bottom.
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
suicide = guns[guns$intent == 'Suicide', ]
homicide = guns[guns$intent == 'Homicide', ]
p1 <- ggplot(suicide, aes(x = age))+
geom_histogram(breaks = seq(0, 100, 10), color="lightyellow", fill="lightskyblue2") +
ggtitle("Suicide gun deaths\nAge Distribution")
p2 <- ggplot(homicide, aes(x = age))+
geom_histogram(breaks = seq(0, 100, 10), color="lightyellow", fill="lightskyblue2") +
ggtitle("Homicide gun deaths\nAge Distribution")
multiplot(p1, p2, cols = 2)
通過觀察,我們可以看到,大多數Homicide案件死亡發生在20-21歲左右,而大多數自殺案件是在55-58歲之間(20歲左右還有一個明顯的高峰)。
如果我們想要更準確的數字 - 我們可以看看這些變量的平均值和中位數。
再者,由於我們的Accident槍死事件非常少,所以很難從這部分數據中推斷出任何事情。
guns_box <- guns[which(guns$intent != "Undetermined"),]
qplot(factor(intent), age, data = guns_box, geom = "boxplot", xlab = "Intent")
guns_box <- guns[which(guns$intent != "Undetermined"), ]
qplot(intent, age, fill = sex, data = guns_box, geom = "boxplot", xlab = "Intent")
再次看到 Accidental 仍是給不出額外的訊息。
我們可以看出女性Suicide的年齡變異比男性還大,然而在Homicide中女性的年齡變異較小。