1. 請使用R 實作, 並以“.R”檔繳交可運行之程式碼,以做為評分參考
  2. 本期末考為Open Book, Open Computer 測驗,除抄襲其他人外, 所有公開資源都可以參考
  3. 考試時間為一星期 (2018/09/03 12:00:00 ~ 2018/09/11 23:59:59)
  4. 實作完期末考者,可以將.R 檔寄至 david@largitdata.com , 並在信件主旨上標注[繳交CDC R語言期末考]以做後續評分
  5. 考試以60分為標準, 超過60分方算及格

基礎 R 語言 與資料視覺化 (12題共60分)

從疫情中心的開放資料網站蒐集到「登革熱近12個月每日確定病例統計」,資料如下:

library(readr)
dataset <- read_csv('https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/Dengue_Daily_last12m.csv')
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   發病日 = col_date(format = ""),
##   個案研判日 = col_date(format = ""),
##   通報日 = col_date(format = ""),
##   確定病例數 = col_integer()
## )
## See spec(...) for full column specifications.
str(dataset)
## Classes 'tbl_df', 'tbl' and 'data.frame':    365 obs. of  26 variables:
##  $ 發病日            : Date, format: "2017-09-01" "2017-09-03" ...
##  $ 個案研判日        : Date, format: "2017-09-05" "2017-09-09" ...
##  $ 通報日            : Date, format: "2017-09-05" "2017-09-08" ...
##  $ 性別              : chr  "男" "男" "女" "男" ...
##  $ 年齡層            : chr  "40-44" "30-34" "15-19" "35-39" ...
##  $ 居住縣市          : chr  "新北市" "新北市" "桃園市" "台北市" ...
##  $ 居住鄉鎮          : chr  "新店區" "永和區" "大園區" "大安區" ...
##  $ 居住村里          : chr  "大同里" "新廍里" "菓林里" "民炤里" ...
##  $ 最小統計區        : chr  "A6506-0261-00" "A6504-0058-00" "None" "A6303-0351-00" ...
##  $ 最小統計區中心點X : chr  "121.541220302" "121.512814046" "None" "121.536889132" ...
##  $ 最小統計區中心點Y : chr  "24.980690477" "25.016252648" "None" "25.035476567" ...
##  $ 一級統計區        : chr  "A6506-13-011" "A6504-05-002" "None" "A6303-35-002" ...
##  $ 二級統計區        : chr  "A6506-13" "A6504-05" "None" "A6303-35" ...
##  $ 感染縣市          : chr  "None" "None" "None" "None" ...
##  $ 感染鄉鎮          : chr  "None" "None" "None" "None" ...
##  $ 感染村里          : chr  "None" "None" "None" "None" ...
##  $ 是否境外移入      : chr  "是" "是" "是" "是" ...
##  $ 感染國家          : chr  "越南" "印度" "越南" "越南" ...
##  $ 確定病例數        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ 居住村里代碼      : chr  "6500600-004" "6500400-045" "6800600-011" "6300300-004" ...
##  $ 感染村里代碼      : chr  "None" "None" "None" "None" ...
##  $ 血清型            : chr  "None" "None" "None" "第四型" ...
##  $ 內政部居住縣市代碼: chr  "65" "65" "68" "63" ...
##  $ 內政部居住鄉鎮代碼: chr  "6500600" "6500400" "6800600" "6300300" ...
##  $ 內政部感染縣市代碼: chr  "None" "None" "None" "None" ...
##  $ 內政部感染鄉鎮代碼: chr  "None" "None" "None" "None" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 26
##   .. ..$ 發病日            :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ 個案研判日        :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ 通報日            :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ 性別              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 年齡層            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 居住縣市          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 居住鄉鎮          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 居住村里          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 最小統計區        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 最小統計區中心點X : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 最小統計區中心點Y : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 一級統計區        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 二級統計區        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 感染縣市          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 感染鄉鎮          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 感染村里          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 是否境外移入      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 感染國家          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 確定病例數        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ 居住村里代碼      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 感染村里代碼      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 血清型            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 內政部居住縣市代碼: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 內政部居住鄉鎮代碼: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 內政部感染縣市代碼: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 內政部感染鄉鎮代碼: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

請試用R 語言回答以下問題:

  1. 請將血清型欄位資料中的None 變成遺失值NA ? (5分)
dataset[dataset$血清型 == 'None', '血清型'] <- NA
  1. 請統計血清型欄位遺失值的數量? (5分)
sum(is.na(dataset$血清型))
## [1] 180
  1. 請將血清型欄位從字串資料型態變更為階層資料型態? (5分)
dataset$血清型 <- factor(dataset$血清型)
  1. 請統計資料中男性與女性的數量? (5分)
table(dataset$性別)
## 
##  女  男 
## 171 194
  1. 繼第4題,請計算男性與女性的比例? (5分)
table(dataset$性別) / nrow(dataset)
## 
##        女        男 
## 0.4684932 0.5315068
  1. 繼第5題,請使用圓餅圖(Pir Chart) 呈現出男性與女性病例數比例 ? (5分)
par(family='STKaiti')
stat    <- table(dataset$性別) / nrow(dataset)
labels  <- paste(names(stat), round(stat * 100,2) )
pie(stat, labels = labels,init.angle = 90, clockwise = FALSE, main = '男性與女性病例數比例')

  1. 請統計在病例資料中,居住於哪個縣市的病患最多? (5分)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
dataset %>% 
  group_by(居住縣市) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(1) %>%
  select(居住縣市)
## # A tibble: 1 x 1
## # Groups:   居住縣市 [1]
##   居住縣市
##   <chr>   
## 1 新北市
  1. 請使用長條圖(Bar Chart) 呈現出病例數最多的前十居住縣市? (5分)
par(family='STKaiti')
stat <- dataset %>% 
  group_by(居住縣市) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(10)

names(stat) <- c("居住縣市","病例數" )
barplot(stat$病例數, names = stat$居住縣市, col = 'blue', main='病例數最多的前十居住縣市',ylab = '病例數', xlab = '居住縣市')

  1. 請統計各居住縣市各性別病例數? (5分)
df <- dataset %>% 
  group_by(居住縣市, 性別) %>%
  count(sort=TRUE) 
names(df) <- c("居住縣市","性別","病例數" )
head(df)
## # A tibble: 6 x 3
## # Groups:   居住縣市, 性別 [6]
##   居住縣市 性別  病例數
##   <chr>    <chr>  <int>
## 1 新北市   男        42
## 2 新北市   女        41
## 3 台中市   女        39
## 4 台北市   男        37
## 5 台北市   女        36
## 6 台中市   男        26
  1. 繼第9題, 請使用長條圖呈現各居住縣市各性別病例數?(5 分)
par(family='STKaiti')
#install.packages("reshape")
library(reshape)
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
pivot_tb <- cast(df, 性別~居住縣市, value = '病例數' )
rownames(pivot_tb) <- pivot_tb[,1]
county_name <- colnames(pivot_tb[,-1])
stat_tb <- as.matrix(pivot_tb[,-1])
barplot(stat_tb, beside = TRUE, names.arg = county_name, legend.text = c('女','男'), col=c('red','blue' ),ylab = '病例數', xlab = '居住縣市')

  1. 請根據發病日,統計每日的病例數量? (5分)
cnt_by_date <- dataset %>% 
  group_by(發病日) %>%
  count() 

names(cnt_by_date) <- c("發病日","病例數" )
head(cnt_by_date)
## # A tibble: 6 x 2
## # Groups:   發病日 [6]
##   發病日     病例數
##   <date>      <int>
## 1 2017-09-01      1
## 2 2017-09-03      2
## 3 2017-09-04      1
## 4 2017-09-05      2
## 5 2017-09-06      2
## 6 2017-09-07      2
  1. 繼第11題,請使用折線圖(Line Chart) 依發病日與病例數繪製病例趨勢圖 ? (5分)
par(family='STKaiti')
plot(cnt_by_date, type= 'l', main = '病例趨勢圖')

統計與機器學習 (8題共40分)

library(readr)
diabetes <- read_csv('https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/diabetes.csv')
## Parsed with column specification:
## cols(
##   Pregnancies = col_integer(),
##   Glucose = col_integer(),
##   BloodPressure = col_integer(),
##   SkinThickness = col_integer(),
##   Insulin = col_integer(),
##   BMI = col_double(),
##   DiabetesPedigreeFunction = col_double(),
##   Age = col_integer(),
##   Outcome = col_integer()
## )
str(diabetes)
## Classes 'tbl_df', 'tbl' and 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 9
##   .. ..$ Pregnancies             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Glucose                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ BloodPressure           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ SkinThickness           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Insulin                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ BMI                     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ DiabetesPedigreeFunction: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Age                     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Outcome                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

有一糖尿病數據集,資料欄位敘述如下:

  1. Number of times pregnant
  2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test
  3. Diastolic blood pressure (mm Hg)
  4. Triceps skin fold thickness (mm)
  5. 2-Hour serum insulin (mu U/ml)
  6. Body mass index (weight in kg/(height in m)^2)
  7. Diabetes pedigree function
  8. Age (years)
  9. Class variable (0 or 1)

請試用R 語言回答以下問題:

  1. 請計算除了 Class Variable 外所有欄位的相關性? (5分)
cor(diabetes[,-9])
##                          Pregnancies    Glucose BloodPressure
## Pregnancies               1.00000000 0.12945867    0.14128198
## Glucose                   0.12945867 1.00000000    0.15258959
## BloodPressure             0.14128198 0.15258959    1.00000000
## SkinThickness            -0.08167177 0.05732789    0.20737054
## Insulin                  -0.07353461 0.33135711    0.08893338
## BMI                       0.01768309 0.22107107    0.28180529
## DiabetesPedigreeFunction -0.03352267 0.13733730    0.04126495
## Age                       0.54434123 0.26351432    0.23952795
##                          SkinThickness     Insulin        BMI
## Pregnancies                -0.08167177 -0.07353461 0.01768309
## Glucose                     0.05732789  0.33135711 0.22107107
## BloodPressure               0.20737054  0.08893338 0.28180529
## SkinThickness               1.00000000  0.43678257 0.39257320
## Insulin                     0.43678257  1.00000000 0.19785906
## BMI                         0.39257320  0.19785906 1.00000000
## DiabetesPedigreeFunction    0.18392757  0.18507093 0.14064695
## Age                        -0.11397026 -0.04216295 0.03624187
##                          DiabetesPedigreeFunction         Age
## Pregnancies                           -0.03352267  0.54434123
## Glucose                                0.13733730  0.26351432
## BloodPressure                          0.04126495  0.23952795
## SkinThickness                          0.18392757 -0.11397026
## Insulin                                0.18507093 -0.04216295
## BMI                                    0.14064695  0.03624187
## DiabetesPedigreeFunction               1.00000000  0.03356131
## Age                                    0.03356131  1.00000000
  1. 繼第13題, 請使用corrplot 繪製相關性圖? (5分)
library(corrplot)
## corrplot 0.84 loaded
corrplot(cor(diabetes[,-9]))

  1. 請將資料分為訓練與測試資料集,其中訓練資料集占80%, 測試資料集占20%? (5分)
diabetes$Outcome <- as.factor(diabetes$Outcome)
set.seed(42)
idx <- sample.int(2, nrow(diabetes), prob = c(0.8,0.2), replace = TRUE)
trainset <- diabetes[idx == 1,]
testset  <- diabetes[idx == 2, ]
  1. 請使用決策樹建立分類模型 (目標 y 為 Class Variable)? (5分)
library(rpart)
fit <- rpart(Outcome ~., data = trainset)
  1. 繼第16題,請計算出模型準確度(Accuracy)? (5分)
predicted <- predict(fit, testset, type = 'class')
sum(predicted == testset$Outcome) / length(testset$Outcome)
## [1] 0.7581699
  1. 繼第17題,請求出混淆矩陣(Confusion Matrix)? (5分)
table(testset$Outcome, predicted)
##    predicted
##      0  1
##   0 82 16
##   1 21 34
  1. 請使用隨機森林建立分類模型 (目標 y 為 Class Variable, 樹的數量為100)? (5分)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
set.seed(42)
fit2 <- randomForest(Outcome ~., data = trainset,ntree=100,importance=T,proximity=T)
predicted2 <- predict(fit2, testset, type = 'class')
sum(predicted2 == testset$Outcome) / length(testset$Outcome)
## [1] 0.8039216
  1. 請繪製隨機森林模型與決策樹的 ROC Curve, 並比較兩者的 AUC? (5 分)
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
prediction_tree <- predict(fit,testset,type="prob")
pred_to_roc_tree <- prediction_tree[,2]
pred_roc_tree <- prediction(pred_to_roc_tree, testset$Outcome)
perf_roc_tree <- performance(pred_roc_tree,measure="auc",x.measure="cutoff")
perf_tpr_roc_tree <- performance(pred_roc_tree,"tpr","fpr")

# random forest
prediction_forest <- predict(fit2,testset,type="prob")
pred_to_roc_forest <- prediction_forest[,2]
pred_roc_forest <- prediction(pred_to_roc_forest,as.factor(testset$Outcome))
perf_roc_forest <- performance(pred_roc_forest,measure="auc",x.measure="cutoff")
perf_tpr_roc_forest <- performance(pred_roc_forest,"tpr","fpr")

plot(perf_tpr_roc_tree,main="ROC curve",col=1)
legend(.6,.2, c( 
  paste('rpart AUC:', round(perf_roc_tree@y.values[[1]],2) ),
  paste('random forest AUC:',round(perf_roc_forest@y.values[[1]],2) ) ),1:2)
plot(perf_tpr_roc_forest,col=2,add=T)