從疫情中心的開放資料網站蒐集到「登革熱近12個月每日確定病例統計」,資料如下:
library(readr)
dataset <- read_csv('https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/Dengue_Daily_last12m.csv')
## Parsed with column specification:
## cols(
## .default = col_character(),
## 發病日 = col_date(format = ""),
## 個案研判日 = col_date(format = ""),
## 通報日 = col_date(format = ""),
## 確定病例數 = col_integer()
## )
## See spec(...) for full column specifications.
str(dataset)
## Classes 'tbl_df', 'tbl' and 'data.frame': 365 obs. of 26 variables:
## $ 發病日 : Date, format: "2017-09-01" "2017-09-03" ...
## $ 個案研判日 : Date, format: "2017-09-05" "2017-09-09" ...
## $ 通報日 : Date, format: "2017-09-05" "2017-09-08" ...
## $ 性別 : chr "男" "男" "女" "男" ...
## $ 年齡層 : chr "40-44" "30-34" "15-19" "35-39" ...
## $ 居住縣市 : chr "新北市" "新北市" "桃園市" "台北市" ...
## $ 居住鄉鎮 : chr "新店區" "永和區" "大園區" "大安區" ...
## $ 居住村里 : chr "大同里" "新廍里" "菓林里" "民炤里" ...
## $ 最小統計區 : chr "A6506-0261-00" "A6504-0058-00" "None" "A6303-0351-00" ...
## $ 最小統計區中心點X : chr "121.541220302" "121.512814046" "None" "121.536889132" ...
## $ 最小統計區中心點Y : chr "24.980690477" "25.016252648" "None" "25.035476567" ...
## $ 一級統計區 : chr "A6506-13-011" "A6504-05-002" "None" "A6303-35-002" ...
## $ 二級統計區 : chr "A6506-13" "A6504-05" "None" "A6303-35" ...
## $ 感染縣市 : chr "None" "None" "None" "None" ...
## $ 感染鄉鎮 : chr "None" "None" "None" "None" ...
## $ 感染村里 : chr "None" "None" "None" "None" ...
## $ 是否境外移入 : chr "是" "是" "是" "是" ...
## $ 感染國家 : chr "越南" "印度" "越南" "越南" ...
## $ 確定病例數 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ 居住村里代碼 : chr "6500600-004" "6500400-045" "6800600-011" "6300300-004" ...
## $ 感染村里代碼 : chr "None" "None" "None" "None" ...
## $ 血清型 : chr "None" "None" "None" "第四型" ...
## $ 內政部居住縣市代碼: chr "65" "65" "68" "63" ...
## $ 內政部居住鄉鎮代碼: chr "6500600" "6500400" "6800600" "6300300" ...
## $ 內政部感染縣市代碼: chr "None" "None" "None" "None" ...
## $ 內政部感染鄉鎮代碼: chr "None" "None" "None" "None" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 26
## .. ..$ 發病日 :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_date" "collector"
## .. ..$ 個案研判日 :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_date" "collector"
## .. ..$ 通報日 :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_date" "collector"
## .. ..$ 性別 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 年齡層 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 居住縣市 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 居住鄉鎮 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 居住村里 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 最小統計區 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 最小統計區中心點X : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 最小統計區中心點Y : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 一級統計區 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 二級統計區 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 感染縣市 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 感染鄉鎮 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 感染村里 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 是否境外移入 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 感染國家 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 確定病例數 : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ 居住村里代碼 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 感染村里代碼 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 血清型 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 內政部居住縣市代碼: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 內政部居住鄉鎮代碼: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 內政部感染縣市代碼: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 內政部感染鄉鎮代碼: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
請試用R 語言回答以下問題:
dataset[dataset$血清型 == 'None', '血清型'] <- NA
sum(is.na(dataset$血清型))
## [1] 180
dataset$血清型 <- factor(dataset$血清型)
table(dataset$性別)
##
## 女 男
## 171 194
table(dataset$性別) / nrow(dataset)
##
## 女 男
## 0.4684932 0.5315068
par(family='STKaiti')
stat <- table(dataset$性別) / nrow(dataset)
labels <- paste(names(stat), round(stat * 100,2) )
pie(stat, labels = labels,init.angle = 90, clockwise = FALSE, main = '男性與女性病例數比例')
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
dataset %>%
group_by(居住縣市) %>%
count() %>%
arrange(desc(n)) %>%
head(1) %>%
select(居住縣市)
## # A tibble: 1 x 1
## # Groups: 居住縣市 [1]
## 居住縣市
## <chr>
## 1 新北市
par(family='STKaiti')
stat <- dataset %>%
group_by(居住縣市) %>%
count() %>%
arrange(desc(n)) %>%
head(10)
names(stat) <- c("居住縣市","病例數" )
barplot(stat$病例數, names = stat$居住縣市, col = 'blue', main='病例數最多的前十居住縣市',ylab = '病例數', xlab = '居住縣市')
df <- dataset %>%
group_by(居住縣市, 性別) %>%
count(sort=TRUE)
names(df) <- c("居住縣市","性別","病例數" )
head(df)
## # A tibble: 6 x 3
## # Groups: 居住縣市, 性別 [6]
## 居住縣市 性別 病例數
## <chr> <chr> <int>
## 1 新北市 男 42
## 2 新北市 女 41
## 3 台中市 女 39
## 4 台北市 男 37
## 5 台北市 女 36
## 6 台中市 男 26
par(family='STKaiti')
#install.packages("reshape")
library(reshape)
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
pivot_tb <- cast(df, 性別~居住縣市, value = '病例數' )
rownames(pivot_tb) <- pivot_tb[,1]
county_name <- colnames(pivot_tb[,-1])
stat_tb <- as.matrix(pivot_tb[,-1])
barplot(stat_tb, beside = TRUE, names.arg = county_name, legend.text = c('女','男'), col=c('red','blue' ),ylab = '病例數', xlab = '居住縣市')
cnt_by_date <- dataset %>%
group_by(發病日) %>%
count()
names(cnt_by_date) <- c("發病日","病例數" )
head(cnt_by_date)
## # A tibble: 6 x 2
## # Groups: 發病日 [6]
## 發病日 病例數
## <date> <int>
## 1 2017-09-01 1
## 2 2017-09-03 2
## 3 2017-09-04 1
## 4 2017-09-05 2
## 5 2017-09-06 2
## 6 2017-09-07 2
par(family='STKaiti')
plot(cnt_by_date, type= 'l', main = '病例趨勢圖')
library(readr)
diabetes <- read_csv('https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/diabetes.csv')
## Parsed with column specification:
## cols(
## Pregnancies = col_integer(),
## Glucose = col_integer(),
## BloodPressure = col_integer(),
## SkinThickness = col_integer(),
## Insulin = col_integer(),
## BMI = col_double(),
## DiabetesPedigreeFunction = col_double(),
## Age = col_integer(),
## Outcome = col_integer()
## )
str(diabetes)
## Classes 'tbl_df', 'tbl' and 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 9
## .. ..$ Pregnancies : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Glucose : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ BloodPressure : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ SkinThickness : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Insulin : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ BMI : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ DiabetesPedigreeFunction: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ Age : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Outcome : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
有一糖尿病數據集,資料欄位敘述如下:
請試用R 語言回答以下問題:
cor(diabetes[,-9])
## Pregnancies Glucose BloodPressure
## Pregnancies 1.00000000 0.12945867 0.14128198
## Glucose 0.12945867 1.00000000 0.15258959
## BloodPressure 0.14128198 0.15258959 1.00000000
## SkinThickness -0.08167177 0.05732789 0.20737054
## Insulin -0.07353461 0.33135711 0.08893338
## BMI 0.01768309 0.22107107 0.28180529
## DiabetesPedigreeFunction -0.03352267 0.13733730 0.04126495
## Age 0.54434123 0.26351432 0.23952795
## SkinThickness Insulin BMI
## Pregnancies -0.08167177 -0.07353461 0.01768309
## Glucose 0.05732789 0.33135711 0.22107107
## BloodPressure 0.20737054 0.08893338 0.28180529
## SkinThickness 1.00000000 0.43678257 0.39257320
## Insulin 0.43678257 1.00000000 0.19785906
## BMI 0.39257320 0.19785906 1.00000000
## DiabetesPedigreeFunction 0.18392757 0.18507093 0.14064695
## Age -0.11397026 -0.04216295 0.03624187
## DiabetesPedigreeFunction Age
## Pregnancies -0.03352267 0.54434123
## Glucose 0.13733730 0.26351432
## BloodPressure 0.04126495 0.23952795
## SkinThickness 0.18392757 -0.11397026
## Insulin 0.18507093 -0.04216295
## BMI 0.14064695 0.03624187
## DiabetesPedigreeFunction 1.00000000 0.03356131
## Age 0.03356131 1.00000000
library(corrplot)
## corrplot 0.84 loaded
corrplot(cor(diabetes[,-9]))
diabetes$Outcome <- as.factor(diabetes$Outcome)
set.seed(42)
idx <- sample.int(2, nrow(diabetes), prob = c(0.8,0.2), replace = TRUE)
trainset <- diabetes[idx == 1,]
testset <- diabetes[idx == 2, ]
library(rpart)
fit <- rpart(Outcome ~., data = trainset)
predicted <- predict(fit, testset, type = 'class')
sum(predicted == testset$Outcome) / length(testset$Outcome)
## [1] 0.7581699
table(testset$Outcome, predicted)
## predicted
## 0 1
## 0 82 16
## 1 21 34
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
set.seed(42)
fit2 <- randomForest(Outcome ~., data = trainset,ntree=100,importance=T,proximity=T)
predicted2 <- predict(fit2, testset, type = 'class')
sum(predicted2 == testset$Outcome) / length(testset$Outcome)
## [1] 0.8039216
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
prediction_tree <- predict(fit,testset,type="prob")
pred_to_roc_tree <- prediction_tree[,2]
pred_roc_tree <- prediction(pred_to_roc_tree, testset$Outcome)
perf_roc_tree <- performance(pred_roc_tree,measure="auc",x.measure="cutoff")
perf_tpr_roc_tree <- performance(pred_roc_tree,"tpr","fpr")
# random forest
prediction_forest <- predict(fit2,testset,type="prob")
pred_to_roc_forest <- prediction_forest[,2]
pred_roc_forest <- prediction(pred_to_roc_forest,as.factor(testset$Outcome))
perf_roc_forest <- performance(pred_roc_forest,measure="auc",x.measure="cutoff")
perf_tpr_roc_forest <- performance(pred_roc_forest,"tpr","fpr")
plot(perf_tpr_roc_tree,main="ROC curve",col=1)
legend(.6,.2, c(
paste('rpart AUC:', round(perf_roc_tree@y.values[[1]],2) ),
paste('random forest AUC:',round(perf_roc_forest@y.values[[1]],2) ) ),1:2)
plot(perf_tpr_roc_forest,col=2,add=T)