Homework 7
heart.data <- read.csv("https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/processed.cleveland.data.csv",header=FALSE,sep=",",na.strings = '?')
names(heart.data) <- c( "age", "sex", "cp","trestbps", "chol","fbs", "restecg","thalach","exang","oldpeak","slope", "ca", "thal", "num")
head(heart.data)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 63 1 1 145 233 1 2 150 0 2.3 3 0 6
## 2 67 1 4 160 286 0 2 108 1 1.5 2 3 3
## 3 67 1 4 120 229 0 2 129 1 2.6 2 2 7
## 4 37 1 3 130 250 0 0 187 0 3.5 3 0 3
## 5 41 0 2 130 204 0 2 172 0 1.4 1 0 3
## 6 56 1 2 120 236 0 0 178 0 0.8 1 0 3
## num
## 1 0
## 2 2
## 3 1
## 4 0
## 5 0
## 6 0
str(heart.data)
## 'data.frame': 303 obs. of 14 variables:
## $ age : num 63 67 67 37 41 56 62 57 63 53 ...
## $ sex : num 1 1 1 1 0 1 0 0 1 1 ...
## $ cp : num 1 4 4 3 2 2 4 4 4 4 ...
## $ trestbps: num 145 160 120 130 130 120 140 120 130 140 ...
## $ chol : num 233 286 229 250 204 236 268 354 254 203 ...
## $ fbs : num 1 0 0 0 0 0 0 0 0 1 ...
## $ restecg : num 2 2 2 0 2 0 2 0 2 2 ...
## $ thalach : num 150 108 129 187 172 178 160 163 147 155 ...
## $ exang : num 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
## $ slope : num 3 2 2 3 1 1 3 1 2 3 ...
## $ ca : num 0 3 2 0 0 0 2 0 1 0 ...
## $ thal : num 6 3 7 3 3 3 3 3 7 7 ...
## $ num : int 0 2 1 0 0 0 3 0 2 1 ...
heart.data$heart_disease <- ifelse(heart.data$num ==0, 'normal','problem')
heart.data$num <- NULL
str(heart.data)
## 'data.frame': 303 obs. of 14 variables:
## $ age : num 63 67 67 37 41 56 62 57 63 53 ...
## $ sex : num 1 1 1 1 0 1 0 0 1 1 ...
## $ cp : num 1 4 4 3 2 2 4 4 4 4 ...
## $ trestbps : num 145 160 120 130 130 120 140 120 130 140 ...
## $ chol : num 233 286 229 250 204 236 268 354 254 203 ...
## $ fbs : num 1 0 0 0 0 0 0 0 0 1 ...
## $ restecg : num 2 2 2 0 2 0 2 0 2 2 ...
## $ thalach : num 150 108 129 187 172 178 160 163 147 155 ...
## $ exang : num 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
## $ slope : num 3 2 2 3 1 1 3 1 2 3 ...
## $ ca : num 0 3 2 0 0 0 2 0 1 0 ...
## $ thal : num 6 3 7 3 3 3 3 3 7 7 ...
## $ heart_disease: chr "normal" "problem" "problem" "normal" ...
heart.data$heart_disease <- as.factor(heart.data$heart_disease)
# answer1
library(rpart)
fit <- rpart(heart_disease ~ ., data = heart.data)
plot(fit, margin = 0.1)
text(fit)

library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
rpart.plot(fit)

# answer 2
predicted <- predict(fit, heart.data, type = 'class')
## accuracy
sum(heart.data$heart_disease == predicted) / length(heart.data$heart_disease)
## [1] 0.8481848
## confusion matrix
table(heart.data$heart_disease, predicted)
## predicted
## normal problem
## normal 147 17
## problem 29 110
# answer 3
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.4
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.4
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
predictions1 <- predict(fit, heart.data, type="prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(heart.data$heart_disease))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")
plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart'), 1)

auc <- attributes(perf.rocr1)$y.values[[1]]
# answer 4
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
heart.data2 <- na.omit(heart.data)
fit2 <- randomForest(heart_disease ~ ., data = heart.data2)
predicted2 <- predict(fit2, heart.data2, type = 'class')
sum(heart.data2$heart_disease == predicted2) / length(heart.data2$heart_disease)
## [1] 1
table(heart.data2$heart_disease, predicted2)
## predicted2
## normal problem
## normal 160 0
## problem 0 137
# answer 5
library(ROCR)
predictions2 <- predict(fit2, heart.data2, type="prob")
pred.to.roc2 <- predictions2[, 2]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(heart.data2$heart_disease))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")
plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart', 'randomforest'), 1:2)
plot(perf.tpr.rocr2, col=2, add=TRUE)

Refactor our homework answer
heart.data <- read.csv("https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/processed.cleveland.data.csv",header=FALSE,sep=",",na.strings = '?')
names(heart.data) <- c( "age", "sex", "cp","trestbps", "chol","fbs", "restecg","thalach","exang","oldpeak","slope", "ca", "thal", "num")
sum(is.na(heart.data))
## [1] 6
library(mice)
## Warning: package 'mice' was built under R version 3.4.4
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
heart.data <- mice(heart.data, print = FALSE)
heart.data <- complete(heart.data, 'long')
heart.data$.id <- NULL
heart.data$.imp <- NULL
heart.data$heart_disease <- ifelse(heart.data$num ==0, 'normal','problem')
heart.data$heart_disease <- as.factor(heart.data$heart_disease)
heart.data$num <- NULL
str(heart.data)
## 'data.frame': 1515 obs. of 14 variables:
## $ age : num 63 67 67 37 41 56 62 57 63 53 ...
## $ sex : num 1 1 1 1 0 1 0 0 1 1 ...
## $ cp : num 1 4 4 3 2 2 4 4 4 4 ...
## $ trestbps : num 145 160 120 130 130 120 140 120 130 140 ...
## $ chol : num 233 286 229 250 204 236 268 354 254 203 ...
## $ fbs : num 1 0 0 0 0 0 0 0 0 1 ...
## $ restecg : num 2 2 2 0 2 0 2 0 2 2 ...
## $ thalach : num 150 108 129 187 172 178 160 163 147 155 ...
## $ exang : num 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
## $ slope : num 3 2 2 3 1 1 3 1 2 3 ...
## $ ca : num 0 3 2 0 0 0 2 0 1 0 ...
## $ thal : num 6 3 7 3 3 3 3 3 7 7 ...
## $ heart_disease: Factor w/ 2 levels "normal","problem": 1 2 2 1 1 1 2 1 2 2 ...
set.seed(42)
idx <- sample.int(2, nrow(heart.data), replace=TRUE, prob = c(0.7, 0.3))
trainset <- heart.data[idx==1, ]
testset <- heart.data[idx==2, ]
fit1 <- rpart(heart_disease ~., data = trainset)
fit2 <- randomForest(heart_disease ~., data = trainset)
predicted1 <- predict(fit1, testset, type= 'class')
sum(testset$heart_disease == predicted1) / length(testset$heart_disease)
## [1] 0.8886364
table(testset$heart_disease, predicted1)
## predicted1
## normal problem
## normal 210 13
## problem 36 181
predicted2 <- predict(fit2, testset, type= 'class')
sum(testset$heart_disease == predicted2) / length(testset$heart_disease)
## [1] 0.9886364
table(testset$heart_disease, predicted2)
## predicted2
## normal problem
## normal 223 0
## problem 5 212
library(ROCR)
predictions1 <- predict(fit1, testset, type="prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(testset$heart_disease))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")
predictions2 <- predict(fit2, testset, type="prob")
pred.to.roc2 <- predictions2[, 2]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(testset$heart_disease))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")
plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart', 'randomforest'), 1:2)
plot(perf.tpr.rocr2, col=2, add=TRUE)

Time Series Analysis
cold <- read.csv('https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/cold.csv')
head(cold)
## month cold
## 1 2004-01 0
## 2 2004-02 13
## 3 2004-03 8
## 4 2004-04 6
## 5 2004-05 6
## 6 2004-06 0
M <- ts(cold$cold, frequency = 12, start = c(2004,1))
plot.ts(M)

library(TTR)
M.SMA12 <- SMA(M, n = 12)
plot(M.SMA12)

plot(M)
lines(M.SMA12, col='red')

library(forecast)
## Warning: package 'forecast' was built under R version 3.4.4
cold.pred <- HoltWinters(M,beta = FALSE, gamma = FALSE)
plot(cold.pred)

cold.pred2 <- predict(cold.pred, n.ahead = 10)
plot(cold.pred2)

library(forecast)
cold.pred<- HoltWinters(M,gamma = FALSE)
plot(cold.pred)

cold.pred2 <- predict(cold.pred, n.ahead = 10)
plot(cold.pred2)

components <- decompose(M)
plot(components)

library(forecast)
cold.pred<- HoltWinters(M)
cold.pred2 <- predict(cold.pred, n.ahead = 10)
plot(cold.pred2)

ARIMA
head(cbind(M, diff(M, lag=1)))
## M diff(M, lag = 1)
## Jan 2004 0 NA
## Feb 2004 13 13
## Mar 2004 8 -5
## Apr 2004 6 -2
## May 2004 6 0
## Jun 2004 0 -6
M.diff <- diff(M, lag=1, differences = 1)
plot(M.diff)

acf(M.diff[-1], margin = 0.1)
## Warning in plot.window(...): "margin" 不是一個繪圖參數
## Warning in plot.xy(xy, type, ...): "margin" 不是一個繪圖參數
## Warning in axis(side = side, at = at, labels = labels, ...): "margin" 不是
## 一個繪圖參數
## Warning in axis(side = side, at = at, labels = labels, ...): "margin" 不是
## 一個繪圖參數
## Warning in box(...): "margin" 不是一個繪圖參數
## Warning in title(...): "margin" 不是一個繪圖參數

pacf(M.diff[-1])

Box.test(M, type= 'Ljung', lag = 1)
##
## Box-Ljung test
##
## data: M
## X-squared = 118.78, df = 1, p-value < 2.2e-16
M.arima <- auto.arima(M, ic='aic')
M.forecast <- forecast(M.arima, h = 12)
plot(M.forecast)

y <- 1:10
x <- 1:10
plot(x,y, xaxt = 'n')
z <- 1:10
plot(x,y, xaxt = 'n')
ticks <- z
axis(1, at = ticks, labels = TRUE, tcl = -0.2)
