Homework 7

heart.data <- read.csv("https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/processed.cleveland.data.csv",header=FALSE,sep=",",na.strings = '?')

names(heart.data) <- c( "age", "sex", "cp","trestbps", "chol","fbs", "restecg","thalach","exang","oldpeak","slope", "ca", "thal", "num")

head(heart.data)
##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1  63   1  1      145  233   1       2     150     0     2.3     3  0    6
## 2  67   1  4      160  286   0       2     108     1     1.5     2  3    3
## 3  67   1  4      120  229   0       2     129     1     2.6     2  2    7
## 4  37   1  3      130  250   0       0     187     0     3.5     3  0    3
## 5  41   0  2      130  204   0       2     172     0     1.4     1  0    3
## 6  56   1  2      120  236   0       0     178     0     0.8     1  0    3
##   num
## 1   0
## 2   2
## 3   1
## 4   0
## 5   0
## 6   0
str(heart.data)
## 'data.frame':    303 obs. of  14 variables:
##  $ age     : num  63 67 67 37 41 56 62 57 63 53 ...
##  $ sex     : num  1 1 1 1 0 1 0 0 1 1 ...
##  $ cp      : num  1 4 4 3 2 2 4 4 4 4 ...
##  $ trestbps: num  145 160 120 130 130 120 140 120 130 140 ...
##  $ chol    : num  233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs     : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ restecg : num  2 2 2 0 2 0 2 0 2 2 ...
##  $ thalach : num  150 108 129 187 172 178 160 163 147 155 ...
##  $ exang   : num  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope   : num  3 2 2 3 1 1 3 1 2 3 ...
##  $ ca      : num  0 3 2 0 0 0 2 0 1 0 ...
##  $ thal    : num  6 3 7 3 3 3 3 3 7 7 ...
##  $ num     : int  0 2 1 0 0 0 3 0 2 1 ...
heart.data$heart_disease <- ifelse(heart.data$num ==0, 'normal','problem')
heart.data$num <- NULL

str(heart.data)
## 'data.frame':    303 obs. of  14 variables:
##  $ age          : num  63 67 67 37 41 56 62 57 63 53 ...
##  $ sex          : num  1 1 1 1 0 1 0 0 1 1 ...
##  $ cp           : num  1 4 4 3 2 2 4 4 4 4 ...
##  $ trestbps     : num  145 160 120 130 130 120 140 120 130 140 ...
##  $ chol         : num  233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs          : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ restecg      : num  2 2 2 0 2 0 2 0 2 2 ...
##  $ thalach      : num  150 108 129 187 172 178 160 163 147 155 ...
##  $ exang        : num  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak      : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope        : num  3 2 2 3 1 1 3 1 2 3 ...
##  $ ca           : num  0 3 2 0 0 0 2 0 1 0 ...
##  $ thal         : num  6 3 7 3 3 3 3 3 7 7 ...
##  $ heart_disease: chr  "normal" "problem" "problem" "normal" ...
heart.data$heart_disease <- as.factor(heart.data$heart_disease)

# answer1
library(rpart)
fit <- rpart(heart_disease ~ ., data = heart.data)


plot(fit, margin = 0.1)
text(fit)

library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
rpart.plot(fit)

# answer 2
predicted <- predict(fit, heart.data, type = 'class')

## accuracy
sum(heart.data$heart_disease == predicted) / length(heart.data$heart_disease)
## [1] 0.8481848
## confusion matrix
table(heart.data$heart_disease, predicted)
##          predicted
##           normal problem
##   normal     147      17
##   problem     29     110
# answer 3
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.4
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.4
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
predictions1 <- predict(fit, heart.data, type="prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(heart.data$heart_disease))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")

plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart'), 1)

auc <- attributes(perf.rocr1)$y.values[[1]]

# answer 4
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
heart.data2 <- na.omit(heart.data)
fit2 <- randomForest(heart_disease ~ ., data = heart.data2)

predicted2 <- predict(fit2, heart.data2, type = 'class')

sum(heart.data2$heart_disease == predicted2) / length(heart.data2$heart_disease)
## [1] 1
table(heart.data2$heart_disease, predicted2)
##          predicted2
##           normal problem
##   normal     160       0
##   problem      0     137
# answer 5
library(ROCR)
predictions2 <- predict(fit2, heart.data2, type="prob")
pred.to.roc2 <- predictions2[, 2]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(heart.data2$heart_disease))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")


plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart', 'randomforest'), 1:2)
plot(perf.tpr.rocr2, col=2, add=TRUE)

Refactor our homework answer

heart.data <- read.csv("https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/processed.cleveland.data.csv",header=FALSE,sep=",",na.strings = '?')

names(heart.data) <- c( "age", "sex", "cp","trestbps", "chol","fbs", "restecg","thalach","exang","oldpeak","slope", "ca", "thal", "num")

sum(is.na(heart.data))
## [1] 6
library(mice)
## Warning: package 'mice' was built under R version 3.4.4
## Loading required package: lattice
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
heart.data <- mice(heart.data, print = FALSE)

heart.data <- complete(heart.data, 'long')

heart.data$.id <- NULL
heart.data$.imp <- NULL

heart.data$heart_disease <- ifelse(heart.data$num ==0, 'normal','problem')
heart.data$heart_disease <- as.factor(heart.data$heart_disease)

heart.data$num <- NULL

str(heart.data)
## 'data.frame':    1515 obs. of  14 variables:
##  $ age          : num  63 67 67 37 41 56 62 57 63 53 ...
##  $ sex          : num  1 1 1 1 0 1 0 0 1 1 ...
##  $ cp           : num  1 4 4 3 2 2 4 4 4 4 ...
##  $ trestbps     : num  145 160 120 130 130 120 140 120 130 140 ...
##  $ chol         : num  233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs          : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ restecg      : num  2 2 2 0 2 0 2 0 2 2 ...
##  $ thalach      : num  150 108 129 187 172 178 160 163 147 155 ...
##  $ exang        : num  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak      : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope        : num  3 2 2 3 1 1 3 1 2 3 ...
##  $ ca           : num  0 3 2 0 0 0 2 0 1 0 ...
##  $ thal         : num  6 3 7 3 3 3 3 3 7 7 ...
##  $ heart_disease: Factor w/ 2 levels "normal","problem": 1 2 2 1 1 1 2 1 2 2 ...
set.seed(42)
idx <- sample.int(2, nrow(heart.data), replace=TRUE, prob = c(0.7, 0.3))
trainset <- heart.data[idx==1, ]
testset  <- heart.data[idx==2, ]

fit1 <- rpart(heart_disease ~., data = trainset)
fit2 <- randomForest(heart_disease ~., data = trainset)



predicted1 <- predict(fit1, testset, type= 'class')

sum(testset$heart_disease == predicted1) / length(testset$heart_disease)
## [1] 0.8886364
table(testset$heart_disease, predicted1)
##          predicted1
##           normal problem
##   normal     210      13
##   problem     36     181
predicted2 <- predict(fit2, testset, type= 'class')
sum(testset$heart_disease == predicted2) / length(testset$heart_disease)
## [1] 0.9886364
table(testset$heart_disease, predicted2)
##          predicted2
##           normal problem
##   normal     223       0
##   problem      5     212
library(ROCR)
predictions1 <- predict(fit1, testset, type="prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(testset$heart_disease))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")

predictions2 <- predict(fit2, testset, type="prob")
pred.to.roc2 <- predictions2[, 2]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(testset$heart_disease))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")



plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart', 'randomforest'), 1:2)
plot(perf.tpr.rocr2, col=2, add=TRUE)

Time Series Analysis

cold <- read.csv('https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/cold.csv')
head(cold)
##     month cold
## 1 2004-01    0
## 2 2004-02   13
## 3 2004-03    8
## 4 2004-04    6
## 5 2004-05    6
## 6 2004-06    0
M <- ts(cold$cold, frequency = 12, start = c(2004,1))
plot.ts(M)

library(TTR)
M.SMA12 <- SMA(M, n = 12)
plot(M.SMA12)

plot(M)
lines(M.SMA12, col='red')

library(forecast)
## Warning: package 'forecast' was built under R version 3.4.4
cold.pred <- HoltWinters(M,beta = FALSE, gamma = FALSE)
plot(cold.pred)

cold.pred2 <- predict(cold.pred, n.ahead = 10)
plot(cold.pred2)

library(forecast)
cold.pred<- HoltWinters(M,gamma = FALSE)
plot(cold.pred)

cold.pred2 <- predict(cold.pred, n.ahead = 10)
plot(cold.pred2)

components <- decompose(M)
plot(components)

library(forecast)
cold.pred<- HoltWinters(M)
cold.pred2 <- predict(cold.pred, n.ahead = 10)
plot(cold.pred2)

ARIMA

head(cbind(M, diff(M, lag=1)))
##           M diff(M, lag = 1)
## Jan 2004  0               NA
## Feb 2004 13               13
## Mar 2004  8               -5
## Apr 2004  6               -2
## May 2004  6                0
## Jun 2004  0               -6
M.diff <- diff(M, lag=1, differences = 1)
plot(M.diff)

acf(M.diff[-1], margin = 0.1)
## Warning in plot.window(...): "margin" 不是一個繪圖參數
## Warning in plot.xy(xy, type, ...): "margin" 不是一個繪圖參數
## Warning in axis(side = side, at = at, labels = labels, ...): "margin" 不是
## 一個繪圖參數

## Warning in axis(side = side, at = at, labels = labels, ...): "margin" 不是
## 一個繪圖參數
## Warning in box(...): "margin" 不是一個繪圖參數
## Warning in title(...): "margin" 不是一個繪圖參數

pacf(M.diff[-1])

Box.test(M, type= 'Ljung', lag = 1)
## 
##  Box-Ljung test
## 
## data:  M
## X-squared = 118.78, df = 1, p-value < 2.2e-16
M.arima <- auto.arima(M, ic='aic')
M.forecast <- forecast(M.arima, h = 12)
plot(M.forecast)

y <- 1:10
x <- 1:10
plot(x,y, xaxt = 'n')
z <- 1:10

plot(x,y, xaxt = 'n')
ticks <- z
axis(1, at = ticks, labels = TRUE, tcl = -0.2)