Date Conversion
library(ggplot2)
library(readr)
measles <- read_csv("https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/measles.csv")
## Parsed with column specification:
## cols(
## 確定病名 = col_character(),
## 發病年份 = col_integer(),
## 發病月份 = col_integer(),
## 縣市 = col_character(),
## 鄉鎮 = col_character(),
## 性別 = col_character(),
## 是否為境外移入 = col_character(),
## 年齡層 = col_character(),
## 確定病例數 = col_integer()
## )
head(measles)
## # A tibble: 6 x 9
## 確定病名 發病年份 發病月份 縣市 鄉鎮 性別 是否為境外移入 年齡層
## <chr> <int> <int> <chr> <chr> <chr> <chr> <chr>
## 1 麻疹 2009 4 高雄市~ 小港區~ M 否 20-24
## 2 麻疹 2009 5 基隆市~ 中山區~ M 否 30-34
## 3 麻疹 2011 6 新北市~ 蘆洲區~ F 是 25-29
## 4 麻疹 2014 1 高雄市~ 三民區~ F 是 0
## 5 麻疹 2017 3 台北市~ 松山區~ F 是 0
## 6 麻疹 2018 4 桃園市~ 蘆竹區~ F 否 30-34
## # ... with 1 more variable: 確定病例數 <int>
x <- as.Date("2018-05-08")
class(x)
## [1] "Date"
unclass(x)
## [1] 17659
y <- as.Date("1970-01-01")
unclass(y)
## [1] 0
x <- Sys.time()
x
## [1] "2018-08-26 17:25:05 CST"
p <- as.POSIXlt(x)
p
## [1] "2018-08-26 17:25:05 CST"
class(p)
## [1] "POSIXlt" "POSIXt"
unclass(p)
## $sec
## [1] 5.169132
##
## $min
## [1] 25
##
## $hour
## [1] 17
##
## $mday
## [1] 26
##
## $mon
## [1] 7
##
## $year
## [1] 118
##
## $wday
## [1] 0
##
## $yday
## [1] 237
##
## $isdst
## [1] 0
##
## $zone
## [1] "CST"
##
## $gmtoff
## [1] 28800
##
## attr(,"tzone")
## [1] "" "CST" "CDT"
p1 <- as.POSIXct(x)
p1
## [1] "2018-08-26 17:25:05 CST"
class(p1)
## [1] "POSIXct" "POSIXt"
unclass(p1)
## [1] 1535275505
ds <- c("5 8, 2018 12:00")
?strptime
## starting httpd help server ...
## done
strptime(ds, "%m %d, %Y %H:%M")
## [1] "2018-05-08 12:00:00 CST"
x <- strptime(ds, "%m %d, %Y %H:%M")
x1 <- as.POSIXlt(as.Date('2018-05-08'))
x1
## [1] "2018-05-08 UTC"
x
## [1] "2018-05-08 12:00:00 CST"
x - x1
## Time difference of 4 hours
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
ymd("20180605")
## [1] "2018-06-05"
ymd("2018/06/05")
## [1] "2018-06-05"
ymd("2018-06-05")
## [1] "2018-06-05"
dmy("05/06/2011")
## [1] "2011-06-05"
mdy("05/06/2011")
## [1] "2011-05-06"
ymd_hms('2018-05-06 08:00:00')
## [1] "2018-05-06 08:00:00 UTC"
bday <- dmy("15/10/1988")
bday
## [1] "1988-10-15"
month(bday)
## [1] 10
year(bday)
## [1] 1988
wday(bday)
## [1] 7
measles
## # A tibble: 235 x 9
## 確定病名 發病年份 發病月份 縣市 鄉鎮 性別 是否為境外移入 年齡層
## <chr> <int> <int> <chr> <chr> <chr> <chr> <chr>
## 1 麻疹 2009 4 高雄市~ 小港區~ M 否 20-24
## 2 麻疹 2009 5 基隆市~ 中山區~ M 否 30-34
## 3 麻疹 2011 6 新北市~ 蘆洲區~ F 是 25-29
## 4 麻疹 2014 1 高雄市~ 三民區~ F 是 0
## 5 麻疹 2017 3 台北市~ 松山區~ F 是 0
## 6 麻疹 2018 4 桃園市~ 蘆竹區~ F 否 30-34
## 7 麻疹 2018 4 新北市~ 新店區~ F 否 25-29
## 8 麻疹 2009 4 新北市~ 新莊區~ M 否 20-24
## 9 麻疹 2011 2 台北市~ 文山區~ F 否 20-24
## 10 麻疹 2014 10 台北市~ 南港區~ M 是 0
## # ... with 225 more rows, and 1 more variable: 確定病例數 <int>
measles$發病時間 <- ymd(paste(measles$發病年份, measles$發病月份, '01', sep = '-'))
measles
## # A tibble: 235 x 10
## 確定病名 發病年份 發病月份 縣市 鄉鎮 性別 是否為境外移入 年齡層
## <chr> <int> <int> <chr> <chr> <chr> <chr> <chr>
## 1 麻疹 2009 4 高雄市~ 小港區~ M 否 20-24
## 2 麻疹 2009 5 基隆市~ 中山區~ M 否 30-34
## 3 麻疹 2011 6 新北市~ 蘆洲區~ F 是 25-29
## 4 麻疹 2014 1 高雄市~ 三民區~ F 是 0
## 5 麻疹 2017 3 台北市~ 松山區~ F 是 0
## 6 麻疹 2018 4 桃園市~ 蘆竹區~ F 否 30-34
## 7 麻疹 2018 4 新北市~ 新店區~ F 否 25-29
## 8 麻疹 2009 4 新北市~ 新莊區~ M 否 20-24
## 9 麻疹 2011 2 台北市~ 文山區~ F 否 20-24
## 10 麻疹 2014 10 台北市~ 南港區~ M 是 0
## # ... with 225 more rows, and 2 more variables: 確定病例數 <int>,
## # 發病時間 <date>
measles$發病時間 <- ymd(with(measles, paste(發病年份, 發病月份, '01', sep = '-')))
Aesthetics
library(ggplot2)
head(measles)
## # A tibble: 6 x 10
## 確定病名 發病年份 發病月份 縣市 鄉鎮 性別 是否為境外移入 年齡層
## <chr> <int> <int> <chr> <chr> <chr> <chr> <chr>
## 1 麻疹 2009 4 高雄市~ 小港區~ M 否 20-24
## 2 麻疹 2009 5 基隆市~ 中山區~ M 否 30-34
## 3 麻疹 2011 6 新北市~ 蘆洲區~ F 是 25-29
## 4 麻疹 2014 1 高雄市~ 三民區~ F 是 0
## 5 麻疹 2017 3 台北市~ 松山區~ F 是 0
## 6 麻疹 2018 4 桃園市~ 蘆竹區~ F 否 30-34
## # ... with 2 more variables: 確定病例數 <int>, 發病時間 <date>
p1 <- ggplot(measles, aes(x = 發病時間, y = 確定病例數) )
p1 + geom_point()

p1 + geom_line()

p1 + geom_point(color= 'red')

p1 + geom_point(color= 'blue', size = 5, shape = 17)

p1 + geom_point(aes(color = factor(性別)))

#factor(measles$性別)
p1 + geom_point(aes(color = factor(性別))) + scale_color_manual(values = c('orange', 'purple'))

p1 + geom_point(aes(color = factor(性別))) + scale_color_brewer()

p1 + geom_point(color = 'blue') + geom_point(color = 'red')

source('https://raw.githubusercontent.com/ywchiu/cdc_course/master/script/multiplot.R')
p2 <- p1 + geom_point(aes(color=factor(性別)) )
p3 <- p1 + geom_point(aes(shape=factor(性別)) )
multiplot(p2, p3, rows=2)
## Loading required package: grid

## [1] 2
p1 + geom_point() + xlab('時間') + ylab('病例數') + ggtitle('麻疹發病趨勢')

p1 + geom_point(aes(color=factor(性別))) + geom_line(aes(color=factor(性別)))

p1 + geom_line(aes(color=factor(性別))) + geom_point(aes(color=factor(性別)))

plot(measles$發病時間, measles$確定病例數, type= 'p', pch= 19, xlab='時間', ylab = '病例數', col = factor(measles$性別))

Geometry
load("C:/Users/USER/Downloads/cdc.Rdata")
plot(wtdesire ~ weight, data = cdc)

ggplot(data = cdc, aes(x = weight, y = wtdesire)) + geom_point()

hist(cdc$weight)

histogram <- ggplot(data=cdc, aes(x=weight))
histogram + geom_histogram(binwidth=10,
color="black")

density <- ggplot(data=cdc, aes(x=weight))
density + geom_density(stat="density", alpha=I(0.1), fill='blue')

boxplot(cdc$weight~ cdc$gender)

box <- ggplot(data=cdc,
aes(x=gender, y=weight))
box + geom_boxplot(aes(fill=gender ))

barplot(table(cdc$genhlth))

bar <- ggplot(data=cdc, aes(x=genhlth))
bar + geom_bar()

pie(table(cdc$gender))

table(cdc$gender)
##
## m f
## 9569 10431
cdc_sex <- as.data.frame(table(cdc$gender))
pie <- ggplot(cdc_sex, aes(x="", y=Freq
,fill=Var1 )) + geom_bar(width=1, stat =
"identity") + geom_text(aes(label=Freq, y
=c(15000, 5000) ) , size=5)
pie

pie + coord_polar(theta="y", start = 0)

Statstics
smooth <- ggplot(data=cdc, aes(x=weight, y=wtdesire, color=gender)) +
geom_point(aes(shape=gender), size=1.5)
smooth

fit1 <- lm(wtdesire ~ weight, data = cdc[cdc$gender == 'm',])
fit2 <- lm(wtdesire ~ weight, data = cdc[cdc$gender == 'f',])
plot(wtdesire ~ weight, data = cdc[cdc$gender == 'm',], col= 'blue')
points(wtdesire ~ weight, data = cdc[cdc$gender == 'f',], col= 'red')
abline(fit1, col ='blue')
abline(fit2, col ='red')

smooth <- ggplot(data=cdc, aes(x=weight, y=wtdesire, color=gender)) +
geom_point(aes(shape=gender), size=1.5)
smooth

smooth + geom_smooth(method="lm")

box <- ggplot(data=cdc, aes(x=gender, y=weight))
box+ geom_boxplot(aes(fill=gender )) + geom_point()

box+ geom_boxplot(aes(fill=gender )) + geom_jitter()

box+ geom_jitter()+geom_boxplot(aes(fill=gender ))

Facet
w <- ggplot(data=cdc, aes(x=weight, y = wtdesire)) + geom_point(aes(color=factor(gender))) + geom_smooth(method = 'lm')
w + facet_grid(gender ~ . )

w + facet_grid(. ~ gender )

head(cdc)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 2 good 0 1 1 64 125 115 33 f
## 3 good 1 1 1 60 105 105 49 f
## 4 good 1 1 0 66 132 124 42 f
## 5 very good 0 1 0 61 150 130 55 f
## 6 very good 1 1 0 64 114 114 55 f
w + facet_grid(gender ~ genhlth )

w + facet_grid(genhlth ~ gender )

Coordinates
w <- ggplot(data=cdc, aes(x=weight, y = wtdesire)) +
geom_point(aes(color=factor(gender))) + geom_smooth(method = 'lm')
w

w + xlim(100,200) + ylim(100,200)
## Warning: Removed 3720 rows containing non-finite values (stat_smooth).
## Warning: Removed 3720 rows containing missing values (geom_point).

histogram <- ggplot(data=cdc, aes(x=weight))
histogram + geom_histogram(fill='orange') + ylim(0,1000)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7 rows containing missing values (geom_bar).

histogram + geom_histogram(binwidth=10, color="black") +
coord_cartesian( ylim = c(0,1000) )

w <- ggplot(data=cdc, aes(x=weight, y = wtdesire)) + geom_point(aes(color=factor(gender))) + geom_smooth(method = 'lm')
w + facet_grid(gender~genhlth) + coord_cartesian( ylim = c(100,200) )

theme
w <- ggplot(data=cdc, aes(x=weight, y = wtdesire)) +
geom_point(aes(color=factor(gender))) + geom_smooth(method = 'lm')
w + xlab('體重') + ylab('理想體重') + ggtitle('體重 v.s. 理想體重') + theme(axis.title.x = element_text(color = 'DarkGreen', size = 10),
axis.title.y = element_text(color = 'Red', size = 10),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 15)) + theme(plot.title = element_text(size = 20, hjust=0.5))

w + xlab('體重') + ylab('理想體重') + ggtitle('體重 v.s. 理想體重') +
scale_color_manual(name ='性別'
,labels = c("MALE", "FEMALE"), values = c("blue", "red")) + theme(legend.text = element_text(size = 10),
legend.title = element_text(size = 10),
legend.position = c(1,1),
legend.justification = c(1,1))

?theme
w <- ggplot(data=cdc, aes(x=weight, y = wtdesire)) + geom_point(aes(color=factor(gender))) +
geom_smooth(method = 'lm')
w1 <- w + xlab('體重') + ylab('理想體重') + ggtitle('體重 v.s. 理想體重')
w1

w1 + theme_dark()

w1 + theme_light() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.title = element_text(size = 20, hjust=0.5))

ggsave(w1, file='plot1.png')
## Saving 7 x 5 in image
w2 <- w1 + theme_light() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.title = element_text(size = 20, hjust=0.5))
ggsave(w2, file='plot2.png')
## Saving 7 x 5 in image
plotly
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
ds <- data.frame(
labels = c("A", "B", "C"),
values = c(10, 20, 30))
plot_ly(ds, labels = ds$labels, values = ds$values, type = "pie", hole=0.6) %>%
layout(title = "Donut Chart Example")
library(plotly)
month<- c(1,2,3,4,5)
taipei <- c(92.5,132.6,168.8,159.1,218.7)
tainan <- c(21.2, 30.6, 37.3, 84.6, 184.3)
plot_ly(x = month, y = taipei, fill = "tozeroy", name="taipei",type='scatter', mode= 'markers') %>%
add_trace(x = month, y = tainan, fill = "tozeroy",name="tainan") %>%
layout(yaxis = list(title = 'rainfall') )
plot_ly(x = month, y = taipei, fill = "tozeroy", name="taipei",type='scatter', mode= 'markers') %>%
add_trace(x = month, y = tainan, fill = "tonexty",name="tainan") %>%
layout(yaxis = list(title = 'rainfall') )
data("diamonds")
diamonds
## # A tibble: 53,940 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## # ... with 53,930 more rows
d <- diamonds[sample(nrow(diamonds), 1000), ]
plot_ly(d, x = d$carat, y= d$price, color = d$clarity, type='scatter', mode= 'markers', size = d$carat, text=
paste("Clarity", d$clarity))
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
data("economics")
p <- subplot(
plot_ly(economics, x = economics$date, y = economics$unemploy, type='scatter', mode = 'lines'),
plot_ly(economics, x = economics$date, y = economics$uempmed, type='scatter', mode = 'lines'),
margin=0.05
)
p %>% layout(showlegend=FALSE)
Decision Tree
data("iris")
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
library(rpart)
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris)
fit
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
plot(fit, margin = 0.1)
text(fit)

plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species)
abline(v = 2.45, col = 'blue')
abline(h = 1.75, col = 'orange')

predict(fit, data.frame(Sepal.Length = 3, Sepal.Width = 5, Petal.Length = 1, Petal.Width = 3), type = 'class')
## 1
## setosa
## Levels: setosa versicolor virginica
predicted <- predict(fit, iris, type = 'class')
sum(predicted == iris$Species) / length(iris$Species)
## [1] 0.96
table(iris$Species, predicted)
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 1
## virginica 0 5 45
Logistic Regression
dataset <- iris[iris$Species!= 'setosa',]
dataset$Species <- factor(dataset$Species)
fit <- glm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = dataset, family = 'binomial')
predicted <- predict(fit, dataset)
predicted <- factor(ifelse(predicted >0, 'virginica', 'versicolor'))
sum(predicted == dataset$Species) / length(dataset$Species)
## [1] 0.98
table(dataset$Species,predicted )
## predicted
## versicolor virginica
## versicolor 49 1
## virginica 1 49
SVM
library(e1071)
fit <- svm(Species ~ ., data = iris)
predicted <- predict(fit, data = iris, type = 'class')
sum(iris$Species == predicted) / length(iris$Species)
## [1] 0.9733333
table(iris$Species, predicted)
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 2 48
預測客戶是否流失
library(C50)
data(churn)
names(churnTrain)
## [1] "state" "account_length"
## [3] "area_code" "international_plan"
## [5] "voice_mail_plan" "number_vmail_messages"
## [7] "total_day_minutes" "total_day_calls"
## [9] "total_day_charge" "total_eve_minutes"
## [11] "total_eve_calls" "total_eve_charge"
## [13] "total_night_minutes" "total_night_calls"
## [15] "total_night_charge" "total_intl_minutes"
## [17] "total_intl_calls" "total_intl_charge"
## [19] "number_customer_service_calls" "churn"
churnTrain <- churnTrain[,4:20]
# Decision Tree
library(rpart)
fit <- rpart(churn ~ ., data = churnTrain)
plot(fit, margin = 0.1)
text(fit)

#install.packages('rpart.plot')
library(rpart.plot)
rpart.plot(fit)

predicted <- predict(fit, churnTrain, type= 'class')
sum(predicted == churnTrain$churn) / length(churnTrain$churn)
## [1] 0.9528953
table(churnTrain$churn, predicted)
## predicted
## yes no
## yes 361 122
## no 35 2815
# 0.9528
# Logistic Regression
fit <-glm(churn ~ ., data = churnTrain, family = 'binomial')
predicted <- predict(fit, churnTrain)
predicted <- factor(ifelse(predicted >0, 'no', 'yes'))
sum(predicted == churnTrain$churn) / length(churnTrain$churn)
## [1] 0.8622862
# 0.86
# SVM
library(e1071)
fit <-svm(churn ~ ., data = churnTrain)
predicted <- predict(fit, churnTrain)
sum(predicted == churnTrain$churn) / length(churnTrain$churn)
## [1] 0.940294
# 0.94