Date Conversion

library(ggplot2)

library(readr)

measles <- read_csv("https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/measles.csv")
## Parsed with column specification:
## cols(
##   確定病名 = col_character(),
##   發病年份 = col_integer(),
##   發病月份 = col_integer(),
##   縣市 = col_character(),
##   鄉鎮 = col_character(),
##   性別 = col_character(),
##   是否為境外移入 = col_character(),
##   年齡層 = col_character(),
##   確定病例數 = col_integer()
## )
head(measles)
## # A tibble: 6 x 9
##   確定病名 發病年份 發病月份 縣市  鄉鎮  性別  是否為境外移入 年齡層
##   <chr>       <int>    <int> <chr> <chr> <chr> <chr>          <chr> 
## 1 麻疹         2009        4 高雄市~ 小港區~ M     否             20-24 
## 2 麻疹         2009        5 基隆市~ 中山區~ M     否             30-34 
## 3 麻疹         2011        6 新北市~ 蘆洲區~ F     是             25-29 
## 4 麻疹         2014        1 高雄市~ 三民區~ F     是             0     
## 5 麻疹         2017        3 台北市~ 松山區~ F     是             0     
## 6 麻疹         2018        4 桃園市~ 蘆竹區~ F     否             30-34 
## # ... with 1 more variable: 確定病例數 <int>
x <- as.Date("2018-05-08")
class(x)
## [1] "Date"
unclass(x)
## [1] 17659
y <- as.Date("1970-01-01")

unclass(y)
## [1] 0
x <- Sys.time()
x
## [1] "2018-08-26 17:25:05 CST"
p <- as.POSIXlt(x)
p
## [1] "2018-08-26 17:25:05 CST"
class(p)
## [1] "POSIXlt" "POSIXt"
unclass(p)
## $sec
## [1] 5.169132
## 
## $min
## [1] 25
## 
## $hour
## [1] 17
## 
## $mday
## [1] 26
## 
## $mon
## [1] 7
## 
## $year
## [1] 118
## 
## $wday
## [1] 0
## 
## $yday
## [1] 237
## 
## $isdst
## [1] 0
## 
## $zone
## [1] "CST"
## 
## $gmtoff
## [1] 28800
## 
## attr(,"tzone")
## [1] ""    "CST" "CDT"
p1 <- as.POSIXct(x)
p1
## [1] "2018-08-26 17:25:05 CST"
class(p1)
## [1] "POSIXct" "POSIXt"
unclass(p1)
## [1] 1535275505
ds <- c("5 8, 2018 12:00")
?strptime
## starting httpd help server ...
##  done
strptime(ds, "%m %d, %Y %H:%M")
## [1] "2018-05-08 12:00:00 CST"
x <- strptime(ds, "%m %d, %Y %H:%M")
x1 <- as.POSIXlt(as.Date('2018-05-08'))

x1
## [1] "2018-05-08 UTC"
x
## [1] "2018-05-08 12:00:00 CST"
x - x1
## Time difference of 4 hours
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
ymd("20180605")
## [1] "2018-06-05"
ymd("2018/06/05")
## [1] "2018-06-05"
ymd("2018-06-05")
## [1] "2018-06-05"
dmy("05/06/2011")
## [1] "2011-06-05"
mdy("05/06/2011")
## [1] "2011-05-06"
ymd_hms('2018-05-06 08:00:00')
## [1] "2018-05-06 08:00:00 UTC"
bday <- dmy("15/10/1988")
bday
## [1] "1988-10-15"
month(bday)
## [1] 10
year(bday)
## [1] 1988
wday(bday)
## [1] 7
measles
## # A tibble: 235 x 9
##    確定病名 發病年份 發病月份 縣市  鄉鎮  性別  是否為境外移入 年齡層
##    <chr>       <int>    <int> <chr> <chr> <chr> <chr>          <chr> 
##  1 麻疹         2009        4 高雄市~ 小港區~ M     否             20-24 
##  2 麻疹         2009        5 基隆市~ 中山區~ M     否             30-34 
##  3 麻疹         2011        6 新北市~ 蘆洲區~ F     是             25-29 
##  4 麻疹         2014        1 高雄市~ 三民區~ F     是             0     
##  5 麻疹         2017        3 台北市~ 松山區~ F     是             0     
##  6 麻疹         2018        4 桃園市~ 蘆竹區~ F     否             30-34 
##  7 麻疹         2018        4 新北市~ 新店區~ F     否             25-29 
##  8 麻疹         2009        4 新北市~ 新莊區~ M     否             20-24 
##  9 麻疹         2011        2 台北市~ 文山區~ F     否             20-24 
## 10 麻疹         2014       10 台北市~ 南港區~ M     是             0     
## # ... with 225 more rows, and 1 more variable: 確定病例數 <int>
measles$發病時間 <- ymd(paste(measles$發病年份, measles$發病月份, '01', sep = '-'))

measles
## # A tibble: 235 x 10
##    確定病名 發病年份 發病月份 縣市  鄉鎮  性別  是否為境外移入 年齡層
##    <chr>       <int>    <int> <chr> <chr> <chr> <chr>          <chr> 
##  1 麻疹         2009        4 高雄市~ 小港區~ M     否             20-24 
##  2 麻疹         2009        5 基隆市~ 中山區~ M     否             30-34 
##  3 麻疹         2011        6 新北市~ 蘆洲區~ F     是             25-29 
##  4 麻疹         2014        1 高雄市~ 三民區~ F     是             0     
##  5 麻疹         2017        3 台北市~ 松山區~ F     是             0     
##  6 麻疹         2018        4 桃園市~ 蘆竹區~ F     否             30-34 
##  7 麻疹         2018        4 新北市~ 新店區~ F     否             25-29 
##  8 麻疹         2009        4 新北市~ 新莊區~ M     否             20-24 
##  9 麻疹         2011        2 台北市~ 文山區~ F     否             20-24 
## 10 麻疹         2014       10 台北市~ 南港區~ M     是             0     
## # ... with 225 more rows, and 2 more variables: 確定病例數 <int>,
## #   發病時間 <date>
measles$發病時間 <- ymd(with(measles, paste(發病年份, 發病月份, '01', sep = '-')))

Aesthetics

library(ggplot2)
head(measles)
## # A tibble: 6 x 10
##   確定病名 發病年份 發病月份 縣市  鄉鎮  性別  是否為境外移入 年齡層
##   <chr>       <int>    <int> <chr> <chr> <chr> <chr>          <chr> 
## 1 麻疹         2009        4 高雄市~ 小港區~ M     否             20-24 
## 2 麻疹         2009        5 基隆市~ 中山區~ M     否             30-34 
## 3 麻疹         2011        6 新北市~ 蘆洲區~ F     是             25-29 
## 4 麻疹         2014        1 高雄市~ 三民區~ F     是             0     
## 5 麻疹         2017        3 台北市~ 松山區~ F     是             0     
## 6 麻疹         2018        4 桃園市~ 蘆竹區~ F     否             30-34 
## # ... with 2 more variables: 確定病例數 <int>, 發病時間 <date>
p1 <- ggplot(measles, aes(x = 發病時間, y = 確定病例數) )

p1 + geom_point()

p1 + geom_line()

p1 + geom_point(color= 'red')

p1 + geom_point(color= 'blue', size = 5, shape = 17)

p1 + geom_point(aes(color = factor(性別)))

#factor(measles$性別)

p1 + geom_point(aes(color = factor(性別))) + scale_color_manual(values = c('orange', 'purple'))

p1 + geom_point(aes(color = factor(性別))) + scale_color_brewer()

p1 + geom_point(color = 'blue') + geom_point(color = 'red')

source('https://raw.githubusercontent.com/ywchiu/cdc_course/master/script/multiplot.R')

p2 <- p1 + geom_point(aes(color=factor(性別)) )
p3 <- p1 + geom_point(aes(shape=factor(性別)) )

multiplot(p2, p3, rows=2)
## Loading required package: grid

## [1] 2
p1 + geom_point() + xlab('時間') + ylab('病例數') + ggtitle('麻疹發病趨勢')

p1 + geom_point(aes(color=factor(性別))) + geom_line(aes(color=factor(性別)))

p1 + geom_line(aes(color=factor(性別))) + geom_point(aes(color=factor(性別)))

plot(measles$發病時間, measles$確定病例數, type= 'p', pch= 19, xlab='時間', ylab = '病例數', col = factor(measles$性別))

Geometry

load("C:/Users/USER/Downloads/cdc.Rdata")
plot(wtdesire ~ weight, data = cdc)

ggplot(data = cdc, aes(x = weight, y = wtdesire)) + geom_point()

hist(cdc$weight)

histogram <- ggplot(data=cdc, aes(x=weight))
histogram + geom_histogram(binwidth=10,
color="black")

density <- ggplot(data=cdc, aes(x=weight))
density + geom_density(stat="density", alpha=I(0.1), fill='blue')

boxplot(cdc$weight~ cdc$gender)

box <- ggplot(data=cdc,
aes(x=gender, y=weight))

box + geom_boxplot(aes(fill=gender ))

barplot(table(cdc$genhlth))

bar <- ggplot(data=cdc, aes(x=genhlth))
bar + geom_bar()

pie(table(cdc$gender))

table(cdc$gender)
## 
##     m     f 
##  9569 10431
cdc_sex <- as.data.frame(table(cdc$gender))

pie <- ggplot(cdc_sex, aes(x="", y=Freq
,fill=Var1 )) + geom_bar(width=1, stat =
"identity") + geom_text(aes(label=Freq, y
=c(15000, 5000) ) , size=5)
pie

pie + coord_polar(theta="y", start = 0)

Statstics

smooth <- ggplot(data=cdc, aes(x=weight, y=wtdesire, color=gender)) +
geom_point(aes(shape=gender), size=1.5)
smooth

fit1 <- lm(wtdesire ~ weight, data = cdc[cdc$gender == 'm',])

fit2 <- lm(wtdesire ~ weight, data = cdc[cdc$gender == 'f',])

plot(wtdesire ~ weight, data = cdc[cdc$gender == 'm',], col= 'blue')
points(wtdesire ~ weight, data = cdc[cdc$gender == 'f',], col= 'red')
abline(fit1, col ='blue')
abline(fit2, col ='red')

smooth <- ggplot(data=cdc, aes(x=weight, y=wtdesire, color=gender)) +
geom_point(aes(shape=gender), size=1.5)

smooth

smooth + geom_smooth(method="lm")

box <- ggplot(data=cdc, aes(x=gender, y=weight))

box+ geom_boxplot(aes(fill=gender )) + geom_point()

box+ geom_boxplot(aes(fill=gender )) + geom_jitter()

box+ geom_jitter()+geom_boxplot(aes(fill=gender )) 

Facet

w <- ggplot(data=cdc, aes(x=weight, y = wtdesire)) + geom_point(aes(color=factor(gender))) + geom_smooth(method = 'lm')
w + facet_grid(gender ~ . )

w + facet_grid(. ~ gender )

head(cdc)
##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f
w + facet_grid(gender ~ genhlth )

w + facet_grid(genhlth ~ gender )

Coordinates

w <- ggplot(data=cdc, aes(x=weight, y = wtdesire)) +
geom_point(aes(color=factor(gender))) + geom_smooth(method = 'lm')
w

w + xlim(100,200) + ylim(100,200)
## Warning: Removed 3720 rows containing non-finite values (stat_smooth).
## Warning: Removed 3720 rows containing missing values (geom_point).

histogram <- ggplot(data=cdc, aes(x=weight))
histogram + geom_histogram(fill='orange') + ylim(0,1000)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7 rows containing missing values (geom_bar).

histogram + geom_histogram(binwidth=10, color="black") +
coord_cartesian( ylim = c(0,1000) )

w <- ggplot(data=cdc, aes(x=weight, y = wtdesire)) + geom_point(aes(color=factor(gender))) + geom_smooth(method = 'lm')

w + facet_grid(gender~genhlth) + coord_cartesian( ylim = c(100,200) )

theme

w <- ggplot(data=cdc, aes(x=weight, y = wtdesire)) +
geom_point(aes(color=factor(gender))) + geom_smooth(method = 'lm')

w + xlab('體重') + ylab('理想體重') + ggtitle('體重 v.s. 理想體重') + theme(axis.title.x = element_text(color = 'DarkGreen', size = 10),
axis.title.y = element_text(color = 'Red', size = 10),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 15)) + theme(plot.title = element_text(size = 20, hjust=0.5))

w + xlab('體重') + ylab('理想體重') + ggtitle('體重 v.s. 理想體重') +
scale_color_manual(name ='性別'
,labels = c("MALE", "FEMALE"), values = c("blue", "red")) + theme(legend.text = element_text(size = 10),
legend.title = element_text(size = 10),
legend.position = c(1,1),
legend.justification = c(1,1))

?theme

w <- ggplot(data=cdc, aes(x=weight, y = wtdesire)) + geom_point(aes(color=factor(gender))) +
geom_smooth(method = 'lm')

w1 <- w + xlab('體重') + ylab('理想體重') + ggtitle('體重 v.s. 理想體重')

w1

w1 + theme_dark()

w1 + theme_light() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.title = element_text(size = 20, hjust=0.5))

ggsave(w1, file='plot1.png')
## Saving 7 x 5 in image
w2 <- w1 + theme_light() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.title = element_text(size = 20, hjust=0.5))


ggsave(w2, file='plot2.png')
## Saving 7 x 5 in image

plotly

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
ds <- data.frame(
  labels = c("A", "B", "C"),
  values = c(10, 20, 30))

plot_ly(ds, labels = ds$labels, values = ds$values, type = "pie", hole=0.6) %>%
layout(title = "Donut Chart Example")
library(plotly)
month<- c(1,2,3,4,5)
taipei <- c(92.5,132.6,168.8,159.1,218.7)
tainan <- c(21.2, 30.6, 37.3, 84.6, 184.3)

plot_ly(x = month, y = taipei, fill = "tozeroy", name="taipei",type='scatter', mode= 'markers') %>%
add_trace(x = month, y = tainan, fill = "tozeroy",name="tainan") %>%
layout(yaxis = list(title = 'rainfall') )
plot_ly(x = month, y = taipei, fill = "tozeroy", name="taipei",type='scatter', mode= 'markers') %>%
add_trace(x = month, y = tainan, fill = "tonexty",name="tainan") %>%
layout(yaxis = list(title = 'rainfall') )
data("diamonds")
diamonds
## # A tibble: 53,940 x 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  7 0.24  Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  8 0.26  Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  9 0.22  Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
## 10 0.23  Very Good H     VS1      59.4    61   338  4     4.05  2.39
## # ... with 53,930 more rows
d <- diamonds[sample(nrow(diamonds), 1000), ]

plot_ly(d, x = d$carat, y= d$price, color = d$clarity, type='scatter', mode= 'markers', size = d$carat, text=
paste("Clarity", d$clarity))
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.
data("economics")
p <- subplot(
plot_ly(economics, x = economics$date, y = economics$unemploy, type='scatter', mode = 'lines'),
plot_ly(economics, x = economics$date, y = economics$uempmed, type='scatter', mode = 'lines'),
margin=0.05
)
p %>% layout(showlegend=FALSE)

使用Tableau 做財經資訊視覺化

Decision Tree

data("iris")
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
library(rpart)
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris)

fit
## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *
plot(fit, margin = 0.1)
text(fit)

plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species)
abline(v = 2.45, col = 'blue')
abline(h = 1.75, col = 'orange')

predict(fit, data.frame(Sepal.Length = 3, Sepal.Width = 5, Petal.Length = 1, Petal.Width = 3), type = 'class')
##      1 
## setosa 
## Levels: setosa versicolor virginica
predicted <- predict(fit, iris, type = 'class')

sum(predicted == iris$Species) / length(iris$Species)
## [1] 0.96
table(iris$Species, predicted)
##             predicted
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         1
##   virginica       0          5        45

Logistic Regression

dataset <- iris[iris$Species!= 'setosa',]
dataset$Species <- factor(dataset$Species)

fit <- glm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = dataset, family = 'binomial')


predicted <- predict(fit, dataset)

predicted <- factor(ifelse(predicted >0, 'virginica', 'versicolor'))

sum(predicted == dataset$Species) / length(dataset$Species)
## [1] 0.98
table(dataset$Species,predicted )
##             predicted
##              versicolor virginica
##   versicolor         49         1
##   virginica           1        49

SVM

library(e1071)
fit <- svm(Species ~ ., data = iris)
predicted <- predict(fit, data = iris, type = 'class')

sum(iris$Species ==  predicted) / length(iris$Species)
## [1] 0.9733333
table(iris$Species, predicted)
##             predicted
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          2        48

預測客戶是否流失

library(C50)
data(churn)


names(churnTrain)
##  [1] "state"                         "account_length"               
##  [3] "area_code"                     "international_plan"           
##  [5] "voice_mail_plan"               "number_vmail_messages"        
##  [7] "total_day_minutes"             "total_day_calls"              
##  [9] "total_day_charge"              "total_eve_minutes"            
## [11] "total_eve_calls"               "total_eve_charge"             
## [13] "total_night_minutes"           "total_night_calls"            
## [15] "total_night_charge"            "total_intl_minutes"           
## [17] "total_intl_calls"              "total_intl_charge"            
## [19] "number_customer_service_calls" "churn"
churnTrain <- churnTrain[,4:20]


# Decision Tree
library(rpart)
fit <- rpart(churn ~ ., data = churnTrain)
plot(fit, margin = 0.1)
text(fit)

#install.packages('rpart.plot')
library(rpart.plot)
rpart.plot(fit)

predicted <- predict(fit, churnTrain, type= 'class')

sum(predicted == churnTrain$churn) / length(churnTrain$churn)
## [1] 0.9528953
table(churnTrain$churn, predicted)
##      predicted
##        yes   no
##   yes  361  122
##   no    35 2815
# 0.9528

# Logistic Regression
fit <-glm(churn ~ ., data = churnTrain, family = 'binomial')
predicted <- predict(fit, churnTrain)

predicted <- factor(ifelse(predicted >0, 'no', 'yes'))
sum(predicted == churnTrain$churn) / length(churnTrain$churn)
## [1] 0.8622862
# 0.86

# SVM
library(e1071)
fit <-svm(churn ~ ., data = churnTrain)
predicted <- predict(fit, churnTrain)

sum(predicted == churnTrain$churn) / length(churnTrain$churn)
## [1] 0.940294
# 0.94