作業五

library(readr)
## Warning: package 'readr' was built under R version 3.4.4
measles <- read_csv("https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/measles.csv")
## Parsed with column specification:
## cols(
##   確定病名 = col_character(),
##   發病年份 = col_integer(),
##   發病月份 = col_integer(),
##   縣市 = col_character(),
##   鄉鎮 = col_character(),
##   性別 = col_character(),
##   是否為境外移入 = col_character(),
##   年齡層 = col_character(),
##   確定病例數 = col_integer()
## )
head(measles)
## # A tibble: 6 x 9
##   確定病名 發病年份 發病月份 縣市   鄉鎮   性別  是否為境外移入 年齡層
##   <chr>       <int>    <int> <chr>  <chr>  <chr> <chr>          <chr> 
## 1 麻疹         2009        4 高雄市 小港區 M     否             20-24 
## 2 麻疹         2009        5 基隆市 中山區 M     否             30-34 
## 3 麻疹         2011        6 新北市 蘆洲區 F     是             25-29 
## 4 麻疹         2014        1 高雄市 三民區 F     是             0     
## 5 麻疹         2017        3 台北市 松山區 F     是             0     
## 6 麻疹         2018        4 桃園市 蘆竹區 F     否             30-34 
## # ... with 1 more variable: 確定病例數 <int>
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# method 1
dataset <- measles %>%
  mutate(發病時間 = as.Date(paste0(發病年份, '-', 發病月份, '-', '01'), format='%Y-%m-%d') ) %>%
  filter(發病時間>= '2010-01-01', 是否為境外移入 == '否') 
## Warning: package 'bindrcpp' was built under R version 3.4.4
# method 2
dataset <- measles %>%
  filter(發病年份 >= 2010 , 是否為境外移入 == '否')


## Answer 1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
head(dataset)
## # A tibble: 6 x 9
##   確定病名 發病年份 發病月份 縣市   鄉鎮   性別  是否為境外移入 年齡層
##   <chr>       <int>    <int> <chr>  <chr>  <chr> <chr>          <chr> 
## 1 麻疹         2018        4 桃園市 蘆竹區 F     否             30-34 
## 2 麻疹         2018        4 新北市 新店區 F     否             25-29 
## 3 麻疹         2011        2 台北市 文山區 F     否             20-24 
## 4 麻疹         2018        3 台北市 信義區 F     否             25-29 
## 5 麻疹         2011        3 連江縣 東引鄉 M     否             25-29 
## 6 麻疹         2014       10 南投縣 仁愛鄉 F     否             25-29 
## # ... with 1 more variable: 確定病例數 <int>
p <- ggplot(dataset, aes(x= factor(性別), weight = 確定病例數)) 
p + geom_bar(aes(fill= factor(性別)) )

## Answer 2
stat <- dataset %>%
  group_by(性別) %>%
  summarise(disease_cnt = sum(確定病例數) )

pie <- ggplot(stat, aes(x = '', y= disease_cnt, fill = 性別))

p1 <- pie + geom_bar(width = 1, stat= 'identity')
p1

p1 + coord_polar(theta = 'y', start = 0) + geom_text(aes(label = disease_cnt, y = c(70, 30)) )

## Answer 3
dataset <- measles %>%
  mutate(發病時間 = as.Date(paste0(發病年份, '-', 發病月份, '-', '01'), format='%Y-%m-%d') )

p<- ggplot(dataset, aes(x = 發病時間, y= 確定病例數 )) + geom_point() + geom_line()

p + facet_grid(.~年齡層 )
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

點估計與信賴區間

load("C:/Users/nc20/Downloads/cdc.Rdata")
head(cdc)
##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f
hist(cdc$height)

sample_means10 <- rep(NA , 5000 )
sample_means50 <- rep(NA , 5000 )
sample_means100 <- rep(NA , 5000 )

for (i in 1:5000 ){
samp <-sample(cdc$height, 10 )
sample_means10[i]=mean (samp)

samp <-sample(cdc$height, 50 )
sample_means50[i]=mean(samp)

samp <-sample (cdc$height , 100 )
sample_means100[i]=mean (samp )
}


hist(sample_means10)

hist(sample_means50)

hist(sample_means100)

par (mfrow =c(3, 1))
xlimits <-range (sample_means10 )
hist (sample_means10, breaks =20 , xlim=xlimits)

hist(sample_means50, breaks =20 , xlim=xlimits )
hist(sample_means100, breaks =20 , xlim=xlimits )

population <-cdc$height 
set.seed (123 )
samp <-sample(population,  50 )
sample_mean <-mean (samp)
hist(samp)

par(mfrow=c(1,2))

curve(dnorm (x, 0, 1), xlim =c(-3, 3))
cord.x <-c(-3, seq (-3, 1.5 , 0.01 ), 1.5 )
cord.y <-c(0, dnorm(seq (-3, 1.5 , 0.01 )),0)
polygon(cord.x,cord.y, col="grey" )

curve (pnorm (x, 0, 1), xlim =c(-3, 3),  main ="F(z)=P(Z<=z)" )

pnorm(1.5)
## [1] 0.9331928
pnorm(-1.5)
## [1] 0.0668072
curve(dnorm (x, 0, 1), xlim =c(-3, 3))
cord.x <-c(-1.5, seq (-1.5, 1.5 , 0.01 ), 1.5 )
cord.y <-c(0, dnorm(seq (-1.5, 1.5 , 0.01 )),0)
polygon(cord.x,cord.y, col="grey" )

pnorm(1.5) - pnorm(-1.5)
## [1] 0.8663856
pnorm(1.96) - pnorm(-1.96)
## [1] 0.9500042
curve(dnorm (x, 0, 1), xlim =c(-3, 3))
cord.x <-c(-1.96, seq (-1.96, 1.96 , 0.01 ), 1.96 )
cord.y <-c(0, dnorm(seq (-1.96, 1.96 , 0.01 )),0)
polygon(cord.x,cord.y, col="grey" )

samp <-sample(cdc$height, 50 )
sample_mean <- mean(samp)
sde  <- sd(samp) / sqrt(50)
lower <- sample_mean - 1.96 * sde
upper <- sample_mean + 1.96 * sde
lower
## [1] 65.28906
upper
## [1] 67.63094
mean(cdc$height)
## [1] 67.1829
hist(samp)
abline(v = upper , col ='red')
abline(v = lower , col ='red')
abline(v = mean(cdc$height), col= 'blue')

set.seed(123)
samp <-sample(cdc$height, 10 )
sample_mean <- mean(samp)
sde  <- sd(samp) / sqrt(10)
lower10 <- sample_mean - 1.96 * sde
upper10 <- sample_mean + 1.96 * sde

set.seed(123)
samp <-sample(cdc$height, 50 )
sample_mean <- mean(samp)
sde  <- sd(samp) / sqrt(50)
lower50 <- sample_mean - 1.96 * sde
upper50 <- sample_mean + 1.96 * sde

set.seed(123)
samp <-sample(cdc$height, 100 )
sample_mean <- mean(samp)
sde  <- sd(samp) / sqrt(100)
lower100 <- sample_mean - 1.96 * sde
upper100 <- sample_mean + 1.96 * sde

list(lower10, upper10, lower50, upper50, lower100, upper100)
## [[1]]
## [1] 63.02933
## 
## [[2]]
## [1] 69.17067
## 
## [[3]]
## [1] 65.54618
## 
## [[4]]
## [1] 68.29382
## 
## [[5]]
## [1] 66.66416
## 
## [[6]]
## [1] 68.41584
x <- c(0,3,6)
y <- c(0,5,0)
plot(x,y, type = 'n')
polygon(x,y, col='red')

curve(dnorm (x, 0, 1), xlim =c(-3, 3))
#seq (-1.96, 1.96 , 0.01 )

cord.x <-c(-1.96, seq (-1.96, 1.96 , 0.01 ), 1.96 )
#dnorm(seq(-1.06,1.96, 0.01))
cord.y <-c(0, dnorm(seq (-1.96, 1.96 , 0.01 )),0)
polygon(cord.x,cord.y, col="grey" )

## 多變量分析

str(cdc)
## 'data.frame':    20000 obs. of  9 variables:
##  $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
##  $ exerany : num  0 0 1 1 0 1 1 0 0 1 ...
##  $ hlthplan: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ smoke100: num  0 1 1 0 0 0 0 0 1 0 ...
##  $ height  : num  70 64 60 66 61 64 71 67 65 70 ...
##  $ weight  : int  175 125 105 132 150 114 194 170 150 180 ...
##  $ wtdesire: int  175 115 105 124 130 114 185 160 130 170 ...
##  $ age     : int  77 33 49 42 55 55 31 45 27 44 ...
##  $ gender  : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
numeric_dataset  <- cdc[,c('height', 'weight', 'wtdesire', 'age')]

#numeric_dataset

cov(numeric_dataset)
##              height      weight   wtdesire        age
## height    17.023499   91.834880  100.13654  -8.879927
## weight    91.834880 1606.484154 1026.56638   1.108694
## wtdesire 100.136542 1026.566383 1024.85178 -13.769994
## age       -8.879927    1.108694  -13.76999 295.588571
sum((numeric_dataset$weight - mean(numeric_dataset$weight)) * (numeric_dataset$height - mean(numeric_dataset$height))) / nrow(numeric_dataset)
## [1] 91.83029
cor(numeric_dataset)
##              height      weight    wtdesire          age
## height    1.0000000 0.555322192  0.75811946 -0.125181791
## weight    0.5553222 1.000000000  0.80005213  0.001608902
## wtdesire  0.7581195 0.800052128  1.00000000 -0.025018392
## age      -0.1251818 0.001608902 -0.02501839  1.000000000
cov(numeric_dataset[, c('weight', 'wtdesire')]) / (sd(numeric_dataset$weight) * sd(numeric_dataset$wtdesire)  )
##             weight  wtdesire
## weight   1.2520097 0.8000521
## wtdesire 0.8000521 0.7987159
heatmap(cor(numeric_dataset))

library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.4
co <- cor(numeric_dataset)
co
##              height      weight    wtdesire          age
## height    1.0000000 0.555322192  0.75811946 -0.125181791
## weight    0.5553222 1.000000000  0.80005213  0.001608902
## wtdesire  0.7581195 0.800052128  1.00000000 -0.025018392
## age      -0.1251818 0.001608902 -0.02501839  1.000000000
melt(co)
##        Var1     Var2        value
## 1    height   height  1.000000000
## 2    weight   height  0.555322192
## 3  wtdesire   height  0.758119462
## 4       age   height -0.125181791
## 5    height   weight  0.555322192
## 6    weight   weight  1.000000000
## 7  wtdesire   weight  0.800052128
## 8       age   weight  0.001608902
## 9    height wtdesire  0.758119462
## 10   weight wtdesire  0.800052128
## 11 wtdesire wtdesire  1.000000000
## 12      age wtdesire -0.025018392
## 13   height      age -0.125181791
## 14   weight      age  0.001608902
## 15 wtdesire      age -0.025018392
## 16      age      age  1.000000000
qplot(x=Var1, y=Var2, data=melt(co), fill=value, geom="tile")

Regression Analysis

co <- cor(numeric_dataset)
heatmap(co)

plot(wtdesire ~ weight, data = cdc)
fit <- lm(wtdesire ~ weight, data = cdc)
fit
## 
## Call:
## lm(formula = wtdesire ~ weight, data = cdc)
## 
## Coefficients:
## (Intercept)       weight  
##      46.664        0.639
summary(fit)
## 
## Call:
## lm(formula = wtdesire ~ weight, data = cdc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -167.98   -9.32    0.08   11.51  518.31 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 46.664015   0.590782   78.99   <2e-16 ***
## weight       0.639014   0.003388  188.59   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.21 on 19998 degrees of freedom
## Multiple R-squared:  0.6401, Adjusted R-squared:  0.6401 
## F-statistic: 3.556e+04 on 1 and 19998 DF,  p-value: < 2.2e-16
plot(wtdesire ~ weight, data = numeric_dataset)
abline(fit, col='red')