library(readr)
## Warning: package 'readr' was built under R version 3.4.4
measles <- read_csv("https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/measles.csv")
## Parsed with column specification:
## cols(
## 確定病名 = col_character(),
## 發病年份 = col_integer(),
## 發病月份 = col_integer(),
## 縣市 = col_character(),
## 鄉鎮 = col_character(),
## 性別 = col_character(),
## 是否為境外移入 = col_character(),
## 年齡層 = col_character(),
## 確定病例數 = col_integer()
## )
head(measles)
## # A tibble: 6 x 9
## 確定病名 發病年份 發病月份 縣市 鄉鎮 性別 是否為境外移入 年齡層
## <chr> <int> <int> <chr> <chr> <chr> <chr> <chr>
## 1 麻疹 2009 4 高雄市 小港區 M 否 20-24
## 2 麻疹 2009 5 基隆市 中山區 M 否 30-34
## 3 麻疹 2011 6 新北市 蘆洲區 F 是 25-29
## 4 麻疹 2014 1 高雄市 三民區 F 是 0
## 5 麻疹 2017 3 台北市 松山區 F 是 0
## 6 麻疹 2018 4 桃園市 蘆竹區 F 否 30-34
## # ... with 1 more variable: 確定病例數 <int>
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# method 1
dataset <- measles %>%
mutate(發病時間 = as.Date(paste0(發病年份, '-', 發病月份, '-', '01'), format='%Y-%m-%d') ) %>%
filter(發病時間>= '2010-01-01', 是否為境外移入 == '否')
## Warning: package 'bindrcpp' was built under R version 3.4.4
# method 2
dataset <- measles %>%
filter(發病年份 >= 2010 , 是否為境外移入 == '否')
## Answer 1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
head(dataset)
## # A tibble: 6 x 9
## 確定病名 發病年份 發病月份 縣市 鄉鎮 性別 是否為境外移入 年齡層
## <chr> <int> <int> <chr> <chr> <chr> <chr> <chr>
## 1 麻疹 2018 4 桃園市 蘆竹區 F 否 30-34
## 2 麻疹 2018 4 新北市 新店區 F 否 25-29
## 3 麻疹 2011 2 台北市 文山區 F 否 20-24
## 4 麻疹 2018 3 台北市 信義區 F 否 25-29
## 5 麻疹 2011 3 連江縣 東引鄉 M 否 25-29
## 6 麻疹 2014 10 南投縣 仁愛鄉 F 否 25-29
## # ... with 1 more variable: 確定病例數 <int>
p <- ggplot(dataset, aes(x= factor(性別), weight = 確定病例數))
p + geom_bar(aes(fill= factor(性別)) )
## Answer 2
stat <- dataset %>%
group_by(性別) %>%
summarise(disease_cnt = sum(確定病例數) )
pie <- ggplot(stat, aes(x = '', y= disease_cnt, fill = 性別))
p1 <- pie + geom_bar(width = 1, stat= 'identity')
p1
p1 + coord_polar(theta = 'y', start = 0) + geom_text(aes(label = disease_cnt, y = c(70, 30)) )
## Answer 3
dataset <- measles %>%
mutate(發病時間 = as.Date(paste0(發病年份, '-', 發病月份, '-', '01'), format='%Y-%m-%d') )
p<- ggplot(dataset, aes(x = 發病時間, y= 確定病例數 )) + geom_point() + geom_line()
p + facet_grid(.~年齡層 )
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
load("C:/Users/nc20/Downloads/cdc.Rdata")
head(cdc)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 2 good 0 1 1 64 125 115 33 f
## 3 good 1 1 1 60 105 105 49 f
## 4 good 1 1 0 66 132 124 42 f
## 5 very good 0 1 0 61 150 130 55 f
## 6 very good 1 1 0 64 114 114 55 f
hist(cdc$height)
sample_means10 <- rep(NA , 5000 )
sample_means50 <- rep(NA , 5000 )
sample_means100 <- rep(NA , 5000 )
for (i in 1:5000 ){
samp <-sample(cdc$height, 10 )
sample_means10[i]=mean (samp)
samp <-sample(cdc$height, 50 )
sample_means50[i]=mean(samp)
samp <-sample (cdc$height , 100 )
sample_means100[i]=mean (samp )
}
hist(sample_means10)
hist(sample_means50)
hist(sample_means100)
par (mfrow =c(3, 1))
xlimits <-range (sample_means10 )
hist (sample_means10, breaks =20 , xlim=xlimits)
hist(sample_means50, breaks =20 , xlim=xlimits )
hist(sample_means100, breaks =20 , xlim=xlimits )
population <-cdc$height
set.seed (123 )
samp <-sample(population, 50 )
sample_mean <-mean (samp)
hist(samp)
par(mfrow=c(1,2))
curve(dnorm (x, 0, 1), xlim =c(-3, 3))
cord.x <-c(-3, seq (-3, 1.5 , 0.01 ), 1.5 )
cord.y <-c(0, dnorm(seq (-3, 1.5 , 0.01 )),0)
polygon(cord.x,cord.y, col="grey" )
curve (pnorm (x, 0, 1), xlim =c(-3, 3), main ="F(z)=P(Z<=z)" )
pnorm(1.5)
## [1] 0.9331928
pnorm(-1.5)
## [1] 0.0668072
curve(dnorm (x, 0, 1), xlim =c(-3, 3))
cord.x <-c(-1.5, seq (-1.5, 1.5 , 0.01 ), 1.5 )
cord.y <-c(0, dnorm(seq (-1.5, 1.5 , 0.01 )),0)
polygon(cord.x,cord.y, col="grey" )
pnorm(1.5) - pnorm(-1.5)
## [1] 0.8663856
pnorm(1.96) - pnorm(-1.96)
## [1] 0.9500042
curve(dnorm (x, 0, 1), xlim =c(-3, 3))
cord.x <-c(-1.96, seq (-1.96, 1.96 , 0.01 ), 1.96 )
cord.y <-c(0, dnorm(seq (-1.96, 1.96 , 0.01 )),0)
polygon(cord.x,cord.y, col="grey" )
samp <-sample(cdc$height, 50 )
sample_mean <- mean(samp)
sde <- sd(samp) / sqrt(50)
lower <- sample_mean - 1.96 * sde
upper <- sample_mean + 1.96 * sde
lower
## [1] 65.28906
upper
## [1] 67.63094
mean(cdc$height)
## [1] 67.1829
hist(samp)
abline(v = upper , col ='red')
abline(v = lower , col ='red')
abline(v = mean(cdc$height), col= 'blue')
set.seed(123)
samp <-sample(cdc$height, 10 )
sample_mean <- mean(samp)
sde <- sd(samp) / sqrt(10)
lower10 <- sample_mean - 1.96 * sde
upper10 <- sample_mean + 1.96 * sde
set.seed(123)
samp <-sample(cdc$height, 50 )
sample_mean <- mean(samp)
sde <- sd(samp) / sqrt(50)
lower50 <- sample_mean - 1.96 * sde
upper50 <- sample_mean + 1.96 * sde
set.seed(123)
samp <-sample(cdc$height, 100 )
sample_mean <- mean(samp)
sde <- sd(samp) / sqrt(100)
lower100 <- sample_mean - 1.96 * sde
upper100 <- sample_mean + 1.96 * sde
list(lower10, upper10, lower50, upper50, lower100, upper100)
## [[1]]
## [1] 63.02933
##
## [[2]]
## [1] 69.17067
##
## [[3]]
## [1] 65.54618
##
## [[4]]
## [1] 68.29382
##
## [[5]]
## [1] 66.66416
##
## [[6]]
## [1] 68.41584
x <- c(0,3,6)
y <- c(0,5,0)
plot(x,y, type = 'n')
polygon(x,y, col='red')
curve(dnorm (x, 0, 1), xlim =c(-3, 3))
#seq (-1.96, 1.96 , 0.01 )
cord.x <-c(-1.96, seq (-1.96, 1.96 , 0.01 ), 1.96 )
#dnorm(seq(-1.06,1.96, 0.01))
cord.y <-c(0, dnorm(seq (-1.96, 1.96 , 0.01 )),0)
polygon(cord.x,cord.y, col="grey" )
## 多變量分析
str(cdc)
## 'data.frame': 20000 obs. of 9 variables:
## $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
## $ exerany : num 0 0 1 1 0 1 1 0 0 1 ...
## $ hlthplan: num 1 1 1 1 1 1 1 1 1 1 ...
## $ smoke100: num 0 1 1 0 0 0 0 0 1 0 ...
## $ height : num 70 64 60 66 61 64 71 67 65 70 ...
## $ weight : int 175 125 105 132 150 114 194 170 150 180 ...
## $ wtdesire: int 175 115 105 124 130 114 185 160 130 170 ...
## $ age : int 77 33 49 42 55 55 31 45 27 44 ...
## $ gender : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
numeric_dataset <- cdc[,c('height', 'weight', 'wtdesire', 'age')]
#numeric_dataset
cov(numeric_dataset)
## height weight wtdesire age
## height 17.023499 91.834880 100.13654 -8.879927
## weight 91.834880 1606.484154 1026.56638 1.108694
## wtdesire 100.136542 1026.566383 1024.85178 -13.769994
## age -8.879927 1.108694 -13.76999 295.588571
sum((numeric_dataset$weight - mean(numeric_dataset$weight)) * (numeric_dataset$height - mean(numeric_dataset$height))) / nrow(numeric_dataset)
## [1] 91.83029
cor(numeric_dataset)
## height weight wtdesire age
## height 1.0000000 0.555322192 0.75811946 -0.125181791
## weight 0.5553222 1.000000000 0.80005213 0.001608902
## wtdesire 0.7581195 0.800052128 1.00000000 -0.025018392
## age -0.1251818 0.001608902 -0.02501839 1.000000000
cov(numeric_dataset[, c('weight', 'wtdesire')]) / (sd(numeric_dataset$weight) * sd(numeric_dataset$wtdesire) )
## weight wtdesire
## weight 1.2520097 0.8000521
## wtdesire 0.8000521 0.7987159
heatmap(cor(numeric_dataset))
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.4
co <- cor(numeric_dataset)
co
## height weight wtdesire age
## height 1.0000000 0.555322192 0.75811946 -0.125181791
## weight 0.5553222 1.000000000 0.80005213 0.001608902
## wtdesire 0.7581195 0.800052128 1.00000000 -0.025018392
## age -0.1251818 0.001608902 -0.02501839 1.000000000
melt(co)
## Var1 Var2 value
## 1 height height 1.000000000
## 2 weight height 0.555322192
## 3 wtdesire height 0.758119462
## 4 age height -0.125181791
## 5 height weight 0.555322192
## 6 weight weight 1.000000000
## 7 wtdesire weight 0.800052128
## 8 age weight 0.001608902
## 9 height wtdesire 0.758119462
## 10 weight wtdesire 0.800052128
## 11 wtdesire wtdesire 1.000000000
## 12 age wtdesire -0.025018392
## 13 height age -0.125181791
## 14 weight age 0.001608902
## 15 wtdesire age -0.025018392
## 16 age age 1.000000000
qplot(x=Var1, y=Var2, data=melt(co), fill=value, geom="tile")
co <- cor(numeric_dataset)
heatmap(co)
plot(wtdesire ~ weight, data = cdc)
fit <- lm(wtdesire ~ weight, data = cdc)
fit
##
## Call:
## lm(formula = wtdesire ~ weight, data = cdc)
##
## Coefficients:
## (Intercept) weight
## 46.664 0.639
summary(fit)
##
## Call:
## lm(formula = wtdesire ~ weight, data = cdc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -167.98 -9.32 0.08 11.51 518.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.664015 0.590782 78.99 <2e-16 ***
## weight 0.639014 0.003388 188.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.21 on 19998 degrees of freedom
## Multiple R-squared: 0.6401, Adjusted R-squared: 0.6401
## F-statistic: 3.556e+04 on 1 and 19998 DF, p-value: < 2.2e-16
plot(wtdesire ~ weight, data = numeric_dataset)
abline(fit, col='red')