url <-'https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/cdc.Rdata'
curl::curl_download(url, destfile = 'cdc.Rdata')
load('cdc.Rdata')
class(cdc)
## [1] "data.frame"
names(cdc)
## [1] "genhlth" "exerany" "hlthplan" "smoke100" "height" "weight" "wtdesire"
## [8] "age" "gender"
str(cdc)
## 'data.frame': 20000 obs. of 9 variables:
## $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
## $ exerany : num 0 0 1 1 0 1 1 0 0 1 ...
## $ hlthplan: num 1 1 1 1 1 1 1 1 1 1 ...
## $ smoke100: num 0 1 1 0 0 0 0 0 1 0 ...
## $ height : num 70 64 60 66 61 64 71 67 65 70 ...
## $ weight : int 175 125 105 132 150 114 194 170 150 180 ...
## $ wtdesire: int 175 115 105 124 130 114 185 160 130 170 ...
## $ age : int 77 33 49 42 55 55 31 45 27 44 ...
## $ gender : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
head(cdc)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 2 good 0 1 1 64 125 115 33 f
## 3 good 1 1 1 60 105 105 49 f
## 4 good 1 1 0 66 132 124 42 f
## 5 very good 0 1 0 61 150 130 55 f
## 6 very good 1 1 0 64 114 114 55 f
summary(cdc)
## genhlth exerany hlthplan smoke100
## excellent:4657 Min. :0.0000 Min. :0.0000 Min. :0.0000
## very good:6972 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## good :5675 Median :1.0000 Median :1.0000 Median :0.0000
## fair :2019 Mean :0.7457 Mean :0.8738 Mean :0.4721
## poor : 677 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## height weight wtdesire age gender
## Min. :48.00 Min. : 68.0 Min. : 68.0 Min. :18.00 m: 9569
## 1st Qu.:64.00 1st Qu.:140.0 1st Qu.:130.0 1st Qu.:31.00 f:10431
## Median :67.00 Median :165.0 Median :150.0 Median :43.00
## Mean :67.18 Mean :169.7 Mean :155.1 Mean :45.07
## 3rd Qu.:70.00 3rd Qu.:190.0 3rd Qu.:175.0 3rd Qu.:57.00
## Max. :93.00 Max. :500.0 Max. :680.0 Max. :99.00
數據的中心
sum(cdc$height) / length(cdc$height)
## [1] 67.1829
mean(cdc$height)
## [1] 67.1829
salary <- c(100,120,130,110,90,95)
mean(salary)
## [1] 107.5
sort(salary)
## [1] 90 95 100 110 120 130
(100 + 110) / 2
## [1] 105
median(salary)
## [1] 105
salary2 <- c(salary, 10000)
mean(salary2)
## [1] 1520.714
sort(salary2)
## [1] 90 95 100 110 120 130 10000
median(salary2)
## [1] 110
patients <- c('台北區', '中區', '中區', '中區', '南區', '台北區')
table(patients)
## patients
## 中區 南區 台北區
## 3 1 2
mean(cdc$weight)
## [1] 169.683
median(cdc$weight)
## [1] 165
#sort(table(cdc$weight), decreasing = TRUE)
table(cdc$smoke100)
##
## 0 1
## 10559 9441
length(cdc$smoke100)
## [1] 20000
table(cdc$smoke100) / length(cdc$smoke100)
##
## 0 1
## 0.52795 0.47205
barplot(table(cdc$smoke100))

pie(table(cdc$smoke100))

gender_smokers <- table(cdc$gender, cdc$smoke100)
gender_smokers
##
## 0 1
## m 4547 5022
## f 6012 4419
mosaicplot(gender_smokers)

數據離度
a <- c(150, 155, 160, 162, 168, 171, 173, 175, 178, 182, 185)
sort(a)
## [1] 150 155 160 162 168 171 173 175 178 182 185
median(a)
## [1] 171
quantile(a,0.25)
## 25%
## 161
(160 + 162) / 2
## [1] 161
quantile(a,0.75)
## 75%
## 176.5
(175 + 178) / 2
## [1] 176.5
quantile(a,0.75) - quantile(a,0.25)
## 75%
## 15.5
IQR(a)
## [1] 15.5
a <- c(150, 155, 160, 162, 168, 171, 173, 175, 178, 182, 185)
boxplot(a)

boxplot(cdc$weight)

#height = gender * X + b
boxplot(cdc$height ~ cdc $gender)

temp <- sample(c(34,38), 100, replace = TRUE)
boxplot(temp)

boxplot(c(temp, 999,999))

bmi <- (cdc$weight/cdc$height^2) * 703
boxplot(bmi ~ cdc$genhlth)

hist(cdc$height, breaks = 50)

hist(cdc$height, breaks = 100)

#stem(cdc$weight)
plot(cdc$weight, cdc$wtdesire)

url <- 'https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/who_sample.xlsx'
curl::curl_download(url, destfile = 'who_sample.xlsx')
library(readxl)
who_sample <- read_excel("who_sample.xlsx")
head(who_sample)
## # A tibble: 6 x 3
## date link title
## <chr> <chr> <chr>
## 1 6 Septemb… http://www.who.int/entity/csr/do… Middle East respiratory syndrome…
## 2 5 Septemb… http://www.who.int/entity/csr/do… Human infection with avian influ…
## 3 30 August… http://www.who.int/entity/csr/do… Yellow fever ??? France ??? Fren…
## 4 28 August… http://www.who.int/entity/csr/do… Middle East respiratory syndrome…
## 5 25 August… http://www.who.int/entity/csr/do… Chikungunya ??? France
## 6 17 August… http://www.who.int/entity/csr/do… Middle East respiratory syndrome…
#install.packages("googlesheets4")
library(googlesheets4)
gs4_deauth()
read_sheet("https://docs.google.com/spreadsheets/d/1U6Cf_qEOhiR9AZqTqS3mbMF3zt2db48ZP5v3rkrAEJY/edit#gid=780868077")
## Reading from "gapminder"
## Range "Africa"
## # A tibble: 624 x 6
## country continent year lifeExp pop gdpPercap
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Algeria Africa 1952 43.1 9279525 2449.
## 2 Algeria Africa 1957 45.7 10270856 3014.
## 3 Algeria Africa 1962 48.3 11000948 2551.
## 4 Algeria Africa 1967 51.4 12760499 3247.
## 5 Algeria Africa 1972 54.5 14760787 4183.
## 6 Algeria Africa 1977 58.0 17152804 4910.
## 7 Algeria Africa 1982 61.4 20033753 5745.
## 8 Algeria Africa 1987 65.8 23254956 5681.
## 9 Algeria Africa 1992 67.7 26298373 5023.
## 10 Algeria Africa 1997 69.2 29072015 4797.
## # … with 614 more rows
sd(cdc$weight)
## [1] 40.08097
sqrt(var(cdc$weight))
## [1] 40.08097
sd(cdc$weight) ^ 2
## [1] 1606.484
var(cdc$weight)
## [1] 1606.484
contender1 <- c(8.4,8.6,8.8,9,9,9.2,9.7,10.1,10.4,10.3,10.5,10.6,11.0,11.1,11.4,11.7,11.9,12.3,12.8,13,13,14.2,14.4,14.6)
contender2 <- c(9.8,9.8,9.9,10.1,10.1,10.2,10.2,10.3,10.3,10.7,10.8,10.8,11,11.1,11.2,11.2,11.3,11.6,11.7,11.7,11.8,11.8,11.9,11.9)
summary(contender1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.400 9.575 10.800 11.083 12.425 14.600
summary(contender2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.80 10.20 10.90 10.88 11.62 11.90
combined <- cbind(contender1,contender2)
boxplot(combined)

sd(contender1)
## [1] 1.880718
sd(contender2)
## [1] 0.7293038
url <- 'https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/blood_lead.RData'
curl::curl_download(url, destfile= 'blood_lead.RData')
load('blood_lead.RData')
View(blood_lead)
str(blood_lead)
## tibble [41 × 3] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:41] 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : num [1:41] 3 4 6 7 9 10 11 12 13 14 ...
## $ Blood lead levels: num [1:41] 69 45 49 84 48 58 17 76 61 78 ...
#blood_lead$Blood lead levels
#backticks
#blood_lead$`Blood lead levels`
#blood_lead[,3]
mean(blood_lead$`Blood lead levels`)
## [1] NA
a <- c(50,60,70)
mean(a)
## [1] 60
a2 <- c(50,60,70, NA)
mean(a2)
## [1] NA
?mean
mean(blood_lead$`Blood lead levels`, na.rm = TRUE)
## [1] 60.5641
summary(blood_lead$`Blood lead levels`)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 17.00 48.00 58.00 60.56 75.00 104.00 2
sd(blood_lead$`Blood lead levels`, na.rm = TRUE)
## [1] 19.58624
boxplot(blood_lead$`Blood lead levels`)

hist(blood_lead$`Blood lead levels`)

機率
sample(1:10)
## [1] 3 8 4 2 10 6 1 5 7 9
sample(1:10, size = 5)
## [1] 2 1 9 5 10
sample(1:42, size = 6)
## [1] 13 36 39 8 38 41
sample(c(0,1), 10, replace = TRUE)
## [1] 1 1 1 1 1 1 0 1 0 1
sample.int(20, 12)
## [1] 8 15 17 10 1 6 20 3 19 12 14 7
coins <- c("heads", "tails")
fair_coin <- sample(coins, size = 100, replace = TRUE)
table(fair_coin)
## fair_coin
## heads tails
## 53 47
unfair_coin <- sample(coins, size = 100,
replace = TRUE, prob = c(0.3,0.7))
table(unfair_coin)
## unfair_coin
## heads tails
## 36 64
hist(cdc$height)

rnorm(100)
## [1] -0.10719872 1.51164816 0.79434792 -0.58791031 0.50834121 -1.07459888
## [7] -0.84651193 0.41535356 0.46391627 0.48219395 -0.50865286 1.27263219
## [13] 0.09054305 -1.00316624 -0.64567996 1.00224294 0.14307319 0.89262443
## [19] -0.51476186 -0.88866637 -0.75052786 1.70604517 -0.21618354 0.34385283
## [25] 1.44784747 -0.56050831 1.10230969 -1.69040010 0.53514112 0.46258720
## [31] -0.53129102 1.93258720 0.60521479 -0.59184381 0.30669698 -0.29198592
## [37] -0.30992558 -1.06338071 1.28355743 -0.29066316 -0.62726656 0.55473927
## [43] 0.63108132 1.62576634 1.06286579 0.71787634 1.20645647 0.92469491
## [49] -0.32822751 1.79254985 0.44608581 -0.41832405 -0.63040556 -1.25124230
## [55] -0.51056725 -0.29374759 0.78963124 0.16513179 0.08176565 -0.55607733
## [61] 0.39954209 0.03109339 0.74103296 -0.41646506 0.14225788 1.63435126
## [67] -1.39408709 -0.45337765 -0.54677123 0.13052387 -0.28739547 -1.00457872
## [73] -0.44467993 0.12522951 -0.34114693 0.52924171 0.88546444 -0.91040710
## [79] 1.96274589 -0.21681848 0.40161561 -0.52508496 -0.56031488 1.97246500
## [85] -1.59658523 -0.64885500 0.08465384 0.88313958 -0.85281007 0.14083823
## [91] -0.18303673 -1.97129543 0.53020607 2.03501273 -0.77943377 -0.31884636
## [97] 0.80931600 -0.66464365 -0.13394031 1.62798258
hist(rnorm(100))

curve(dnorm,-3,3)

dnorm(0)
## [1] 0.3989423
dnorm(1)
## [1] 0.2419707
dnorm(180, mean = 175,sd = 5)
## [1] 0.04839414
pnorm(0)
## [1] 0.5
pnorm(180, mean = 175,sd = 5)
## [1] 0.8413447
pnorm(11500, mean = 10000, sd = 1000)
## [1] 0.9331928
pnorm(30000, mean = 10000, sd = 1000)
## [1] 1
pnorm(11000, mean = 10000, sd = 1000) - pnorm(9000, mean = 10000, sd = 1000)
## [1] 0.6826895
pnorm(12000, mean = 10000, sd = 1000) - pnorm(8000, mean = 10000, sd = 1000)
## [1] 0.9544997
pnorm(13000, mean = 10000, sd = 1000) - pnorm(7000, mean = 10000, sd = 1000)
## [1] 0.9973002
set.seed(50)
y = runif(100,0,5)
hist(y)

set.seed(50)
y = runif(10000,0,5)
hist(y)

hist(rpois(5000,3))

1 - pnorm(207, mean = 165.3 , sd = 5.9)
## [1] 7.870371e-13
cov(cdc$height, cdc$weight)
## [1] 91.83488
cor(cdc$height, cdc$weight)
## [1] 0.5553222
numeric_dataset <- cdc[,c('height', 'weight', 'wtdesire', 'age')]
cor(numeric_dataset)
## height weight wtdesire age
## height 1.0000000 0.555322192 0.75811946 -0.125181791
## weight 0.5553222 1.000000000 0.80005213 0.001608902
## wtdesire 0.7581195 0.800052128 1.00000000 -0.025018392
## age -0.1251818 0.001608902 -0.02501839 1.000000000
cov(numeric_dataset)
## height weight wtdesire age
## height 17.023499 91.834880 100.13654 -8.879927
## weight 91.834880 1606.484154 1026.56638 1.108694
## wtdesire 100.136542 1026.566383 1024.85178 -13.769994
## age -8.879927 1.108694 -13.76999 295.588571
heatmap(cor(numeric_dataset))
