url <-'https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/cdc.Rdata'
curl::curl_download(url, destfile = 'cdc.Rdata')
load('cdc.Rdata')
class(cdc)
## [1] "data.frame"
names(cdc)
## [1] "genhlth"  "exerany"  "hlthplan" "smoke100" "height"   "weight"   "wtdesire"
## [8] "age"      "gender"
str(cdc)
## 'data.frame':    20000 obs. of  9 variables:
##  $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
##  $ exerany : num  0 0 1 1 0 1 1 0 0 1 ...
##  $ hlthplan: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ smoke100: num  0 1 1 0 0 0 0 0 1 0 ...
##  $ height  : num  70 64 60 66 61 64 71 67 65 70 ...
##  $ weight  : int  175 125 105 132 150 114 194 170 150 180 ...
##  $ wtdesire: int  175 115 105 124 130 114 185 160 130 170 ...
##  $ age     : int  77 33 49 42 55 55 31 45 27 44 ...
##  $ gender  : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
head(cdc)
##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f
summary(cdc)
##       genhlth        exerany          hlthplan         smoke100     
##  excellent:4657   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  very good:6972   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  good     :5675   Median :1.0000   Median :1.0000   Median :0.0000  
##  fair     :2019   Mean   :0.7457   Mean   :0.8738   Mean   :0.4721  
##  poor     : 677   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      height          weight         wtdesire          age        gender   
##  Min.   :48.00   Min.   : 68.0   Min.   : 68.0   Min.   :18.00   m: 9569  
##  1st Qu.:64.00   1st Qu.:140.0   1st Qu.:130.0   1st Qu.:31.00   f:10431  
##  Median :67.00   Median :165.0   Median :150.0   Median :43.00            
##  Mean   :67.18   Mean   :169.7   Mean   :155.1   Mean   :45.07            
##  3rd Qu.:70.00   3rd Qu.:190.0   3rd Qu.:175.0   3rd Qu.:57.00            
##  Max.   :93.00   Max.   :500.0   Max.   :680.0   Max.   :99.00

數據的中心

sum(cdc$height) / length(cdc$height)
## [1] 67.1829
mean(cdc$height)
## [1] 67.1829
salary <- c(100,120,130,110,90,95)
mean(salary)
## [1] 107.5
sort(salary)
## [1]  90  95 100 110 120 130
(100 + 110) / 2 
## [1] 105
median(salary)
## [1] 105
salary2 <- c(salary, 10000)
mean(salary2)
## [1] 1520.714
sort(salary2)
## [1]    90    95   100   110   120   130 10000
median(salary2)
## [1] 110
patients <- c('台北區', '中區', '中區', '中區', '南區', '台北區')
table(patients)
## patients
##   中區   南區 台北區 
##      3      1      2
mean(cdc$weight)
## [1] 169.683
median(cdc$weight)
## [1] 165
#sort(table(cdc$weight), decreasing = TRUE)

table(cdc$smoke100)
## 
##     0     1 
## 10559  9441
length(cdc$smoke100)
## [1] 20000
table(cdc$smoke100) / length(cdc$smoke100)
## 
##       0       1 
## 0.52795 0.47205
barplot(table(cdc$smoke100))

pie(table(cdc$smoke100))

gender_smokers <- table(cdc$gender, cdc$smoke100)
gender_smokers
##    
##        0    1
##   m 4547 5022
##   f 6012 4419
mosaicplot(gender_smokers)

數據離度

a <- c(150, 155, 160, 162, 168, 171, 173, 175, 178, 182, 185)
sort(a)
##  [1] 150 155 160 162 168 171 173 175 178 182 185
median(a)
## [1] 171
quantile(a,0.25)
## 25% 
## 161
(160 + 162) / 2
## [1] 161
quantile(a,0.75)
##   75% 
## 176.5
(175 + 178) / 2  
## [1] 176.5
quantile(a,0.75) - quantile(a,0.25)
##  75% 
## 15.5
IQR(a)
## [1] 15.5
a <- c(150, 155, 160, 162, 168, 171, 173, 175, 178, 182, 185)

boxplot(a)

boxplot(cdc$weight)

#height = gender * X + b
boxplot(cdc$height ~ cdc $gender)

temp <- sample(c(34,38), 100, replace = TRUE)
boxplot(temp)

boxplot(c(temp, 999,999))

bmi <-  (cdc$weight/cdc$height^2) * 703
boxplot(bmi ~ cdc$genhlth)

hist(cdc$height, breaks = 50)

hist(cdc$height, breaks = 100)

#stem(cdc$weight)
plot(cdc$weight, cdc$wtdesire)

url <- 'https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/who_sample.xlsx'

curl::curl_download(url, destfile = 'who_sample.xlsx')

library(readxl)
who_sample <- read_excel("who_sample.xlsx")

head(who_sample)
## # A tibble: 6 x 3
##   date       link                              title                            
##   <chr>      <chr>                             <chr>                            
## 1 6 Septemb… http://www.who.int/entity/csr/do… Middle East respiratory syndrome…
## 2 5 Septemb… http://www.who.int/entity/csr/do… Human infection with avian influ…
## 3 30 August… http://www.who.int/entity/csr/do… Yellow fever ??? France ??? Fren…
## 4 28 August… http://www.who.int/entity/csr/do… Middle East respiratory syndrome…
## 5 25 August… http://www.who.int/entity/csr/do… Chikungunya ??? France           
## 6 17 August… http://www.who.int/entity/csr/do… Middle East respiratory syndrome…
#install.packages("googlesheets4")
library(googlesheets4)
gs4_deauth()
read_sheet("https://docs.google.com/spreadsheets/d/1U6Cf_qEOhiR9AZqTqS3mbMF3zt2db48ZP5v3rkrAEJY/edit#gid=780868077")
## Reading from "gapminder"
## Range "Africa"
## # A tibble: 624 x 6
##    country continent  year lifeExp      pop gdpPercap
##    <chr>   <chr>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 Algeria Africa     1952    43.1  9279525     2449.
##  2 Algeria Africa     1957    45.7 10270856     3014.
##  3 Algeria Africa     1962    48.3 11000948     2551.
##  4 Algeria Africa     1967    51.4 12760499     3247.
##  5 Algeria Africa     1972    54.5 14760787     4183.
##  6 Algeria Africa     1977    58.0 17152804     4910.
##  7 Algeria Africa     1982    61.4 20033753     5745.
##  8 Algeria Africa     1987    65.8 23254956     5681.
##  9 Algeria Africa     1992    67.7 26298373     5023.
## 10 Algeria Africa     1997    69.2 29072015     4797.
## # … with 614 more rows
sd(cdc$weight)
## [1] 40.08097
sqrt(var(cdc$weight))
## [1] 40.08097
sd(cdc$weight) ^ 2
## [1] 1606.484
var(cdc$weight)
## [1] 1606.484
contender1 <- c(8.4,8.6,8.8,9,9,9.2,9.7,10.1,10.4,10.3,10.5,10.6,11.0,11.1,11.4,11.7,11.9,12.3,12.8,13,13,14.2,14.4,14.6)
contender2 <- c(9.8,9.8,9.9,10.1,10.1,10.2,10.2,10.3,10.3,10.7,10.8,10.8,11,11.1,11.2,11.2,11.3,11.6,11.7,11.7,11.8,11.8,11.9,11.9)


summary(contender1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   8.400   9.575  10.800  11.083  12.425  14.600
summary(contender2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    9.80   10.20   10.90   10.88   11.62   11.90
combined <- cbind(contender1,contender2)
boxplot(combined)

sd(contender1)
## [1] 1.880718
sd(contender2)
## [1] 0.7293038
url <- 'https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/blood_lead.RData' 
curl::curl_download(url, destfile= 'blood_lead.RData')
load('blood_lead.RData')

View(blood_lead)

str(blood_lead)
## tibble [41 × 3] (S3: tbl_df/tbl/data.frame)
##  $ ID               : num [1:41] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Age              : num [1:41] 3 4 6 7 9 10 11 12 13 14 ...
##  $ Blood lead levels: num [1:41] 69 45 49 84 48 58 17 76 61 78 ...
#blood_lead$Blood lead levels
#backticks
#blood_lead$`Blood lead levels`
#blood_lead[,3]

mean(blood_lead$`Blood lead levels`)
## [1] NA
a <- c(50,60,70)
mean(a)
## [1] 60
a2 <- c(50,60,70, NA)
mean(a2)
## [1] NA
?mean

mean(blood_lead$`Blood lead levels`, na.rm = TRUE)
## [1] 60.5641
summary(blood_lead$`Blood lead levels`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   17.00   48.00   58.00   60.56   75.00  104.00       2
sd(blood_lead$`Blood lead levels`, na.rm = TRUE)
## [1] 19.58624
boxplot(blood_lead$`Blood lead levels`)

hist(blood_lead$`Blood lead levels`)

機率

sample(1:10)
##  [1]  3  8  4  2 10  6  1  5  7  9
sample(1:10, size = 5)
## [1]  2  1  9  5 10
sample(1:42, size = 6)
## [1] 13 36 39  8 38 41
sample(c(0,1), 10, replace = TRUE)
##  [1] 1 1 1 1 1 1 0 1 0 1
sample.int(20, 12)
##  [1]  8 15 17 10  1  6 20  3 19 12 14  7
coins <- c("heads", "tails")

fair_coin <- sample(coins, size = 100, replace = TRUE)
table(fair_coin)
## fair_coin
## heads tails 
##    53    47
unfair_coin <- sample(coins, size = 100, 
                            replace = TRUE, prob = c(0.3,0.7))
table(unfair_coin)
## unfair_coin
## heads tails 
##    36    64
hist(cdc$height)

rnorm(100)
##   [1] -0.10719872  1.51164816  0.79434792 -0.58791031  0.50834121 -1.07459888
##   [7] -0.84651193  0.41535356  0.46391627  0.48219395 -0.50865286  1.27263219
##  [13]  0.09054305 -1.00316624 -0.64567996  1.00224294  0.14307319  0.89262443
##  [19] -0.51476186 -0.88866637 -0.75052786  1.70604517 -0.21618354  0.34385283
##  [25]  1.44784747 -0.56050831  1.10230969 -1.69040010  0.53514112  0.46258720
##  [31] -0.53129102  1.93258720  0.60521479 -0.59184381  0.30669698 -0.29198592
##  [37] -0.30992558 -1.06338071  1.28355743 -0.29066316 -0.62726656  0.55473927
##  [43]  0.63108132  1.62576634  1.06286579  0.71787634  1.20645647  0.92469491
##  [49] -0.32822751  1.79254985  0.44608581 -0.41832405 -0.63040556 -1.25124230
##  [55] -0.51056725 -0.29374759  0.78963124  0.16513179  0.08176565 -0.55607733
##  [61]  0.39954209  0.03109339  0.74103296 -0.41646506  0.14225788  1.63435126
##  [67] -1.39408709 -0.45337765 -0.54677123  0.13052387 -0.28739547 -1.00457872
##  [73] -0.44467993  0.12522951 -0.34114693  0.52924171  0.88546444 -0.91040710
##  [79]  1.96274589 -0.21681848  0.40161561 -0.52508496 -0.56031488  1.97246500
##  [85] -1.59658523 -0.64885500  0.08465384  0.88313958 -0.85281007  0.14083823
##  [91] -0.18303673 -1.97129543  0.53020607  2.03501273 -0.77943377 -0.31884636
##  [97]  0.80931600 -0.66464365 -0.13394031  1.62798258
hist(rnorm(100))

curve(dnorm,-3,3)

dnorm(0)
## [1] 0.3989423
dnorm(1)
## [1] 0.2419707
dnorm(180, mean = 175,sd = 5)
## [1] 0.04839414
pnorm(0)
## [1] 0.5
pnorm(180, mean = 175,sd = 5)
## [1] 0.8413447
pnorm(11500, mean = 10000, sd = 1000)
## [1] 0.9331928
pnorm(30000, mean = 10000, sd = 1000)
## [1] 1
pnorm(11000, mean = 10000, sd = 1000) - pnorm(9000, mean = 10000, sd = 1000)
## [1] 0.6826895
pnorm(12000, mean = 10000, sd = 1000) - pnorm(8000, mean = 10000, sd = 1000)
## [1] 0.9544997
pnorm(13000, mean = 10000, sd = 1000) - pnorm(7000, mean = 10000, sd = 1000)
## [1] 0.9973002
set.seed(50)
y = runif(100,0,5)
hist(y)

set.seed(50)
y = runif(10000,0,5)
hist(y)

hist(rpois(5000,3))

1 - pnorm(207, mean = 165.3 , sd = 5.9)
## [1] 7.870371e-13
cov(cdc$height, cdc$weight) 
## [1] 91.83488
cor(cdc$height, cdc$weight)
## [1] 0.5553222
numeric_dataset <- cdc[,c('height', 'weight', 'wtdesire', 'age')] 
cor(numeric_dataset)
##              height      weight    wtdesire          age
## height    1.0000000 0.555322192  0.75811946 -0.125181791
## weight    0.5553222 1.000000000  0.80005213  0.001608902
## wtdesire  0.7581195 0.800052128  1.00000000 -0.025018392
## age      -0.1251818 0.001608902 -0.02501839  1.000000000
cov(numeric_dataset)
##              height      weight   wtdesire        age
## height    17.023499   91.834880  100.13654  -8.879927
## weight    91.834880 1606.484154 1026.56638   1.108694
## wtdesire 100.136542 1026.566383 1024.85178 -13.769994
## age       -8.879927    1.108694  -13.76999 295.588571
heatmap(cor(numeric_dataset))