刻劃數據輪廓

load("C:/Users/nc20/Downloads/cdc.Rdata")

class(cdc)
## [1] "data.frame"
str(cdc)
## 'data.frame':    20000 obs. of  9 variables:
##  $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
##  $ exerany : num  0 0 1 1 0 1 1 0 0 1 ...
##  $ hlthplan: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ smoke100: num  0 1 1 0 0 0 0 0 1 0 ...
##  $ height  : num  70 64 60 66 61 64 71 67 65 70 ...
##  $ weight  : int  175 125 105 132 150 114 194 170 150 180 ...
##  $ wtdesire: int  175 115 105 124 130 114 185 160 130 170 ...
##  $ age     : int  77 33 49 42 55 55 31 45 27 44 ...
##  $ gender  : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
head(cdc)
##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f
range(cdc$weight)
## [1]  68 500
?cut
## starting httpd help server ... done
a <- c(20, 22,25,27,32,34,57,42)
bins <- seq(0,100, by = 10)
bins
##  [1]   0  10  20  30  40  50  60  70  80  90 100
# [20,30) right = False
# a >= 20 & a < 30

# (10,20] right = TRUE
# a > 10 & a <= 20

cut(a, bins, right=TRUE)
## [1] (10,20] (20,30] (20,30] (20,30] (30,40] (30,40] (50,60] (40,50]
## 10 Levels: (0,10] (10,20] (20,30] (30,40] (40,50] (50,60] ... (90,100]
bins      <- seq(0,500,by = 10)
intervals <- cut(cdc$weight, bins, right=FALSE)

#table(intervals)
plot(table(intervals), type= 'h', main = 'weight histogram', xlab = 'intervals', ylab = 'frequency')

hist(cdc$weight)

?hist

hist(cdc$weight, breaks = 50)

hist(cdc$weight, breaks = 500)

sort(table(cdc$weight), decreasing=TRUE)
## 
## 160 150 180 170 200 140 190 165 130 175 145 135 185 155 125 120 210 195 
## 992 970 933 922 805 794 715 692 627 626 615 589 577 527 473 440 431 393 
## 220 230 115 110 205 215 240 250 225 138 235 128 168 105 148 132 142 178 
## 376 268 244 235 230 206 204 202 196 144 137 125 122 112 111 110 110 106 
## 260 118 158 162 172 100 152 122 127 134 198 300 112 245 123 182 192 137 
## 104 102 102  96  95  94  80  74  71  71  70  70  69  69  65  64  64  62 
## 143 147 136 163 126 124 280 270 108 174 173 156 188 117 157 153 154 187 
##  62  62  60  60  59  58  57  56  55  53  50  49  49  48  48  47  47  47 
## 133 164 167 183 212 146 186 144 149 176 184 275 290 114 129 179 204 265 
##  46  46  45  45  45  43  42  41  40  40  40  40  40  39  37  37  36  36 
## 113 197 218 107 116 189 208 119 203 193 194 103 139 141 196 159 169 255 
##  34  34  34  33  33  33  33  32  32  31  31  30  30  30  29  28  28  27 
## 131 166 171  98 151 106 121 202  95 177 228 207 285 350 161 206 102 199 
##  26  26  26  25  25  24  24  24  22  22  22  21  21  21  19  19  18  18 
## 216 222 248 104 109 214  90 238 310 320 181 217 219 236 295 209 227 242 
##  18  18  18  17  14  14  12  12  12  12  11  11  11  11  11  10  10  10 
## 191 213 232 252 201 224 226 234 257  92 111 211 223 315  93  97 101 246 
##   9   9   9   9   8   8   8   8   8   7   7   7   7   7   6   6   6   6 
## 262 278 330 340  99 237 258 400  85  88 233 239 241 243 253 256 263 267 
##   6   6   6   6   5   5   5   5   4   4   4   4   4   4   4   4   4   4 
## 268 305 380  84  94  96 272 274 286 298 325  78  80  82 231 247 249 254 
##   4   4   4   3   3   3   3   3   3   3   3   2   2   2   2   2   2   2 
## 276 279 282 283 287 292 360 362 385  68  70  79  83  86 221 229 244 271 
##   2   2   2   2   2   2   2   2   2   1   1   1   1   1   1   1   1   1 
## 273 294 296 297 308 309 313 318 319 324 327 328 344 348 364 370 371 390 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 405 495 500 
##   1   1   1
table(cdc$weight %% 10)
## 
##    0    1    2    3    4    5    6    7    8    9 
## 9421  207  919  545  525 5865  481  543 1159  335
#stem(cdc$weight)

找出中心值

mean(cdc$weight)
## [1] 169.683
sum(cdc$weight) / length(cdc$weight)
## [1] 169.683
a <- c(50,45,60,70,68,88,100)
sort(a)
## [1]  45  50  60  68  70  88 100
pos <- ceiling(length(a)/ 2)
sort(a)[pos]
## [1] 68
median(a)
## [1] 68
b <- c(50,45,60,70,88,100)
sort(b)
## [1]  45  50  60  70  88 100
(60 + 70 ) / 2
## [1] 65
pos <- length(b) / 2
pos
## [1] 3
(sort(b)[pos] + sort(b)[pos + 1] ) / 2
## [1] 65
median(b)
## [1] 65
median2 <- function(data){
  if (length(data) %% 2 ==0 ){
    pos <- length(data) / 2    
    return((sort(data)[pos] + sort(data)[pos + 1] ) / 2)
  } else{
    pos <- ceiling(length(data)/ 2)
    return(sort(data)[pos])
  }
}


median2(a)
## [1] 68
median2(b)
## [1] 65
median(cdc$weight)
## [1] 165
mean(cdc$weight)
## [1] 169.683
median(cdc$weight)
## [1] 165
which.max(table(cdc$weight))
## 160 
##  81
head(sort(table(cdc$weight), decreasing = TRUE))
## 
## 160 150 180 170 200 140 
## 992 970 933 922 805 794

根據類別統計

#?table
table(cdc$smoke100)
## 
##     0     1 
## 10559  9441
table(cdc$smoke100) / length(cdc$smoke100)
## 
##       0       1 
## 0.52795 0.47205
tb <- table(cdc$smoke100)
barplot(tb, main= 'smoke statistics', xlab= 'smoking', ylab = 'frequency', names.arg = c('No', 'Yes'), col= c('red', 'blue') )

pie(tb, labels = c('No', 'Yes'), init.angle = 90, clockwise = TRUE, main ='smokers statistics')

tb <- table(cdc$smoke100, cdc$gender)
colnames(tb) <- c('male', 'female')
rownames(tb) <- c('Non-smoker', 'Smoker')
tb
##             
##              male female
##   Non-smoker 4547   6012
##   Smoker     5022   4419
mosaicplot(tb, main = 'smokers by gender', col=c('blue', 'red'))

barplot(tb, col=c('blue', 'red') )

?barplot
barplot(tb, col=c('blue', 'red'), beside=TRUE )

IQR

a =c(150 , 155 , 160 , 162 , 168 , 171 , 173 , 175 , 178 , 182 , 185 )

sort(a)
##  [1] 150 155 160 162 168 171 173 175 178 182 185
length(a)
## [1] 11
sort(a)[6]
## [1] 171
median(a)
## [1] 171
(160 + 162) / 2
## [1] 161
quantile(a, 0.25)
## 25% 
## 161
(175+178) / 2
## [1] 176.5
quantile(a, 0.75)
##   75% 
## 176.5
quantile(a, 0.75) - quantile(a, 0.25)
##  75% 
## 15.5
IQR(a)
## [1] 15.5
stem(a)
## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##   15 | 05
##   16 | 028
##   17 | 1358
##   18 | 25

Boxplot

a  <- c(150 , 155 , 160 , 162 , 168 , 171 , 173 , 175 , 178 , 182 , 185 )


median(a)
## [1] 171
quantile(a, 0.5)
## 50% 
## 171
quantile(a, 0.25)
## 25% 
## 161
quantile(a, 0.75)
##   75% 
## 176.5
quantile(a, 0.75) - quantile(a, 0.25)
##  75% 
## 15.5
IQR(a)
## [1] 15.5
min(a[a > (median(a) - 1.5 * IQR(a))])
## [1] 150
max(a[a < (median(a) + 1.5 * IQR(a))])
## [1] 185
b  <- c(60 , 70 , 72 , 78 , 88 , 90 , 120 , 125 , 180 , 175 , 1000 )

mean(b)
## [1] 187.0909
median(b)
## [1] 90
quantile(b, 0.25)
## 25% 
##  75
quantile(b, 0.75)
## 75% 
## 150
IQR(b)
## [1] 75
min(b[b > (median(b) - 1.5 * IQR(b))])
## [1] 60
max(b[b < (median(b) + 1.5 * IQR(b))])
## [1] 180
boxplot(b)
abline(h = 180, col='red', lty = 2)
abline(h = 60,  col='red', lty = 2)

set.seed(123)
temp <- sample.int(30, 100, replace=TRUE)
mean(temp)
## [1] 15.43
median(temp)
## [1] 14.5
temp <- c(temp, 999,999,999)
mean(temp)
## [1] 44.07767
median(temp)
## [1] 15
boxplot(temp)

temp2 <- temp[temp < 100]
boxplot(temp2)

boxplot(cdc$height ~ cdc$gender, main = 'height by gender', xlab = 'gender', ylab = 'height',  names= c('male', 'female'))

#?boxplot

boxplot(cdc$weight ~ cdc$gender, main = 'weight by gender', xlab = 'gender', ylab = 'weight',  names= c('male', 'female'))

cdc$bmi  <- (cdc$weight /cdc$height^2)*703
boxplot(cdc$bmi ~ cdc$genhlth, main = 'bmi by genhlth', xlab = 'genhlth', ylab = 'bmi')

scatter plot

plot(cdc$weight, cdc$wtdesire)

fit <- lm(wtdesire ~ weight, data = cdc)
plot(wtdesire ~ weight, data = cdc)
abline(fit, col = 'red')

Standard Deviaton

a  <- c(150 , 155 , 160 , 162 , 168 , 171 , 173 , 175 , 178 , 182 , 185, NA )

# 1. drop na
mean(a, na.rm=TRUE)
## [1] 169
# 2. fill na with some statstics number
patients <- data.frame(
    age    = c(30,23,22,29,35,NA, NA),
    gender = c('M', 'F', 'F', 'M', 'M', 'M', 'F'))

patients
##   age gender
## 1  30      M
## 2  23      F
## 3  22      F
## 4  29      M
## 5  35      M
## 6  NA      M
## 7  NA      F
tapply(patients$age, patients$gender, function(e) mean(e, na.rm=TRUE))
##        F        M 
## 22.50000 31.33333
# fill na with some pattern
employees <- data.frame(tenure = c(2,3,1,5,4), 
                        salary=  c(100,NA,50, 250, 200))
employees
##   tenure salary
## 1      2    100
## 2      3     NA
## 3      1     50
## 4      5    250
## 5      4    200
a  <- c(150 , 155 , 160 , 162 , 168 , 171 , 173 , 175 , 178 , 182 , 185)

mean(a)
## [1] 169
sum(a - mean(a))
## [1] 0
sum(abs(a - mean(a)))
## [1] 100
# Variance: sd ^ 2
sum((a - mean(a)) ^ 2) / (length(a) - 1)
## [1] 125
var(a)
## [1] 125
sd(a) ^ 2
## [1] 125
# standard deviation: sqrt(var)
sqrt(var(a))
## [1] 11.18034
sd(a)
## [1] 11.18034
sd(cdc$weight)
## [1] 40.08097
var(cdc$weight)
## [1] 1606.484
mean(cdc$weight) 
## [1] 169.683
mean(cdc$weight)  - sd(cdc$weight)
## [1] 129.602
mean(cdc$weight)  + sd(cdc$weight)
## [1] 209.7639
nrow(cdc[(cdc$weight >= mean(cdc$weight)  - sd(cdc$weight)) & (cdc$weight <mean(cdc$weight)  + sd(cdc$weight)),  ])
## [1] 14152
contender1 <- c(8.4,8.6,8.8,9.9,9.2,9.7,10.1,10.4,10.3,10.5,10.6,11.0,11.1,11.4,11.7,11.9,12.3,12.8,13,13.13,14.2,14.4,14.6)
contender2 <- c(9.8,9.8,9.9 ,10.1, 10.1 ,10.2, 10.2,10.3,10.3,10.7,10.8,10.8,11.1,11.2,11.2,11.3,11.6,11.7,11.7,11.8,11.8,11.9 ,11.9)

summary(contender1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.40   10.00   11.00   11.22   12.55   14.60
summary(contender2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    9.80   10.20   10.80   10.88   11.65   11.90
IQR (contender1 )
## [1] 2.55
IQR (contender2 )
## [1] 1.45
combined <- cbind(contender1, contender2)
length(contender1)
## [1] 23
length(contender2)
## [1] 23
boxplot(combined)

sd(contender1)
## [1] 1.836088
sd(contender2)
## [1] 0.7452617