刻劃數據輪廓
load("C:/Users/nc20/Downloads/cdc.Rdata")
class(cdc)
## [1] "data.frame"
str(cdc)
## 'data.frame': 20000 obs. of 9 variables:
## $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
## $ exerany : num 0 0 1 1 0 1 1 0 0 1 ...
## $ hlthplan: num 1 1 1 1 1 1 1 1 1 1 ...
## $ smoke100: num 0 1 1 0 0 0 0 0 1 0 ...
## $ height : num 70 64 60 66 61 64 71 67 65 70 ...
## $ weight : int 175 125 105 132 150 114 194 170 150 180 ...
## $ wtdesire: int 175 115 105 124 130 114 185 160 130 170 ...
## $ age : int 77 33 49 42 55 55 31 45 27 44 ...
## $ gender : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
head(cdc)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 2 good 0 1 1 64 125 115 33 f
## 3 good 1 1 1 60 105 105 49 f
## 4 good 1 1 0 66 132 124 42 f
## 5 very good 0 1 0 61 150 130 55 f
## 6 very good 1 1 0 64 114 114 55 f
range(cdc$weight)
## [1] 68 500
?cut
## starting httpd help server ... done
a <- c(20, 22,25,27,32,34,57,42)
bins <- seq(0,100, by = 10)
bins
## [1] 0 10 20 30 40 50 60 70 80 90 100
# [20,30) right = False
# a >= 20 & a < 30
# (10,20] right = TRUE
# a > 10 & a <= 20
cut(a, bins, right=TRUE)
## [1] (10,20] (20,30] (20,30] (20,30] (30,40] (30,40] (50,60] (40,50]
## 10 Levels: (0,10] (10,20] (20,30] (30,40] (40,50] (50,60] ... (90,100]
bins <- seq(0,500,by = 10)
intervals <- cut(cdc$weight, bins, right=FALSE)
#table(intervals)
plot(table(intervals), type= 'h', main = 'weight histogram', xlab = 'intervals', ylab = 'frequency')

hist(cdc$weight)

?hist
hist(cdc$weight, breaks = 50)

hist(cdc$weight, breaks = 500)

sort(table(cdc$weight), decreasing=TRUE)
##
## 160 150 180 170 200 140 190 165 130 175 145 135 185 155 125 120 210 195
## 992 970 933 922 805 794 715 692 627 626 615 589 577 527 473 440 431 393
## 220 230 115 110 205 215 240 250 225 138 235 128 168 105 148 132 142 178
## 376 268 244 235 230 206 204 202 196 144 137 125 122 112 111 110 110 106
## 260 118 158 162 172 100 152 122 127 134 198 300 112 245 123 182 192 137
## 104 102 102 96 95 94 80 74 71 71 70 70 69 69 65 64 64 62
## 143 147 136 163 126 124 280 270 108 174 173 156 188 117 157 153 154 187
## 62 62 60 60 59 58 57 56 55 53 50 49 49 48 48 47 47 47
## 133 164 167 183 212 146 186 144 149 176 184 275 290 114 129 179 204 265
## 46 46 45 45 45 43 42 41 40 40 40 40 40 39 37 37 36 36
## 113 197 218 107 116 189 208 119 203 193 194 103 139 141 196 159 169 255
## 34 34 34 33 33 33 33 32 32 31 31 30 30 30 29 28 28 27
## 131 166 171 98 151 106 121 202 95 177 228 207 285 350 161 206 102 199
## 26 26 26 25 25 24 24 24 22 22 22 21 21 21 19 19 18 18
## 216 222 248 104 109 214 90 238 310 320 181 217 219 236 295 209 227 242
## 18 18 18 17 14 14 12 12 12 12 11 11 11 11 11 10 10 10
## 191 213 232 252 201 224 226 234 257 92 111 211 223 315 93 97 101 246
## 9 9 9 9 8 8 8 8 8 7 7 7 7 7 6 6 6 6
## 262 278 330 340 99 237 258 400 85 88 233 239 241 243 253 256 263 267
## 6 6 6 6 5 5 5 5 4 4 4 4 4 4 4 4 4 4
## 268 305 380 84 94 96 272 274 286 298 325 78 80 82 231 247 249 254
## 4 4 4 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2
## 276 279 282 283 287 292 360 362 385 68 70 79 83 86 221 229 244 271
## 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1
## 273 294 296 297 308 309 313 318 319 324 327 328 344 348 364 370 371 390
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 405 495 500
## 1 1 1
table(cdc$weight %% 10)
##
## 0 1 2 3 4 5 6 7 8 9
## 9421 207 919 545 525 5865 481 543 1159 335
#stem(cdc$weight)
找出ä¸å¿ƒå€¼
mean(cdc$weight)
## [1] 169.683
sum(cdc$weight) / length(cdc$weight)
## [1] 169.683
a <- c(50,45,60,70,68,88,100)
sort(a)
## [1] 45 50 60 68 70 88 100
pos <- ceiling(length(a)/ 2)
sort(a)[pos]
## [1] 68
median(a)
## [1] 68
b <- c(50,45,60,70,88,100)
sort(b)
## [1] 45 50 60 70 88 100
(60 + 70 ) / 2
## [1] 65
pos <- length(b) / 2
pos
## [1] 3
(sort(b)[pos] + sort(b)[pos + 1] ) / 2
## [1] 65
median(b)
## [1] 65
median2 <- function(data){
if (length(data) %% 2 ==0 ){
pos <- length(data) / 2
return((sort(data)[pos] + sort(data)[pos + 1] ) / 2)
} else{
pos <- ceiling(length(data)/ 2)
return(sort(data)[pos])
}
}
median2(a)
## [1] 68
median2(b)
## [1] 65
median(cdc$weight)
## [1] 165
mean(cdc$weight)
## [1] 169.683
median(cdc$weight)
## [1] 165
which.max(table(cdc$weight))
## 160
## 81
head(sort(table(cdc$weight), decreasing = TRUE))
##
## 160 150 180 170 200 140
## 992 970 933 922 805 794
æ ¹æ“šé¡žåˆ¥çµ±è¨ˆ
#?table
table(cdc$smoke100)
##
## 0 1
## 10559 9441
table(cdc$smoke100) / length(cdc$smoke100)
##
## 0 1
## 0.52795 0.47205
tb <- table(cdc$smoke100)
barplot(tb, main= 'smoke statistics', xlab= 'smoking', ylab = 'frequency', names.arg = c('No', 'Yes'), col= c('red', 'blue') )

pie(tb, labels = c('No', 'Yes'), init.angle = 90, clockwise = TRUE, main ='smokers statistics')

tb <- table(cdc$smoke100, cdc$gender)
colnames(tb) <- c('male', 'female')
rownames(tb) <- c('Non-smoker', 'Smoker')
tb
##
## male female
## Non-smoker 4547 6012
## Smoker 5022 4419
mosaicplot(tb, main = 'smokers by gender', col=c('blue', 'red'))

barplot(tb, col=c('blue', 'red') )

?barplot
barplot(tb, col=c('blue', 'red'), beside=TRUE )

IQR
a =c(150 , 155 , 160 , 162 , 168 , 171 , 173 , 175 , 178 , 182 , 185 )
sort(a)
## [1] 150 155 160 162 168 171 173 175 178 182 185
length(a)
## [1] 11
sort(a)[6]
## [1] 171
median(a)
## [1] 171
(160 + 162) / 2
## [1] 161
quantile(a, 0.25)
## 25%
## 161
(175+178) / 2
## [1] 176.5
quantile(a, 0.75)
## 75%
## 176.5
quantile(a, 0.75) - quantile(a, 0.25)
## 75%
## 15.5
IQR(a)
## [1] 15.5
stem(a)
##
## The decimal point is 1 digit(s) to the right of the |
##
## 15 | 05
## 16 | 028
## 17 | 1358
## 18 | 25
Boxplot
a <- c(150 , 155 , 160 , 162 , 168 , 171 , 173 , 175 , 178 , 182 , 185 )
median(a)
## [1] 171
quantile(a, 0.5)
## 50%
## 171
quantile(a, 0.25)
## 25%
## 161
quantile(a, 0.75)
## 75%
## 176.5
quantile(a, 0.75) - quantile(a, 0.25)
## 75%
## 15.5
IQR(a)
## [1] 15.5
min(a[a > (median(a) - 1.5 * IQR(a))])
## [1] 150
max(a[a < (median(a) + 1.5 * IQR(a))])
## [1] 185
b <- c(60 , 70 , 72 , 78 , 88 , 90 , 120 , 125 , 180 , 175 , 1000 )
mean(b)
## [1] 187.0909
median(b)
## [1] 90
quantile(b, 0.25)
## 25%
## 75
quantile(b, 0.75)
## 75%
## 150
IQR(b)
## [1] 75
min(b[b > (median(b) - 1.5 * IQR(b))])
## [1] 60
max(b[b < (median(b) + 1.5 * IQR(b))])
## [1] 180
boxplot(b)
abline(h = 180, col='red', lty = 2)
abline(h = 60, col='red', lty = 2)

set.seed(123)
temp <- sample.int(30, 100, replace=TRUE)
mean(temp)
## [1] 15.43
median(temp)
## [1] 14.5
temp <- c(temp, 999,999,999)
mean(temp)
## [1] 44.07767
median(temp)
## [1] 15
boxplot(temp)

temp2 <- temp[temp < 100]
boxplot(temp2)

boxplot(cdc$height ~ cdc$gender, main = 'height by gender', xlab = 'gender', ylab = 'height', names= c('male', 'female'))

#?boxplot
boxplot(cdc$weight ~ cdc$gender, main = 'weight by gender', xlab = 'gender', ylab = 'weight', names= c('male', 'female'))

cdc$bmi <- (cdc$weight /cdc$height^2)*703
boxplot(cdc$bmi ~ cdc$genhlth, main = 'bmi by genhlth', xlab = 'genhlth', ylab = 'bmi')

scatter plot
plot(cdc$weight, cdc$wtdesire)

fit <- lm(wtdesire ~ weight, data = cdc)
plot(wtdesire ~ weight, data = cdc)
abline(fit, col = 'red')

Standard Deviaton
a <- c(150 , 155 , 160 , 162 , 168 , 171 , 173 , 175 , 178 , 182 , 185, NA )
# 1. drop na
mean(a, na.rm=TRUE)
## [1] 169
# 2. fill na with some statstics number
patients <- data.frame(
age = c(30,23,22,29,35,NA, NA),
gender = c('M', 'F', 'F', 'M', 'M', 'M', 'F'))
patients
## age gender
## 1 30 M
## 2 23 F
## 3 22 F
## 4 29 M
## 5 35 M
## 6 NA M
## 7 NA F
tapply(patients$age, patients$gender, function(e) mean(e, na.rm=TRUE))
## F M
## 22.50000 31.33333
# fill na with some pattern
employees <- data.frame(tenure = c(2,3,1,5,4),
salary= c(100,NA,50, 250, 200))
employees
## tenure salary
## 1 2 100
## 2 3 NA
## 3 1 50
## 4 5 250
## 5 4 200
a <- c(150 , 155 , 160 , 162 , 168 , 171 , 173 , 175 , 178 , 182 , 185)
mean(a)
## [1] 169
sum(a - mean(a))
## [1] 0
sum(abs(a - mean(a)))
## [1] 100
# Variance: sd ^ 2
sum((a - mean(a)) ^ 2) / (length(a) - 1)
## [1] 125
var(a)
## [1] 125
sd(a) ^ 2
## [1] 125
# standard deviation: sqrt(var)
sqrt(var(a))
## [1] 11.18034
sd(a)
## [1] 11.18034
sd(cdc$weight)
## [1] 40.08097
var(cdc$weight)
## [1] 1606.484
mean(cdc$weight)
## [1] 169.683
mean(cdc$weight) - sd(cdc$weight)
## [1] 129.602
mean(cdc$weight) + sd(cdc$weight)
## [1] 209.7639
nrow(cdc[(cdc$weight >= mean(cdc$weight) - sd(cdc$weight)) & (cdc$weight <mean(cdc$weight) + sd(cdc$weight)), ])
## [1] 14152
contender1 <- c(8.4,8.6,8.8,9.9,9.2,9.7,10.1,10.4,10.3,10.5,10.6,11.0,11.1,11.4,11.7,11.9,12.3,12.8,13,13.13,14.2,14.4,14.6)
contender2 <- c(9.8,9.8,9.9 ,10.1, 10.1 ,10.2, 10.2,10.3,10.3,10.7,10.8,10.8,11.1,11.2,11.2,11.3,11.6,11.7,11.7,11.8,11.8,11.9 ,11.9)
summary(contender1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.40 10.00 11.00 11.22 12.55 14.60
summary(contender2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.80 10.20 10.80 10.88 11.65 11.90
IQR (contender1 )
## [1] 2.55
IQR (contender2 )
## [1] 1.45
combined <- cbind(contender1, contender2)
length(contender1)
## [1] 23
length(contender2)
## [1] 23
boxplot(combined)

sd(contender1)
## [1] 1.836088
sd(contender2)
## [1] 0.7452617