str(rivers)
## num [1:141] 735 320 325 392 524 ...
sort.rivers<-sort(rivers)
log.rivers<-log(sort.rivers)
library(lattice)
library(MASS)
hist(log.rivers)
n <- length(log.rivers)
i <- 1:n
A. Uniform distribution
max(log.rivers)-min(log.rivers)
## [1] 3.313512
adjust.scale.log.rivers<-(log.rivers - min(log.rivers))/(max(log.rivers) - min(log.rivers)) # follows unif[0,1]/ min-max scaling
qunif.log.rivers<-(((i-0.5)/n)-0)/(1-0)
plot(qunif.log.rivers,adjust.scale.log.rivers, main="Uniform prob Plot", col="blue")
abline(0,1,col="green")
qqmath(log.rivers, distriution=function(p)qunif(p,min(log.rivers),max(log.rivers)))
B. Normal and Standard Normal distribution
fitdistr(log.rivers, densfu="normal")
## mean sd
## 6.17587888 0.58938291
## (0.04963500) (0.03509724)
qnorm.log.rivers<-qnorm((i-0.5)/n,6.17587888,0.58938291)
plot(qnorm.log.rivers, log.rivers, main="Normal prob plot", col="blue")
abline(0,1,col="green")
qqmath(log.rivers, disribution=function(p)qnorm(p,6.17587888,0.58938291))
qstandard.n.log.rivers<-qnorm((i-0.5)/n)
plot(qstandard.n.log.rivers, log.rivers, main="Standard Normal prob plot", col="blue")
abline(0,1,col="green")
qqmath(log.rivers, distribution=qnorm)
C. Student-t distribution
fitdistr(log.rivers, densfu="t")
## Warning in log(s): NaNì´ ìì±ëììµëë¤
## m s df
## 6.13240057 0.52222478 9.11266483
## (0.05423608) (0.05259516) (6.68045101)
qt1.log.rivers<-qt((i-0.5)/n, df=9.11266483)
plot(qt1.log.rivers, log.rivers, main="Student-t prob plot", col="blue")
qt2.log.rivers<-qt((i-0.5)/n,df=n-2)
plot(qt2.log.rivers, log.rivers, main="Student-t, df=n-2")
abline(0,1,col="green")
qqmath(log.rivers, distribution=function(p)qt(p,df=9.11266483))
D. Exponential Distribution
fitdistr(log.rivers, densfu="exponential")
## rate
## 0.16192027
## (0.01363615)
qexp.log.rivers<- -log(1-(i-0.5)/n)*(1/0.16192027) # exponential quantile
plot(qexp.log.rivers, log.rivers, main="Exponential prob plot", col="blue")
abline(0,1,col="green")
qqmath(log.rivers, distribution=function(p)qexp(p, rate=0.16912027))
E. Weibull Distribution
fitdistr(log.rivers, densfu="weibull")
## Warning in densfun(x, parm[1], parm[2], ...): NaNì´ ìì±ëììµëë¤
## shape scale
## 9.68871321 6.45644999
## (0.56204457) (0.05970672)
qweibull.log.rivers<-log(-log(1-(i-0.5)/n))
plot(qweibull.log.rivers, log(log.rivers), main="Weibull prob plot", col="blue")
abline(0,1,col="green")
qqmath(log.rivers, distribution=function(p)qweibull(p, shape=9.68871321, scale=6.45644999))
F. Gamma Distribution
fitdistr(log.rivers, densfu="gamma")
## shape rate
## 109.021358 17.652904
## ( 12.963503) ( 2.103892)
(mean.rivers <- mean(log.rivers))
## [1] 6.175879
(var.rivers <- var(log.rivers))
## [1] 0.3498534
(shape.rivers <- mean.rivers^2/var.rivers)
## [1] 109.0213
(scale.rivers <- mean.rivers/var.rivers)
## [1] 17.65276
qgamma.log.rivers<-qgamma((i-0.5)/n,shape=shape.rivers, rate=scale.rivers)
plot(qgamma.log.rivers, log.rivers, main="Gamma prob plot", col="blue")
abline(0,1,col="green")
qqmath(log.rivers, distribution=function(p)qgamma(p, shape=109.021358, rate=17.652904))
par(mfrow=c(1,2))
plot(qnorm.log.rivers, log.rivers, main="Normal prob plot", col="blue")
abline(0,1,col="green")
plot(qgamma.log.rivers, log.rivers, main="Gamma prob plot", col="blue")
abline(0,1,col="green")
qq.norm<-qqplot(qnorm.log.rivers, log.rivers)
qq.x1<-qq.norm$x
qq.y1<-qq.norm$y
qq.gamma<-qqplot(qgamma.log.rivers, log.rivers)
qq.x2<-qq.gamma$x
qq.y2<-qq.gamma$y
par(mfrow=c(1,2))
plot((qq.x1+qq.y1)/2, qq.y1-qq.x1, main="tmd_Normal", ylab="log.rivers - qnorm.log.rivers", xlab="mean", ylim=c(min(qq.y1-qq.x1-3),max(qq.y1-qq.x1+3)))
abline(0,0, col="green")
plot((qq.x2+qq.y2)/2, qq.y2-qq.x2, main="tmd_Gamma",
ylab="log.rivers - qgamma.log.rivers", xlab="mean", ylim=c(min(qq.y2-qq.x2-3),max(qq.y2-qq.x2+3)))
abline(0,0, col="green")
x<-seq(4,8.7,0.01)
y1<-dnorm(x,6.17587888, 0.58938291)
y2<-dgamma(x,109.0213, 17.65276)
par(mfrow=c(1,2))
plot(x,y1,type="l", main="normal")
plot(x,y2,type="l", main="gamma")
work.d<-"C:/Users/iihsk/Desktop/SeongJin Kim/1. Yonsei University/6. 2018 SPRING/4. Exploratory Data Analysis/3. Assignments/Assignment 4"
setwd(work.d)
dataQ2<-read.delim("QUAKES.DAT", header=FALSE)
str(dataQ2)
## 'data.frame': 5 obs. of 13 variables:
## $ V1 : int 840 1901 40 139 246
## $ V2 : int 157 695 1336 780 1617
## $ V3 : int 145 294 335 203 638
## $ V4 : int 44 562 1354 436 937
## $ V5 : int 33 721 454 30 735
## $ V6 : int 121 76 36 384 38
## $ V7 : int 150 710 667 129 365
## $ V8 : int 280 46 40 9 92
## $ V9 : int 434 402 556 209 82
## $ V10: int 736 194 99 599 220
## $ V11: int 584 759 304 83 NA
## $ V12: int 887 319 375 832 NA
## $ V13: int 263 460 567 328 NA
apply(dataQ2, 2, function(x)sum(is.na(x))) # we have 3 NA values in col 11, 12, 13
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
## 0 0 0 0 0 0 0 0 0 0 1 1 1
attach(dataQ2)
quakes<-c(V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13)
quakes<-na.omit(quakes)
str(quakes)
## atomic [1:62] 840 1901 40 139 246 157 695 1336 780 1617 ...
## - attr(*, "na.action")=Class 'omit' int [1:3] 55 60 65
n<-length(quakes)
quakes<-sort(quakes)
i<-1:n
fitdistr(quakes, densfu="exponential")
## rate
## 0.0022872321
## (0.0002904788)
qexp.quakes<- -log(1-(i-0.5)/n)/(0.0022872321) # exponential quantile
plot(qexp.quakes, quakes, main="Exponential prob plot")
abline(0,1, col="red")
qweibull.quakes<-log(-log(1-(i-0.5)/n))
plot(qweibull.quakes, log(quakes), main="Weibull prob plot")
abline(0,1, col="green")
count<-c(0:14)
frequency<-c(57,203,383,525,532,408,272,139,46,27,10,4,0,1,1)
dataQ3<-rbind(count, frequency)
dataQ3
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## count 0 1 2 3 4 5 6 7 8 9 10 11
## frequency 57 203 383 525 532 408 272 139 46 27 10 4
## [,13] [,14] [,15]
## count 12 13 14
## frequency 0 1 1
sum.freq<-sum(frequency)
prob.event<-frequency/sum.freq
plot(prob.event~count, data=dataQ3)
fitdistr(count, "poisson")
## lambda
## 7.0000000
## (0.6831301)
x<-seq(0, 14, 1)
y<-dpois(x, 7)
plot(x,y,main="Poisson density")
n<-length(frequency)
i<-1:n
qpoisson<-qpois((i-0.5)/n, lambda=7)
plot(qpoisson, sort(frequency), main="poisson plotting")
abline(0,1,col="green")
dataQ4<-read.delim("BLOODFAT.DAT", header=FALSE)
str(dataQ4)
## 'data.frame': 54 obs. of 14 variables:
## $ V1 : int 195 170 234 158 219 155 243 168 184 215 ...
## $ V2 : int 348 90 143 87 98 48 101 227 145 168 ...
## $ V3 : int 237 150 222 167 266 207 209 207 263 233 ...
## $ V4 : int 174 167 284 177 486 195 97 160 142 340 ...
## $ V5 : int 205 200 116 217 190 238 221 NA 185 212 ...
## $ V6 : int 158 154 87 114 108 172 156 NA 115 171 ...
## $ V7 : int 201 228 157 234 156 168 178 NA 271 221 ...
## $ V8 : int 171 119 134 116 126 71 116 NA 128 140 ...
## $ V9 : int 190 169 194 190 187 210 289 NA 173 239 ...
## $ V10: int 85 86 121 132 109 91 120 NA 56 97 ...
## $ V11: int 180 178 130 178 149 208 201 NA 230 168 ...
## $ V12: int 82 166 64 157 146 139 72 NA 304 131 ...
## $ V13: int 193 251 206 265 147 160 168 NA 222 231 ...
## $ V14: int 210 211 99 73 95 116 100 NA 151 145 ...
dataQ4
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14
## 1 195 348 237 174 205 158 201 171 190 85 180 82 193 210
## 2 170 90 150 167 200 154 228 119 169 86 178 166 251 211
## 3 234 143 222 284 116 87 157 134 194 121 130 64 206 99
## 4 158 87 167 177 217 114 234 116 190 132 178 157 265 73
## 5 219 98 266 486 190 108 156 126 187 109 149 146 147 95
## 6 155 48 207 195 238 172 168 71 210 91 208 139 160 116
## 7 243 101 209 97 221 156 178 116 289 120 201 72 168 100
## 8 168 227 207 160 NA NA NA NA NA NA NA NA NA NA
## 9 184 145 263 142 185 115 271 128 173 56 230 304 222 151
## 10 215 168 233 340 212 171 221 140 239 97 168 131 231 145
## 11 221 432 131 137 211 124 232 258 313 256 240 221 176 166
## 12 210 92 251 189 175 148 185 256 184 222 198 149 198 333
## 13 208 112 284 245 231 181 171 165 258 210 164 76 230 492
## 14 197 87 216 112 230 90 265 156 197 158 230 146 233 142
## 15 250 118 243 50 175 489 200 68 240 196 185 116 213 130
## 16 180 80 208 220 386 162 236 152 230 162 188 220 200 101
## 17 212 130 193 188 230 158 169 112 181 104 189 84 180 202
## 18 297 232 232 328 150 426 239 154 178 100 242 144 323 196
## 19 168 208 197 291 417 198 172 140 240 441 191 115 217 327
## 20 208 262 220 75 191 115 119 84 171 170 179 126 208 149
## 21 180 102 254 153 191 136 176 217 283 424 253 222 220 172
## 22 268 154 248 312 245 120 171 108 239 92 196 141 247 137
## 23 219 454 159 125 200 152 233 127 232 131 189 135 237 400
## 24 319 418 171 78 194 183 244 108 236 148 260 144 254 170
## 25 250 161 196 130 298 143 306 408 175 153 251 117 256 271
## 26 285 930 184 255 228 142 171 120 229 242 195 137 214 223
## 27 221 268 204 150 276 199 165 121 211 91 264 259 245 446
## 28 227 146 197 265 196 103 193 170 211 122 185 120 157 59
## 29 224 124 209 82 223 80 278 152 251 152 140 164 197 101
## 30 172 106 174 117 192 101 221 179 283 199 178 109 185 168
## 31 181 119 191 233 185 130 206 133 210 217 226 72 219 267
## 32 215 325 228 130 245 257 186 273 242 85 201 297 239 137
## 33 179 126 218 123 279 317 234 135 264 269 237 88 162 91
## 34 245 166 191 90 207 316 248 142 139 173 246 87 247 91
## 35 193 290 332 250 194 116 195 363 243 112 271 89 197 347
## 36 242 179 175 246 138 91 244 177 206 201 191 149 223 154
## 37 172 207 190 120 144 125 194 125 105 36 201 92 193 259
## 38 262 88 211 304 178 84 331 134 235 144 267 199 227 202
## 39 243 126 261 174 185 100 171 90 222 229 231 161 258 328
## 40 211 306 249 256 209 89 177 133 165 151 299 93 274 323
## 41 219 163 233 101 220 153 348 154 194 400 230 137 250 160
## 42 173 300 260 127 258 151 131 61 168 91 208 77 287 209
## 43 308 260 227 172 168 126 178 101 164 80 151 73 165 155
## 44 249 146 258 145 194 196 140 99 187 390 171 135 221 156
## 45 294 135 167 80 208 201 208 148 185 231 159 82 222 108
## 46 266 164 217 227 249 200 218 207 245 322 242 180 262 169
## 47 169 158 204 84 184 182 206 148 198 124 242 248 189 176
## 48 260 98 199 153 207 150 206 107 210 95 229 296 232 583
## 49 267 192 228 149 187 115 304 149 140 102 209 376 198 105
## 50 270 110 188 148 160 125 218 96 257 402 259 240 139 54
## 51 213 261 178 125 172 146 198 103 222 348 238 156 273 146
## 52 131 96 233 141 269 84 170 284 149 237 194 272 142 111
## 53 218 567 194 278 252 233 184 184 203 170 239 38 232 161
## 54 225 240 280 218 185 110 163 156 216 101 NA NA NA NA
apply(dataQ4, 2, function(x)sum(is.na(x)))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14
## 0 0 0 0 1 1 1 1 1 1 2 2 2 2
sum(apply(dataQ4, 2, function(x)sum(is.na(x)))) # total number of 14 observations are marked NA
## [1] 14
group1<-1:8
chol.group1<-c(dataQ4[group1,1],
dataQ4[group1,3],
dataQ4[group1,5],
dataQ4[group1,7],
dataQ4[group1,9],
dataQ4[group1,11],
dataQ4[group1,13]
)
chol.group2<-c(dataQ4[-group1,1],
dataQ4[-group1,3],
dataQ4[-group1,5],
dataQ4[-group1,7],
dataQ4[-group1,9],
dataQ4[-group1,11],
dataQ4[-group1,13]
)
trigly.group1<-c(dataQ4[group1,2],
dataQ4[group1,4],
dataQ4[group1,6],
dataQ4[group1,8],
dataQ4[group1,10],
dataQ4[group1,12],
dataQ4[group1,14]
)
trigly.group2<-c(dataQ4[-group1,2],
dataQ4[-group1,4],
dataQ4[-group1,6],
dataQ4[-group1,8],
dataQ4[-group1,10],
dataQ4[-group1,12],
dataQ4[-group1,14]
)
chol.group1<-na.omit(chol.group1)
trigly.group1<-na.omit(trigly.group1)
chol.group2<-na.omit(chol.group2)
trigly.group2<-na.omit(trigly.group2)
str(chol.group1)
## atomic [1:51] 195 170 234 158 219 155 243 168 237 150 ...
## - attr(*, "na.action")=Class 'omit' int [1:5] 24 32 40 48 56
str(chol.group2)
## atomic [1:320] 184 215 221 210 208 197 250 180 212 297 ...
## - attr(*, "na.action")=Class 'omit' int [1:2] 276 322
str(trigly.group1)
## atomic [1:51] 348 90 143 87 98 48 101 227 174 167 ...
## - attr(*, "na.action")=Class 'omit' int [1:5] 24 32 40 48 56
str(trigly.group2)
## atomic [1:320] 145 168 432 92 112 87 118 80 130 232 ...
## - attr(*, "na.action")=Class 'omit' int [1:2] 276 322
qqplot(sort(chol.group1), sort(quantile(chol.group2, probs=ppoints(chol.group1))))
abline(0,1,col="green")
qqplot(sort(chol.group2), sort(quantile(chol.group1, probs=ppoints(chol.group2))))
abline(0,1,col="green")
qq.chol<-qqplot(sort(chol.group1), sort(quantile(chol.group2, probs=ppoints(chol.group1))))
qq.group1 <- qq.chol$y
qq.group2 <- qq.chol$x
plot((qq.group1+qq.group2)/2, qq.group2-qq.group1, main="tmd_Cholesterol",
ylab="Group2 - Group1", xlab="mean", ylim=c(5,-40))
abline(0,0, col="green")
qqplot(sort(trigly.group1), sort(quantile(trigly.group2, probs=ppoints(trigly.group1))))
abline(0,1,col="green")
qq.trigly<-qqplot(sort(trigly.group1), sort(quantile(trigly.group2, probs=ppoints(trigly.group1))))
qq.group1 <- qq.trigly$y
qq.group2 <- qq.trigly$x
plot((qq.group1+qq.group2)/2, qq.group2-qq.group1, main="tmd_Triglyceride",
ylab="Group2 - Group1", xlab="mean", ylim=c(5, -120))
abline(0,0, col="green")
chol<-c(chol.group1, chol.group2)
chol<-sort(chol)
hist(chol)
fitdistr(chol, densfu="gamma")
## shape rate
## 25.665328442 0.120316388
## ( 1.870081336) ( 0.008852518)
(mean.chol<-mean(chol))
## [1] 213.3154
(var.chol<-var(chol))
## [1] 1823.433
(shape.chol<-mean.chol^2/var.chol)
## [1] 24.95483
(scale.chol<-mean.chol/var.chol)
## [1] 0.1169856
i<-1:length(chol)
qgamma.chol<-qgamma((i-0.5)/length(chol),shape=shape.chol, scale=scale.chol)
plot(qgamma.chol, chol, main="Gamma prob plot")
abline(0,1, col="green")
fitdistr(chol, densfu="exponential")
## rate
## 0.0046878949
## (0.0002433834)
qexp.chol<- -log(1-(i-0.5)/length(chol)*(0.0046878949))
plot(qexp.chol, chol, main="Exponential prob plot")
abline(0,1, col="green")
fitdistr(chol, densfu="normal")
## mean sd
## 213.315364 42.644083
## ( 2.213970) ( 1.565514)
qnorm.chol<-qnorm((i-0.5)/length(chol), mean=213.315364, sd=42.644083)
plot(qnorm.chol, chol, main="Normal prob plot")
abline(0,1, col="green")
trigly<-sort(trigly.group2)
hist(trigly)
fitdistr(trigly, densfu="gamma")
## shape rate
## 4.164402883 0.023219079
## (0.315418255) (0.001867378)
(mean.trigly<-mean(trigly))
## [1] 179.3531
(var.trigly<-var(trigly))
## [1] 10372.69
(shape.trigly<-mean.trigly^2/var.trigly)
## [1] 3.101176
(scale.trigly<-mean.trigly/var.trigly)
## [1] 0.01729089
i<-1:length(trigly)
qgamma.trigly<-qgamma((i-0.5)/length(trigly),shape=shape.trigly, scale=scale.trigly)
plot(qgamma.trigly, trigly, main="Gamma prob plot")
abline(0,1, col="green")
fitdistr(trigly, densfu="normal")
## mean sd
## 179.353125 101.687160
## ( 5.684485) ( 4.019538)
qnorm.trigly<-qnorm((i-0.5)/length(trigly), mean=179.353125, sd=101.687160)
plot(qnorm.trigly, trigly, main="Normal prob plot")
abline(0,1, col="green")