Question 1. Conduct log transformation with ‘rivers’ data and search to which theoretical distribution it belong

str(rivers)
##  num [1:141] 735 320 325 392 524 ...
sort.rivers<-sort(rivers)
log.rivers<-log(sort.rivers)
library(lattice)
library(MASS)

hist(log.rivers)

n <- length(log.rivers)
i <- 1:n

A. Uniform distribution

max(log.rivers)-min(log.rivers)
## [1] 3.313512
adjust.scale.log.rivers<-(log.rivers - min(log.rivers))/(max(log.rivers) - min(log.rivers))    # follows unif[0,1]/ min-max scaling
qunif.log.rivers<-(((i-0.5)/n)-0)/(1-0)
plot(qunif.log.rivers,adjust.scale.log.rivers, main="Uniform prob Plot", col="blue")
abline(0,1,col="green")

qqmath(log.rivers, distriution=function(p)qunif(p,min(log.rivers),max(log.rivers)))

B. Normal and Standard Normal distribution

fitdistr(log.rivers, densfu="normal")
##       mean          sd    
##   6.17587888   0.58938291 
##  (0.04963500) (0.03509724)
qnorm.log.rivers<-qnorm((i-0.5)/n,6.17587888,0.58938291)
plot(qnorm.log.rivers, log.rivers, main="Normal prob plot", col="blue")
abline(0,1,col="green")

qqmath(log.rivers, disribution=function(p)qnorm(p,6.17587888,0.58938291))

qstandard.n.log.rivers<-qnorm((i-0.5)/n)
plot(qstandard.n.log.rivers, log.rivers, main="Standard Normal prob plot", col="blue")
abline(0,1,col="green")

qqmath(log.rivers, distribution=qnorm)

C. Student-t distribution

fitdistr(log.rivers, densfu="t")
## Warning in log(s): NaN이 생성되었습니다
##        m            s            df    
##   6.13240057   0.52222478   9.11266483 
##  (0.05423608) (0.05259516) (6.68045101)
qt1.log.rivers<-qt((i-0.5)/n, df=9.11266483)
plot(qt1.log.rivers, log.rivers, main="Student-t prob plot", col="blue")

qt2.log.rivers<-qt((i-0.5)/n,df=n-2)
plot(qt2.log.rivers, log.rivers, main="Student-t, df=n-2")
abline(0,1,col="green")

qqmath(log.rivers, distribution=function(p)qt(p,df=9.11266483))

D. Exponential Distribution

fitdistr(log.rivers, densfu="exponential")
##       rate   
##   0.16192027 
##  (0.01363615)
qexp.log.rivers<- -log(1-(i-0.5)/n)*(1/0.16192027)             # exponential quantile
plot(qexp.log.rivers, log.rivers, main="Exponential prob plot", col="blue")
abline(0,1,col="green")

qqmath(log.rivers, distribution=function(p)qexp(p, rate=0.16912027))

E. Weibull Distribution

fitdistr(log.rivers, densfu="weibull")
## Warning in densfun(x, parm[1], parm[2], ...): NaN이 생성되었습니다
##      shape        scale   
##   9.68871321   6.45644999 
##  (0.56204457) (0.05970672)
qweibull.log.rivers<-log(-log(1-(i-0.5)/n))
plot(qweibull.log.rivers, log(log.rivers), main="Weibull prob plot", col="blue")
abline(0,1,col="green")

qqmath(log.rivers, distribution=function(p)qweibull(p, shape=9.68871321, scale=6.45644999))

F. Gamma Distribution

fitdistr(log.rivers, densfu="gamma")
##      shape         rate   
##   109.021358    17.652904 
##  ( 12.963503) (  2.103892)
(mean.rivers <- mean(log.rivers))
## [1] 6.175879
(var.rivers <- var(log.rivers))
## [1] 0.3498534
(shape.rivers <- mean.rivers^2/var.rivers)
## [1] 109.0213
(scale.rivers <- mean.rivers/var.rivers)
## [1] 17.65276
qgamma.log.rivers<-qgamma((i-0.5)/n,shape=shape.rivers, rate=scale.rivers)
plot(qgamma.log.rivers, log.rivers, main="Gamma prob plot", col="blue")
abline(0,1,col="green")

qqmath(log.rivers, distribution=function(p)qgamma(p, shape=109.021358, rate=17.652904))

par(mfrow=c(1,2))
plot(qnorm.log.rivers, log.rivers, main="Normal prob plot", col="blue")
abline(0,1,col="green")
plot(qgamma.log.rivers, log.rivers, main="Gamma prob plot", col="blue")
abline(0,1,col="green")

qq.norm<-qqplot(qnorm.log.rivers, log.rivers)

qq.x1<-qq.norm$x
qq.y1<-qq.norm$y
qq.gamma<-qqplot(qgamma.log.rivers, log.rivers)

qq.x2<-qq.gamma$x
qq.y2<-qq.gamma$y

par(mfrow=c(1,2))
plot((qq.x1+qq.y1)/2, qq.y1-qq.x1, main="tmd_Normal", ylab="log.rivers - qnorm.log.rivers", xlab="mean", ylim=c(min(qq.y1-qq.x1-3),max(qq.y1-qq.x1+3)))
abline(0,0, col="green")
plot((qq.x2+qq.y2)/2, qq.y2-qq.x2, main="tmd_Gamma", 
        ylab="log.rivers - qgamma.log.rivers", xlab="mean", ylim=c(min(qq.y2-qq.x2-3),max(qq.y2-qq.x2+3)))
abline(0,0, col="green")

x<-seq(4,8.7,0.01)
y1<-dnorm(x,6.17587888, 0.58938291)
y2<-dgamma(x,109.0213, 17.65276)
par(mfrow=c(1,2))
plot(x,y1,type="l", main="normal")
plot(x,y2,type="l", main="gamma")

Question 2. Check if QuAKES.DAT belongs to exponential or weibull distribution

work.d<-"C:/Users/iihsk/Desktop/SeongJin Kim/1. Yonsei University/6. 2018 SPRING/4. Exploratory Data Analysis/3. Assignments/Assignment 4"
setwd(work.d)
dataQ2<-read.delim("QUAKES.DAT", header=FALSE)
str(dataQ2)
## 'data.frame':    5 obs. of  13 variables:
##  $ V1 : int  840 1901 40 139 246
##  $ V2 : int  157 695 1336 780 1617
##  $ V3 : int  145 294 335 203 638
##  $ V4 : int  44 562 1354 436 937
##  $ V5 : int  33 721 454 30 735
##  $ V6 : int  121 76 36 384 38
##  $ V7 : int  150 710 667 129 365
##  $ V8 : int  280 46 40 9 92
##  $ V9 : int  434 402 556 209 82
##  $ V10: int  736 194 99 599 220
##  $ V11: int  584 759 304 83 NA
##  $ V12: int  887 319 375 832 NA
##  $ V13: int  263 460 567 328 NA
apply(dataQ2, 2, function(x)sum(is.na(x)))   # we have 3 NA values in col 11, 12, 13
##  V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 
##   0   0   0   0   0   0   0   0   0   0   1   1   1
attach(dataQ2)
quakes<-c(V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13)
quakes<-na.omit(quakes)
str(quakes)
##  atomic [1:62] 840 1901 40 139 246 157 695 1336 780 1617 ...
##  - attr(*, "na.action")=Class 'omit'  int [1:3] 55 60 65
n<-length(quakes)
quakes<-sort(quakes)
i<-1:n

fitdistr(quakes, densfu="exponential")
##        rate    
##   0.0022872321 
##  (0.0002904788)
qexp.quakes<- -log(1-(i-0.5)/n)/(0.0022872321)             # exponential quantile
plot(qexp.quakes, quakes, main="Exponential prob plot")
abline(0,1, col="red")

qweibull.quakes<-log(-log(1-(i-0.5)/n))
plot(qweibull.quakes, log(quakes), main="Weibull prob plot")
abline(0,1, col="green")

Qustion 3. Check if this data set belongs to Poisson distribution

count<-c(0:14)
frequency<-c(57,203,383,525,532,408,272,139,46,27,10,4,0,1,1)
dataQ3<-rbind(count, frequency)
dataQ3
##           [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## count        0    1    2    3    4    5    6    7    8     9    10    11
## frequency   57  203  383  525  532  408  272  139   46    27    10     4
##           [,13] [,14] [,15]
## count        12    13    14
## frequency     0     1     1
sum.freq<-sum(frequency)
prob.event<-frequency/sum.freq
plot(prob.event~count, data=dataQ3)

fitdistr(count, "poisson")
##     lambda  
##   7.0000000 
##  (0.6831301)
x<-seq(0, 14, 1)
y<-dpois(x, 7)
plot(x,y,main="Poisson density")

n<-length(frequency)
i<-1:n
qpoisson<-qpois((i-0.5)/n, lambda=7)
plot(qpoisson, sort(frequency), main="poisson plotting")
abline(0,1,col="green")

Question 4. Test if data from different fat types belong to same standard distribution.

dataQ4<-read.delim("BLOODFAT.DAT", header=FALSE)
str(dataQ4)
## 'data.frame':    54 obs. of  14 variables:
##  $ V1 : int  195 170 234 158 219 155 243 168 184 215 ...
##  $ V2 : int  348 90 143 87 98 48 101 227 145 168 ...
##  $ V3 : int  237 150 222 167 266 207 209 207 263 233 ...
##  $ V4 : int  174 167 284 177 486 195 97 160 142 340 ...
##  $ V5 : int  205 200 116 217 190 238 221 NA 185 212 ...
##  $ V6 : int  158 154 87 114 108 172 156 NA 115 171 ...
##  $ V7 : int  201 228 157 234 156 168 178 NA 271 221 ...
##  $ V8 : int  171 119 134 116 126 71 116 NA 128 140 ...
##  $ V9 : int  190 169 194 190 187 210 289 NA 173 239 ...
##  $ V10: int  85 86 121 132 109 91 120 NA 56 97 ...
##  $ V11: int  180 178 130 178 149 208 201 NA 230 168 ...
##  $ V12: int  82 166 64 157 146 139 72 NA 304 131 ...
##  $ V13: int  193 251 206 265 147 160 168 NA 222 231 ...
##  $ V14: int  210 211 99 73 95 116 100 NA 151 145 ...
dataQ4
##     V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 V14
## 1  195 348 237 174 205 158 201 171 190  85 180  82 193 210
## 2  170  90 150 167 200 154 228 119 169  86 178 166 251 211
## 3  234 143 222 284 116  87 157 134 194 121 130  64 206  99
## 4  158  87 167 177 217 114 234 116 190 132 178 157 265  73
## 5  219  98 266 486 190 108 156 126 187 109 149 146 147  95
## 6  155  48 207 195 238 172 168  71 210  91 208 139 160 116
## 7  243 101 209  97 221 156 178 116 289 120 201  72 168 100
## 8  168 227 207 160  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## 9  184 145 263 142 185 115 271 128 173  56 230 304 222 151
## 10 215 168 233 340 212 171 221 140 239  97 168 131 231 145
## 11 221 432 131 137 211 124 232 258 313 256 240 221 176 166
## 12 210  92 251 189 175 148 185 256 184 222 198 149 198 333
## 13 208 112 284 245 231 181 171 165 258 210 164  76 230 492
## 14 197  87 216 112 230  90 265 156 197 158 230 146 233 142
## 15 250 118 243  50 175 489 200  68 240 196 185 116 213 130
## 16 180  80 208 220 386 162 236 152 230 162 188 220 200 101
## 17 212 130 193 188 230 158 169 112 181 104 189  84 180 202
## 18 297 232 232 328 150 426 239 154 178 100 242 144 323 196
## 19 168 208 197 291 417 198 172 140 240 441 191 115 217 327
## 20 208 262 220  75 191 115 119  84 171 170 179 126 208 149
## 21 180 102 254 153 191 136 176 217 283 424 253 222 220 172
## 22 268 154 248 312 245 120 171 108 239  92 196 141 247 137
## 23 219 454 159 125 200 152 233 127 232 131 189 135 237 400
## 24 319 418 171  78 194 183 244 108 236 148 260 144 254 170
## 25 250 161 196 130 298 143 306 408 175 153 251 117 256 271
## 26 285 930 184 255 228 142 171 120 229 242 195 137 214 223
## 27 221 268 204 150 276 199 165 121 211  91 264 259 245 446
## 28 227 146 197 265 196 103 193 170 211 122 185 120 157  59
## 29 224 124 209  82 223  80 278 152 251 152 140 164 197 101
## 30 172 106 174 117 192 101 221 179 283 199 178 109 185 168
## 31 181 119 191 233 185 130 206 133 210 217 226  72 219 267
## 32 215 325 228 130 245 257 186 273 242  85 201 297 239 137
## 33 179 126 218 123 279 317 234 135 264 269 237  88 162  91
## 34 245 166 191  90 207 316 248 142 139 173 246  87 247  91
## 35 193 290 332 250 194 116 195 363 243 112 271  89 197 347
## 36 242 179 175 246 138  91 244 177 206 201 191 149 223 154
## 37 172 207 190 120 144 125 194 125 105  36 201  92 193 259
## 38 262  88 211 304 178  84 331 134 235 144 267 199 227 202
## 39 243 126 261 174 185 100 171  90 222 229 231 161 258 328
## 40 211 306 249 256 209  89 177 133 165 151 299  93 274 323
## 41 219 163 233 101 220 153 348 154 194 400 230 137 250 160
## 42 173 300 260 127 258 151 131  61 168  91 208  77 287 209
## 43 308 260 227 172 168 126 178 101 164  80 151  73 165 155
## 44 249 146 258 145 194 196 140  99 187 390 171 135 221 156
## 45 294 135 167  80 208 201 208 148 185 231 159  82 222 108
## 46 266 164 217 227 249 200 218 207 245 322 242 180 262 169
## 47 169 158 204  84 184 182 206 148 198 124 242 248 189 176
## 48 260  98 199 153 207 150 206 107 210  95 229 296 232 583
## 49 267 192 228 149 187 115 304 149 140 102 209 376 198 105
## 50 270 110 188 148 160 125 218  96 257 402 259 240 139  54
## 51 213 261 178 125 172 146 198 103 222 348 238 156 273 146
## 52 131  96 233 141 269  84 170 284 149 237 194 272 142 111
## 53 218 567 194 278 252 233 184 184 203 170 239  38 232 161
## 54 225 240 280 218 185 110 163 156 216 101  NA  NA  NA  NA
apply(dataQ4, 2, function(x)sum(is.na(x)))
##  V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 V14 
##   0   0   0   0   1   1   1   1   1   1   2   2   2   2
sum(apply(dataQ4, 2, function(x)sum(is.na(x))))   # total number of 14 observations are marked NA
## [1] 14
group1<-1:8
chol.group1<-c(dataQ4[group1,1],
        dataQ4[group1,3],
        dataQ4[group1,5],
        dataQ4[group1,7],
        dataQ4[group1,9],
        dataQ4[group1,11],
        dataQ4[group1,13]
        )
chol.group2<-c(dataQ4[-group1,1],
        dataQ4[-group1,3],
        dataQ4[-group1,5],
        dataQ4[-group1,7],
        dataQ4[-group1,9],
        dataQ4[-group1,11],
        dataQ4[-group1,13]
        )
trigly.group1<-c(dataQ4[group1,2],
        dataQ4[group1,4],
        dataQ4[group1,6],
        dataQ4[group1,8],
        dataQ4[group1,10],
        dataQ4[group1,12],
        dataQ4[group1,14]
        )
trigly.group2<-c(dataQ4[-group1,2],
        dataQ4[-group1,4],
        dataQ4[-group1,6],
        dataQ4[-group1,8],
        dataQ4[-group1,10],
        dataQ4[-group1,12],
        dataQ4[-group1,14]
        )
chol.group1<-na.omit(chol.group1)
trigly.group1<-na.omit(trigly.group1)
chol.group2<-na.omit(chol.group2)
trigly.group2<-na.omit(trigly.group2)
str(chol.group1)
##  atomic [1:51] 195 170 234 158 219 155 243 168 237 150 ...
##  - attr(*, "na.action")=Class 'omit'  int [1:5] 24 32 40 48 56
str(chol.group2)
##  atomic [1:320] 184 215 221 210 208 197 250 180 212 297 ...
##  - attr(*, "na.action")=Class 'omit'  int [1:2] 276 322
str(trigly.group1)
##  atomic [1:51] 348 90 143 87 98 48 101 227 174 167 ...
##  - attr(*, "na.action")=Class 'omit'  int [1:5] 24 32 40 48 56
str(trigly.group2)
##  atomic [1:320] 145 168 432 92 112 87 118 80 130 232 ...
##  - attr(*, "na.action")=Class 'omit'  int [1:2] 276 322
qqplot(sort(chol.group1), sort(quantile(chol.group2, probs=ppoints(chol.group1))))
abline(0,1,col="green")

qqplot(sort(chol.group2), sort(quantile(chol.group1, probs=ppoints(chol.group2))))
abline(0,1,col="green")

qq.chol<-qqplot(sort(chol.group1), sort(quantile(chol.group2, probs=ppoints(chol.group1))))

qq.group1 <- qq.chol$y
qq.group2 <- qq.chol$x

plot((qq.group1+qq.group2)/2, qq.group2-qq.group1, main="tmd_Cholesterol", 
        ylab="Group2 - Group1", xlab="mean", ylim=c(5,-40))
abline(0,0, col="green")

qqplot(sort(trigly.group1), sort(quantile(trigly.group2, probs=ppoints(trigly.group1))))
abline(0,1,col="green")

qq.trigly<-qqplot(sort(trigly.group1), sort(quantile(trigly.group2, probs=ppoints(trigly.group1))))

qq.group1 <- qq.trigly$y
qq.group2 <- qq.trigly$x

plot((qq.group1+qq.group2)/2, qq.group2-qq.group1, main="tmd_Triglyceride", 
        ylab="Group2 - Group1", xlab="mean", ylim=c(5, -120))
abline(0,0, col="green")

chol<-c(chol.group1, chol.group2)
chol<-sort(chol)
hist(chol)

fitdistr(chol, densfu="gamma")
##       shape           rate    
##   25.665328442    0.120316388 
##  ( 1.870081336) ( 0.008852518)
(mean.chol<-mean(chol))
## [1] 213.3154
(var.chol<-var(chol))
## [1] 1823.433
(shape.chol<-mean.chol^2/var.chol)
## [1] 24.95483
(scale.chol<-mean.chol/var.chol)
## [1] 0.1169856
i<-1:length(chol)
qgamma.chol<-qgamma((i-0.5)/length(chol),shape=shape.chol, scale=scale.chol)
plot(qgamma.chol, chol, main="Gamma prob plot")
abline(0,1, col="green")

fitdistr(chol, densfu="exponential")
##        rate    
##   0.0046878949 
##  (0.0002433834)
qexp.chol<- -log(1-(i-0.5)/length(chol)*(0.0046878949))
plot(qexp.chol, chol, main="Exponential prob plot")
abline(0,1, col="green")

fitdistr(chol, densfu="normal")
##       mean          sd    
##   213.315364    42.644083 
##  (  2.213970) (  1.565514)
qnorm.chol<-qnorm((i-0.5)/length(chol), mean=213.315364, sd=42.644083)
plot(qnorm.chol, chol, main="Normal prob plot")
abline(0,1, col="green")

trigly<-sort(trigly.group2)
hist(trigly)

fitdistr(trigly, densfu="gamma")
##       shape         rate    
##   4.164402883   0.023219079 
##  (0.315418255) (0.001867378)
(mean.trigly<-mean(trigly))
## [1] 179.3531
(var.trigly<-var(trigly))
## [1] 10372.69
(shape.trigly<-mean.trigly^2/var.trigly)
## [1] 3.101176
(scale.trigly<-mean.trigly/var.trigly)
## [1] 0.01729089
i<-1:length(trigly)
qgamma.trigly<-qgamma((i-0.5)/length(trigly),shape=shape.trigly, scale=scale.trigly)
plot(qgamma.trigly, trigly, main="Gamma prob plot")
abline(0,1, col="green")

fitdistr(trigly, densfu="normal")
##       mean          sd    
##   179.353125   101.687160 
##  (  5.684485) (  4.019538)
qnorm.trigly<-qnorm((i-0.5)/length(trigly), mean=179.353125, sd=101.687160)
plot(qnorm.trigly, trigly, main="Normal prob plot")
abline(0,1, col="green")