EDA Assignment 4_2013122032

Question 1. Conduct log transformation with ‘rivers’ data and search to which theoretical distribution it belong

str(rivers)

##  num [1:141] 735 320 325 392 524 ...

sort.rivers<-sort(rivers)
log.rivers<-log(sort.rivers)
library(lattice)
library(MASS)

hist(log.rivers)

n <- length(log.rivers)
i <- 1:n

A. Uniform distribution

max(log.rivers)-min(log.rivers)

## [1] 3.313512

adjust.scale.log.rivers<-(log.rivers - min(log.rivers))/(max(log.rivers) - min(log.rivers))    # follows unif[0,1]/ min-max scaling
qunif.log.rivers<-(((i-0.5)/n)-0)/(1-0)
plot(qunif.log.rivers,adjust.scale.log.rivers, main="Uniform prob Plot", col="blue")
abline(0,1,col="green")

qqmath(log.rivers, distriution=function(p)qunif(p,min(log.rivers),max(log.rivers)))

B. Normal and Standard Normal distribution

fitdistr(log.rivers, densfu="normal")

##       mean          sd    
##   6.17587888   0.58938291 
##  (0.04963500) (0.03509724)

qnorm.log.rivers<-qnorm((i-0.5)/n,6.17587888,0.58938291)
plot(qnorm.log.rivers, log.rivers, main="Normal prob plot", col="blue")
abline(0,1,col="green")

qqmath(log.rivers, disribution=function(p)qnorm(p,6.17587888,0.58938291))

qstandard.n.log.rivers<-qnorm((i-0.5)/n)
plot(qstandard.n.log.rivers, log.rivers, main="Standard Normal prob plot", col="blue")
abline(0,1,col="green")

qqmath(log.rivers, distribution=qnorm)

C. Student-t distribution

fitdistr(log.rivers, densfu="t")

## Warning in log(s): NaNì´ ìì±ëììµëë¤

##        m            s            df    
##   6.13240057   0.52222478   9.11266483 
##  (0.05423608) (0.05259516) (6.68045101)

qt1.log.rivers<-qt((i-0.5)/n, df=9.11266483)
plot(qt1.log.rivers, log.rivers, main="Student-t prob plot", col="blue")

qt2.log.rivers<-qt((i-0.5)/n,df=n-2)
plot(qt2.log.rivers, log.rivers, main="Student-t, df=n-2")
abline(0,1,col="green")

qqmath(log.rivers, distribution=function(p)qt(p,df=9.11266483))

D. Exponential Distribution

fitdistr(log.rivers, densfu="exponential")

##       rate   
##   0.16192027 
##  (0.01363615)

qexp.log.rivers<- -log(1-(i-0.5)/n)*(1/0.16192027)             # exponential quantile
plot(qexp.log.rivers, log.rivers, main="Exponential prob plot", col="blue")
abline(0,1,col="green")

qqmath(log.rivers, distribution=function(p)qexp(p, rate=0.16912027))

E. Weibull Distribution

fitdistr(log.rivers, densfu="weibull")

## Warning in densfun(x, parm[1], parm[2], ...): NaNì´ ìì±ëììµëë¤

##      shape        scale   
##   9.68871321   6.45644999 
##  (0.56204457) (0.05970672)

qweibull.log.rivers<-log(-log(1-(i-0.5)/n))
plot(qweibull.log.rivers, log(log.rivers), main="Weibull prob plot", col="blue")
abline(0,1,col="green")

qqmath(log.rivers, distribution=function(p)qweibull(p, shape=9.68871321, scale=6.45644999))

F. Gamma Distribution

fitdistr(log.rivers, densfu="gamma")

##      shape         rate   
##   109.021358    17.652904 
##  ( 12.963503) (  2.103892)

Find shape, scale parameter with ‘fitdistr’ function. Or equivalently

(mean.rivers <- mean(log.rivers))

## [1] 6.175879

(var.rivers <- var(log.rivers))

## [1] 0.3498534

(shape.rivers <- mean.rivers^2/var.rivers)

## [1] 109.0213

(scale.rivers <- mean.rivers/var.rivers)

## [1] 17.65276

qgamma.log.rivers<-qgamma((i-0.5)/n,shape=shape.rivers, rate=scale.rivers)
plot(qgamma.log.rivers, log.rivers, main="Gamma prob plot", col="blue")
abline(0,1,col="green")

qqmath(log.rivers, distribution=function(p)qgamma(p, shape=109.021358, rate=17.652904))

Normal Distribution and gamma distribution seems plausible for sample’s theoretical distributions

par(mfrow=c(1,2))
plot(qnorm.log.rivers, log.rivers, main="Normal prob plot", col="blue")
abline(0,1,col="green")
plot(qgamma.log.rivers, log.rivers, main="Gamma prob plot", col="blue")
abline(0,1,col="green")

qq.norm<-qqplot(qnorm.log.rivers, log.rivers)

qq.x1<-qq.norm$x
qq.y1<-qq.norm$y
qq.gamma<-qqplot(qgamma.log.rivers, log.rivers)

qq.x2<-qq.gamma$x
qq.y2<-qq.gamma$y

par(mfrow=c(1,2))
plot((qq.x1+qq.y1)/2, qq.y1-qq.x1, main="tmd_Normal", ylab="log.rivers - qnorm.log.rivers", xlab="mean", ylim=c(min(qq.y1-qq.x1-3),max(qq.y1-qq.x1+3)))
abline(0,0, col="green")
plot((qq.x2+qq.y2)/2, qq.y2-qq.x2, main="tmd_Gamma", 
        ylab="log.rivers - qgamma.log.rivers", xlab="mean", ylim=c(min(qq.y2-qq.x2-3),max(qq.y2-qq.x2+3)))
abline(0,0, col="green")

Compare between pdf of normal, gamma distribution with estimated parameters

x<-seq(4,8.7,0.01)
y1<-dnorm(x,6.17587888, 0.58938291)
y2<-dgamma(x,109.0213, 17.65276)
par(mfrow=c(1,2))
plot(x,y1,type="l", main="normal")
plot(x,y2,type="l", main="gamma")

Question 2. Check if QuAKES.DAT belongs to exponential or weibull distribution

work.d<-"C:/Users/iihsk/Desktop/SeongJin Kim/1. Yonsei University/6. 2018 SPRING/4. Exploratory Data Analysis/3. Assignments/Assignment 4"
setwd(work.d)
dataQ2<-read.delim("QUAKES.DAT", header=FALSE)
str(dataQ2)

## 'data.frame':    5 obs. of  13 variables:
##  $ V1 : int  840 1901 40 139 246
##  $ V2 : int  157 695 1336 780 1617
##  $ V3 : int  145 294 335 203 638
##  $ V4 : int  44 562 1354 436 937
##  $ V5 : int  33 721 454 30 735
##  $ V6 : int  121 76 36 384 38
##  $ V7 : int  150 710 667 129 365
##  $ V8 : int  280 46 40 9 92
##  $ V9 : int  434 402 556 209 82
##  $ V10: int  736 194 99 599 220
##  $ V11: int  584 759 304 83 NA
##  $ V12: int  887 319 375 832 NA
##  $ V13: int  263 460 567 328 NA

apply(dataQ2, 2, function(x)sum(is.na(x)))   # we have 3 NA values in col 11, 12, 13

##  V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 
##   0   0   0   0   0   0   0   0   0   0   1   1   1

attach(dataQ2)
quakes<-c(V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13)
quakes<-na.omit(quakes)
str(quakes)

##  atomic [1:62] 840 1901 40 139 246 157 695 1336 780 1617 ...
##  - attr(*, "na.action")=Class 'omit'  int [1:3] 55 60 65

n<-length(quakes)
quakes<-sort(quakes)
i<-1:n

fitdistr(quakes, densfu="exponential")

##        rate    
##   0.0022872321 
##  (0.0002904788)

qexp.quakes<- -log(1-(i-0.5)/n)/(0.0022872321)             # exponential quantile
plot(qexp.quakes, quakes, main="Exponential prob plot")
abline(0,1, col="red")

qweibull.quakes<-log(-log(1-(i-0.5)/n))
plot(qweibull.quakes, log(quakes), main="Weibull prob plot")
abline(0,1, col="green")

Qustion 3. Check if this data set belongs to Poisson distribution

count<-c(0:14)
frequency<-c(57,203,383,525,532,408,272,139,46,27,10,4,0,1,1)
dataQ3<-rbind(count, frequency)
dataQ3

##           [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## count        0    1    2    3    4    5    6    7    8     9    10    11
## frequency   57  203  383  525  532  408  272  139   46    27    10     4
##           [,13] [,14] [,15]
## count        12    13    14
## frequency     0     1     1

sum.freq<-sum(frequency)
prob.event<-frequency/sum.freq
plot(prob.event~count, data=dataQ3)

fitdistr(count, "poisson")

##     lambda  
##   7.0000000 
##  (0.6831301)

x<-seq(0, 14, 1)
y<-dpois(x, 7)
plot(x,y,main="Poisson density")

n<-length(frequency)
i<-1:n
qpoisson<-qpois((i-0.5)/n, lambda=7)
plot(qpoisson, sort(frequency), main="poisson plotting")
abline(0,1,col="green")

Question 4. Test if data from different fat types belong to same standard distribution.

dataQ4<-read.delim("BLOODFAT.DAT", header=FALSE)
str(dataQ4)

## 'data.frame':    54 obs. of  14 variables:
##  $ V1 : int  195 170 234 158 219 155 243 168 184 215 ...
##  $ V2 : int  348 90 143 87 98 48 101 227 145 168 ...
##  $ V3 : int  237 150 222 167 266 207 209 207 263 233 ...
##  $ V4 : int  174 167 284 177 486 195 97 160 142 340 ...
##  $ V5 : int  205 200 116 217 190 238 221 NA 185 212 ...
##  $ V6 : int  158 154 87 114 108 172 156 NA 115 171 ...
##  $ V7 : int  201 228 157 234 156 168 178 NA 271 221 ...
##  $ V8 : int  171 119 134 116 126 71 116 NA 128 140 ...
##  $ V9 : int  190 169 194 190 187 210 289 NA 173 239 ...
##  $ V10: int  85 86 121 132 109 91 120 NA 56 97 ...
##  $ V11: int  180 178 130 178 149 208 201 NA 230 168 ...
##  $ V12: int  82 166 64 157 146 139 72 NA 304 131 ...
##  $ V13: int  193 251 206 265 147 160 168 NA 222 231 ...
##  $ V14: int  210 211 99 73 95 116 100 NA 151 145 ...

dataQ4

##     V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 V14
## 1  195 348 237 174 205 158 201 171 190  85 180  82 193 210
## 2  170  90 150 167 200 154 228 119 169  86 178 166 251 211
## 3  234 143 222 284 116  87 157 134 194 121 130  64 206  99
## 4  158  87 167 177 217 114 234 116 190 132 178 157 265  73
## 5  219  98 266 486 190 108 156 126 187 109 149 146 147  95
## 6  155  48 207 195 238 172 168  71 210  91 208 139 160 116
## 7  243 101 209  97 221 156 178 116 289 120 201  72 168 100
## 8  168 227 207 160  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## 9  184 145 263 142 185 115 271 128 173  56 230 304 222 151
## 10 215 168 233 340 212 171 221 140 239  97 168 131 231 145
## 11 221 432 131 137 211 124 232 258 313 256 240 221 176 166
## 12 210  92 251 189 175 148 185 256 184 222 198 149 198 333
## 13 208 112 284 245 231 181 171 165 258 210 164  76 230 492
## 14 197  87 216 112 230  90 265 156 197 158 230 146 233 142
## 15 250 118 243  50 175 489 200  68 240 196 185 116 213 130
## 16 180  80 208 220 386 162 236 152 230 162 188 220 200 101
## 17 212 130 193 188 230 158 169 112 181 104 189  84 180 202
## 18 297 232 232 328 150 426 239 154 178 100 242 144 323 196
## 19 168 208 197 291 417 198 172 140 240 441 191 115 217 327
## 20 208 262 220  75 191 115 119  84 171 170 179 126 208 149
## 21 180 102 254 153 191 136 176 217 283 424 253 222 220 172
## 22 268 154 248 312 245 120 171 108 239  92 196 141 247 137
## 23 219 454 159 125 200 152 233 127 232 131 189 135 237 400
## 24 319 418 171  78 194 183 244 108 236 148 260 144 254 170
## 25 250 161 196 130 298 143 306 408 175 153 251 117 256 271
## 26 285 930 184 255 228 142 171 120 229 242 195 137 214 223
## 27 221 268 204 150 276 199 165 121 211  91 264 259 245 446
## 28 227 146 197 265 196 103 193 170 211 122 185 120 157  59
## 29 224 124 209  82 223  80 278 152 251 152 140 164 197 101
## 30 172 106 174 117 192 101 221 179 283 199 178 109 185 168
## 31 181 119 191 233 185 130 206 133 210 217 226  72 219 267
## 32 215 325 228 130 245 257 186 273 242  85 201 297 239 137
## 33 179 126 218 123 279 317 234 135 264 269 237  88 162  91
## 34 245 166 191  90 207 316 248 142 139 173 246  87 247  91
## 35 193 290 332 250 194 116 195 363 243 112 271  89 197 347
## 36 242 179 175 246 138  91 244 177 206 201 191 149 223 154
## 37 172 207 190 120 144 125 194 125 105  36 201  92 193 259
## 38 262  88 211 304 178  84 331 134 235 144 267 199 227 202
## 39 243 126 261 174 185 100 171  90 222 229 231 161 258 328
## 40 211 306 249 256 209  89 177 133 165 151 299  93 274 323
## 41 219 163 233 101 220 153 348 154 194 400 230 137 250 160
## 42 173 300 260 127 258 151 131  61 168  91 208  77 287 209
## 43 308 260 227 172 168 126 178 101 164  80 151  73 165 155
## 44 249 146 258 145 194 196 140  99 187 390 171 135 221 156
## 45 294 135 167  80 208 201 208 148 185 231 159  82 222 108
## 46 266 164 217 227 249 200 218 207 245 322 242 180 262 169
## 47 169 158 204  84 184 182 206 148 198 124 242 248 189 176
## 48 260  98 199 153 207 150 206 107 210  95 229 296 232 583
## 49 267 192 228 149 187 115 304 149 140 102 209 376 198 105
## 50 270 110 188 148 160 125 218  96 257 402 259 240 139  54
## 51 213 261 178 125 172 146 198 103 222 348 238 156 273 146
## 52 131  96 233 141 269  84 170 284 149 237 194 272 142 111
## 53 218 567 194 278 252 233 184 184 203 170 239  38 232 161
## 54 225 240 280 218 185 110 163 156 216 101  NA  NA  NA  NA

apply(dataQ4, 2, function(x)sum(is.na(x)))

##  V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 V14 
##   0   0   0   0   1   1   1   1   1   1   2   2   2   2

sum(apply(dataQ4, 2, function(x)sum(is.na(x))))   # total number of 14 observations are marked NA

## [1] 14

group1<-1:8
chol.group1<-c(dataQ4[group1,1],
        dataQ4[group1,3],
        dataQ4[group1,5],
        dataQ4[group1,7],
        dataQ4[group1,9],
        dataQ4[group1,11],
        dataQ4[group1,13]
        )
chol.group2<-c(dataQ4[-group1,1],
        dataQ4[-group1,3],
        dataQ4[-group1,5],
        dataQ4[-group1,7],
        dataQ4[-group1,9],
        dataQ4[-group1,11],
        dataQ4[-group1,13]
        )
trigly.group1<-c(dataQ4[group1,2],
        dataQ4[group1,4],
        dataQ4[group1,6],
        dataQ4[group1,8],
        dataQ4[group1,10],
        dataQ4[group1,12],
        dataQ4[group1,14]
        )
trigly.group2<-c(dataQ4[-group1,2],
        dataQ4[-group1,4],
        dataQ4[-group1,6],
        dataQ4[-group1,8],
        dataQ4[-group1,10],
        dataQ4[-group1,12],
        dataQ4[-group1,14]
        )
chol.group1<-na.omit(chol.group1)
trigly.group1<-na.omit(trigly.group1)
chol.group2<-na.omit(chol.group2)
trigly.group2<-na.omit(trigly.group2)
str(chol.group1)

##  atomic [1:51] 195 170 234 158 219 155 243 168 237 150 ...
##  - attr(*, "na.action")=Class 'omit'  int [1:5] 24 32 40 48 56

str(chol.group2)

##  atomic [1:320] 184 215 221 210 208 197 250 180 212 297 ...
##  - attr(*, "na.action")=Class 'omit'  int [1:2] 276 322

str(trigly.group1)

##  atomic [1:51] 348 90 143 87 98 48 101 227 174 167 ...
##  - attr(*, "na.action")=Class 'omit'  int [1:5] 24 32 40 48 56

str(trigly.group2)

##  atomic [1:320] 145 168 432 92 112 87 118 80 130 232 ...
##  - attr(*, "na.action")=Class 'omit'  int [1:2] 276 322

qqplot(sort(chol.group1), sort(quantile(chol.group2, probs=ppoints(chol.group1))))
abline(0,1,col="green")

qqplot of cholesterol levels between group1 and group2. Interpolated group 1 which is bigger by estimating length(group1) quantiles from length(group2) observations
Interpolating smaller data would yield qqplot with wierd figure

qqplot(sort(chol.group2), sort(quantile(chol.group1, probs=ppoints(chol.group2))))
abline(0,1,col="green")

The extra information from interpolating smaller dataset is mere ‘extra’ information from interpolation function, which is highly unneccassary.

qq.chol<-qqplot(sort(chol.group1), sort(quantile(chol.group2, probs=ppoints(chol.group1))))

qq.group1 <- qq.chol$y
qq.group2 <- qq.chol$x

plot((qq.group1+qq.group2)/2, qq.group2-qq.group1, main="tmd_Cholesterol", 
        ylab="Group2 - Group1", xlab="mean", ylim=c(5,-40))
abline(0,0, col="green")

qqplot(sort(trigly.group1), sort(quantile(trigly.group2, probs=ppoints(trigly.group1))))
abline(0,1,col="green")

qq.trigly<-qqplot(sort(trigly.group1), sort(quantile(trigly.group2, probs=ppoints(trigly.group1))))

qq.group1 <- qq.trigly$y
qq.group2 <- qq.trigly$x

plot((qq.group1+qq.group2)/2, qq.group2-qq.group1, main="tmd_Triglyceride", 
        ylab="Group2 - Group1", xlab="mean", ylim=c(5, -120))
abline(0,0, col="green")

chol<-c(chol.group1, chol.group2)
chol<-sort(chol)
hist(chol)

fitdistr(chol, densfu="gamma")

##       shape           rate    
##   25.665328442    0.120316388 
##  ( 1.870081336) ( 0.008852518)

(mean.chol<-mean(chol))

## [1] 213.3154

(var.chol<-var(chol))

## [1] 1823.433

(shape.chol<-mean.chol^2/var.chol)

## [1] 24.95483

(scale.chol<-mean.chol/var.chol)

## [1] 0.1169856

i<-1:length(chol)
qgamma.chol<-qgamma((i-0.5)/length(chol),shape=shape.chol, scale=scale.chol)
plot(qgamma.chol, chol, main="Gamma prob plot")
abline(0,1, col="green")

fitdistr(chol, densfu="exponential")

##        rate    
##   0.0046878949 
##  (0.0002433834)

qexp.chol<- -log(1-(i-0.5)/length(chol)*(0.0046878949))
plot(qexp.chol, chol, main="Exponential prob plot")
abline(0,1, col="green")

fitdistr(chol, densfu="normal")

##       mean          sd    
##   213.315364    42.644083 
##  (  2.213970) (  1.565514)

qnorm.chol<-qnorm((i-0.5)/length(chol), mean=213.315364, sd=42.644083)
plot(qnorm.chol, chol, main="Normal prob plot")
abline(0,1, col="green")

trigly<-sort(trigly.group2)
hist(trigly)

fitdistr(trigly, densfu="gamma")

##       shape         rate    
##   4.164402883   0.023219079 
##  (0.315418255) (0.001867378)

(mean.trigly<-mean(trigly))

## [1] 179.3531

(var.trigly<-var(trigly))

## [1] 10372.69

(shape.trigly<-mean.trigly^2/var.trigly)

## [1] 3.101176

(scale.trigly<-mean.trigly/var.trigly)

## [1] 0.01729089

i<-1:length(trigly)
qgamma.trigly<-qgamma((i-0.5)/length(trigly),shape=shape.trigly, scale=scale.trigly)
plot(qgamma.trigly, trigly, main="Gamma prob plot")
abline(0,1, col="green")

fitdistr(trigly, densfu="normal")

##       mean          sd    
##   179.353125   101.687160 
##  (  5.684485) (  4.019538)

qnorm.trigly<-qnorm((i-0.5)/length(trigly), mean=179.353125, sd=101.687160)
plot(qnorm.trigly, trigly, main="Normal prob plot")
abline(0,1, col="green")

EDA Assignment 4_2013122032_SeongJin Kim

SeongJin Kim

April 6, 2018

Question 1. Conduct log transformation with ‘rivers’ data and search to which theoretical distribution it belong

Question 2. Check if QuAKES.DAT belongs to exponential or weibull distribution

Qustion 3. Check if this data set belongs to Poisson distribution

Question 4. Test if data from different fat types belong to same standard distribution.