Call R libraries.
library(data.table)
library(plyr)
library(ggplot2)
library(plotrix)
library(fitdistrplus)
## Loading required package: MASS
Data Subset
download.file(url = "http://coaps.fsu.edu/~tachanat/data/Atlanta_Precip.txt", destfile = "Atlanta_Precip.txt")
#data = read.class("Atlanta_Precip.txt", skip= 14, header = TRUE, colClasses=c("numeric"))
#precip = data[4]
data = fread("Atlanta_Precip.txt", skip= 14, header = TRUE, colClasses=c("numeric"))
precip = subset(data, as.numeric(inch) > 0)
n_observed = length(data$inch)
n_rainyday = length(precip$inch)
p_rainyday = round(n_rainyday/n_observed*100, digits=2)
Daily precipitation at Hartsfield International Airport in Atlanta
Total observations = 4,116 days (100%)
Observed rainy days = 1,294 days (31.44%)
Data Binning
nbin = 58
width = 0.2
for (i in 1:nbin) {
lower = width*i - width
upper = width*i
precip$class[precip$inch > lower & precip$inch <= upper] = i
}
#count = count(precip, "class")
class1 = data.frame(class = 1:26)
class1$freq = tabulate(precip$class)
class1$density = round(class1$freq/sum(class1$freq), digits=4)
class2 = data.frame(class = 27:58, freq=0, density=0)
class = rbind(class1, class2)
Histogram
par(mfrow = c(1, 2))
barplot(class$freq, xlim=c(1, 58), ylim=c(0, 700), ylab="frequency", main="histogram (frequency)")
barplot(class$density, xlim=c(1, 58), ylim=c(0, 0.5), ylab="density", main="histogram (density)")
Normalized histogram
class$norm_density = round(class$freq/(sum(class$freq)*width), digits=4)
class
## class freq density norm_density
## 1 1 657 0.5077 2.5386
## 2 2 214 0.1654 0.8269
## 3 3 132 0.1020 0.5100
## 4 4 104 0.0804 0.4019
## 5 5 49 0.0379 0.1893
## 6 6 34 0.0263 0.1314
## 7 7 23 0.0178 0.0889
## 8 8 31 0.0240 0.1198
## 9 9 16 0.0124 0.0618
## 10 10 8 0.0062 0.0309
## 11 11 7 0.0054 0.0270
## 12 12 3 0.0023 0.0116
## 13 13 2 0.0015 0.0077
## 14 14 1 0.0008 0.0039
## 15 15 6 0.0046 0.0232
## 16 16 1 0.0008 0.0039
## 17 17 0 0.0000 0.0000
## 18 18 0 0.0000 0.0000
## 19 19 2 0.0015 0.0077
## 20 20 0 0.0000 0.0000
## 21 21 0 0.0000 0.0000
## 22 22 0 0.0000 0.0000
## 23 23 0 0.0000 0.0000
## 24 24 1 0.0008 0.0039
## 25 25 2 0.0015 0.0077
## 26 26 1 0.0008 0.0039
## 27 27 0 0.0000 0.0000
## 28 28 0 0.0000 0.0000
## 29 29 0 0.0000 0.0000
## 30 30 0 0.0000 0.0000
## 31 31 0 0.0000 0.0000
## 32 32 0 0.0000 0.0000
## 33 33 0 0.0000 0.0000
## 34 34 0 0.0000 0.0000
## 35 35 0 0.0000 0.0000
## 36 36 0 0.0000 0.0000
## 37 37 0 0.0000 0.0000
## 38 38 0 0.0000 0.0000
## 39 39 0 0.0000 0.0000
## 40 40 0 0.0000 0.0000
## 41 41 0 0.0000 0.0000
## 42 42 0 0.0000 0.0000
## 43 43 0 0.0000 0.0000
## 44 44 0 0.0000 0.0000
## 45 45 0 0.0000 0.0000
## 46 46 0 0.0000 0.0000
## 47 47 0 0.0000 0.0000
## 48 48 0 0.0000 0.0000
## 49 49 0 0.0000 0.0000
## 50 50 0 0.0000 0.0000
## 51 51 0 0.0000 0.0000
## 52 52 0 0.0000 0.0000
## 53 53 0 0.0000 0.0000
## 54 54 0 0.0000 0.0000
## 55 55 0 0.0000 0.0000
## 56 56 0 0.0000 0.0000
## 57 57 0 0.0000 0.0000
## 58 58 0 0.0000 0.0000
barp(class$norm_density, xlim=c(0, 60), ylim=c(-0.25, 3.25), ylab="density", main="normalized histogram", xlab="bin class index for precipitation rate [inch/day] by 0.2 binwidth")
Fitting parameters
#Gaussian distribution
Gauss_Exp = mean(precip$inch)
Gauss_Var = var(precip$inch)
#Log-normal distribution
precip$log = log(precip$inch)
Lnorm_Exp = exp(mean(precip$log) + var(precip$log)/2)
Lnorm_Var = (exp(var(precip$log))-1)*exp(2*mean(precip$log)+var(precip$log))
Fitting parameters for Gaussian distribution
E[x] = 0.404
Var[x] = 0.319
Fitting parameters for log-normal distribution
E[x] = 0.525
Var[x] = 2.752
mean_ln(x) = -1.844
var_ln(x) = 2.397
std_ln(x) = 1.548
PDF curve over histogram
#Define the central value for each bin class.
pdf = data.frame(class = 1:58)
pdf$central = (pdf$class - 1/2)*width
pdf$freq = class$freq
#Calculate the probability based on known mean and variance.(by command)
pdf$pdf_Gauss = round(dnorm(pdf$central, mean=Gauss_Exp, sd=sqrt(Gauss_Var)), digits=5)
pdf$pdf_Lnorm = round(dlnorm(pdf$central, meanlog=mean(precip$log), sdlog=sd(precip$log)), digits=5)
#Calculate the probability based on known mean and variance.(by hand)
pdf$pdf_Gauss2 = round((1/(sqrt(2*pi)*sd(precip$inch)))*exp(-((pdf$central-mean(precip$inch))^2 / (2*var(precip$inch)))), digits=5)
pdf$pdf_Lnorm2 = round((1/(pdf$central*sd(precip$log)*sqrt(2*pi)))*exp(-((log(pdf$central)-mean(precip$log))^2/(2*var(precip$log)))), digits=5)
pdf
## class central freq pdf_Gauss pdf_Lnorm pdf_Gauss2 pdf_Lnorm2
## 1 1 0.1 657 0.61108 2.46582 0.61108 2.46582
## 2 2 0.3 214 0.69447 0.78862 0.69447 0.78862
## 3 3 0.5 132 0.69623 0.39101 0.69623 0.39101
## 4 4 0.7 104 0.61573 0.23211 0.61573 0.23211
## 5 5 0.9 49 0.48036 0.15245 0.48036 0.15245
## 6 6 1.1 34 0.33059 0.10694 0.33059 0.10694
## 7 7 1.3 23 0.20070 0.07859 0.20070 0.07859
## 8 8 1.5 31 0.10749 0.05981 0.10749 0.05981
## 9 9 1.7 16 0.05078 0.04678 0.05078 0.04678
## 10 10 1.9 8 0.02116 0.03739 0.02116 0.03739
## 11 11 2.1 7 0.00778 0.03043 0.00778 0.03043
## 12 12 2.3 3 0.00252 0.02514 0.00252 0.02514
## 13 13 2.5 2 0.00072 0.02105 0.00072 0.02105
## 14 14 2.7 1 0.00018 0.01781 0.00018 0.01781
## 15 15 2.9 6 0.00004 0.01522 0.00004 0.01522
## 16 16 3.1 1 0.00001 0.01312 0.00001 0.01312
## 17 17 3.3 0 0.00000 0.01140 0.00000 0.01140
## 18 18 3.5 0 0.00000 0.00997 0.00000 0.00997
## 19 19 3.7 2 0.00000 0.00877 0.00000 0.00877
## 20 20 3.9 0 0.00000 0.00776 0.00000 0.00776
## 21 21 4.1 0 0.00000 0.00690 0.00000 0.00690
## 22 22 4.3 0 0.00000 0.00616 0.00000 0.00616
## 23 23 4.5 0 0.00000 0.00553 0.00000 0.00553
## 24 24 4.7 1 0.00000 0.00498 0.00000 0.00498
## 25 25 4.9 2 0.00000 0.00450 0.00000 0.00450
## 26 26 5.1 1 0.00000 0.00408 0.00000 0.00408
## 27 27 5.3 0 0.00000 0.00372 0.00000 0.00372
## 28 28 5.5 0 0.00000 0.00339 0.00000 0.00339
## 29 29 5.7 0 0.00000 0.00310 0.00000 0.00310
## 30 30 5.9 0 0.00000 0.00285 0.00000 0.00285
## 31 31 6.1 0 0.00000 0.00262 0.00000 0.00262
## 32 32 6.3 0 0.00000 0.00241 0.00000 0.00241
## 33 33 6.5 0 0.00000 0.00223 0.00000 0.00223
## 34 34 6.7 0 0.00000 0.00206 0.00000 0.00206
## 35 35 6.9 0 0.00000 0.00191 0.00000 0.00191
## 36 36 7.1 0 0.00000 0.00178 0.00000 0.00178
## 37 37 7.3 0 0.00000 0.00165 0.00000 0.00165
## 38 38 7.5 0 0.00000 0.00154 0.00000 0.00154
## 39 39 7.7 0 0.00000 0.00144 0.00000 0.00144
## 40 40 7.9 0 0.00000 0.00134 0.00000 0.00134
## 41 41 8.1 0 0.00000 0.00126 0.00000 0.00126
## 42 42 8.3 0 0.00000 0.00118 0.00000 0.00118
## 43 43 8.5 0 0.00000 0.00111 0.00000 0.00111
## 44 44 8.7 0 0.00000 0.00104 0.00000 0.00104
## 45 45 8.9 0 0.00000 0.00098 0.00000 0.00098
## 46 46 9.1 0 0.00000 0.00092 0.00000 0.00092
## 47 47 9.3 0 0.00000 0.00087 0.00000 0.00087
## 48 48 9.5 0 0.00000 0.00082 0.00000 0.00082
## 49 49 9.7 0 0.00000 0.00078 0.00000 0.00078
## 50 50 9.9 0 0.00000 0.00073 0.00000 0.00073
## 51 51 10.1 0 0.00000 0.00070 0.00000 0.00070
## 52 52 10.3 0 0.00000 0.00066 0.00000 0.00066
## 53 53 10.5 0 0.00000 0.00063 0.00000 0.00063
## 54 54 10.7 0 0.00000 0.00059 0.00000 0.00059
## 55 55 10.9 0 0.00000 0.00056 0.00000 0.00056
## 56 56 11.1 0 0.00000 0.00054 0.00000 0.00054
## 57 57 11.3 0 0.00000 0.00051 0.00000 0.00051
## 58 58 11.5 0 0.00000 0.00049 0.00000 0.00049
#Normalize the PDF and CDF
pdf$npdf_Gaussian = round(pdf$pdf_Gauss2/sum(pdf$pdf_Gauss2), digits=5)
pdf$npdf_Lognormal = round(pdf$pdf_Lnorm2/sum(pdf$pdf_Lnorm2), digits=5)
#for (i in 1:nbin) {
# pdf$cdf_Gaussian[i] = round(sum(pdf$pdf_Gaussian[1:i]), digits=3)
# pdf$cdf_Lognormal[i] = round(sum(pdf$pdf_Lognormal[1:i]), digits=3)
#}
#Plots of theoretical distibutions [Normalized PDFs over (Unnormalized) Histogram]
class$density[which(class$density==0)] = NA
plot(pdf$central, class$density, type="h", col="darkgray", lwd=8, xlim=c(0, 12), ylim=c(0, 0.6), ylab="density", xlab="precipitation rate (inch/day) by 0.2 binwidth", main="Normalized PDFs over (Unnormalized) Histogram \n Gaussian (blue) and Lognormal (red)")
par(new=TRUE)
plot(pdf$central, pdf$npdf_Gaussian, type="l", col="blue", lwd=3, xlim=c(0, 12), ylim=c(0, 0.6), ylab="density", xlab="precipitation rate (inch/day) by 0.2 binwidth")
par(new=TRUE)
plot(pdf$central, pdf$npdf_Lognormal, type="l", col="red", lwd=3, xlim=c(0, 12), ylim=c(0, 0.6), ylab="density", xlab="precipitation rate (inch/day) by 0.2 binwidth")
#Plots of theoretical distibutions [(Unnormalized) PDFs over Normalized Histogram]
class$norm_density[which(class$norm_density==0)] = NA
plot(pdf$central, class$norm_density, type="h", col="darkgray", lwd=8, xlim=c(0, 12), ylim=c(0, 3), ylab="density", xlab="precipitation rate (inch/day) by 0.2 binwidth", main="(Unnormalized) PDFs over Normalized Histogram \n Gaussian (blue) and Lognormal (red)")
par(new=TRUE)
plot(pdf$central, pdf$pdf_Gauss2, type="l", col="blue", lwd=3, xlim=c(0, 12), ylim=c(0, 3), ylab="density", xlab="precipitation rate (inch/day) by 0.2 binwidth")
par(new=TRUE)
plot(pdf$central, pdf$pdf_Lnorm2, type="l", col="red", lwd=3, xlim=c(0, 12), ylim=c(0, 3), ylab="density", xlab="precipitation rate (inch/day) by 0.2 binwidth")
Chi-square test for the goodness of fit
gof = data.frame(class = 1:58)
gof$central = pdf$central
gof$pdf_Gaussian = pdf$pdf_Gauss2
gof$pdf_Lognormal = pdf$pdf_Lnorm2
gof$observ = class$freq
#Calculate the expected values based on known probability.
gof$expect_Gaussian = round(gof$pdf_Gaussian*sum(class$freq)*width, digits=5)
gof$expect_Lognormal = round(gof$pdf_Lognormal*sum(class$freq)*width, digits=5)
#Calculate the chi-square based on observed and expected values.
#gof$chi_Gaussian = ((gof$observ-gof$expect_Gaussian)^2/gof$expect_Gaussian)
#gof$chi_Lognormal = ((gof$observ-gof$expect_Lognormal)^2/gof$expect_Lognormal)
gof$chi_Gaussian = round(((gof$observ-gof$expect_Gaussian)^2/gof$expect_Gaussian), digits=2)
gof$chi_Lognormal = round(((gof$observ-gof$expect_Lognormal)^2/gof$expect_Lognormal), digits=2)
gof_stat = data.frame(class = 1:58)
gof_stat$observ = gof$observ
gof_stat$expect_Gauss = gof$expect_Gaussian
gof_stat$expect_Lnorm = gof$expect_Lognormal
gof_stat$chi_Gaussian = gof$chi_Gaussian
gof_stat$chi_Lognormal = gof$chi_Lognormal
gof_stat
## class observ expect_Gauss expect_Lnorm chi_Gaussian chi_Lognormal
## 1 1 657 158.14750 638.15422 1573.56 0.56
## 2 2 214 179.72884 204.09486 6.53 0.48
## 3 3 132 180.18432 101.19339 12.89 9.38
## 4 4 104 159.35092 60.07007 19.23 32.13
## 5 5 49 124.31717 39.45406 45.63 2.31
## 6 6 34 85.55669 27.67607 31.07 1.45
## 7 7 23 51.94116 20.33909 16.13 0.35
## 8 8 31 27.81841 15.47883 0.36 15.56
## 9 9 16 13.14186 12.10666 0.62 1.25
## 10 10 8 5.47621 9.67653 1.16 0.29
## 11 11 7 2.01346 7.87528 12.35 0.10
## 12 12 3 0.65218 6.50623 8.45 1.89
## 13 13 2 0.18634 5.44774 17.65 2.18
## 14 14 1 0.04658 4.60923 19.52 2.83
## 15 15 6 0.01035 3.93894 3466.27 1.08
## 16 16 1 0.00259 3.39546 384.10 1.69
## 17 17 0 0.00000 2.95032 NaN 2.95
## 18 18 0 0.00000 2.58024 NaN 2.58
## 19 19 2 0.00000 2.26968 Inf 0.03
## 20 20 0 0.00000 2.00829 NaN 2.01
## 21 21 0 0.00000 1.78572 NaN 1.79
## 22 22 0 0.00000 1.59421 NaN 1.59
## 23 23 0 0.00000 1.43116 NaN 1.43
## 24 24 1 0.00000 1.28882 Inf 0.06
## 25 25 2 0.00000 1.16460 Inf 0.60
## 26 26 1 0.00000 1.05590 Inf 0.00
## 27 27 0 0.00000 0.96274 NaN 0.96
## 28 28 0 0.00000 0.87733 NaN 0.88
## 29 29 0 0.00000 0.80228 NaN 0.80
## 30 30 0 0.00000 0.73758 NaN 0.74
## 31 31 0 0.00000 0.67806 NaN 0.68
## 32 32 0 0.00000 0.62371 NaN 0.62
## 33 33 0 0.00000 0.57712 NaN 0.58
## 34 34 0 0.00000 0.53313 NaN 0.53
## 35 35 0 0.00000 0.49431 NaN 0.49
## 36 36 0 0.00000 0.46066 NaN 0.46
## 37 37 0 0.00000 0.42702 NaN 0.43
## 38 38 0 0.00000 0.39855 NaN 0.40
## 39 39 0 0.00000 0.37267 NaN 0.37
## 40 40 0 0.00000 0.34679 NaN 0.35
## 41 41 0 0.00000 0.32609 NaN 0.33
## 42 42 0 0.00000 0.30538 NaN 0.31
## 43 43 0 0.00000 0.28727 NaN 0.29
## 44 44 0 0.00000 0.26915 NaN 0.27
## 45 45 0 0.00000 0.25362 NaN 0.25
## 46 46 0 0.00000 0.23810 NaN 0.24
## 47 47 0 0.00000 0.22516 NaN 0.23
## 48 48 0 0.00000 0.21222 NaN 0.21
## 49 49 0 0.00000 0.20186 NaN 0.20
## 50 50 0 0.00000 0.18892 NaN 0.19
## 51 51 0 0.00000 0.18116 NaN 0.18
## 52 52 0 0.00000 0.17081 NaN 0.17
## 53 53 0 0.00000 0.16304 NaN 0.16
## 54 54 0 0.00000 0.15269 NaN 0.15
## 55 55 0 0.00000 0.14493 NaN 0.14
## 56 56 0 0.00000 0.13975 NaN 0.14
## 57 57 0 0.00000 0.13199 NaN 0.13
## 58 58 0 0.00000 0.12681 NaN 0.13
#Chi-square statistic.
chisq_Gaussian = sum(gof$chi_Gaussian[1:11], NA, na.rm = TRUE)
chisq_Lognormal = sum(gof$chi_Lognormal[1:26], NA, na.rm = TRUE)
chisq_Gaussian
## [1] 1719.53
chisq_Lognormal
## [1] 86.57
============================================================================================================================
Chi-square test for the goodness of fit
Chi-square statistic to evaluate the goodness of fit for Gaussian distribution = 1719.53
Chi-square statistic to evaluate the goodness of fit for lognormal distribution = 86.57
Degree of freedom = 58 - 2 - 1 = 55
Chi-square distribution from the table (df=55, p=0.01) = 82.292
At the 99% confidence limit, the null hypothesis would be rejected for both the Gaussian distribution and log-normal distribution.
Fitting parameters for Gaussian distribution
E[x] = 0.404
Var[x] = 0.319
Fitting parameters for log-normal distribution
E[x] = 0.525
Var[x] = 2.752
with additional stats: mean_ln(x) = -1.844; var_ln(x) = 2.397; std_ln(x) = 1.548
============================================================================================================================
.
.
.
.
.
.
.
.
Other statistical tests for the goodness of fit
fit_n = fitdist(precip$inch, "norm")
fit_l = fitdist(precip$inch, "lnorm")
fit_g = fitdist(precip$inch, "gamma")
fit_w = fitdist(precip$inch, "weibull")
gofstat(list(fit_n, fit_l, fit_g, fit_w), fitnames=c("Gaussian","lognormal","gamma","Weibull"))
## Goodness-of-fit statistics
## Gaussian lognormal gamma Weibull
## Kolmogorov-Smirnov statistic 0.2426139 0.07389962 0.0750221 0.07068171
## Cramer-von Mises statistic 20.2670677 2.02602634 1.1658133 0.65125819
## Anderson-Darling statistic Inf 14.76146566 9.1826692 6.51343167
##
## Goodness-of-fit criteria
## Gaussian lognormal gamma Weibull
## Aikake's Information Criterion 2196.690 35.43021 58.92670 26.23274
## Bayesian Information Criterion 2207.021 45.76119 69.25768 36.56372
qqcomp(list(fit_n, fit_l, fit_g, fit_w), legendtext=c("Gaussian","lognormal","gamma","Weibull"))
ppcomp(list(fit_n, fit_l, fit_g, fit_w), legendtext=c("Gaussian","lognormal","gamma","Weibull"))