Assignment #2

Histograms, Theoretical Distributions and Goodness of Fit

Call R libraries.

library(data.table)
library(plyr)
library(ggplot2)
library(plotrix)
library(fitdistrplus)
## Loading required package: MASS

Data Subset

download.file(url = "http://coaps.fsu.edu/~tachanat/data/Atlanta_Precip.txt", destfile = "Atlanta_Precip.txt")
#data = read.class("Atlanta_Precip.txt", skip= 14, header = TRUE, colClasses=c("numeric"))
#precip = data[4]
data = fread("Atlanta_Precip.txt", skip= 14, header = TRUE, colClasses=c("numeric"))
precip = subset(data, as.numeric(inch) > 0)
n_observed = length(data$inch)
n_rainyday = length(precip$inch)
p_rainyday = round(n_rainyday/n_observed*100, digits=2)

Daily precipitation at Hartsfield International Airport in Atlanta

Total observations = 4,116 days (100%)

Observed rainy days = 1,294 days (31.44%)

Data Binning

nbin = 58
width = 0.2
for (i in 1:nbin) {
  lower = width*i - width
  upper = width*i 
  precip$class[precip$inch > lower & precip$inch <= upper] = i
}
#count = count(precip, "class")
class1 = data.frame(class = 1:26)
class1$freq = tabulate(precip$class)
class1$density = round(class1$freq/sum(class1$freq), digits=4)
class2 = data.frame(class = 27:58, freq=0, density=0)
class = rbind(class1, class2)

Histogram

par(mfrow = c(1, 2))
barplot(class$freq, xlim=c(1, 58), ylim=c(0, 700), ylab="frequency", main="histogram (frequency)")
barplot(class$density, xlim=c(1, 58), ylim=c(0, 0.5), ylab="density", main="histogram (density)")

Normalized histogram

class$norm_density = round(class$freq/(sum(class$freq)*width), digits=4)
class
##    class freq density norm_density
## 1      1  657  0.5077       2.5386
## 2      2  214  0.1654       0.8269
## 3      3  132  0.1020       0.5100
## 4      4  104  0.0804       0.4019
## 5      5   49  0.0379       0.1893
## 6      6   34  0.0263       0.1314
## 7      7   23  0.0178       0.0889
## 8      8   31  0.0240       0.1198
## 9      9   16  0.0124       0.0618
## 10    10    8  0.0062       0.0309
## 11    11    7  0.0054       0.0270
## 12    12    3  0.0023       0.0116
## 13    13    2  0.0015       0.0077
## 14    14    1  0.0008       0.0039
## 15    15    6  0.0046       0.0232
## 16    16    1  0.0008       0.0039
## 17    17    0  0.0000       0.0000
## 18    18    0  0.0000       0.0000
## 19    19    2  0.0015       0.0077
## 20    20    0  0.0000       0.0000
## 21    21    0  0.0000       0.0000
## 22    22    0  0.0000       0.0000
## 23    23    0  0.0000       0.0000
## 24    24    1  0.0008       0.0039
## 25    25    2  0.0015       0.0077
## 26    26    1  0.0008       0.0039
## 27    27    0  0.0000       0.0000
## 28    28    0  0.0000       0.0000
## 29    29    0  0.0000       0.0000
## 30    30    0  0.0000       0.0000
## 31    31    0  0.0000       0.0000
## 32    32    0  0.0000       0.0000
## 33    33    0  0.0000       0.0000
## 34    34    0  0.0000       0.0000
## 35    35    0  0.0000       0.0000
## 36    36    0  0.0000       0.0000
## 37    37    0  0.0000       0.0000
## 38    38    0  0.0000       0.0000
## 39    39    0  0.0000       0.0000
## 40    40    0  0.0000       0.0000
## 41    41    0  0.0000       0.0000
## 42    42    0  0.0000       0.0000
## 43    43    0  0.0000       0.0000
## 44    44    0  0.0000       0.0000
## 45    45    0  0.0000       0.0000
## 46    46    0  0.0000       0.0000
## 47    47    0  0.0000       0.0000
## 48    48    0  0.0000       0.0000
## 49    49    0  0.0000       0.0000
## 50    50    0  0.0000       0.0000
## 51    51    0  0.0000       0.0000
## 52    52    0  0.0000       0.0000
## 53    53    0  0.0000       0.0000
## 54    54    0  0.0000       0.0000
## 55    55    0  0.0000       0.0000
## 56    56    0  0.0000       0.0000
## 57    57    0  0.0000       0.0000
## 58    58    0  0.0000       0.0000
barp(class$norm_density, xlim=c(0, 60), ylim=c(-0.25, 3.25), ylab="density", main="normalized histogram", xlab="bin class index for precipitation rate [inch/day] by 0.2 binwidth")

Fitting parameters

#Gaussian distribution
Gauss_Exp = mean(precip$inch)
Gauss_Var = var(precip$inch)

#Log-normal distribution
precip$log = log(precip$inch)
Lnorm_Exp = exp(mean(precip$log) + var(precip$log)/2)
Lnorm_Var = (exp(var(precip$log))-1)*exp(2*mean(precip$log)+var(precip$log))

Fitting parameters for Gaussian distribution

E[x] = 0.404

Var[x] = 0.319

Fitting parameters for log-normal distribution

E[x] = 0.525

Var[x] = 2.752

mean_ln(x) = -1.844

var_ln(x) = 2.397

std_ln(x) = 1.548

PDF curve over histogram

#Define the central value for each bin class.
pdf = data.frame(class = 1:58)
pdf$central = (pdf$class - 1/2)*width
pdf$freq = class$freq

#Calculate the probability based on known mean and variance.(by command)
pdf$pdf_Gauss = round(dnorm(pdf$central, mean=Gauss_Exp, sd=sqrt(Gauss_Var)), digits=5)
pdf$pdf_Lnorm = round(dlnorm(pdf$central, meanlog=mean(precip$log), sdlog=sd(precip$log)), digits=5)

#Calculate the probability based on known mean and variance.(by hand)
pdf$pdf_Gauss2 = round((1/(sqrt(2*pi)*sd(precip$inch)))*exp(-((pdf$central-mean(precip$inch))^2 / (2*var(precip$inch)))), digits=5)
pdf$pdf_Lnorm2 = round((1/(pdf$central*sd(precip$log)*sqrt(2*pi)))*exp(-((log(pdf$central)-mean(precip$log))^2/(2*var(precip$log)))), digits=5)
pdf
##    class central freq pdf_Gauss pdf_Lnorm pdf_Gauss2 pdf_Lnorm2
## 1      1     0.1  657   0.61108   2.46582    0.61108    2.46582
## 2      2     0.3  214   0.69447   0.78862    0.69447    0.78862
## 3      3     0.5  132   0.69623   0.39101    0.69623    0.39101
## 4      4     0.7  104   0.61573   0.23211    0.61573    0.23211
## 5      5     0.9   49   0.48036   0.15245    0.48036    0.15245
## 6      6     1.1   34   0.33059   0.10694    0.33059    0.10694
## 7      7     1.3   23   0.20070   0.07859    0.20070    0.07859
## 8      8     1.5   31   0.10749   0.05981    0.10749    0.05981
## 9      9     1.7   16   0.05078   0.04678    0.05078    0.04678
## 10    10     1.9    8   0.02116   0.03739    0.02116    0.03739
## 11    11     2.1    7   0.00778   0.03043    0.00778    0.03043
## 12    12     2.3    3   0.00252   0.02514    0.00252    0.02514
## 13    13     2.5    2   0.00072   0.02105    0.00072    0.02105
## 14    14     2.7    1   0.00018   0.01781    0.00018    0.01781
## 15    15     2.9    6   0.00004   0.01522    0.00004    0.01522
## 16    16     3.1    1   0.00001   0.01312    0.00001    0.01312
## 17    17     3.3    0   0.00000   0.01140    0.00000    0.01140
## 18    18     3.5    0   0.00000   0.00997    0.00000    0.00997
## 19    19     3.7    2   0.00000   0.00877    0.00000    0.00877
## 20    20     3.9    0   0.00000   0.00776    0.00000    0.00776
## 21    21     4.1    0   0.00000   0.00690    0.00000    0.00690
## 22    22     4.3    0   0.00000   0.00616    0.00000    0.00616
## 23    23     4.5    0   0.00000   0.00553    0.00000    0.00553
## 24    24     4.7    1   0.00000   0.00498    0.00000    0.00498
## 25    25     4.9    2   0.00000   0.00450    0.00000    0.00450
## 26    26     5.1    1   0.00000   0.00408    0.00000    0.00408
## 27    27     5.3    0   0.00000   0.00372    0.00000    0.00372
## 28    28     5.5    0   0.00000   0.00339    0.00000    0.00339
## 29    29     5.7    0   0.00000   0.00310    0.00000    0.00310
## 30    30     5.9    0   0.00000   0.00285    0.00000    0.00285
## 31    31     6.1    0   0.00000   0.00262    0.00000    0.00262
## 32    32     6.3    0   0.00000   0.00241    0.00000    0.00241
## 33    33     6.5    0   0.00000   0.00223    0.00000    0.00223
## 34    34     6.7    0   0.00000   0.00206    0.00000    0.00206
## 35    35     6.9    0   0.00000   0.00191    0.00000    0.00191
## 36    36     7.1    0   0.00000   0.00178    0.00000    0.00178
## 37    37     7.3    0   0.00000   0.00165    0.00000    0.00165
## 38    38     7.5    0   0.00000   0.00154    0.00000    0.00154
## 39    39     7.7    0   0.00000   0.00144    0.00000    0.00144
## 40    40     7.9    0   0.00000   0.00134    0.00000    0.00134
## 41    41     8.1    0   0.00000   0.00126    0.00000    0.00126
## 42    42     8.3    0   0.00000   0.00118    0.00000    0.00118
## 43    43     8.5    0   0.00000   0.00111    0.00000    0.00111
## 44    44     8.7    0   0.00000   0.00104    0.00000    0.00104
## 45    45     8.9    0   0.00000   0.00098    0.00000    0.00098
## 46    46     9.1    0   0.00000   0.00092    0.00000    0.00092
## 47    47     9.3    0   0.00000   0.00087    0.00000    0.00087
## 48    48     9.5    0   0.00000   0.00082    0.00000    0.00082
## 49    49     9.7    0   0.00000   0.00078    0.00000    0.00078
## 50    50     9.9    0   0.00000   0.00073    0.00000    0.00073
## 51    51    10.1    0   0.00000   0.00070    0.00000    0.00070
## 52    52    10.3    0   0.00000   0.00066    0.00000    0.00066
## 53    53    10.5    0   0.00000   0.00063    0.00000    0.00063
## 54    54    10.7    0   0.00000   0.00059    0.00000    0.00059
## 55    55    10.9    0   0.00000   0.00056    0.00000    0.00056
## 56    56    11.1    0   0.00000   0.00054    0.00000    0.00054
## 57    57    11.3    0   0.00000   0.00051    0.00000    0.00051
## 58    58    11.5    0   0.00000   0.00049    0.00000    0.00049
#Normalize the PDF and CDF
pdf$npdf_Gaussian = round(pdf$pdf_Gauss2/sum(pdf$pdf_Gauss2), digits=5)
pdf$npdf_Lognormal = round(pdf$pdf_Lnorm2/sum(pdf$pdf_Lnorm2), digits=5)
#for (i in 1:nbin) {
#  pdf$cdf_Gaussian[i] = round(sum(pdf$pdf_Gaussian[1:i]), digits=3)
#  pdf$cdf_Lognormal[i] = round(sum(pdf$pdf_Lognormal[1:i]), digits=3)
#}


#Plots of theoretical distibutions [Normalized PDFs over (Unnormalized) Histogram]
class$density[which(class$density==0)] = NA
plot(pdf$central, class$density, type="h", col="darkgray", lwd=8, xlim=c(0, 12), ylim=c(0, 0.6), ylab="density", xlab="precipitation rate (inch/day) by 0.2 binwidth", main="Normalized PDFs over (Unnormalized) Histogram \n Gaussian (blue) and Lognormal (red)")
par(new=TRUE)
plot(pdf$central, pdf$npdf_Gaussian, type="l", col="blue", lwd=3, xlim=c(0, 12), ylim=c(0, 0.6), ylab="density", xlab="precipitation rate (inch/day) by 0.2 binwidth")
par(new=TRUE)
plot(pdf$central, pdf$npdf_Lognormal, type="l", col="red", lwd=3, xlim=c(0, 12), ylim=c(0, 0.6), ylab="density", xlab="precipitation rate (inch/day) by 0.2 binwidth")

#Plots of theoretical distibutions [(Unnormalized) PDFs over Normalized Histogram]
class$norm_density[which(class$norm_density==0)] = NA
plot(pdf$central, class$norm_density, type="h", col="darkgray", lwd=8, xlim=c(0, 12), ylim=c(0, 3), ylab="density", xlab="precipitation rate (inch/day) by 0.2 binwidth", main="(Unnormalized) PDFs over Normalized Histogram \n Gaussian (blue) and Lognormal (red)")
par(new=TRUE)
plot(pdf$central, pdf$pdf_Gauss2, type="l", col="blue", lwd=3, xlim=c(0, 12), ylim=c(0, 3), ylab="density", xlab="precipitation rate (inch/day) by 0.2 binwidth")
par(new=TRUE)
plot(pdf$central, pdf$pdf_Lnorm2, type="l", col="red", lwd=3, xlim=c(0, 12), ylim=c(0, 3), ylab="density", xlab="precipitation rate (inch/day) by 0.2 binwidth")

Chi-square test for the goodness of fit

gof = data.frame(class = 1:58)
gof$central = pdf$central
gof$pdf_Gaussian = pdf$pdf_Gauss2
gof$pdf_Lognormal = pdf$pdf_Lnorm2
gof$observ = class$freq

#Calculate the expected values based on known probability.
gof$expect_Gaussian = round(gof$pdf_Gaussian*sum(class$freq)*width, digits=5)
gof$expect_Lognormal = round(gof$pdf_Lognormal*sum(class$freq)*width, digits=5)

#Calculate the chi-square based on observed and expected values.
#gof$chi_Gaussian = ((gof$observ-gof$expect_Gaussian)^2/gof$expect_Gaussian)
#gof$chi_Lognormal = ((gof$observ-gof$expect_Lognormal)^2/gof$expect_Lognormal)
gof$chi_Gaussian = round(((gof$observ-gof$expect_Gaussian)^2/gof$expect_Gaussian), digits=2)
gof$chi_Lognormal = round(((gof$observ-gof$expect_Lognormal)^2/gof$expect_Lognormal), digits=2)
gof_stat = data.frame(class = 1:58)
gof_stat$observ = gof$observ
gof_stat$expect_Gauss = gof$expect_Gaussian
gof_stat$expect_Lnorm = gof$expect_Lognormal
gof_stat$chi_Gaussian = gof$chi_Gaussian
gof_stat$chi_Lognormal = gof$chi_Lognormal
gof_stat
##    class observ expect_Gauss expect_Lnorm chi_Gaussian chi_Lognormal
## 1      1    657    158.14750    638.15422      1573.56          0.56
## 2      2    214    179.72884    204.09486         6.53          0.48
## 3      3    132    180.18432    101.19339        12.89          9.38
## 4      4    104    159.35092     60.07007        19.23         32.13
## 5      5     49    124.31717     39.45406        45.63          2.31
## 6      6     34     85.55669     27.67607        31.07          1.45
## 7      7     23     51.94116     20.33909        16.13          0.35
## 8      8     31     27.81841     15.47883         0.36         15.56
## 9      9     16     13.14186     12.10666         0.62          1.25
## 10    10      8      5.47621      9.67653         1.16          0.29
## 11    11      7      2.01346      7.87528        12.35          0.10
## 12    12      3      0.65218      6.50623         8.45          1.89
## 13    13      2      0.18634      5.44774        17.65          2.18
## 14    14      1      0.04658      4.60923        19.52          2.83
## 15    15      6      0.01035      3.93894      3466.27          1.08
## 16    16      1      0.00259      3.39546       384.10          1.69
## 17    17      0      0.00000      2.95032          NaN          2.95
## 18    18      0      0.00000      2.58024          NaN          2.58
## 19    19      2      0.00000      2.26968          Inf          0.03
## 20    20      0      0.00000      2.00829          NaN          2.01
## 21    21      0      0.00000      1.78572          NaN          1.79
## 22    22      0      0.00000      1.59421          NaN          1.59
## 23    23      0      0.00000      1.43116          NaN          1.43
## 24    24      1      0.00000      1.28882          Inf          0.06
## 25    25      2      0.00000      1.16460          Inf          0.60
## 26    26      1      0.00000      1.05590          Inf          0.00
## 27    27      0      0.00000      0.96274          NaN          0.96
## 28    28      0      0.00000      0.87733          NaN          0.88
## 29    29      0      0.00000      0.80228          NaN          0.80
## 30    30      0      0.00000      0.73758          NaN          0.74
## 31    31      0      0.00000      0.67806          NaN          0.68
## 32    32      0      0.00000      0.62371          NaN          0.62
## 33    33      0      0.00000      0.57712          NaN          0.58
## 34    34      0      0.00000      0.53313          NaN          0.53
## 35    35      0      0.00000      0.49431          NaN          0.49
## 36    36      0      0.00000      0.46066          NaN          0.46
## 37    37      0      0.00000      0.42702          NaN          0.43
## 38    38      0      0.00000      0.39855          NaN          0.40
## 39    39      0      0.00000      0.37267          NaN          0.37
## 40    40      0      0.00000      0.34679          NaN          0.35
## 41    41      0      0.00000      0.32609          NaN          0.33
## 42    42      0      0.00000      0.30538          NaN          0.31
## 43    43      0      0.00000      0.28727          NaN          0.29
## 44    44      0      0.00000      0.26915          NaN          0.27
## 45    45      0      0.00000      0.25362          NaN          0.25
## 46    46      0      0.00000      0.23810          NaN          0.24
## 47    47      0      0.00000      0.22516          NaN          0.23
## 48    48      0      0.00000      0.21222          NaN          0.21
## 49    49      0      0.00000      0.20186          NaN          0.20
## 50    50      0      0.00000      0.18892          NaN          0.19
## 51    51      0      0.00000      0.18116          NaN          0.18
## 52    52      0      0.00000      0.17081          NaN          0.17
## 53    53      0      0.00000      0.16304          NaN          0.16
## 54    54      0      0.00000      0.15269          NaN          0.15
## 55    55      0      0.00000      0.14493          NaN          0.14
## 56    56      0      0.00000      0.13975          NaN          0.14
## 57    57      0      0.00000      0.13199          NaN          0.13
## 58    58      0      0.00000      0.12681          NaN          0.13
#Chi-square statistic.
chisq_Gaussian = sum(gof$chi_Gaussian[1:11], NA, na.rm = TRUE)
chisq_Lognormal = sum(gof$chi_Lognormal[1:26], NA, na.rm = TRUE)
chisq_Gaussian
## [1] 1719.53
chisq_Lognormal
## [1] 86.57

Conclusions

============================================================================================================================

Chi-square test for the goodness of fit

Chi-square statistic to evaluate the goodness of fit for Gaussian distribution = 1719.53

Chi-square statistic to evaluate the goodness of fit for lognormal distribution = 86.57

Degree of freedom = 58 - 2 - 1 = 55

Chi-square distribution from the table (df=55, p=0.01) = 82.292

At the 99% confidence limit, the null hypothesis would be rejected for both the Gaussian distribution and log-normal distribution.

Fitting parameters for Gaussian distribution

E[x] = 0.404

Var[x] = 0.319

Fitting parameters for log-normal distribution

E[x] = 0.525

Var[x] = 2.752

with additional stats: mean_ln(x) = -1.844; var_ln(x) = 2.397; std_ln(x) = 1.548

============================================================================================================================

.

.

.

.

.

.

.

.

Other statistical tests for the goodness of fit

fit_n = fitdist(precip$inch, "norm")
fit_l = fitdist(precip$inch, "lnorm")
fit_g = fitdist(precip$inch, "gamma")
fit_w = fitdist(precip$inch, "weibull")
gofstat(list(fit_n, fit_l, fit_g, fit_w), fitnames=c("Gaussian","lognormal","gamma","Weibull"))
## Goodness-of-fit statistics
##                                Gaussian   lognormal     gamma    Weibull
## Kolmogorov-Smirnov statistic  0.2426139  0.07389962 0.0750221 0.07068171
## Cramer-von Mises statistic   20.2670677  2.02602634 1.1658133 0.65125819
## Anderson-Darling statistic          Inf 14.76146566 9.1826692 6.51343167
## 
## Goodness-of-fit criteria
##                                Gaussian lognormal    gamma  Weibull
## Aikake's Information Criterion 2196.690  35.43021 58.92670 26.23274
## Bayesian Information Criterion 2207.021  45.76119 69.25768 36.56372
qqcomp(list(fit_n, fit_l, fit_g, fit_w), legendtext=c("Gaussian","lognormal","gamma","Weibull"))

ppcomp(list(fit_n, fit_l, fit_g, fit_w), legendtext=c("Gaussian","lognormal","gamma","Weibull"))