Histogram with Normal density line Plot in R

# Sample data: 200 draws from a standard normal distribution.
#set.seed(104)
x <- rnorm(200)

# Evaluation grid spanning the observed range, and the normal density
# (sample mean and standard deviation) evaluated on that grid.
x2 <- seq(min(x), max(x), length.out = 40)
fun <- dnorm(x2, mean = mean(x), sd = sd(x))

# Kernel density estimate, computed once so the same object drives both
# the axis limit and the overlaid curve.
dens <- density(x)

# Upper y-limit: the tallest of the histogram bars, the kernel density
# estimate and the fitted normal curve, so nothing that is drawn gets
# clipped (the normal curve alone can be lower than the bars or the KDE).
y_max <- max(hist(x, plot = FALSE)$density, dens$y, fun)

# Histogram on the density scale with the kernel density estimate overlaid
hist(x, probability = TRUE, ylim = c(0, y_max),
     main = "Histogram with density curve", col = "skyblue")
lines(dens, col = 6, lwd = 4)

# Sample data: 20,000 standard-normal draws
x <- rnorm(20000)

# X-axis grid: 40 evenly spaced points across the observed range
x2 <- seq(from = min(x), to = max(x), length.out = 40)

# Normal curve: standard normal density (mean 0, sd 1) on the grid
fun <- dnorm(x2)

# Histogram on the density scale, with the theoretical curve drawn on top
hist(
  x,
  probability = TRUE,
  main = "Histogram with normal curve",
  ylim = c(0, max(fun)),
  col = "skyblue"
)
lines(x = x2, y = fun, lwd = 3, col = 2)

Exercise: Simulate 200 observations from t(df = 199), then overlay the normal curve and the t-curve

# Sample data: 200 draws from Student's t with 199 degrees of freedom
set.seed(100)
t_df <- 199
x <- rt(200, df = t_df)

# X-axis grid: built from THIS sample before any curve is evaluated on it,
# so the curves span the histogram that is actually plotted.
x2 <- seq(min(x), max(x), length.out = 40)

# Normal curve using the sample mean and standard deviation
fun <- dnorm(x2, mean = mean(x), sd = sd(x))
# t-curve with the same degrees of freedom that generated the sample
funt <- dt(x2, df = t_df)

# Histogram on the density scale; the y-limit covers both overlaid curves
hist(x, probability = TRUE,
     ylim = c(0, max(fun, funt)),
     main = "Histogram of 200 data with overlaying normal curve, t-curve ")
lines(x2, fun, col = "blue", lwd = 3)
lines(x2, funt, col = "red", lwd = 3, lty = 2)
#abline(v=mean(x),col="blue",lwd=4, lty=2)
#abline(v=0,col="red",lwd=4, lty=3)
# Legend keys use the same colour, width and line type as the curves;
# the normal curve is labelled by the parameters it was drawn with.
legend("topleft",
       legend = c("Normal(mean(x), sd(x))", "t(df=199)"),
       col = c("blue", "red"),
       lwd = 3,
       lty = c(1, 2))

# install.packages("ggplot2")
require(ggplot2)
## Loading required package: ggplot2
# Data: 1,000 standard-normal draws wrapped in a one-column data frame
set.seed(5)
x <- rnorm(1000)
df <- data.frame(x)

# Histogram with kernel density.
# after_stat(density) puts the bars on the density scale so they share a
# y-axis with geom_density(); it replaces the deprecated `..density..`
# notation.
ggplot(df, aes(x = x)) +
  geom_histogram(aes(y = after_stat(density)),
                 colour = 2, lwd = 1, fill = "white") +
  geom_density(lwd = 1, colour = 4,
               fill = 4, alpha = 0.25)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The Student's t-Distribution in R

The R software provides access to the t-distribution through the dt(), pt(), qt() and rt() functions. Apply the help() function to these functions for further information.

The rt() function generates random deviates of the t-distribution and is written as rt(n, df). We may easily generate n random samples. Recall that the number of degrees of freedom for a t-distribution is equal to the sample size minus one, that is, df=n−1.

# Draw n random deviates from a t-distribution with n - 1 degrees of freedom
n <- 100
df <- n - 1
rt(n = n, df = df)
##   [1] -1.34306460 -0.43393237  0.12895811  0.11156187  0.49930278  0.18437615
##   [7] -0.92773137 -0.09201836 -0.16704139 -1.47841674  0.95338265  0.70981794
##  [13] -0.95151211  0.44728232  0.44961966  0.89072626  0.93618576  0.81398072
##  [19]  0.02183607  2.54441310  0.17230093 -0.97406736 -1.52900695 -1.58375631
##  [25] -0.79106779  2.02658470 -0.40167971 -1.02622771  0.72360181  0.15748617
##  [31] -0.70258605  2.39413054  2.04684840  2.49695911  0.32842662 -1.42913692
##  [37] -3.16477155 -1.59276286 -1.21984610  1.07717982 -2.80332146 -2.11226702
##  [43]  0.44866705  0.37055896  0.29530760 -2.72712322 -0.38862687  0.16546756
##  [49]  0.93803860  1.32624986  1.87058713 -1.22999079  0.28372089 -1.09130986
##  [55]  0.95305617 -1.12126642 -0.84037470  0.12406108 -0.33289646 -0.29504533
##  [61] -0.21243450 -0.87102873 -0.77124475 -0.23431149 -0.35829111 -0.27620606
##  [67]  0.09235734  2.01351600  1.03369303  0.39881363 -0.82756332 -1.24052795
##  [73]  1.82614188 -0.32026516  1.87839381 -2.44748094  1.05141171  0.12247260
##  [79] -0.02706024  1.17386043  0.03095236  0.64218233 -0.75526260  0.18358931
##  [85] -0.15497296 -0.37836873  0.62746795 -1.47198432  1.30573003  0.32089523
##  [91]  1.39657151 -0.38645444 -0.12857668  0.43635519  0.11545396  0.86293865
##  [97]  0.43462505  0.13355531 -1.16647216  2.22161196
# Larger sample: 10,000 deviates, again with n - 1 degrees of freedom
n <- 10000
df <- n - 1
samples <- rt(n = n, df = df)

# Histogram on the density scale; break points chosen by the "Scott" rule
hist(
  samples,
  col = "yellow",
  freq = FALSE,
  breaks = "Scott"
)

#lines(dt(n,df), add=T)
Class Work
Plot the density line of T distribution over the above Histogram plot in R

The Chi-Square Distribution in R

The main functions to interact with the χ2-distribution are dchisq(), pchisq(), qchisq(), rchisq(). The dchisq() function gives the density, the pchisq() function gives the distribution function, the qchisq() function gives the quantile function, and the rchisq() function generates random deviates.

For example, we use dchisq() to calculate the density at the integer values 4 to 8 of a χ2-curve with df=7.

# Chi-square density (df = 7) evaluated at the integers 4 through 8
dchisq(x = 4:8, df = 7)
## [1] 0.11518073 0.12204152 0.11676522 0.10411977 0.08817914

We use the pchisq() to calculate the area under the curve for the interval [0,6] and the interval [6,∞) of a χ2-curve with df=7. Further, we ask R if the sum of the intervals [0,6] and [6,∞) sums up to 1.

# Lower tail: area under the chi-square (df = 7) curve over [0, 6]
p_lower <- pchisq(6, df = 7, lower.tail = TRUE)
p_lower
## [1] 0.4602506
# Upper tail: area under the same curve over [6, Inf)
p_upper <- pchisq(6, df = 7, lower.tail = FALSE)
p_upper
## [1] 0.5397494
# The two tails partition [0, Inf), so their areas should sum to 1.
# all.equal() compares with a tolerance instead of exact floating-point ==.
isTRUE(all.equal(p_lower + p_upper, 1))

We use the rchisq() function to generate 100,000 random values from the χ2-distribution with df=7. Thereafter we plot a histogram and compare it to the probability density function of the χ2-distribution with df=7 (blue line).

require(latex2exp)
## Loading required package: latex2exp
# 100,000 random draws from the chi-square distribution with 7 df
x <- rchisq(100000, df = 7)

# Histogram on the density scale (break points by the "Scott" rule);
# the title string is passed through TeX() from latex2exp.
hist(x,
     breaks = "Scott",
     freq = FALSE,
     xlim = c(0, 14),
     ylim = c(0, 0.2),
     xlab = "",
     main = TeX("Histogram for $\\chi^2$-distributions with 7 degrees of freedom (df)"),
     cex.main = 0.9,
     col = "pink")

# Theoretical chi-square density (df = 7) overlaid in blue. Inside curve()
# the expression is written as a function of `x`, the plotting variable.
# `add = TRUE` is spelled out because `T` is an ordinary, reassignable name.
curve(dchisq(x, df = 7), from = 0, to = 15, n = 5000,
      col = "blue", lwd = 2, add = TRUE)

Remark

The chi-squared distribution with df degrees of freedom is the distribution of the sums of the squares of df independent standard normal random variables.

The chi-squared distribution is positively skewed, with values between 0 and ∞. The mean is df and the variance is 2·df. When df ≥ 2, the maximum (mode) occurs at df − 2, that is, at (n − 1) − 2 when df = n − 1.

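As df approaches infinity, the distribution approaches a normal distribution. For small df the chi-square distribution is heavily skewed to the right, so confidence intervals require separate lower and upper χ2 critical values (with df degrees of freedom) rather than a single symmetric one.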

require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(ggplot2)
require(tidyr)
## Loading required package: tidyr
# Chi-square densities on a grid from 0 to 70 (step 0.01), one column per
# choice of degrees of freedom.
data.frame(chisq = 0:7000 / 100) %>%
  mutate(df_05 = dchisq(x = chisq, df = 5),
         df_10 = dchisq(x = chisq, df = 10),
         df_15 = dchisq(x = chisq, df = 15),
         df_30 = dchisq(x = chisq, df = 30),
         df_50 = dchisq(x = chisq, df = 50)) %>%
  # Reshape to long format: one row per (grid point, df) pair.
  # pivot_longer() replaces the superseded gather().
  pivot_longer(cols = -chisq, names_to = "df", values_to = "density") %>%
  ggplot() +
  geom_line(aes(x = chisq, y = density, color = df)) +
  labs(title = "Chi-Square at Various Degrees of Freedom",
       x = "Chi-square",
       y = "Density")

require(dplyr)
require(ggplot2)
require(tidyr)

# t densities on a grid from -7 to 7 (step 0.001), one column per choice
# of degrees of freedom.
data.frame(t = -7000:7000 / 1000) %>%
  mutate(df_05 = dt(x = t, df = 5),
         df_15 = dt(x = t, df = 15),
         df_100 = dt(x = t, df = 100)) %>%
  # Reshape to long format: one row per (grid point, df) pair.
  # pivot_longer() replaces the superseded gather().
  pivot_longer(cols = -t, names_to = "df", values_to = "density") %>%
  ggplot() +
  geom_line(aes(x = t, y = density, color = df)) +
  labs(title = "t- Distribution at Various Degrees of Freedom",
       x = "t-distribution",
       y = "Density")

References

Hartmann, K., Krois, J., Waske, B. (2018): E-Learning Project SOGA: Statistics and Geospatial Data Analysis. Department of Earth Sciences, Freie Universitaet Berlin.

https://rpubs.com/mpfoley73/460935

https://www.geo.fu-berlin.de/en/v/soga/Basics-of-statistics/Continous-Random-Variables/Students-t-Distribution/Students-t-Distribution-in-R/index.html#:~:text=The%20rt()%20function%20generates,df%3Dn%E2%88%921.

https://www.geo.fu-berlin.de/en/v/soga/Basics-of-statistics/Continous-Random-Variables/Students-t-Distribution/Students-t-Distribution-in-R/index.html#:~:text=The%20rt()%20function%20generates,df%3Dn%E2%88%921.