# Normal distribution
# Generate random numbers from a normal distribution
# Suppose we have interviewed n farmers and recorded their ages (in years).
# To get the same random samples each time, we use set.seed()
# Fix the RNG seed so the draws below are reproducible
set.seed(1)
# Draw 10 values from a normal distribution with mean 50 and sd 5
sample <- rnorm(n = 10, mean = 50, sd = 5)
sample
## [1] 46.86773 50.91822 45.82186 57.97640 51.64754 45.89766 52.43715 53.69162
## [9] 52.87891 48.47306
# Largest whole number not above each value
floor(sample)
## [1] 46 50 45 57 51 45 52 53 52 48
# Smallest whole number not below each value
ceiling(sample)
## [1] 47 51 46 58 52 46 53 54 53 49
# Nearest whole number for each value, kept as `age`
age <- round(sample, digits = 0)
age
## [1] 47 51 46 58 52 46 52 54 53 48
# Simulate 10000 ages: normal draws (mean 50, sd 5) rounded to whole numbers
set.seed(1)
ages <- round(rnorm(n = 10000, mean = 50, sd = 5), digits = 0)
summary(ages)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 32.00 47.00 50.00 49.97 53.00 69.00
# Visual normality check 1: box plot of the simulated ages
boxplot(ages)

# Visual normality check 2: normal QQ plot with its reference line
qqnorm(ages)
qqline(ages)

# Check statistically: Shapiro-Wilk test, Kolmogorov-Smirnov test
# H0: Distribution is normal
# p-value > 0.05: H0 not rejected (no evidence against normality; not a proof)
# p <= 0.05: H0 rejected
# p-value = probability, assuming H0 is true, of a test statistic at least as
#           extreme as the one observed; a small p-value is evidence against H0
# The 0.05 cutoff (alpha) is the accepted Type I error rate, i.e. the chance
# of rejecting H0 when H0 is actually true
# shapiro.test() only accepts 3 to 5000 observations, so use the first 5000
shapiro.test(ages[1:5000])
##
## Shapiro-Wilk normality test
##
## data: ages[1:5000]
## W = 0.99635, p-value = 9.289e-10
# p < 0.05, H0 rejected, Data is not normal
# NOTE(review): `ages` holds values rounded to whole numbers, so it has many
# ties; presumably that is what the test is detecting -- confirm
# Kolmogorov-Smirnov test: ks.test() needs a second sample or a reference
# distribution, so calling it with only the data fails. One-sample form:
# ks.test(ages, "pnorm", mean(ages), sd(ages))
# Visualize the distribution: histogram with roughly 20 bins requested
hist(ages, breaks = 20)

# Kernel density estimate of the ages, drawn as a curve
plot(density(x = ages))

# Descriptive statistics
mean(ages)
## [1] 49.9683
sd(ages)
## [1] 5.069898
length(ages)
## [1] 10000
# Standard error of the mean: se = sd / sqrt(n)
se <- sd(ages) / sqrt(length(ages))
se
## [1] 0.05069898
# Confidence Interval, CI at 95% level (normal approximation)
# CI = mean +- 1.96*se
# Z-score at 95% level = 1.96
# lower limit
ci_lower <- mean(ages) - 1.96 * se
ci_lower
## [1] 49.86893
# upper limit
ci_upper <- mean(ages) + 1.96 * se
ci_upper
## [1] 50.06767
# Build the message from the computed limits instead of typing the numbers by
# hand; the hand-typed text said 49.89, but the lower limit rounds to 49.87
sprintf("95%% ci is [%.2f, %.2f]", ci_lower, ci_upper)
## [1] "95% ci is [49.87, 50.07]"
# 95% confidence interval for the mean from a one-sample t-test
# (mu = 0 and conf.level = 0.95 are the defaults, spelled out here)
t.test(ages, mu = 0, conf.level = 0.95)
##
## One Sample t-test
##
## data: ages
## t = 985.59, df = 9999, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 49.86892 50.06768
## sample estimates:
## mean of x
## 49.9683
# Smallest and largest simulated age
range(ages, na.rm = FALSE)
## [1] 32 69