# Lesson_6

# Normal distribution

# Generate random numbers from a normal distribution

# Suppose we have interviewed n number of farmers to know their age (in years).

# To get the same random samples each time, we use set.seed()

# Draw a reproducible sample of 10 farmer ages from a normal distribution with
# mean 50 and standard deviation 5. The vector is named age_sample (not sample)
# so that it does not mask the base function sample().
set.seed(1)
age_sample <- rnorm(n = 10, mean = 50, sd = 5)
age_sample

##  [1] 46.86773 50.91822 45.82186 57.97640 51.64754 45.89766 52.43715 53.69162
##  [9] 52.87891 48.47306

# Round down
floor(age_sample)

##  [1] 46 50 45 57 51 45 52 53 52 48

# Round up
ceiling(age_sample)

##  [1] 47 51 46 58 52 46 53 54 53 49

# Round the values to the nearest full integer and keep them as age
age <- round(age_sample)
age

##  [1] 47 51 46 58 52 46 52 54 53 48

# Simulate the ages of 10000 farmers, rounded to whole years
set.seed(1)
ages <- round(rnorm(n = 10000, mean = 50, sd = 5), digits = 0)
summary(ages)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   32.00   47.00   50.00   49.97   53.00   69.00

# Visual normality check 1: boxplot
boxplot(ages)

# Visual normality check 2: QQ plot of the sample with its reference line
qqnorm(y = ages)
qqline(y = ages)

# Check statistically: Shapiro-Wilk test, Kolmogorov-Smirnov test

# H0: Distribution is normal
# p-value > 0.05: H0 not rejected
# p <= 0.05: H0 rejected
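# p-value = probability, assuming H0 is true, of observing data at least as
#           extreme as the sample; a small p-value is evidence AGAINST H0
# alpha (here 0.05) = accepted Type I error rate, i.e. the chance of rejecting
#           a true H0 in repeated trials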

# shapiro.test() does not work for n > 5000, so only the first 5000 values of
# ages are tested
shapiro.test(ages[1:5000])

## 
##  Shapiro-Wilk normality test
## 
## data:  ages[1:5000]
## W = 0.99635, p-value = 9.289e-10

# p < 0.05, H0 rejected, Data is not normal
# NOTE(review): ages were rounded to whole years, so the data contain many tied
# values; with n = 5000 this is presumably what drives the rejection even
# though W is close to 1 -- confirm by testing the unrounded draws.

# Kolmogorov-Smirnov test: ks.test() needs a second argument, either another
# sample (two-sample test) or a reference distribution (one-sample test), so
# ks.test(ages) on its own cannot run.
# One-sample form against a normal distribution (expect a warning about ties
# for rounded data; the p-value is only approximate when mean and sd are
# estimated from the same data):
# ks.test(ages, "pnorm", mean = mean(ages), sd = sd(ages))
# Visualize the distribution

# Histogram, asking for roughly 20 bins
hist(x = ages, breaks = 20)

# Kernel density estimate of the same data
plot(density(x = ages))

# Descriptive statistics
mean(ages)

## [1] 49.9683

sd(ages)

## [1] 5.069898

length(ages)

## [1] 10000

# Calculate standard error (se) = sd / sqrt(n)

se <- sd(ages) / sqrt(length(ages))
se

## [1] 0.05069898

# Confidence Interval, CI at 95% level
# CI = mean +- 1.96*se
# Z-score at 95% level = 1.96
z_95 <- 1.96

# lower limit
ci_lower <- mean(ages) - z_95 * se
ci_lower

## [1] 49.86893

# upper limit
ci_upper <- mean(ages) + z_95 * se
ci_upper

## [1] 50.06767

# Build the report string from the computed limits instead of typing the
# numbers by hand, so the text always agrees with the calculation
# (the previously hard-coded lower limit 49.89 did not match 49.86893).
sprintf("95%% ci is [%.2f, %.2f]", ci_lower, ci_upper)

## [1] "95% ci is [49.87, 50.07]"

# CI using t.test: the one-sample t-test also reports a 95% confidence interval
# for the mean. The defaults mu = 0 and conf.level = 0.95 are written out; only
# the confidence interval is of interest here, not the test against 0.
t.test(x = ages, mu = 0, conf.level = 0.95)

## 
##  One Sample t-test
## 
## data:  ages
## t = 985.59, df = 9999, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  49.86892 50.06768
## sample estimates:
## mean of x 
##   49.9683

# Youngest and oldest age in the sample
c(min(ages), max(ages))

## [1] 32 69

# Lesson_6_7.R

# hp

# 2025-02-11