Import Dataset

We will need and will be using the household income variable (hincp) data

hidata <-read.csv(“C:/Users/HP/OneDrive/Desktop/Queens College/Data 306-Data Analysis and Modeling (2022)/ACS_2020_NY_subset.csv”)

Rename variable for hincp use

hincp <- hidata$hincp

##1. Create a Histogram for hincp ## hist(hincp, main= “Population Mean”, xlab= “Household Income”, border= “black”, col=“cyan”) abline (v=mean(hincp), col= “red”, lty=3)

##1. Create a Q-Q Plot for hincp to examine its normality## qqnorm(hincp, pch=1) qqline(hincp, col=“magenta”, lwd=3)

##2. Calculate the mean; Answer will be the population mean of Household Income (hincp) ## mean(hincp) # The mean is 110719.6 #

##3. Draw 10,000 random samples with an N size of 2 ## iter <- 10000 # number of samples = 10000 # n <- 2 # each sample has size 2 # hincp.means <- rep(NA, iter)

##3a. Create a Histogram of the sampling distribution ## for (i in 1:iter){ sample.hincp <- sample (hincp, n) hincp.means[i] <- mean(sample.hincp)} mean(sample.hincp) # The mean is 55550 #

hist(hincp.means, main= “Sampling Distribution n=2”, xlab= “Sample Mean Hincp”, border= “black”, col= “blue”) abline(v=mean(hincp), col=“orange”, lty=3)

##3b. Create a Q-Q Plot of the sampling distribution ## qqnorm(hincp.means, main=“Q-Q Plot n=2”) qqline(hincp.means, col= “green”, lwd=6)

##3c. Compute Mean of the sampling distribution & compare to the “population mean” ## population.mean <- mean(hincp) # Population mean is 110719.580616302 # sample.mean <- mean(sample.hincp) # Sampling Distribution mean is 55550 # population.mean - sample.mean # A difference of 55169.58 #

##4. Draw 10,000 random samples with an N size of 10 ## iter <- 10000 # number of samples = 10000 # n <- 10 # each sample has size 10 # hincp.means <- rep(NA, iter)

##4a. Create a Histogram of the sampling distribution ## for (i in 1:iter){ sample.hincp <- sample (hincp, n) hincp.means[i] <- mean(sample.hincp)} mean(sample.hincp) # The mean is 100200 #

hist(hincp.means, main= “Sampling Distribution n=10”, xlab= “Sample Mean Hincp”, border= “black”, col= “blue”) abline(v=mean(hincp), col=“orange”, lty=3)

##4b. Create a Q-Q Plot of the sampling distribution ## qqnorm(hincp.means, main=“Q-Q Plot n=10”) qqline(hincp.means, col= “green”, lwd=6)

##4c. Compute Mean of the sampling distribution & compare to the “population mean” ## population.mean <- mean(hincp) # Population mean is 110719.58 # sample.mean <- mean(sample.hincp) # Sampling Distribution mean is 100200# population.mean - sample.mean # A difference of 10519.58 #

##5. Draw 10,000 random samples with an N size of 1000 ## iter <- 10000 # number of samples = 10000 # n <- 1000 # each sample has size 1000 # hincp.means <- rep(NA, iter)

##5a. Create a Histogram of the sampling distribution ## for (i in 1:iter){ sample.hincp <- sample (hincp, n) hincp.means[i] <- mean(sample.hincp)} mean(sample.hincp) # The mean is 107948.9

hist(hincp.means, main= “Sampling Distribution n=1000”, xlab= “Sample Mean Hincp”, border= “black”, col= “blue”) abline(v=mean(hincp), col=“orange”, lty=3)

##5b. Create a Q-Q Plot of the sampling distribution ## qqnorm(hincp.means, main=“Q-Q Plot n=1000”) qqline(hincp.means, col= “green”, lwd=6)

##5c. Compute Mean of the sampling distribution & compare to the “population mean” ## population.mean <- mean(hincp) # Population mean is 110719.580616302 # sample.mean <- mean(sample.hincp) # Sampling Distribution mean is 107948.93 # population.mean - sample.mean # A difference of 2770.651 #

##6. Confirm or Reject the two properties of the CLT## # I confirm the two CLT properties: as the sample size n increases, the sampling distribution of the sample mean converges to a normal distribution whose mean equals the population mean and whose standard deviation (the standard error) equals σ/√n. Comparing the histograms and Q-Q plots I created for n = 2, n = 10, and n = 1000, I notice that as the sample size increases, the sampling distribution does indeed become more normally distributed. #

## Import Dataset ##
# Read the ACS 2020 NY subset CSV into a data frame. The rest of this script #
# works only with its household income column (hincp). #
hidata <- read.csv(
  file = "C:/Users/HP/OneDrive/Desktop/Queens College/Data 306-Data Analysis and Modeling (2022)/ACS_2020_NY_subset.csv"
)

# Keep the household income column as a standalone vector named hincp #
hincp <- hidata$hincp

##1. Histogram of household income (hincp) ##
# The dotted red vertical line marks the mean of hincp. #
hist(
  hincp,
  col = "cyan",
  border = "black",
  xlab = "Household Income",
  main = "Population Mean"
)
abline(v = mean(hincp), lty = 3, col = "red")

##1. Normal Q-Q plot of hincp to examine its normality ##
qqnorm(hincp, pch = 1)
qqline(hincp, lwd = 3, col = "magenta")

##2. Population mean of household income (hincp) ##
mean(hincp)
# Recorded output from a previous run: [1] 110719.6 #

##3. Draw 10,000 random samples with an N size of 2 ##
iter <- 10000 # number of samples to draw #
n <- 2 # size of each sample #
hincp.means <- rep(NA_real_, iter) # preallocated: one sample mean per draw #

##3a. Create a Histogram of the sampling distribution ##
# Each pass draws n incomes from hincp (sample() defaults to sampling #
# without replacement) and stores the mean of that one sample. #
for (i in seq_len(iter)) {
  sample.hincp <- sample(hincp, n)
  hincp.means[i] <- mean(sample.hincp)
}
# Mean of the sampling distribution: the average of all `iter` sample means. #
# (sample.hincp only holds the LAST sample drawn, so its mean is a single #
# draw, not the mean of the sampling distribution.) #
# No seed is set here, so the printed value changes from run to run. #
mean(hincp.means)

# Orange dotted line marks the population mean for reference. #
hist(hincp.means, main = "Sampling Distribution n=2", xlab = "Sample Mean Hincp", border = "black", col = "blue")
abline(v = mean(hincp), col = "orange", lty = 3)

##3b. Create a Q-Q Plot of the sampling distribution ##
qqnorm(hincp.means, main = "Q-Q Plot n=2")
qqline(hincp.means, col = "green", lwd = 6)

##3c. Compute Mean of the sampling distribution & compare to the "population mean" ##
population.mean <- mean(hincp)
# Use every sample mean, not just the final sample, so the comparison #
# reflects the whole sampling distribution. #
sample.mean <- mean(hincp.means)
# Difference between the population mean and the sampling distribution mean. #
population.mean - sample.mean

##4. Draw 10,000 random samples with an N size of 10 ##
iter <- 10000 # number of samples to draw #
n <- 10 # size of each sample #
hincp.means <- rep(NA_real_, iter) # preallocated: one sample mean per draw #

##4a. Create a Histogram of the sampling distribution ##
# Each pass draws n incomes from hincp (sample() defaults to sampling #
# without replacement) and stores the mean of that one sample. #
for (i in seq_len(iter)) {
  sample.hincp <- sample(hincp, n)
  hincp.means[i] <- mean(sample.hincp)
}
# Mean of the sampling distribution: the average of all `iter` sample means. #
# (sample.hincp only holds the LAST sample drawn, so its mean is a single #
# draw, not the mean of the sampling distribution.) #
# No seed is set here, so the printed value changes from run to run. #
mean(hincp.means)

# Orange dotted line marks the population mean for reference. #
hist(hincp.means, main = "Sampling Distribution n=10", xlab = "Sample Mean Hincp", border = "black", col = "blue")
abline(v = mean(hincp), col = "orange", lty = 3)

##4b. Create a Q-Q Plot of the sampling distribution ##
qqnorm(hincp.means, main = "Q-Q Plot n=10")
qqline(hincp.means, col = "green", lwd = 6)

##4c. Compute Mean of the sampling distribution & compare to the "population mean" ##
population.mean <- mean(hincp)
# Use every sample mean, not just the final sample, so the comparison #
# reflects the whole sampling distribution. #
sample.mean <- mean(hincp.means)
# Difference between the population mean and the sampling distribution mean. #
population.mean - sample.mean

##5. Draw 10,000 random samples with an N size of 1000 ##
iter <- 10000 # number of samples to draw #
n <- 1000 # size of each sample #
hincp.means <- rep(NA_real_, iter) # preallocated: one sample mean per draw #

##5a. Create a Histogram of the sampling distribution ##
# Each pass draws n incomes from hincp (sample() defaults to sampling #
# without replacement) and stores the mean of that one sample. #
for (i in seq_len(iter)) {
  sample.hincp <- sample(hincp, n)
  hincp.means[i] <- mean(sample.hincp)
}
# Mean of the sampling distribution: the average of all `iter` sample means. #
# (sample.hincp only holds the LAST sample drawn, so its mean is a single #
# draw, not the mean of the sampling distribution.) #
# No seed is set here, so the printed value changes from run to run. #
mean(hincp.means)

# Orange dotted line marks the population mean for reference. #
hist(hincp.means, main = "Sampling Distribution n=1000", xlab = "Sample Mean Hincp", border = "black", col = "blue")
abline(v = mean(hincp), col = "orange", lty = 3)

##5b. Create a Q-Q Plot of the sampling distribution ##
qqnorm(hincp.means, main = "Q-Q Plot n=1000")
qqline(hincp.means, col = "green", lwd = 6)

##5c. Compute Mean of the sampling distribution & compare to the "population mean" ##
population.mean <- mean(hincp)
# Use every sample mean, not just the final sample, so the comparison #
# reflects the whole sampling distribution. #
sample.mean <- mean(hincp.means)
# Difference between the population mean and the sampling distribution mean. #
population.mean - sample.mean

##6. Confirm or Reject the two properties of the CLT##
# I confirm the two CLT properties: as the sample size n increases, the sampling distribution of the sample mean converges to a normal distribution whose mean equals the population mean and whose standard deviation (the standard error) equals σ/√n. Comparing the histograms and Q-Q plots I created for n = 2, n = 10, and n = 1000, I notice that as the sample size increases, the sampling distribution does indeed become more normally distributed. #