# Load ACS_2020_NY_subset.csv and take a first look at its contents.
# NOTE(review): setwd() changes the working directory for the rest of the
# session; it is kept because code outside this section may rely on it, but a
# project-relative path would be more portable.
setwd("~/Desktop")
# Spell out header = TRUE: `h` relied on partial argument matching, and `T` is
# an ordinary variable that can be reassigned.
hincp <- read.csv("ACS_2020_NY_subset.csv", header = TRUE)
# Preview the first rows.
head(hincp)
## serialno hincp
## 1 2020HU0000013 7300
## 2 2020HU0000022 900
## 3 2020HU0000038 55900
## 4 2020HU0000040 35800
## 5 2020HU0000051 202000
## 6 2020HU0000055 99200
# Column names.
names(hincp)
## [1] "serialno" "hincp"
# Number of rows and columns.
dim(hincp)
## [1] 60360 2
# Histogram of household income; the returned hist object is kept in
# `histogram`.
histogram <- hist(
  hincp$hincp,
  main = "Household Income Distribution",
  xlab = "Household Income $"
)
Most household incomes lie on the left side of the graph, with a long tail extending to the right, so the distribution is right-skewed (positively skewed).
# NOTE(review): qqnorm() and qqline() are provided by base R's stats package;
# car does not appear to be used in the visible code -- confirm before
# removing this library() call.
library(car)
## Loading required package: carData
# Normal Q-Q plot of household income with a red reference line.
# `frame.plot` is spelled out in full instead of relying on partial matching
# of `frame`.
qqnorm(hincp$hincp, pch = 1, frame.plot = FALSE)
qqline(hincp$hincp, col = "red", lwd = 2)
Most of the points lie above the reference line, so household income does not follow a normal distribution; it is positively skewed.
# Mean of the household income column across all rows.
population_mean <- mean(hincp[["hincp"]])
print(population_mean)
## [1] 110719.6
# Standard deviation of the same column (sd() uses the n - 1 denominator).
population_sd <- sd(hincp[["hincp"]])
print(population_sd)
## [1] 126177.7
The average income across all households is approximately 110,719.6.
# Simulate the sampling distribution of the mean for samples of size n = 2.
# Fix the RNG seed so the simulated draws are reproducible.
set.seed(0)
# N: number of simulated samples; n: observations per sample.
N <- 10000
n <- 2
# Draw N samples of size n and keep each sample's mean (a numeric vector of
# length N). The mean and sd literals match the rounded population mean and
# sd printed earlier.
# NOTE(review): the draws come from a normal model via rnorm(), not from
# hincp$hincp itself, so the sample means are normal for every n. Resampling
# the observed incomes (e.g. sample(hincp$hincp, n, replace = TRUE)) would be
# needed to illustrate the CLT on the skewed data -- confirm intent.
sample_means <- vapply(
  seq_len(N),
  function(i) mean(rnorm(n, mean = 110719.6, sd = 126177.7)),
  numeric(1)
)
# View the first six sample means.
head(sample_means)
## [1] 169816.25 274891.21 39725.17 33543.80 262062.57 108485.26
# Histogram of the simulated sample means.
hist(sample_means, main = "", xlab = "Sample Means", col = "steelblue")
The histogram shows that the sample means are approximately normally distributed.
# Normal Q-Q plot of the simulated sample means with a red reference line.
# `frame.plot` is spelled out in full instead of relying on partial matching
# of `frame`.
qqnorm(sample_means, pch = 1, frame.plot = FALSE)
qqline(sample_means, col = "red", lwd = 2)
The Q-Q plot shows that the sample means are approximately normally distributed.
# Mean of the simulated sample means.
Samples_Mean <- mean(sample_means)
print(Samples_Mean)
## [1] 111087.5
# Recompute the population mean for a side-by-side comparison.
population_mean <- mean(hincp[["hincp"]])
print(population_mean)
## [1] 110719.6
The mean of the sample means (111087.5) is slightly greater than the population mean (110719.6); the difference is small.
# Simulate the sampling distribution of the mean for samples of size n = 10.
# Fix the RNG seed so the simulated draws are reproducible.
set.seed(0)
# N: number of simulated samples; n: observations per sample.
N <- 10000
n <- 10
# Draw N samples of size n and keep each sample's mean (a numeric vector of
# length N). The mean and sd literals match the rounded population mean and
# sd printed earlier.
# NOTE(review): the draws come from a normal model via rnorm(), not from
# hincp$hincp itself, so the sample means are normal for every n. Resampling
# the observed incomes (e.g. sample(hincp$hincp, n, replace = TRUE)) would be
# needed to illustrate the CLT on the skewed data -- confirm intent.
sample_means <- vapply(
  seq_len(N),
  function(i) mean(rnorm(n, mean = 110719.6, sd = 126177.7)),
  numeric(1)
)
# View the first six sample means.
head(sample_means)
## [1] 156007.80 64982.54 119477.56 130136.97 98090.91 94175.55
# Histogram of the simulated sample means.
hist(sample_means, main = "", xlab = "Sample Means", col = "steelblue")
The histogram shows that the sample means are approximately normally distributed.
# Normal Q-Q plot of the simulated sample means with a red reference line.
# `frame.plot` is spelled out in full instead of relying on partial matching
# of `frame`.
qqnorm(sample_means, pch = 1, frame.plot = FALSE)
qqline(sample_means, col = "red", lwd = 2)
The Q-Q plot shows that the sample means are approximately normally distributed.
# Mean of the simulated sample means.
Samples_Mean <- mean(sample_means)
print(Samples_Mean)
## [1] 110897.5
# Recompute the population mean for a side-by-side comparison.
population_mean <- mean(hincp[["hincp"]])
print(population_mean)
## [1] 110719.6
The mean of the sample means (110897.5) is slightly greater than the population mean (110719.6); the difference is small.
# Simulate the sampling distribution of the mean for samples of size n = 1000.
# Fix the RNG seed so the simulated draws are reproducible.
set.seed(0)
# N: number of simulated samples; n: observations per sample.
N <- 10000
n <- 1000
# Draw N samples of size n and keep each sample's mean (a numeric vector of
# length N). The mean and sd literals match the rounded population mean and
# sd printed earlier.
# NOTE(review): the draws come from a normal model via rnorm(), not from
# hincp$hincp itself, so the sample means are normal for every n. Resampling
# the observed incomes (e.g. sample(hincp$hincp, n, replace = TRUE)) would be
# needed to illustrate the CLT on the skewed data -- confirm intent.
sample_means <- vapply(
  seq_len(N),
  function(i) mean(rnorm(n, mean = 110719.6, sd = 126177.7)),
  numeric(1)
)
# View the first six sample means.
head(sample_means)
## [1] 108722.3 107592.1 119317.7 111033.3 111237.4 113208.8
# Histogram of the simulated sample means.
hist(sample_means, main = "", xlab = "Sample Means", col = "steelblue")
The histogram shows that the sample means are approximately normally distributed.
# Normal Q-Q plot of the simulated sample means with a red reference line.
# `frame.plot` is spelled out in full instead of relying on partial matching
# of `frame`.
qqnorm(sample_means, pch = 1, frame.plot = FALSE)
qqline(sample_means, col = "red", lwd = 2)
The Q-Q plot shows that the sample means are approximately normally distributed.
# Mean of the simulated sample means.
Samples_Mean <- mean(sample_means)
print(Samples_Mean)
## [1] 110659.5
# Recompute the population mean for a side-by-side comparison.
population_mean <- mean(hincp[["hincp"]])
print(population_mean)
## [1] 110719.6
The mean of the sample means (110659.5) is slightly less than the population mean (110719.6); the difference is small.
# Redraw the population histogram and normal Q-Q plot for comparison with the
# sampling distributions. The title and axis label are corrected from "Incom"
# to "Income", matching the earlier population histogram.
histogram <- hist(
  hincp$hincp,
  main = "Household Income Distribution",
  xlab = "Household Income $"
)
# `frame.plot` is spelled out in full instead of relying on partial matching
# of `frame`.
qqnorm(hincp$hincp, pch = 1, frame.plot = FALSE)
qqline(hincp$hincp, col = "red", lwd = 2)
Using the population graphs from question 6, we can check the two properties of the CLT by comparing the histograms: if they are the same, we cannot reject it.
The histogram and Q-Q plot of the sample distribution for sample size 2 are not the same as those of the sampling distribution for population size 2, so we reject it and conclude that the two are not the same.
The histogram and Q-Q plot of the sample distribution for sample size 10 are not the same as those of the sampling distribution for population size 10, so we reject it and conclude that the two are not the same.
The histogram and Q-Q plot of the sample distribution for sample size 1000 are the same as those of the sampling distribution for population size 1000, so we confirm it and conclude that the two are the same.