# Load the ACS 2020 New York household-income subset, inspect its structure,
# and plot the distribution of household income.
# NOTE(review): setwd() ties the script to one machine's directory layout. It
# is kept because statements outside this section may rely on the working
# directory; consider passing a full path to read.csv() instead.
setwd("~/Desktop")
# `header = TRUE` is spelled out: `h = T` relied on partial argument matching
# and on `T`, which is an ordinary variable that can be reassigned.
hincp <- read.csv("ACS_2020_NY_subset.csv", header = TRUE)
head(hincp)
## serialno hincp
## 1 2020HU0000013 7300
## 2 2020HU0000022 900
## 3 2020HU0000038 55900
## 4 2020HU0000040 35800
## 5 2020HU0000051 202000
## 6 2020HU0000055 99200
names(hincp)
## [1] "serialno" "hincp"
dim(hincp)
## [1] 60360 2
# Histogram of the `hincp` column; the object returned by hist() is kept in
# `histogram`.
histogram <- hist(
  hincp$hincp,
  main = "Household Income Distribution",
  xlab = "Household Income $"
)
Most household incomes fall on the left side of the histogram, with a long tail extending to the right, so the distribution is right (positively) skewed.
library(car)
## Loading required package: carData
# Normal Q-Q plot of household income, followed by a red reference line drawn
# by qqline() over the same column.
qqnorm(
  y = hincp[["hincp"]],
  pch = 1,
  frame.plot = FALSE
)
qqline(
  y = hincp[["hincp"]],
  col = "red",
  lwd = 2
)
Most of the points lie above the reference line, so household income does not follow a normal distribution; it is positively skewed.
# Mean of the household-income column over all rows of `hincp`; the
# surrounding parentheses print the value as it is assigned.
(population_mean <- mean(hincp[["hincp"]]))
## [1] 110719.6
# Standard deviation of the same column, printed the same way.
(population_sd <- sd(hincp[["hincp"]]))
## [1] 126177.7
The average income across all households is 110,719.6.
# Draw one sample of size 2, record its mean, and plot the drawn values.
# Seed the RNG so the draw really is reproducible; this section was labelled
# reproducible but no seed was set before sampling.
set.seed(0)
# NOTE(review): the draw is taken from the integers 1..10000, not from
# hincp$hincp, and only one sample is taken, so `sample_means` holds a single
# mean rather than a set of sample means -- confirm this is intended.
sample <- sample(1:10000, 2, replace = TRUE)
sample_means <- mean(sample)
head(sample_means)
## [1] 5723.5
# The value recorded above comes from an unseeded run; re-running with the
# seed set will print a different number.
# Histogram of the drawn values. The x-axis label reads "Sample Means", but
# the plotted vector is the raw draw held in `sample`.
hist(sample, main = " ", xlab = "Sample Means", col = "steelblue")
The histogram appears to show a normal distribution, although two observations are too few to judge the shape of a distribution.
# Normal Q-Q plot of the values held in `sample`, with a red reference line
# fitted to the same values.
qqnorm(
  y = sample,
  pch = 1,
  frame.plot = FALSE
)
qqline(
  y = sample,
  col = "red",
  lwd = 2
)
The Q-Q plot appears consistent with a normal distribution, although two points cannot establish normality.
# Average of whatever `sample_means` holds; the parentheses print the value
# as it is assigned.
(Samples_Mean <- mean(sample_means))
## [1] 5723.5
# Recompute the population mean of household income for comparison.
(population_mean <- mean(hincp[["hincp"]]))
## [1] 110719.6
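The sample mean is clearly much lower than the population mean. Note, however, that the sample is drawn from the integers 1 to 10,000 rather than from the household incomes, so the two means are not directly comparable.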
# Draw one seeded sample of size 10, record its mean, and plot the drawn values.
# Fix the RNG seed so the draw is reproducible.
set.seed(0)
# Draw 10 values, with replacement, from the integers 1..10000.
# NOTE(review): the draw is taken from 1:10000, not from hincp$hincp, and only
# one sample is taken, so `sample_means` holds a single mean rather than a set
# of sample means -- confirm this is intended.
sample <- sample(1:10000, 10, replace = TRUE)
sample_means <- mean(sample)
# head() of a single value simply prints that value.
head(sample_means)
## [1] 6425.9
# Histogram of the drawn values. The x-axis label reads "Sample Means", but
# the plotted vector is the raw draw held in `sample`.
hist(sample, main = "", xlab = "Sample Means", col = "steelblue")
The histogram does not show a normal distribution.
# Q-Q plot comparing the values in `sample` against normal quantiles.
qqnorm(y = sample, frame.plot = FALSE, pch = 1)
# Red reference line, double width, fitted to the same values.
qqline(y = sample, lwd = 2, col = "red")
The Q-Q plot indicates that the sample is not normally distributed.
# Store and print the average of the contents of `sample_means`.
(Samples_Mean <- mean(sample_means))
## [1] 6425.9
# Store and print the mean of the household-income column for comparison.
(population_mean <- mean(hincp[["hincp"]]))
## [1] 110719.6
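The sample mean is clearly much lower than the population mean. Note, however, that the sample is drawn from the integers 1 to 10,000 rather than from the household incomes, so the two means are not directly comparable.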
# Draw one seeded sample of size 1000, record its mean, and plot the drawn
# values.
# Fix the RNG seed so the draw is reproducible.
set.seed(0)
# Draw 1000 values, with replacement, from the integers 1..10000.
# NOTE(review): the draw is taken from 1:10000, not from hincp$hincp, and only
# one sample is taken, so `sample_means` holds a single mean rather than a set
# of sample means -- confirm this is intended.
sample <- sample(1:10000, 1000, replace = TRUE)
sample_means <- mean(sample)
# head() of a single value simply prints that value.
head(sample_means)
## [1] 5084.031
# Histogram of the drawn values. The x-axis label reads "Sample Means", but
# the plotted vector is the raw draw held in `sample`.
hist(sample, main = "", xlab = "Sample Means", col = "steelblue")
The histogram shows a normal distribution.
# Normal Q-Q plot of the values held in `sample`, with a red reference line.
qqnorm(sample, pch = 1, frame.plot = FALSE)
# Fit the reference line to the same vector that qqnorm() plotted. Fitting it
# to `sample_means` (a single number) yields a flat line at that value instead
# of a line through the sample's quartiles.
qqline(sample, col = "red", lwd = 2)
The Q-Q plot shows a normal distribution.
# Average of the contents of `sample_means`, assigned and printed in one step.
(Samples_Mean <- mean(sample_means))
## [1] 5084.031
# Population mean of household income, recomputed and printed for comparison.
(population_mean <- mean(hincp[["hincp"]]))
## [1] 110719.6
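The sample mean is clearly much lower than the population mean. Note, however, that the sample is drawn from the integers 1 to 10,000 rather than from the household incomes, so the two means are not directly comparable.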
# Redraw the population histogram and normal Q-Q plot of household income.
# The title and axis label previously read "Incom"; they now read "Income".
histogram <- hist(
  hincp$hincp,
  main = "Household Income Distribution",
  xlab = "Household Income $"
)
# Q-Q plot of the income column with a red reference line over the same data.
qqnorm(hincp$hincp, pch = 1, frame.plot = FALSE)
qqline(hincp$hincp, col = "red", lwd = 2)
Using the population graphs given in question 6, we can check the two properties of the Central Limit Theorem (CLT) by comparing the histograms and deciding whether they are the same (confirm) or not (reject).
For a sample size of 2, the histogram and Q-Q plot of the sample distribution are not the same as those of the sampling distribution for size 2, so we reject it and conclude that the two are not the same.
For a sample size of 10, the histogram and Q-Q plot of the sample distribution are not the same as those of the sampling distribution for size 10, so we reject it and conclude that the two are not the same.
For a sample size of 1000, the histogram and Q-Q plot of the sample distribution are the same as those of the sampling distribution for size 1000, so we confirm it and conclude that the two are the same.