October 11, 2024
Load the Panel Study of Income CSV file:
PSID <- read.csv("PSID.csv", stringsAsFactors = TRUE)
mean_income <- mean(PSID$Income)
sd_income <- sd(PSID$Income)
cat("Sample average of family income:", mean_income, "\n")
## Sample average of family income: 77895.38
cat("Sample SD of family income:", sd_income, "\n")
## Sample SD of family income: 88959.81
summary <- summary(PSID$Income)
summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -267900 28000 55076 77895 100100 2125100
3.Compute the following percentiles: 90, 95, 99, 99.5, 99.9. (Hint: use the R command quantile.)
percentiles <- quantile(PSID$Income, probs = c(0.90, 0.95, 0.99, 0.995, 0.999))
percentiles
## 90% 95% 99% 99.5% 99.9%
## 160000.0 210228.0 390724.2 510569.8 978774.7
hist(PSID$Income, col="turquoise1", breaks = 500, xlim = c(0, 500000), main = "Histogram of Family Income", xlab = "Income")
# Plot histogram with normal curve overlay
hist(PSID$Income, col="turquoise1", breaks = 500, xlim = c(0, 500000), probability = TRUE,
main = "Histogram of Family Income", xlab = "Income")
curve(dnorm(x, mean = mean_income, sd = sd_income), add = TRUE, col = "red", lwd = 2)
cat("The chart above illustrates a normal distribution curve
with a mean and SD equal to the mean and SD of the sample.
We can clearly see that the U.S. family income is not normally
distributed as the theoretical curve and actual histogram do not overlay.
Another property of a normal distribution is that it's symmetrical
relative to the mean. This is not the case for our sample.")
## The chart above illustrates a normal distribution curve
## with a mean and SD equal to the mean and SD of the sample.
## We can clearly see that the U.S. family income is not normally
## distributed as the theoretical curve and actual histogram do not overlay.
## Another property of a normal distribution is that it's symmetrical
## relative to the mean. This is not the case for our sample.
# 99th and 99.5th percentiles from the data
percentiles_computed <- quantile(PSID$Income, probs = c(0.99, 0.995))
# theoretical 99th and 99.5th percentiles for a normal distribution with the same mean and SD
percentiles_theoretical <- qnorm(c(0.99, 0.995), mean = mean_income, sd = sd_income)
# Create a comparison table
comparison_table <- data.frame(
Percentile = c("99%", "99.5%"),
Empirical = percentiles_computed,
Theoretical = percentiles_theoretical
)
print(comparison_table)
## Percentile Empirical Theoretical
## 99% 99% 390724.2 284846.8
## 99.5% 99.5% 510569.8 307040.7
cat("The empirical percentiles from the PSID are lower than
the ones from the normal distribution. This suggests that
the distribution of the PSID date is likely skewed to the right
(with a longer tail on the higher income side), rather than
following a perfect normal distribution.\n")
## The empirical percentiles from the PSID are lower than
## the ones from the normal distribution. This suggests that
## the distribution of the PSID date is likely skewed to the right
## (with a longer tail on the higher income side), rather than
## following a perfect normal distribution.
real_mean_income <- mean(PSID$Income)
set.seed(12345)
sample_100 <- sample(PSID$Income, 100, replace = FALSE)
sample_mean_income <- mean(sample_100)
# sampling error
sampling_error <- real_mean_income - sample_mean_income
cat("True mean income in PSID-land:", real_mean_income, "\n")
## True mean income in PSID-land: 77895.38
cat("Sample mean income of 100 families:", sample_mean_income, "\n")
## Sample mean income of 100 families: 90173.69
cat("Sampling error:", sampling_error, "\n")
## Sampling error: -12278.31
cat("The sample mean income of the 100 families in PSID-land is $90,173.69,
while the true mean income across the entire population of 9,569 families
is $77,895.38. The sampling error, calculated as the difference between
the sample mean and the true mean, is $-12,278.31. This negative sampling error
indicates that the sample mean overestimates the true mean by this amount.
This discrepancy reflects the noise introduced when using a small sample size
to estimate a population mean. While sampling provides a more cost-effective
way of gathering information, it inherently introduces some degree of error.
In this case, the sampled families had, on average, higher incomes than
the full population, leading to an overestimation of the mean income in PSID-land. \n")
## The sample mean income of the 100 families in PSID-land is $90,173.69,
## while the true mean income across the entire population of 9,569 families
## is $77,895.38. The sampling error, calculated as the difference between
## the sample mean and the true mean, is $-12,278.31. This negative sampling error
## indicates that the sample mean overestimates the true mean by this amount.
##
## This discrepancy reflects the noise introduced when using a small sample size
## to estimate a population mean. While sampling provides a more cost-effective
## way of gathering information, it inherently introduces some degree of error.
## In this case, the sampled families had, on average, higher incomes than
## the full population, leading to an overestimation of the mean income in PSID-land.
set.seed(12345)
# 10,000 samples of size 100 and compute the mean income for each sample
TestSamples <- replicate(10000, sample(PSID$Income, 100, replace = FALSE))
# mean of each sample (each column)
TestMeans <- colMeans(TestSamples)
# sample average and standard deviation of the 10,000 sample means
sample_average_of_means <- mean(TestMeans)
sample_sd_of_means <- sd(TestMeans)
cat("Sample average of the mean incomes from 10,000 surveys:", sample_average_of_means, "\n")
## Sample average of the mean incomes from 10,000 surveys: 77913.99
cat("Sample standard deviation of the mean incomes from 10,000 surveys:", sample_sd_of_means, "\n")
## Sample standard deviation of the mean incomes from 10,000 surveys: 8942.945
hist(TestMeans, col="turquoise1", breaks = 100, xlim = c(50000, 120000),
main = "Histogram of Survey Means (Sample Size 100)",
xlab = "Mean Income")
# standard deviation of sample means
mean_test_means <- mean(TestMeans)
sd_test_means <- sd(TestMeans)
# empirical percentiles
empirical_percentiles <- quantile(TestMeans, probs = c(0.025, 0.5, 0.975))
# theoretical percentiles based on normal distribution
theoretical_percentiles <- qnorm(c(0.025, 0.5, 0.975), mean = mean_test_means, sd = sd_test_means)
cat("Empirical Percentiles:", empirical_percentiles, "\n")
## Empirical Percentiles: 62534.52 77052.17 97780.42
cat("Theoretical Percentiles (Normal Dist.):", theoretical_percentiles, "\n")
## Theoretical Percentiles (Normal Dist.): 60386.14 77913.99 95441.84
cat("The histogram shows a roughly bell-shaped distribution,
which is characteristic of a normal distribution.
The central peak and the symmetric decline on either side suggest
normality, though there may be a slight right skew.
The empirical percentiles are close to the theoretical percentiles,
especially at the 50th percentile. The small differences at
the 2.5th and 97.5th percentiles indicate a slight deviation
from perfect normality, with the distribution of survey means being
slightly skewed to the right.The distribution of survey means
approximates normality well, with minor deviations, which is expected
due to the right-skewed nature of income data.
")
## The histogram shows a roughly bell-shaped distribution,
## which is characteristic of a normal distribution.
## The central peak and the symmetric decline on either side suggest
## normality, though there may be a slight right skew.
##
## The empirical percentiles are close to the theoretical percentiles,
## especially at the 50th percentile. The small differences at
## the 2.5th and 97.5th percentiles indicate a slight deviation
## from perfect normality, with the distribution of survey means being
## slightly skewed to the right.The distribution of survey means
## approximates normality well, with minor deviations, which is expected
## due to the right-skewed nature of income data.
set.seed(12345)
TestSamples_1600 <- replicate(10000, sample(PSID$Income, 1600, replace = FALSE))
TestMeans_1600 <- colMeans(TestSamples_1600)
# sample average and standard deviation of the 10,000 sample means for size 1600
sample_average_of_means_1600 <- mean(TestMeans_1600)
sample_sd_of_means_1600 <- sd(TestMeans_1600)
cat("Sample average of the mean incomes from 10,000 surveys (sample size 1600):", sample_average_of_means_1600, "\n")
## Sample average of the mean incomes from 10,000 surveys (sample size 1600): 77897.25
cat("Sample standard deviation of the mean incomes from 10,000 surveys (sample size 1600):", sample_sd_of_means_1600, "\n")
## Sample standard deviation of the mean incomes from 10,000 surveys (sample size 1600): 2033.09
hist(TestMeans_1600, col="turquoise1", breaks = 100, xlim = c(50000, 120000),
main = "Histogram of Survey Means (Sample Size 1600)",
xlab = "Mean Income")
# standard deviation of sample means for size 1600
mean_test_means_1600 <- mean(TestMeans_1600)
sd_test_means_1600 <- sd(TestMeans_1600)
# empirical percentiles for sample size 1600
empirical_percentiles_1600 <- quantile(TestMeans_1600, probs = c(0.025, 0.5, 0.975))
# theoretical percentiles based on normal distribution for sample size 1600
theoretical_percentiles_1600 <- qnorm(c(0.025, 0.5, 0.975), mean = mean_test_means_1600, sd = sd_test_means_1600)
cat("Empirical Percentiles (Sample Size 1600):", empirical_percentiles_1600, "\n")
## Empirical Percentiles (Sample Size 1600): 74061.09 77854.13 81994.58
cat("Theoretical Percentiles (Normal Dist., Sample Size 1600):", theoretical_percentiles_1600, "\n")
## Theoretical Percentiles (Normal Dist., Sample Size 1600): 73912.47 77897.25 81882.03
cat("For a larger sample size (1600), the distribution of the survey means
is closer to a normal distribution compared to the smaller sample size of 100,
indicating reduced variability and increased accuracy of the sample mean estimates.\n")
## For a larger sample size (1600), the distribution of the survey means
## is closer to a normal distribution compared to the smaller sample size of 100,
## indicating reduced variability and increased accuracy of the sample mean estimates.
cat("For a larger sample size (1600), the distribution of the survey means
is closer to a normal distribution compared to the smaller sample size of 100,
indicating reduced variability and increased accuracy of the sample mean estimates.\n")
## For a larger sample size (1600), the distribution of the survey means
## is closer to a normal distribution compared to the smaller sample size of 100,
## indicating reduced variability and increased accuracy of the sample mean estimates.