This document will explore functions which may be useful for simulation of CLT, CI, Hypothesis testing on numerical as well as on categorical variables
library(statsr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(shiny)
## Warning: package 'shiny' was built under R version 3.5.1
library(ggplot2)
library(moments)
###################Foundations for inference - Sampling distributions#################
# The data
# We consider real estate data from the city of Ames, Iowa. The details of every real estate transaction in Ames is recorded by the City Assessor's office. Our particular focus for this lab will be all residential home sales in Ames between 2006 and 2010. This collection represents our population of interest. In this lab we would like to learn about these home sales by taking smaller samples from the full population. Let's load the data.
data(ames)
names(ames)
## [1] "Order" "PID" "area"
## [4] "price" "MS.SubClass" "MS.Zoning"
## [7] "Lot.Frontage" "Lot.Area" "Street"
## [10] "Alley" "Lot.Shape" "Land.Contour"
## [13] "Utilities" "Lot.Config" "Land.Slope"
## [16] "Neighborhood" "Condition.1" "Condition.2"
## [19] "Bldg.Type" "House.Style" "Overall.Qual"
## [22] "Overall.Cond" "Year.Built" "Year.Remod.Add"
## [25] "Roof.Style" "Roof.Matl" "Exterior.1st"
## [28] "Exterior.2nd" "Mas.Vnr.Type" "Mas.Vnr.Area"
## [31] "Exter.Qual" "Exter.Cond" "Foundation"
## [34] "Bsmt.Qual" "Bsmt.Cond" "Bsmt.Exposure"
## [37] "BsmtFin.Type.1" "BsmtFin.SF.1" "BsmtFin.Type.2"
## [40] "BsmtFin.SF.2" "Bsmt.Unf.SF" "Total.Bsmt.SF"
## [43] "Heating" "Heating.QC" "Central.Air"
## [46] "Electrical" "X1st.Flr.SF" "X2nd.Flr.SF"
## [49] "Low.Qual.Fin.SF" "Bsmt.Full.Bath" "Bsmt.Half.Bath"
## [52] "Full.Bath" "Half.Bath" "Bedroom.AbvGr"
## [55] "Kitchen.AbvGr" "Kitchen.Qual" "TotRms.AbvGrd"
## [58] "Functional" "Fireplaces" "Fireplace.Qu"
## [61] "Garage.Type" "Garage.Yr.Blt" "Garage.Finish"
## [64] "Garage.Cars" "Garage.Area" "Garage.Qual"
## [67] "Garage.Cond" "Paved.Drive" "Wood.Deck.SF"
## [70] "Open.Porch.SF" "Enclosed.Porch" "X3Ssn.Porch"
## [73] "Screen.Porch" "Pool.Area" "Pool.QC"
## [76] "Fence" "Misc.Feature" "Misc.Val"
## [79] "Mo.Sold" "Yr.Sold" "Sale.Type"
## [82] "Sale.Condition"
# interest we have for area
ggplot(data = ames, aes(x = area)) +
geom_histogram(binwidth = 250)
ames %>%
summarise(mu = mean(area), pop_med = median(area),
sigma = sd(area), pop_iqr = IQR(area),
pop_min = min(area), pop_max = max(area),
pop_q1 = quantile(area, 0.25), # first quartile, 25th percentile
pop_q3 = quantile(area, 0.75)) # third quartile, 75th percentile
## # A tibble: 1 x 8
## mu pop_med sigma pop_iqr pop_min pop_max pop_q1 pop_q3
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1500. 1442 506. 617. 334 5642 1126 1743.
# take sample
samp1 <- ames %>%
sample_n(size = 50)
samp1 %>%
summarise(x_bar = mean(area), samp_med = median(area),
s = sd(area), samp_iqr = IQR(area),
samp_min = min(area), samp_max = max(area),
samp_q1 = quantile(area, 0.25), # first quartile, 25th percentile
samp_q3 = quantile(area, 0.75)) # third quartile, 75th percentile
## # A tibble: 1 x 8
## x_bar samp_med s samp_iqr samp_min samp_max samp_q1 samp_q3
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1478. 1436. 425. 521. 816 2654 1207 1728.
# visualization and simulation of n sample mean and its distribution
sample_means50 <- ames %>%
rep_sample_n(size = 50, reps = 15000, replace = TRUE) %>%
summarise(x_bar = mean(area))
ggplot(data = sample_means50, aes(x = x_bar)) +
geom_histogram(binwidth = 20)
##############Foundations for inference - Confidence intervals######################################
# setting this one to just get random sample same everytime
set.seed(9102015)
n <- 60
samp60 <- sample_n(ames,n)
summary(samp60$area)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 759 1196 1482 1603 1799 4676
ggplot(data = samp60, aes(x = area)) + geom_histogram(binwidth = 250)
# critical value of asscoaited with 95% CI is 97.5% percentile
z_s_95 <- qnorm(0.975)
ci <- samp60 %>% summarise(lower = mean(area) - z_s_95 * sd(area)/sqrt(n), upper = mean(area) + z_s_95 * sd(area)/sqrt(n))
ci
## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 1433. 1774.
#we’re 95% confident that the true average size of houses in Ames lies between the values lower and upper.
#There are a few conditions that must be met for this interval to be valid.
# sample must be random,
# sample enough ?
ns <- skewness(samp60$area)^2 * 10
nk <- kurtosis(samp60$area) * 10
params <- ames %>%
summarise(mu = mean(area))
params
## # A tibble: 1 x 1
## mu
## <dbl>
## 1 1500.
# Using R, we’re going to collect many samples to learn more about how sample means and confidence intervals vary from one sample to another.
#
# Here is the rough outline:
#
# Obtain a random sample.
# Calculate the sample’s mean and standard deviation, and use these to calculate and store the lower and upper bounds of the confidence intervals.
# Repeat these steps 50 times.
# We can accomplish this using the rep_sample_n function. The following lines of code takes 50 random samples of size n from population (and remember we defined n=60n=60 earlier), and computes the upper and lower bounds of the confidence intervals based on these samples.
ci <- ames %>%
rep_sample_n(size = n, reps = 50, replace = TRUE) %>%
summarise(lower = mean(area) - z_s_95 * (sd(area) / sqrt(n)),
upper = mean(area) + z_s_95 * (sd(area) / sqrt(n)))
ci <- ci %>%
mutate(capture_mu = ifelse(lower < params$mu & upper > params$mu, "yes", "no"))
ci %>% group_by(capture_mu) %>% summarise(n())
## # A tibble: 2 x 2
## capture_mu `n()`
## <chr> <int>
## 1 no 3
## 2 yes 47
# convert each bound as single row to easy plot
ci_data <- data.frame(ci_id = c(1:50, 1:50),
ci_bounds = c(ci$lower, ci$upper),
capture_mu = c(ci$capture_mu, ci$capture_mu))
ggplot(data = ci_data, aes(x = ci_bounds, y = ci_id,
group = ci_id, color = capture_mu)) +
geom_point(size = 2) + # add points at the ends, size = 2
geom_line() + # connect with lines
geom_vline(xintercept = params$mu, color = "darkgray") # draw vertical line
# do it for CI 99
z_s_99 <- qnorm(0.995)
ci_99 <- ames %>%
rep_sample_n(size = n, reps = 50, replace = TRUE) %>%
summarise(lower = mean(area) - z_s_99 * (sd(area) / sqrt(n)),
upper = mean(area) + z_s_99 * (sd(area) / sqrt(n)))
ci_99 <- ci_99 %>%
mutate(capture_mu = ifelse(lower < params$mu & upper > params$mu, "yes", "no"))
ci_99 %>% group_by(capture_mu) %>% summarise(n())
## # A tibble: 2 x 2
## capture_mu `n()`
## <chr> <int>
## 1 no 1
## 2 yes 49
# convert each bound as single row to easy plot
ci_data_99 <- data.frame(ci_id = c(1:50, 1:50),
ci_bounds = c(ci_99$lower, ci_99$upper),
capture_mu = c(ci_99$capture_mu, ci_99$capture_mu))
ggplot(data = ci_data_99, aes(x = ci_bounds, y = ci_id,
group = ci_id, color = capture_mu)) +
geom_point(size = 2) + # add points at the ends, size = 2
geom_line() + # connect with lines
geom_vline(xintercept = params$mu, color = "darkgray") # draw vertical line
# Inference on Numerical Data
data(nc)
names(nc)
## [1] "fage" "mage" "mature" "weeks"
## [5] "premie" "visits" "marital" "gained"
## [9] "weight" "lowbirthweight" "gender" "habit"
## [13] "whitemom"
str(nc)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 13 variables:
## $ fage : int NA NA 19 21 NA NA 18 17 NA 20 ...
## $ mage : int 13 14 15 15 15 15 15 15 16 16 ...
## $ mature : Factor w/ 2 levels "mature mom","younger mom": 2 2 2 2 2 2 2 2 2 2 ...
## $ weeks : int 39 42 37 41 39 38 37 35 38 37 ...
## $ premie : Factor w/ 2 levels "full term","premie": 1 1 1 1 1 1 1 2 1 1 ...
## $ visits : int 10 15 11 6 9 19 12 5 9 13 ...
## $ marital : Factor w/ 2 levels "married","not married": 1 1 1 1 1 1 1 1 1 1 ...
## $ gained : int 38 20 38 34 27 22 76 15 NA 52 ...
## $ weight : num 7.63 7.88 6.63 8 6.38 5.38 8.44 4.69 8.81 6.94 ...
## $ lowbirthweight: Factor w/ 2 levels "low","not low": 2 2 2 2 2 1 2 1 2 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 2 2 2 1 ...
## $ habit : Factor w/ 2 levels "nonsmoker","smoker": 1 1 1 1 1 1 1 1 1 1 ...
## $ whitemom : Factor w/ 2 levels "not white","white": 1 1 2 2 1 1 1 1 2 2 ...
summary(nc$gained)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 20.00 30.00 30.33 38.00 85.00 27
ggplot(nc, aes(x = habit, y = weight)) + geom_boxplot()
nc %>% group_by(habit) %>% summarise(iqr = IQR(weight, na.rm = TRUE))
## # A tibble: 3 x 2
## habit iqr
## <fct> <dbl>
## 1 nonsmoker 1.62
## 2 smoker 1.66
## 3 <NA> 0
nc %>%
group_by(habit) %>%
summarise(mean_weight = mean(weight))
## # A tibble: 3 x 2
## habit mean_weight
## <fct> <dbl>
## 1 nonsmoker 7.14
## 2 smoker 6.83
## 3 <NA> 3.63
#There is an observed difference, but is this difference statistically significant? In order to answer this question we will conduct a
#hypothesis test.
inference(y = weight, x = habit, data = nc, statistic = "mean", type = "ht", null = 0,
alternative = "twosided", method = "theoretical")
## Response variable: numerical
## Explanatory variable: categorical (2 levels)
## n_nonsmoker = 873, y_bar_nonsmoker = 7.1443, s_nonsmoker = 1.5187
## n_smoker = 126, y_bar_smoker = 6.8287, s_smoker = 1.3862
## H0: mu_nonsmoker = mu_smoker
## HA: mu_nonsmoker != mu_smoker
## t = 2.359, df = 125
## p_value = 0.0199
# CI test
inference(y = weight, x = habit, data = nc, statistic = "mean", type = "ci", conf_level = 0.95 ,method = "theoretical")
## Response variable: numerical, Explanatory variable: categorical (2 levels)
## n_nonsmoker = 873, y_bar_nonsmoker = 7.1443, s_nonsmoker = 1.5187
## n_smoker = 126, y_bar_smoker = 6.8287, s_smoker = 1.3862
## 95% CI (nonsmoker - smoker): (0.0508 , 0.5803)
# just chnage the order
inference(y = weight, x = habit, data = nc, statistic = "mean", type = "ci",
method = "theoretical", order = c("smoker","nonsmoker"))
## Response variable: numerical, Explanatory variable: categorical (2 levels)
## n_smoker = 126, y_bar_smoker = 6.8287, s_smoker = 1.3862
## n_nonsmoker = 873, y_bar_nonsmoker = 7.1443, s_nonsmoker = 1.5187
## 95% CI (smoker - nonsmoker): (-0.5803 , -0.0508)
# Calculate a 99% confidence interval for the average length of pregnancies (weeks)
inference(y = weeks, data = nc, statistic = "mean", type = "ci",
method = "theoretical", conf_level = 0.99)
## Single numerical variable
## n = 998, y-bar = 38.3347, s = 2.9316
## 99% CI: (38.0952 , 38.5742)
# 90 level
inference(y = weeks, data = nc, statistic = "mean", type = "ci",
method = "theoretical", conf_level = 0.90)
## Single numerical variable
## n = 998, y-bar = 38.3347, s = 2.9316
## 90% CI: (38.1819 , 38.4874)
# 95% level
inference(y = weeks, data = nc, statistic = "mean", type = "ci",
method = "theoretical", conf_level = 0.95)
## Single numerical variable
## n = 998, y-bar = 38.3347, s = 2.9316
## 95% CI: (38.1526 , 38.5168)
# Conduct a hypothesis test evaluating whether the average weight gained by younger mothers is different than the average weight gained by mature mothers.
inference(y = gained, x = mature, data = nc, statistic = "mean", type = "ht", null = 0,
alternative = "twosided", method = "theoretical")
## Response variable: numerical
## Explanatory variable: categorical (2 levels)
## n_mature mom = 129, y_bar_mature mom = 28.7907, s_mature mom = 13.4824
## n_younger mom = 844, y_bar_younger mom = 30.5604, s_younger mom = 14.3469
## H0: mu_mature mom = mu_younger mom
## HA: mu_mature mom != mu_younger mom
## t = -1.3765, df = 128
## p_value = 0.1711
nc %>% group_by(mature) %>% summarise(maxage = max(mage), minage = min(mage))
## # A tibble: 2 x 3
## mature maxage minage
## <fct> <dbl> <dbl>
## 1 mature mom 50 35
## 2 younger mom 34 13
############ play with proportions and Inference for categorical data#####################
data(atheism)
names(atheism)
## [1] "nationality" "response" "year"
head(atheism)
## # A tibble: 6 x 3
## nationality response year
## <fct> <fct> <int>
## 1 Afghanistan non-atheist 2012
## 2 Afghanistan non-atheist 2012
## 3 Afghanistan non-atheist 2012
## 4 Afghanistan non-atheist 2012
## 5 Afghanistan non-atheist 2012
## 6 Afghanistan non-atheist 2012
str(atheism$response)
## Factor w/ 2 levels "atheist","non-atheist": 2 2 2 2 2 2 2 2 2 2 ...
us12 <- atheism %>%
filter(nationality == "United States" , atheism$year == "2012")
us12 %>% group_by(response) %>% summarise(count = n())
## # A tibble: 2 x 2
## response count
## <fct> <int>
## 1 atheist 50
## 2 non-atheist 952
# Write out the conditions for inference to construct a 95% confidence interval for the proportion of atheists in the United States in 2012. Are you confident all conditions are met?
#
# If the conditions for inference are reasonable, we can either calculate the standard error and construct the interval by hand, or allow the inference function to do it for us.
inference(y = response, data = us12, statistic = "proportion", type = "ci", method = "theoretical", success = "atheist")
## Single categorical variable, success: atheist
## n = 1002, p-hat = 0.0499
## 95% CI: (0.0364 , 0.0634)
# Note that since the goal is to construct an interval estimate for a proportion, it’s necessary to specify what constitutes a `success'', which here is a response ofatheist`.
d <- data.frame(p <- seq(0, 1, 0.01))
n <- 1000
d <- d %>%
mutate(me = 1.96*sqrt(p*(1 - p)/n))
ggplot(d, aes(x = p, y = me)) +
geom_line()
#There is convincing evidence that Spain has seen a change in its atheism index between 2005 and 2012.
spain <- atheism %>%
filter(nationality == "Spain")
inference(y = response, x = factor(year), data = spain, statistic = "proportion", type = "ht", null = 0,
alternative = "twosided", method = "theoretical", success = "atheist")
## Response variable: categorical (2 levels, success: atheist)
## Explanatory variable: categorical (2 levels)
## n_2005 = 1146, p_hat_2005 = 0.1003
## n_2012 = 1145, p_hat_2012 = 0.09
## H0: p_2005 = p_2012
## HA: p_2005 != p_2012
## z = 0.8476
## p_value = 0.3966
# for usa?
usa <- atheism %>%
filter(nationality == "United States")
inference(y = response, x = factor(year), data = usa, statistic = "proportion", type = "ht", null = 0,
alternative = "twosided", method = "theoretical", success = "atheist")
## Response variable: categorical (2 levels, success: atheist)
## Explanatory variable: categorical (2 levels)
## n_2005 = 1002, p_hat_2005 = 0.01
## n_2012 = 1002, p_hat_2012 = 0.0499
## H0: p_2005 = p_2012
## HA: p_2005 != p_2012
## z = -5.2431
## p_value = < 0.0001