R Markdown

This document will explore functions which may be useful for simulation of CLT, CI, Hypothesis testing on numerical as well as on categorical variables

library(statsr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(shiny)
## Warning: package 'shiny' was built under R version 3.5.1
library(ggplot2)
library(moments)



###################Foundations for inference - Sampling distributions#################
# The data
# We consider real estate data from the city of Ames, Iowa. The details of every real estate transaction in Ames is recorded by the City Assessor's office. Our particular focus for this lab will be all residential home sales in Ames between 2006 and 2010. This collection represents our population of interest. In this lab we would like to learn about these home sales by taking smaller samples from the full population. Let's load the data.

data(ames)
names(ames)
##  [1] "Order"           "PID"             "area"           
##  [4] "price"           "MS.SubClass"     "MS.Zoning"      
##  [7] "Lot.Frontage"    "Lot.Area"        "Street"         
## [10] "Alley"           "Lot.Shape"       "Land.Contour"   
## [13] "Utilities"       "Lot.Config"      "Land.Slope"     
## [16] "Neighborhood"    "Condition.1"     "Condition.2"    
## [19] "Bldg.Type"       "House.Style"     "Overall.Qual"   
## [22] "Overall.Cond"    "Year.Built"      "Year.Remod.Add" 
## [25] "Roof.Style"      "Roof.Matl"       "Exterior.1st"   
## [28] "Exterior.2nd"    "Mas.Vnr.Type"    "Mas.Vnr.Area"   
## [31] "Exter.Qual"      "Exter.Cond"      "Foundation"     
## [34] "Bsmt.Qual"       "Bsmt.Cond"       "Bsmt.Exposure"  
## [37] "BsmtFin.Type.1"  "BsmtFin.SF.1"    "BsmtFin.Type.2" 
## [40] "BsmtFin.SF.2"    "Bsmt.Unf.SF"     "Total.Bsmt.SF"  
## [43] "Heating"         "Heating.QC"      "Central.Air"    
## [46] "Electrical"      "X1st.Flr.SF"     "X2nd.Flr.SF"    
## [49] "Low.Qual.Fin.SF" "Bsmt.Full.Bath"  "Bsmt.Half.Bath" 
## [52] "Full.Bath"       "Half.Bath"       "Bedroom.AbvGr"  
## [55] "Kitchen.AbvGr"   "Kitchen.Qual"    "TotRms.AbvGrd"  
## [58] "Functional"      "Fireplaces"      "Fireplace.Qu"   
## [61] "Garage.Type"     "Garage.Yr.Blt"   "Garage.Finish"  
## [64] "Garage.Cars"     "Garage.Area"     "Garage.Qual"    
## [67] "Garage.Cond"     "Paved.Drive"     "Wood.Deck.SF"   
## [70] "Open.Porch.SF"   "Enclosed.Porch"  "X3Ssn.Porch"    
## [73] "Screen.Porch"    "Pool.Area"       "Pool.QC"        
## [76] "Fence"           "Misc.Feature"    "Misc.Val"       
## [79] "Mo.Sold"         "Yr.Sold"         "Sale.Type"      
## [82] "Sale.Condition"
# interest we have for area
ggplot(data = ames, aes(x = area)) +
  geom_histogram(binwidth = 250)

ames %>%
  summarise(mu = mean(area), pop_med = median(area), 
            sigma = sd(area), pop_iqr = IQR(area),
            pop_min = min(area), pop_max = max(area),
            pop_q1 = quantile(area, 0.25),  # first quartile, 25th percentile
            pop_q3 = quantile(area, 0.75))  # third quartile, 75th percentile
## # A tibble: 1 x 8
##      mu pop_med sigma pop_iqr pop_min pop_max pop_q1 pop_q3
##   <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl>  <dbl>  <dbl>
## 1 1500.    1442  506.    617.     334    5642   1126  1743.
# take sample 
samp1 <- ames %>%
  sample_n(size = 50)

samp1 %>%
  summarise(x_bar = mean(area), samp_med = median(area), 
            s = sd(area), samp_iqr = IQR(area),
            samp_min = min(area), samp_max = max(area),
            samp_q1 = quantile(area, 0.25),  # first quartile, 25th percentile
            samp_q3 = quantile(area, 0.75))  # third quartile, 75th percentile
## # A tibble: 1 x 8
##   x_bar samp_med     s samp_iqr samp_min samp_max samp_q1 samp_q3
##   <dbl>    <dbl> <dbl>    <dbl>    <dbl>    <dbl>   <dbl>   <dbl>
## 1 1478.    1436.  425.     521.      816     2654    1207   1728.
# visualization and simulation of n sample mean and its distribution
sample_means50 <- ames %>%
  rep_sample_n(size = 50, reps = 15000, replace = TRUE) %>%
  summarise(x_bar = mean(area))

ggplot(data = sample_means50, aes(x = x_bar)) +
  geom_histogram(binwidth = 20)

##############Foundations for inference - Confidence intervals######################################
# setting this one to just get random sample same everytime
set.seed(9102015)
n <- 60
samp60 <- sample_n(ames,n)
summary(samp60$area)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     759    1196    1482    1603    1799    4676
ggplot(data = samp60, aes(x = area)) + geom_histogram(binwidth = 250)

# critical value of asscoaited with 95% CI is 97.5% percentile
z_s_95 <- qnorm(0.975)

ci <- samp60 %>% summarise(lower = mean(area) - z_s_95 * sd(area)/sqrt(n), upper = mean(area) + z_s_95 * sd(area)/sqrt(n))
ci
## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1 1433. 1774.
#we’re 95% confident that the true average size of houses in Ames lies between the values lower and upper.
#There are a few conditions that must be met for this interval to be valid.
# sample must be random,

# sample enough ?
ns <- skewness(samp60$area)^2 * 10
nk <- kurtosis(samp60$area) * 10


params <- ames %>%
  summarise(mu = mean(area))
params
## # A tibble: 1 x 1
##      mu
##   <dbl>
## 1 1500.
# Using R, we’re going to collect many samples to learn more about how sample means and confidence intervals vary from one sample to another.
#
# Here is the rough outline:
#
#   Obtain a random sample.
# Calculate the sample’s mean and standard deviation, and use these to calculate and store the lower and upper bounds of the confidence intervals.
# Repeat these steps 50 times.
# We can accomplish this using the rep_sample_n function. The following lines of code takes 50 random samples of size n from population (and remember we defined n=60n=60 earlier), and computes the upper and lower bounds of the confidence intervals based on these samples.

ci <- ames %>%
  rep_sample_n(size = n, reps = 50, replace = TRUE) %>%
  summarise(lower = mean(area) - z_s_95 * (sd(area) / sqrt(n)),
            upper = mean(area) + z_s_95 * (sd(area) / sqrt(n)))

ci <- ci %>%
  mutate(capture_mu = ifelse(lower < params$mu & upper > params$mu, "yes", "no"))

ci %>% group_by(capture_mu) %>% summarise(n())
## # A tibble: 2 x 2
##   capture_mu `n()`
##   <chr>      <int>
## 1 no             3
## 2 yes           47
# convert each bound as single row to easy plot
ci_data <- data.frame(ci_id = c(1:50, 1:50),
                      ci_bounds = c(ci$lower, ci$upper),
                      capture_mu = c(ci$capture_mu, ci$capture_mu))

ggplot(data = ci_data, aes(x = ci_bounds, y = ci_id,
                           group = ci_id, color = capture_mu)) +
  geom_point(size = 2) +  # add points at the ends, size = 2
  geom_line() +           # connect with lines
  geom_vline(xintercept = params$mu, color = "darkgray") # draw vertical line

# do it for CI 99
z_s_99 <- qnorm(0.995)

ci_99 <- ames %>%
  rep_sample_n(size = n, reps = 50, replace = TRUE) %>%
  summarise(lower = mean(area) - z_s_99 * (sd(area) / sqrt(n)),
            upper = mean(area) + z_s_99 * (sd(area) / sqrt(n)))

ci_99 <- ci_99 %>%
  mutate(capture_mu = ifelse(lower < params$mu & upper > params$mu, "yes", "no"))

ci_99 %>% group_by(capture_mu) %>% summarise(n())
## # A tibble: 2 x 2
##   capture_mu `n()`
##   <chr>      <int>
## 1 no             1
## 2 yes           49
# convert each bound as single row to easy plot
ci_data_99 <- data.frame(ci_id = c(1:50, 1:50),
                         ci_bounds = c(ci_99$lower, ci_99$upper),
                         capture_mu = c(ci_99$capture_mu, ci_99$capture_mu))

ggplot(data = ci_data_99, aes(x = ci_bounds, y = ci_id,
                              group = ci_id, color = capture_mu)) +
  geom_point(size = 2) +  # add points at the ends, size = 2
  geom_line() +           # connect with lines
  geom_vline(xintercept = params$mu, color = "darkgray") # draw vertical line

# Inference on Numerical Data

data(nc)
names(nc)
##  [1] "fage"           "mage"           "mature"         "weeks"         
##  [5] "premie"         "visits"         "marital"        "gained"        
##  [9] "weight"         "lowbirthweight" "gender"         "habit"         
## [13] "whitemom"
str(nc)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1000 obs. of  13 variables:
##  $ fage          : int  NA NA 19 21 NA NA 18 17 NA 20 ...
##  $ mage          : int  13 14 15 15 15 15 15 15 16 16 ...
##  $ mature        : Factor w/ 2 levels "mature mom","younger mom": 2 2 2 2 2 2 2 2 2 2 ...
##  $ weeks         : int  39 42 37 41 39 38 37 35 38 37 ...
##  $ premie        : Factor w/ 2 levels "full term","premie": 1 1 1 1 1 1 1 2 1 1 ...
##  $ visits        : int  10 15 11 6 9 19 12 5 9 13 ...
##  $ marital       : Factor w/ 2 levels "married","not married": 1 1 1 1 1 1 1 1 1 1 ...
##  $ gained        : int  38 20 38 34 27 22 76 15 NA 52 ...
##  $ weight        : num  7.63 7.88 6.63 8 6.38 5.38 8.44 4.69 8.81 6.94 ...
##  $ lowbirthweight: Factor w/ 2 levels "low","not low": 2 2 2 2 2 1 2 1 2 2 ...
##  $ gender        : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 2 2 2 1 ...
##  $ habit         : Factor w/ 2 levels "nonsmoker","smoker": 1 1 1 1 1 1 1 1 1 1 ...
##  $ whitemom      : Factor w/ 2 levels "not white","white": 1 1 2 2 1 1 1 1 2 2 ...
summary(nc$gained)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   20.00   30.00   30.33   38.00   85.00      27
ggplot(nc, aes(x = habit, y = weight)) +   geom_boxplot()

nc %>% group_by(habit) %>% summarise(iqr = IQR(weight, na.rm = TRUE))
## # A tibble: 3 x 2
##   habit       iqr
##   <fct>     <dbl>
## 1 nonsmoker  1.62
## 2 smoker     1.66
## 3 <NA>       0
nc %>%
  group_by(habit) %>%
  summarise(mean_weight = mean(weight))
## # A tibble: 3 x 2
##   habit     mean_weight
##   <fct>           <dbl>
## 1 nonsmoker        7.14
## 2 smoker           6.83
## 3 <NA>             3.63
#There is an observed difference, but is this difference statistically significant? In order to answer this question we will conduct a
#hypothesis test.

inference(y = weight, x = habit, data = nc, statistic = "mean", type = "ht", null = 0,
          alternative = "twosided", method = "theoretical")
## Response variable: numerical
## Explanatory variable: categorical (2 levels) 
## n_nonsmoker = 873, y_bar_nonsmoker = 7.1443, s_nonsmoker = 1.5187
## n_smoker = 126, y_bar_smoker = 6.8287, s_smoker = 1.3862
## H0: mu_nonsmoker =  mu_smoker
## HA: mu_nonsmoker != mu_smoker
## t = 2.359, df = 125
## p_value = 0.0199

# CI test
inference(y = weight, x = habit, data = nc, statistic = "mean", type = "ci", conf_level = 0.95 ,method = "theoretical")
## Response variable: numerical, Explanatory variable: categorical (2 levels)
## n_nonsmoker = 873, y_bar_nonsmoker = 7.1443, s_nonsmoker = 1.5187
## n_smoker = 126, y_bar_smoker = 6.8287, s_smoker = 1.3862
## 95% CI (nonsmoker - smoker): (0.0508 , 0.5803)

# just chnage the order
inference(y = weight, x = habit, data = nc, statistic = "mean", type = "ci",
          method = "theoretical", order = c("smoker","nonsmoker"))
## Response variable: numerical, Explanatory variable: categorical (2 levels)
## n_smoker = 126, y_bar_smoker = 6.8287, s_smoker = 1.3862
## n_nonsmoker = 873, y_bar_nonsmoker = 7.1443, s_nonsmoker = 1.5187
## 95% CI (smoker - nonsmoker): (-0.5803 , -0.0508)

# Calculate a 99% confidence interval for the average length of pregnancies (weeks)
inference(y = weeks, data = nc, statistic = "mean", type = "ci",
          method = "theoretical", conf_level = 0.99)
## Single numerical variable
## n = 998, y-bar = 38.3347, s = 2.9316
## 99% CI: (38.0952 , 38.5742)

# 90 level
inference(y = weeks, data = nc, statistic = "mean", type = "ci",
          method = "theoretical", conf_level = 0.90)
## Single numerical variable
## n = 998, y-bar = 38.3347, s = 2.9316
## 90% CI: (38.1819 , 38.4874)

# 95% level
inference(y = weeks, data = nc, statistic = "mean", type = "ci",
          method = "theoretical", conf_level = 0.95)
## Single numerical variable
## n = 998, y-bar = 38.3347, s = 2.9316
## 95% CI: (38.1526 , 38.5168)

# Conduct a hypothesis test evaluating whether the average weight gained by younger mothers is different than the average weight gained by mature mothers.
inference(y = gained, x = mature, data = nc, statistic = "mean", type = "ht", null = 0,
          alternative = "twosided", method = "theoretical")
## Response variable: numerical
## Explanatory variable: categorical (2 levels) 
## n_mature mom = 129, y_bar_mature mom = 28.7907, s_mature mom = 13.4824
## n_younger mom = 844, y_bar_younger mom = 30.5604, s_younger mom = 14.3469
## H0: mu_mature mom =  mu_younger mom
## HA: mu_mature mom != mu_younger mom
## t = -1.3765, df = 128
## p_value = 0.1711

nc %>% group_by(mature) %>% summarise(maxage = max(mage), minage = min(mage))
## # A tibble: 2 x 3
##   mature      maxage minage
##   <fct>        <dbl>  <dbl>
## 1 mature mom      50     35
## 2 younger mom     34     13
############ play with proportions and Inference for categorical data#####################

data(atheism)
names(atheism)
## [1] "nationality" "response"    "year"
head(atheism)
## # A tibble: 6 x 3
##   nationality response     year
##   <fct>       <fct>       <int>
## 1 Afghanistan non-atheist  2012
## 2 Afghanistan non-atheist  2012
## 3 Afghanistan non-atheist  2012
## 4 Afghanistan non-atheist  2012
## 5 Afghanistan non-atheist  2012
## 6 Afghanistan non-atheist  2012
str(atheism$response)
##  Factor w/ 2 levels "atheist","non-atheist": 2 2 2 2 2 2 2 2 2 2 ...
us12 <- atheism %>%
  filter(nationality == "United States" , atheism$year == "2012")

us12 %>% group_by(response) %>% summarise(count = n())
## # A tibble: 2 x 2
##   response    count
##   <fct>       <int>
## 1 atheist        50
## 2 non-atheist   952
# Write out the conditions for inference to construct a 95% confidence interval for the proportion of atheists in the United States in 2012. Are you confident all conditions are met?
#
#   If the conditions for inference are reasonable, we can either calculate the standard error and construct the interval by hand, or allow the inference function to do it for us.
inference(y = response, data = us12, statistic = "proportion", type = "ci", method = "theoretical", success = "atheist")
## Single categorical variable, success: atheist
## n = 1002, p-hat = 0.0499
## 95% CI: (0.0364 , 0.0634)

# Note that since the goal is to construct an interval estimate for a proportion, it’s necessary to specify what constitutes a `success'', which here is a response ofatheist`.


d <- data.frame(p <- seq(0, 1, 0.01))
n <- 1000
d <- d %>%
  mutate(me = 1.96*sqrt(p*(1 - p)/n))
ggplot(d, aes(x = p, y = me)) +
  geom_line()

#There is convincing evidence that Spain has seen a change in its atheism index between 2005 and 2012. 
spain <- atheism %>%
  filter(nationality == "Spain")

inference(y = response, x = factor(year), data = spain, statistic = "proportion", type = "ht", null = 0,
          alternative = "twosided", method = "theoretical", success = "atheist")
## Response variable: categorical (2 levels, success: atheist)
## Explanatory variable: categorical (2 levels) 
## n_2005 = 1146, p_hat_2005 = 0.1003
## n_2012 = 1145, p_hat_2012 = 0.09
## H0: p_2005 =  p_2012
## HA: p_2005 != p_2012
## z = 0.8476
## p_value = 0.3966

# for usa?
usa <- atheism %>%
  filter(nationality == "United States")

inference(y = response, x = factor(year), data = usa, statistic = "proportion", type = "ht", null = 0,
          alternative = "twosided", method = "theoretical", success = "atheist")
## Response variable: categorical (2 levels, success: atheist)
## Explanatory variable: categorical (2 levels) 
## n_2005 = 1002, p_hat_2005 = 0.01
## n_2012 = 1002, p_hat_2012 = 0.0499
## H0: p_2005 =  p_2012
## HA: p_2005 != p_2012
## z = -5.2431
## p_value = < 0.0001