library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Load the customer survey responses from the CSV in the local Downloads folder.
Customer <- readr::read_csv(file = "~/Downloads/Customer.csv")
## Parsed with column specification:
## cols(
##   agegroup = col_double(),
##   Satisfaction = col_double(),
##   recommend = col_double()
## )
# Preview the first six rows to sanity-check the import.
head(Customer, n = 6)
## # A tibble: 6 x 3
##   agegroup Satisfaction recommend
##      <dbl>        <dbl>     <dbl>
## 1        1            1         9
## 2        1            1         9
## 3        2            1        10
## 4        2            1         9
## 5        2            1        10
## 6        2            2         4

Data Prep

# Recode the numeric survey codes into readable labels and clean the score:
#   agegroup:     1 -> "Children Under age 7", 2 -> "Children Over age 7"
#   Satisfaction: 1 -> "Satisfied",            2 -> "Dissatisfied"
#   recommend:    values above 10 become NA
#                 (presumably the scale tops out at 10 -- confirm with the survey)
# Any other code becomes NA. case_when() replaces the nested ifelse() calls so
# the recoded columns are always character, even when no row matches a code.
Customer <- Customer %>%
  mutate(
    agegroup = case_when(
      agegroup == 1 ~ "Children Under age 7",
      agegroup == 2 ~ "Children Over age 7",
      TRUE ~ NA_character_
    ),
    Satisfaction = case_when(
      Satisfaction == 1 ~ "Satisfied",
      Satisfaction == 2 ~ "Dissatisfied",
      TRUE ~ NA_character_
    ),
    recommend = ifelse(recommend > 10, NA, recommend)
  ) %>%
  select(recommend, Satisfaction, agegroup) %>%
  # Drop respondents whose age group could not be recoded.
  filter(agegroup %in% c("Children Under age 7", "Children Over age 7"))

1. % of Respondents by age

# Share of respondents in each age group, rounded to two decimals.
round(prop.table(table(Customer$agegroup)), digits = 2)
## 
##  Children Over age 7 Children Under age 7 
##                 0.74                 0.26

1B. % of Respondents by Satisfaction

# Share of respondents at each satisfaction level, rounded to two decimals.
round(prop.table(table(Customer$Satisfaction)), digits = 2)
## 
## Dissatisfied    Satisfied 
##         0.08         0.92

2. Null Hypothesis (expected proportions if satisfaction is independent of age group)

Children Over age 7: .74 * .08 = .06 Dissatisfied; .74 * .92 = .68 Satisfied

Children Under age 7: .26 * .08 = .02 Dissatisfied; .26 * .92 = .24 Satisfied

3. Actual Observation

# Cross-tabulate age group (rows) against satisfaction (columns) and convert
# the counts to row proportions, so each age group's row sums to 1.
prop.table(
  table(Customer$agegroup, Customer$Satisfaction),
  margin = 1
)
##                       
##                        Dissatisfied  Satisfied
##   Children Over age 7    0.09701493 0.90298507
##   Children Under age 7   0.04519774 0.95480226

4. Chi-squared test for independence

# Pearson chi-squared test of independence between satisfaction and age group.
# `correct = TRUE` is the default (continuity correction for 2 x 2 tables),
# spelled out here for clarity.
chisq.test(
  x = Customer$Satisfaction,
  y = Customer$agegroup,
  correct = TRUE
)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Customer$Satisfaction and Customer$agegroup
## X-squared = 8.4154, df = 1, p-value = 0.003721
# Large scipen penalty: print numbers in fixed rather than scientific notation
# from this point on.
options(scipen = 9999)

T test

# Mean recommend score per age group, ignoring missing scores.
summarise(
  group_by(Customer, agegroup),
  recommend = mean(recommend, na.rm = TRUE)
)
## # A tibble: 2 x 2
##   agegroup             recommend
##   <chr>                    <dbl>
## 1 Children Over age 7       8.90
## 2 Children Under age 7      9.17

Population Distribution

# Histogram of recommend scores, one panel per age group, filled by age group.
ggplot(
  data = filter(
    Customer,
    agegroup %in% c("Children Under age 7", "Children Over age 7")
  ),
  mapping = aes(x = recommend, fill = agegroup)
) +
  geom_histogram() +
  facet_wrap(~agegroup)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 33 rows containing non-finite values (stat_bin).

Sampling

# Fix the RNG seed so the simulated sampling distributions below are
# reproducible from run to run.
set.seed(2020)

# Simulate a sampling distribution of the mean.
#
# Draws `n_draws` random samples of `sample_size` values from `scores`
# (without replacement) and returns a data frame with a single column `mean`
# holding the mean of each sample. Missing scores are dropped when averaging,
# so a sample that contains NAs is averaged over fewer than `sample_size`
# values.
simulate_sample_means <- function(scores, n_draws = 10000, sample_size = 40) {
  # Guard against a group too small to sample from without replacement.
  stopifnot(length(scores) >= sample_size)
  data.frame(
    mean = replicate(
      n_draws,
      mean(sample(scores, sample_size), na.rm = TRUE)
    )
  )
}

# Sampling distribution of the mean recommend score, children under age 7.
Customer_under <- Customer %>%
  filter(agegroup == "Children Under age 7") %>%
  pull(recommend) %>%
  simulate_sample_means()

# Sampling distribution of the mean recommend score, children over age 7.
Customer_over <- Customer %>%
  filter(agegroup == "Children Over age 7") %>%
  pull(recommend) %>%
  simulate_sample_means()
# Overlay the two sampling distributions of the mean. Both layers are
# semi-transparent so the second histogram does not hide the first where the
# distributions overlap.
ggplot() +
  geom_histogram(
    data = Customer_under,
    aes(x = mean),
    fill = "lavender",
    alpha = 0.6
  ) +
  geom_histogram(
    data = Customer_over,
    aes(x = mean),
    fill = "pink",
    alpha = 0.6
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Null Hypothesis: both age groups share the overall mean recommend score

# Overall mean recommend score across all respondents, ignoring missing scores.
summarise(Customer, recommend = mean(recommend, na.rm = TRUE))
## # A tibble: 1 x 1
##   recommend
##       <dbl>
## 1      8.97

T-Test

# Welch two-sample t-test comparing mean recommend score between age groups.
# The arguments after `data` are the defaults, spelled out for clarity.
t.test(
  recommend ~ agegroup,
  data = Customer,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  recommend by agegroup
## t = -3.4958, df = 901.74, p-value = 0.0004956
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4157539 -0.1167816
## sample estimates:
##  mean in group Children Over age 7 mean in group Children Under age 7 
##                           8.904187                           9.170455
# Large scipen penalty: print numbers in fixed rather than scientific notation
# from this point on.
options(scipen = 9999)