Two-Proportion Test and Chi-Square Test Using R

Two-Proportion Test

# Read the dataset from Excel
library(readxl)
sales_data <- read_excel("C:\\Users\\samy_\\Desktop\\R_Python_Machine Learning DataSets\\Adults_Children_Sales.xlsx")
# str(sales_data)   # Structure of data

# Find frequencies of purchased and did not purchase.
# Columns are referenced through sales_data rather than attach(), so no
# names are added to the search path. table() leaves out missing values.
sales_adult_tb <- table(Adults = sales_data$Adults)
sales_children_tb <- table(Children = sales_data$Children)

# View(sales_adult_tb)
# View(sales_children_tb)

# 2 Proportion Test
# Successes are looked up by label instead of by position: position 2 is
# "Purchased" only because table() happens to sort the labels alphabetically.
# `[[` fails loudly if the label is absent from a column.
purchased_counts <- c(
  sales_adult_tb[["Purchased"]],
  sales_children_tb[["Purchased"]]
)
# Trials are taken from the same tables so that numerators and denominators
# both exclude missing values (length() would have counted them).
group_sizes <- c(sum(sales_adult_tb), sum(sales_children_tb))
prop.test(purchased_counts, group_sizes)

## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  purchased_counts out of group_sizes
## X-squared = 1.0064, df = 1, p-value = 0.3158
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.33394732  0.09394732
## sample estimates:
## prop 1 prop 2 
##   0.40   0.52

# Method 2: 2 Proportion Test
# Reshape to long form (values = outcome label, ind = group) and
# cross-tabulate outcome by group.
stacked_sales <- stack(sales_data)
sales_freq <- table(stacked_sales)
sales_freq

##                   ind
## values             Adults Children
##   Did not Purchase     30       24
##   Purchased            20       26

# Proportion table (margin 2: each group's column sums to 1)
prop.table(sales_freq, 2)

##                   ind
## values             Adults Children
##   Did not Purchase   0.60     0.48
##   Purchased          0.40     0.52

# Test result
# prop.test() on a matrix treats every row as a group and the two columns
# as successes and failures. sales_freq has outcomes in rows and groups in
# columns, so passing it directly compares the share of Adults among
# non-buyers and buyers instead of the purchase rates of Adults vs Children.
# Transpose it and put "Purchased" first so it is counted as the success.
purchase_by_group <- t(sales_freq)[, c("Purchased", "Did not Purchase")]
prop.test(purchase_by_group, correct = TRUE)

## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  purchase_by_group
## X-squared = 1.0064, df = 1, p-value = 0.3158
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.33394732  0.09394732
## sample estimates:
## prop 1 prop 2 
##   0.40   0.52

Chi-Square Test

# Read the dataset from Excel
imrb_research <- read_excel("C:\\Users\\samy_\\Desktop\\R_Python_Machine Learning DataSets\\imrb_research.xlsx")
# Keep only the first four columns (the country indicators shown below).
imrb_research <- imrb_research[, 1:4]
str(imrb_research)

## Classes 'tbl_df', 'tbl' and 'data.frame':    49 obs. of  4 variables:
##  $ India     : num  1 0 0 0 0 0 1 1 1 1 ...
##  $ China     : num  0 1 0 1 0 0 1 1 1 1 ...
##  $ Srilanka  : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ Bangladesh: num  0 1 1 1 1 1 1 1 1 1 ...

# No attach() here: every step below works on imrb_research directly, so
# attaching it would only add masking-prone names to the search path.

# Reshape to long form (values = 0/1 response, ind = country) and
# cross-tabulate response by country.
stack_imrb <- stack(imrb_research)
imrb_freq <- table(stack_imrb)
imrb_freq

##       ind
## values India China Srilanka Bangladesh
##      0    28    30       25         17
##      1    21    19       24         32

# Column-wise proportions (margin 2: each country's column sums to 1)
prop.table(imrb_freq, 2)

##       ind
## values     India     China  Srilanka Bangladesh
##      0 0.5714286 0.6122449 0.5102041  0.3469388
##      1 0.4285714 0.3877551 0.4897959  0.6530612

# Pearson chi-squared test on the 2 x 4 response-by-country table.
# NOTE(review): if each of the 49 rows is one respondent answering for all
# four countries, the columns are paired rather than independent samples;
# confirm that this test's independence assumption is acceptable.
chisq.test(imrb_freq)

## 
##  Pearson's Chi-squared test
## 
## data:  imrb_freq
## X-squared = 8.0033, df = 3, p-value = 0.04594