Data Analysis using R

chi-squared test

Day6

———————————————————————–

# Remove every visible object from the current environment so the analysis
# starts from an empty workspace.
# NOTE(review): this wipes the caller's session objects as a side effect and
# does not detach packages or reset options; restarting R gives a cleaner
# slate - consider dropping this line.
rm(list=ls())
library(descr)
library(readr)
library(desc)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:desc':
## 
##     desc

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(tidyr)
library(lsr)
library(moments)
library(BSDA)

## Loading required package: lattice

## 
## Attaching package: 'BSDA'

## The following object is masked from 'package:datasets':
## 
##     Orange

# Load the sales transactions.
# Folder and file name are kept in separate variables so either can be changed
# on its own. file.path() joins them, replacing the hand-pasted "\\" separator,
# so the join no longer hard-codes the Windows path separator.
srcFdr <- "D:\\D Drive\\Certificate Course\\data"
fileNm <- "Sales_Tran_2024.csv"
srcFile <- file.path(srcFdr, fileNm)
# read_csv() guesses the column types and reports them as a message.
sales <- read_csv(srcFile)

## Rows: 630 Columns: 4

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): Cust_id, Crd_Typ, prd_type, Str_id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

names(sales)

## [1] "Cust_id"  "Crd_Typ"  "prd_type" "Str_id"

# Data preparation
# Crd_Typ and prd_type are read as numeric codes (1, 2, 3). Both are converted
# to labelled factors in a single mutate(): factor() maps each code in `levels`
# to the label at the same position in `labels`, and the level order follows
# the order written here. Any code outside 1-3 becomes NA.
# factor() replaces the superseded recode_factor(), which matched the numeric
# codes through their string names.
salesCln <- sales %>%
  mutate(
    prd_type = factor(prd_type,
                      levels = c(1, 2, 3),
                      labels = c("Electronics", "Fashion", "HD")),
    Crd_Typ = factor(Crd_Typ,
                     levels = c(1, 2, 3),
                     labels = c("Credit", "Debit", "PayPal"))
  )
# Sanity check of the result: per-level counts for the two factors and
# numeric summaries for the id columns.
summary(salesCln)

##     Cust_id        Crd_Typ           prd_type       Str_id     
##  Min.   :  2.0   Credit:200   Electronics:200   Min.   :1.000  
##  1st Qu.:268.2   Debit :230   Fashion    :200   1st Qu.:2.000  
##  Median :519.0   PayPal:200   HD         :230   Median :3.000  
##  Mean   :508.5                                  Mean   :3.037  
##  3rd Qu.:745.5                                  3rd Qu.:4.000  
##  Max.   :998.0                                  Max.   :5.000

# Cross-tabulate card type (rows) against product type (columns).
# Count the transactions for every card/product pair, then widen the counts so
# that each product type becomes its own column. pivot_wider() replaces the
# superseded spread(); the grouping by Crd_Typ left by summarize() is kept.
salesCln %>%
  group_by(Crd_Typ, prd_type) %>%
  summarize(cnt = n()) %>%
  pivot_wider(names_from = prd_type, values_from = cnt)

## `summarise()` has grouped output by 'Crd_Typ'. You can override using the
## `.groups` argument.

## # A tibble: 3 × 4
## # Groups:   Crd_Typ [3]
##   Crd_Typ Electronics Fashion    HD
##   <fct>         <int>   <int> <int>
## 1 Credit           80      50    70
## 2 Debit            70      80    80
## 3 PayPal           50      70    80

# Joint proportions: every cell count divided by the grand total, so the
# whole table sums to 1.
salesCln %>%
  with(table(card_Type = Crd_Typ, product_type = prd_type)) %>%
  prop.table()

##          product_type
## card_Type Electronics    Fashion         HD
##    Credit  0.12698413 0.07936508 0.11111111
##    Debit   0.11111111 0.12698413 0.12698413
##    PayPal  0.07936508 0.11111111 0.12698413

# Row proportions: each cell divided by its card-type (row) total, so every
# row sums to 1.
salesCln %>%
  with(table(card_Type = Crd_Typ, product_type = prd_type)) %>%
  prop.table(margin = 1)

##          product_type
## card_Type Electronics   Fashion        HD
##    Credit   0.4000000 0.2500000 0.3500000
##    Debit    0.3043478 0.3478261 0.3478261
##    PayPal   0.2500000 0.3500000 0.4000000

# Column proportions: each cell divided by its product-type (column) total,
# so every column sums to 1.
salesCln %>%
  with(table(card_Type = Crd_Typ, product_type = prd_type)) %>%
  prop.table(margin = 2)

##          product_type
## card_Type Electronics   Fashion        HD
##    Credit   0.4000000 0.2500000 0.3043478
##    Debit    0.3500000 0.4000000 0.3478261
##    PayPal   0.2500000 0.3500000 0.3478261

# Bar chart: within each product type, the percentage share of every card type.
# Count the card/product pairs, regroup by product so the percentages add up
# to 100 inside a product, then draw the card types as side-by-side bars.
# geom_col() draws bars whose heights are the y values as given.
salesCln %>%
  group_by(Crd_Typ, prd_type) %>%
  summarize(cnt = n()) %>%
  group_by(prd_type) %>%
  mutate(perc = 100 * cnt / sum(cnt)) %>%
  ggplot(aes(x = prd_type, y = perc, fill = Crd_Typ)) +
  geom_col(position = "dodge")

## `summarise()` has grouped output by 'Crd_Typ'. You can override using the
## `.groups` argument.

# Chi-squared test of independence between card type and product type.
# The two factor columns are passed as vectors; chisq.test() cross-tabulates
# them and reports Pearson's statistic, its degrees of freedom and the p-value.
# The argument expressions are echoed in the "data:" line of the printed result.
chisq.test(salesCln$Crd_Typ,salesCln$prd_type)

## 
##  Pearson's Chi-squared test
## 
## data:  salesCln$Crd_Typ and salesCln$prd_type
## X-squared = 12.466, df = 4, p-value = 0.0142

# Chi-squared density for several degrees of freedom.
# The same grid of x values is evaluated for df = 2, 4, 8 and 12, one panel is
# drawn per df, and the four panels are arranged in a 2 x 2 grid.
x <- seq(0, 20, by = 0.01)

# Data frame with columns `x` and `prob`: the chi-squared density with `df`
# degrees of freedom evaluated at every point of `grid`.
chisq_density_df <- function(df, grid = x) {
  data.frame(x = grid, prob = dchisq(grid, df))
}

# One density panel for the data frame `dens` (columns x, prob), titled with
# the degrees of freedom it was computed for.
chisq_density_plot <- function(dens, df) {
  ggplot(dens, aes(x = x, y = prob)) +
    geom_point() +
    labs(x = "chi-squared value", y = "probability density",
         title = paste0("df=", df))
}

df1 <- chisq_density_df(2)
df2 <- chisq_density_df(4)
df3 <- chisq_density_df(8)
df4 <- chisq_density_df(12)

p1 <- chisq_density_plot(df1, 2)
p2 <- chisq_density_plot(df2, 4)
p3 <- chisq_density_plot(df3, 8)
p4 <- chisq_density_plot(df4, 12)

grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

# Show the df = 4 panel on its own: 4 is the degrees of freedom reported by
# the independence test on the 3 x 3 card-type by product-type table.
p2

# Upper-tail probability of a chi-squared variable with 4 degrees of freedom
# at the observed statistic 12.466, i.e. the p-value of the test.
pchisq(q = 12.466, df = 4, lower.tail = FALSE)

## [1] 0.01420237

# Cross table of card type (rows) by product type (columns).
# Column, table and per-cell chi-squared proportions are switched off
# (prop.c, prop.t, prop.chisq), while expected counts (expected) and
# standardized residuals (sresid) are switched on, so each printed cell holds:
# N, expected N, N / row total, standardized residual.
# chisq = TRUE appends Pearson's chi-squared test below the table.
# The argument expressions are used as the row/column headings of the output.
CrossTable(salesCln$Crd_Typ,
           salesCln$prd_type,
           prop.c = FALSE,
           prop.t = FALSE,
           prop.chisq = FALSE,
           expected=TRUE,
           sresid = TRUE,
           chisq = TRUE)

##    Cell Contents 
## |-------------------------|
## |                       N | 
## |              Expected N | 
## |           N / Row Total | 
## |            Std Residual | 
## |-------------------------|
## 
## ==========================================================
##                     salesCln$prd_type
## salesCln$Crd_Typ    Electronics   Fashion       HD   Total
## ----------------------------------------------------------
## Credit                       80        50       70     200
##                            63.5      63.5     73.0        
##                           0.400     0.250    0.350   0.317
##                           2.072    -1.693   -0.353        
## ----------------------------------------------------------
## Debit                        70        80       80     230
##                            73.0      73.0     84.0        
##                           0.304     0.348    0.348   0.365
##                          -0.353     0.817   -0.433        
## ----------------------------------------------------------
## PayPal                       50        70       80     200
##                            63.5      63.5     73.0        
##                           0.250     0.350    0.400   0.317
##                          -1.693     0.817    0.817        
## ----------------------------------------------------------
## Total                       200       200      230     630
## ==========================================================
## 
## Statistics for All Table Factors
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 = 12.46602      d.f. = 4      p = 0.0142

# Effect size of the association between card type and product type.
# Cramer's V = sqrt(chi^2 / (n * (k - 1))), with k the smaller table dimension;
# here sqrt(12.466 / (630 * 2)), roughly 0.0995.
with(salesCln, cramersV(Crd_Typ, prd_type))

## [1] 0.09946692