# Run this script in a fresh R session instead of calling rm(list = ls()).
# That call deletes whatever objects the caller had in the global environment,
# yet leaves attached packages and options() untouched, so it never gave a
# truly clean state. Every object used below is created by this script.
library(descr)
library(readr)
library(desc)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:desc':
##
## desc
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(tidyr)
library(lsr)
library(moments)
library(BSDA)
## Loading required package: lattice
##
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
##
## Orange
# Load the sales transactions CSV.
# NOTE(review): srcFdr is a machine-specific absolute Windows path; confirm it
# points at the local copy of the data before running.
srcFdr <- "D:\\D Drive\\Certificate Course\\data"
fileNm <- "Sales_Tran_2024.csv"
# Join folder and file name with a backslash separator.
srcFile <- file.path(srcFdr, fileNm, fsep = "\\")
sales <- read_csv(srcFile)
## Rows: 630 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): Cust_id, Crd_Typ, prd_type, Str_id
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the column names that were read from the CSV.
colnames(sales)
## [1] "Cust_id" "Crd_Typ" "prd_type" "Str_id"
# Data preparation: replace the numeric codes in prd_type and Crd_Typ with
# labelled factors. The two recodes do not depend on each other, so they are
# done in a single mutate() call.
salesCln <- sales %>%
  mutate(
    prd_type = recode_factor(
      prd_type,
      "1" = "Electronics",
      "2" = "Fashion",
      "3" = "HD"
    ),
    Crd_Typ = recode_factor(
      Crd_Typ,
      "1" = "Credit",
      "2" = "Debit",
      "3" = "PayPal"
    )
  )
# Per-column summary of the cleaned data.
summary(salesCln)
## Cust_id Crd_Typ prd_type Str_id
## Min. : 2.0 Credit:200 Electronics:200 Min. :1.000
## 1st Qu.:268.2 Debit :230 Fashion :200 1st Qu.:2.000
## Median :519.0 PayPal:200 HD :230 Median :3.000
## Mean :508.5 Mean :3.037
## 3rd Qu.:745.5 3rd Qu.:4.000
## Max. :998.0 Max. :5.000
# Count transactions per card type and product type, then widen the result so
# each product type becomes its own column (one row per card type).
# pivot_wider() replaces the superseded spread(); names_sort = TRUE orders the
# new columns by the levels of prd_type rather than by first appearance.
salesCln %>%
  group_by(Crd_Typ, prd_type) %>%
  summarize(cnt = n()) %>%
  pivot_wider(names_from = prd_type, values_from = cnt, names_sort = TRUE)
## `summarise()` has grouped output by 'Crd_Typ'. You can override using the
## `.groups` argument.
## # A tibble: 3 × 4
## # Groups: Crd_Typ [3]
## Crd_Typ Electronics Fashion HD
## <fct> <int> <int> <int>
## 1 Credit 80 50 70
## 2 Debit 70 80 80
## 3 PayPal 50 70 80
# Proportions with respect to all data: each cell divided by the grand total.
with(salesCln, table(card_Type = Crd_Typ, product_type = prd_type)) %>%
  prop.table()
## product_type
## card_Type Electronics Fashion HD
## Credit 0.12698413 0.07936508 0.11111111
## Debit 0.11111111 0.12698413 0.12698413
## PayPal 0.07936508 0.11111111 0.12698413
# Proportions with respect to the row sum: margin = 1 makes each card-type row
# sum to 1.
with(salesCln, table(card_Type = Crd_Typ, product_type = prd_type)) %>%
  prop.table(margin = 1)
## product_type
## card_Type Electronics Fashion HD
## Credit 0.4000000 0.2500000 0.3500000
## Debit 0.3043478 0.3478261 0.3478261
## PayPal 0.2500000 0.3500000 0.4000000
# Proportions with respect to the column sum: margin = 2 makes each
# product-type column sum to 1.
with(salesCln, table(card_Type = Crd_Typ, product_type = prd_type)) %>%
  prop.table(margin = 2)
## product_type
## card_Type Electronics Fashion HD
## Credit 0.4000000 0.2500000 0.3043478
## Debit 0.3500000 0.4000000 0.3478261
## PayPal 0.2500000 0.3500000 0.3478261
# Bar chart: for every product type, the percentage of its transactions made
# with each card type, drawn as side-by-side bars.
salesCln %>%
  group_by(Crd_Typ, prd_type) %>%
  summarise(cnt = n()) %>%
  # Regroup by product type so the percentages add up to 100 within a product.
  group_by(prd_type) %>%
  mutate(perc = 100 * cnt / sum(cnt)) %>%
  ggplot(mapping = aes(x = prd_type, y = perc, fill = Crd_Typ)) +
  geom_col(position = "dodge")
## `summarise()` has grouped output by 'Crd_Typ'. You can override using the
## `.groups` argument.

# Pearson chi-squared test of independence between card type and product type.
chisq.test(x = salesCln$Crd_Typ, y = salesCln$prd_type)
##
## Pearson's Chi-squared test
##
## data: salesCln$Crd_Typ and salesCln$prd_type
## X-squared = 12.466, df = 4, p-value = 0.0142
# Chi-squared density values for several degrees of freedom.
# All four data frames share one grid of x values: 0 to 20 in steps of 0.01.
x <- seq(from = 0, to = 20, by = 0.01)
df1 <- data.frame(x, prob = dchisq(x, df = 2))
df2 <- data.frame(x, prob = dchisq(x, df = 4))
df3 <- data.frame(x, prob = dchisq(x, df = 8))
df4 <- data.frame(x, prob = dchisq(x, df = 12))
# Leave `prob` holding the df = 12 densities in case later code reads it.
prob <- df4$prob
# Plot the chi-squared density for each degrees-of-freedom setting.

# Helper: scatter plot of `prob` against `x` from one density data frame,
# with shared axis labels and the given title.
plot_chisq_density <- function(dens, title) {
  ggplot(dens, aes(x = x, y = prob)) +
    geom_point() +
    labs(
      x = "chi squared value", # label typo ("sqared") corrected
      y = "probability density",
      title = title
    )
}

p1 <- plot_chisq_density(df1, "df=2")
p2 <- plot_chisq_density(df2, "df=4")
p3 <- plot_chisq_density(df3, "df=8")
p4 <- plot_chisq_density(df4, "df=12")

# Show all four densities in a 2 x 2 grid.
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

# Show the df = 4 plot on its own.
p2

# Upper-tail probability of a chi-squared value of 12.466 on 4 degrees of
# freedom.
pchisq(q = 12.466, df = 4, lower.tail = FALSE)
## [1] 0.01420237
# Cross table of card type (rows) by product type (columns).
# Switched on: expected counts, standardized residuals and the chi-squared
# test. Switched off: column proportions, table proportions and per-cell
# chi-squared contributions.
CrossTable(
  salesCln$Crd_Typ,
  salesCln$prd_type,
  expected   = TRUE,
  sresid     = TRUE,
  chisq      = TRUE,
  prop.c     = FALSE,
  prop.t     = FALSE,
  prop.chisq = FALSE
)
## Cell Contents
## |-------------------------|
## | N |
## | Expected N |
## | N / Row Total |
## | Std Residual |
## |-------------------------|
##
## ==========================================================
## salesCln$prd_type
## salesCln$Crd_Typ Electronics Fashion HD Total
## ----------------------------------------------------------
## Credit 80 50 70 200
## 63.5 63.5 73.0
## 0.400 0.250 0.350 0.317
## 2.072 -1.693 -0.353
## ----------------------------------------------------------
## Debit 70 80 80 230
## 73.0 73.0 84.0
## 0.304 0.348 0.348 0.365
## -0.353 0.817 -0.433
## ----------------------------------------------------------
## PayPal 50 70 80 200
## 63.5 63.5 73.0
## 0.250 0.350 0.400 0.317
## -1.693 0.817 0.817
## ----------------------------------------------------------
## Total 200 200 230 630
## ==========================================================
##
## Statistics for All Table Factors
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 12.46602 d.f. = 4 p = 0.0142
# Cramer's V for the card type / product type pair.
with(salesCln, cramersV(Crd_Typ, prd_type))
## [1] 0.09946692