# Goodness of Fit and Independence test in R- Chi Square test
# By, Eralda Gjika (Dhamo)
# Linkedin: https://al.linkedin.com/in/eralda-dhamo-gjika-71879128
# Department of Applied Mathematics, Faculty of Natural Science, University of Tirana, Albania
#
# For this work I will use the Portuguese Bank Marketing Data.
# http://search.r-project.org/library/regsel/html/bank.html
bank.full <- read.csv("C:/Users/user/Desktop/bank-full.csv", sep=";")
View(bank.full)
#
#######################################################################
# Test of Goodness of Fit and Independence
########################################################################
# The bank dataset is named "bank.full"
# Some of the packages we will need
library(MASS)
library(ISLR)
library(stats4)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library("graphics")
library("vcd")
## Loading required package: grid
##
## Attaching package: 'vcd'
## The following object is masked from 'package:ISLR':
##
## Hitters
library(corrplot)
## corrplot 0.84 loaded
#
# Numerical variables in our dataset are positioned in columns 1, 6 and 12
#
str(bank.full)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : chr "management" "technician" "entrepreneur" "blue-collar" ...
## $ marital : chr "married" "single" "married" "married" ...
## $ education: chr "tertiary" "secondary" "secondary" "unknown" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : chr "yes" "yes" "yes" "yes" ...
## $ loan : chr "no" "no" "yes" "no" ...
## $ contact : chr "unknown" "unknown" "unknown" "unknown" ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : chr "may" "may" "may" "may" ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "unknown" "unknown" "unknown" "unknown" ...
## $ y : chr "no" "no" "no" "no" ...
summary(bank.full)
## age job marital education
## Min. :18.00 Length:45211 Length:45211 Length:45211
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :40.94
## 3rd Qu.:48.00
## Max. :95.00
## default balance housing loan
## Length:45211 Min. : -8019 Length:45211 Length:45211
## Class :character 1st Qu.: 72 Class :character Class :character
## Mode :character Median : 448 Mode :character Mode :character
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
## contact day month duration
## Length:45211 Min. : 1.00 Length:45211 Min. : 0.0
## Class :character 1st Qu.: 8.00 Class :character 1st Qu.: 103.0
## Mode :character Median :16.00 Mode :character Median : 180.0
## Mean :15.81 Mean : 258.2
## 3rd Qu.:21.00 3rd Qu.: 319.0
## Max. :31.00 Max. :4918.0
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.0 Min. : 0.0000 Length:45211
## 1st Qu.: 1.000 1st Qu.: -1.0 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.0 Median : 0.0000 Mode :character
## Mean : 2.764 Mean : 40.2 Mean : 0.5803
## 3rd Qu.: 3.000 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :63.000 Max. :871.0 Max. :275.0000
## y
## Length:45211
## Class :character
## Mode :character
##
##
##
require(corrplot)# correlation package
cordata.2 <- cor(bank.full[,c(1,6,12)])
corrplot.mixed(cordata.2)# Correlation plot and values in a matrix graph

#
# Create a data.frame from the dataset with variables "marital" and "housing"
bank.data_1<- data.frame(bank.full$marital, bank.full$housing)
# Create a table (contigency table)
bank.data.tab1 = table(bank.full$marital, bank.full$housing)
print(bank.data.tab1) # Print the table
##
## no yes
## divorced 2300 2907
## married 11893 15321
## single 5888 6902
#
library("gplots")
# Ballon Graph to understand the proportions of the combined values of the variables "civil status" and "housing"
balloonplot(t(bank.data.tab1), main ="Civil status and housing", xlab ="", ylab="",
label = FALSE, show.margins = FALSE)

#
#
# create a data.frame with variables "education" and "housing"
bank.data_2<- data.frame(bank.full$education, bank.full$housing)
# Create a contigency table
bank.data.tab2 = table(bank.full$education, bank.full$housing)
print(bank.data.tab2) # Print the table
##
## no yes
## primary 2957 3894
## secondary 9164 14038
## tertiary 6923 6378
## unknown 1037 820
#
# Ballon Graph
balloonplot(t(bank.data.tab2), main ="Education and housing", xlab ="", ylab="",
label = FALSE, show.margins = FALSE)

#
# Another graphic presentation ofdata.frame
library("graphics")
mosaicplot(bank.data.tab1, shade = TRUE, las=2,main = "Civil status and housing")

mosaicplot(bank.data.tab2, shade = TRUE, las=2,main = "Education and housing")

# We may agree on the fact that the individuals who have a house loan are mostly with a secondary education
#
# Another mosaic graph
library("vcd")
# If we want to plot just a subset of the table
assoc(head(bank.data.tab2, 3), shade = TRUE, las=3)# only the 3 values of variable "education"

assoc(head(bank.data.tab2, 4), shade = TRUE, las=3) # 4 values of variable "education"

#
# Pearson Residuals are calculated for each cell using the formulae r= (o-e)/sqrt(e)
#############################################################################
# Chi-Square test for independence
###########################################################################
# Attention! Chi-square test is applied only when expected frequency of each cell are at least 5
#
# Independence test for civil status and housing loan.
print(chisq.test(bank.data.tab1))
##
## Pearson's Chi-squared test
##
## data: bank.data.tab1
## X-squared = 19.345, df = 2, p-value = 6.3e-05
# the P-Value < 0.05 so, we accept that there is a strong correlation between the variables, they are dependent.
# Testing: education and housing
print(chisq.test(bank.data.tab2))
##
## Pearson's Chi-squared test
##
## data: bank.data.tab2
## X-squared = 643.89, df = 3, p-value < 2.2e-16
# the P-Value < 0.05, there is a dependency between education and housing
#
#If we want to obtain Observed counts
chisq.test(bank.data.tab2)$observed
##
## no yes
## primary 2957 3894
## secondary 9164 14038
## tertiary 6923 6378
## unknown 1037 820
# Expected counts ( rounded up to 2 decimal)
round(chisq.test(bank.data.tab2)$expected,2)
##
## no yes
## primary 3042.95 3808.05
## secondary 10305.44 12896.56
## tertiary 5907.80 7393.20
## unknown 824.81 1032.19
round(chisq.test(bank.data.tab2)$expected)
##
## no yes
## primary 3043 3808
## secondary 10305 12897
## tertiary 5908 7393
## unknown 825 1032
#
# printing the p-value of the test
chisq.test(bank.data.tab2)$p.value
## [1] 3.078476e-139
# printing the value of Chi-Square statistics
chisq.test(bank.data.tab2)$statistic
## X-squared
## 643.8888
# A graphical view of Pearson residuals using corrplot()
library(corrplot)
corrplot(chisq.test(bank.data.tab2)$residuals, is.cor = FALSE)

round(chisq.test(bank.data.tab2)$residuals, 3)
##
## no yes
## primary -1.558 1.393
## secondary -11.244 10.051
## tertiary 13.208 -11.807
## unknown 7.388 -6.605
#
#The sign of the standardized residuals is very important to interpret the association between rows and columns as explained in the block below.
#Positive residuals are in blue. Positive values in cells specify an attraction (positive association) between the corresponding row and column variables.
#Negative residuals are in red. This implies a repulsion (negative association) between the corresponding row and column variables.
#
# contribution (in %) of a given cell to the total Chi-square score is calculated as follow:
# contrib=r2/??2 where r- is the residual of the cell and ??2 -is the value of the statistics
#
# Contibution in percentage (%)
contribution <- 100*chisq.test(bank.data.tab2)$residuals^2/chisq.test(bank.data.tab2)$statistic
round(contribution, 3)
##
## no yes
## primary 0.377 0.301
## secondary 19.635 15.690
## tertiary 27.094 21.650
## unknown 8.478 6.775
# Visualize the contribution
corrplot(contribution, is.cor = FALSE)

# The relative contribution of each cell to the total Chi-square score give some indication of the nature of the dependency between rows and columns of the contingency table.
#
# Thank you for reading , using and sharing!
# E. Gjika