#  Goodness of Fit and Independence test in R- Chi Square test
# By, Eralda Gjika (Dhamo)
# Linkedin: https://al.linkedin.com/in/eralda-dhamo-gjika-71879128
# Department of Applied Mathematics, Faculty of Natural Science, University of Tirana, Albania
# 
# For this work I will use the Portuguese Bank Marketing Data.
# http://search.r-project.org/library/regsel/html/bank.html
bank.full <- read.csv("C:/Users/user/Desktop/bank-full.csv", sep=";")
View(bank.full)
#
#######################################################################
#              Test of Goodness of Fit and Independence
########################################################################
#  The bank dataset is named "bank.full"
# Some of the packages we will need 
library(MASS)
library(ISLR)
library(stats4)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(gplots) 
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library("graphics")
library("vcd")
## Loading required package: grid
## 
## Attaching package: 'vcd'
## The following object is masked from 'package:ISLR':
## 
##     Hitters
library(corrplot)
## corrplot 0.84 loaded
#
# Numerical variables in our dataset are positioned in columns 1, 6 and 12
#
str(bank.full)
## 'data.frame':    45211 obs. of  17 variables:
##  $ age      : int  58 44 33 47 33 35 28 42 58 43 ...
##  $ job      : chr  "management" "technician" "entrepreneur" "blue-collar" ...
##  $ marital  : chr  "married" "single" "married" "married" ...
##  $ education: chr  "tertiary" "secondary" "secondary" "unknown" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  2143 29 2 1506 1 231 447 2 121 593 ...
##  $ housing  : chr  "yes" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "no" "yes" "no" ...
##  $ contact  : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ day      : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ month    : chr  "may" "may" "may" "may" ...
##  $ duration : int  261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ y        : chr  "no" "no" "no" "no" ...
summary(bank.full)
##       age            job              marital           education        
##  Min.   :18.00   Length:45211       Length:45211       Length:45211      
##  1st Qu.:33.00   Class :character   Class :character   Class :character  
##  Median :39.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.94                                                           
##  3rd Qu.:48.00                                                           
##  Max.   :95.00                                                           
##    default             balance         housing              loan          
##  Length:45211       Min.   : -8019   Length:45211       Length:45211      
##  Class :character   1st Qu.:    72   Class :character   Class :character  
##  Mode  :character   Median :   448   Mode  :character   Mode  :character  
##                     Mean   :  1362                                        
##                     3rd Qu.:  1428                                        
##                     Max.   :102127                                        
##    contact               day           month              duration     
##  Length:45211       Min.   : 1.00   Length:45211       Min.   :   0.0  
##  Class :character   1st Qu.: 8.00   Class :character   1st Qu.: 103.0  
##  Mode  :character   Median :16.00   Mode  :character   Median : 180.0  
##                     Mean   :15.81                      Mean   : 258.2  
##                     3rd Qu.:21.00                      3rd Qu.: 319.0  
##                     Max.   :31.00                      Max.   :4918.0  
##     campaign          pdays          previous          poutcome        
##  Min.   : 1.000   Min.   : -1.0   Min.   :  0.0000   Length:45211      
##  1st Qu.: 1.000   1st Qu.: -1.0   1st Qu.:  0.0000   Class :character  
##  Median : 2.000   Median : -1.0   Median :  0.0000   Mode  :character  
##  Mean   : 2.764   Mean   : 40.2   Mean   :  0.5803                     
##  3rd Qu.: 3.000   3rd Qu.: -1.0   3rd Qu.:  0.0000                     
##  Max.   :63.000   Max.   :871.0   Max.   :275.0000                     
##       y            
##  Length:45211      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
require(corrplot)# correlation package
cordata.2 <- cor(bank.full[,c(1,6,12)])
corrplot.mixed(cordata.2)# Correlation plot and values in a matrix graph 

#
# Create a data.frame from the dataset with variables "marital" and "housing"
bank.data_1<- data.frame(bank.full$marital, bank.full$housing)
# Create a table (contigency table) 
bank.data.tab1 = table(bank.full$marital, bank.full$housing) 
print(bank.data.tab1) # Print the table 
##           
##               no   yes
##   divorced  2300  2907
##   married  11893 15321
##   single    5888  6902
#
library("gplots")
# Ballon Graph to understand the proportions of the combined values of the variables "civil status" and "housing"
balloonplot(t(bank.data.tab1), main ="Civil status and housing", xlab ="", ylab="",
            label = FALSE, show.margins = FALSE)

#
#
# create a data.frame with variables "education" and "housing"
bank.data_2<- data.frame(bank.full$education, bank.full$housing)
# Create a contigency table
bank.data.tab2 = table(bank.full$education, bank.full$housing) 
print(bank.data.tab2) # Print the table
##            
##                no   yes
##   primary    2957  3894
##   secondary  9164 14038
##   tertiary   6923  6378
##   unknown    1037   820
#
# Ballon Graph
balloonplot(t(bank.data.tab2), main ="Education and housing", xlab ="", ylab="",
            label = FALSE, show.margins = FALSE)

#
# Another graphic presentation ofdata.frame 
library("graphics")
mosaicplot(bank.data.tab1, shade = TRUE, las=2,main = "Civil status and housing")

mosaicplot(bank.data.tab2, shade = TRUE, las=2,main = "Education and housing")

# We may agree on the fact that the individuals who have a house loan are mostly with a secondary education
#
# Another mosaic graph
library("vcd")
# If we want to plot just a subset of the table
assoc(head(bank.data.tab2, 3), shade = TRUE, las=3)# only the 3 values of variable "education"

assoc(head(bank.data.tab2, 4), shade = TRUE, las=3) # 4 values of variable "education"

#
# Pearson Residuals are calculated for each cell using the formulae r= (o-e)/sqrt(e)
#############################################################################
#               Chi-Square test for independence
###########################################################################   
# Attention! Chi-square test is applied only when expected frequency of each cell are at least 5
# 
# Independence test for civil status and housing loan.
print(chisq.test(bank.data.tab1))
## 
##  Pearson's Chi-squared test
## 
## data:  bank.data.tab1
## X-squared = 19.345, df = 2, p-value = 6.3e-05
# the P-Value < 0.05 so, we accept that there is a strong correlation between the variables, they are dependent.
# Testing: education and housing
print(chisq.test(bank.data.tab2))
## 
##  Pearson's Chi-squared test
## 
## data:  bank.data.tab2
## X-squared = 643.89, df = 3, p-value < 2.2e-16
# the P-Value < 0.05, there is a dependency between education and housing 
#
#If we want to obtain Observed counts
chisq.test(bank.data.tab2)$observed
##            
##                no   yes
##   primary    2957  3894
##   secondary  9164 14038
##   tertiary   6923  6378
##   unknown    1037   820
# Expected counts ( rounded up to 2 decimal)
round(chisq.test(bank.data.tab2)$expected,2)
##            
##                   no      yes
##   primary    3042.95  3808.05
##   secondary 10305.44 12896.56
##   tertiary   5907.80  7393.20
##   unknown     824.81  1032.19
round(chisq.test(bank.data.tab2)$expected)
##            
##                no   yes
##   primary    3043  3808
##   secondary 10305 12897
##   tertiary   5908  7393
##   unknown     825  1032
#
# printing the p-value of the test
chisq.test(bank.data.tab2)$p.value
## [1] 3.078476e-139
# printing the value of Chi-Square statistics
chisq.test(bank.data.tab2)$statistic
## X-squared 
##  643.8888
# A graphical view of Pearson residuals using corrplot()
library(corrplot)
corrplot(chisq.test(bank.data.tab2)$residuals, is.cor = FALSE)

round(chisq.test(bank.data.tab2)$residuals, 3)
##            
##                  no     yes
##   primary    -1.558   1.393
##   secondary -11.244  10.051
##   tertiary   13.208 -11.807
##   unknown     7.388  -6.605
#
#The sign of the standardized residuals is very important to interpret the association between rows and columns as explained in the block below.
#Positive residuals are in blue. Positive values in cells specify an attraction (positive association) between the corresponding row and column variables.
#Negative residuals are in red. This implies a repulsion (negative association) between the corresponding row and column variables.
#

# contribution (in %) of a given cell to the total Chi-square score is calculated as follow:
# contrib=r2/??2  where r- is the residual of the cell and ??2 -is the value of the statistics
#
# Contibution in percentage (%)
contribution <- 100*chisq.test(bank.data.tab2)$residuals^2/chisq.test(bank.data.tab2)$statistic
round(contribution, 3)
##            
##                 no    yes
##   primary    0.377  0.301
##   secondary 19.635 15.690
##   tertiary  27.094 21.650
##   unknown    8.478  6.775
# Visualize the contribution
corrplot(contribution, is.cor = FALSE)

# The relative contribution of each cell to the total Chi-square score give some indication of the nature of the dependency between rows and columns of the contingency table.
#
# Thank you for reading , using and sharing!
# E. Gjika