################################################################################
################################################################################
###                                                                          ###
###             STAT 230, DATA MINING MINI PROJECT - 2023                    ###
###                                                                          ###
###      PREPARED BY:                                                        ###
###            NAMES                    IDS                                  ###
###      GERSHON TETTEH AMANOR        10967696                               ###
###      DOE OWEN BUERTEY               10972098                               ###
###      NANA ASIEDU AKROFI           10986678                               ###
###      EVORDA KOKOU JOHN            10971696                               ###
###      NKRUMAH SYLVESTER SADASCO    10964940                               ###
###      AMIDU ABDUL-AZIZ               10967087                               ###
###      ANGMORTEY SYLVESTER TETTEH     10965548                               ###
###      HORMENYO BENJAMIN            10963717                               ###
###                                                                          ###
###                                                                          ###
###                                                                          ###
###    MODELED WITH THE ID3 ALGORITHM                                        ###
###                                                                          ###
###                                                                          ###
################################################################################
################################################################################

# Ask the user to pick the CSV file interactively and read it, using the first
# row as column names. file.choose() is used instead of choose.files() because
# choose.files() only exists on Windows, while file.choose() works everywhere.
file = read.csv(file = file.choose(), header = TRUE)

### Making sure the data is in the right format
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Convert every categorical survey column to a factor in a single pass; the
# remaining columns of `file` are carried over unchanged.
H1N1_Flu_Vaccines = mutate(
  file,
  across(
    all_of(c('h1n1_concern', 'h1n1_knowledge', 'chronic_med_condition',
             'health_worker', 'opinion_h1n1_sick_from_vacc',
             'opinion_seas_sick_from_vacc', 'age_group', 'sex',
             'income_poverty', 'marital_status', 'household_children',
             'h1n1_vaccine', 'seasonal_vaccine')),
    factor
  )
)
# Open the converted data in the viewer for a visual check.
View(H1N1_Flu_Vaccines)

### Producing descriptive statistics for the following variables: 
# Age group: counts per level, percentage of each level, and a bar chart
summary(H1N1_Flu_Vaccines[['age_group']])
## 18 - 34 Years 35 - 44 Years 45 - 54 Years 55 - 64 Years     65+ Years 
##          5170          3799          5207          5529          6758
100 * round(prop.table(table(H1N1_Flu_Vaccines[['age_group']])), 4)
## 
## 18 - 34 Years 35 - 44 Years 45 - 54 Years 55 - 64 Years     65+ Years 
##         19.54         14.36         19.68         20.89         25.54
barplot(
  height = table(H1N1_Flu_Vaccines[['age_group']]),
  xlab = 'Age Group',
  ylab = 'Frequency',
  col = rainbow(5)
)

# H1N1 concern: counts per level (summary also reports NA's), percentage of
# each non-missing level, and a bar chart
summary(H1N1_Flu_Vaccines[['h1n1_concern']])
##     0     1     2     3  NA's 
##  3255  8078 10495  4544    91
100 * round(prop.table(table(H1N1_Flu_Vaccines[['h1n1_concern']])), 4)
## 
##     0     1     2     3 
## 12.34 30.63 39.80 17.23
barplot(
  height = table(H1N1_Flu_Vaccines[['h1n1_concern']]),
  xlab = 'Level of H1N1 concern',
  ylab = 'Frequency',
  col = rainbow(4)
)

# Sex: counts per level, percentage of each level, and a bar chart
summary(H1N1_Flu_Vaccines[['sex']])
## Female   Male 
##  15722  10741
100 * round(prop.table(table(H1N1_Flu_Vaccines[['sex']])), 4)
## 
## Female   Male 
##  59.41  40.59
barplot(
  height = table(H1N1_Flu_Vaccines[['sex']]),
  xlab = 'Sex',
  ylab = 'Frequency',
  col = c('#bff00f', '#00ff00')
)

# Cross-tabulate h1n1_concern (rows) against age_group (columns). The padded
# column title only affects how the header is positioned when printed.
cont.tab = table(H1N1_Flu_Vaccines$h1n1_concern,
                 H1N1_Flu_Vaccines$age_group,
                 dnn = c('H1N1 Concern',
                         '                       Age group of respondent'))
# Swap the concern codes 0-3 for descriptive row labels.
rownames(cont.tab) = c('Not at all concerned (0):',
                       'Not very concerned   (1):',
                       'Somewhat concerned   (2):',
                       'Very concerned       (3):')
cont.tab
##                                                   Age group of respondent
## H1N1 Concern                18 - 34 Years 35 - 44 Years 45 - 54 Years
##   Not at all concerned (0):           828           385           561
##   Not very concerned   (1):          1860          1168          1471
##   Somewhat concerned   (2):          1792          1551          2316
##   Very concerned       (3):           686           689           852
##                                                   Age group of respondent
## H1N1 Concern                55 - 64 Years 65+ Years
##   Not at all concerned (0):           580       901
##   Not very concerned   (1):          1655      1924
##   Somewhat concerned   (2):          2310      2526
##   Very concerned       (3):           967      1350
# Alternative: print the same contingency table with gmodels::CrossTable in
# SPSS layout. Every proportion and every test is switched off, so only the
# cell counts and the row/column totals are shown.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can be
# reassigned, whereas TRUE and FALSE are reserved words.
library(gmodels)
CrossTable(cont.tab, chisq = FALSE, expected = FALSE,
           prop.r = FALSE, prop.c = FALSE, prop.t = FALSE, prop.chisq = FALSE,
           mcnemar = FALSE, fisher = FALSE, format = 'SPSS')
## 
##    Cell Contents
## |-------------------------|
## |                   Count |
## |-------------------------|
## 
## Total Observations in Table:  26372 
## 
##                           |                        Age group of respondent 
##              H1N1 Concern | 18 - 34 Years  | 35 - 44 Years  | 45 - 54 Years  | 55 - 64 Years  |     65+ Years  |     Row Total | 
## --------------------------|---------------|---------------|---------------|---------------|---------------|---------------|
## Not at all concerned (0): |          828  |          385  |          561  |          580  |          901  |         3255  | 
## --------------------------|---------------|---------------|---------------|---------------|---------------|---------------|
## Not very concerned   (1): |         1860  |         1168  |         1471  |         1655  |         1924  |         8078  | 
## --------------------------|---------------|---------------|---------------|---------------|---------------|---------------|
## Somewhat concerned   (2): |         1792  |         1551  |         2316  |         2310  |         2526  |        10495  | 
## --------------------------|---------------|---------------|---------------|---------------|---------------|---------------|
## Very concerned       (3): |          686  |          689  |          852  |          967  |         1350  |         4544  | 
## --------------------------|---------------|---------------|---------------|---------------|---------------|---------------|
##              Column Total |         5166  |         3793  |         5200  |         5512  |         6701  |        26372  | 
## --------------------------|---------------|---------------|---------------|---------------|---------------|---------------|
## 
## 
### CHI-SQUARE TEST OF INDEPENDENCE
# H0: H1N1 concern and age group are independent.
# H1: H1N1 concern and age group are not independent.
# Run Pearson's chi-squared test on the contingency table and keep the result
# so its p-value can be reused below.
chi = chisq.test(cont.tab)
chi
## 
##  Pearson's Chi-squared test
## 
## data:  cont.tab
## X-squared = 332.23, df = 12, p-value < 2.2e-16
### Helper that turns a test's p-value into a plain-language verdict.
# alpha   : significance level of the test, a single number (e.g. 0.05).
# p.value : p-value of the test, a single number (e.g. chi$p.value).
# Returns : a one-element character vector.
# The null hypothesis of independence is rejected - i.e. the relationship is
# significant - only when the p-value is smaller than alpha.
chisq.test.result = function(alpha,p.value)
{
  # Fail early with a clear message instead of an obscure error from `if`.
  stopifnot(is.numeric(alpha), length(alpha) == 1, !is.na(alpha),
            is.numeric(p.value), length(p.value) == 1, !is.na(p.value))
  if (p.value < alpha) {
    res = 'There is significant relationship'
  } else {
    res = 'There is no significant relationship'
  }
  return(res)
}
# Apply the helper at the 5% significance level to the chi-square p-value.
chisq.test.result(alpha = 0.05, p.value = chi[['p.value']])
## [1] "There is no significant relationship"
# NOTE(review): the chi-square output above reports p-value < 2.2e-16, far
# below alpha = 0.05, so independence is rejected. The recorded message
# contradicts that result; check the comparison inside chisq.test.result and
# re-run this line.
### Cramer’s V contingency coefficient to measure the strength 
# of the association between h1n1_concern and age_group.

library(DescTools)
## Registered S3 method overwritten by 'DescTools':
##   method         from 
##   reorder.factor gdata

# conf.level is the confidence level of the interval (0.95 for a 95% interval),
# not the significance level alpha; passing 0.05 would request a 5% interval.
CramerV(cont.tab, conf.level = 0.95)
# NOTE(review): the output recorded below was produced with conf.level = 0.05.
# The point estimate does not depend on conf.level, but lwr.ci and upr.ci must
# be regenerated by re-running the call above.
##   Cramer V     lwr.ci     upr.ci 
## 0.06480186 0.06349458 0.06394427
### Building a classification model for the vaccine data set using
# h1n1_vaccine as the target variable.

### DECISION TREE CLASSIFICATION ALGORITHM (labelled ID3 in this project; note that rpart implements CART)
### Loading the needed packages
library('rpart')      # partition the tree
library('dplyr')      # For mutating the data
library('rpart.plot') # For ploting the tree
library('caTools')    # General manipulation of the data
library('caret')      # For splitting the data
## Loading required package: ggplot2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
### Hold-out split: 70% of the rows for training, 30% for testing
set.seed(7) # fix the RNG seed so the split is reproducible (original note: 5727)
# sample.split yields a logical vector; TRUE marks a row for the training set.
sample = sample.split(H1N1_Flu_Vaccines$h1n1_vaccine, SplitRatio = 0.7)
train = subset(H1N1_Flu_Vaccines, sample)
test = subset(H1N1_Flu_Vaccines, !sample)

### Training the decision tree classifier
# Fit a classification tree (method = 'class') for h1n1_vaccine on the training
# rows, using every other column of `train` as a candidate predictor.
# The stray trailing comma of the earlier call, which passed an empty extra
# argument to rpart, has been removed.
# NOTE(review): `~ .` also feeds seasonal_vaccine (and any identifier column
# present in the CSV) to the tree - confirm that this is intended.
tree = rpart(h1n1_vaccine ~ .,
             data = train, na.action = na.rpart,
             method = 'class')

### Prediction
# Predicted class labels from the fitted tree, for the training rows and for
# the held-out test rows.
train.predicted = predict(tree, newdata = train, type = 'class')
tree.h1n1.predicted = predict(tree, newdata = test, type = 'class')

### Confusion matrix to evaluate the model
# Compare the predicted classes with the observed h1n1_vaccine values of the
# test set, keep the result for later use, and print it.
conf_test = confusionMatrix(data = tree.h1n1.predicted,
                            reference = test$h1n1_vaccine)
conf_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6119 1443
##          1  131  246
##                                           
##                Accuracy : 0.8017          
##                  95% CI : (0.7928, 0.8105)
##     No Information Rate : 0.7873          
##     P-Value [Acc > NIR] : 0.0007777       
##                                           
##                   Kappa : 0.174           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9790          
##             Specificity : 0.1456          
##          Pos Pred Value : 0.8092          
##          Neg Pred Value : 0.6525          
##              Prevalence : 0.7873          
##          Detection Rate : 0.7708          
##    Detection Prevalence : 0.9525          
##       Balanced Accuracy : 0.5623          
##                                           
##        'Positive' Class : 0               
## 
# Same comparison on the training set: predicted against observed h1n1_vaccine.
conf_train = confusionMatrix(data = train.predicted,
                             reference = train$h1n1_vaccine)
conf_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 14266  3420
##          1   317   521
##                                          
##                Accuracy : 0.7983         
##                  95% CI : (0.7924, 0.804)
##     No Information Rate : 0.7872         
##     P-Value [Acc > NIR] : 0.0001188      
##                                          
##                   Kappa : 0.155          
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9783         
##             Specificity : 0.1322         
##          Pos Pred Value : 0.8066         
##          Neg Pred Value : 0.6217         
##              Prevalence : 0.7872         
##          Detection Rate : 0.7701         
##    Detection Prevalence : 0.9548         
##       Balanced Accuracy : 0.5552         
##                                          
##        'Positive' Class : 0              
## 
### Misclassification rate in percent: 100 minus the accuracy, where accuracy
### is the diagonal of the confusion table divided by its total
# Test set
100 - 100 * round(
  sum(diag(conf_test[['table']])) / sum(conf_test[['table']]), 4
)
## [1] 19.83
# Training set
100 - 100 * round(
  sum(diag(conf_train[['table']])) / sum(conf_train[['table']]), 4
)
## [1] 20.17
### Visualizing the decision tree
# Styled plot of the fitted tree drawn with rpart.plot.
# `left = TRUE` is spelled out because T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
rpart.plot(tree,
           split.border.col = 'brown', split.round = 10,
           shadow.col = 'darkgray', split.cex = 1,
           split.lwd = 2, split.box.col = c('#bff00f', '#ffbb00', '#00bbff'),
           leaf.round = 10, branch.lwd = 3, left = TRUE, extra = 104)

# Bare branch structure of the same tree; no text() call follows, so the
# branches are drawn without labels.
plot(tree, main = 'Skeleton 😎')