################################################################################
################################################################################
### ###
### STAT 230, DATA MINING MINI PROJECT - 2023 ###
### ###
### PREPARED BY: ###
### NAMES IDS ###
### GERSHON TETTEH AMANOR 10967696 ###
### DOE OWEN BUERTEY 10972098 ###
### NANA ASIEDU AKROFI 10986678 ###
### EVORDA KOKOU JOHN 10971696 ###
### NKRUMAH SYLVESTER SADASCO 10964940 ###
### AMIDU ABDUL-AZIZ 10967087 ###
### ANGMORTEY SYLVESTER TETTEH 10965548 ###
### HORMENYO BENJAMIN 10963717 ###
### ###
### ###
### ###
### MODELED WITH THE ID3 ALGORITHM ###
### ###
### ###
################################################################################
################################################################################
# Prompt for the CSV file to analyse and load it with a header row.
# file.choose() is used instead of choose.files() because the latter is only
# available on Windows builds of R, so the script would stop here elsewhere.
file <- read.csv(file = file.choose(), header = TRUE)
### Making sure the data is in the right format
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Columns that hold categorical codes/labels and must be stored as factors.
factor_cols <- c(
  "h1n1_concern",
  "h1n1_knowledge",
  "chronic_med_condition",
  "health_worker",
  "opinion_h1n1_sick_from_vacc",
  "opinion_seas_sick_from_vacc",
  "age_group",
  "sex",
  "income_poverty",
  "marital_status",
  "household_children",
  "h1n1_vaccine",
  "seasonal_vaccine"
)
# Convert each listed column with factor(); all other columns are untouched.
H1N1_Flu_Vaccines <- mutate(file, across(all_of(factor_cols), factor))
# View() opens a GUI data viewer, so it is only called in an interactive
# session; batch runs (Rscript, knitting) skip it instead of failing.
if (interactive()) {
  View(H1N1_Flu_Vaccines)
}
### Producing descriptive statistics for the following variables:
# Age group: level counts, percentage share of each level, and a bar chart
age_counts <- table(H1N1_Flu_Vaccines$age_group)
summary(H1N1_Flu_Vaccines$age_group)
## 18 - 34 Years 35 - 44 Years 45 - 54 Years 55 - 64 Years 65+ Years
## 5170 3799 5207 5529 6758
round(prop.table(age_counts), 4) * 100
##
## 18 - 34 Years 35 - 44 Years 45 - 54 Years 55 - 64 Years 65+ Years
## 19.54 14.36 19.68 20.89 25.54
barplot(age_counts,
        col = rainbow(5), xlab = 'Age Group', ylab = 'Frequency')

# H1N1 concern: level counts (summary() also reports the NA count), percentage
# share of each non-missing level, and a bar chart
concern_counts <- table(H1N1_Flu_Vaccines$h1n1_concern)
summary(H1N1_Flu_Vaccines$h1n1_concern)
## 0 1 2 3 NA's
## 3255 8078 10495 4544 91
round(prop.table(concern_counts), 4) * 100
##
## 0 1 2 3
## 12.34 30.63 39.80 17.23
barplot(concern_counts,
        col = rainbow(4), xlab = 'Level of H1N1 concern', ylab = 'Frequency')

# Sex: level counts, percentage share of each level, and a bar chart
sex_counts <- table(H1N1_Flu_Vaccines$sex)
summary(H1N1_Flu_Vaccines$sex)
## Female Male
## 15722 10741
round(prop.table(sex_counts), 4) * 100
##
## Female Male
## 59.41 40.59
barplot(sex_counts,
        col = c('#bff00f', '#00ff00'), xlab = 'Sex', ylab = 'Frequency')
# Cross-tabulate h1n1_concern (rows) against age_group (columns); the titles
# of the two dimensions are supplied through dnn.
cont.tab <- table(H1N1_Flu_Vaccines$h1n1_concern,
                  H1N1_Flu_Vaccines$age_group,
                  dnn = c('H1N1 Concern', ' Age group of respondent'))
# Replace the numeric concern codes with descriptive row labels, then print.
concern_labels <- c('Not at all concerned (0):',
                    'Not very concerned (1):',
                    'Somewhat concerned (2):',
                    'Very concerned (3):')
rownames(cont.tab) <- concern_labels
cont.tab
## Age group of respondent
## H1N1 Concern 18 - 34 Years 35 - 44 Years 45 - 54 Years
## Not at all concerned (0): 828 385 561
## Not very concerned (1): 1860 1168 1471
## Somewhat concerned (2): 1792 1551 2316
## Very concerned (3): 686 689 852
## Age group of respondent
## H1N1 Concern 55 - 64 Years 65+ Years
## Not at all concerned (0): 580 901
## Not very concerned (1): 1655 1924
## Somewhat concerned (2): 2310 2526
## Very concerned (3): 967 1350
# Alternative
library(gmodels)
# Print the same contingency table in SPSS layout with counts only: every
# proportion, expected-count and significance-test option is switched off.
CrossTable(
  cont.tab,
  expected = FALSE,
  prop.r = FALSE, prop.c = FALSE, prop.t = FALSE, prop.chisq = FALSE,
  chisq = FALSE, fisher = FALSE, mcnemar = FALSE,
  format = 'SPSS'
)
##
## Cell Contents
## |-------------------------|
## | Count |
## |-------------------------|
##
## Total Observations in Table: 26372
##
## | Age group of respondent
## H1N1 Concern | 18 - 34 Years | 35 - 44 Years | 45 - 54 Years | 55 - 64 Years | 65+ Years | Row Total |
## --------------------------|---------------|---------------|---------------|---------------|---------------|---------------|
## Not at all concerned (0): | 828 | 385 | 561 | 580 | 901 | 3255 |
## --------------------------|---------------|---------------|---------------|---------------|---------------|---------------|
## Not very concerned (1): | 1860 | 1168 | 1471 | 1655 | 1924 | 8078 |
## --------------------------|---------------|---------------|---------------|---------------|---------------|---------------|
## Somewhat concerned (2): | 1792 | 1551 | 2316 | 2310 | 2526 | 10495 |
## --------------------------|---------------|---------------|---------------|---------------|---------------|---------------|
## Very concerned (3): | 686 | 689 | 852 | 967 | 1350 | 4544 |
## --------------------------|---------------|---------------|---------------|---------------|---------------|---------------|
## Column Total | 5166 | 3793 | 5200 | 5512 | 6701 | 26372 |
## --------------------------|---------------|---------------|---------------|---------------|---------------|---------------|
##
##
### CHI SQUARE TEST OF INDEPENDENCE
# null hypothesis: H1N1 concern and Age Group are independent.
# alternative hypothesis: H1N1 concern and Age Group are not independent.
# Run the chi-squared test on the contingency table, keep the result object
# (its p.value element is used further down) and print it.
chi <- chisq.test(cont.tab)
chi
##
## Pearson's Chi-squared test
##
## data: cont.tab
## X-squared = 332.23, df = 12, p-value < 2.2e-16
### Helper that turns a p-value into a plain-language significance verdict
# alpha:   significance level of the test, e.g. 0.05 (single number)
# p.value: p-value produced by the test (single number)
# Returns a one-element character vector with the verdict.
# The null hypothesis is rejected -- i.e. the relationship is significant --
# when the p-value is smaller than alpha; otherwise no significant
# relationship is reported.
chisq.test.result <- function(alpha, p.value) {
  # Fail early with a clear error instead of an obscure one from `if`.
  stopifnot(
    is.numeric(alpha), length(alpha) == 1L, !is.na(alpha),
    is.numeric(p.value), length(p.value) == 1L, !is.na(p.value)
  )
  if (p.value < alpha) {
    "There is significant relationship"
  } else {
    "There is no significant relationship"
  }
}
# Interpret the chi-squared p-value at the 5% significance level.
# NOTE(review): the test output earlier in this file reports
# p-value < 2.2e-16, which is far below 0.05 and therefore indicates a
# significant relationship; the message recorded below says the opposite, so
# confirm the comparison inside chisq.test.result() and re-run to refresh it.
chisq.test.result(alpha = 0.05, p.value = chi[["p.value"]])
## [1] "There is no significant relationship"
### Cramer’s V contingency coefficient to measure the strength
# of the association between h1n1_concern and age_group.
library(DescTools)
## Registered S3 method overwritten by 'DescTools':
## method from
## reorder.factor gdata

# conf.level is the coverage of the confidence interval, so a 95% interval
# needs 0.95. The earlier value of 0.05 requested a 5% interval, which is why
# the interval recorded below is so narrow that it excludes the estimate.
CramerV(cont.tab, conf.level = 0.95)
# NOTE: the output below was recorded with conf.level = 0.05; re-run the call
# above to refresh the interval bounds (the Cramer V estimate is unaffected).
## Cramer V lwr.ci upr.ci
## 0.06480186 0.06349458 0.06394427
### Building a classification model for the vaccine data set using
# h1n1_vaccine as the target variable.
### ID3 CLASSIFICATION ALGORITHM
### Loading the needed packages
library('rpart') # partition the tree
library('dplyr') # For mutating the data
library('rpart.plot') # For ploting the tree
library('caTools') # General manipulation of the data
library('caret') # For splitting the data
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
##
## MAE, RMSE
### Partition the data: 70% of the rows for training, the other 30% for testing
# Fix the RNG seed so the split is reproducible (original note: 5727).
set.seed(7)
# Rows flagged TRUE by sample.split() form the training set; the rest are
# held out for testing.
sample <- sample.split(H1N1_Flu_Vaccines$h1n1_vaccine, SplitRatio = 0.70)
train <- H1N1_Flu_Vaccines[sample, ]
test <- H1N1_Flu_Vaccines[!sample, ]
### Training the decision tree classifier
# Fits a classification tree (method = 'class') for h1n1_vaccine on the
# training rows. The stray trailing comma of the earlier call, which passed an
# empty extra argument to rpart(), has been removed.
# NOTE(review): rpart grows CART-style trees and splits on the Gini index
# unless parms = list(split = "information") is supplied; confirm whether the
# entropy criterion is wanted given the ID3 wording in the file header.
# NOTE(review): the formula h1n1_vaccine ~ . uses every other column of train
# as a predictor, seasonal_vaccine included -- confirm that is intended.
tree <- rpart(h1n1_vaccine ~ .,
              data = train,
              na.action = na.rpart,
              method = 'class')
### Predicted class labels from the fitted tree
# Held-out test rows first, then the training rows themselves so the two
# error rates can be compared afterwards.
tree.h1n1.predicted <- predict(tree, newdata = test, type = 'class')
train.predicted <- predict(tree, newdata = train, type = 'class')
### Confusion matrix to evaluate the model on the test set
# Predicted labels are compared with the observed h1n1_vaccine values.
# As the recorded output shows, the 'Positive' class is 0, so Sensitivity
# refers to class 0 and Specificity to class 1.
conf_test <- confusionMatrix(data = tree.h1n1.predicted,
                             reference = test$h1n1_vaccine)
conf_test
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6119 1443
## 1 131 246
##
## Accuracy : 0.8017
## 95% CI : (0.7928, 0.8105)
## No Information Rate : 0.7873
## P-Value [Acc > NIR] : 0.0007777
##
## Kappa : 0.174
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9790
## Specificity : 0.1456
## Pos Pred Value : 0.8092
## Neg Pred Value : 0.6525
## Prevalence : 0.7873
## Detection Rate : 0.7708
## Detection Prevalence : 0.9525
## Balanced Accuracy : 0.5623
##
## 'Positive' Class : 0
##
# Same evaluation on the training set, for comparison with the test results.
conf_train <- confusionMatrix(data = train.predicted,
                              reference = train$h1n1_vaccine)
conf_train
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 14266 3420
## 1 317 521
##
## Accuracy : 0.7983
## 95% CI : (0.7924, 0.804)
## No Information Rate : 0.7872
## P-Value [Acc > NIR] : 0.0001188
##
## Kappa : 0.155
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9783
## Specificity : 0.1322
## Pos Pred Value : 0.8066
## Neg Pred Value : 0.6217
## Prevalence : 0.7872
## Detection Rate : 0.7701
## Detection Prevalence : 0.9548
## Balanced Accuracy : 0.5552
##
## 'Positive' Class : 0
##
### Mis-classification rate in percent: 100 minus the rounded accuracy, where
# accuracy is the diagonal (correct) count over the total count of the
# confusion table.
test_counts <- conf_test$table
100 - round(sum(diag(test_counts)) / sum(test_counts), 4) * 100
## [1] 19.83
train_counts <- conf_train$table
100 - round(sum(diag(train_counts)) / sum(train_counts), 4) * 100
## [1] 20.17
### Visualizing the decision tree
# Styled rendering of the fitted tree with rpart.plot.
rpart.plot(
  tree,
  extra = 104,
  left = TRUE,
  branch.lwd = 3,
  leaf.round = 10,
  shadow.col = 'darkgray',
  split.cex = 1,
  split.lwd = 2,
  split.round = 10,
  split.border.col = 'brown',
  split.box.col = c('#bff00f', '#ffbb00', '#00bbff')
)

# Base-graphics plot of the same tree object with a title.
plot(tree, main = 'Skeleton 😎')
