EDA
- Feature Density With Respect to Attendance
- Selected Individual Feature Densities
Clustering
Machine Learning Predictions
Conclusion

############################################################
#                                                          #
#                        My Theme                          ####
#                                                          #
############################################################


my_theme <- function(base_size = 12, base_family = "sans"){
  theme_minimal(base_size = base_size, base_family = base_family) +
  theme(
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 0.5),
    axis.title = element_text(size = 14),
    panel.grid.major = element_line(color = "grey"),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "aliceblue"),
    strip.background = element_rect(fill = "navy", color = "navy", size = 1),
    strip.text = element_text(face = "bold", size = 12, color = "white"),
    legend.position = "right",
    legend.justification = "top", 
    legend.background = element_blank(),
    panel.border = element_rect(color = "grey", fill = NA, size = 0.5)
  )
}

theme_set(my_theme())

############################################################
#                                                          #
#                   Plot Neural Network                    #
#                                                          #
############################################################


plot.nnet <- function(mod.in,nid=T,all.out=T,all.in=T,bias=T,wts.only=F,rel.rsc=5,
                    circle.cex=5,node.labs=T,var.labs=T,x.lab=NULL,y.lab=NULL,
                    line.stag=NULL,struct=NULL,cex.val=1,alpha.val=1,
                    circle.col='lightblue',pos.col='black',neg.col='grey',
                    bord.col='black', ...){
  
  require(scales)
  
  #sanity checks
  if('mlp' %in% class(mod.in)) warning('Bias layer not applicable for rsnns object')
  if('numeric' %in% class(mod.in)){
    if(is.null(struct)) stop('Three-element vector required for struct')
    if(length(mod.in) != ((struct[1]*struct[2]+struct[2]*struct[3])+(struct[3]+struct[2])))
      stop('Incorrect length of weight matrix for given network structure')
  }
  if('train' %in% class(mod.in)){
    if('nnet' %in% class(mod.in$finalModel)){
      mod.in<-mod.in$finalModel
      warning('Using best nnet model from train output')
    }
    else stop('Only nnet method can be used with train object')
  }
  
  #gets weights for neural network, output is list
  #if rescaled argument is true, weights are returned but rescaled based on abs value
  nnet.vals<-function(mod.in,nid,rel.rsc,struct.out=struct){
    
    require(scales)
    require(reshape)
    
    if('numeric' %in% class(mod.in)){
      struct.out<-struct
      wts<-mod.in
    }
    
    #neuralnet package
    if('nn' %in% class(mod.in)){
      struct.out<-unlist(lapply(mod.in$weights[[1]],ncol))
        struct.out<-struct.out[-length(struct.out)]
        struct.out<-c(
            length(mod.in$model.list$variables),
            struct.out,
            length(mod.in$model.list$response)
            )           
      wts<-unlist(mod.in$weights[[1]])   
    }
    
    #nnet package
    if('nnet' %in% class(mod.in)){
      struct.out<-mod.in$n
      wts<-mod.in$wts
    }
    
    #RSNNS package
    if('mlp' %in% class(mod.in)){
      struct.out<-c(mod.in$nInputs,mod.in$archParams$size,mod.in$nOutputs)
      hid.num<-length(struct.out)-2
      wts<-mod.in$snnsObject$getCompleteWeightMatrix()
      
      #get all input-hidden and hidden-hidden wts
      inps<-wts[grep('Input',row.names(wts)),grep('Hidden_2',colnames(wts)),drop=F]
      inps<-melt(rbind(rep(NA,ncol(inps)),inps))$value
      uni.hids<-paste0('Hidden_',1+seq(1,hid.num))
      for(i in 1:length(uni.hids)){
        if(is.na(uni.hids[i+1])) break
        tmp<-wts[grep(uni.hids[i],rownames(wts)),grep(uni.hids[i+1],colnames(wts)),drop=F]
        inps<-c(inps,melt(rbind(rep(NA,ncol(tmp)),tmp))$value)
        }
      
      #get connections from last hidden to output layers
      outs<-wts[grep(paste0('Hidden_',hid.num+1),row.names(wts)),grep('Output',colnames(wts)),drop=F]
      outs<-rbind(rep(NA,ncol(outs)),outs)
      
      #weight vector for all
      wts<-c(inps,melt(outs)$value)
      assign('bias',F,envir=environment(nnet.vals))
      }
    
    if(nid) wts<-rescale(abs(wts),c(1,rel.rsc))
    
    #convert wts to list with appropriate names 
    hid.struct<-struct.out[-c(length(struct.out))]
    row.nms<-NULL
    for(i in 1:length(hid.struct)){
      if(is.na(hid.struct[i+1])) break
      row.nms<-c(row.nms,rep(paste('hidden',i,seq(1:hid.struct[i+1])),each=1+hid.struct[i]))
    }
    row.nms<-c(
      row.nms,
      rep(paste('out',seq(1:struct.out[length(struct.out)])),each=1+struct.out[length(struct.out)-1])
      )
    out.ls<-data.frame(wts,row.nms)
    out.ls$row.nms<-factor(row.nms,levels=unique(row.nms),labels=unique(row.nms))
    out.ls<-split(out.ls$wts,f=out.ls$row.nms)
    
    assign('struct',struct.out,envir=environment(nnet.vals))
    
    out.ls
    
    }
  
  wts<-nnet.vals(mod.in,nid=F)
  
  if(wts.only) return(wts)
  
  #circle colors for input, if desired, must be two-vector list, first vector is for input layer
  if(is.list(circle.col)){
                    circle.col.inp<-circle.col[[1]]
                    circle.col<-circle.col[[2]]
                    }
  else circle.col.inp<-circle.col
  
  #initiate plotting
  x.range<-c(0,100)
  y.range<-c(0,100)
  #these are all proportions from 0-1
  if(is.null(line.stag)) line.stag<-0.011*circle.cex/2
  layer.x<-seq(0.17,0.9,length=length(struct))
  bias.x<-layer.x[-length(layer.x)]+diff(layer.x)/2
  bias.y<-0.95
  circle.cex<-circle.cex
  
  #get variable names from mod.in object
  #change to user input if supplied
  if('numeric' %in% class(mod.in)){
    x.names<-paste0(rep('X',struct[1]),seq(1:struct[1]))
    y.names<-paste0(rep('Y',struct[3]),seq(1:struct[3]))
  }
  if('mlp' %in% class(mod.in)){
    all.names<-mod.in$snnsObject$getUnitDefinitions()
    x.names<-all.names[grep('Input',all.names$unitName),'unitName']
    y.names<-all.names[grep('Output',all.names$unitName),'unitName']
  }
  if('nn' %in% class(mod.in)){
    x.names<-mod.in$model.list$variables
    y.names<-mod.in$model.list$respons
  }
  if('xNames' %in% names(mod.in)){
    x.names<-mod.in$xNames
    y.names<-attr(terms(mod.in),'factor')
    y.names<-row.names(y.names)[!row.names(y.names) %in% x.names]
  }
  if(!'xNames' %in% names(mod.in) & 'nnet' %in% class(mod.in)){
    if(is.null(mod.in$call$formula)){
      x.names<-colnames(eval(mod.in$call$x))
      y.names<-colnames(eval(mod.in$call$y))
    }
    else{
      forms<-eval(mod.in$call$formula)
      x.names<-mod.in$coefnames
      facts<-attr(terms(mod.in),'factors')
      y.check<-mod.in$fitted
      if(ncol(y.check)>1) y.names<-colnames(y.check)
      else y.names<-as.character(forms)[2]
    } 
  }
  #change variables names to user sub 
  if(!is.null(x.lab)){
    if(length(x.names) != length(x.lab)) stop('x.lab length not equal to number of input variables')
    else x.names<-x.lab
  }
  if(!is.null(y.lab)){
    if(length(y.names) != length(y.lab)) stop('y.lab length not equal to number of output variables')
    else y.names<-y.lab
  }
  
  #initiate plot
  plot(x.range,y.range,type='n',axes=F,ylab='',xlab='',...)
  
  #function for getting y locations for input, hidden, output layers
  #input is integer value from 'struct'
  get.ys<-function(lyr){
    spacing<-diff(c(0*diff(y.range),0.9*diff(y.range)))/max(struct)
    seq(0.5*(diff(y.range)+spacing*(lyr-1)),0.5*(diff(y.range)-spacing*(lyr-1)),
        length=lyr)
  }
  
  #function for plotting nodes
  #'layer' specifies which layer, integer from 'struct'
  #'x.loc' indicates x location for layer, integer from 'layer.x'
  #'layer.name' is string indicating text to put in node
  layer.points<-function(layer,x.loc,layer.name,cex=cex.val){
    x<-rep(x.loc*diff(x.range),layer)
    y<-get.ys(layer)
    points(x,y,pch=21,cex=circle.cex,col=bord.col,bg=in.col)
    if(node.labs) text(x,y,paste(layer.name,1:layer,sep=''),cex=cex.val)
    if(layer.name=='I' & var.labs) text(x-line.stag*diff(x.range),y,x.names,pos=2,cex=cex.val)      
    if(layer.name=='O' & var.labs) text(x+line.stag*diff(x.range),y,y.names,pos=4,cex=cex.val)
  }
  
  #function for plotting bias points
  #'bias.x' is vector of values for x locations
  #'bias.y' is vector for y location
  #'layer.name' is  string indicating text to put in node
  bias.points<-function(bias.x,bias.y,layer.name,cex,...){
    for(val in 1:length(bias.x)){
      points(
        diff(x.range)*bias.x[val],
        bias.y*diff(y.range),
        pch=21,col=bord.col,bg=in.col,cex=circle.cex
      )
      if(node.labs)
        text(
          diff(x.range)*bias.x[val],
          bias.y*diff(y.range),
          paste(layer.name,val,sep=''),
          cex=cex.val
        )
    }
  }
  
  #function creates lines colored by direction and width as proportion of magnitude
  #use 'all.in' argument if you want to plot connection lines for only a single input node
  layer.lines<-function(mod.in,h.layer,layer1=1,layer2=2,out.layer=F,nid,rel.rsc,all.in,pos.col,
                        neg.col,...){
    
    x0<-rep(layer.x[layer1]*diff(x.range)+line.stag*diff(x.range),struct[layer1])
    x1<-rep(layer.x[layer2]*diff(x.range)-line.stag*diff(x.range),struct[layer1])
    
    if(out.layer==T){
      
      y0<-get.ys(struct[layer1])
      y1<-rep(get.ys(struct[layer2])[h.layer],struct[layer1])
      src.str<-paste('out',h.layer)
      
      wts<-nnet.vals(mod.in,nid=F,rel.rsc)
      wts<-wts[grep(src.str,names(wts))][[1]][-1]
      wts.rs<-nnet.vals(mod.in,nid=T,rel.rsc)
      wts.rs<-wts.rs[grep(src.str,names(wts.rs))][[1]][-1]
      
      cols<-rep(pos.col,struct[layer1])
      cols[wts<0]<-neg.col
      
      if(nid) segments(x0,y0,x1,y1,col=cols,lwd=wts.rs)
      else segments(x0,y0,x1,y1)
      
    }
    
    else{
      
      if(is.logical(all.in)) all.in<-h.layer
      else all.in<-which(x.names==all.in)
      
      y0<-rep(get.ys(struct[layer1])[all.in],struct[2])
      y1<-get.ys(struct[layer2])
      src.str<-paste('hidden',layer1)
      
      wts<-nnet.vals(mod.in,nid=F,rel.rsc)
      wts<-unlist(lapply(wts[grep(src.str,names(wts))],function(x) x[all.in+1]))
      wts.rs<-nnet.vals(mod.in,nid=T,rel.rsc)
      wts.rs<-unlist(lapply(wts.rs[grep(src.str,names(wts.rs))],function(x) x[all.in+1]))
      
      cols<-rep(pos.col,struct[layer2])
      cols[wts<0]<-neg.col
      
      if(nid) segments(x0,y0,x1,y1,col=cols,lwd=wts.rs)
      else segments(x0,y0,x1,y1)
      
    }
    
  }
  
  bias.lines<-function(bias.x,mod.in,nid,rel.rsc,all.out,pos.col,neg.col,...){
    
    if(is.logical(all.out)) all.out<-1:struct[length(struct)]
    else all.out<-which(y.names==all.out)
    
    for(val in 1:length(bias.x)){
      
      wts<-nnet.vals(mod.in,nid=F,rel.rsc)
      wts.rs<-nnet.vals(mod.in,nid=T,rel.rsc)
      
        if(val != length(bias.x)){
        wts<-wts[grep('out',names(wts),invert=T)]
        wts.rs<-wts.rs[grep('out',names(wts.rs),invert=T)]
            sel.val<-grep(val,substr(names(wts.rs),8,8))
            wts<-wts[sel.val]
            wts.rs<-wts.rs[sel.val]
            }
    
        else{
        wts<-wts[grep('out',names(wts))]
        wts.rs<-wts.rs[grep('out',names(wts.rs))]
        }
      
      cols<-rep(pos.col,length(wts))
      cols[unlist(lapply(wts,function(x) x[1]))<0]<-neg.col
      wts.rs<-unlist(lapply(wts.rs,function(x) x[1]))
      
      if(nid==F){
        wts.rs<-rep(1,struct[val+1])
        cols<-rep('black',struct[val+1])
      }
      
      if(val != length(bias.x)){
        segments(
          rep(diff(x.range)*bias.x[val]+diff(x.range)*line.stag,struct[val+1]),
          rep(bias.y*diff(y.range),struct[val+1]),
          rep(diff(x.range)*layer.x[val+1]-diff(x.range)*line.stag,struct[val+1]),
          get.ys(struct[val+1]),
          lwd=wts.rs,
          col=cols
        )
      }
      
      else{
        segments(
          rep(diff(x.range)*bias.x[val]+diff(x.range)*line.stag,struct[val+1]),
          rep(bias.y*diff(y.range),struct[val+1]),
          rep(diff(x.range)*layer.x[val+1]-diff(x.range)*line.stag,struct[val+1]),
          get.ys(struct[val+1])[all.out],
          lwd=wts.rs[all.out],
          col=cols[all.out]
        )
      }
      
    }
  }
  
  #use functions to plot connections between layers
  #bias lines
  if(bias) bias.lines(bias.x,mod.in,nid=nid,rel.rsc=rel.rsc,all.out=all.out,pos.col=alpha(pos.col,alpha.val),
                      neg.col=alpha(neg.col,alpha.val))
  
  #layer lines, makes use of arguments to plot all or for individual layers
  #starts with input-hidden
  #uses 'all.in' argument to plot connection lines for all input nodes or a single node
  if(is.logical(all.in)){  
    mapply(
      function(x) layer.lines(mod.in,x,layer1=1,layer2=2,nid=nid,rel.rsc=rel.rsc,
        all.in=all.in,pos.col=alpha(pos.col,alpha.val),neg.col=alpha(neg.col,alpha.val)),
      1:struct[1]
    )
  }
  else{
    node.in<-which(x.names==all.in)
    layer.lines(mod.in,node.in,layer1=1,layer2=2,nid=nid,rel.rsc=rel.rsc,all.in=all.in,
                pos.col=alpha(pos.col,alpha.val),neg.col=alpha(neg.col,alpha.val))
  }
  #connections between hidden layers
  lays<-split(c(1,rep(2:(length(struct)-1),each=2),length(struct)),
              f=rep(1:(length(struct)-1),each=2))
  lays<-lays[-c(1,(length(struct)-1))]
  for(lay in lays){
    for(node in 1:struct[lay[1]]){
      layer.lines(mod.in,node,layer1=lay[1],layer2=lay[2],nid=nid,rel.rsc=rel.rsc,all.in=T,
                  pos.col=alpha(pos.col,alpha.val),neg.col=alpha(neg.col,alpha.val))
    }
  }
  #lines for hidden-output
  #uses 'all.out' argument to plot connection lines for all output nodes or a single node
  if(is.logical(all.out))
    mapply(
      function(x) layer.lines(mod.in,x,layer1=length(struct)-1,layer2=length(struct),out.layer=T,nid=nid,rel.rsc=rel.rsc,
                              all.in=all.in,pos.col=alpha(pos.col,alpha.val),neg.col=alpha(neg.col,alpha.val)),
      1:struct[length(struct)]
      )
  else{
    node.in<-which(y.names==all.out)
    layer.lines(mod.in,node.in,layer1=length(struct)-1,layer2=length(struct),out.layer=T,nid=nid,rel.rsc=rel.rsc,
                pos.col=pos.col,neg.col=neg.col,all.out=all.out)
  }
  
  #use functions to plot nodes
  for(i in 1:length(struct)){
    in.col<-circle.col
    layer.name<-'H'
    if(i==1) { layer.name<-'I'; in.col<-circle.col.inp}
    if(i==length(struct)) layer.name<-'O'
    layer.points(struct[i],layer.x[i],layer.name)
    }

  if(bias) bias.points(bias.x,bias.y,'B')
  
}

wait <- read_csv("C:/Users/gexbxl1/Documents/OtherData/No-show-Issue-Comma-300k.csv")
colnames(wait)[4] <- "AppointmentData"
colnames(wait)[8] <- "Alcoholism"
colnames(wait)[9] <- "Hypertension"
wait$Status %<>% as.factor()
wait <- wait[,c(6,3,4,1,2,5,7:15)]

EDA

Feature Density With Respect to Attendance

wait %>%
  select(-c(AppointmentRegistration, AppointmentData)) %>%
    gather(x, y, Age:AwaitingTime) %>%
      ggplot(aes(x = y, fill = Status, color = Status)) +
        geom_density(alpha = 0.3, size = 1) +
        geom_rug() +
        scale_fill_brewer(palette = "Set1") +
        scale_color_brewer(palette = "Set1") +
        facet_wrap( ~ as.factor(x), scales = "free_y", ncol = 3) +
        theme(plot.subtitle = element_text(vjust = 1), 
          plot.caption = element_text(vjust = 1), 
          axis.text.x = element_text(vjust = 0.5), 
          plot.title = element_text(size = 18, 
              face = "bold.italic")) +
        labs(title = "Doctor Appointment Attendance Density")+
        labs(x = "Various Variables") +
        labs(y = "Density") +
        theme(plot.title = element_text(hjust = 0.5)) +
        theme(axis.text.x = element_blank())

Selected Individual Feature Densities

wait$Status %<>% as.factor() 
Status_DENSITY <-
(ggplot(wait, aes(x=Status))  +
  geom_density(aes(y = ..count../sum(..count..), group = Status, fill = Status), alpha = 0.3) +
  theme(plot.subtitle = element_text(vjust = 1), 
    plot.caption = element_text(vjust = 1), 
    axis.text.x = element_text(angle = 90))+
    labs(title = "Doctor Appointment Attendance Density", 
    x = "Attendance", y = "Density", colour = "Attendance") +
        my_theme() +
        theme(axis.text.x = element_text(angle = 50),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())  +
        theme(legend.background = element_rect(size = 0.1), 
        legend.position = "bottom", legend.direction = "horizontal") +
        theme(plot.title = element_text(size = 14, hjust = 0.5))+
        labs(fill = "Attendance"))

DAYOFTHEWEEK_DENSITY <- 
(ggplot(wait, aes(x=DayOfTheWeek))  +
  geom_density(aes(y = ..count../sum(..count..),group =DayOfTheWeek, fill = DayOfTheWeek), alpha = 0.3) +
  theme(plot.subtitle = element_text(vjust = 1), 
    plot.caption = element_text(vjust = 1), 
    axis.text.x = element_text(angle = 90))+
    labs(title = "Day Of The Week Density", 
    x = "Days of The Week", y = "Density", colour = "Days of The Week") +
        my_theme() +
        theme(axis.text.x = element_text(angle = 50),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())  +
        theme(legend.background = element_rect(size = 0.1), 
        legend.position = "bottom", legend.direction = "horizontal") +
        theme(plot.title = element_text(size = 14, hjust = 0.5)) +
        labs(fill = "Days of The Week"))

GENDER_DENSITY <-
(ggplot(wait, aes(x=Gender))  +
  geom_density(aes(y = ..count../sum(..count..),group = Gender, fill = Gender), alpha = 0.3) +
  theme(plot.subtitle = element_text(vjust = 1), 
    plot.caption = element_text(vjust = 1), 
    axis.text.x = element_text(angle = 90))+
    labs(title = "Gender Densities", 
    x = "Gender", y = "Density", colour = "Gender") +
        my_theme() +
        theme(axis.text.x = element_text(angle = 50),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())  +
        theme(legend.background = element_rect(size = 0.1), 
        legend.position = "bottom", legend.direction = "horizontal") +
        theme(plot.title = element_text(size = 14, hjust = 0.5)) +
        labs(fill = "Gender"))
main=textGrob("Densities of Variables",gp=gpar(fontsize=30,font=3))
grid.arrange(Status_DENSITY, DAYOFTHEWEEK_DENSITY, GENDER_DENSITY, nrow = 3, top = main)

Clustering

wait_bin <- wait
wait_bin[,"DayOfTheWeek"][wait_bin[,"DayOfTheWeek"] == "Sunday"] = 1
wait_bin[,"DayOfTheWeek"][wait_bin[,"DayOfTheWeek"] == "Monday"] = 2
wait_bin[,"DayOfTheWeek"][wait_bin[,"DayOfTheWeek"] == "Tuesday"] = 3  
wait_bin[,"DayOfTheWeek"][wait_bin[,"DayOfTheWeek"] == "Wednesday"] = 4
wait_bin[,"DayOfTheWeek"][wait_bin[,"DayOfTheWeek"] == "Thursday"] = 5
wait_bin[,"DayOfTheWeek"][wait_bin[,"DayOfTheWeek"] == "Friday"] = 6
wait_bin[,"DayOfTheWeek"][wait_bin[,"DayOfTheWeek"] == "Saturday"] = 7
wait_bin$DayOfTheWeek %<>% as.integer()

wait_bin[,"Gender"][wait_bin[,"Gender"] == "M"] = 0
wait_bin[,"Gender"][wait_bin[,"Gender"] == "F"] = 1
wait_bin$Gender %<>% as.integer()

wait_bin$Status_Binary = 0

wait_bin[,"Status_Binary"][wait_bin[,"Status"] == "Show-Up"] = 0
wait_bin[,"Status_Binary"][wait_bin[,"Status"] == "No-Show"] = 1

wait.pca <- wait_bin %>% select(-c(AppointmentRegistration, AppointmentData, Status)) %>% PCA(graph = FALSE)

Variable PCA

fviz_pca_var(wait.pca , col.var="contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE # Avoid text overlapping
             )

fviz_pca_var %>% dput()

function (X, axes = c(1, 2), geom = c("arrow", "text"), repel = FALSE, 
    col.var = "black", alpha.var = 1, col.quanti.sup = "blue", 
    col.circle = "grey70", select.var = list(name = NULL, cos2 = NULL, 
        contrib = NULL), ...) 
{
    fviz(X, element = "var", axes = axes, geom = geom, color = col.var, 
        alpha = alpha.var, select = select.var, repel = repel, 
        col.col.sup = col.quanti.sup, col.circle = col.circle, 
        ...)
}

# Contributions of variables to PC1
fviz_contrib(wait.pca, choice = "var", axes = 1, top = 10)

# Contributions of variables to PC2
fviz_contrib(wait.pca, choice = "var", axes = 2, top = 10)

Individual PCA

fviz_pca_ind(wait.pca,
             label = "none", # hide individual labels
             habillage = wait$Status, # color by groups
             palette = c("#00AFBB", "#E7B800", "#FC4E07"),
             addEllipses = TRUE # Concentration ellipses
             )

Partioning Clustering

wait <- wait[,c(1,5,6,2,3,4,7:15)]

wait_bin %>%
  select(-c(AppointmentData, AppointmentRegistration, Status)) %>%
  sample_frac(0.005) %>% 
  scale() %>%
  kmeans(4,nstart=25) %>%
  fviz_cluster(data = wait_bin %>% select(-c(AppointmentData, AppointmentRegistration, Status)),
             main = "Partitioning Clustering Plot"
             ) +
  scale_color_brewer(palette = "Set2") +
  my_theme()

Okay, so clustering didn’t really come out all that well…

Hierarchical Clustering of Factors

HCPC(wait_bin %>% select(-c(AppointmentRegistration, AppointmentData, Status)) %>% sample_frac(0.0004) %>% PCA(graph = FALSE), graph=FALSE) %>% plot(choice = "3D.map")

Machine Learning Predictions

wait$Gender %<>% as.character()
wait$DayOfTheWeek %<>% as.character()
temp_wait <- wait %>% dplyr::select(-c(AppointmentRegistration, AppointmentData))
temp_wait$Status %<>% as.character()
temp_wait[,"Status"][temp_wait[,"Status"] == "Show-Up"] = "Show"
temp_wait[,"Status"][temp_wait[,"Status"] == "No-Show"] = "No"
temp_wait$Status %<>% as.factor()
data <- sample(2, nrow(temp_wait), replace = T, prob = c(0.7, 0.3))
train <- temp_wait[data == 1,]
test <- temp_wait[data == 2,]

R-Part

rp <- rpart(Status~ Age + AwaitingTime + Alcoholism + Hypertension + Scholarship + Smokes + Sms_Reminder,
            train,
            method = "class",
            maxdepth = 4,
            minsplit = 2,
            minbucket = 1,
            cp=-1)
printcp(rp)


Classification tree:
rpart(formula = Status ~ Age + AwaitingTime + Alcoholism + Hypertension + 
    Scholarship + Smokes + Sms_Reminder, data = train, method = "class", 
    maxdepth = 4, minsplit = 2, minbucket = 1, cp = -1)

Variables actually used in tree construction:
[1] Age          Alcoholism   AwaitingTime Scholarship  Smokes      
[6] Sms_Reminder

Root node error: 63307/209703 = 0.30189

n= 209703 

  CP nsplit rel error xerror      xstd
1  0      0         1      1 0.0033208
2 -1     15         1      1 0.0033208

#rpart.plot::rpart.plot(rp, type = 2, fallen.leaves = FALSE, extra = 4)
rattle::fancyRpartPlot(rp, sub="", main = "Recursive Partioning of Attendance")

testPred <- predict(rp, newdata = test)
predictability <- sum(testPred == test$Status)/ length(test$Status)*100
predictability

[1] 0

Rpart_Pred <- signif(predictability,4)
caret::varImp(rp)

               Overall
Age          1497.3918
Alcoholism    255.1612
AwaitingTime 1494.7662
Hypertension  323.2188
Scholarship   348.2283
Smokes        382.7392
Sms_Reminder  181.5362

With R Part, we recieve an accuracy of 0%. We can tell from the Decision Tree that it isn’t all that great.

Neural Network

model_nnet <- nnet(formula = Status ~ ., data=temp_wait, size=4, decay=0.0001, maxit = 700, trace= FALSE)

x = temp_wait[,2:ncol(temp_wait)]
y = temp_wait[,1]
pred <- predict(model_nnet,x,type="class") 
pred %<>% as.factor()
cfm_nnet <- confusionMatrix(pred, y$Status)
cfm_nnet

Confusion Matrix and Statistics

          Reference
Prediction     No   Show
      No        0      0
      Show  90731 209269
                                             
               Accuracy : 0.6976             
                 95% CI : (0.6959, 0.6992)   
    No Information Rate : 0.6976             
    P-Value [Acc > NIR] : 0.5009             
                                             
                  Kappa : 0                  
 Mcnemar's Test P-Value : <0.0000000000000002
                                             
            Sensitivity : 0.0000             
            Specificity : 1.0000             
         Pos Pred Value :    NaN             
         Neg Pred Value : 0.6976             
             Prevalence : 0.3024             
         Detection Rate : 0.0000             
   Detection Prevalence : 0.0000             
      Balanced Accuracy : 0.5000             
                                             
       'Positive' Class : No

nnet_pred <- signif(cfm_nnet$overall[[1]]*100,4)
nnet_plot <- plot.nnet(model_nnet, alpha.val = 0.5, cex= 0.7, circle.col = list('lightblue', 'white'), bord.col = 'black')

For a Neural Network, we receive an accuracy of 69.76%.

C5 Bagging Cart

wait_ml <- temp_wait %>% sample_frac(.05) 
x = wait_ml[,2:ncol(wait_ml)]
y = wait_ml [,1]

model_bag <- bagging(Status ~ . , data=wait_ml) 
pred <- predict(model_bag,x) 
l <- union(pred,y)
cfm_bag <- confusionMatrix(pred, y$Status)
cfm_bag

Confusion Matrix and Statistics

          Reference
Prediction    No  Show
      No    3775   269
      Show   693 10263
                                               
               Accuracy : 0.9359               
                 95% CI : (0.9318, 0.9397)     
    No Information Rate : 0.7021               
    P-Value [Acc > NIR] : < 0.00000000000000022
                                               
                  Kappa : 0.8424               
 Mcnemar's Test P-Value : < 0.00000000000000022
                                               
            Sensitivity : 0.8449               
            Specificity : 0.9745               
         Pos Pred Value : 0.9335               
         Neg Pred Value : 0.9367               
             Prevalence : 0.2979               
         Detection Rate : 0.2517               
   Detection Prevalence : 0.2696               
      Balanced Accuracy : 0.9097               
                                               
       'Positive' Class : No

Cart_pred <- signif(cfm_bag$overall[[1]]*100,4)

For a C5 Bagging CART we receive an accuracy of 93.59%.

Conclusion

C5 Bagging CART was the best accuracy we received with 93.59%.

Neural Network Accuracy = 69.76%. Rpart Accuracy = 0%.

I wanted to try out PCA and other Clustering Methods with the “factoMineR” library, and although it gave some pretty cool outputs, it wasn’t really useful with this Dataset.

Doctor Wait Times

Benjamin Lott

February 28, 2017

EDA