Data 2 - Homework #1

#clear the workspace
rm(list = ls())




#load req's packages
library(knitr)
library(psych)
library(tidyr)
library(ggplot2)

#clear out the cache
clean_cache(clean = TRUE, path = opts_chunk$get("cache.path"))

## NULL

df <- read.csv("https://raw.githubusercontent.com/plb2018/DATA622/master/homework1/data/prospecting_dataset.txt", header=T)

Question 1

kable(df,caption="Prospecting Dataset")

Prospecting Dataset
age.group	networth	status	credit_rating	class.prospect
youth	high	employed	fair	no
youth	high	employed	excellent	no
middle	high	employed	fair	yes
senior	medium	employed	fair	yes
senior	low	unemployed	fair	yes
senior	low	unemployed	excellent	no
middle	low	unemployed	excellent	yes
youth	medium	employed	fair	no
youth	low	unemployed	fair	yes
senior	medium	unemployed	fair	yes
youth	medium	unemployed	excellent	yes
middle	medium	employed	excellent	yes
middle	high	unemployed	fair	yes
senior	medium	employed	excellent	no

You have been hired by a local electronics retailer and the above dataset has been given to you. Manager Bayes Jr.9th wants to create a spreadsheet to predict is a customer is likely prospect. To that end:

Compute prior probabilities for the Prospect Yes/No
Compute the conditional probabilities \(P(age-group=youth|prospect=yes)\) and \(P(age-group=youth|prospect=no)\) where age-group is a predictor variable. Compute the conditional probabilities for each predictor variable, namely,\((age\_group,networth,status,credit\_rating)\)
Assuming the assumptions of Naive Bayes are met, compute the posterior probability \(P(prospect|X)\) where X is one of the predictor variable.

1 Compute prior probabilities for the Prospect Yes/No

#group the "yes" and "no" to simplify things
yes <- df[df$class.prospect == "yes",]
no <- df[!df$class.prospect == "yes",] 

#compute priors
p.yes <- nrow(yes)/nrow(df)
p.no <- 1-p.yes

#format for output
probs <- data.frame(c(p.yes,p.no))
row.names(probs) <- c("Yes","No")
colnames(probs) <- c("Prior.Probs")

#output
kable(probs)

	Prior.Probs
Yes	0.6428571
No	0.3571429

2 Compute the conditional probabilities

Compute the conditional probabilities \(P(age-group=youth|prospect=yes)\) and \(P(age-group=youth|prospect=no)\) where age-group is a predictor variable. Compute the conditional probabilities for each predictor variable, namely,\((age\_group,networth,status,credit\_rating)\)

name <- c("P(age-group=youth|prospect=yes)",
"P(age-group=middle|prospect=yes)",
"P(age-group=senior|prospect=yes)",
"P(age-group=youth|prospect=no)",
"P(age-group=middle|prospect=no)",
"P(age-group=senior|prospect=no)",
"P(networth=high|prospect=yes)",
"P(networth=low|prospect=yes)",
"P(networth=medium|prospect=yes)",
"P(networth=high|prospect=no)",
"P(networth=low|prospect=no)",
"P(networth=medium|prospect=no)",
"P(status=employed|prospect=yes)",
"P(status=employed|prospect=no)",
"P(status=unemployed|prospect=yes)",
"P(status=unemployed|prospect=no)",
"P(credit=fair|prospect=yes)",
"P(credit=excellent|prospect=no)",
"P(credit=fair|prospect=yes)",
"P(credit=excellent|prospect=no)")

#manually compute all 20 priors
value <- vector(mode = "list", length = length(name))

value[1]  <- sum(yes$age.group == "youth")/ nrow(yes)         #P(age-group=youth|prospect=yes)
value[2]  <- sum(yes$age.group == "middle")/ nrow(yes)        #P(age-group=middle|prospect=yes)
value[3]  <- sum(yes$age.group == "senior")/ nrow(yes)        #P(age-group=senior|prospect=yes)
value[4]  <- sum(no$age.group == "youth")/ nrow(no)           #P(age-group=youth|prospect=no)",
value[5]  <- sum(no$age.group == "middle")/ nrow(no)          #P(age-group=middle|prospect=no)"
value[6]  <- sum(no$age.group == "senior")/ nrow(no)          #P(age-group=senior|prospect=no)
value[7]  <- sum(yes$networth == "high")/ nrow(yes)           #P(networth=high|prospect=yes)
value[8]  <- sum(yes$networth == "low")/ nrow(yes)            #P(networth=low|prospect=yes)
value[9]  <- sum(yes$networth == "medium")/ nrow(yes)         #P(networth=medium|prospect=yes)
value[10] <- sum(no$networth == "high")/ nrow(no)             #P(networth=high|prospect=no)
value[11] <- sum(no$networth == "low")/ nrow(no)              #P(networth=low|prospect=no)
value[12] <- sum(no$networth == "medium")/ nrow(no)           #P(networth=medium|prospect=no)
value[13] <- sum(yes$status == "employed")/ nrow(yes)         #P(status=employed|prospect=yes)
value[14] <- sum(no$status == "employed")/ nrow(no)           #P(status=employed|prospect=no)
value[15] <- sum(yes$status == "unemployed")/ nrow(yes)       #P(status=unemployed|prospect=yes)
value[16] <- sum(no$status == "unemployed")/ nrow(no)         #P(status=unemployed|prospect=no)
value[17] <- sum(yes$credit_rating == "fair")/ nrow(yes)      #P(credit=fair|prospect=yes)
value[18] <- sum(no$credit_rating == "fair")/ nrow(no)        #P(credit=excellent|prospect=no)
value[19] <- sum(yes$credit_rating == "excellent")/ nrow(yes) #P(credit=fair|prospect=yes)
value[20] <- sum(no$credit_rating == "excellent")/ nrow(no)   #P(credit=excellent|prospect=no)

#bind names and values
q1 <- data.frame(cbind(name,value))

#output as a table
kable(q1)

name	value
P(age-group=youth\|prospect=yes)	0.222222222222222
P(age-group=middle\|prospect=yes)	0.444444444444444
P(age-group=senior\|prospect=yes)	0.333333333333333
P(age-group=youth\|prospect=no)	0.6
P(age-group=middle\|prospect=no)	0
P(age-group=senior\|prospect=no)	0.4
P(networth=high\|prospect=yes)	0.222222222222222
P(networth=low\|prospect=yes)	0.333333333333333
P(networth=medium\|prospect=yes)	0.444444444444444
P(networth=high\|prospect=no)	0.4
P(networth=low\|prospect=no)	0.2
P(networth=medium\|prospect=no)	0.4
P(status=employed\|prospect=yes)	0.333333333333333
P(status=employed\|prospect=no)	0.8
P(status=unemployed\|prospect=yes)	0.666666666666667
P(status=unemployed\|prospect=no)	0.2
P(credit=fair\|prospect=yes)	0.666666666666667
P(credit=excellent\|prospect=no)	0.4
P(credit=fair\|prospect=yes)	0.333333333333333
P(credit=excellent\|prospect=no)	0.6

3 Assuming the assumptions of Naive Bayes are met, compute the posterior probability \(P(prospect|X)\) where X is one of the predictor variable.

It appears as though the assumptions of naive bayes are satisfied in that sense that it is plausible that all of the categories of predictors are independent. It could be argued that things like “networth”, “credit” and “employment” are not independent in practice… but here we will assume that they if only because I’d like to answer part 3 of the question.

We will look at the posterior probability of credit given prospect.

prob.employed <- 7/14
prob.unemployed <- 1-prob.employed

prob.yes.given.employed <- value[[13]]
prob.no.given.employed <- value[[14]]
prob.yes.given.unemployed <- value[[15]]
prob.no.given.unemployed <- value[[16]] 

prob.employed.given.yes <- (prob.yes.given.employed * prob.employed )/((prob.yes.given.employed *prob.employed) + (prob.yes.given.unemployed * prob.unemployed)) 

prob.employed.given.no <-  (prob.yes.given.unemployed * prob.unemployed) /((prob.yes.given.employed *prob.employed) + (prob.yes.given.unemployed * prob.unemployed)) 


prob.unemployed.given.yes <- 1-prob.employed.given.yes
prob.unemployed.given.no <- 1-prob.employed.given.no

e <- c(prob.employed.given.yes,prob.employed.given.no)
u <- c(prob.unemployed.given.yes,prob.unemployed.given.no)

out <- cbind(e,u)

row.names(out) <- c("Prospect.Yes", "Prospect.No")
colnames(out) <- c("Employed","Unemployed")

kable(out)

	Employed	Unemployed
Prospect.Yes	0.3333333	0.6666667
Prospect.No	0.6666667	0.3333333

Question 2

You just recently joined a datascience team.

There are two datasets junk1.txt and junk2.csv They have two options 1. They can go back to the client and ask for more data to remedy problems with the data. 2. They can accept the data and undertake a major analytics exercise.

The team is relying on your dsc skills to determine how they should proceed.

Can you explore the data and recommend actions for each file enumerating the reasons.

Initial Observations

Upon loading the files I note that they have different extentions and different delimiters, despite looking reasonably similar otherwise. The inconsistencies between these 2 files would probably cause me to approach them with a higher level of scrutiny than would otherwise be the case.

In addition, I would likely request additional data regarding the context of the data. What is the process that produces these data? What info is there available about the origin of the data. This data would be the basis for forming some intuition about the data - intuition is essential in data science.

#load the data
junk1 <- read.csv("https://raw.githubusercontent.com/plb2018/DATA622/master/homework1/data/junk1.txt", header=T,sep = " ")
junk2 <- read.csv("https://raw.githubusercontent.com/plb2018/DATA622/master/homework1/data/junk2.csv", header=T, sep = ",")

Inspect the data

In both cases we have data in “long” format where each each row contains a full set of observations or a “case”. We have variables “a” & “b” which are floating point and apparently continuous and a “class” variable which appears to be an integer and based on the name, likely a categorical variable.

kable(head(junk1,5))

a	b	class
1.6204214	3.0036241	1
1.4340220	0.7852487	1
2.4766615	0.9367761	1
0.5283093	0.1196222	1
1.0054081	0.7872866	1

kable(head(junk2,5))

a	b	class
3.1886481	0.9291774	0
0.8224527	0.0476031	0
0.8147247	0.0291093	0
-1.5065362	3.1323136	0
0.4426887	2.8494282	0

Look for missing cases

Next we check for missing values and find that there are no incomplete cases in either of the files.

missing.data <- data.frame(c(sum(!complete.cases(junk1)),sum(!complete.cases(junk2))))
row.names(missing.data) <- c("Junk1","Junk2")
colnames(missing.data) <- c("Number of Missing Values")

kable(missing.data)

	Number of Missing Values
Junk1	0
Junk2	0

Summary Statistics

Now we compute summary statistics for reference. It may be difficult to deduce anything from the summary stats alone without actually inspecting the data visually, however, we compute them now so that we can refer back to them as needed.

describe(junk1)

describe(junk2)

Visualize the Data

From the plots below we can see that:

Junk1

Both A and B appear to be reasonably normally distributed around zero. B appears to be bi-modal however this is likely a resolution issue due to small sample size. Density plots are also shown as they better illustrate the shape of the distributions given the sample size.
The pairs plot shows that A and B appear to be uncorrelated

junk1 %>%
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_histogram(bins=100)+
    ggtitle("Junk1 - Histograms")

junk1 %>%
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_density()+
    ggtitle("Junk1 - Density Plots")

par(mfrow=c(3,1))
plot(junk1$a,type='l')
plot(junk1$b,type='l')
plot(junk1$class,type='l')

pairs(junk1, main="Junk1 Pairs Plot")

Junk2

Similar story between Junk1 and Junk2, however, in the distributions of A and B almost look like mirror images of one another.
The timeseries makes it look as if A & B from Junk2 may be samples from several distinct processes. Series A has a level change at point 1000 and a level AND variability change at point 3000. Series B has similar changes at point 2000 and point 3000 respectively.
If we knew more about the process it might make sense to align a[1:1000] and b[2001:3000], a[1000:3000] and b[1:2000], a[3000:end], b[3000:end]

junk2 %>%
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_histogram()+
    ggtitle("Junk2 - Histograms")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

junk2 %>%
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_density()+
    ggtitle("Junk2 - Density Plots")

par(mfrow=c(3,1))
plot(junk2$a,type='l')
plot(junk2$b,type='l')
plot(junk2$class,type='l')

pairs(junk2, main="Junk2 Pairs Plot")

Overall Assessment

I would request more information about the context and process to help drive context about the data. Otherwise we see that

The data appear to be similar to one another (junk1,junk2) and appea to be gaussian with a mean of apporx zero.
The data appear to be normally distributed in both cases
The data appear to be uncorrelated
The line-plot for data be shows a strange pattern which makes me wonder whether A and B might be reported out of synch or something

Reccomendations

-Determine whether these two data sets can be combined (based only on the character of the data, it appears yes) -Determine why the variance in Junk2 isn’t stable and either transform (standarize variables using BoxCox, for example) or re-organize into more variables as appropriate

Question 3

Read the icu.csv subset it with these 5 features in the formula and STA is the labelcol. - Split the icu 70/30 train/test and - run the kNN.R for K=(3,5,7,15,25,50)

Submit the result confusionMatrix, Accuracy for each K & Plot Accuracy vs K.

write a short summary of your findings.

Load ICU

icu <- read.csv("https://raw.githubusercontent.com/plb2018/DATA622/master/homework1/data/icu.csv", header=T, sep = ",")

#add coma variable as per instructions
icu$COMA <- 0
icu$COMA[icu$LOC == 2] <-1

#subset keeping STA as the LAST column
knn.in <- icu[c("TYP","COMA","AGE","INF","STA")]

Load KNN.r

euclideanDist <- function(a, b){
  d = 0
  for(i in c(1:(length(a)) ))
  {
    d = d + (a[[i]]-b[[i]])^2
  }
  d = sqrt(d)
  return(d)
}

knn_predict2 <- function(test_data, train_data, k_value, labelcol){
  pred <- c()  #empty pred vector 
  #LOOP-1
  for(i in c(1:nrow(test_data))){   #looping over each record of test data
    eu_dist =c()          #eu_dist & eu_char empty  vector
    eu_char = c()
    good = 0              #good & bad variable initialization with 0 value
    bad = 0
    
    #LOOP-2-looping over train data 
    for(j in c(1:nrow(train_data))){
 
      #adding euclidean distance b/w test data point and train data to eu_dist vector
      eu_dist <- c(eu_dist, euclideanDist(test_data[i,-c(labelcol)], train_data[j,-c(labelcol)]))
 
      #adding class variable of training data in eu_char
      eu_char <- c(eu_char, as.character(train_data[j,][[labelcol]]))
    }
    
    eu <- data.frame(eu_char, eu_dist) #eu dataframe created with eu_char & eu_dist columns
 
    eu <- eu[order(eu$eu_dist),]       #sorting eu dataframe to gettop K neighbors
    eu <- eu[1:k_value,]               #eu dataframe with top K neighbors
 
    tbl.sm.df<-table(eu$eu_char)
    cl_label<-  names(tbl.sm.df)[[as.integer(which.max(tbl.sm.df))]]
    
    pred <- c(pred, cl_label)
    }
    return(pred) #return pred vector
  }
  

accuracy <- function(test_data,labelcol,predcol){
  correct = 0
  for(i in c(1:nrow(test_data))){
    if(test_data[i,labelcol] == test_data[i,predcol]){ 
      correct = correct+1
    }
  }
  accu = (correct/nrow(test_data)) * 100  
  return(accu)
}

#load data
knn.df<- knn.in
labelcol <- 5 # for iris it is the fifth col 
predictioncol<-labelcol+1

# create train/test partitions
set.seed(2)
n<-nrow(knn.df)
knn.df<- knn.df[sample(n),]

train.df <- knn.df[1:as.integer(0.7*n),]

k.values <- c(3,5,7,15,25,50)

acc <- vector()

for (kval in k.values ){

  K = kval # number of neighbors to determine the class
  table(train.df[,labelcol])
  test.df <- knn.df[as.integer(0.7*n +1):n,]
  table(test.df[,labelcol])
  
  predictions <- knn_predict2(test.df, train.df, K,labelcol) #calling knn_predict()
  
  test.df[,predictioncol] <- predictions #Adding predictions in test data as 7th column
  print(paste0("The accuracy for K=",K," is ",accuracy(test.df,labelcol,predictioncol)))
  print("The Confustion Matrix is:")
  print(table(test.df[[predictioncol]],test.df[[labelcol]]))
  
  acc <- c(acc,accuracy(test.df,labelcol,predictioncol))

}

## [1] "The accuracy for K=3 is 80"
## [1] "The Confustion Matrix is:"
##    
##      0  1
##   0 48  7
##   1  5  0
## [1] "The accuracy for K=5 is 85"
## [1] "The Confustion Matrix is:"
##    
##      0  1
##   0 50  6
##   1  3  1
## [1] "The accuracy for K=7 is 83.3333333333333"
## [1] "The Confustion Matrix is:"
##    
##      0  1
##   0 50  7
##   1  3  0
## [1] "The accuracy for K=15 is 88.3333333333333"
## [1] "The Confustion Matrix is:"
##    
##      0  1
##   0 53  7
## [1] "The accuracy for K=25 is 88.3333333333333"
## [1] "The Confustion Matrix is:"
##    
##      0  1
##   0 53  7
## [1] "The accuracy for K=50 is 88.3333333333333"
## [1] "The Confustion Matrix is:"
##    
##      0  1
##   0 53  7

plot(acc,main="Accuracy Plot",
        xlab="Value of K",
        ylab="Accuracy",
        xaxt="n")



axis(1,at=1:6,labels = k.values)

Summary

Here I was able to successfully run the code as per the instructions. In this case (i.e. ICU data) we can see that accuracy continues to climb intul it reaches 15, where it remains steady.