Question 1

You have been hired by a local electronics retailer and the above dataset has been given to you. Manager Bayes Jr.9th wants to create a spreadsheet to predict is a customer is likely prospect. To that end:
1) Compute prior probabilities for the Prospect Yes/No
2) Compute the conditional probabilities
P(age-group=youth|prospect=yes) and P(age-group=youth|prospect=no)
where age-group is a predictor variable.
Compute the conditional probabilities for each predictor variable namely (age_group networth status credit_rating) 3) Assuming the assumptions of Naive Bayes are met compute the posterior probability
P(prospect|X) where X is one of the predictor variable.

##    agegroup networth     status    credit prospect
## 1     youth     high   employed      fair       no
## 2     youth     high   employed excellent       no
## 3    middle     high   employed      fair      yes
## 4    senior   medium   employed      fair      yes
## 5    senior      low unemployed      fair      yes
## 6    senior      low unemployed excellent       no
## 7    middle      low unemployed excellent      yes
## 8     youth   medium   employed      fair       no
## 9     youth      low unemployed      fair      yes
## 10   senior   medium unemployed      fair      yes
## 11    youth   medium unemployed excellent      yes
## 12   middle   medium   employed excellent      yes
## 13   middle     high unemployed      fair      yes
## 14   senior   medium   employed excellent       no

Calculate Prior Probabilites:

## [1] 0.64
## [1] 0.36
## [1] 0.36
## [1] 0.29
## [1] 0.36
## [1] 0.29
## [1] 0.29
## [1] 0.43
## [1] 0.5
## [1] 0.5
## [1] 0.57
## [1] 0.43

Calculate Conditional Probabilities:

# Conditional Probability: P(B|A) = P(A and B)/P(A)

cond_agegroup_youth_given_prospect_yes <- round(nrow(prospect[prospect$agegroup=="youth" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_agegroup_middle_given_prospect_yes <- round(nrow(prospect[prospect$agegroup=="middle" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_agegroup_senior_given_prospect_yes <- round(nrow(prospect[prospect$agegroup=="senior" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)

cond_agegroup_youth_given_prospect_no <- round(nrow(prospect[prospect$agegroup=="youth" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_agegroup_middle_given_prospect_no <- round(nrow(prospect[prospect$agegroup=="middle" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_agegroup_senior_given_prospect_no <- round(nrow(prospect[prospect$agegroup=="senior" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)

cond_networth_high_given_prospect_yes <- round(nrow(prospect[prospect$networth=="high" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_networth_low_given_prospect_yes <- round(nrow(prospect[prospect$networth=="low" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_networth_medium_given_prospect_yes <- round(nrow(prospect[prospect$networth=="medium" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)

cond_networth_high_given_prospect_no <- round(nrow(prospect[prospect$networth=="high" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_networth_low_given_prospect_no <- round(nrow(prospect[prospect$networth=="low" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_networth_medium_given_prospect_no <- round(nrow(prospect[prospect$networth=="medium" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)

cond_status_employed_given_prospect_yes <- round(nrow(prospect[prospect$status=="employed" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_status_unemployed_given_prospect_yes <- round(nrow(prospect[prospect$status=="unemployed" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)

cond_status_employed_given_prospect_no <- round(nrow(prospect[prospect$status=="employed" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_status_unemployed_given_prospect_no <- round(nrow(prospect[prospect$status=="unemployed" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)

cond_credit_fair_given_prospect_yes <- round(nrow(prospect[prospect$credit=="fair" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_credit_excellent_given_prospect_yes <- round(nrow(prospect[prospect$credit=="excellent" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)

cond_credit_fair_given_prospect_no <- round(nrow(prospect[prospect$credit=="fair" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_credit_excellent_given_prospect_no <- round(nrow(prospect[prospect$credit=="excellent" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)

cond_agegroup_youth_given_prospect_yes
## [1] 0.22
## [1] 0.44
## [1] 0.33
## [1] 0.6
## [1] 0
## [1] 0.4
## [1] 0.22
## [1] 0.33
## [1] 0.44
## [1] 0.4
## [1] 0.2
## [1] 0.4
## [1] 0.33
## [1] 0.67
## [1] 0.8
## [1] 0.2
## [1] 0.67
## [1] 0.33
## [1] 0.4
## [1] 0.6

Calculate Posterior Probabilities:

# Posterior Probability: P(A|B) = P(B|A)*P(A)/P(B)

post_prospect_yes_given_agegroup_youth <- (cond_agegroup_youth_given_prospect_yes*prior_prospect_yes)/prior_agegroup_youth
post_prospect_yes_given_agegroup_middle <- (cond_agegroup_middle_given_prospect_yes*prior_prospect_yes)/prior_agegroup_middle
post_prospect_yes_given_agegroup_senior <- (cond_agegroup_senior_given_prospect_yes*prior_prospect_yes)/prior_agegroup_senior

post_prospect_no_given_agegroup_youth <- (cond_agegroup_youth_given_prospect_no*prior_prospect_no)/prior_agegroup_youth
post_prospect_no_given_agegroup_middle <- (cond_agegroup_middle_given_prospect_no*prior_prospect_no)/prior_agegroup_middle
post_prospect_no_given_agegroup_senior <- (cond_agegroup_senior_given_prospect_no*prior_prospect_no)/prior_agegroup_senior

post_prospect_yes_given_networth_high <- (cond_networth_high_given_prospect_yes*prior_prospect_yes)/prior_networth_high
post_prospect_yes_given_networth_low <- (cond_networth_low_given_prospect_yes*prior_prospect_yes)/prior_networth_low
post_prospect_yes_given_networth_medium <- (cond_networth_medium_given_prospect_yes*prior_prospect_yes)/prior_networth_medium

post_prospect_no_given_networth_high <- (cond_agegroup_youth_given_prospect_no*prior_prospect_no)/prior_agegroup_youth
post_prospect_no_given_networth_low <- (cond_networth_low_given_prospect_no*prior_prospect_no)/prior_networth_low
post_prospect_no_given_networth_medium <- (cond_networth_medium_given_prospect_no*prior_prospect_no)/prior_networth_medium

post_prospect_yes_given_status_employed <- (cond_status_employed_given_prospect_yes*prior_prospect_yes)/prior_status_employed
post_prospect_yes_given_status_unemployed <- (cond_status_unemployed_given_prospect_yes*prior_prospect_yes)/prior_status_unemployed

post_prospect_no_given_status_employed <- (cond_status_employed_given_prospect_no*prior_prospect_no)/prior_status_employed
post_prospect_no_given_status_unemployed <- (cond_status_unemployed_given_prospect_no*prior_prospect_no)/prior_status_unemployed

post_prospect_yes_given_credit_fair <- (cond_credit_fair_given_prospect_yes*prior_prospect_yes)/prior_credit_fair
post_prospect_yes_given_credit_excellent <- (cond_credit_excellent_given_prospect_yes*prior_prospect_yes)/prior_credit_excellent

post_prospect_no_given_credit_fair <- (cond_credit_fair_given_prospect_no*prior_prospect_no)/prior_credit_fair
post_prospect_no_given_status_unemployed <- (cond_credit_excellent_given_prospect_no*prior_prospect_no)/prior_credit_excellent


post_prospect_yes_given_agegroup_youth
## [1] 0.3911111
## [1] 0.9710345
## [1] 0.5866667
## [1] 0.6
## [1] 0
## [1] 0.4
## [1] 0.4855172
## [1] 0.7282759
## [1] 0.6548837
## [1] 0.6
## [1] 0.2482759
## [1] 0.3348837
## [1] 0.4224
## [1] 0.8576
## [1] 0.576
## [1] 0.5023256
## [1] 0.7522807
## [1] 0.4911628
## [1] 0.2526316
## [1] 0.5023256

Question 2

You just recently joined a datascience team. There are two datasets junk1.txt and junk2.csv They have two options 1. They can go back to the client and ask for more data to remedy problems with the data. 2. They can accept the data and undertake a major analytics exercise.

The team is relying on your dsc skills to determine how they should proceed. Can you explore the data and recommend actions for each file enumerating the reasons.

##           a         b class
## 1 1.6204214 3.0036241     1
## 2 1.4340220 0.7852487     1
## 3 2.4766615 0.9367761     1
## 4 0.5283093 0.1196222     1
## 5 1.0054081 0.7872866     1
## 6 1.1032636 0.7330594     1
##            a           b class
## 1  3.1886481  0.92917735     0
## 2  0.8224527  0.04760314     0
## 3  0.8147247  0.02910931     0
## 4 -1.5065362  3.13231360     0
## 5  0.4426887  2.84942822     0
## 6  0.8564405 -0.66143851     0
##        a                  b                class    
##  Min.   :-2.29854   Min.   :-3.17174   Min.   :1.0  
##  1st Qu.:-0.85014   1st Qu.:-1.04712   1st Qu.:1.0  
##  Median :-0.04754   Median :-0.07456   Median :1.5  
##  Mean   : 0.04758   Mean   : 0.01324   Mean   :1.5  
##  3rd Qu.: 1.09109   3rd Qu.: 1.05342   3rd Qu.:2.0  
##  Max.   : 3.00604   Max.   : 3.10230   Max.   :2.0
##        a                  b                class       
##  Min.   :-4.16505   Min.   :-3.90472   Min.   :0.0000  
##  1st Qu.:-1.01447   1st Qu.:-0.89754   1st Qu.:0.0000  
##  Median : 0.08754   Median :-0.08358   Median :0.0000  
##  Mean   :-0.05126   Mean   : 0.05624   Mean   :0.0625  
##  3rd Qu.: 0.89842   3rd Qu.: 1.00354   3rd Qu.:0.0000  
##  Max.   : 4.62647   Max.   : 4.31052   Max.   :1.0000
## 'data.frame':    100 obs. of  3 variables:
##  $ a    : num  1.62 1.434 2.477 0.528 1.005 ...
##  $ b    : num  3.004 0.785 0.937 0.12 0.787 ...
##  $ class: int  1 1 1 1 1 1 1 1 1 1 ...
## 'data.frame':    4000 obs. of  3 variables:
##  $ a    : num  3.189 0.822 0.815 -1.507 0.443 ...
##  $ b    : num  0.9292 0.0476 0.0291 3.1323 2.8494 ...
##  $ class: int  0 0 0 0 0 0 0 0 0 0 ...

## 
##  1  2 
## 50 50
## 
##    0    1 
## 3750  250
##     a     b class 
##     0     0     0
##     a     b class 
##     0     0     0
Junk1 - Looks like a very small dataset, exactly balanced (50% belonging to each class). Distribution of predictor b is quasi normal, but a shows right-skewed and bi-modal. No missing data present. I would ask for more data that is more representative of the real class distribution (not 50/50)
Junk2 - Looks like a good size dataset, class unbalanced. Distributions of the predictions are somewhat left and right skewed (a & b respectively). No missing data present. Multiple outliers across the two predictors, but not that far away from the maximum values. I would accept this data set and proceed with further analysis

Question 3

Please find kNN.R. This R script requires a dataset, labelcol and K (number of nearest neighbors to be considered)

The dataset MUST Be numeric, except the labelcol The labelcol must be the last column in the data.frame All the other columns must be before the labelcol

To DO:

Please find icu.csv The formula to fit is “STA ~ TYP + COMA + AGE + INF”

Read the icu.csv subset it with these 5 features in the formula and STA is the labelcol.

Split the icu 70/30 train/test and run the kNN.R for K=(3,5,7,15,25,50)

submit the result confusionMatrix, Accuracy for each K

Plot Accuracy vs K.

write a short summary of your findings.

euclideanDist <- function(a, b){
  d = 0
  for(i in c(1:(length(a)) ))
  {
    d = d + (a[[i]]-b[[i]])^2
  }
  d = sqrt(d)
  return(d)
}

knn_predict2 <- function(test_data, train_data, k_value, labelcol){
  pred <- c()  #empty pred vector 
  #LOOP-1
  for(i in c(1:nrow(test_data))){   #looping over each record of test data
    eu_dist =c()          #eu_dist & eu_char empty  vector
    eu_char = c()
    good = 0              #good & bad variable initialization with 0 value
    bad = 0
    
    #LOOP-2-looping over train data 
    for(j in c(1:nrow(train_data))){
 
      #adding euclidean distance b/w test data point and train data to eu_dist vector
      eu_dist <- c(eu_dist, euclideanDist(test_data[i,-c(labelcol)], train_data[j,-c(labelcol)]))
 
      #adding class variable of training data in eu_char
      eu_char <- c(eu_char, as.character(train_data[j,][[labelcol]]))
    }
    
    eu <- data.frame(eu_char, eu_dist) #eu dataframe created with eu_char & eu_dist columns
 
    eu <- eu[order(eu$eu_dist),]       #sorting eu dataframe to gettop K neighbors
    eu <- eu[1:k_value,]               #eu dataframe with top K neighbors
 
    tbl.sm.df<-table(eu$eu_char)
    cl_label<-  names(tbl.sm.df)[[as.integer(which.max(tbl.sm.df))]]
    
    pred <- c(pred, cl_label)
    }
    return(pred) #return pred vector
  }
  

accuracy <- function(test_data,labelcol,predcol){
  correct = 0
  for(i in c(1:nrow(test_data))){
    if(test_data[i,labelcol] == test_data[i,predcol]){ 
      correct = correct+1
    }
  }
  accu = (correct/nrow(test_data)) * 100  
  return(accu)
}

#load data
icu_original <- as.data.frame(read.csv("icu.csv"))
icu_original["COMA"] <- ifelse(icu_original$LOC==2,1,0)
icu <- icu_original[,c("TYP", "COMA", "AGE", "INF", "STA")]
knn.df<-icu
labelcol <- 5 # for icu, rearranged STA as the fifth column
predictioncol<-labelcol+1
# create train/test partitions
set.seed(2)
n<-nrow(knn.df)
knn.df<- knn.df[sample(n),]

train.df <- knn.df[1:as.integer(0.7*n),]

#K = 3 # number of neighbors to determine the class
table(train.df[,labelcol])
## 
##   0   1 
## 107  33
## 
##  0  1 
## 53  7
## [1] 80
##    
##      0  1
##   0 48  7
##   1  5  0
## [1] 85
##    
##      0  1
##   0 50  6
##   1  3  1
## [1] 83.33333
##    
##      0  1
##   0 50  7
##   1  3  0
## [1] 88.33333
##    
##      0  1
##   0 53  7
## [1] 88.33333
##    
##      0  1
##   0 53  7
## [1] 88.33333
##    
##      0  1
##   0 53  7

After running the KNN classifier for the following number of neighbors: K values = 3,5,7,15,25,50, the accuracy gets highest for one class at K=15 and is maintained (flattens) for the rest of the neighbor settings (25,50). At K=15 and above, the classifier predicts all of the classes as 0 and since the test set contains mostly classes 0, the classifier shows perfect performance. One additional observation is that at K=5, the classifier shows lower accuracy (83.3) but at least is able to get one of the other classes right, class 1.
In summary, at K=15 and above, the classifier created becomes and excellent one to classify observations of class 0, but it is definitely not reliable to classify others, therefore a new classifier needs to be created to do the job for class 1