You have been hired by a local electronics retailer and the above dataset has been given to you. Manager Bayes Jr.9th wants to create a spreadsheet to predict is a customer is likely prospect. To that end:
1) Compute prior probabilities for the Prospect Yes/No
2) Compute the conditional probabilities
P(age-group=youth|prospect=yes) and P(age-group=youth|prospect=no)
where age-group is a predictor variable.
Compute the conditional probabilities for each predictor variable namely (age_group networth status credit_rating) 3) Assuming the assumptions of Naive Bayes are met compute the posterior probability
P(prospect|X) where X is one of the predictor variable.
## agegroup networth status credit prospect
## 1 youth high employed fair no
## 2 youth high employed excellent no
## 3 middle high employed fair yes
## 4 senior medium employed fair yes
## 5 senior low unemployed fair yes
## 6 senior low unemployed excellent no
## 7 middle low unemployed excellent yes
## 8 youth medium employed fair no
## 9 youth low unemployed fair yes
## 10 senior medium unemployed fair yes
## 11 youth medium unemployed excellent yes
## 12 middle medium employed excellent yes
## 13 middle high unemployed fair yes
## 14 senior medium employed excellent no
# Prior Probability: Proportion of desired outcomes vs all outcomes
prior_prospect_yes <- round(nrow(prospect[prospect$prospect=="yes",])/nrow(prospect),2)
prior_prospect_no <- round(nrow(prospect[prospect$prospect=="no",])/nrow(prospect),2)
prior_agegroup_youth <- round(nrow(prospect[prospect$agegroup=="youth",])/nrow(prospect),2)
prior_agegroup_middle <- round(nrow(prospect[prospect$agegroup=="middle",])/nrow(prospect),2)
prior_agegroup_senior <- round(nrow(prospect[prospect$agegroup=="senior",])/nrow(prospect),2)
prior_networth_high <- round(nrow(prospect[prospect$networth=="high",])/nrow(prospect),2)
prior_networth_low <- round(nrow(prospect[prospect$networth=="low",])/nrow(prospect),2)
prior_networth_medium <- round(nrow(prospect[prospect$networth=="medium",])/nrow(prospect),2)
prior_status_employed <- round(nrow(prospect[prospect$status=="employed",])/nrow(prospect),2)
prior_status_unemployed <- round(nrow(prospect[prospect$status=="unemployed",])/nrow(prospect),2)
prior_credit_fair <- round(nrow(prospect[prospect$credit=="fair",])/nrow(prospect),2)
prior_credit_excellent <- round(nrow(prospect[prospect$credit=="excellent",])/nrow(prospect),2)
prior_prospect_yes
## [1] 0.64
## [1] 0.36
## [1] 0.36
## [1] 0.29
## [1] 0.36
## [1] 0.29
## [1] 0.29
## [1] 0.43
## [1] 0.5
## [1] 0.5
## [1] 0.57
## [1] 0.43
# Conditional Probability: P(B|A) = P(A and B)/P(A)
cond_agegroup_youth_given_prospect_yes <- round(nrow(prospect[prospect$agegroup=="youth" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_agegroup_middle_given_prospect_yes <- round(nrow(prospect[prospect$agegroup=="middle" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_agegroup_senior_given_prospect_yes <- round(nrow(prospect[prospect$agegroup=="senior" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_agegroup_youth_given_prospect_no <- round(nrow(prospect[prospect$agegroup=="youth" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_agegroup_middle_given_prospect_no <- round(nrow(prospect[prospect$agegroup=="middle" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_agegroup_senior_given_prospect_no <- round(nrow(prospect[prospect$agegroup=="senior" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_networth_high_given_prospect_yes <- round(nrow(prospect[prospect$networth=="high" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_networth_low_given_prospect_yes <- round(nrow(prospect[prospect$networth=="low" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_networth_medium_given_prospect_yes <- round(nrow(prospect[prospect$networth=="medium" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_networth_high_given_prospect_no <- round(nrow(prospect[prospect$networth=="high" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_networth_low_given_prospect_no <- round(nrow(prospect[prospect$networth=="low" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_networth_medium_given_prospect_no <- round(nrow(prospect[prospect$networth=="medium" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_status_employed_given_prospect_yes <- round(nrow(prospect[prospect$status=="employed" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_status_unemployed_given_prospect_yes <- round(nrow(prospect[prospect$status=="unemployed" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_status_employed_given_prospect_no <- round(nrow(prospect[prospect$status=="employed" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_status_unemployed_given_prospect_no <- round(nrow(prospect[prospect$status=="unemployed" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_credit_fair_given_prospect_yes <- round(nrow(prospect[prospect$credit=="fair" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_credit_excellent_given_prospect_yes <- round(nrow(prospect[prospect$credit=="excellent" & prospect$prospect=="yes",])/nrow(prospect[prospect$prospect=="yes",]),2)
cond_credit_fair_given_prospect_no <- round(nrow(prospect[prospect$credit=="fair" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_credit_excellent_given_prospect_no <- round(nrow(prospect[prospect$credit=="excellent" & prospect$prospect=="no",])/nrow(prospect[prospect$prospect=="no",]),2)
cond_agegroup_youth_given_prospect_yes
## [1] 0.22
## [1] 0.44
## [1] 0.33
## [1] 0.6
## [1] 0
## [1] 0.4
## [1] 0.22
## [1] 0.33
## [1] 0.44
## [1] 0.4
## [1] 0.2
## [1] 0.4
## [1] 0.33
## [1] 0.67
## [1] 0.8
## [1] 0.2
## [1] 0.67
## [1] 0.33
## [1] 0.4
## [1] 0.6
# Posterior Probability: P(A|B) = P(B|A)*P(A)/P(B)
post_prospect_yes_given_agegroup_youth <- (cond_agegroup_youth_given_prospect_yes*prior_prospect_yes)/prior_agegroup_youth
post_prospect_yes_given_agegroup_middle <- (cond_agegroup_middle_given_prospect_yes*prior_prospect_yes)/prior_agegroup_middle
post_prospect_yes_given_agegroup_senior <- (cond_agegroup_senior_given_prospect_yes*prior_prospect_yes)/prior_agegroup_senior
post_prospect_no_given_agegroup_youth <- (cond_agegroup_youth_given_prospect_no*prior_prospect_no)/prior_agegroup_youth
post_prospect_no_given_agegroup_middle <- (cond_agegroup_middle_given_prospect_no*prior_prospect_no)/prior_agegroup_middle
post_prospect_no_given_agegroup_senior <- (cond_agegroup_senior_given_prospect_no*prior_prospect_no)/prior_agegroup_senior
post_prospect_yes_given_networth_high <- (cond_networth_high_given_prospect_yes*prior_prospect_yes)/prior_networth_high
post_prospect_yes_given_networth_low <- (cond_networth_low_given_prospect_yes*prior_prospect_yes)/prior_networth_low
post_prospect_yes_given_networth_medium <- (cond_networth_medium_given_prospect_yes*prior_prospect_yes)/prior_networth_medium
post_prospect_no_given_networth_high <- (cond_agegroup_youth_given_prospect_no*prior_prospect_no)/prior_agegroup_youth
post_prospect_no_given_networth_low <- (cond_networth_low_given_prospect_no*prior_prospect_no)/prior_networth_low
post_prospect_no_given_networth_medium <- (cond_networth_medium_given_prospect_no*prior_prospect_no)/prior_networth_medium
post_prospect_yes_given_status_employed <- (cond_status_employed_given_prospect_yes*prior_prospect_yes)/prior_status_employed
post_prospect_yes_given_status_unemployed <- (cond_status_unemployed_given_prospect_yes*prior_prospect_yes)/prior_status_unemployed
post_prospect_no_given_status_employed <- (cond_status_employed_given_prospect_no*prior_prospect_no)/prior_status_employed
post_prospect_no_given_status_unemployed <- (cond_status_unemployed_given_prospect_no*prior_prospect_no)/prior_status_unemployed
post_prospect_yes_given_credit_fair <- (cond_credit_fair_given_prospect_yes*prior_prospect_yes)/prior_credit_fair
post_prospect_yes_given_credit_excellent <- (cond_credit_excellent_given_prospect_yes*prior_prospect_yes)/prior_credit_excellent
post_prospect_no_given_credit_fair <- (cond_credit_fair_given_prospect_no*prior_prospect_no)/prior_credit_fair
post_prospect_no_given_status_unemployed <- (cond_credit_excellent_given_prospect_no*prior_prospect_no)/prior_credit_excellent
post_prospect_yes_given_agegroup_youth
## [1] 0.3911111
## [1] 0.9710345
## [1] 0.5866667
## [1] 0.6
## [1] 0
## [1] 0.4
## [1] 0.4855172
## [1] 0.7282759
## [1] 0.6548837
## [1] 0.6
## [1] 0.2482759
## [1] 0.3348837
## [1] 0.4224
## [1] 0.8576
## [1] 0.576
## [1] 0.5023256
## [1] 0.7522807
## [1] 0.4911628
## [1] 0.2526316
## [1] 0.5023256
You just recently joined a datascience team. There are two datasets junk1.txt and junk2.csv They have two options 1. They can go back to the client and ask for more data to remedy problems with the data. 2. They can accept the data and undertake a major analytics exercise.
The team is relying on your dsc skills to determine how they should proceed. Can you explore the data and recommend actions for each file enumerating the reasons.
## a b class
## 1 1.6204214 3.0036241 1
## 2 1.4340220 0.7852487 1
## 3 2.4766615 0.9367761 1
## 4 0.5283093 0.1196222 1
## 5 1.0054081 0.7872866 1
## 6 1.1032636 0.7330594 1
## a b class
## 1 3.1886481 0.92917735 0
## 2 0.8224527 0.04760314 0
## 3 0.8147247 0.02910931 0
## 4 -1.5065362 3.13231360 0
## 5 0.4426887 2.84942822 0
## 6 0.8564405 -0.66143851 0
## a b class
## Min. :-2.29854 Min. :-3.17174 Min. :1.0
## 1st Qu.:-0.85014 1st Qu.:-1.04712 1st Qu.:1.0
## Median :-0.04754 Median :-0.07456 Median :1.5
## Mean : 0.04758 Mean : 0.01324 Mean :1.5
## 3rd Qu.: 1.09109 3rd Qu.: 1.05342 3rd Qu.:2.0
## Max. : 3.00604 Max. : 3.10230 Max. :2.0
## a b class
## Min. :-4.16505 Min. :-3.90472 Min. :0.0000
## 1st Qu.:-1.01447 1st Qu.:-0.89754 1st Qu.:0.0000
## Median : 0.08754 Median :-0.08358 Median :0.0000
## Mean :-0.05126 Mean : 0.05624 Mean :0.0625
## 3rd Qu.: 0.89842 3rd Qu.: 1.00354 3rd Qu.:0.0000
## Max. : 4.62647 Max. : 4.31052 Max. :1.0000
## 'data.frame': 100 obs. of 3 variables:
## $ a : num 1.62 1.434 2.477 0.528 1.005 ...
## $ b : num 3.004 0.785 0.937 0.12 0.787 ...
## $ class: int 1 1 1 1 1 1 1 1 1 1 ...
## 'data.frame': 4000 obs. of 3 variables:
## $ a : num 3.189 0.822 0.815 -1.507 0.443 ...
## $ b : num 0.9292 0.0476 0.0291 3.1323 2.8494 ...
## $ class: int 0 0 0 0 0 0 0 0 0 0 ...
##
## 1 2
## 50 50
##
## 0 1
## 3750 250
## a b class
## 0 0 0
## a b class
## 0 0 0
Please find kNN.R. This R script requires a dataset, labelcol and K (number of nearest neighbors to be considered)
The dataset MUST Be numeric, except the labelcol The labelcol must be the last column in the data.frame All the other columns must be before the labelcol
To DO:
Please find icu.csv The formula to fit is “STA ~ TYP + COMA + AGE + INF”
Read the icu.csv subset it with these 5 features in the formula and STA is the labelcol.
Split the icu 70/30 train/test and run the kNN.R for K=(3,5,7,15,25,50)
submit the result confusionMatrix, Accuracy for each K
Plot Accuracy vs K.
write a short summary of your findings.
euclideanDist <- function(a, b){
d = 0
for(i in c(1:(length(a)) ))
{
d = d + (a[[i]]-b[[i]])^2
}
d = sqrt(d)
return(d)
}
knn_predict2 <- function(test_data, train_data, k_value, labelcol){
pred <- c() #empty pred vector
#LOOP-1
for(i in c(1:nrow(test_data))){ #looping over each record of test data
eu_dist =c() #eu_dist & eu_char empty vector
eu_char = c()
good = 0 #good & bad variable initialization with 0 value
bad = 0
#LOOP-2-looping over train data
for(j in c(1:nrow(train_data))){
#adding euclidean distance b/w test data point and train data to eu_dist vector
eu_dist <- c(eu_dist, euclideanDist(test_data[i,-c(labelcol)], train_data[j,-c(labelcol)]))
#adding class variable of training data in eu_char
eu_char <- c(eu_char, as.character(train_data[j,][[labelcol]]))
}
eu <- data.frame(eu_char, eu_dist) #eu dataframe created with eu_char & eu_dist columns
eu <- eu[order(eu$eu_dist),] #sorting eu dataframe to gettop K neighbors
eu <- eu[1:k_value,] #eu dataframe with top K neighbors
tbl.sm.df<-table(eu$eu_char)
cl_label<- names(tbl.sm.df)[[as.integer(which.max(tbl.sm.df))]]
pred <- c(pred, cl_label)
}
return(pred) #return pred vector
}
accuracy <- function(test_data,labelcol,predcol){
correct = 0
for(i in c(1:nrow(test_data))){
if(test_data[i,labelcol] == test_data[i,predcol]){
correct = correct+1
}
}
accu = (correct/nrow(test_data)) * 100
return(accu)
}
#load data
icu_original <- as.data.frame(read.csv("icu.csv"))
icu_original["COMA"] <- ifelse(icu_original$LOC==2,1,0)
icu <- icu_original[,c("TYP", "COMA", "AGE", "INF", "STA")]
knn.df<-icu
labelcol <- 5 # for icu, rearranged STA as the fifth column
predictioncol<-labelcol+1
# create train/test partitions
set.seed(2)
n<-nrow(knn.df)
knn.df<- knn.df[sample(n),]
train.df <- knn.df[1:as.integer(0.7*n),]
#K = 3 # number of neighbors to determine the class
table(train.df[,labelcol])
##
## 0 1
## 107 33
test.df <- knn.df[as.integer(0.7*n +1):n,] # copied to the execution loop below
table(test.df[,labelcol])
##
## 0 1
## 53 7
acc_df <- data.frame()
for (K in c(3,5,7,15,25,50)) # number of neighbors to determine the class
{
test.df <- knn.df[as.integer(0.7*n +1):n,]
predictions <- knn_predict2(test.df, train.df, K,labelcol) #calling knn_predict()
test.df[,predictioncol] <- predictions #Adding predictions in test data as 7th column
print(accuracy(test.df,labelcol,predictioncol))
print(table(test.df[[predictioncol]],test.df[[labelcol]]))
K_acc <- c(K, accuracy(test.df,labelcol,predictioncol))
acc_df <- rbind(acc_df, K_acc)
}
## [1] 80
##
## 0 1
## 0 48 7
## 1 5 0
## [1] 85
##
## 0 1
## 0 50 6
## 1 3 1
## [1] 83.33333
##
## 0 1
## 0 50 7
## 1 3 0
## [1] 88.33333
##
## 0 1
## 0 53 7
## [1] 88.33333
##
## 0 1
## 0 53 7
## [1] 88.33333
##
## 0 1
## 0 53 7
plot(acc_df,type="o",col="blue",xlab="K",ylab="Accuracy",main="KNN Accuracy Plot")
axis(side = 1, at = acc_df$X3,labels = T)