You have been hired by a local electronics retailer and the above dataset has been given to you.Manager Bayes Jr.9th wants to create a spreadsheet to predict is a customer is likely prospect. To that end
1)Compute prior probabilities for the Prospect Yes/No
where age-group is a predictor variable. Compute the conditional probabilities for each predictor variable namely
# import the required dataset
data<-read.csv("https://raw.githubusercontent.com/maharjansudhan/DATA622/master/HW1-Q1-40.csv",header=TRUE,sep=",")
head(data)
## age.group networth status credit_rating class.prospect
## 1 youth high employed fair no
## 2 youth high employed excellent no
## 3 middle high employed fair yes
## 4 senior medium employed fair yes
## 5 senior low unemployed fair yes
## 6 senior low unemployed excellent no
# Prior Probabilities:
#age.group: senior
prior_age.group_senior<-nrow(data[data$age.group=="senior",]) / nrow(data)
prior_age.group_senior<-round(prior_age.group_senior,2)
prior_age.group_senior
## [1] 0.36
#age.group:middle
prior_age.group_middle<-nrow(data[data$age.group=="middle",]) / nrow(data)
prior_age.group_middle<-round(prior_age.group_middle,2)
prior_age.group_middle
## [1] 0.29
#age.group:youth
prior_age.group_youth<-nrow(data[data$age.group=="youth",]) / nrow(data)
prior_age.group_youth<-round(prior_age.group_youth,2)
prior_age.group_youth
## [1] 0.36
#networth:high
prior_networth_high<-nrow(data[data$networth=="high",]) / nrow(data)
prior_networth_high<-round(prior_networth_high,2)
prior_networth_high
## [1] 0.29
#networth:medium
prior_networth_medium<-nrow(data[data$networth=="medium",]) / nrow(data)
prior_networth_medium<-round(prior_networth_medium,2)
prior_networth_medium
## [1] 0.43
#networth:low
prior_networth_low<-nrow(data[data$networth=="low",]) / nrow(data)
prior_networth_low<-round(prior_networth_low,2)
prior_networth_low
## [1] 0.29
#status:employed
prior_status_employed<-nrow(data[data$status=="employed",]) / nrow(data)
prior_status_employed<-round(prior_status_employed,2)
prior_status_employed
## [1] 0.5
#status:unemployed
prior_status_unemployed<-nrow(data[data$status=="unemployed",]) / nrow(data)
prior_status_unemployed<-round(prior_status_unemployed,2)
prior_status_unemployed
## [1] 0.5
#credit_rating:excellent
prior_credit_rating_excellent<-nrow(data[data$credit_rating=="excellent",]) / nrow(data)
prior_credit_rating_excellent<-round(prior_credit_rating_excellent,2)
prior_credit_rating_excellent
## [1] 0.43
#credit_rating:fair
prior_credit_rating_fair<-nrow(data[data$credit_rating=="fair",]) / nrow(data)
prior_credit_rating_fair<-round(prior_credit_rating_fair,2)
prior_credit_rating_fair
## [1] 0.57
#class.prospect:yes
prior_class.prospect_yes<-nrow(data[data$class.prospect=="yes",]) / nrow(data)
prior_class.prospect_yes<-round(prior_class.prospect_yes,2)
prior_class.prospect_yes
## [1] 0.64
#class.prospect:no
prior_class.prospect_no<-nrow(data[data$class.prospect=="no",]) / nrow(data)
prior_class.prospect_no<-round(prior_class.prospect_no,2)
prior_class.prospect_no
## [1] 0.36
#conditional probabilities:
#P(age.group=senior|prospect=yes)
conditional_age.group_senior_class.prosepct_yes<-nrow(data[data$age.group=="senior" & data$class.prospect=="yes",]) / nrow(data[data$class.prospect=="yes",])
conditional_age.group_senior_class.prosepct_yes<-round(conditional_age.group_senior_class.prosepct_yes,2)
conditional_age.group_senior_class.prosepct_yes
## [1] 0.33
#P(age.group=middle|prospect=yes)
conditional_age.group_middle_class.prosepct_yes<-nrow(data[data$age.group=="middle" & data$class.prospect=="yes",]) / nrow(data[data$class.prospect=="yes",])
conditional_age.group_middle_class.prosepct_yes<-round(conditional_age.group_middle_class.prosepct_yes,2)
conditional_age.group_middle_class.prosepct_yes
## [1] 0.44
#P(age.group=youth|prospect=yes)
conditional_age.group_youth_class.prosepct_yes<-nrow(data[data$age.group=="youth" & data$class.prospect=="yes",]) / nrow(data[data$class.prospect=="yes",])
conditional_age.group_youth_class.prosepct_yes<-round(conditional_age.group_youth_class.prosepct_yes,2)
conditional_age.group_youth_class.prosepct_yes
## [1] 0.22
#P(age.group=senior|prospect=no)
conditional_age.group_senior_class.prosepct_no<-nrow(data[data$age.group=="senior" & data$class.prospect=="no",]) / nrow(data[data$class.prospect=="no",])
conditional_age.group_senior_class.prosepct_no<-round(conditional_age.group_senior_class.prosepct_no,2)
conditional_age.group_senior_class.prosepct_no
## [1] 0.4
#P(age.group=middle|prospect=no)
conditional_age.group_middle_class.prosepct_no<-nrow(data[data$age.group=="middle" & data$class.prospect=="no",]) / nrow(data[data$class.prospect=="no",])
conditional_age.group_middle_class.prosepct_no<-round(conditional_age.group_middle_class.prosepct_no,2)
conditional_age.group_middle_class.prosepct_no
## [1] 0
#P(age.group=youth|prospect=no)
conditional_age.group_youth_class.prosepct_no<-nrow(data[data$age.group=="youth" & data$class.prospect=="no",]) / nrow(data[data$class.prospect=="no",])
conditional_age.group_youth_class.prosepct_no<-round(conditional_age.group_youth_class.prosepct_no,2)
conditional_age.group_youth_class.prosepct_no
## [1] 0.6
#P(networth=high|prospect=yes)
conditional_networth_high_prospect_yes<-nrow(data[data$networth=="high" & data$class.prospect=="yes",]) / nrow(data[data$class.prospect=="yes",])
conditional_networth_high_prospect_yes<-round(conditional_networth_high_prospect_yes,2)
conditional_networth_high_prospect_yes
## [1] 0.22
#P(networth=medium|prospect=yes)
conditional_networth_medium_prospect_yes<-nrow(data[data$networth=="medium" & data$class.prospect=="yes",]) / nrow(data[data$class.prospect=="yes",])
conditional_networth_medium_prospect_yes<-round(conditional_networth_medium_prospect_yes,2)
conditional_networth_medium_prospect_yes
## [1] 0.44
#P(networth=low|prospect=yes)
conditional_networth_low_prospect_yes<-nrow(data[data$networth=="low" & data$class.prospect=="yes",]) / nrow(data[data$class.prospect=="yes",])
conditional_networth_low_prospect_yes<-round(conditional_networth_low_prospect_yes,2)
conditional_networth_low_prospect_yes
## [1] 0.33
#P(networth=high|prospect=no)
conditional_networth_high_prospect_no<-nrow(data[data$networth=="high" & data$class.prospect=="no",]) / nrow(data[data$class.prospect=="no",])
conditional_networth_high_prospect_no<-round(conditional_networth_high_prospect_no,2)
conditional_networth_high_prospect_no
## [1] 0.4
#P(networth=medium|prospect=no)
conditional_networth_medium_prospect_no<-nrow(data[data$networth=="medium" & data$class.prospect=="no",]) / nrow(data[data$class.prospect=="no",])
conditional_networth_medium_prospect_no<-round(conditional_networth_medium_prospect_no,2)
conditional_networth_medium_prospect_no
## [1] 0.4
#P(networth=low|prospect=no)
conditional_networth_low_prospect_no<-nrow(data[data$networth=="low" & data$class.prospect=="no",]) / nrow(data[data$class.prospect=="no",])
conditional_networth_low_prospect_no<-round(conditional_networth_low_prospect_no,2)
conditional_networth_low_prospect_no
## [1] 0.2
#P(status=employed|prospect=yes)
conditional_status_employed_prospect_yes<-nrow(data[data$status=="employed" & data$class.prospect=="yes",]) / nrow(data[data$class.prospect=="yes",])
conditional_status_employed_prospect_yes<-round(conditional_status_employed_prospect_yes,2)
conditional_status_employed_prospect_yes
## [1] 0.33
#P(status=unemployed|prospect=yes)
conditional_status_unemployed_prospect_yes<-nrow(data[data$status=="unemployed" & data$class.prospect=="yes",]) / nrow(data[data$class.prospect=="yes",])
conditional_status_unemployed_prospect_yes<-round(conditional_status_unemployed_prospect_yes,2)
conditional_status_unemployed_prospect_yes
## [1] 0.67
#P(status=employed|prospect=no)
conditional_status_employed_prospect_no<-nrow(data[data$status=="employed" & data$class.prospect=="no",]) / nrow(data[data$class.prospect=="no",])
conditional_status_employed_prospect_no<-round(conditional_status_employed_prospect_no,2)
conditional_status_employed_prospect_no
## [1] 0.8
#P(status=unemployed|prospect=no)
conditional_status_unemployed_prospect_no<-nrow(data[data$status=="unemployed" & data$class.prospect=="no",]) / nrow(data[data$class.prospect=="no",])
conditional_status_unemployed_prospect_no<-round(conditional_status_unemployed_prospect_no,2)
conditional_status_unemployed_prospect_no
## [1] 0.2
#P(credit=excellent|prospect=yes)
conditional_credit_rating_excellent_prospect_yes<-nrow(data[data$credit_rating=="excellent" & data$class.prospect=="yes",]) / nrow(data[data$class.prospect=="yes",])
conditional_credit_rating_excellent_prospect_yes<-round(conditional_credit_rating_excellent_prospect_yes,2)
conditional_credit_rating_excellent_prospect_yes
## [1] 0.33
#P(credit=fair|prospect=yes)
conditional_credit_rating_fair_prospect_yes<-nrow(data[data$credit_rating=="fair" & data$class.prospect=="yes",]) / nrow(data[data$class.prospect=="yes",])
conditional_credit_rating_fair_prospect_yes<-round(conditional_credit_rating_fair_prospect_yes,2)
conditional_credit_rating_fair_prospect_yes
## [1] 0.67
#P(credit=excellent|prospect=no)
conditional_credit_rating_excellent_prospect_no<-nrow(data[data$credit_rating=="excellent" & data$class.prospect=="no",]) / nrow(data[data$class.prospect=="no",])
conditional_credit_rating_excellent_prospect_no<-round(conditional_credit_rating_excellent_prospect_no,2)
conditional_credit_rating_excellent_prospect_no
## [1] 0.6
#P(credit=fair|prospect=no)
conditional_credit_rating_fair_prospect_no<-nrow(data[data$credit_rating=="fair" & data$class.prospect=="no",]) / nrow(data[data$class.prospect=="no",])
conditional_credit_rating_fair_prospect_no<-round(conditional_credit_rating_fair_prospect_no,2)
conditional_credit_rating_fair_prospect_no
## [1] 0.4
#posterior probabilities:
#P(prospect=yes|age.group=senior)
post_prospect_yes_age.group_senior<-(conditional_age.group_senior_class.prosepct_yes * prior_class.prospect_yes) / prior_age.group_senior
post_prospect_yes_age.group_senior<-round(post_prospect_yes_age.group_senior,2)
post_prospect_yes_age.group_senior
## [1] 0.59
#P(prospect=no|age.group=senior)
post_prospect_no_age.group_senior<-(conditional_age.group_senior_class.prosepct_no * prior_class.prospect_no) / prior_age.group_senior
post_prospect_no_age.group_senior<-round(post_prospect_no_age.group_senior,2)
post_prospect_no_age.group_senior
## [1] 0.4
#P(prospect=yes|age.group=middle)
post_prospect_yes_age.group_middle<-(conditional_age.group_middle_class.prosepct_yes * prior_class.prospect_yes) / prior_age.group_middle
post_prospect_yes_age.group_middle<-round(post_prospect_yes_age.group_middle,2)
post_prospect_yes_age.group_middle
## [1] 0.97
#P(prospect=no|age.group=middle)
post_prospect_no_age.group_middle<-(conditional_age.group_middle_class.prosepct_no * prior_class.prospect_no) / prior_age.group_middle
post_prospect_no_age.group_middle<-round(post_prospect_no_age.group_middle,2)
post_prospect_no_age.group_middle
## [1] 0
#P(prospect=yes|age.group=youth)
post_prospect_yes_age.group_youth<-(conditional_age.group_youth_class.prosepct_yes * prior_class.prospect_yes) / prior_age.group_youth
post_prospect_yes_age.group_youth<-round(post_prospect_yes_age.group_youth,2)
post_prospect_yes_age.group_youth
## [1] 0.39
#P(prospect=no|age.group=youth)
post_prospect_no_age.group_youth<-(conditional_age.group_youth_class.prosepct_no * prior_class.prospect_no) / prior_age.group_youth
post_prospect_no_age.group_youth<-round(post_prospect_no_age.group_youth,2)
post_prospect_no_age.group_youth
## [1] 0.6
#P(prospect=yes|networth=high)
post_prospect_yes_networth_high<-(conditional_networth_high_prospect_yes * prior_class.prospect_yes) / prior_networth_high
post_prospect_yes_networth_high<-round(post_prospect_yes_networth_high,2)
post_prospect_yes_networth_high
## [1] 0.49
#P(prospect=no|networth=high)
post_prospect_no_networth_high<-(conditional_networth_high_prospect_no * prior_class.prospect_no) / prior_networth_high
post_prospect_no_networth_high<-round(post_prospect_no_networth_high,2)
post_prospect_no_networth_high
## [1] 0.5
#P(prospect=yes|networth=medium)
post_prospect_yes_networth_medium<-(conditional_networth_medium_prospect_yes * prior_class.prospect_yes) / prior_networth_medium
post_prospect_yes_networth_medium<-round(post_prospect_yes_networth_medium,2)
post_prospect_yes_networth_medium
## [1] 0.65
#P(prospect=no|networth=medium)
post_prospect_no_networth_medium<-(conditional_networth_medium_prospect_no * prior_class.prospect_no) / prior_networth_medium
post_prospect_no_networth_medium<-round(post_prospect_no_networth_medium,2)
post_prospect_no_networth_medium
## [1] 0.33
#P(prospect=yes|networth=low)
post_prospect_yes_networth_low<-(conditional_networth_low_prospect_yes * prior_class.prospect_yes) / prior_networth_low
post_prospect_yes_networth_low<-round(post_prospect_yes_networth_low,2)
post_prospect_yes_networth_low
## [1] 0.73
#P(prospect=no|networth=low)
post_prospect_no_networth_low<-(conditional_networth_low_prospect_no * prior_class.prospect_no) / prior_networth_low
post_prospect_no_networth_low<-round(post_prospect_no_networth_low,2)
post_prospect_no_networth_low
## [1] 0.25
There are two datasets junk1.txt and junk2.csv They have two options 1. They can go back to the client and ask for more data to remedy problems with the data. 2. They can accept the data and undertake a major analytics exercise.
The team is relying on your dsc skills to determine how they should proceed.
Can you explore the data and recommend actions for each file enumerating the reasons.
# import the required datasets
junk1<-read.delim("https://raw.githubusercontent.com/maharjansudhan/DATA622/master/junk1.txt",header=TRUE,sep=" ")
junk2<-read.csv("https://raw.githubusercontent.com/maharjansudhan/DATA622/master/junk2.csv",header=TRUE,sep=",")
#import required libraries and let's have a look at the datasets
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
head(junk1)
## a b class
## 1 1.6204214 3.0036241 1
## 2 1.4340220 0.7852487 1
## 3 2.4766615 0.9367761 1
## 4 0.5283093 0.1196222 1
## 5 1.0054081 0.7872866 1
## 6 1.1032636 0.7330594 1
head(junk2)
## a b class
## 1 3.1886481 0.92917735 0
## 2 0.8224527 0.04760314 0
## 3 0.8147247 0.02910931 0
## 4 -1.5065362 3.13231360 0
## 5 0.4426887 2.84942822 0
## 6 0.8564405 -0.66143851 0
dim(junk1)
## [1] 100 3
dim(junk2)
## [1] 4000 3
glimpse(junk1)
## Observations: 100
## Variables: 3
## $ a <dbl> 1.6204214, 1.4340220, 2.4766615, 0.5283093, 1.0054081, 1.1…
## $ b <dbl> 3.00362413, 0.78524873, 0.93677611, 0.11962219, 0.78728662…
## $ class <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
glimpse(junk2)
## Observations: 4,000
## Variables: 3
## $ a <dbl> 3.18864809, 0.82245267, 0.81472472, -1.50653621, 0.4426886…
## $ b <dbl> 0.92917735, 0.04760314, 0.02910931, 3.13231360, 2.84942822…
## $ class <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
summary(junk1)
## a b class
## Min. :-2.29854 Min. :-3.17174 Min. :1.0
## 1st Qu.:-0.85014 1st Qu.:-1.04712 1st Qu.:1.0
## Median :-0.04754 Median :-0.07456 Median :1.5
## Mean : 0.04758 Mean : 0.01324 Mean :1.5
## 3rd Qu.: 1.09109 3rd Qu.: 1.05342 3rd Qu.:2.0
## Max. : 3.00604 Max. : 3.10230 Max. :2.0
summary(junk2)
## a b class
## Min. :-4.16505 Min. :-3.90472 Min. :0.0000
## 1st Qu.:-1.01447 1st Qu.:-0.89754 1st Qu.:0.0000
## Median : 0.08754 Median :-0.08358 Median :0.0000
## Mean :-0.05126 Mean : 0.05624 Mean :0.0625
## 3rd Qu.: 0.89842 3rd Qu.: 1.00354 3rd Qu.:0.0000
## Max. : 4.62647 Max. : 4.31052 Max. :1.0000
count(junk1)
## # A tibble: 1 x 1
## n
## <int>
## 1 100
count(junk2)
## # A tibble: 1 x 1
## n
## <int>
## 1 4000
#let's see data spread for both datasets
hist(junk1$a)
hist(junk1$b)
hist(junk1$class)
hist(junk2$a)
hist(junk2$b)
hist(junk2$class)
#let's boxplot both datasets
boxplot(junk1, horizontal=TRUE, main="junk1")
boxplot(junk2, horizontal=TRUE, main="junk2")
Junk1 is a small dataset with no missing data. It has 100 records. The class has values 1 and 2.
Junk2 is a big dataset compared to Junk1 with no missing data. It has 4000 records. The class has 0 and 1. There are many outliers in this dataset.
Comparing these two datasets, we can work with Junk2 dataset which looks pretty huge dataset. But for Junk1 its better to ask the client for more data.
Please find kNN.R
This R script requires a dataset, labelcol and K (number of nearest neighbors to be considered)
The dataset MUST Be numeric, except the labelcol The labelcol must be the last column in the data.frame All the other columns must be before the labelcol
To DO:
Please find icu.csv The formula to fit is “STA ~ TYP + COMA + AGE + INF”
Read the icu.csv subset it with these 5 features in the formula and STA is the labelcol.
Split the icu 70/30 train/test and run the kNN.R for K=(3,5,7,15,25,50)
submit the result confusionMatrix, Accuracy for each K
Plot Accuracy vs K.
write a short summary of your findings.
euclideanDist <- function(a, b){
d = 0
for(i in c(1:(length(a)) ))
{
d = d + (a[[i]]-b[[i]])^2
}
d = sqrt(d)
return(d)
}
knn_predict2 <- function(test_data, train_data, k_value, labelcol){
pred <- c() #empty pred vector
#LOOP-1
for(i in c(1:nrow(test_data))){ #looping over each record of test data
eu_dist =c() #eu_dist & eu_char empty vector
eu_char = c()
good = 0 #good & bad variable initialization with 0 value
bad = 0
#LOOP-2-looping over train data
for(j in c(1:nrow(train_data))){
#adding euclidean distance b/w test data point and train data to eu_dist vector
eu_dist <- c(eu_dist, euclideanDist(test_data[i,-c(labelcol)], train_data[j,-c(labelcol)]))
#adding class variable of training data in eu_char
eu_char <- c(eu_char, as.character(train_data[j,][[labelcol]]))
}
eu <- data.frame(eu_char, eu_dist) #eu dataframe created with eu_char & eu_dist columns
eu <- eu[order(eu$eu_dist),] #sorting eu dataframe to gettop K neighbors
eu <- eu[1:k_value,] #eu dataframe with top K neighbors
tbl.sm.df<-table(eu$eu_char)
cl_label<- names(tbl.sm.df)[[as.integer(which.max(tbl.sm.df))]]
pred <- c(pred, cl_label)
}
return(pred) #return pred vector
}
accuracy <- function(test_data,labelcol,predcol){
correct = 0
for(i in c(1:nrow(test_data))){
if(test_data[i,labelcol] == test_data[i,predcol]){
correct = correct+1
}
}
accu = (correct/nrow(test_data)) * 100
return(accu)
}
#load data
icu<-read.csv("https://raw.githubusercontent.com/maharjansudhan/DATA622/master/icu.csv",header=TRUE,sep=",")
icu["COMA"]<-ifelse(icu$LOC==2,1,0)
knn.df<-icu[,c("TYP","COMA","AGE","INF","STA")]
labelcol <- 5 # for icu, STA is the fifth col
predictioncol<-labelcol+1
# create train/test partitions
set.seed(123)
n<-nrow(knn.df)
knn.df<- knn.df[sample(n),]
train.df <- knn.df[1:as.integer(0.7*n),]
#K = 3 # number of neighbors to determine the class
table(knn.df[,labelcol])
##
## 0 1
## 160 40
test.df <- knn.df[as.integer(0.7*n +1):n,]
table(test.df[,labelcol])
##
## 0 1
## 55 5
df<-data.frame() #create empty dataframe
for (K in c(3,5,7,15,25,50)) # loop the K values
{
test.df<-test.df <- knn.df[as.integer(0.7*n +1):n,]
predictions <- knn_predict2(test.df, train.df, K,labelcol) #calling knn_predict()
test.df[,predictioncol] <- predictions #Adding predictions in test data as 7th column
print(accuracy(test.df,labelcol,predictioncol))
print(table(test.df[[predictioncol]],test.df[[labelcol]]))
K<-c(K, accuracy(test.df,labelcol,predictioncol))
df<-rbind(df,K)
}
## [1] 75
##
## 0 1
## 0 45 5
## 1 10 0
## [1] 81.66667
##
## 0 1
## 0 49 5
## 1 6 0
## [1] 83.33333
##
## 0 1
## 0 50 5
## 1 5 0
## [1] 86.66667
##
## 0 1
## 0 52 5
## 1 3 0
## [1] 91.66667
##
## 0 1
## 0 55 5
## [1] 91.66667
##
## 0 1
## 0 55 5
plot(df, type="o", col="red", main="KNN Accurary plot", xlab="K value", ylab="Accuracy",lwd=2)
axis(side=1,at=df$X3)
After plotting the dataframe for K values: 3,5,7,15,25,50, the accuracy is highest at K=25 and for all the values above 25 it maintains the same value which is 91.66. K=3 has the lowest value at 75.