###PART 1
###Q1- (one thing to make sure you do!): Remember that the class variable needs to be a categorical data type in order to build a Naïve Bayes Classifier. This means that you’ll need to convert your class variable. ###Next, use a 75%/25% split for training and test data, i.e. use 75% of the records for the training set and 25% of the records for the test set. Report the number of missing values you find in the data in your results report. Use the randomization seed of 12345.
###ANSWER- Q1:
creditData <- read.csv("C:/Users/Priya/Downloads/creditData.csv")
sum(is.na(creditData))
## [1] 0
str(creditData)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
creditData$Creditability <- as.factor(creditData$Creditability)
creditData$Account.Balance <- as.factor(creditData$Account.Balance)
creditData$Payment.Status.of.Previous.Credit <- as.factor(creditData$Payment.Status.of.Previous.Credit)
creditData$Purpose <- as.factor(creditData$Purpose)
creditData$Value.Savings.Stocks <- as.factor(creditData$Value.Savings.Stocks)
creditData$Length.of.current.employment <- as.factor(creditData$Length.of.current.employment)
creditData$Instalment.per.cent <- as.factor(creditData$Instalment.per.cent)
creditData$Sex...Marital.Status <- as.factor(creditData$Sex...Marital.Status)
creditData$Guarantors <- as.factor(creditData$Guarantors)
creditData$Duration.in.Current.address <- as.factor(creditData$Duration.in.Current.address)
creditData$Most.valuable.available.asset <- as.factor(creditData$Most.valuable.available.asset)
creditData$Concurrent.Credits <- as.factor(creditData$Concurrent.Credits)
creditData$Type.of.apartment <- as.factor(creditData$Type.of.apartment)
creditData$No.of.Credits.at.this.Bank <- as.factor(creditData$No.of.Credits.at.this.Bank)
creditData$Occupation <- as.factor(creditData$Occupation)
creditData$No.of.dependents <- as.factor(creditData$No.of.dependents)
creditData$Telephone <- as.factor(creditData$Telephone)
creditData$Foreign.Worker <- as.factor(creditData$Foreign.Worker)
str(creditData)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Account.Balance : Factor w/ 4 levels "1","2","3","4": 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: Factor w/ 5 levels "0","1","2","3",..: 5 5 3 5 5 5 5 5 5 3 ...
## $ Purpose : Factor w/ 10 levels "0","1","2","3",..: 3 1 9 1 1 1 1 1 4 4 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : Factor w/ 5 levels "1","2","3","4",..: 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : Factor w/ 5 levels "1","2","3","4",..: 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : Factor w/ 4 levels "1","2","3","4": 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : Factor w/ 4 levels "1","2","3","4": 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : Factor w/ 4 levels "1","2","3","4": 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : Factor w/ 4 levels "1","2","3","4": 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : Factor w/ 3 levels "1","2","3": 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : Factor w/ 3 levels "1","2","3": 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : Factor w/ 4 levels "1","2","3","4": 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : Factor w/ 4 levels "1","2","3","4": 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : Factor w/ 2 levels "1","2": 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : Factor w/ 2 levels "1","2": 1 1 1 2 2 2 2 2 1 1 ...
set.seed(12345)
credit_rand <- creditData[order(runif(1000)), ]
credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]
###Q2- Compute the percentage of both classes similar to what you did in lab 1 and see if the distribution of both classes preserved for both training and testing data.
###ANSWER - Q2:
prop.table(table(credit_train$Creditability))
##
## 0 1
## 0.3088889 0.6911111
prop.table(table(credit_test$Creditability))
##
## 0 1
## 0.22 0.78
###From the prop tables above we can see that the distribution is uniform for test as well as train datasets.
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.5.3
## naivebayes 0.9.6 loaded
naive_model <- naive_bayes(as.character(Creditability) ~ ., data= credit_train)
naive_model
##
## ================================ Naive Bayes =================================
##
## Call:
## naive_bayes.formula(formula = as.character(Creditability) ~ .,
## data = credit_train)
##
## ------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ------------------------------------------------------------------------------
##
## A priori probabilities:
##
## 0 1
## 0.3088889 0.6911111
##
## ------------------------------------------------------------------------------
##
## Tables:
##
## ------------------------------------------------------------------------------
## ::: Account.Balance (Categorical)
## ------------------------------------------------------------------------------
##
## Account.Balance 0 1
## 1 0.44244604 0.20418006
## 2 0.35611511 0.22829582
## 3 0.05035971 0.07234727
## 4 0.15107914 0.49517685
##
## ------------------------------------------------------------------------------
## ::: Duration.of.Credit..month. (Gaussian)
## ------------------------------------------------------------------------------
##
## Duration.of.Credit..month. 0 1
## mean 24.91727 19.18489
## sd 13.41521 11.06282
##
## ------------------------------------------------------------------------------
## ::: Payment.Status.of.Previous.Credit (Categorical)
## ------------------------------------------------------------------------------
##
## Payment.Status.of.Previous.Credit 0 1
## 0 0.08273381 0.02090032
## 1 0.09352518 0.03215434
## 2 0.56115108 0.51929260
## 3 0.09352518 0.08038585
## 4 0.16906475 0.34726688
##
## ------------------------------------------------------------------------------
## ::: Purpose (Categorical)
## ------------------------------------------------------------------------------
##
## Purpose 0 1
## 0 0.298561151 0.200964630
## 1 0.057553957 0.118971061
## 2 0.197841727 0.181672026
## 3 0.201438849 0.319935691
## 4 0.014388489 0.009646302
## 5 0.025179856 0.016077170
## 6 0.079136691 0.043408360
## 8 0.003597122 0.011254019
## 9 0.104316547 0.088424437
## 10 0.017985612 0.009646302
##
## ------------------------------------------------------------------------------
## ::: Credit.Amount (Gaussian)
## ------------------------------------------------------------------------------
##
## Credit.Amount 0 1
## mean 3977.813 2962.659
## sd 3568.550 2383.757
##
## ------------------------------------------------------------------------------
##
## # ... and 15 more tables
##
## ------------------------------------------------------------------------------
conf_nat <- table(predict(naive_model, credit_test), credit_test$Creditability)
## Warning: predict.naive_bayes(): More features in the newdata are provided
## as there are probability tables in the object. Calculation is performed
## based on features to be found in the tables.
Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100
Accuracy
## [1] 80
###As we can see from above that the accuracy is 80%.