Lab 2 - 530

###PART 1

###Q1- (one thing to make sure you do!): Remember that the class variable needs to be a categorical data type in order to build a Naïve Bayes Classifier. This means that you’ll need to convert your class variable. ###Next, use a 75%/25% split for training and test data, i.e. use 75% of the records for the training set and 25% of the records for the test set. Report the number of missing values you find in the data in your results report. Use the randomization seed of 12345.

###ANSWER- Q1:

# Load the credit data set from the author's local download folder.
creditData <- read.csv(file = "C:/Users/Priya/Downloads/creditData.csv")

# Total number of missing cells in the data frame (reported for Q1).
sum(colSums(is.na(creditData)))

## [1] 0

# Inspect column names and storage types before any conversion.
utils::str(creditData)

## 'data.frame':  1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...

# Convert the class variable (Creditability) and every coded attribute to a
# factor. Duration.of.Credit..month., Credit.Amount and Age..years. are left
# as integers. local() keeps the helper names out of the global environment.
creditData <- local({
  categorical_cols <- c(
    "Creditability",
    "Account.Balance",
    "Payment.Status.of.Previous.Credit",
    "Purpose",
    "Value.Savings.Stocks",
    "Length.of.current.employment",
    "Instalment.per.cent",
    "Sex...Marital.Status",
    "Guarantors",
    "Duration.in.Current.address",
    "Most.valuable.available.asset",
    "Concurrent.Credits",
    "Type.of.apartment",
    "No.of.Credits.at.this.Bank",
    "Occupation",
    "No.of.dependents",
    "Telephone",
    "Foreign.Worker"
  )
  converted <- creditData
  converted[categorical_cols] <- lapply(converted[categorical_cols], as.factor)
  converted
})

# Confirm the new column types.
utils::str(creditData)

## 'data.frame':  1000 obs. of  21 variables:
##  $ Creditability                    : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Account.Balance                  : Factor w/ 4 levels "1","2","3","4": 1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: Factor w/ 5 levels "0","1","2","3",..: 5 5 3 5 5 5 5 5 5 3 ...
##  $ Purpose                          : Factor w/ 10 levels "0","1","2","3",..: 3 1 9 1 1 1 1 1 4 4 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : Factor w/ 5 levels "1","2","3","4",..: 1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : Factor w/ 5 levels "1","2","3","4",..: 2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : Factor w/ 4 levels "1","2","3","4": 4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : Factor w/ 4 levels "1","2","3","4": 2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : Factor w/ 4 levels "1","2","3","4": 4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : Factor w/ 4 levels "1","2","3","4": 2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : Factor w/ 3 levels "1","2","3": 3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : Factor w/ 3 levels "1","2","3": 1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : Factor w/ 4 levels "1","2","3","4": 1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : Factor w/ 4 levels "1","2","3","4": 3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : Factor w/ 2 levels "1","2": 1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : Factor w/ 2 levels "1","2": 1 1 1 2 2 2 2 2 1 1 ...

# Shuffle the records reproducibly, then split them 75% / 25% into training
# and test sets as Q1 requires. The previous version took rows 1:900 and
# 901:1000, i.e. a 90% / 10% split with the row count hard-coded.
# NOTE: the console output recorded further down (class proportions, model
# tables, accuracy) was produced with the 900/100 split and must be
# regenerated by re-running the script.
set.seed(12345)
n_records <- nrow(creditData)
n_train <- floor(0.75 * n_records)

# Ordering by uniform random draws yields a random permutation of the rows.
credit_rand <- creditData[order(runif(n_records)), ]

# head()/tail() stay correct even when one partition is empty.
credit_train <- head(credit_rand, n_train)
credit_test <- tail(credit_rand, n_records - n_train)

###Q2- Compute the percentage of both classes, similar to what you did in lab 1, and see if the distribution of both classes is preserved for both training and testing data.

###ANSWER - Q2:

# Share of each Creditability class in the training partition.
prop.table(table(credit_train[["Creditability"]]))

## 
##         0         1 
## 0.3088889 0.6911111

# Share of each Creditability class in the test partition.
prop.table(table(credit_test[["Creditability"]]))

## 
##    0    1 
## 0.22 0.78

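###From the proportion tables above we can see that class 1 is the majority class in both sets, but the class distribution is only approximately preserved: the training set is about 31% class 0 / 69% class 1, while the test set is 22% class 0 / 78% class 1.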

library(naivebayes)

## Warning: package 'naivebayes' was built under R version 3.5.3

## naivebayes 0.9.6 loaded

# Fit a Naive Bayes classifier for Creditability using every other column of
# the training partition as a predictor, then print the fitted model.
naive_model <- naive_bayes(
  as.character(Creditability) ~ .,
  data = credit_train
)
print(naive_model)

## 
## ================================ Naive Bayes ================================= 
##  
##  Call: 
## naive_bayes.formula(formula = as.character(Creditability) ~ ., 
##     data = credit_train)
## 
## ------------------------------------------------------------------------------ 
##  
## Laplace smoothing: 0
## 
## ------------------------------------------------------------------------------ 
##  
##  A priori probabilities: 
## 
##         0         1 
## 0.3088889 0.6911111 
## 
## ------------------------------------------------------------------------------ 
##  
##  Tables: 
## 
## ------------------------------------------------------------------------------ 
##  ::: Account.Balance (Categorical) 
## ------------------------------------------------------------------------------ 
##                
## Account.Balance          0          1
##               1 0.44244604 0.20418006
##               2 0.35611511 0.22829582
##               3 0.05035971 0.07234727
##               4 0.15107914 0.49517685
## 
## ------------------------------------------------------------------------------ 
##  ::: Duration.of.Credit..month. (Gaussian) 
## ------------------------------------------------------------------------------ 
##                           
## Duration.of.Credit..month.        0        1
##                       mean 24.91727 19.18489
##                       sd   13.41521 11.06282
## 
## ------------------------------------------------------------------------------ 
##  ::: Payment.Status.of.Previous.Credit (Categorical) 
## ------------------------------------------------------------------------------ 
##                                  
## Payment.Status.of.Previous.Credit          0          1
##                                 0 0.08273381 0.02090032
##                                 1 0.09352518 0.03215434
##                                 2 0.56115108 0.51929260
##                                 3 0.09352518 0.08038585
##                                 4 0.16906475 0.34726688
## 
## ------------------------------------------------------------------------------ 
##  ::: Purpose (Categorical) 
## ------------------------------------------------------------------------------ 
##        
## Purpose           0           1
##      0  0.298561151 0.200964630
##      1  0.057553957 0.118971061
##      2  0.197841727 0.181672026
##      3  0.201438849 0.319935691
##      4  0.014388489 0.009646302
##      5  0.025179856 0.016077170
##      6  0.079136691 0.043408360
##      8  0.003597122 0.011254019
##      9  0.104316547 0.088424437
##      10 0.017985612 0.009646302
## 
## ------------------------------------------------------------------------------ 
##  ::: Credit.Amount (Gaussian) 
## ------------------------------------------------------------------------------ 
##              
## Credit.Amount        0        1
##          mean 3977.813 2962.659
##          sd   3568.550 2383.757
## 
## ------------------------------------------------------------------------------
## 
## # ... and 15 more tables
## 
## ------------------------------------------------------------------------------

# Score the held-out records. The class column is dropped from newdata first:
# passing all of credit_test made predict.naive_bayes() warn that newdata had
# more features than the model has probability tables. The predictions are
# still computed from the same predictor columns.
test_features <- credit_test[
  ,
  names(credit_test) != "Creditability",
  drop = FALSE
]
test_predictions <- predict(naive_model, newdata = test_features)

# Confusion matrix: rows are predicted classes, columns are actual classes.
conf_nat <- table(
  predicted = test_predictions,
  actual = credit_test$Creditability
)

# Accuracy (%) = correctly classified records (diagonal) / all test records.
Accuracy <- sum(diag(conf_nat)) / sum(conf_nat) * 100
Accuracy

## [1] 80

###As we can see from the output above, the accuracy of the Naive Bayes classifier on the test set is 80%.

Part 2

Part 3

Part 4

Lab 2 - 530

Priya

9/29/2019