club <- read.csv("C:/Users/Public/LC_Week9_Data.csv")
head(club)
summary(club)
## a) Comb_Risk_One: Create a binary column by combining categories A and B (Low Risk) into one category and all the remaining categories in another (High Risk).
## b) Comb_Risk_Two: Create a binary column by combining categories A, B and C (Low Risk) into one category and all the remaining categories in another (High Risk).
library(dplyr)
club<-club %>%
mutate(Comb_Risk_One = ifelse(grade=='A' | grade=='B', 0, 1))
head(club)
club<-club%>%mutate(Comb_Risk_Two = ifelse(grade=='A' | grade=='B' |grade=='C',
0, 1))
break the file into two files filtering out data for 2012, 13, and
14 in one
file and 2015, 16 and 17 in another file.
club_12_13_14<- subset(club,club$issue_Year %in% c(2012,2013,2014))
club_15_16_17<- subset(club,club$issue_Year %in% c(2015,2016,2017))
##Check for null values.Impute the missing values.
summary(club_12_13_14)
summary(club_15_16_17)
club_12_13_14$mths_since_last_delinq[is.na(club_12_13_14$mths_since_last_delinq)
]=
mean(club_12_13_14$mths_since_last_delinq,na.rm=T)
club_15_16_17$mths_since_last_delinq[is.na(club_15_16_17$mths_since_last_delinq)
]=
mean(club_15_16_17$mths_since_last_delinq,na.rm=T)
predict Low and High-risk categories (for the two new response
variables)
using various modeling techniques like Naïve Bayes’, KNN, Logistic
Regression, and CART model
library(dplyr)
club_12_13_14 <- club_12_13_14 %>% mutate(int_rate=as.numeric(gsub("%", "",
int_rate)))
club_12_13_14<-club_12_13_14[complete.cases(club_12_13_14), ]
BASIC EDA
Relation between annual income and total payment.
library(ggplot2)
ggplot(club_12_13_14) + geom_point(aes(x = annual_inc, y = total_pymnt), colour
= "navy", alpha = 0.7)
barchart of home ownership vs loan amount
data.for.plot <- aggregate(club_12_13_14$loan_amnt, by =
list(club_12_13_14$home_ownership), FUN = mean)
barplot(data.for.plot$x, names.arg = data.for.plot$Group.1,
xlab = "Home ownership", ylab = "Mean loan amount")
Histogram of loan amount
hist(club_12_13_14$loan_amnt, xlab = "Loan Amount")
Find the outliers
#' Count extreme outliers in a numeric vector.
#'
#' A value counts as an extreme outlier when it lies more than `k`
#' interquartile ranges below the first quartile or above the third quartile.
#'
#' @param data Numeric vector. `quantile()` stops with an error if it has NAs.
#' @param k Fence width in IQR units; the default of 3 keeps the original rule.
#' @return Integer count of the values outside the two fences.
FindOutliers <- function(data, k = 3) {
  # One quantile() call yields both quartiles (the original called it twice).
  quartiles <- quantile(data, probs = c(0.25, 0.75), names = FALSE)
  iqr <- quartiles[2] - quartiles[1]
  extreme.threshold.upper <- quartiles[2] + iqr * k
  extreme.threshold.lower <- quartiles[1] - iqr * k
  sum(data > extreme.threshold.upper | data < extreme.threshold.lower)
}
df <-
subset(club_12_13_14,select=c(loan_amnt,int_rate,annual_inc,dti,total_pymnt,total_rec_int,last_pymnt_amnt,tot_cur_bal,Orig..Index))
apply(df, 2, FindOutliers)
Check for multicollinearity
Use correlation function for correlation analysis.Variables with
high
correlation leads to multicollinearity.These variables need to be
dropped.
M<-cor(df)
train-test split and build different models.
set.seed(111)
train.index <- sample(row.names(club_12_13_14.nw), 0.6*dim(club_12_13_14.nw)[1])
valid.index <- setdiff(row.names(club_12_13_14.nw), train.index)
train.df <- club_12_13_14.nw[train.index, ]
valid.df <- club_12_13_14.nw[valid.index, ]
KNN for a) Comb_Risk_One
library(class)
nn <- knn(train.df,valid.df,cl=train.df[, 39],k=13)
tab <- table(nn,valid.df[, 39])
accuracy <- function(x){sum(diag(x)/(sum(rowSums(x)))) * 100}
accuracy(tab)
KNN for b) Comb_Risk_Two
library(class)
nn1 <- knn(train.df,valid.df,cl=train.df[, 40],k=13)
tab1 <- table(nn1,valid.df[, 40])
accuracy <- function(x){sum(diag(x)/(sum(rowSums(x)))) * 100}
accuracy(tab1)
##Naïve Bayes’ for a) Comb_Risk_One
library(e1071)
library(caret)
nb<- naiveBayes(train.df[, 39] ~ ., data = train.df)
pred.class <- predict(nb, newdata = valid.df,type="class")
confusionMatrix(pred.class, valid.df[, 39])
##Naïve Bayes’ for a) Comb_Risk_Two
library(caret)
nb<- naiveBayes(train.df[, 40] ~ ., data = train.df)
pred.class <- predict(nb, newdata = valid.df)
confusionMatrix(pred.class, valid.df[, 40])
##Logistic regression for a)Comb_Risk_One
log <- glm(`club_12_13_14$Comb_Risk_One` ~ ., data = train.df, family =
"binomial")
options(scipen=999)
summary(log)
pred <- predict(log, valid.df, type = "response")
confusionMatrix(as.factor(ifelse(pred > 0.5, 1, 0)),
valid.df$`club_12_13_14$Comb_Risk_One`)
##Logistic regression for b)Comb_Risk_Two
log <- glm(`club_12_13_14$Comb_Risk_Two` ~ ., data = train.df, family =
"binomial")
options(scipen=999)
summary(log)
pred <- predict(log, valid.df, type = "response")
confusionMatrix(as.factor(ifelse(pred > 0.5, 1, 0)),
valid.df$`club_12_13_14$Comb_Risk_Two`)
##CART for a)Comb_Risk_One
library(rpart)
library(caret)
ct <- rpart(`club_12_13_14$Comb_Risk_One` ~ ., data = train.df, method =
"class", cp = 0, minsplit = 1)
pred.val <- predict(ct,valid.df,type = "class")
# generate confusion matrix for training data
confusionMatrix(pred.val, valid.df$`club_12_13_14$Comb_Risk_One`)
##CART for a)Comb_Risk_Two
library(rpart)
ct <- rpart(`club_12_13_14$Comb_Risk_Two` ~ ., data = train.df, method =
"class", cp = 0, minsplit = 1)
pred.val <- predict(ct,valid.df,type = "class")
# generate confusion matrix for training data
confusionMatrix(pred.val, valid.df$`club_12_13_14$Comb_Risk_Two`)
## From the accuracy results it can be seen that the Naive Bayes, KNN and CART models perform really well. Of these, the Naive Bayes classifier performs slightly better.
# For period 2015-2017
## Predict Low and High-risk categories (for the two new response variables) using various modeling techniques like Naïve Bayes, KNN, Logistic Regression, and CART model
club_15_16_17 <- club_15_16_17 %>% mutate(int_rate=as.numeric(gsub("%", "",
int_rate)))
club_15_16_17<-club_15_16_17[complete.cases(club_15_16_17), ]
BASIC EDA
Relation between annual income and total payment.
library(ggplot2)
ggplot(club_15_16_17) + geom_point(aes(x = annual_inc, y = total_pymnt), colour
= "navy", alpha = 0.7)
barchart of home ownership vs loan amount
data.for.plot <- aggregate(club_15_16_17$loan_amnt, by =
list(club_15_16_17$home_ownership), FUN = mean)
barplot(data.for.plot$x, names.arg = data.for.plot$Group.1,
xlab = "Home ownership", ylab = "Mean loan amount")
Histogram of loan amount
hist(club_15_16_17$loan_amnt, xlab = "Loan Amount")
Find the outliers
#' Count extreme outliers in a numeric vector.
#'
#' A value counts as an extreme outlier when it lies more than `k`
#' interquartile ranges below the first quartile or above the third quartile.
#'
#' @param data Numeric vector. `quantile()` stops with an error if it has NAs.
#' @param k Fence width in IQR units; the default of 3 keeps the original rule.
#' @return Integer count of the values outside the two fences.
FindOutliers <- function(data, k = 3) {
  # One quantile() call yields both quartiles (the original called it twice).
  quartiles <- quantile(data, probs = c(0.25, 0.75), names = FALSE)
  iqr <- quartiles[2] - quartiles[1]
  extreme.threshold.upper <- quartiles[2] + iqr * k
  extreme.threshold.lower <- quartiles[1] - iqr * k
  sum(data > extreme.threshold.upper | data < extreme.threshold.lower)
}
df <-
subset(club_15_16_17,select=c(loan_amnt,int_rate,annual_inc,dti,total_pymnt,total_rec_int,last_pymnt_amnt,tot_cur_bal,Orig..Index))
apply(df, 2, FindOutliers)
Check for multicollinearity
Use correlation function for correlation analysis.Variables with
high
correlation leads to multicollinearity.These variables need to be
dropped.
M<-cor(df)
train-test split and build different models.
set.seed(111)
train.index <- sample(row.names(club_15_16_17.nw), 0.6*dim(club_15_16_17.nw)[1])
valid.index <- setdiff(row.names(club_15_16_17.nw), train.index)
train.df <- club_15_16_17.nw[train.index, ]
valid.df <- club_15_16_17.nw[valid.index, ]
##Naïve Bayes’ for a) Comb_Risk_One
library(e1071)
library(caret)
nb<- naiveBayes(train.df[, 40] ~ ., data = train.df)
pred.class <- predict(nb, newdata = valid.df,type="class")
confusionMatrix(pred.class, valid.df[, 40])
##Naïve Bayes’ for a) Comb_Risk_Two
library(caret)
nb<- naiveBayes(train.df[, 41] ~ ., data = train.df)
pred.class <- predict(nb, newdata = valid.df)
confusionMatrix(pred.class, valid.df[, 41])
##Logistic regression for a)Comb_Risk_One
log <- glm(`club_15_16_17$Comb_Risk_One` ~ ., data = train.df, family =
"binomial")
options(scipen=999)
summary(log)
pred <- predict(log, valid.df, type = "response")
confusionMatrix(as.factor(ifelse(pred > 0.5, 1, 0)),
valid.df$`club_15_16_17$Comb_Risk_One`)
##Logistic regression for b)Comb_Risk_Two
log <- glm(`club_15_16_17$Comb_Risk_Two` ~ ., data = train.df, family =
"binomial")
options(scipen=999)
summary(log)
pred <- predict(log, valid.df, type = "response")
confusionMatrix(as.factor(ifelse(pred > 0.5, 1, 0)),
valid.df$`club_15_16_17$Comb_Risk_Two`)
##CART for a)Comb_Risk_One
library(rpart)
# Exclude the other response from the predictors: both responses are derived
# from `grade`, so keeping Comb_Risk_Two would leak the answer into the tree.
ct <- rpart(
  `club_15_16_17$Comb_Risk_One` ~ . - `club_15_16_17$Comb_Risk_Two`,
  data = train.df, method = "class", cp = 0, minsplit = 1
)
pred.val <- predict(ct, valid.df, type = "class")
# Confusion matrix on the validation data (this call was missing before).
confusionMatrix(pred.val, valid.df$`club_15_16_17$Comb_Risk_One`)
##CART for a)Comb_Risk_Two
library(rpart)
# Exclude the other response from the predictors: both responses are derived
# from `grade`, so keeping Comb_Risk_One would leak the answer into the tree.
ct <- rpart(
  `club_15_16_17$Comb_Risk_Two` ~ . - `club_15_16_17$Comb_Risk_One`,
  data = train.df, method = "class", cp = 0, minsplit = 1
)
# predict() used to share a line with the rpart() call, which does not parse.
pred.val <- predict(ct, valid.df, type = "class")
# Confusion matrix on the validation data.
confusionMatrix(pred.val, valid.df$`club_15_16_17$Comb_Risk_Two`)
From the accuracy results it can be seen that models Naive Bayes,KNN
and CART
perform really well.Of this Naive Bayes and CART classifier performs
slightly better.
---
title: "R Notebook"
output: html_notebook
---

```{r}
# Load the loan data and take a first look at its rows and column summaries.
club <- read.csv(file = "C:/Users/Public/LC_Week9_Data.csv")
head(x = club)
summary(object = club)
```
## a) Comb_Risk_One: Create a binary column by combining categories A and B (Low Risk) into one category and all the remaining categories in another (High Risk).
## b) Comb_Risk_Two: Create a binary column by combining categories A, B and C (Low Risk) into one category and all the remaining categories in another (High Risk).
```{r}
library(dplyr)
# Comb_Risk_One: grades A and B are low risk (0), every other grade is 1.
club <- mutate(
  club,
  Comb_Risk_One = ifelse(grade == "A" | grade == "B", 0, 1)
)
head(club)
# Comb_Risk_Two: grades A, B and C are low risk (0), every other grade is 1.
club <- mutate(
  club,
  Comb_Risk_Two = ifelse(grade == "A" | grade == "B" | grade == "C", 0, 1)
)
```
## Break the file into two files: data for 2012, 2013 and 2014 in one file, and data for 2015, 2016 and 2017 in another.
```{r}
# Partition the loans by issue year into the two study periods.
in.first.period <- club$issue_Year %in% 2012:2014
in.second.period <- club$issue_Year %in% 2015:2017
club_12_13_14 <- club[in.first.period, ]
club_15_16_17 <- club[in.second.period, ]
```
## Check for null values. Impute the missing values.
```{r}
# The summaries list the NA count of every column in each period.
summary(club_12_13_14)
summary(club_15_16_17)

# Mean-impute mths_since_last_delinq separately within each period.
# TRUE replaces T: T is an ordinary variable that can be reassigned.
missing.12.14 <- is.na(club_12_13_14$mths_since_last_delinq)
club_12_13_14$mths_since_last_delinq[missing.12.14] <-
  mean(club_12_13_14$mths_since_last_delinq, na.rm = TRUE)

missing.15.17 <- is.na(club_15_16_17$mths_since_last_delinq)
club_15_16_17$mths_since_last_delinq[missing.15.17] <-
  mean(club_15_16_17$mths_since_last_delinq, na.rm = TRUE)
```
## Predict Low and High-risk categories (for the two new response variables) using various modeling techniques like Naïve Bayes, KNN, Logistic Regression, and CART model
```{r}
library(dplyr)
# Strip the percent sign so int_rate becomes numeric.
club_12_13_14 <- mutate(
  club_12_13_14,
  int_rate = as.numeric(gsub("%", "", int_rate))
)
# Keep only the rows without any missing value.
keep.rows <- complete.cases(club_12_13_14)
club_12_13_14 <- club_12_13_14[keep.rows, ]
```
## BASIC EDA
## Relation between annual income and total payment.
```{r}
library(ggplot2)
# Scatter plot of total payment against annual income.
ggplot(club_12_13_14, aes(x = annual_inc, y = total_pymnt)) +
  geom_point(colour = "navy", alpha = 0.7)
```
## barchart of home ownership vs loan amount
```{r}
# Mean loan amount per home-ownership category, drawn as a bar chart.
data.for.plot <- aggregate(
  club_12_13_14$loan_amnt,
  by = list(club_12_13_14$home_ownership),
  FUN = mean
)
barplot(
  data.for.plot$x,
  names.arg = data.for.plot$Group.1,
  xlab = "Home ownership",
  ylab = "Mean loan amount"
)
```
## Histogram of loan amount
```{r}
# Histogram of loan_amnt for the 2012-2014 subset.
hist(club_12_13_14$loan_amnt, xlab = "Loan Amount")
```
## Find the outliers
```{r}
#' Count extreme outliers in a numeric vector.
#'
#' A value counts as an extreme outlier when it lies more than `k`
#' interquartile ranges below the first quartile or above the third quartile.
#'
#' @param data Numeric vector. `quantile()` stops with an error if it has NAs.
#' @param k Fence width in IQR units; the default of 3 keeps the original rule.
#' @return Integer count of the values outside the two fences.
FindOutliers <- function(data, k = 3) {
  # One quantile() call yields both quartiles (the original called it twice).
  quartiles <- quantile(data, probs = c(0.25, 0.75), names = FALSE)
  iqr <- quartiles[2] - quartiles[1]
  extreme.threshold.upper <- quartiles[2] + iqr * k
  extreme.threshold.lower <- quartiles[1] - iqr * k
  sum(data > extreme.threshold.upper | data < extreme.threshold.lower)
}
# Numeric columns screened for extreme outliers.
numeric.cols <- c(
  "loan_amnt", "int_rate", "annual_inc", "dti", "total_pymnt",
  "total_rec_int", "last_pymnt_amnt", "tot_cur_bal", "Orig..Index"
)
df <- club_12_13_14[, numeric.cols]
# Extreme-outlier count for each column.
apply(df, 2, FindOutliers)
```
## Check for multicollinearity
### Use the correlation function for correlation analysis. Variables with high correlation lead to multicollinearity. These variables need to be dropped.
```{r}
# Pairwise correlations between the numeric columns of df.
M <- cor(df)
# Display the matrix: it was computed but never shown, although the choice of
# which variables to drop is read off it.
round(M, 2)
```
## total_pymnt, loan_amnt and total_rec_int are correlated. We can remove 2 of the 3 variables.
```{r}
# Remove total_pymnt and total_rec_int; every other column of df is kept.
df <- df[, setdiff(names(df), c("total_pymnt", "total_rec_int"))]
```
###Scaling and standardization of predictors.
```{r}
# Centre and scale every column of df (the scale() defaults), then convert the
# resulting matrix back to a data frame.
df.scaled <- as.data.frame(scale(df))
```
##combine scaled numerical columns with categorical column in original dataframe
```{r}
library(forcats)
library(dummies)
library(dplyr)

# Categorical predictors to dummy-encode. `addr_state` used to be split across
# two lines ("addr_sta" / "te"), which stopped this chunk from parsing.
club_12_13_14.dummy <- select(
  club_12_13_14,
  term, emp_length, home_ownership, loan_status, purpose,
  addr_state, application_type
)

# Apply fct_lump(n = 5) to each selected column so that only its most frequent
# levels stay separate.
club_12_13_14.dummy[] <- lapply(club_12_13_14.dummy, fct_lump, n = 5)

df.dummy <- dummy.data.frame(club_12_13_14.dummy)

# The two responses are bound without names, so the resulting columns are
# called `club_12_13_14$Comb_Risk_One` and `club_12_13_14$Comb_Risk_Two`; the
# modelling chunks refer to them by exactly these names.
club_12_13_14.nw <- cbind(
  df.dummy, df.scaled,
  club_12_13_14$Comb_Risk_One, club_12_13_14$Comb_Risk_Two
)
club_12_13_14.nw$`club_12_13_14$Comb_Risk_One` <-
  as.factor(club_12_13_14.nw$`club_12_13_14$Comb_Risk_One`)
club_12_13_14.nw$`club_12_13_14$Comb_Risk_Two` <-
  as.factor(club_12_13_14.nw$`club_12_13_14$Comb_Risk_Two`)

# Integer columns (the dummy indicators) are treated as categorical.
club_12_13_14.nw <- club_12_13_14.nw %>% mutate_if(is.integer, as.factor)
```

## train-test split and build different models.
```{r}
# Reproducible split keyed on row names: 60% training, the rest validation.
set.seed(111)
all.rows <- row.names(club_12_13_14.nw)
train.index <- sample(all.rows, 0.6 * nrow(club_12_13_14.nw))
valid.index <- setdiff(all.rows, train.index)
train.df <- club_12_13_14.nw[train.index, ]
valid.df <- club_12_13_14.nw[valid.index, ]
```
## KNN for a) Comb_Risk_One
```{r}
library(class)
# The original passed the whole data frame to knn(), so both response columns
# were used as features. Select the predictors by name instead.
# NOTE(review): assumes column 39 is `club_12_13_14$Comb_Risk_One` - confirm.
target.col <- "club_12_13_14$Comb_Risk_One"
predictor.cols <- setdiff(
  names(train.df),
  c("club_12_13_14$Comb_Risk_One", "club_12_13_14$Comb_Risk_Two")
)
nn <- knn(
  train.df[, predictor.cols], valid.df[, predictor.cols],
  cl = train.df[[target.col]], k = 13
)
tab <- table(nn, valid.df[[target.col]])
# Accuracy in percent: share of the table that lies on the diagonal.
accuracy <- function(x) {
  sum(diag(x)) / sum(x) * 100
}
accuracy(tab)
```
## KNN for b) Comb_Risk_Two
```{r}
library(class)
# The original passed the whole data frame to knn(), so both response columns
# were used as features. Select the predictors by name instead.
# NOTE(review): assumes column 40 is `club_12_13_14$Comb_Risk_Two` - confirm.
target.col <- "club_12_13_14$Comb_Risk_Two"
predictor.cols <- setdiff(
  names(train.df),
  c("club_12_13_14$Comb_Risk_One", "club_12_13_14$Comb_Risk_Two")
)
nn1 <- knn(
  train.df[, predictor.cols], valid.df[, predictor.cols],
  cl = train.df[[target.col]], k = 13
)
tab1 <- table(nn1, valid.df[[target.col]])
# Accuracy in percent: share of the table that lies on the diagonal.
accuracy <- function(x) {
  sum(diag(x)) / sum(x) * 100
}
accuracy(tab1)
```
## Naïve Bayes for a) Comb_Risk_One
```{r}
library(e1071)
library(caret)
# The original formula `train.df[, 39] ~ .` expanded `.` to every column of
# train.df, including both response columns, so the model was trained on its
# own target. Pass the predictors and the response explicitly instead.
# NOTE(review): assumes column 39 is `club_12_13_14$Comb_Risk_One` - confirm.
target.col <- "club_12_13_14$Comb_Risk_One"
predictor.cols <- setdiff(
  names(train.df),
  c("club_12_13_14$Comb_Risk_One", "club_12_13_14$Comb_Risk_Two")
)
nb <- naiveBayes(x = train.df[, predictor.cols], y = train.df[[target.col]])
pred.class <- predict(nb, newdata = valid.df[, predictor.cols], type = "class")
confusionMatrix(pred.class, valid.df[[target.col]])
```
## Naïve Bayes for b) Comb_Risk_Two
```{r}
library(caret)
# The original formula `train.df[, 40] ~ .` expanded `.` to every column of
# train.df, including both response columns, so the model was trained on its
# own target. Pass the predictors and the response explicitly instead.
# NOTE(review): assumes column 40 is `club_12_13_14$Comb_Risk_Two` - confirm.
target.col <- "club_12_13_14$Comb_Risk_Two"
predictor.cols <- setdiff(
  names(train.df),
  c("club_12_13_14$Comb_Risk_One", "club_12_13_14$Comb_Risk_Two")
)
nb <- naiveBayes(x = train.df[, predictor.cols], y = train.df[[target.col]])
pred.class <- predict(nb, newdata = valid.df[, predictor.cols])
confusionMatrix(pred.class, valid.df[[target.col]])
```
##Logistic regression for a)Comb_Risk_One
```{r}
# Fit on every predictor except the other response: both responses are derived
# from `grade`, so keeping Comb_Risk_Two would leak the answer into the model.
log <- glm(
  `club_12_13_14$Comb_Risk_One` ~ . - `club_12_13_14$Comb_Risk_Two`,
  data = train.df, family = "binomial"
)
options(scipen = 999)
summary(log)
pred <- predict(log, valid.df, type = "response")
# Give the predicted classes the reference's levels so confusionMatrix() still
# works when every prediction falls on one side of the 0.5 cut-off.
actual <- valid.df$`club_12_13_14$Comb_Risk_One`
predicted <- factor(ifelse(pred > 0.5, 1, 0), levels = levels(actual))
confusionMatrix(predicted, actual)
```
##Logistic regression for b)Comb_Risk_Two
```{r}
# Fit on every predictor except the other response: both responses are derived
# from `grade`, so keeping Comb_Risk_One would leak the answer into the model.
log <- glm(
  `club_12_13_14$Comb_Risk_Two` ~ . - `club_12_13_14$Comb_Risk_One`,
  data = train.df, family = "binomial"
)
options(scipen = 999)
summary(log)
pred <- predict(log, valid.df, type = "response")
# Give the predicted classes the reference's levels so confusionMatrix() still
# works when every prediction falls on one side of the 0.5 cut-off.
actual <- valid.df$`club_12_13_14$Comb_Risk_Two`
predicted <- factor(ifelse(pred > 0.5, 1, 0), levels = levels(actual))
confusionMatrix(predicted, actual)
```
##CART for a)Comb_Risk_One
```{r}
library(rpart)
library(caret)
# Exclude the other response from the predictors: both responses are derived
# from `grade`, so keeping Comb_Risk_Two would leak the answer into the tree.
ct <- rpart(
  `club_12_13_14$Comb_Risk_One` ~ . - `club_12_13_14$Comb_Risk_Two`,
  data = train.df, method = "class", cp = 0, minsplit = 1
)
pred.val <- predict(ct, valid.df, type = "class")
# Confusion matrix on the validation data.
confusionMatrix(pred.val, valid.df$`club_12_13_14$Comb_Risk_One`)
```
## CART for b) Comb_Risk_Two

```{r}
library(rpart)
# Exclude the other response from the predictors: both responses are derived
# from `grade`, so keeping Comb_Risk_One would leak the answer into the tree.
ct <- rpart(
  `club_12_13_14$Comb_Risk_Two` ~ . - `club_12_13_14$Comb_Risk_One`,
  data = train.df, method = "class", cp = 0, minsplit = 1
)
pred.val <- predict(ct, valid.df, type = "class")
# Confusion matrix on the validation data.
confusionMatrix(pred.val, valid.df$`club_12_13_14$Comb_Risk_Two`)
```
## From the accuracy results it can be seen that the Naive Bayes, KNN and CART models perform really well. Of these, the Naive Bayes classifier performs slightly better.
# For period 2015-2017
## predict Low and High-risk categories (for the two new response variables)
using various modeling techniques like Naïve Bayes’, KNN, Logistic Regression,
and CART model
```{r}
# Strip the percent sign so int_rate becomes numeric.
club_15_16_17 <- mutate(
  club_15_16_17,
  int_rate = as.numeric(gsub("%", "", int_rate))
)
# Keep only the rows without any missing value.
keep.rows <- complete.cases(club_15_16_17)
club_15_16_17 <- club_15_16_17[keep.rows, ]
```
## BASIC EDA
## Relation between annual income and total payment.
```{r}
library(ggplot2)
# Scatter plot of total payment against annual income.
ggplot(club_15_16_17, aes(x = annual_inc, y = total_pymnt)) +
  geom_point(colour = "navy", alpha = 0.7)
```
## barchart of home ownership vs loan amount
```{r}
# Mean loan amount per home-ownership category, drawn as a bar chart.
data.for.plot <- aggregate(
  club_15_16_17$loan_amnt,
  by = list(club_15_16_17$home_ownership),
  FUN = mean
)
barplot(
  data.for.plot$x,
  names.arg = data.for.plot$Group.1,
  xlab = "Home ownership",
  ylab = "Mean loan amount"
)
```
## Histogram of loan amount
```{r}
# Histogram of loan_amnt for the 2015-2017 subset.
hist(club_15_16_17$loan_amnt, xlab = "Loan Amount")
```
## Find the outliers
```{r}
#' Count extreme outliers in a numeric vector.
#'
#' A value counts as an extreme outlier when it lies more than `k`
#' interquartile ranges below the first quartile or above the third quartile.
#'
#' @param data Numeric vector. `quantile()` stops with an error if it has NAs.
#' @param k Fence width in IQR units; the default of 3 keeps the original rule.
#' @return Integer count of the values outside the two fences.
FindOutliers <- function(data, k = 3) {
  # One quantile() call yields both quartiles (the original called it twice).
  quartiles <- quantile(data, probs = c(0.25, 0.75), names = FALSE)
  iqr <- quartiles[2] - quartiles[1]
  extreme.threshold.upper <- quartiles[2] + iqr * k
  extreme.threshold.lower <- quartiles[1] - iqr * k
  sum(data > extreme.threshold.upper | data < extreme.threshold.lower)
}
# Numeric columns screened for extreme outliers.
numeric.cols <- c(
  "loan_amnt", "int_rate", "annual_inc", "dti", "total_pymnt",
  "total_rec_int", "last_pymnt_amnt", "tot_cur_bal", "Orig..Index"
)
df <- club_15_16_17[, numeric.cols]
# Extreme-outlier count for each column.
apply(df, 2, FindOutliers)
```
## Check for multicollinearity
### Use correlation function for correlation analysis.Variables with high
correlation leads to multicollinearity.These variables need to be dropped.
```{r}
# Pairwise correlations between the numeric columns of df.
M <- cor(df)
# Display the matrix: it was computed but never shown, although the choice of
# which variables to drop is read off it.
round(M, 2)
```
## total_pymnt,loan_amnt and total_rec_int are correlated.We can remove 2 of the
3 variables.
```{r}
# Remove total_pymnt and total_rec_int; every other column of df is kept.
df <- df[, setdiff(names(df), c("total_pymnt", "total_rec_int"))]
```
###Scaling and standardization of predictors.
```{r}
# Centre and scale every column of df (the scale() defaults), then convert the
# resulting matrix back to a data frame.
df.scaled <- as.data.frame(scale(df))
```
##combine scaled numerical columns with categorical column in original dataframe
```{r}
library(forcats)
library(dummies)
library(dplyr)

# Categorical predictors to dummy-encode. `addr_state` used to be split across
# two lines ("addr_sta" / "te"), which stopped this chunk from parsing.
club_15_16_17.dummy <- select(
  club_15_16_17,
  term, emp_length, home_ownership, loan_status, purpose,
  addr_state, application_type
)

# Apply fct_lump(n = 5) to each selected column so that only its most frequent
# levels stay separate.
club_15_16_17.dummy[] <- lapply(club_15_16_17.dummy, fct_lump, n = 5)

df.dummy <- dummy.data.frame(club_15_16_17.dummy)

# The two responses are bound without names, so the resulting columns are
# called `club_15_16_17$Comb_Risk_One` and `club_15_16_17$Comb_Risk_Two`; the
# modelling chunks refer to them by exactly these names.
club_15_16_17.nw <- cbind(
  df.dummy, df.scaled,
  club_15_16_17$Comb_Risk_One, club_15_16_17$Comb_Risk_Two
)
club_15_16_17.nw$`club_15_16_17$Comb_Risk_One` <-
  as.factor(club_15_16_17.nw$`club_15_16_17$Comb_Risk_One`)
club_15_16_17.nw$`club_15_16_17$Comb_Risk_Two` <-
  as.factor(club_15_16_17.nw$`club_15_16_17$Comb_Risk_Two`)

# Integer columns (the dummy indicators) are treated as categorical.
club_15_16_17.nw <- club_15_16_17.nw %>% mutate_if(is.integer, as.factor)
```
## train-test split and build different models.
```{r}
# Reproducible split keyed on row names: 60% training, the rest validation.
set.seed(111)
all.rows <- row.names(club_15_16_17.nw)
train.index <- sample(all.rows, 0.6 * nrow(club_15_16_17.nw))
valid.index <- setdiff(all.rows, train.index)
train.df <- club_15_16_17.nw[train.index, ]
valid.df <- club_15_16_17.nw[valid.index, ]
```
##Naïve Bayes’ for a) Comb_Risk_One
```{r}
library(e1071)
library(caret)
# The original formula `train.df[, 40] ~ .` expanded `.` to every column of
# train.df, including both response columns, so the model was trained on its
# own target. Pass the predictors and the response explicitly instead.
# NOTE(review): assumes column 40 is `club_15_16_17$Comb_Risk_One` - confirm.
target.col <- "club_15_16_17$Comb_Risk_One"
predictor.cols <- setdiff(
  names(train.df),
  c("club_15_16_17$Comb_Risk_One", "club_15_16_17$Comb_Risk_Two")
)
nb <- naiveBayes(x = train.df[, predictor.cols], y = train.df[[target.col]])
pred.class <- predict(nb, newdata = valid.df[, predictor.cols], type = "class")
confusionMatrix(pred.class, valid.df[[target.col]])
```
## Naïve Bayes for b) Comb_Risk_Two
```{r}
library(caret)
# The original formula `train.df[, 41] ~ .` expanded `.` to every column of
# train.df, including both response columns, so the model was trained on its
# own target. Pass the predictors and the response explicitly instead.
# NOTE(review): assumes column 41 is `club_15_16_17$Comb_Risk_Two` - confirm.
target.col <- "club_15_16_17$Comb_Risk_Two"
predictor.cols <- setdiff(
  names(train.df),
  c("club_15_16_17$Comb_Risk_One", "club_15_16_17$Comb_Risk_Two")
)
nb <- naiveBayes(x = train.df[, predictor.cols], y = train.df[[target.col]])
pred.class <- predict(nb, newdata = valid.df[, predictor.cols])
confusionMatrix(pred.class, valid.df[[target.col]])
```
##Logistic regression for a)Comb_Risk_One
```{r}
# Fit on every predictor except the other response: both responses are derived
# from `grade`, so keeping Comb_Risk_Two would leak the answer into the model.
log <- glm(
  `club_15_16_17$Comb_Risk_One` ~ . - `club_15_16_17$Comb_Risk_Two`,
  data = train.df, family = "binomial"
)
options(scipen = 999)
summary(log)
pred <- predict(log, valid.df, type = "response")
# Give the predicted classes the reference's levels so confusionMatrix() still
# works when every prediction falls on one side of the 0.5 cut-off.
actual <- valid.df$`club_15_16_17$Comb_Risk_One`
predicted <- factor(ifelse(pred > 0.5, 1, 0), levels = levels(actual))
confusionMatrix(predicted, actual)
```
##Logistic regression for b)Comb_Risk_Two
```{r}
# Fit on every predictor except the other response: both responses are derived
# from `grade`, so keeping Comb_Risk_One would leak the answer into the model.
log <- glm(
  `club_15_16_17$Comb_Risk_Two` ~ . - `club_15_16_17$Comb_Risk_One`,
  data = train.df, family = "binomial"
)
options(scipen = 999)
summary(log)
pred <- predict(log, valid.df, type = "response")
# Give the predicted classes the reference's levels so confusionMatrix() still
# works when every prediction falls on one side of the 0.5 cut-off.
actual <- valid.df$`club_15_16_17$Comb_Risk_Two`
predicted <- factor(ifelse(pred > 0.5, 1, 0), levels = levels(actual))
confusionMatrix(predicted, actual)
```
##CART for a)Comb_Risk_One
```{r}
library(rpart)
# Exclude the other response from the predictors: both responses are derived
# from `grade`, so keeping Comb_Risk_Two would leak the answer into the tree.
ct <- rpart(
  `club_15_16_17$Comb_Risk_One` ~ . - `club_15_16_17$Comb_Risk_Two`,
  data = train.df, method = "class", cp = 0, minsplit = 1
)
pred.val <- predict(ct, valid.df, type = "class")
# Confusion matrix on the validation data (this call was missing before).
confusionMatrix(pred.val, valid.df$`club_15_16_17$Comb_Risk_One`)
```
## CART for b) Comb_Risk_Two
```{r}
library(rpart)
# Exclude the other response from the predictors: both responses are derived
# from `grade`, so keeping Comb_Risk_One would leak the answer into the tree.
ct <- rpart(
  `club_15_16_17$Comb_Risk_Two` ~ . - `club_15_16_17$Comb_Risk_One`,
  data = train.df, method = "class", cp = 0, minsplit = 1
)
# predict() used to share a line with the rpart() call, which does not parse.
pred.val <- predict(ct, valid.df, type = "class")
# Confusion matrix on the validation data.
confusionMatrix(pred.val, valid.df$`club_15_16_17$Comb_Risk_Two`)
```
## From the accuracy results it can be seen that the Naive Bayes, KNN and CART models perform really well. Of these, the Naive Bayes and CART classifiers perform slightly better.

