Recitation Problem: Chapter 5, Question 2, Part (g)

n = 1:100000
plot(n, sapply(n, function(n) 1 - (1 - 1/n)^n),
     xlab = "n", ylab = "Prob. that jth obs. is in sample",
     type = "l", log = "x")
abline(h = 1 - exp(-1), lty = "dotted")
[Plot: probability that the jth observation appears in the bootstrap sample vs. n (log x-axis), with a dotted line at 1 - exp(-1)]

Recitation Problem: Chapter 5, Question 2, Part (h)

n = 10000
store = rep(NA, n)
# Draw 10000 bootstrap samples of size 100 (with replacement from 1:100) and
# record whether observation 4 appears at least once in each.
for (i in 1:n) {
  store[i] = sum(sample(1:100, replace = TRUE) == 4) > 0
}
mean(store)
## [1] 0.635
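# The simulated estimate can be checked against the closed-form probability
# (a minimal arithmetic sketch, not part of the assignment output):
1 - (1 - 1/100)^100   # ~0.634, in line with the simulated 0.635
1 - exp(-1)           # limiting value as n grows, ~0.632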
# PROBLEM 1

Load the abalone sample dataset from the UCI Machine Learning Repository (abalone.data) into R using a dataframe. Remove all observations in the Infant category, keeping the Male/Female classes. Using the caret package, use createDataPartition to perform an 80/20 test-train split (80% training and 20% testing). Fit a logistic regression using all feature variables via glm, and observe which predictors are relevant. Do the confidence intervals for the predictors contain 0 within the range? How does this relate to the null hypothesis? Use the confusionMatrix function in caret to observe testing results (use a 50% cutoff to tag Male/Female) - how does the accuracy compare to a random classifier ROC curve? Use the corrplot package to plot correlations between the predictors. How does this help explain the classifier performance?

library(stringr)
library(knitr)
library(readr)
library("data.table")
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(corrplot)
## corrplot 0.92 loaded
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ROCR)

abaloneURl = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data"
abaloneTDataSet = fread(abaloneURl, header = FALSE)
abaloneHeader = c("Sex","Length","Diameter","Height","Whole weight","Shucked weight","Viscera weight","Shell weight","Rings")
str(abaloneHeader)
##  chr [1:9] "Sex" "Length" "Diameter" "Height" "Whole weight" ...
summary(abaloneHeader)
##    Length     Class      Mode 
##         9 character character
print(abaloneHeader)
## [1] "Sex"            "Length"         "Diameter"       "Height"        
## [5] "Whole weight"   "Shucked weight" "Viscera weight" "Shell weight"  
## [9] "Rings"
# Remove all observations in the Infant category, keeping the Male/Female classes
colnames(abaloneTDataSet) = abaloneHeader

# which() returns the row indices where the condition is TRUE
condition = which(abaloneTDataSet$Sex!="I")
str(condition)
##  int [1:2835] 1 2 3 4 7 8 9 10 11 12 ...
# All rows with Sex == "I" (Infant) are removed
checkInfant = abaloneTDataSet[condition]
str(checkInfant)
## Classes 'data.table' and 'data.frame':  2835 obs. of  9 variables:
##  $ Sex           : chr  "M" "M" "F" "M" ...
##  $ Length        : num  0.455 0.35 0.53 0.44 0.53 0.545 0.475 0.55 0.525 0.43 ...
##  $ Diameter      : num  0.365 0.265 0.42 0.365 0.415 0.425 0.37 0.44 0.38 0.35 ...
##  $ Height        : num  0.095 0.09 0.135 0.125 0.15 0.125 0.125 0.15 0.14 0.11 ...
##  $ Whole weight  : num  0.514 0.226 0.677 0.516 0.778 ...
##  $ Shucked weight: num  0.2245 0.0995 0.2565 0.2155 0.237 ...
##  $ Viscera weight: num  0.101 0.0485 0.1415 0.114 0.1415 ...
##  $ Shell weight  : num  0.15 0.07 0.21 0.155 0.33 0.26 0.165 0.32 0.21 0.135 ...
##  $ Rings         : int  15 7 9 10 20 16 9 19 14 10 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# glm() with family = binomial expects the response as a factor (or 0/1), so convert the categorical Sex column to a factor
checkInfant$Sex = factor(checkInfant$Sex)
summary(checkInfant$Sex)
##    F    M 
## 1307 1528
str(checkInfant$Sex)
##  Factor w/ 2 levels "F","M": 2 2 1 2 1 1 2 1 1 2 ...
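# Since glm() with family = binomial models the probability of the second
# factor level, it is worth confirming the level order (a quick sketch):
levels(checkInfant$Sex)       # "F" "M" -> the model estimates P(Sex == "M")
contrasts(checkInfant$Sex)    # M coded as 1, F as the reference level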
#Creating 80-20 Training Testing Split, createDataPartition() returns the indices
dataIndex = createDataPartition(y = checkInfant$Sex, p = 0.8, list = FALSE)
#Training data
hydroTrainData = checkInfant[dataIndex,]
#Testing data (note the minus sign)
hydroTestData = checkInfant[-dataIndex,]
dim(hydroTrainData)
## [1] 2269    9
dim(hydroTestData)
## [1] 566   9
#Predicting Sex using glm
logisticRegressionModel <- glm(Sex~.,family=binomial,data=hydroTrainData)
#Summary of our model
summary(logisticRegressionModel)
## 
## Call:
## glm(formula = Sex ~ ., family = binomial, data = hydroTrainData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8618  -1.2042   0.8908   1.1149   1.5020  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       2.693360   0.519302   5.187 2.14e-07 ***
## Length           -2.274966   2.333074  -0.975  0.32951    
## Diameter         -3.435759   2.765768  -1.242  0.21415    
## Height           -4.078882   2.502277  -1.630  0.10309    
## `Whole weight`    0.087696   0.846998   0.104  0.91754    
## `Shucked weight`  3.044605   1.010230   3.014  0.00258 ** 
## `Viscera weight` -2.664753   1.458107  -1.828  0.06762 .  
## `Shell weight`    0.429699   1.284029   0.335  0.73789    
## Rings            -0.002331   0.018263  -0.128  0.89842    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3131.7  on 2268  degrees of freedom
## Residual deviance: 3075.2  on 2260  degrees of freedom
## AIC: 3093.2
## 
## Number of Fisher Scoring iterations: 4
# Predictors with low p-values are the relevant ones; for those we can reject the null hypothesis of no relationship. From the output, "Shucked weight" is the only clearly significant predictor (p = 0.00258).

coef(logisticRegressionModel)
##      (Intercept)           Length         Diameter           Height 
##      2.693360337     -2.274965757     -3.435759456     -4.078881994 
##   `Whole weight` `Shucked weight` `Viscera weight`   `Shell weight` 
##      0.087695783      3.044605234     -2.664753414      0.429698517 
##            Rings 
##     -0.002331353
# Do the confidence intervals for the predictors contain 0 within the range? How does this relate to the null hypothesis?
confint(logisticRegressionModel)
## Waiting for profiling to be done...
##                        2.5 %      97.5 %
## (Intercept)       1.69009950  3.72687955
## Length           -6.85279350  2.29927491
## Diameter         -8.87032329  1.98032694
## Height           -9.37952928 -0.08601448
## `Whole weight`   -1.57995947  1.75137577
## `Shucked weight`  1.07224126  5.04031937
## `Viscera weight` -5.53170081  0.18961821
## `Shell weight`   -2.09101903  2.95419223
## Rings            -0.03814637  0.03349445
### All the predictors except "Shucked weight" (and, marginally, Height, whose profile interval just excludes 0) contain 0 within their confidence interval range. A CI containing 0 is consistent with the null hypothesis that the coefficient is 0, i.e., that there is no relationship between X and Y.
### The only attribute with a low p-value is "Shucked weight" (p = 0.00258); all the other attributes have high p-values.
### Thus we fail to reject the null hypothesis for every predictor except "Shucked weight": there is no evidence of a relationship between those predictors and the response.
# So "Shucked weight" is the only predictor which is clearly relevant.
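# As a quick sanity check (a sketch, not required by the problem), one could
# refit using only the significant predictor and compare AIC against the
# full model's 3093.2 reported above:
reducedModel = glm(Sex ~ `Shucked weight`, family = binomial,
                   data = hydroTrainData)
summary(reducedModel)$aic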

# Use the confusionMatrix function in caret to observe testing results (use a 50% cutoff to tag Male/Female)
predictChecktData1= predict(logisticRegressionModel,hydroTestData,type="response")
predictCheck = ifelse(predictChecktData1>0.5,'M','F')

confusionMatrix(as.factor(predictCheck),as.factor(hydroTestData$Sex))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   F   M
##          F  84  83
##          M 177 222
##                                           
##                Accuracy : 0.5406          
##                  95% CI : (0.4986, 0.5823)
##     No Information Rate : 0.5389          
##     P-Value [Acc > NIR] : 0.4836          
##                                           
##                   Kappa : 0.051           
##                                           
##  Mcnemar's Test P-Value : 8.04e-09        
##                                           
##             Sensitivity : 0.3218          
##             Specificity : 0.7279          
##          Pos Pred Value : 0.5030          
##          Neg Pred Value : 0.5564          
##              Prevalence : 0.4611          
##          Detection Rate : 0.1484          
##    Detection Prevalence : 0.2951          
##       Balanced Accuracy : 0.5249          
##                                           
##        'Positive' Class : F               
## 
predROC = prediction(predictChecktData1,hydroTestData$Sex)
performa = performance(predROC , "acc")
plot(performa)
[Plot: accuracy as a function of the probability cutoff]
perfROC = performance(predROC, measure = "tpr", x.measure = "fpr")
plot(perfROC)
abline(0,1)
[Plot: ROC curve (true positive rate vs. false positive rate) with the random-classifier diagonal]
cat("Area Under the Curve: ")
## Area Under the Curve:
auc = performance(predROC, measure = "auc")
aucValue = auc@y.values
print(aucValue)
## [[1]]
## [1] 0.5902519
# how does the accuracy compare to a random classifier ROC curve?

cat("\n As we can see ROC curve is better for our model, hence it will predict better than selecting
+  random value. Accuracy of the model is nearly 0.55.")
## 
##  As we can see ROC curve is better for our model, hence it will predict better than selecting
## +  random value. Accuracy of the model is nearly 0.55.
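# Cross-check of the AUC with pROC, which is already loaded (a sketch);
# roc() takes the true labels and the predicted probabilities:
rocObj = roc(hydroTestData$Sex, predictChecktData1)
auc(rocObj)   # should agree with the ROCR value of ~0.59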
# Use the corrplot package to plot correlations between the predictors
corrplot(cor(checkInfant[,-1]), method = "number")
[Plot: correlation matrix of the numeric predictors, method = "number"]
corrplot.mixed(cor(checkInfant[,2:8]))
[Plot: mixed corrplot of the seven size/weight predictors]
# The figures show positive correlations throughout: the Rings predictor is only weakly correlated with the others, whereas the size and weight measurements are all strongly positively correlated with one another.

# How does this help explain the classifier performance?

cat("\n All the variables are highly correlated this shows the poor performance of the classifier because
+  they don’t explain much. Good model have variables that are not correlated.")
## 
##  All the variables are highly correlated this shows the poor performance of the classifier because
## +  they don’t explain much. Good model have variables that are not correlated.
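# Multicollinearity can be quantified with variance inflation factors
# (a sketch; assumes the 'car' package is installed, as it is not loaded above):
library(car)
vif(logisticRegressionModel)   # values well above 5-10 confirm redundancy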
# PROBLEM 2

Load the mushroom sample dataset from the UCI Machine Learning Repository (agaricus-lepiota.data) into R using a dataframe (Note: There are missing values with a ? character, you will have to explain your handling of these). Create a Naive Bayes classifier using the e1071 package, using the sample function to split the data between 80% for training and 20% for testing. With the target class of interest being edible mushrooms, calculate the accuracy of the classifier both in-training and in-test. Use the table function to create a confusion matrix of predicted vs. actual classes - how many false positives did the model produce?

library(data.table) #Data Import
library(e1071)      #Naive Bayes

mushroomDataSet = fread("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",header=FALSE)
colnames(mushroomDataSet) =c("Class","cap-shape","cap-surface","cap-color","bruises","odor","gill-attachment","gill-spacing","gill-size","gill-color","stalk-shape","stalk-root","stalk-surface-above-ring","stalk-surface-below-ring","stalk-color-above-ring","stalk-color-below-ring","veil-type","veil-color","ring-number","ring-type","spore-print-color","population","habitat")
#Display Summary:
summary(mushroomDataSet)
##     Class            cap-shape         cap-surface         cap-color        
##  Length:8124        Length:8124        Length:8124        Length:8124       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    bruises              odor           gill-attachment    gill-spacing      
##  Length:8124        Length:8124        Length:8124        Length:8124       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##   gill-size          gill-color        stalk-shape         stalk-root       
##  Length:8124        Length:8124        Length:8124        Length:8124       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  stalk-surface-above-ring stalk-surface-below-ring stalk-color-above-ring
##  Length:8124              Length:8124              Length:8124           
##  Class :character         Class :character         Class :character      
##  Mode  :character         Mode  :character         Mode  :character      
##  stalk-color-below-ring  veil-type          veil-color       
##  Length:8124            Length:8124        Length:8124       
##  Class :character       Class :character   Class :character  
##  Mode  :character       Mode  :character   Mode  :character  
##  ring-number         ring-type         spore-print-color   population       
##  Length:8124        Length:8124        Length:8124        Length:8124       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    habitat         
##  Length:8124       
##  Class :character  
##  Mode  :character
table(mushroomDataSet$Class)
## 
##    e    p 
## 4208 3916
mushroomDataSet$Class <- factor(mushroomDataSet$Class)
dim(mushroomDataSet)
## [1] 8124   23
str(mushroomDataSet)
## Classes 'data.table' and 'data.frame':  8124 obs. of  23 variables:
##  $ Class                   : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
##  $ cap-shape               : chr  "x" "x" "b" "x" ...
##  $ cap-surface             : chr  "s" "s" "s" "y" ...
##  $ cap-color               : chr  "n" "y" "w" "w" ...
##  $ bruises                 : chr  "t" "t" "t" "t" ...
##  $ odor                    : chr  "p" "a" "l" "p" ...
##  $ gill-attachment         : chr  "f" "f" "f" "f" ...
##  $ gill-spacing            : chr  "c" "c" "c" "c" ...
##  $ gill-size               : chr  "n" "b" "b" "n" ...
##  $ gill-color              : chr  "k" "k" "n" "n" ...
##  $ stalk-shape             : chr  "e" "e" "e" "e" ...
##  $ stalk-root              : chr  "e" "c" "c" "e" ...
##  $ stalk-surface-above-ring: chr  "s" "s" "s" "s" ...
##  $ stalk-surface-below-ring: chr  "s" "s" "s" "s" ...
##  $ stalk-color-above-ring  : chr  "w" "w" "w" "w" ...
##  $ stalk-color-below-ring  : chr  "w" "w" "w" "w" ...
##  $ veil-type               : chr  "p" "p" "p" "p" ...
##  $ veil-color              : chr  "w" "w" "w" "w" ...
##  $ ring-number             : chr  "o" "o" "o" "o" ...
##  $ ring-type               : chr  "p" "p" "p" "p" ...
##  $ spore-print-color       : chr  "k" "n" "n" "k" ...
##  $ population              : chr  "s" "n" "n" "s" ...
##  $ habitat                 : chr  "u" "g" "m" "u" ...
##  - attr(*, ".internal.selfref")=<externalptr>
cat("Number of missing values = ",sum(mushroomDataSet=="?"))
## Number of missing values =  2480
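# Sanity check (sketch): count "?" per column; if the missing values are
# confined to stalk-root, filtering on that single column below is enough.
sapply(mushroomDataSet, function(col) sum(col == "?"))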
# Create a new dataset with the rows containing missing values removed
mushroomMissingVal = mushroomDataSet[mushroomDataSet$`stalk-root`!="?"]

cat(" \n There is little harm in dropping the observations with missing values, because plenty of rows remain after omitting them.")
##  
##  There is little harm in dropping the observations with missing values, because plenty of rows remain after omitting them.
#Creating a split
trainSize = floor(0.80*nrow(mushroomMissingVal))
trainIndex = sample(nrow(mushroomMissingVal), size = trainSize)
hydroTrainData = mushroomMissingVal[trainIndex,]
hydroTestData = mushroomMissingVal[-trainIndex,]


#Naive Bayes 
model = naiveBayes(hydroTrainData[,-1],hydroTrainData$Class)
# Prediction of Testing Data
testPrediction = predict(model,hydroTestData[,-1])
#Prediction of Training Data
trainPrediction = predict(model,hydroTrainData[,-1])
#Accuracy of Testing Model
cat("Accuracy of Testing Model: ",mean(testPrediction == hydroTestData$Class)*100,"%")
## Accuracy of Testing Model:  94.33127 %
#Accuracy of Training Model is
cat("Accuracy of Training Model: ",mean(trainPrediction == hydroTrainData$Class)*100,"%")
## Accuracy of Training Model:  95.06091 %
#Confusion Matrix
table(testPrediction, hydroTestData$Class)
##               
## testPrediction   e   p
##              e 682  56
##              p   8 383
# how many false positives did the model produce? 
confusionMatrix(table(testPrediction,hydroTestData$Class),dnn=c("Predicted","Actual"))
## Confusion Matrix and Statistics
## 
##               
## testPrediction   e   p
##              e 682  56
##              p   8 383
##                                           
##                Accuracy : 0.9433          
##                  95% CI : (0.9282, 0.9561)
##     No Information Rate : 0.6112          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8783          
##                                           
##  Mcnemar's Test P-Value : 4.228e-09       
##                                           
##             Sensitivity : 0.9884          
##             Specificity : 0.8724          
##          Pos Pred Value : 0.9241          
##          Neg Pred Value : 0.9795          
##              Prevalence : 0.6112          
##          Detection Rate : 0.6041          
##    Detection Prevalence : 0.6537          
##       Balanced Accuracy : 0.9304          
##                                           
##        'Positive' Class : e               
## 
confusionMatrix(table(trainPrediction,hydroTrainData$Class),dnn=c("Predicted","Actual"))
## Confusion Matrix and Statistics
## 
##                
## trainPrediction    e    p
##               e 2790  215
##               p    8 1502
##                                           
##                Accuracy : 0.9506          
##                  95% CI : (0.9439, 0.9567)
##     No Information Rate : 0.6197          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8927          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9971          
##             Specificity : 0.8748          
##          Pos Pred Value : 0.9285          
##          Neg Pred Value : 0.9947          
##              Prevalence : 0.6197          
##          Detection Rate : 0.6179          
##    Detection Prevalence : 0.6656          
##       Balanced Accuracy : 0.9360          
##                                           
##        'Positive' Class : e               
## 
# From the test confusion matrix above (positive class = e, edible):
#   TP = 682, FP = 56, FN = 8, TN = 383
# Test accuracy is 0.9433 with 56 false positives (poisonous mushrooms
# predicted edible); training accuracy is 0.9506 with 215 false positives.
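# The false-positive count can also be read directly from the table (sketch):
tab = table(testPrediction, hydroTestData$Class)
tab["e", "p"]   # 56: predicted edible, actually poisonous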
# PROBLEM 3

Load the Yacht Hydrodynamics sample dataset from the UCI Machine Learning Repository (yacht_hydrodynamics.data) into R using a dataframe (Note: The feature labels need to be manually specified). Use the caret package to perform an 80/20 test-train split (via the createDataPartition function), and obtain a training fit for a linear model. (Hint: The model fit should use all available features with the residuary resistance as the target.). What are the training MSE/RMSE and R2 results? Next, use the caret package to perform a bootstrap from the full sample dataset with N=1000 samples for fitting a linear model (via the trainControl method), resulting in a training MSE/RMSE and R2 for each resample. Plot a histogram of the RMSE values, and provide a mean RMSE and R2 for the fit. How do these values compare to the basic model? How does the performance on the test set for the original and bootstrap model compare?

library(readr)
library(data.table)
library(caret)
library(ROCR)


hydroYacht = read.table("https://archive.ics.uci.edu/ml/machine-learning-databases/00243/yacht_hydrodynamics.data",header = F)
colnames(hydroYacht) = c("longitudinalPos","prismaticCoef","LDR","BDR","LBR","froudeNo","Residuary")


#Creating 80-20 Training Testing Split, createDataPartition() returns the indices
trainIndex = createDataPartition(y = hydroYacht$Residuary , p = 0.8, list = FALSE)

hydroTrainData = hydroYacht[trainIndex,]

hydroTestData = hydroYacht[-trainIndex,]

# Fit the linear model on the training split only, using all features.
# (The formula must use bare column names so that data = hydroTrainData is
# respected; qualifying them with hydroYacht$ would fit on the full dataset.)
hydroLinearModel1 = lm(Residuary ~ ., data = hydroTrainData)

MSEFunc = function(yActual, yPred)
{
  return (mean((yActual - yPred)^2))
}

hydroMSE = MSEFunc(hydroTrainData$Residuary, hydroLinearModel1$fitted.values)

# Results
cat("Training MSE: ", hydroMSE)
## Training MSE:  78.45015
cat("Training RMSE: ", sqrt(hydroMSE))
## Training RMSE:  8.857209
cat("Training R-squared: ",summary(hydroLinearModel1)$r.sq)
## Training R-squared:  0.6575638
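# Test-set evaluation of the basic model (a sketch, added so the comparison
# requested at the end can be made; caret's bootstrap model below refits on
# the same training data, so its test MSE should match this):
val_pred_basic = predict(hydroLinearModel1, hydroTestData)
cat("Testing MSE (basic): ", MSEFunc(hydroTestData$Residuary, val_pred_basic))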
train.control =  trainControl(method = "boot", number = 1000)

# Training the model
hydroLinearModel2 = train(Residuary~., data = hydroTrainData, method = "lm", trControl = train.control)


summary(hydroLinearModel2$resample$RMSE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   7.397   8.935   9.347   9.373   9.768  11.890
summary(hydroLinearModel2$resample$Rsquared)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5284  0.6284  0.6475  0.6470  0.6665  0.7269
# Histogram
hist(hydroLinearModel2$resample$RMSE, xlab = "RMSE Values", main = "Histogram of the RMSE values")
[Plot: histogram of the 1000 bootstrap RMSE values]
# Approximating the mean MSE by squaring the mean RMSE
# (note: mean(RMSE)^2 is not identical to mean(RMSE^2))
hydroMSE2 = mean(hydroLinearModel2$resample$RMSE)^2

# Results
cat("Training Mean MSE (Bootstrap): ", hydroMSE2)
## Training Mean MSE (Bootstrap):  87.85267
cat("Training Mean RMSE (Bootstrap): ", mean(hydroLinearModel2$resample$RMSE))
## Training Mean RMSE (Bootstrap):  9.372976
cat("Training Mean R-squared (Bootstrap): ",mean(hydroLinearModel2$resample$Rsquared))
## Training Mean R-squared (Bootstrap):  0.6470394
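# Direct computation of the mean MSE across resamples (sketch), for
# comparison with the squared-mean approximation above:
mean(hydroLinearModel2$resample$RMSE^2)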
val_pred_boot = predict(hydroLinearModel2,hydroTestData)
hydroMSE3 = MSEFunc(hydroTestData$Residuary, val_pred_boot)

RSSFunc = function (yActual, yPred)
{
  return (sum((yActual - yPred)^2))
}

TSSFunc = function (yActual)
{
  return (sum((yActual - mean(yActual))^2))
}

hydroRss = RSSFunc(hydroTestData$Residuary, val_pred_boot)
hydroTss = TSSFunc(hydroTestData$Residuary)

#  Results
cat("Testing MSE (Bootstrap): ",hydroMSE3)
## Testing MSE (Bootstrap):  73.00708
cat("Testing RMSE (Bootstrap): ",sqrt(hydroMSE3))
## Testing RMSE (Bootstrap):  8.544418
cat("Testing Mean R-squared: ",1 - (hydroRss/hydroTss))
## Testing Mean R-squared:  0.5881496
cat(" \n In the test set, there is no difference in performance between the original and bootstrap models as they are identical")
##  
##  In the test set, there is no difference in performance between the original and bootstrap models as they are identical
### How do these values compare to the basic model?
### Basic model: training RMSE = 8.86, R2 = 0.66.
### Bootstrap model: mean RMSE = 9.37, mean R2 = 0.65.
### The values are close; the bootstrap estimate is slightly more pessimistic, as expected from resampling.

### How does the performance on the test set for the original and bootstrap model compare?
### Both models give the same test results (RMSE = 8.54, R2 = 0.59), since the fitted coefficients are identical.
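# A direct verification (sketch): the bootstrap only changes how performance
# is estimated, so the final coefficients should match the plain lm() fit.
all.equal(coef(hydroLinearModel1), coef(hydroLinearModel2$finalModel))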
# PROBLEM 4

Load the German Credit Data sample dataset from the UCI Machine Learning Repository (german.data-numeric) into R using a dataframe (Note: The final column is the class variable coded as 1 or 2). Use the caret package to perform an 80/20 test-train split (via the createDataPartition function), and obtain a training fit for a logistic model via the glm package. (Hint: You may select a subset of the predictors based on exploratory analysis, or use all predictors for simplicity.). What are the training Precision/Recall and F1 results? Next, use the trainControl and train functions to perform a k=10 fold cross-validation fit of the same model, and obtain cross-validated training Precision/Recall and F1 values. How do these values compare to the original fit? How does the performance on the test set for the original and cross-validated model compare?

library(readr)
library(data.table)
library(caret)
gdnDataSet<-read.table("https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data-numeric",header = FALSE)

gdnDataSet$V25 = factor(gdnDataSet$V25)

# Making 80-20 Training Testing Split, createDataPartition() returns the indices
trainIndex = createDataPartition(y = gdnDataSet$V25 , p = 0.8, list = FALSE)

hydroTrainData = gdnDataSet[trainIndex,]

hydroTestData = gdnDataSet[-trainIndex,]

logisticModel1 = glm(V25~.,family=binomial,data=hydroTrainData)

value_fit_data = ifelse(logisticModel1$fitted.values > 0.5,2,1)
value_fit_data = factor(value_fit_data)

confusionMatrixData = confusionMatrix(value_fit_data, hydroTrainData$V25)
print(confusionMatrixData)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2
##          1 508 107
##          2  52 133
##                                           
##                Accuracy : 0.8012          
##                  95% CI : (0.7719, 0.8284)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 5.503e-11       
##                                           
##                   Kappa : 0.4936          
##                                           
##  Mcnemar's Test P-Value : 1.848e-05       
##                                           
##             Sensitivity : 0.9071          
##             Specificity : 0.5542          
##          Pos Pred Value : 0.8260          
##          Neg Pred Value : 0.7189          
##              Prevalence : 0.7000          
##          Detection Rate : 0.6350          
##    Detection Prevalence : 0.7688          
##       Balanced Accuracy : 0.7307          
##                                           
##        'Positive' Class : 1               
## 
# Results
cat("Training Precision: ", confusionMatrixData$byClass[5] * 100, "%")
## Training Precision:  82.60163 %
cat("Training Recall: ", confusionMatrixData$byClass[6] * 100, "%")
## Training Recall:  90.71429 %
cat("Training F1-Score: ", confusionMatrixData$byClass[7] * 100, "%")
## Training F1-Score:  86.46809 %
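# Cross-check (sketch): compute precision/recall/F1 by hand from the training
# confusion matrix, with class "1" as the positive class.
tab = table(value_fit_data, hydroTrainData$V25)
TP = tab["1", "1"]; FP = tab["1", "2"]; FN = tab["2", "1"]
precision = TP / (TP + FP)                           # 508/615 ~ 0.826
recall    = TP / (TP + FN)                           # 508/560 ~ 0.907
f1 = 2 * precision * recall / (precision + recall)   # ~0.865
c(precision = precision, recall = recall, f1 = f1)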
# Predict [with type = 'response', predict() returns probabilities of the form P(V25 = 2 | X), i.e., the probability of the second factor level]
probs = predict(logisticModel1, hydroTestData, type = "response")

# Applying a 50% cutoff: fitted probabilities > 0.5 are tagged class 2 (bad credit) and the rest class 1 (good credit)
val_fit_test = ifelse(probs > 0.5,2,1)
val_fit_test = factor(val_fit_test)

confusionMatrixDataTest = confusionMatrix(val_fit_test, hydroTestData$V25)
print(confusionMatrixDataTest)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2
##          1 122  37
##          2  18  23
##                                           
##                Accuracy : 0.725           
##                  95% CI : (0.6576, 0.7856)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 0.24545         
##                                           
##                   Kappa : 0.2801          
##                                           
##  Mcnemar's Test P-Value : 0.01522         
##                                           
##             Sensitivity : 0.8714          
##             Specificity : 0.3833          
##          Pos Pred Value : 0.7673          
##          Neg Pred Value : 0.5610          
##              Prevalence : 0.7000          
##          Detection Rate : 0.6100          
##    Detection Prevalence : 0.7950          
##       Balanced Accuracy : 0.6274          
##                                           
##        'Positive' Class : 1               
## 
# Summarize the results
cat("Testing Precision: ", confusionMatrixDataTest$byClass[5] * 100, "%")
## Testing Precision:  76.72956 %
cat("Testing Recall: ", confusionMatrixDataTest$byClass[6] * 100, "%")
## Testing Recall:  87.14286 %
cat("Testing F1-Score: ", confusionMatrixDataTest$byClass[7] * 100, "%")
## Testing F1-Score:  81.60535 %
#-------------------- Use the trainControl and train functions to perform a k = 10 fold cross-validation fit of the same model --------------------

train.control =  trainControl(method = "cv", number = 10)

# Model Train
logisticModelData = train(V25~., data = hydroTrainData, method = "glm", family = "binomial", trControl = train.control)


val_fit_cv = ifelse(logisticModelData$finalModel$fitted.values > 0.5,2,1)
val_fit_cv = factor(val_fit_cv)

confusion_matrix_cv = confusionMatrix(val_fit_cv, hydroTrainData$V25)
confusion_matrix_cv
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2
##          1 508 107
##          2  52 133
##                                           
##                Accuracy : 0.8012          
##                  95% CI : (0.7719, 0.8284)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 5.503e-11       
##                                           
##                   Kappa : 0.4936          
##                                           
##  Mcnemar's Test P-Value : 1.848e-05       
##                                           
##             Sensitivity : 0.9071          
##             Specificity : 0.5542          
##          Pos Pred Value : 0.8260          
##          Neg Pred Value : 0.7189          
##              Prevalence : 0.7000          
##          Detection Rate : 0.6350          
##    Detection Prevalence : 0.7688          
##       Balanced Accuracy : 0.7307          
##                                           
##        'Positive' Class : 1               
## 
# Results
cat("Training Precision with 10-fold CV: ", confusion_matrix_cv$byClass[5] * 100, "%")
## Training Precision with 10-fold CV:  82.60163 %
cat("Training Recall with 10-fold CV: ", confusion_matrix_cv$byClass[6] * 100, "%")
## Training Recall with 10-fold CV:  90.71429 %
cat("Training F1-Score with 10-fold CV: ", confusion_matrix_cv$byClass[7] * 100, "%")
## Training F1-Score with 10-fold CV:  86.46809 %
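# The honest cross-validated estimates (sketch) live in caret's per-fold
# resample table; the confusion matrix above is in-sample, because the final
# model is refit on the full training set after resampling.
logisticModelData$resample                                    # per-fold Accuracy/Kappa
colMeans(logisticModelData$resample[, c("Accuracy", "Kappa")])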
probs_cv = predict(logisticModelData, hydroTestData, type = "prob")

# Use the CV model's own test-set probabilities (column "2" holds P(class 2))
fittedVals_cv_test = ifelse(probs_cv[, "2"] > 0.5, 2, 1)
fittedVals_cv_test = factor(fittedVals_cv_test)

confusion_matrix_cv_test = confusionMatrix(fittedVals_cv_test, hydroTestData$V25)
print(confusion_matrix_cv_test)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2
##          1 122  37
##          2  18  23
##                                           
##                Accuracy : 0.725           
##                  95% CI : (0.6576, 0.7856)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 0.24545         
##                                           
##                   Kappa : 0.2801          
##                                           
##  Mcnemar's Test P-Value : 0.01522         
##                                           
##             Sensitivity : 0.8714          
##             Specificity : 0.3833          
##          Pos Pred Value : 0.7673          
##          Neg Pred Value : 0.5610          
##              Prevalence : 0.7000          
##          Detection Rate : 0.6100          
##    Detection Prevalence : 0.7950          
##       Balanced Accuracy : 0.6274          
##                                           
##        'Positive' Class : 1               
## 
# Results
cat("Testing Precision: ", confusion_matrix_cv_test$byClass[5] * 100, "%")
## Testing Precision:  76.72956 %
cat("Testing Recall: ", confusion_matrix_cv_test$byClass[6] * 100, "%")
## Testing Recall:  87.14286 %
cat("Testing F1-Score: ", confusion_matrix_cv_test$byClass[7] * 100, "%")
## Testing F1-Score:  81.60535 %
cat("\n we can see that above observations it can be see that both the cross validation and basic model have
+  same result.")
## 
##  we can see that above observations it can be see that both the cross validation and basic model have
## +  same result.