options(scipen=999)

setwd("C:\\Users\\user\\Desktop\\R_CODE_2023")
library(tidyverse)
library(MASS)
library(pROC)
library(rpart)
library(gtsummary)
library(rattle)
library(ISLR2)
library(ggplot2)
library(caret)
library(splines)
library(tidymodels)

IncomeData  = read.csv("Income2.csv", header=TRUE)
head(IncomeData)

  X Education Seniority   Income
1 1  21.58621  113.1034 99.91717
2 2  18.27586  119.3103 92.57913
3 3  12.06897  100.6897 34.67873
4 4  17.03448  187.5862 78.70281
5 5  19.93103   20.0000 68.00992
6 6  18.27586   26.2069 71.50449

plot(IncomeData$Education, IncomeData$Income)

## fit models and calculate MSE

mod.0 = lm(IncomeData$Income ~ IncomeData$Education)
summary(mod.0)


Call:
lm(formula = IncomeData$Income ~ IncomeData$Education)

Residuals:
    Min      1Q  Median      3Q     Max 
-19.568  -8.012   1.474   5.754  23.701 

Coefficients:
                     Estimate Std. Error t value        Pr(>|t|)    
(Intercept)          -41.9166     9.7689  -4.291        0.000192 ***
IncomeData$Education   6.3872     0.5812  10.990 0.0000000000115 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 11.93 on 28 degrees of freedom
Multiple R-squared:  0.8118,    Adjusted R-squared:  0.8051 
F-statistic: 120.8 on 1 and 28 DF,  p-value: 0.00000000001151

mse.0 = mean( (IncomeData$Income - predict(mod.0))^2 )
mse.0

[1] 132.7502

#Plot of Education vs Income

plot(IncomeData$Education, IncomeData$Income)
lines( IncomeData$Education, predict(mod.0))

mod.1 = lm(IncomeData$Income ~ poly(IncomeData$Education, degree=3))
summary(mod.1)


Call:
lm(formula = IncomeData$Income ~ poly(IncomeData$Education, degree = 3))

Residuals:
     Min       1Q   Median       3Q      Max 
-22.1594  -6.5919  -0.6362   7.8437  16.3670 

Coefficients:
                                        Estimate Std. Error t value
(Intercept)                               62.745      1.678  37.386
poly(IncomeData$Education, degree = 3)1  131.070      9.192  14.258
poly(IncomeData$Education, degree = 3)2    1.163      9.192   0.126
poly(IncomeData$Education, degree = 3)3  -42.239      9.192  -4.595
                                                    Pr(>|t|)    
(Intercept)                             < 0.0000000000000002 ***
poly(IncomeData$Education, degree = 3)1   0.0000000000000839 ***
poly(IncomeData$Education, degree = 3)2                  0.9    
poly(IncomeData$Education, degree = 3)3   0.0000979255001437 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9.192 on 26 degrees of freedom
Multiple R-squared:  0.8962,    Adjusted R-squared:  0.8842 
F-statistic: 74.81 on 3 and 26 DF,  p-value: 0.0000000000006476

mse.1 = mean( (IncomeData$Income - predict(mod.1))^2 )
mse.1

[1] 73.23476

mod.2 = lm(IncomeData$Income ~ bs(IncomeData$Education, degree=3))
summary(mod.2)


Call:
lm(formula = IncomeData$Income ~ bs(IncomeData$Education, degree = 3))

Residuals:
     Min       1Q   Median       3Q      Max 
-22.1594  -6.5919  -0.6362   7.8437  16.3670 

Coefficients:
                                      Estimate Std. Error t value     Pr(>|t|)
(Intercept)                             36.240      5.292   6.848 0.0000002865
bs(IncomeData$Education, degree = 3)1  -48.767     17.232  -2.830      0.00885
bs(IncomeData$Education, degree = 3)2   84.783     10.865   7.803 0.0000000281
bs(IncomeData$Education, degree = 3)3   47.310      7.794   6.070 0.0000020564
                                         
(Intercept)                           ***
bs(IncomeData$Education, degree = 3)1 ** 
bs(IncomeData$Education, degree = 3)2 ***
bs(IncomeData$Education, degree = 3)3 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9.192 on 26 degrees of freedom
Multiple R-squared:  0.8962,    Adjusted R-squared:  0.8842 
F-statistic: 74.81 on 3 and 26 DF,  p-value: 0.0000000000006476

plot(IncomeData$Education, IncomeData$Income)
lines( IncomeData$Education[order(IncomeData$Education)], predict(mod.2)[order(IncomeData$Education)] )

mse.2 = mean( (IncomeData$Income - predict(mod.2))^2 )
mse.2

[1] 73.23476

HeartData  = read.csv("Heart.csv", header=TRUE)

#View(Default) #attach(Default)

View(Default)

y = as.numeric( Default$default=="Yes" )
mean(y)

[1] 0.0333

fit logistic regression for different variables

mod.0 = glm( y ~ Default$balance, family=binomial)
summary(mod.0)


Call:
glm(formula = y ~ Default$balance, family = binomial)

Coefficients:
                   Estimate  Std. Error z value            Pr(>|z|)    
(Intercept)     -10.6513306   0.3611574  -29.49 <0.0000000000000002 ***
Default$balance   0.0054989   0.0002204   24.95 <0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2920.6  on 9999  degrees of freedom
Residual deviance: 1596.5  on 9998  degrees of freedom
AIC: 1600.5

Number of Fisher Scoring iterations: 8

mod.1 = glm( y ~ as.factor(Default$student), family=binomial)
summary(mod.1)


Call:
glm(formula = y ~ as.factor(Default$student), family = binomial)

Coefficients:
                              Estimate Std. Error z value             Pr(>|z|)
(Intercept)                   -3.50413    0.07071  -49.55 < 0.0000000000000002
as.factor(Default$student)Yes  0.40489    0.11502    3.52             0.000431
                                 
(Intercept)                   ***
as.factor(Default$student)Yes ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2920.6  on 9999  degrees of freedom
Residual deviance: 2908.7  on 9998  degrees of freedom
AIC: 2912.7

Number of Fisher Scoring iterations: 6

mod.2 = glm( y ~ as.factor(Default$student) + Default$balance, family=binomial)
summary(mod.2)


Call:
glm(formula = y ~ as.factor(Default$student) + Default$balance, 
    family = binomial)

Coefficients:
                                 Estimate  Std. Error z value
(Intercept)                   -10.7494959   0.3691914 -29.116
as.factor(Default$student)Yes  -0.7148776   0.1475190  -4.846
Default$balance                 0.0057381   0.0002318  24.750
                                          Pr(>|z|)    
(Intercept)                   < 0.0000000000000002 ***
as.factor(Default$student)Yes           0.00000126 ***
Default$balance               < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2920.6  on 9999  degrees of freedom
Residual deviance: 1571.7  on 9997  degrees of freedom
AIC: 1577.7

Number of Fisher Scoring iterations: 8

income=Default $ income
mod.3 = glm( y ~ as.factor(Default$student) + Default$balance + income, family=binomial)
summary(mod.3)


Call:
glm(formula = y ~ as.factor(Default$student) + Default$balance + 
    income, family = binomial)

Coefficients:
                                   Estimate    Std. Error z value
(Intercept)                   -10.869045196   0.492255516 -22.080
as.factor(Default$student)Yes  -0.646775807   0.236252529  -2.738
Default$balance                 0.005736505   0.000231895  24.738
income                          0.000003033   0.000008203   0.370
                                          Pr(>|z|)    
(Intercept)                   < 0.0000000000000002 ***
as.factor(Default$student)Yes              0.00619 ** 
Default$balance               < 0.0000000000000002 ***
income                                     0.71152    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2920.6  on 9999  degrees of freedom
Residual deviance: 1571.5  on 9996  degrees of freedom
AIC: 1579.5

Number of Fisher Scoring iterations: 8

#Plot of roc curve

roc_curve = roc(Default$default ~ predict(mod.3, type="response"))
plot.roc(roc_curve, print.auc=T, print.thres=T)

looking at classification based on p.hat = .5 cutoff

10-fold CV, repeated 5 times

train_model <- trainControl(method = "repeatedcv", number = 5, repeats = 10)

Train the model

model <- train(
  default ~ balance + income + student, 
  data = Default, 
  method = "glm",
  family = "binomial",
  trControl = train_model)
model

Generalized Linear Model 

10000 samples
    3 predictor
    2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 10 times) 
Summary of sample sizes: 8001, 8001, 7999, 8000, 7999, 8000, ... 
Resampling results:

  Accuracy   Kappa    
  0.9732499  0.4289372

#confusionMatrix

confusionMatrix(predict(model), Default$ default, positive="Yes")

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  9627  228
       Yes   40  105
                                               
               Accuracy : 0.9732               
                 95% CI : (0.9698, 0.9763)     
    No Information Rate : 0.9667               
    P-Value [Acc > NIR] : 0.0001044            
                                               
                  Kappa : 0.4278               
                                               
 Mcnemar's Test P-Value : < 0.00000000000000022
                                               
            Sensitivity : 0.3153               
            Specificity : 0.9959               
         Pos Pred Value : 0.7241               
         Neg Pred Value : 0.9769               
             Prevalence : 0.0333               
         Detection Rate : 0.0105               
   Detection Prevalence : 0.0145               
      Balanced Accuracy : 0.6556               
                                               
       'Positive' Class : Yes

NAMBARI DAY ONE

D.K.MURIITHI

2023-07-05

fit logistic regression for different variables

looking at classification based on p.hat = .5 cutoff

10-fold CV, repeated 5 times

Train the model