options(scipen=999)
setwd("C:\\Users\\user\\Desktop\\R_CODE_2023")
library(tidyverse)
library(MASS)
library(pROC)
library(rpart)
library(gtsummary)
library(rattle)
library(ISLR2)
library(ggplot2)
library(caret)
library(splines)
library(tidymodels)
IncomeData = read.csv("Income2.csv", header=TRUE)
head(IncomeData)
  X Education Seniority   Income
1 1  21.58621  113.1034 99.91717
2 2  18.27586  119.3103 92.57913
3 3  12.06897  100.6897 34.67873
4 4  17.03448  187.5862 78.70281
5 5  19.93103   20.0000 68.00992
6 6  18.27586   26.2069 71.50449
plot(IncomeData$Education, IncomeData$Income)
## Fit models and compare training MSE
mod.0 = lm(IncomeData$Income ~ IncomeData$Education)
summary(mod.0)
Call:
lm(formula = IncomeData$Income ~ IncomeData$Education)
Residuals:
Min 1Q Median 3Q Max
-19.568 -8.012 1.474 5.754 23.701
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -41.9166 9.7689 -4.291 0.000192 ***
IncomeData$Education 6.3872 0.5812 10.990 0.0000000000115 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 11.93 on 28 degrees of freedom
Multiple R-squared: 0.8118, Adjusted R-squared: 0.8051
F-statistic: 120.8 on 1 and 28 DF, p-value: 0.00000000001151
mse.0 = mean( (IncomeData$Income - predict(mod.0))^2 )
mse.0
[1] 132.7502
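# Side note (sketch, not from the original session): because mod.0 is fit on
# the full data, the training MSE above is simply the mean squared residual.
mean( resid(mod.0)^2 )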
# Plot of Education vs Income with the linear fit overlaid
plot(IncomeData$Education, IncomeData$Income)
lines( IncomeData$Education, predict(mod.0))
mod.1 = lm(IncomeData$Income ~ poly(IncomeData$Education, degree=3))
summary(mod.1)
Call:
lm(formula = IncomeData$Income ~ poly(IncomeData$Education, degree = 3))
Residuals:
Min 1Q Median 3Q Max
-22.1594 -6.5919 -0.6362 7.8437 16.3670
Coefficients:
                                         Estimate Std. Error t value             Pr(>|t|)
(Intercept)                                62.745      1.678  37.386 < 0.0000000000000002 ***
poly(IncomeData$Education, degree = 3)1   131.070      9.192  14.258   0.0000000000000839 ***
poly(IncomeData$Education, degree = 3)2     1.163      9.192   0.126                  0.9
poly(IncomeData$Education, degree = 3)3   -42.239      9.192  -4.595   0.0000979255001437 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 9.192 on 26 degrees of freedom
Multiple R-squared: 0.8962, Adjusted R-squared: 0.8842
F-statistic: 74.81 on 3 and 26 DF, p-value: 0.0000000000006476
mse.1 = mean( (IncomeData$Income - predict(mod.1))^2 )
mse.1
[1] 73.23476
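# Sketch (illustration only, not in the original session): training MSE keeps
# shrinking as the polynomial degree grows, so it cannot be used by itself to
# choose the degree.
sapply(1:5, function(d) {
  fit = lm(Income ~ poly(Education, degree = d), data = IncomeData)
  mean( resid(fit)^2 )
})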
mod.2 = lm(IncomeData$Income ~ bs(IncomeData$Education, degree=3))
summary(mod.2)
Call:
lm(formula = IncomeData$Income ~ bs(IncomeData$Education, degree = 3))
Residuals:
Min 1Q Median 3Q Max
-22.1594 -6.5919 -0.6362 7.8437 16.3670
Coefficients:
                                      Estimate Std. Error t value     Pr(>|t|)
(Intercept)                             36.240      5.292   6.848 0.0000002865 ***
bs(IncomeData$Education, degree = 3)1  -48.767     17.232  -2.830      0.00885 **
bs(IncomeData$Education, degree = 3)2   84.783     10.865   7.803 0.0000000281 ***
bs(IncomeData$Education, degree = 3)3   47.310      7.794   6.070 0.0000020564 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 9.192 on 26 degrees of freedom
Multiple R-squared: 0.8962, Adjusted R-squared: 0.8842
F-statistic: 74.81 on 3 and 26 DF, p-value: 0.0000000000006476
plot(IncomeData$Education, IncomeData$Income)
lines( IncomeData$Education[order(IncomeData$Education)], predict(mod.2)[order(IncomeData$Education)] )
mse.2 = mean( (IncomeData$Income - predict(mod.2))^2 )
mse.2
[1] 73.23476
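# Sketch (mod.2b and ed.grid are illustrative names, not from the original
# session): refitting with the data= interface lets predict() take a fine grid
# of Education values, which draws a smoother curve than joining the observed
# points.
mod.2b = lm(Income ~ bs(Education, degree = 3), data = IncomeData)
ed.grid = data.frame(Education = seq(min(IncomeData$Education),
                                     max(IncomeData$Education), length.out = 200))
plot(IncomeData$Education, IncomeData$Income)
lines(ed.grid$Education, predict(mod.2b, newdata = ed.grid))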
HeartData = read.csv("Heart.csv", header=TRUE)
# Default data comes from the ISLR2 package
View(Default)
y = as.numeric( Default$default=="Yes" )
mean(y)
[1] 0.0333
mod.0 = glm( y ~ Default$balance, family=binomial)
summary(mod.0)
Call:
glm(formula = y ~ Default$balance, family = binomial)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -10.6513306 0.3611574 -29.49 <0.0000000000000002 ***
Default$balance 0.0054989 0.0002204 24.95 <0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2920.6 on 9999 degrees of freedom
Residual deviance: 1596.5 on 9998 degrees of freedom
AIC: 1600.5
Number of Fisher Scoring iterations: 8
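# Sketch (not part of the original session): exponentiating the coefficients
# converts log-odds to odds ratios; each extra dollar of balance multiplies
# the odds of default by roughly exp(0.0055) ~ 1.0055.
exp(coef(mod.0))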
mod.1 = glm( y ~ as.factor(Default$student), family=binomial)
summary(mod.1)
Call:
glm(formula = y ~ as.factor(Default$student), family = binomial)
Coefficients:
                              Estimate Std. Error z value             Pr(>|z|)
(Intercept)                   -3.50413    0.07071  -49.55 < 0.0000000000000002 ***
as.factor(Default$student)Yes  0.40489    0.11502    3.52             0.000431 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2920.6 on 9999 degrees of freedom
Residual deviance: 2908.7 on 9998 degrees of freedom
AIC: 2912.7
Number of Fisher Scoring iterations: 6
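# Sketch (not part of the original session): plogis() converts the student-only
# fit back to probabilities, showing that students have the higher raw default
# rate even though the student coefficient turns negative once balance is
# included in the next model.
plogis(coef(mod.1)[1])                    # non-students
plogis(coef(mod.1)[1] + coef(mod.1)[2])   # students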
mod.2 = glm( y ~ as.factor(Default$student) + Default$balance, family=binomial)
summary(mod.2)
Call:
glm(formula = y ~ as.factor(Default$student) + Default$balance,
family = binomial)
Coefficients:
                                 Estimate Std. Error z value             Pr(>|z|)
(Intercept)                   -10.7494959  0.3691914 -29.116 < 0.0000000000000002 ***
as.factor(Default$student)Yes  -0.7148776  0.1475190  -4.846           0.00000126 ***
Default$balance                 0.0057381  0.0002318  24.750 < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2920.6 on 9999 degrees of freedom
Residual deviance: 1571.7 on 9997 degrees of freedom
AIC: 1577.7
Number of Fisher Scoring iterations: 8
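# Sketch (not part of the original session): a quick AIC comparison of the
# three logistic fits so far; the balance + student model has the lowest AIC.
AIC(mod.0, mod.1, mod.2)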
income = Default$income
mod.3 = glm( y ~ as.factor(Default$student) + Default$balance + income, family=binomial)
summary(mod.3)
Call:
glm(formula = y ~ as.factor(Default$student) + Default$balance +
income, family = binomial)
Coefficients:
                                   Estimate   Std. Error z value             Pr(>|z|)
(Intercept)                   -10.869045196  0.492255516 -22.080 < 0.0000000000000002 ***
as.factor(Default$student)Yes  -0.646775807  0.236252529  -2.738              0.00619 **
Default$balance                 0.005736505  0.000231895  24.738 < 0.0000000000000002 ***
income                          0.000003033  0.000008203   0.370              0.71152
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2920.6 on 9999 degrees of freedom
Residual deviance: 1571.5 on 9996 degrees of freedom
AIC: 1579.5
Number of Fisher Scoring iterations: 8
# Plot of the ROC curve with the AUC and best threshold printed on it
roc_curve = roc(Default$default ~ predict(mod.3, type="response"))
plot.roc(roc_curve, print.auc=T, print.thres=T)
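# Sketch (not part of the original session): the AUC and the Youden-optimal
# threshold can also be pulled from the pROC object directly rather than read
# off the plot.
auc(roc_curve)
coords(roc_curve, "best")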
train_model <- trainControl(method = "repeatedcv", number = 5, repeats = 10)
model <- train(
default ~ balance + income + student,
data = Default,
method = "glm",
family = "binomial",
trControl = train_model)
model
Generalized Linear Model
10000 samples
3 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 10 times)
Summary of sample sizes: 8001, 8001, 7999, 8000, 7999, 8000, ...
Resampling results:
Accuracy Kappa
0.9732499 0.4289372
# Confusion matrix for the fitted model on the training data
confusionMatrix(predict(model), Default$default, positive="Yes")
Confusion Matrix and Statistics
          Reference
Prediction   No  Yes
       No  9627  228
       Yes   40  105
Accuracy : 0.9732
95% CI : (0.9698, 0.9763)
No Information Rate : 0.9667
P-Value [Acc > NIR] : 0.0001044
Kappa : 0.4278
Mcnemar's Test P-Value : < 0.00000000000000022
Sensitivity : 0.3153
Specificity : 0.9959
Pos Pred Value : 0.7241
Neg Pred Value : 0.9769
Prevalence : 0.0333
Detection Rate : 0.0105
Detection Prevalence : 0.0145
Balanced Accuracy : 0.6556
'Positive' Class : Yes
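# Sketch (the 0.2 cutoff is an arbitrary illustration, not from the original
# session): with only ~3.3% of cases defaulting, the default 0.5 cutoff gives
# high accuracy but low sensitivity; lowering the cutoff trades specificity
# for sensitivity.
prob_yes = predict(model, type = "prob")[, "Yes"]
pred_0.2 = factor(ifelse(prob_yes > 0.2, "Yes", "No"), levels = c("No", "Yes"))
confusionMatrix(pred_0.2, Default$default, positive = "Yes")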