# importing the german credit data frame from local drive
library(readxl)
my_germ <- read_excel("C:/Users/Jinina Rei Garcia/OneDrive/Documents/HULT/STUDIES/Introduction to R/german credit card.xlsx")
View(my_germ)
massaging the data
### nested function (which), we got descriptive statistics the "good" customers
### check if its numeric, and we will have fine NA's, since they are empty now
summary(my_germ[which(my_germ$good_bad=="good") , ])
summary(my_germ[which(my_germ$good_bad=="bad") , ])
#fixing the purpose and massaging the good_bad
my_germ$purpose_fixed <- as.numeric(gsub("X", "", my_germ$purpose))
### create a new variable called binary, define "good" with 1
my_germ$binary <- gsub("good", "1", my_germ$good_bad)
### now create the "bad" variable, USING THE RESULT FROM PREVIOUS, my_germ$binary
my_germ$binary <- gsub("bad", "0", my_germ$binary)
my_germ$binary <- as.numeric(my_germ$binary)
for loops and if statements
my_germ <- as.data.frame(my_germ)
#bringing back to a data frame
for (i in 1:3) {
print(min(my_germ[,i], na.rm = TRUE))
print(mean(my_germ[,i], na.rm = TRUE))
print(max(my_germ[,i], na.rm = TRUE))
}#closing the i loop
## [1] 1
## [1] 2.577
## [1] 4
## [1] 4
## [1] 20.903
## [1] 72
## [1] 0
## [1] 2.545
## [1] 4
#creating a custom variable:
my_germ$score <- c()
for (i in 1:1000) {
my_germ$score[i] <- 0.5*my_germ$duration[i] +
0.1*my_germ$amount[i] +
0.1*my_germ$age[i] +
0.3*my_germ$installp[i]
}#closing the i loop
#adding an if statement to analyze the score
my_germ$result <- c()
for (i in 1:1000)
{
if(my_germ$score[i] <500 && my_germ$binary[i] == 1)
{
my_germ$result[i] <- "outstanding"
}
else
{
my_germ$result[i] <- "not outstanding"
}
}
creating a UDF to standardize and normalize data
#UDF to convert data standardization
standard <- function(var1) {
my_standard <- ((var1 - mean(var1))/sd(var1))*10+50
return(my_standard)
} #closing the standard UDF
my_germ$amount_standard <- standard(var1=my_germ$amount)
mean(my_germ$amount_standard)
sd(my_germ$amount_standard)
summary(my_germ$amount_standard)
my_germ$age_standard <- standard(var1=my_germ$age)
mean(my_germ$age_standard)
sd(my_germ$age_standard)
hist(my_germ$age_standard)
#normalizing our data with min max rescaling
normal <- function(var1) {
my_normal <- (var1-min(var1))/(max(var1)-min(var1))
return(my_normal)
} # closing the normal UDF
my_germ$checking_norm <- normal(my_germ$checking)
my_germ$amount_norm <- normal(my_germ$amount)
my_germ$duration_norm <- normal(my_germ$duration)
my_germ$age_norm <- normal(my_germ$age)
my_germ$savings_norm <- normal(my_germ$savings)
my_germ$installp_norm <- normal(my_germ$installp)
my_germ$existcr_norm <- normal(my_germ$existcr)
my_germ$telephon_norm <- normal(my_germ$telephon)
#uniform distribution
uniform <- sample(c(1,2,3,4,5,6), replace=TRUE, size=50)
mean(uniform)
## [1] 3.78
hist(uniform)
binary <- sample(c(1,0), replace=TRUE, size=10000)
mean(binary)
## [1] 0.5083
hist(binary)
exponential <- rexp(1000, rate=1)
mean(exponential)
## [1] 1.02634
hist(exponential)
#creating a for loop to print histograms for all variables
for (i in 1:ncol(my_germ)){
try(hist(my_germ[,i]))
}
## Error in hist.default(my_germ[, i]) : 'x' must be numeric
## Error in hist.default(my_germ[, i]) : 'x' must be numeric
## Error in hist.default(my_germ[, i]) : 'x' must be numeric
#visualizing some of the variables session 6
library(ggplot2)
ggplot(data=my_germ, aes(x=purpose, y=age, color=good_bad)) +
geom_jitter()
ggplot(data=my_germ, aes(x=amount, y=property, color=good_bad)) +
geom_jitter()
my_chart <- ggplot(data=my_germ, aes(x=age, y=amount, color=good_bad)) +
geom_point()+
scale_color_manual(values=c("#1CCE1F", "#B81CEE"))
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
ggplotly(my_chart)
#moving into predictive statistics (machine learning)
index <- sample(1:nrow(my_germ), size=0.8*nrow(my_germ))
my_germ_train <- my_germ[index,]
my_germ_test <- my_germ[-index,]
##building predictive linear regression
my_linear <- lm(amount~age, data=my_germ_train)
summary(my_linear)
##
## Call:
## lm(formula = amount ~ age, data = my_germ_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3160.8 -1927.3 -960.4 702.6 15144.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2951.572 333.116 8.860 <2e-16 ***
## age 10.246 8.895 1.152 0.25
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2883 on 798 degrees of freedom
## Multiple R-squared: 0.00166, Adjusted R-squared: 0.0004088
## F-statistic: 1.327 on 1 and 798 DF, p-value: 0.2497
##we cannot compare the variables side by side because they have different units
##session 7 logistic regression modeling
my_logit <- glm(binary~checking+duration+age+telephon+amount+installp,
data=my_germ_train, family="binomial")
summary(my_logit)
##
## Call:
## glm(formula = binary ~ checking + duration + age + telephon +
## amount + installp, family = "binomial", data = my_germ_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3031 -0.9694 0.4932 0.8299 1.8574
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.185e-02 4.438e-01 -0.139 0.88916
## checking 6.548e-01 7.342e-02 8.919 < 2e-16 ***
## duration -2.732e-02 8.759e-03 -3.120 0.00181 **
## age 1.571e-02 7.899e-03 1.989 0.04666 *
## telephon 3.024e-01 1.876e-01 1.612 0.10704
## amount -8.134e-05 3.998e-05 -2.034 0.04192 *
## installp -2.448e-01 8.454e-02 -2.895 0.00379 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 982.41 on 799 degrees of freedom
## Residual deviance: 830.36 on 793 degrees of freedom
## AIC: 844.36
##
## Number of Fisher Scoring iterations: 4
##creating a better model
my_logit_better <- glm(binary~checking+duration+age+telephon+amount+installp,
data=my_germ_train, family="binomial")
summary(my_logit_better)
##
## Call:
## glm(formula = binary ~ checking + duration + age + telephon +
## amount + installp, family = "binomial", data = my_germ_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3031 -0.9694 0.4932 0.8299 1.8574
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.185e-02 4.438e-01 -0.139 0.88916
## checking 6.548e-01 7.342e-02 8.919 < 2e-16 ***
## duration -2.732e-02 8.759e-03 -3.120 0.00181 **
## age 1.571e-02 7.899e-03 1.989 0.04666 *
## telephon 3.024e-01 1.876e-01 1.612 0.10704
## amount -8.134e-05 3.998e-05 -2.034 0.04192 *
## installp -2.448e-01 8.454e-02 -2.895 0.00379 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 982.41 on 799 degrees of freedom
## Residual deviance: 830.36 on 793 degrees of freedom
## AIC: 844.36
##
## Number of Fisher Scoring iterations: 4
##creating a unitless model with normalized variables
my_logit_norm <- glm(binary~checking_norm+duration_norm+
age_norm+telephon_norm+amount_norm+
installp_norm,
data=my_germ_train, family="binomial")
summary(my_logit_norm)
##
## Call:
## glm(formula = binary ~ checking_norm + duration_norm + age_norm +
## telephon_norm + amount_norm + installp_norm, family = "binomial",
## data = my_germ_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3031 -0.9694 0.4932 0.8299 1.8574
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.8195 0.2639 3.106 0.00190 **
## checking_norm 1.9645 0.2203 8.919 < 2e-16 ***
## duration_norm -1.8580 0.5956 -3.120 0.00181 **
## age_norm 0.8800 0.4423 1.989 0.04666 *
## telephon_norm 0.3024 0.1876 1.612 0.10704
## amount_norm -1.4782 0.7266 -2.034 0.04192 *
## installp_norm -0.7343 0.2536 -2.895 0.00379 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 982.41 on 799 degrees of freedom
## Residual deviance: 830.36 on 793 degrees of freedom
## AIC: 844.36
##
## Number of Fisher Scoring iterations: 4
##in a unitless model, we can see that the amount has second negative impact
####session #8 how good is my model?
#install.packages("caret")
library(caret)
## Loading required package: lattice
my_prediction_testing <- predict(my_logit, my_germ_test, type = "response")
confusionMatrix(data=as.factor(as.numeric(my_prediction_testing>0.5)),
reference = as.factor(as.numeric(my_germ_test$binary)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 15 9
## 1 42 134
##
## Accuracy : 0.745
## 95% CI : (0.6787, 0.8039)
## No Information Rate : 0.715
## P-Value [Acc > NIR] : 0.1953
##
## Kappa : 0.2424
##
## Mcnemar's Test P-Value : 7.433e-06
##
## Sensitivity : 0.2632
## Specificity : 0.9371
## Pos Pred Value : 0.6250
## Neg Pred Value : 0.7614
## Prevalence : 0.2850
## Detection Rate : 0.0750
## Detection Prevalence : 0.1200
## Balanced Accuracy : 0.6001
##
## 'Positive' Class : 0
##
##building a confusion matrix for training
my_prediction_training <- predict(my_logit, my_germ_train, type = "response")
confusionMatrix(data=as.factor(as.numeric(my_prediction_training>0.5)),
reference = as.factor(as.numeric(my_germ_train$binary)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 106 59
## 1 137 498
##
## Accuracy : 0.755
## 95% CI : (0.7237, 0.7844)
## No Information Rate : 0.6962
## P-Value [Acc > NIR] : 0.0001352
##
## Kappa : 0.3631
##
## Mcnemar's Test P-Value : 3.798e-08
##
## Sensitivity : 0.4362
## Specificity : 0.8941
## Pos Pred Value : 0.6424
## Neg Pred Value : 0.7843
## Prevalence : 0.3038
## Detection Rate : 0.1325
## Detection Prevalence : 0.2062
## Balanced Accuracy : 0.6651
##
## 'Positive' Class : 0
##
##confusion matrix for the better logit
my_prediction_training <- predict(my_logit_better, my_germ_test, type = "response")
confusionMatrix(data=as.factor(as.numeric(my_prediction_testing>0.5)),
reference = as.factor(as.numeric(my_germ_test$binary)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 15 9
## 1 42 134
##
## Accuracy : 0.745
## 95% CI : (0.6787, 0.8039)
## No Information Rate : 0.715
## P-Value [Acc > NIR] : 0.1953
##
## Kappa : 0.2424
##
## Mcnemar's Test P-Value : 7.433e-06
##
## Sensitivity : 0.2632
## Specificity : 0.9371
## Pos Pred Value : 0.6250
## Neg Pred Value : 0.7614
## Prevalence : 0.2850
## Detection Rate : 0.0750
## Detection Prevalence : 0.1200
## Balanced Accuracy : 0.6001
##
## 'Positive' Class : 0
##
##AUC ROC
#install.packages("ROCR")
library(ROCR)
my_prediction <- my_prediction_training
pred_val_logit <- prediction(my_prediction, my_germ_test$binary)
perf_logit <- performance(pred_val_logit, "tpr", "fpr" )
plot(perf_logit)
##building a gini decision tree as a challenger model
library(rpart)
library(rpart.plot)
my_tree <- rpart(binary~checking+duration+age+telephon+amount+installp+coapp,
data = my_germ_train, method = "class", cp=0.015)
rpart.plot(my_tree, type=1, extra=1)
#building ROC AUC for the tree
my_tree_predict_test <- predict(my_tree, my_germ_test, type="prob")
my_tree_predict_train <- predict(my_tree, my_germ_train, type="prob")
##[,2] is for probability of success
my_tree_train_prediction <- prediction(my_tree_predict_train[,2],
my_germ_train$binary)
my_tree_performance <- performance(my_tree_train_prediction,"tpr", "fpr")
plot(my_tree_performance,col="black")
plot(perf_logit, col="green4", add=TRUE)