# importing the german credit data frame from local drive
library(readxl)
my_germ <- read_excel("C:/Users/Jinina Rei Garcia/OneDrive/Documents/HULT/STUDIES/Introduction to R/german credit card.xlsx")
View(my_germ)

massaging the data

### nested function (which), we got descriptive statistics the "good" customers
### check if its numeric, and we will have fine NA's, since they are empty  now
summary(my_germ[which(my_germ$good_bad=="good") , ])
summary(my_germ[which(my_germ$good_bad=="bad") , ])

#fixing the purpose and massaging the good_bad
my_germ$purpose_fixed <- as.numeric(gsub("X", "", my_germ$purpose))
### create a new variable called binary, define "good" with 1
my_germ$binary <- gsub("good", "1", my_germ$good_bad)
### now create the "bad" variable, USING THE RESULT FROM PREVIOUS, my_germ$binary
my_germ$binary <- gsub("bad", "0", my_germ$binary)
my_germ$binary <- as.numeric(my_germ$binary)

for loops and if statements

my_germ <- as.data.frame(my_germ) 
#bringing back to a data frame 
for (i in 1:3) {
  print(min(my_germ[,i], na.rm = TRUE))
  print(mean(my_germ[,i], na.rm = TRUE))
  print(max(my_germ[,i], na.rm = TRUE))
}#closing the i loop
## [1] 1
## [1] 2.577
## [1] 4
## [1] 4
## [1] 20.903
## [1] 72
## [1] 0
## [1] 2.545
## [1] 4
#creating a custom variable: 
my_germ$score <- c()

for (i in 1:1000) {
my_germ$score[i] <- 0.5*my_germ$duration[i] +
    0.1*my_germ$amount[i] +
      0.1*my_germ$age[i] +
        0.3*my_germ$installp[i]
  
}#closing the i loop 

#adding an if statement to analyze the score
my_germ$result <- c()

for (i in 1:1000)
{
  if(my_germ$score[i] <500 && my_germ$binary[i] == 1)
    {
      my_germ$result[i] <- "outstanding"
    }
  else
    {
      my_germ$result[i] <- "not outstanding"
    }
}

creating a UDF to standardize and normalize data

#UDF to convert data standardization 
standard <- function(var1) {
  my_standard <- ((var1 - mean(var1))/sd(var1))*10+50
  return(my_standard)               
} #closing the standard UDF

my_germ$amount_standard <- standard(var1=my_germ$amount) 
mean(my_germ$amount_standard)
sd(my_germ$amount_standard)
summary(my_germ$amount_standard)

my_germ$age_standard <- standard(var1=my_germ$age)
mean(my_germ$age_standard)
sd(my_germ$age_standard)

hist(my_germ$age_standard)

#normalizing our data with min max rescaling 

normal <- function(var1) {
  my_normal <- (var1-min(var1))/(max(var1)-min(var1))
  return(my_normal)
} # closing the normal UDF

my_germ$checking_norm <- normal(my_germ$checking)
my_germ$amount_norm <- normal(my_germ$amount)
my_germ$duration_norm <- normal(my_germ$duration)
my_germ$age_norm <- normal(my_germ$age)
my_germ$savings_norm <- normal(my_germ$savings)
my_germ$installp_norm <- normal(my_germ$installp)
my_germ$existcr_norm <- normal(my_germ$existcr)
my_germ$telephon_norm <- normal(my_germ$telephon)
#uniform distribution
uniform <- sample(c(1,2,3,4,5,6), replace=TRUE, size=50)
mean(uniform)
## [1] 3.78
hist(uniform)

binary <- sample(c(1,0), replace=TRUE, size=10000)
mean(binary)
## [1] 0.5083
hist(binary)

exponential <- rexp(1000, rate=1)
mean(exponential)
## [1] 1.02634
hist(exponential)

#creating a for loop to print histograms for all variables

for (i in 1:ncol(my_germ)){
  try(hist(my_germ[,i]))
}

## Error in hist.default(my_germ[, i]) : 'x' must be numeric

## Error in hist.default(my_germ[, i]) : 'x' must be numeric

## Error in hist.default(my_germ[, i]) : 'x' must be numeric

#visualizing some of the variables session 6 
library(ggplot2)

ggplot(data=my_germ, aes(x=purpose, y=age, color=good_bad)) + 
  geom_jitter() 

ggplot(data=my_germ, aes(x=amount, y=property, color=good_bad)) +
  geom_jitter()

my_chart <- ggplot(data=my_germ, aes(x=age, y=amount, color=good_bad)) +
  geom_point()+
  scale_color_manual(values=c("#1CCE1F", "#B81CEE"))

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
ggplotly(my_chart)
#moving into predictive statistics (machine learning)
index <- sample(1:nrow(my_germ), size=0.8*nrow(my_germ))
my_germ_train <- my_germ[index,]
my_germ_test <- my_germ[-index,]
##building predictive linear regression
my_linear <- lm(amount~age, data=my_germ_train)
summary(my_linear)
## 
## Call:
## lm(formula = amount ~ age, data = my_germ_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3160.8 -1927.3  -960.4   702.6 15144.6 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2951.572    333.116   8.860   <2e-16 ***
## age           10.246      8.895   1.152     0.25    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2883 on 798 degrees of freedom
## Multiple R-squared:  0.00166,    Adjusted R-squared:  0.0004088 
## F-statistic: 1.327 on 1 and 798 DF,  p-value: 0.2497
##we cannot compare the variables side by side because they have different units

##session 7 logistic regression modeling 

my_logit <- glm(binary~checking+duration+age+telephon+amount+installp,
                data=my_germ_train, family="binomial")
summary(my_logit)
## 
## Call:
## glm(formula = binary ~ checking + duration + age + telephon + 
##     amount + installp, family = "binomial", data = my_germ_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3031  -0.9694   0.4932   0.8299   1.8574  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.185e-02  4.438e-01  -0.139  0.88916    
## checking     6.548e-01  7.342e-02   8.919  < 2e-16 ***
## duration    -2.732e-02  8.759e-03  -3.120  0.00181 ** 
## age          1.571e-02  7.899e-03   1.989  0.04666 *  
## telephon     3.024e-01  1.876e-01   1.612  0.10704    
## amount      -8.134e-05  3.998e-05  -2.034  0.04192 *  
## installp    -2.448e-01  8.454e-02  -2.895  0.00379 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 982.41  on 799  degrees of freedom
## Residual deviance: 830.36  on 793  degrees of freedom
## AIC: 844.36
## 
## Number of Fisher Scoring iterations: 4
##creating a better model 
my_logit_better <- glm(binary~checking+duration+age+telephon+amount+installp,
                data=my_germ_train, family="binomial")
summary(my_logit_better)
## 
## Call:
## glm(formula = binary ~ checking + duration + age + telephon + 
##     amount + installp, family = "binomial", data = my_germ_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3031  -0.9694   0.4932   0.8299   1.8574  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.185e-02  4.438e-01  -0.139  0.88916    
## checking     6.548e-01  7.342e-02   8.919  < 2e-16 ***
## duration    -2.732e-02  8.759e-03  -3.120  0.00181 ** 
## age          1.571e-02  7.899e-03   1.989  0.04666 *  
## telephon     3.024e-01  1.876e-01   1.612  0.10704    
## amount      -8.134e-05  3.998e-05  -2.034  0.04192 *  
## installp    -2.448e-01  8.454e-02  -2.895  0.00379 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 982.41  on 799  degrees of freedom
## Residual deviance: 830.36  on 793  degrees of freedom
## AIC: 844.36
## 
## Number of Fisher Scoring iterations: 4
##creating a unitless model with normalized variables
my_logit_norm <- glm(binary~checking_norm+duration_norm+
                       age_norm+telephon_norm+amount_norm+
                       installp_norm,
                       data=my_germ_train, family="binomial")
summary(my_logit_norm)
## 
## Call:
## glm(formula = binary ~ checking_norm + duration_norm + age_norm + 
##     telephon_norm + amount_norm + installp_norm, family = "binomial", 
##     data = my_germ_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3031  -0.9694   0.4932   0.8299   1.8574  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     0.8195     0.2639   3.106  0.00190 ** 
## checking_norm   1.9645     0.2203   8.919  < 2e-16 ***
## duration_norm  -1.8580     0.5956  -3.120  0.00181 ** 
## age_norm        0.8800     0.4423   1.989  0.04666 *  
## telephon_norm   0.3024     0.1876   1.612  0.10704    
## amount_norm    -1.4782     0.7266  -2.034  0.04192 *  
## installp_norm  -0.7343     0.2536  -2.895  0.00379 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 982.41  on 799  degrees of freedom
## Residual deviance: 830.36  on 793  degrees of freedom
## AIC: 844.36
## 
## Number of Fisher Scoring iterations: 4

##in a unitless model, we can see that the amount has second negative impact

####session #8 how good is my model?
#install.packages("caret")
library(caret)
## Loading required package: lattice
my_prediction_testing <- predict(my_logit, my_germ_test, type = "response")

confusionMatrix(data=as.factor(as.numeric(my_prediction_testing>0.5)),
                reference = as.factor(as.numeric(my_germ_test$binary)))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  15   9
##          1  42 134
##                                           
##                Accuracy : 0.745           
##                  95% CI : (0.6787, 0.8039)
##     No Information Rate : 0.715           
##     P-Value [Acc > NIR] : 0.1953          
##                                           
##                   Kappa : 0.2424          
##                                           
##  Mcnemar's Test P-Value : 7.433e-06       
##                                           
##             Sensitivity : 0.2632          
##             Specificity : 0.9371          
##          Pos Pred Value : 0.6250          
##          Neg Pred Value : 0.7614          
##              Prevalence : 0.2850          
##          Detection Rate : 0.0750          
##    Detection Prevalence : 0.1200          
##       Balanced Accuracy : 0.6001          
##                                           
##        'Positive' Class : 0               
## 
##building a confusion matrix for training 
my_prediction_training <- predict(my_logit, my_germ_train, type = "response")

confusionMatrix(data=as.factor(as.numeric(my_prediction_training>0.5)),
                reference = as.factor(as.numeric(my_germ_train$binary)))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 106  59
##          1 137 498
##                                           
##                Accuracy : 0.755           
##                  95% CI : (0.7237, 0.7844)
##     No Information Rate : 0.6962          
##     P-Value [Acc > NIR] : 0.0001352       
##                                           
##                   Kappa : 0.3631          
##                                           
##  Mcnemar's Test P-Value : 3.798e-08       
##                                           
##             Sensitivity : 0.4362          
##             Specificity : 0.8941          
##          Pos Pred Value : 0.6424          
##          Neg Pred Value : 0.7843          
##              Prevalence : 0.3038          
##          Detection Rate : 0.1325          
##    Detection Prevalence : 0.2062          
##       Balanced Accuracy : 0.6651          
##                                           
##        'Positive' Class : 0               
## 
##confusion matrix for the better logit
my_prediction_training <- predict(my_logit_better, my_germ_test, type = "response")

confusionMatrix(data=as.factor(as.numeric(my_prediction_testing>0.5)),
                reference = as.factor(as.numeric(my_germ_test$binary)))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  15   9
##          1  42 134
##                                           
##                Accuracy : 0.745           
##                  95% CI : (0.6787, 0.8039)
##     No Information Rate : 0.715           
##     P-Value [Acc > NIR] : 0.1953          
##                                           
##                   Kappa : 0.2424          
##                                           
##  Mcnemar's Test P-Value : 7.433e-06       
##                                           
##             Sensitivity : 0.2632          
##             Specificity : 0.9371          
##          Pos Pred Value : 0.6250          
##          Neg Pred Value : 0.7614          
##              Prevalence : 0.2850          
##          Detection Rate : 0.0750          
##    Detection Prevalence : 0.1200          
##       Balanced Accuracy : 0.6001          
##                                           
##        'Positive' Class : 0               
## 
##AUC ROC 
#install.packages("ROCR")
library(ROCR)

my_prediction <- my_prediction_training

pred_val_logit <- prediction(my_prediction, my_germ_test$binary)

perf_logit <- performance(pred_val_logit, "tpr", "fpr" )

plot(perf_logit)

##building a gini decision tree as a challenger model
library(rpart)
library(rpart.plot)

my_tree <- rpart(binary~checking+duration+age+telephon+amount+installp+coapp,
                 data = my_germ_train, method = "class", cp=0.015)
rpart.plot(my_tree, type=1, extra=1)

#building ROC AUC for the tree
my_tree_predict_test <- predict(my_tree, my_germ_test, type="prob")

my_tree_predict_train <- predict(my_tree, my_germ_train, type="prob")
##[,2] is for probability of success
my_tree_train_prediction <- prediction(my_tree_predict_train[,2],
                                       my_germ_train$binary)
my_tree_performance <- performance(my_tree_train_prediction,"tpr", "fpr")

plot(my_tree_performance,col="black")
plot(perf_logit, col="green4", add=TRUE)