# library(readr)
# default <- read.csv("C:/Users/dan/Desktop/Chpt 7/default.csv")
# #head(default)

# Load the raw data set into `default` from an absolute path on the author's machine.
# NOTE(review): readr is attached here, but the file is read with base read.csv().
library(readr)
default <- read.csv(
  file = "C:/Users/dan/Desktop/a Visualization mod 2/Chpt 7/default.csv"
)
# View(default)

Preparing data

# Recode the raw integer codes in `default` and turn them into labelled factors.
# default <- read.csv("../data/Default.csv")

# SEX: 1 -> "Male", 2 -> "Female".
default$SEX <- factor(default$SEX, levels = 1:2, labels = c("Male", "Female"))

# EDUCATION: fold the codes 0, 5 and 6 into 4 ("Other"), then build an
# ordered factor over the four remaining codes.
default$EDUCATION[default$EDUCATION %in% c(0, 5, 6)] <- 4
default$EDUCATION <- factor(
  default$EDUCATION,
  levels = 1:4,
  labels = c("Graduate", "University", "High school", "Other"),
  ordered = TRUE
)

# MARRIAGE: fold code 0 into 3 ("Other").
default$MARRIAGE[default$MARRIAGE == 0] <- 3
default$MARRIAGE <- factor(
  default$MARRIAGE,
  levels = 1:3,
  labels = c("Married", "Single", "Other")
)

# default: 0 -> "No", 1 -> "Yes".
default$default <- factor(default$default, levels = 0:1, labels = c("No", "Yes"))

# NOTE(review): PAY_0 == 0 is moved to code 4 (labelled "Delay 4M" below) before
# the -2/-1 codes are folded into 0; this is done for PAY_0 only. Confirm that
# this is intended.
default$PAY_0[default$PAY_0 == 0] <- 4

# Columns 7 to 12: fold the codes -2 and -1 into 0.
default[7:12][default[7:12] == -2 | default[7:12] == -1] <- 0

# Columns 7 to 12: label code 0 as "Pay Duly" and codes 1..9 as "Delay <n>M".
default[7:12] <- lapply(
  default[7:12],
  factor,
  levels = 0:9,
  labels = c("Pay Duly", paste0("Delay ", 1:9, "M"))
)

# head(default, n = 7)
# str(default)

Exercise

Design a multivariate data visualisation, using any method and combination of variables, that provides insight into the likelihood of a customer defaulting on a loan.

# Build `default_1`: the subset of variables chosen for the exercise.
# str(default) before this block: SEX = factor, PAY_3 = factor,
# EDUCATION = ordered factor, LIMIT_BAL = int, BILL_AMT3 = int, default = factor.

# Coerce the chosen columns to numeric. For the factor columns (PAY_3,
# EDUCATION, default) as.numeric() yields the 1-based level index rather than
# the original code, e.g. default "No"/"Yes" becomes 1/2.
default[["PAY_3"]] <- as.numeric(default[["PAY_3"]])
default[["EDUCATION"]] <- as.numeric(default[["EDUCATION"]])
default[["LIMIT_BAL"]] <- as.numeric(default[["LIMIT_BAL"]])
default[["BILL_AMT3"]] <- as.numeric(default[["BILL_AMT3"]])
default[["default"]] <- as.numeric(default[["default"]])
default[["default.payment.next.month"]] <-
  as.numeric(default[["default.payment.next.month"]])
# SEX and MARRIAGE are left as factors here.

# Assemble the working data frame with its final column names in one step.
default_1 <- data.frame(
  Sex = default[["SEX"]],
  Marriage = default[["MARRIAGE"]],
  Educ = default[["EDUCATION"]],
  Limit_Bal = default[["LIMIT_BAL"]],
  Pay_3 = default[["PAY_3"]],
  Bill_amt3 = default[["BILL_AMT3"]],
  default = default[["default"]],
  defaultPaymentNextMonth = default[["default.payment.next.month"]]
)

# Exploration left disabled by the author:
# default_1 <- na.omit(default_1)
# str(default_1)
# View(default_1)
# library(corrr)
# correlate(default_1[, 2:7])
# boxplot(default_1)   # hinted at a link between limit balance and repayment amount
# boxplot(default_1$Limit_Bal, default_1$Bill_amt3)

cat("\014") # form feed: clears the console

Use the “hit it with the big hammer” approach to hopefully gain some insight

# Pairwise plot matrix over the first five columns of `default`, with the
# axis labels drawn inside the diagonal panels.
library(GGally)
ggpairs(data = default, columns = seq_len(5), axisLabels = "internal")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Creating a subgroup for closer examination <<======= From this point on

1 = Male , 2 = Female

1 = Married , 2 = Single

# Replace the Sex and Marriage columns of default_1 by numeric codes, going
# through integer first and then to double, in a single expression per column.
# str(default_1)
default_1$Sex <- as.numeric(as.integer(default_1$Sex)) # 1 = Male, 2 = Female
default_1$Marriage <- as.numeric(as.integer(default_1$Marriage)) # 1 = Married, 2 = Single
# str(default_1)

Choosing a subgroup

################### Subgroup used for the rest of the analysis

# Keep the rows of default_1 that satisfy all three conditions:
#   * default > 1    -- `default` holds factor level indices at this point
#                       (1 = "No", 2 = "Yes"), so this selects customers who
#                       defaulted, not customers who defaulted more than once;
#   * Educ <= 2      -- education level index 1 ("Graduate") or 2 ("University");
#   * Limit_Bal > 30000 (10000 was also tried).
#
# Base subsetting is used so the step does not rely on dplyr being attached;
# which() drops rows whose condition is NA, as dplyr::filter() does.
MostLikely <- default_1[
  which(
    default_1$default > 1 &
      default_1$Educ <= 2 &
      default_1$Limit_Bal > 30000
  ),
]
rownames(MostLikely) <- NULL

# The previous `MostLikely$default <- sort(MostLikely$default)` was removed:
# sorting one column on its own detaches its values from the rest of each row,
# and sort() silently drops NAs, which would make the assignment fail.
# To order the subgroup, reorder whole rows instead, e.g.
# MostLikely <- MostLikely[order(MostLikely$Bill_amt3), ]

# str(MostLikely)
# sum(MostLikely$default)
# head(MostLikely, n = 5)
# View(MostLikely)
# nrow(MostLikely)

At this point I want more on correlation with the variables I’ve chosen

Commented out; essentially it pretty much matches the above visualization

#library(GGally)
#ggpairs(MostLikely, columns = 1:5,axisLabels = "internal")

New working from here

# Share of the subgroup that is female / single, in whole percent.
#
# Both figures are computed from MostLikely itself, so they follow whatever
# thresholds were used to build the subgroup. The earlier version divided
# hard-coded counts (2521 / 5223 and 2261 / 5223) that did not match the
# summary() output recorded below.
#
# nrow(MostLikely)
# summary(MostLikely$Sex == "1")
#    Mode   FALSE    TRUE
# logical    2998    2225
# summary(MostLikely$Marriage == "1")
#    Mode   FALSE    TRUE
# logical    2826    2397

## 1 = Male    , 2 = Female
percentage_SEX <- round(mean(MostLikely$Sex == 2, na.rm = TRUE) * 100, 0)
# percentage_SEX

## 1 = Married , 2 = Single
percentage_Marriage <- round(mean(MostLikely$Marriage == 2, na.rm = TRUE) * 100, 0)
# percentage_Marriage

cat("For a default balance of greater than $30000 and a period of 3 months,\n\n")
## For a default balance of greater than $30000 and a period of 3 months,
cat("This shows theres a ", percentage_SEX, "% chance of the defaulter being Female,\n\n")
cat("And a ", percentage_Marriage, "% chance of the defaulter being single.\n\n")
# (The echoed output of the two lines above is not recorded here because the
# numbers now depend on the subgroup.)
cat("These stats are based only on the small subgroup set to 3 mths ONLY.")
## These stats are based only on the small subgroup set to 3 mths ONLY.
# Bootstrap confidence limits of the mean bill amount (Bill_amt3) at each
# balance limit (Limit_Bal), one panel per sex.
#
# Changes from the earlier version:
#   * columns are referenced by bare name inside aes() and the facet formula
#     instead of through MostLikely$...;
#   * the axis labels now match the aesthetics (they were swapped: x is the
#     balance limit, y is the bill amount);
#   * the panels are named through a labeller instead of a space-padded x label;
#   * the comma formatter is namespace-qualified, so it no longer depends on
#     scales being attached before this point (scales is only attached further
#     down, which is the likely reason the unqualified call would not knit).
p1 <- ggplot(MostLikely, aes(x = Limit_Bal, y = Bill_amt3, group = 1))
p1 +
  facet_wrap(
    ~Sex,
    scales = "free",
    labeller = as_labeller(c("1" = "Male", "2" = "Female"))
  ) +
  scale_x_continuous(labels = scales::comma) +
  xlab("Balance Limit in $") +
  ylab("Bill Amount Owing") +
  ylim(-1500, 90000) + # rows with a bill amount outside this range are dropped
  stat_summary(
    fun.data = "mean_cl_boot",
    colour = "red",
    geom = "errorbar",
    width = .2
  )
## Warning: Removed 861 rows containing non-finite values (stat_summary).
## Warning: Removed 10 rows containing missing values (geom_errorbar).

These stats come ONLY from the small subset: month 3 only, and only for amounts greater than $30000

# Jittered strip plot of the bill amount (Bill_amt3), split by sex.
#
# Changes from the earlier version:
#   * library() replaces require(), so a missing package stops the script;
#   * columns are referenced by bare name inside aes();
#   * `outlier.shape` is dropped: geom_jitter() has no such parameter and only
#     warned about it;
#   * the x label now names the variable that is actually on the x axis.
library(scales) # provides comma() for the axis labels
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
##     col_factor

# y is TRUE for Sex code 1 (Male) and FALSE otherwise; see the key in the title.
p <- ggplot(MostLikely, aes(x = Bill_amt3, y = Sex == 1))
p +
  geom_jitter(width = 0.2) +
  ylab("") +
  xlab("Bill Amount Owing") +
  scale_x_continuous(labels = comma) +
  ggtitle("Density Plot (Using Jitter)\n                                 Key :  True  = Male  ,  False = Female")

# Ideas left disabled by the author:
# scale_color_gradient(low = "purple", high = "blue")
# + geom_boxplot(outlier.colour = "red")
# outlier.shape = NA hides outliers, but only for geom_boxplot()
# Make sure MostLikely is a plain data frame and drop any row containing an NA.
MostLikely <- na.omit(as.data.frame(MostLikely))
# str(MostLikely)

# Bivariate view: mean bill amount (red points) with bootstrap confidence
# limits (red error bars) at each balance limit.
#
# Changes from the earlier version:
#   * columns are referenced by bare name inside aes();
#   * `outlier.shape` is dropped: stat_summary() does not know it and only
#     warned about it.
# `fun.y` is kept as written; ggplot2 3.3.0 and later deprecate it in favour of
# `fun`, so switch once the installed version is known to be at least 3.3.0.
# comma() comes from the scales package attached earlier in the script.
p1 <- ggplot(data = MostLikely, aes(x = Limit_Bal, y = Bill_amt3, group = 1))
p1 +
  stat_summary(fun.y = "mean", geom = "point", colour = "red") +
  stat_summary(
    fun.data = "mean_cl_boot",
    colour = "red",
    geom = "errorbar",
    width = .2
  ) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  xlab("Balance Limit in $") +
  ylab("Bill Amount Owing") +
  ggtitle("Bill Amount Increase with Increasing Balance Limit")
## Warning: Removed 12 rows containing missing values (geom_errorbar).
# Histogram (bin width 1, scaled to density) of the Sex code with a kernel
# density curve drawn on top.
# Columns are referenced by bare name inside aes() rather than through
# MostLikely$Sex, so the mappings are looked up in the plot's own data.
# geom_boxplot(outlier.shape = NA) +   (disabled by the author)
ggplot(MostLikely, aes(x = Sex, colour = Sex)) +
  geom_histogram(
    aes(y = ..density.., fill = Sex),
    position = "identity",
    binwidth = 1,
    alpha = 0.5
  ) +
  geom_density() +
  ggtitle("Density Plot\n                                      Key :  1  = Male  ,  2 = Female")

Conclusion

I ran this code with varying limit amounts ($) in my subgroup. Interestingly, at greater than $10000 it was females who were single that defaulted more, and at greater than $30000 it is males who are married.