# library(readr)
# default <- read.csv("C:/Users/dan/Desktop/Chpt 7/default.csv")
# #head(default)
library(readr)
# Load the raw data set from the author's local copy.
default <- read.csv(
  file = "C:/Users/dan/Desktop/a Visualization mod 2/Chpt 7/default.csv"
)
#View(default)
#default <-read.csv("../data/Default.csv")

# SEX: code 1 is labelled "Male", code 2 "Female".
default$SEX <- factor(
  default$SEX,
  levels = c(1, 2),
  labels = c("Male", "Female")
)

# EDUCATION: codes 0, 5 and 6 are folded into 4 ("Other") before the column
# becomes an ordered factor.
default$EDUCATION[default$EDUCATION %in% c(0, 5, 6)] <- 4
default$EDUCATION <- factor(
  default$EDUCATION,
  levels = c(1, 2, 3, 4),
  labels = c("Graduate", "University", "High school", "Other"),
  ordered = TRUE
)

# MARRIAGE: code 0 is folded into 3 ("Other").
default$MARRIAGE[default$MARRIAGE %in% 0] <- 3
default$MARRIAGE <- factor(
  default$MARRIAGE,
  levels = c(1, 2, 3),
  labels = c("Married", "Single", "Other")
)

# Outcome flag: 0 -> "No", 1 -> "Yes". The result is stored in a column
# named `default`.
default$default <- factor(
  default$default,
  levels = c(0, 1),
  labels = c("No", "Yes")
)

# NOTE(review): PAY_0 == 0 is recoded to 4, which is later labelled
# "Delay 4M"; the other columns in 7:12 keep 0 as "Pay Duly". Confirm this
# is intended.
default$PAY_0[default$PAY_0 %in% 0] <- 4

# Columns 7:12 (selected by position; presumably the repayment-status
# columns - verify against the CSV header): -2 and -1 are treated as 0,
# then every column becomes a factor with levels 0..9.
default[7:12] <- lapply(default[7:12], function(status) {
  status[status %in% c(-2, -1)] <- 0
  factor(
    status,
    levels = 0:9,
    labels = c("Pay Duly", paste0("Delay ", 1:9, "M"))
  )
})
#head(default, n=7)
#str(default)
Design a multivariate data visualisation, using any method and combination of variables, that provides insight into the likelihood of a customer defaulting on a loan.
# Coerce the analysis columns of `default` to numeric in place.
# Columns that are factors at this point (PAY_3, EDUCATION, default) become
# their integer level codes, NOT the original raw values:
#   EDUCATION: 1 = Graduate, 2 = University, 3 = High school, 4 = Other
#   default:   1 = "No", 2 = "Yes"
#   PAY_3:     1 = "Pay Duly", 2 = "Delay 1M", ..., 10 = "Delay 9M"
numeric_cols <- c(
  "PAY_3", "EDUCATION", "LIMIT_BAL", "BILL_AMT3",
  "default", "default.payment.next.month"
)
default[numeric_cols] <- lapply(default[numeric_cols], as.numeric)

# Narrow working copy with friendlier column names. Sex and Marriage are
# still factors here.
default_1 <- data.frame(
  Sex = default$SEX,
  Marriage = default$MARRIAGE,
  Educ = default$EDUCATION,
  Limit_Bal = default$LIMIT_BAL,
  Pay_3 = default$PAY_3,
  Bill_amt3 = default$BILL_AMT3,
  default = default$default,
  defaultPaymentNextMonth = default$default.payment.next.month
)
#default_1 #<- na.omit(default_1)
#str(default_1)
#View(default_1)
# library(corrr)
# correlate(default_1 [,2:7])
#boxplot(default_1) ## <<=== showed something between Limit Balance and repayment Amount maybe ??
#boxplot(default_1$Limit_Bal, default_1$Bill_amt3)
# Emit a form-feed character (clears the console in some front ends).
cat("\014")
library(GGally)
# Pairwise overview of the first five columns of `default` (selected by
# position).
ggpairs(data = default, columns = 1:5, axisLabels = "internal")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#str(default_1)
# Replace the Sex and Marriage factors with their numeric level codes.
#   Sex:      1 = Male,    2 = Female
#   Marriage: 1 = Married, 2 = Single, 3 = Other
#str(default_1)
default_1[c("Sex", "Marriage")] <- lapply(
  default_1[c("Sex", "Marriage")],
  function(codes) as.numeric(as.integer(codes))
)
#str(default_1)
#str(default_1)
################### this one
# Subset of customers examined in the rest of the analysis:
#   * default > 1  -> the default flag is "Yes" (`default` holds factor
#                     codes: 1 = "No", 2 = "Yes"; it is not a count)
#   * Educ <= 2    -> Graduate or University
#   * Limit_Bal above the threshold below (10000 was also tried)
#
# Plain base-R indexing is used so the step does not depend on dplyr being
# attached; without it `filter` resolves to stats::filter(). `which()` drops
# rows whose condition is NA, matching what dplyr::filter() does.
limit_bal_threshold <- 30000 #10000
keep_rows <- which(
  default_1$default > 1 &
    default_1$Educ <= 2 &
    default_1$Limit_Bal > limit_bal_threshold
)
MostLikely <- default_1[keep_rows, , drop = FALSE]
rownames(MostLikely) <- NULL
#str(MostLikely)
#MostLikely$Bill_amt3 <- sort(MostLikely$Bill_amt3) ##huh ??? a negative bill amount ?
# A previous `MostLikely$default <- sort(MostLikely$default)` was removed:
# sorting one column on its own reorders it independently of the other
# columns in each row. Use MostLikely[order(MostLikely$default), ] if a
# sorted table is needed.
#sum(MostLikely$default)
#head(MostLikely, n=5)
#View(MostLikely)
#nrow(MostLikely)
#library(GGally)
#ggpairs(MostLikely, columns = 1:5,axisLabels = "internal")
#nrow(MostLikely)
# Share of the MostLikely subset that is female / single, as whole
# percentages of all rows in the subset.
#   Sex:      1 = Male,    2 = Female
#   Marriage: 1 = Married, 2 = Single
# The values are computed from the data instead of being typed in by hand,
# so they stay correct when the filter threshold changes. The previously
# hard-coded numerators (2521 and 2261 of 5223) did not match the recorded
# summary() output: Sex == 1 was TRUE for 2225 rows, Marriage == 1 for 2397.
#summary(MostLikely$Sex == "1")
#summary(MostLikely$Marriage =="1")
n_most_likely <- nrow(MostLikely)
percentage_SEX <- round(
  sum(MostLikely$Sex == 2, na.rm = TRUE) / n_most_likely * 100, 0
)
percentage_Marriage <- round(
  sum(MostLikely$Marriage == 2, na.rm = TRUE) / n_most_likely * 100, 0
)
#percentage_SEX
#percentage_Marriage
# Print the narrative summary of the subgroup percentages.
# NOTE(review): the wording says "default balance", but the subgroup is
# selected on Limit_Bal - confirm the intended phrasing.
cat(
  "For a default balance of greater than $30000 and a period of 3 months,\n\n"
)
## For a default balance of greater than $30000 and a period of 3 months,
cat(
  "This shows theres a ",
  percentage_SEX,
  "% chance of the defaulter being Female,\n\n",
  sep = " "
)
## This shows theres a 48 % chance of the defaulter being Female,
cat(
  "And a ",
  percentage_Marriage,
  "% chance of the defaulter being single.\n\n",
  sep = " "
)
## And a 43 % chance of the defaulter being single.
cat(
  "These stats are based only on the small subgroup set to 3 mths ONLY."
)
## These stats are based only on the small subgroup set to 3 mths ONLY.
# Bootstrapped 95% CI of the bill amount (Bill_amt3) at each credit-limit
# value (Limit_Bal), one panel per sex.
#
# Changes from the earlier version:
#   * aes() and the facet formula use bare column names instead of
#     MostLikely$col.
#   * The axis titles now match the mapping (x = limit, y = bill amount);
#     they were swapped.
#   * Panels are labelled through the facet variable instead of a padded
#     x-axis title.
#   * coord_cartesian() zooms the y axis without discarding data. ylim()
#     removed out-of-range rows before the summary was computed (the
#     earlier run warned that 861 rows had been removed), which changes
#     the plotted intervals.
p1 <- ggplot(MostLikely, aes(x = Limit_Bal, y = Bill_amt3, group = 1))
p1 + #geom_boxplot(outlier.shape = NA) +
  facet_wrap(
    ~factor(Sex, levels = c(1, 2), labels = c("Male", "Female")),
    scales = "free_x"
  ) +
  #scale_x_continuous(labels = comma)+
  #scale_y_continuous(labels = comma)+
  xlab("Balance Limit in $") +
  ylab("Bill Amount Owing") +
  coord_cartesian(ylim = c(-1500, 90000)) +
  stat_summary(
    fun.data = "mean_cl_boot", colour = "red",
    geom = "errorbar", width = 0.2
  )
#scale_x_continuous(labels = comma) ## <<== code works but won't knit with this line
# Jittered strip plot of the bill amount (Bill_amt3), split by sex.
# The y axis is the logical `Sex == 1`, so TRUE = Male and FALSE = Female
# (see the key in the title).
#
# Changes from the earlier version:
#   * library() replaces require(), so a missing package fails loudly.
#   * `outlier.shape` is dropped: geom_jitter() has no such parameter and
#     only warned about it.
#   * The x-axis title names the variable actually plotted (Bill_amt3).
#   * aes() uses bare column names instead of MostLikely$col.
p <- ggplot(MostLikely, aes(x = Bill_amt3, y = Sex == 1)) #, colour = Sex
## colour = Sex, givesthe right label
library(scales)
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
p + geom_jitter(width = 0.2) +
  ylab("") +
  xlab("Bill Amount Owing") +
  scale_x_continuous(labels = comma) +
  ggtitle("Density Plot (Using Jitter)\n Key : True = Male , False = Female")
#scale_color_gradient(low = "purple", high = "blue")
#+geom_boxplot(outlier.colour = "red")
## outlier.shape = NA <<====== TO NOT SHOW Outliers
MostLikely <- as.data.frame(MostLikely)
MostLikely <- na.omit(MostLikely) ## Just incase
#str(MostLikely)
# Bivariate Thinking
# Mean bill amount (red points) with bootstrapped 95% CI error bars at each
# credit-limit value. No box plot is drawn here.
#
# Changes from the earlier version:
#   * `fun` replaces the deprecated `fun.y` argument of stat_summary().
#   * `outlier.shape` is dropped: it is not a stat_summary()/errorbar
#     parameter and only produced an "unknown parameters" warning.
#   * aes() uses bare column names instead of MostLikely$col.
# `comma` comes from the scales package, which must already be attached.
p1 <- ggplot(
  data = MostLikely,
  aes(x = Limit_Bal, y = Bill_amt3, group = 1)
)
p1 +
  stat_summary(fun = "mean", geom = "point", colour = "red") +
  stat_summary(
    fun.data = "mean_cl_boot", colour = "red",
    geom = "errorbar", width = 0.2
  ) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  xlab("Balance Limit in $") +
  ylab("Bill Amount Owing") +
  ggtitle("Bill Amount Increase with Increasing Balance Limit")
## Warning: Removed 12 rows containing missing values (geom_errorbar).
# geom_boxplot( outlier.shape = NA) +
# Density-scaled histogram of the numeric Sex code (1 = Male, 2 = Female)
# with a kernel density estimate drawn on top.
# aes() uses bare column names instead of MostLikely$col, and
# after_stat(density) replaces the deprecated ..density.. notation.
ggplot(MostLikely, aes(x = Sex, colour = Sex)) + #grupo
  geom_histogram(
    aes(y = after_stat(density), fill = Sex),
    position = "identity", binwidth = 1, alpha = 0.5
  ) +
  geom_density() +
  ggtitle("Density Plot\n Key : 1 = Male , 2 = Female")
I ran this code with varying limit amounts ($) for my subgroup. Interestingly, at limits greater than $10000 it was females who were single that defaulted more, whereas at limits greater than $30000 it was males who were married.