# load data
library(ggplot2)
library(corrplot)
library(manifestoR)
library(readr)
library(dplyr)
# Source CSVs for the two splits (raw files in the same GitHub folder).
url.test <- "https://raw.githubusercontent.com/kglan/MSDS/main/DATA606/DATA%20606%20Data%20Project%20Proposal/test.csv"
testdata <- read_csv(url(url.test))
url.train <- "https://raw.githubusercontent.com/kglan/MSDS/main/DATA606/DATA%20606%20Data%20Project%20Proposal/train.csv"
traindata <- read_csv(url(url.train))

# Stack the training rows on top of the test rows. bind_rows() matches columns
# by name and fills a column missing from one input with NA.
# NOTE(review): presumably the test split has no `bankruptcy` column, which
# would explain the NA values in the combined data - confirm.
df <- bind_rows(traindata, testdata)

# Categorical variables ----
# Columns are selected by name rather than by position so a change in the CSV
# column order cannot silently pick the wrong variables; all_of() raises an
# error if any listed column is missing. The response, "bankruptcy", is
# appended to the end of the list.
cat_features <- df %>%
  select(all_of(c(
    "guarantees", "reason", "other_credits", "credit_report",
    "marital_status", "employment", "qualification", "immigrant",
    "accommodation", "estate", "savings", "phone", "status"
  ))) %>%
  colnames() %>%
  append("bankruptcy")
cat_features

# Numerical variables ----
# Single source of truth for the numeric columns: the same vector names the
# features and builds the data used for the correlation matrix.
num_features <- c(
  "sum", "term", "payment", "credits", "age", "residence_since", "dependents"
)
num_features

# Numeric-only subset of df, used as the input to cor().
corrro <- df %>%
  select(all_of(num_features))

df
## # A tibble: 1,000 × 22
## id sum term payment guarantees reason credits other…¹ credi…² marit…³
## <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 0 1169 6 4 none televis… 2 none critic… male s…
## 2 1 5951 48 2 none televis… 1 none existi… female…
## 3 2 2096 12 2 none educati… 1 none critic… male s…
## 4 3 7882 42 2 guarantor furnitu… 1 none existi… male s…
## 5 4 4870 24 3 none new car 2 none delay … male s…
## 6 5 9055 36 2 none educati… 1 none existi… male s…
## 7 6 2835 24 3 none furnitu… 1 none existi… male s…
## 8 7 6948 36 2 none used car 1 none existi… male s…
## 9 8 3059 12 2 none televis… 1 none existi… male d…
## 10 9 5234 30 4 none new car 2 none critic… male m…
## # … with 990 more rows, 12 more variables: age <dbl>, employment <chr>,
## # qualification <chr>, immigrant <chr>, residence_since <dbl>,
## # accommodation <chr>, estate <chr>, savings <chr>, dependents <dbl>,
## # phone <chr>, status <chr>, bankruptcy <dbl>, and abbreviated variable names
## # ¹​other_credits, ²​credit_report, ³​marital_status
Using logistic regression, what is the likelihood that each individual in the test data set will have a bankruptcy?
Each case represents an individual who has used American Express as their financial institution. There are 1,000 observations in the combined training and test data.
The data were collected by American Express as part of a coding competition hosted on Kaggle.com. The data are used as a means of practicing and improving analytical skills.
This is an observational study.
The response variable is bankruptcy; it is a binary categorical variable.
There are many explanatory variables in this dataset, since many factors could contribute to a bankruptcy. The magnitude of each factor's contribution will be explored in the analysis and the logistic regression model.
Provide summary statistics for each of the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
# Per-column overview of the combined data: quartiles and mean for numeric
# columns, length/class/mode for character columns, plus NA counts.
df %>%
  summary()
## id sum term payment
## Min. : 0.0 Min. : 250 Min. : 4.0 Min. :1.000
## 1st Qu.:249.8 1st Qu.: 1366 1st Qu.:12.0 1st Qu.:2.000
## Median :499.5 Median : 2320 Median :18.0 Median :3.000
## Mean :499.5 Mean : 3271 Mean :20.9 Mean :2.973
## 3rd Qu.:749.2 3rd Qu.: 3972 3rd Qu.:24.0 3rd Qu.:4.000
## Max. :999.0 Max. :18424 Max. :72.0 Max. :4.000
##
## guarantees reason credits other_credits
## Length:1000 Length:1000 Min. :1.000 Length:1000
## Class :character Class :character 1st Qu.:1.000 Class :character
## Mode :character Mode :character Median :1.000 Mode :character
## Mean :1.407
## 3rd Qu.:2.000
## Max. :4.000
##
## credit_report marital_status age employment
## Length:1000 Length:1000 Min. :19.00 Length:1000
## Class :character Class :character 1st Qu.:27.00 Class :character
## Mode :character Mode :character Median :33.00 Mode :character
## Mean :35.55
## 3rd Qu.:42.00
## Max. :75.00
##
## qualification immigrant residence_since accommodation
## Length:1000 Length:1000 Min. :1.000 Length:1000
## Class :character Class :character 1st Qu.:2.000 Class :character
## Mode :character Mode :character Median :3.000 Mode :character
## Mean :2.845
## 3rd Qu.:4.000
## Max. :4.000
##
## estate savings dependents phone
## Length:1000 Length:1000 Min. :1.000 Length:1000
## Class :character Class :character 1st Qu.:1.000 Class :character
## Mode :character Mode :character Median :1.000 Mode :character
## Mean :1.155
## 3rd Qu.:1.000
## Max. :2.000
##
## status bankruptcy
## Length:1000 Min. :0.0000
## Class :character 1st Qu.:0.0000
## Mode :character Median :0.0000
## Mean :0.2988
## 3rd Qu.:1.0000
## Max. :1.0000
## NA's :200
# Histogram of the `sum` column, using geom_histogram()'s default binning.
ggplot(data = df) +
  geom_histogram(mapping = aes(x = sum))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Demonstrate correlation between the numerical variables.
library(corrplot)

# Pairwise Pearson correlation matrix of the columns in corrro.
Tracefile_cor <- cor(corrro, method = "pearson")

# Palette generator interpolating red -> white -> blue; col(200) below expands
# it to 200 colours.
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

# Colour-tile correlogram of the upper triangle, with variables reordered by
# hierarchical clustering, coefficients printed in black, and black axis
# labels rotated 45 degrees. The call ends without a trailing comma so that no
# empty argument is passed through corrplot()'s `...`.
corrplot(
  Tracefile_cor,
  method = "color", col = col(200),
  type = "upper", order = "hclust",
  addCoef.col = "black",
  tl.col = "black", tl.srt = 45
)