DATA 606 Data Project Proposal

Data Preparation

# load data
library(ggplot2)
library(corrplot)
library(manifestoR)
library(readr)
library(dplyr)
# Read test.csv from the project's GitHub repository.
url.test <- "https://raw.githubusercontent.com/kglan/MSDS/main/DATA606/DATA%20606%20Data%20Project%20Proposal/test.csv"
testdata <- url.test %>%
  url() %>%
  read_csv()

# Read train.csv from the same repository.
url.train <- "https://raw.githubusercontent.com/kglan/MSDS/main/DATA606/DATA%20606%20Data%20Project%20Proposal/train.csv"
traindata <- url.train %>%
  url() %>%
  read_csv()

# Stack the training rows on top of the test rows in one data frame.
df <- bind_rows(traindata, testdata)

# Categorical variables: every character column in `df`, plus the response
# `bankruptcy` (stored as a numeric 0/1 column, so it is appended by name).
# Selecting by type instead of by position keeps the list correct if the
# column order of the input files changes.
cat_features <- df %>%
  select(where(is.character)) %>%
  colnames() %>%
  c("bankruptcy")
cat_features

# Numerical variables: every numeric column in `df` except the `id` column
# and the response `bankruptcy`. Selecting by type instead of by position
# keeps the list correct if the column order of the input files changes.
num_features <- df %>%
  select(where(is.numeric), -id, -bankruptcy) %>%
  colnames()
num_features

# Numeric columns used later for the correlation matrix. Reuse
# `num_features` so the column list is defined in exactly one place.
corrro <- df %>%
  select(all_of(num_features))

# Print the combined data set.
df

## # A tibble: 1,000 × 22
##       id   sum  term payment guarantees reason   credits other…¹ credi…² marit…³
##    <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>      <dbl> <chr>   <chr>   <chr>  
##  1     0  1169     6       4 none       televis…       2 none    critic… male s…
##  2     1  5951    48       2 none       televis…       1 none    existi… female…
##  3     2  2096    12       2 none       educati…       1 none    critic… male s…
##  4     3  7882    42       2 guarantor  furnitu…       1 none    existi… male s…
##  5     4  4870    24       3 none       new car        2 none    delay … male s…
##  6     5  9055    36       2 none       educati…       1 none    existi… male s…
##  7     6  2835    24       3 none       furnitu…       1 none    existi… male s…
##  8     7  6948    36       2 none       used car       1 none    existi… male s…
##  9     8  3059    12       2 none       televis…       1 none    existi… male d…
## 10     9  5234    30       4 none       new car        2 none    critic… male m…
## # … with 990 more rows, 12 more variables: age <dbl>, employment <chr>,
## #   qualification <chr>, immigrant <chr>, residence_since <dbl>,
## #   accommodation <chr>, estate <chr>, savings <chr>, dependents <dbl>,
## #   phone <chr>, status <chr>, bankruptcy <dbl>, and abbreviated variable names
## #   ¹other_credits, ²credit_report, ³marital_status

Research question

Using logistic regression, what is the likelihood that each individual in the test data set will experience a bankruptcy?

Cases

Each case represents an individual who has used American Express as their financial institution. There are 1,000 observations in the combined training and test data; the bankruptcy outcome is missing for 200 of them.

Data collection

The data was collected by American Express as part of its coding competition hosted on Kaggle.com. The data is used as a means of improving analytical skills.

Type of study

This is an observational study.

Data Source

https://www.kaggle.com/competitions/bankruptcy-risk-prediction/data?select=train.csv

Dependent Variable

The response variable is bankruptcy; it is a binary categorical variable (coded 0 or 1).

Independent Variable(s)

This dataset contains many explanatory variables, since many factors could contribute to bankruptcy. The magnitude of each factor's contribution will be explored in the analysis and the logistic regression model.

Relevant summary statistics

Provide summary statistics for each of the variables. Also include appropriate visualizations related to your research question (e.g. scatter plots, boxplots, etc.). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.

# Column-wise summary statistics for the combined data set.
df %>% summary()

##        id             sum             term         payment     
##  Min.   :  0.0   Min.   :  250   Min.   : 4.0   Min.   :1.000  
##  1st Qu.:249.8   1st Qu.: 1366   1st Qu.:12.0   1st Qu.:2.000  
##  Median :499.5   Median : 2320   Median :18.0   Median :3.000  
##  Mean   :499.5   Mean   : 3271   Mean   :20.9   Mean   :2.973  
##  3rd Qu.:749.2   3rd Qu.: 3972   3rd Qu.:24.0   3rd Qu.:4.000  
##  Max.   :999.0   Max.   :18424   Max.   :72.0   Max.   :4.000  
##                                                                
##   guarantees           reason             credits      other_credits     
##  Length:1000        Length:1000        Min.   :1.000   Length:1000       
##  Class :character   Class :character   1st Qu.:1.000   Class :character  
##  Mode  :character   Mode  :character   Median :1.000   Mode  :character  
##                                        Mean   :1.407                     
##                                        3rd Qu.:2.000                     
##                                        Max.   :4.000                     
##                                                                          
##  credit_report      marital_status          age         employment       
##  Length:1000        Length:1000        Min.   :19.00   Length:1000       
##  Class :character   Class :character   1st Qu.:27.00   Class :character  
##  Mode  :character   Mode  :character   Median :33.00   Mode  :character  
##                                        Mean   :35.55                     
##                                        3rd Qu.:42.00                     
##                                        Max.   :75.00                     
##                                                                          
##  qualification       immigrant         residence_since accommodation     
##  Length:1000        Length:1000        Min.   :1.000   Length:1000       
##  Class :character   Class :character   1st Qu.:2.000   Class :character  
##  Mode  :character   Mode  :character   Median :3.000   Mode  :character  
##                                        Mean   :2.845                     
##                                        3rd Qu.:4.000                     
##                                        Max.   :4.000                     
##                                                                          
##     estate            savings            dependents       phone          
##  Length:1000        Length:1000        Min.   :1.000   Length:1000       
##  Class :character   Class :character   1st Qu.:1.000   Class :character  
##  Mode  :character   Mode  :character   Median :1.000   Mode  :character  
##                                        Mean   :1.155                     
##                                        3rd Qu.:1.000                     
##                                        Max.   :2.000                     
##                                                                          
##     status            bankruptcy    
##  Length:1000        Min.   :0.0000  
##  Class :character   1st Qu.:0.0000  
##  Mode  :character   Median :0.0000  
##                     Mean   :0.2988  
##                     3rd Qu.:1.0000  
##                     Max.   :1.0000  
##                     NA's   :200

# Histogram of the `sum` column. The bin count is set explicitly (30 is the
# value ggplot2 falls back to) so stat_bin() no longer emits its
# "Pick better value with `binwidth`" message.
ggplot(df, aes(x = sum)) +
  geom_histogram(bins = 30)

# Demonstrate correlation between numerical values
library(corrplot)

# Pearson correlation matrix of the numeric columns held in `corrro`.
Tracefile_cor <- cor(corrro, method = "pearson")

# Palette generator running from red through white to blue.
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

# Upper-triangle colour map, variables ordered by hierarchical clustering,
# with the coefficient printed in each cell. The stray trailing comma that
# passed an empty argument to corrplot() has been removed.
corrplot(
  Tracefile_cor,
  method = "color",
  col = col(200),
  type = "upper",
  order = "hclust",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45
)