Data Set Information
The German credit scoring data is a dataset provided by Prof. Hofmann in the file german.data. The data set has information about 1000 individuals, on the basis of which they have been classified as risky or not. The variable response in the dataset corresponds to the risk label: in the original file, 1 denotes a good credit record and 2 denotes a bad one. In our analysis, we subtract 1 from these labels so that they become 0 and 1, with 0 corresponding to a good credit record and 1 corresponding to a bad one (the positive class).
Libraries Used
library(knitr)
library(dplyr)
library(tidyr)
library(reshape2)
library(RColorBrewer)
library(GGally)
library(ggplot2)
Importing Data and Column Names
# Read the Statlog German credit file with read.table() defaults, then give
# the 21 columns explicit names (the last one is the risk label).
german_credit <- read.table(
  file = "http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
)
names(german_credit) <- c(
  "chk_acct",
  "duration",
  "credit_his",
  "purpose",
  "amount",
  "saving_acct",
  "present_emp",
  "installment_rate",
  "sex",
  "other_debtor",
  "present_resid",
  "property",
  "age",
  "other_install",
  "housing",
  "n_credits",
  "job",
  "n_people",
  "telephone",
  "foreign",
  "response"
)

# Shift the label down by one, then store it as a factor so the plotting code
# treats it as a discrete grouping variable.
german_credit[["response"]] <- german_credit[["response"]] - 1
german_credit[["response"]] <- as.factor(german_credit[["response"]])
Attributes There is a total of 21 attributes in the dataset (20 predictors plus the response). The descriptions of the predictors are listed below: 1. Status of existing checking account 2. Duration in months 3. Credit history 4. Purpose 5. Credit amount 6. Savings account/bonds 7. Present employment since 8. Installment rate in percentage of disposable income 9. Personal status and sex 10. Other debtors / guarantors 11. Present residence since 12. Property 13. Age in years 14. Other installment plans 15. Housing 16. Number of existing credits at this bank 17. Job 18. Number of people being liable to provide maintenance for 19. Telephone 20. Foreign worker
Summary Statistics
# Compute the per-column summary and render it as a table.
german_credit %>%
  summary() %>%
  kable()
We take the summary statistics of the dataset. The dataset has a total of 1000 observations with 21 variables, of which 8 are numerical variables (including the response, which is stored as a factor after recoding) and 13 are categorical variables with various levels. The summary statistics for the variables have been presented above.
Exploratory Data Analysis of Continuous Data
# Mean credit amount per response class (one row per class, mean in column m).
amount.mean <- german_credit %>%
  select(amount, response) %>%
  group_by(response) %>%
  summarise(m = mean(amount))

# Mean loan duration per response class (one row per class, mean in column m).
duration.mean <- german_credit %>%
  select(duration, response) %>%
  group_by(response) %>%
  summarise(m = mean(duration))

# Density of duration by response class, with a dashed vertical line at each
# class mean. The vline layer takes the per-class summary as its own data so
# that each mean is paired with the response level it was computed for;
# mapping the summary column against the full 1000-row data frame does not
# line up row-for-row and cannot place one line per class.
ggplot(german_credit, aes(duration, fill = response)) +
  geom_density(alpha = .5) +
  geom_vline(
    data = duration.mean,
    aes(xintercept = m, colour = response),
    linetype = "dashed",
    size = 1
  )
# Histogram of installment_rate filled by response class (default stacking).
ggplot(
  data = german_credit,
  mapping = aes(x = installment_rate, fill = response)
) +
  geom_histogram()

# Same histogram with the classes side by side and a log10-scaled count axis.
ggplot(
  data = german_credit,
  mapping = aes(x = installment_rate, fill = response)
) +
  geom_histogram(position = "dodge") +
  scale_y_log10() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Variables", y = "Log Value")
# Working copy holding six numeric predictors plus the response
# (columns 2, 5, 8, 13, 16, 18 and 21 of german_credit, selected by name).
test.m <- german_credit[, c(
  "duration",
  "amount",
  "installment_rate",
  "age",
  "n_credits",
  "n_people",
  "response"
)]

# NOTE(review): when response is a factor, as.numeric() returns the integer
# level codes rather than the original label values - confirm this is the
# intended encoding for whatever consumes test.m.
test.m[["response"]] <- as.numeric(test.m[["response"]])
# Boxplots of each continuous variable split by response class. Each call
# melts a two-column slice (the variable plus response) into long format so
# the variable name lands on the x axis and its values on the y axis.

# Loan duration
ggplot(
  data = melt(german_credit[, c("duration", "response")]),
  mapping = aes(x = variable, y = value, fill = response)
) +
  geom_boxplot() +
  labs(x = "duration")

# Credit amount
ggplot(
  data = melt(german_credit[, c("amount", "response")]),
  mapping = aes(x = variable, y = value, fill = response)
) +
  geom_boxplot() +
  labs(x = "amount")

# Installment rate
ggplot(
  data = melt(german_credit[, c("installment_rate", "response")]),
  mapping = aes(x = variable, y = value, fill = response)
) +
  geom_boxplot() +
  labs(x = "installment_rate")

# Age
ggplot(
  data = melt(german_credit[, c("age", "response")]),
  mapping = aes(x = variable, y = value, fill = response)
) +
  geom_boxplot() +
  labs(x = "age")

# Number of existing credits
ggplot(
  data = melt(german_credit[, c("n_credits", "response")]),
  mapping = aes(x = variable, y = value, fill = response)
) +
  geom_boxplot() +
  labs(x = "n_credits")
EDA of Categorical Data
# Dodged bar charts of the count of observations per level of each
# categorical variable, with one bar per response class.

# Checking account status
ggplot(
  data = german_credit,
  mapping = aes(x = chk_acct, y = ..count.., fill = response)
) +
  geom_bar(position = "dodge")

# Credit history
ggplot(
  data = german_credit,
  mapping = aes(x = credit_his, y = ..count.., fill = response)
) +
  geom_bar(position = "dodge")

# Purpose
ggplot(
  data = german_credit,
  mapping = aes(x = purpose, y = ..count.., fill = response)
) +
  geom_bar(position = "dodge")

# Savings account
ggplot(
  data = german_credit,
  mapping = aes(x = saving_acct, y = ..count.., fill = response)
) +
  geom_bar(position = "dodge")

# Other debtors
ggplot(
  data = german_credit,
  mapping = aes(x = other_debtor, y = ..count.., fill = response)
) +
  geom_bar(position = "dodge")

# Personal status and sex
ggplot(
  data = german_credit,
  mapping = aes(x = sex, y = ..count.., fill = response)
) +
  geom_bar(position = "dodge")

# Other installment plans
ggplot(
  data = german_credit,
  mapping = aes(x = other_install, y = ..count.., fill = response)
) +
  geom_bar(position = "dodge")

# Foreign worker
ggplot(
  data = german_credit,
  mapping = aes(x = foreign, y = ..count.., fill = response)
) +
  geom_bar(position = "dodge")