# Number of observations (rows) in the credit data.
dim(credit)[1L]
## [1] 1000
# Number of variables (columns) in the credit data.
dim(credit)[2L]
## [1] 17
# Column names of the credit data.
colnames(credit)
## [1] "checking_balance" "months_loan_duration" "credit_history"
## [4] "purpose" "amount" "savings_balance"
## [7] "employment_duration" "percent_of_income" "years_at_residence"
## [10] "age" "other_credit" "housing"
## [13] "existing_loans_count" "job" "dependents"
## [16] "phone" "default"
The column credit_history is a character value indicating how good the credit history of the customer was.
What are the different values of the credit_history variable and how often did each occur?
# Tally how many rows fall into each credit_history category.
cred.hist <- summarise(
  group_by(credit, credit_history),
  numbers = n()
)
cred.hist
## Source: local data frame [5 x 2]
##
## credit_history numbers
## (chr) (int)
## 1 critical 293
## 2 good 530
## 3 perfect 40
## 4 poor 88
## 5 very good 49
Or alternatively:
# Base-R frequency table of the credit_history values.
table(credit[["credit_history"]])
##
## critical good perfect poor very good
## 293 530 40 88 49
What is the mean loan amount (column is called amount) for each level of credit_history?
# Per credit_history category: row count and mean loan amount.
cred.hist <- summarise(
  group_by(credit, credit_history),
  numbers = n(),
  means = mean(amount)
)
cred.hist
## Source: local data frame [5 x 3]
##
## credit_history numbers means
## (chr) (int) (dbl)
## 1 critical 293 3088.038
## 2 good 530 3040.958
## 3 perfect 40 5305.675
## 4 poor 88 4302.602
## 5 very good 49 3344.878
What is the median age for each level of credit_history?
# Per credit_history category: row count, mean loan amount and median age.
cred.hist <- summarise(
  group_by(credit, credit_history),
  numbers = n(),
  means = mean(amount),
  medians = median(age)
)
cred.hist
## Source: local data frame [5 x 4]
##
## credit_history numbers means medians
## (chr) (int) (dbl) (dbl)
## 1 critical 293 3088.038 36
## 2 good 530 3040.958 31
## 3 perfect 40 5305.675 32
## 4 poor 88 4302.602 34
## 5 very good 49 3344.878 34
What was the purpose of the highest loan amount? (Hint: Start by answering the question: What was the maximum loan amount for each loan purpose?)
# Purpose(s) of the loan(s) whose amount equals the overall maximum.
with(credit, purpose[amount == max(amount)])
## [1] "car"
What was the purpose of the smallest loan amount?
# Purpose(s) of the loan(s) whose amount equals the overall minimum.
credit[["purpose"]][credit[["amount"]] == min(credit[["amount"]])]
## [1] "car"
Does it look like there is a relationship between a person’s housing status and their age?
# Mean age of customers within each housing status.
relat.house.age <- summarise(
  group_by(credit, housing),
  mean.age = mean(age)
)
relat.house.age
## Source: local data frame [3 x 2]
##
## housing mean.age
## (chr) (dbl)
## 1 other 43.81481
## 2 own 35.59327
## 3 rent 30.36872
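The results indicate that there is probably a relationship: renters are the youngest on average (about 30 years), owners are in between (about 36), and people with other housing arrangements are the oldest (about 44).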
Create a new column called amount_gt1000, a binary variable that has TRUE if the loan amount is greater than 1000 and FALSE if the loan amount is less than or equal to 1000
# The exercise asks for a new *column*, so store the indicator inside `credit`
# rather than only in a free-standing vector.
# TRUE when the loan amount is strictly greater than 1000, FALSE otherwise.
credit$amount_gt1000 <- credit$amount > 1000
# Keep the stand-alone vector as well so that any code relying on it still runs.
amount_gt1000 <- credit$amount_gt1000
head(amount_gt1000)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
Were people who were unemployed more likely to have a loan amount greater than 1000 than people with an employment duration greater than 7 years? (Hint: Calculate the percentage for all employment_duration values).
# Share of loans above 1000 within each employment_duration category
# (the mean of a logical vector is the proportion of TRUE values).
employ.dur.amount <- summarise(
  group_by(credit, employment_duration),
  ratio = mean(amount > 1000)
)
employ.dur.amount
## Source: local data frame [5 x 2]
##
## employment_duration ratio
## (chr) (dbl)
## 1 1 - 4 years 0.9056047
## 2 < 1 year 0.8604651
## 3 4 - 7 years 0.8850575
## 4 > 7 years 0.8616601
## 5 unemployed 0.9193548
The results show that people who were unemployed were somewhat more likely to have a loan amount greater than 1000 (about 91.9%) than people with an employment duration greater than 7 years (about 86.2%). Unfair world!
Using logical indexing, tell me the exact percentage of unemployed people who had a loan amount greater than 1000. (Hint: Assign your previous result to a new object called agg.result. Then, use basic logical indexing on columns in this dataframe!)
# Pick the ratio from the row whose employment_duration is "unemployed".
with(employ.dur.amount, ratio[employment_duration == "unemployed"])
## [1] 0.9193548
Did loans that defaulted have a higher mean loan amount than those that did not default? (Hint: Use the default column)
# Mean loan amount for each value of the default column.
default.loans <- summarise(
  group_by(credit, default),
  mean.loan = mean(amount)
)
default.loans
## Source: local data frame [2 x 2]
##
## default mean.loan
## (chr) (dbl)
## 1 no 2985.457
## 2 yes 3938.127
Did loans that defaulted come from younger people on average or older people?
# For every distinct age: share of loans with default == "yes" and the number
# of loans at that age. Note that this overwrites the earlier default.loans.
# NOTE(review): comparing mean(age) across the default groups would answer
# the "younger or older on average" question more directly.
default.loans <- summarise(
  group_by(credit, age),
  ratio = mean(default == "yes"),
  numbers = n()
)
default.loans
## Source: local data frame [53 x 3]
##
## age ratio numbers
## (int) (dbl) (int)
## 1 19 0.5000000 2
## 2 20 0.3571429 14
## 3 21 0.3571429 14
## 4 22 0.4074074 27
## 5 23 0.4166667 48
## 6 24 0.4318182 44
## 7 25 0.4634146 41
## 8 26 0.2800000 50
## 9 27 0.2549020 51
## 10 28 0.3488372 43
## .. ... ... ...