Q1

# Number of rows (customers) in the credit data
NROW(credit)
## [1] 1000
# Number of columns (variables)
NCOL(credit)
## [1] 17
# Column names
colnames(credit)
##  [1] "checking_balance"     "months_loan_duration" "credit_history"      
##  [4] "purpose"              "amount"               "savings_balance"     
##  [7] "employment_duration"  "percent_of_income"    "years_at_residence"  
## [10] "age"                  "other_credit"         "housing"             
## [13] "existing_loans_count" "job"                  "dependents"          
## [16] "phone"                "default"

Q2

The column credit_history is a character value indicating how good the credit history of the customer was.

What are the different values of the credit_history variable and how often did each occur?

# Count how many customers fall into each credit_history category.
cred.hist <- credit %>%
  group_by(credit_history) %>%
  summarise(numbers = n())
cred.hist
## Source: local data frame [5 x 2]
## 
##   credit_history numbers
##            (chr)   (int)
## 1       critical     293
## 2           good     530
## 3        perfect      40
## 4           poor      88
## 5      very good      49

Or alternatively:

# Base R frequency table of the same column.
table(credit[["credit_history"]])
## 
##  critical      good   perfect      poor very good 
##       293       530        40        88        49

What is the mean loan amount (column is called amount) for each level of credit_history?

# Per credit_history category: group size and mean loan amount.
cred.hist <- credit %>%
  group_by(credit_history) %>%
  summarise(numbers = n(), means = mean(amount))
cred.hist
## Source: local data frame [5 x 3]
## 
##   credit_history numbers    means
##            (chr)   (int)    (dbl)
## 1       critical     293 3088.038
## 2           good     530 3040.958
## 3        perfect      40 5305.675
## 4           poor      88 4302.602
## 5      very good      49 3344.878

What is the median age for each level of credit_history?

# Per credit_history category: group size, mean loan amount and median age.
cred.hist <- credit %>%
  group_by(credit_history) %>%
  summarise(numbers = n(), means = mean(amount), medians = median(age))
cred.hist
## Source: local data frame [5 x 4]
## 
##   credit_history numbers    means medians
##            (chr)   (int)    (dbl)   (dbl)
## 1       critical     293 3088.038      36
## 2           good     530 3040.958      31
## 3        perfect      40 5305.675      32
## 4           poor      88 4302.602      34
## 5      very good      49 3344.878      34

Q3

What was the purpose of the highest loan amount? (Hint: Start by answering the question: What was the maximum loan amount for each loan purpose?)

# Purpose recorded on the row(s) whose amount equals the overall maximum.
with(credit, purpose[amount == max(amount)])
## [1] "car"

What was the purpose of the smallest loan amount?

# Purpose recorded on the row(s) whose amount equals the overall minimum.
with(credit, purpose[amount == min(amount)])
## [1] "car"

Q4

Does it look like there is a relationship between a person’s housing status and their age?

# Mean age of customers within each housing status.
relat.house.age <- credit %>%
  group_by(housing) %>%
  summarise(mean.age = mean(age))
relat.house.age
## Source: local data frame [3 x 2]
## 
##   housing mean.age
##     (chr)    (dbl)
## 1   other 43.81481
## 2     own 35.59327
## 3    rent 30.36872

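The results indicate that there is probably a relationship: on average, renters are the youngest (about 30 years), owners are older (about 36 years), and people with "other" housing are the oldest (about 44 years).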

Q5

Create a new column called amount_gt1000, a binary variable that has TRUE if the loan amount is greater than 1000 and FALSE if the loan amount is less than or equal to 1000

# The question asks for a new *column*, so store the flag on `credit` itself:
# TRUE when the loan amount is greater than 1000, FALSE otherwise.
credit$amount_gt1000 <- credit$amount > 1000
# Keep the standalone vector as well, in case other code refers to it by name.
amount_gt1000 <- credit$amount_gt1000
head(amount_gt1000)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE

Were people who were unemployed more likely to have a loan amount greater than 1000 than people with an employment duration greater than 7 years? (Hint: Calculate the percentage for all employment_duration values).

# Share of loans above 1000 within each employment_duration category
# (the mean of a logical vector is the proportion of TRUE values).
employ.dur.amount <- credit %>%
  group_by(employment_duration) %>%
  summarise(ratio = mean(amount > 1000))
employ.dur.amount
## Source: local data frame [5 x 2]
## 
##   employment_duration     ratio
##                 (chr)     (dbl)
## 1         1 - 4 years 0.9056047
## 2            < 1 year 0.8604651
## 3         4 - 7 years 0.8850575
## 4           > 7 years 0.8616601
## 5          unemployed 0.9193548

The results show that unemployed people were more likely to have a loan amount greater than 1000 (about 92%) than people with an employment duration greater than 7 years (about 86%). Unfair world!

Using logical indexing, tell me the exact percentage of unemployed people who had a loan amount greater than 1000. (Hint: Assign your previous result to a new object called agg.result. Then, use basic logical indexing on columns in this dataframe!)

# Logical indexing: pick the ratio on the row where the category is "unemployed".
with(employ.dur.amount, ratio[employment_duration == "unemployed"])
## [1] 0.9193548

Q6

Did loans that defaulted have a higher mean loan amount than those that did not default? (Hint: Use the default column)

# Mean loan amount for defaulted ("yes") versus non-defaulted ("no") loans.
default.loans <- credit %>%
  group_by(default) %>%
  summarise(mean.loan = mean(amount))
default.loans
## Source: local data frame [2 x 2]
## 
##   default mean.loan
##     (chr)     (dbl)
## 1      no  2985.457
## 2     yes  3938.127

Did loans that defaulted come from younger people on average or older people?

# Default rate and number of customers at each individual age.
# NOTE(review): this is a per-age breakdown; comparing mean(age) between
# default == "yes" and default == "no" would answer the question more directly.
default.loans <- credit %>%
  group_by(age) %>%
  summarise(ratio = mean(default == "yes"), numbers = n())
default.loans
## Source: local data frame [53 x 3]
## 
##      age     ratio numbers
##    (int)     (dbl)   (int)
## 1     19 0.5000000       2
## 2     20 0.3571429      14
## 3     21 0.3571429      14
## 4     22 0.4074074      27
## 5     23 0.4166667      48
## 6     24 0.4318182      44
## 7     25 0.4634146      41
## 8     26 0.2800000      50
## 9     27 0.2549020      51
## 10    28 0.3488372      43
## ..   ...       ...     ...