library(yarrr)
## Loading required package: jpeg
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(dplyr)
# Load the credit data set from a comma-separated text file into a data frame.
# NOTE(review): the path is user-specific (home-directory Dropbox folder);
# adjust it before running on another machine.
credit <- read.csv("~/Dropbox/RSeminar/credit.txt")

Q1

How many rows and columns are in the dataframe?

What are the names of the columns in the dataframe?

# Number of columns and rows in the data frame.
ncol(credit)
## [1] 17
nrow(credit)
## [1] 1000
# Column names (second part of the question).
names(credit)

Q2

The column credit_history is a character value indicating how good the credit history of the customer was.

What are the different values of the credit_history variable and how often did each occur?

What is the mean loan amount (column is called amount) for each level of credit_history?

What is the median age for each level of credit_history?

# Frequency of each credit_history value.
table(credit$credit_history)
## 
##  critical      good   perfect      poor very good 
##       293       530        40        88        49
# Mean loan amount per credit_history level.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
aggregate(amount ~ credit_history, data = credit, FUN = mean, na.rm = TRUE)
##   credit_history   amount
## 1       critical 3088.038
## 2           good 3040.958
## 3        perfect 5305.675
## 4           poor 4302.602
## 5      very good 3344.878
# Median age per credit_history level.
aggregate(age ~ credit_history, data = credit, FUN = median, na.rm = TRUE)
##   credit_history age
## 1       critical  36
## 2           good  31
## 3        perfect  32
## 4           poor  34
## 5      very good  34

Q3

What was the purpose of the highest loan amount? (Hint: Start by answering the question: What was the maximum loan amount for each loan purpose?)

What was the purpose of the smallest loan amount?

# Largest loan amount per purpose, then the purpose(s) holding the overall
# maximum. The == comparison keeps every purpose in case of a tie.
cmax <- aggregate(amount ~ purpose, data = credit, FUN = max, na.rm = TRUE)
cmax$purpose[cmax$amount == max(cmax$amount)]
## [1] car
## Levels: business car education furniture/appliances renovations
# Smallest loan amount per purpose, then the purpose(s) holding the overall
# minimum.
cmin <- aggregate(amount ~ purpose, data = credit, FUN = min, na.rm = TRUE)
cmin$purpose[cmin$amount == min(cmin$amount)]
## [1] car
## Levels: business car education furniture/appliances renovations

Q4

Does it look like there is a relationship between a person’s housing status and their age?

# Mean age for each housing status.
hous <- aggregate(age ~ housing, data = credit, FUN = mean, na.rm = TRUE)

# Bar plot of the mean ages, one bar per housing status, with labelled axes.
barplot(height = hous$age,
        names.arg = hous$housing,
        ylab = "Mean age",
        xlab = "Housing")

# Box plot of the full age distribution per housing status; the group labels
# are taken from the aggregated table above.
vec <- hous$housing
boxplot(age ~ housing,
        data = credit,
        names = vec,
        ylab = "Age",
        xlab = "Housing",
        main = "Plot 1")

Q5

Create a new column called amount_gt1000, a binary variable that is TRUE if the loan amount is greater than 1000 and FALSE if the loan amount is less than or equal to 1000.

Were people who were unemployed more likely to have a loan amount greater than 1000 than people with an employment duration greater than 7 years? (Hint: Calculate the percentage for all employment_duration values).

Using logical indexing, tell me the exact percentage of unemployed people who had a loan amount greater than 1000. (Hint: Assign your previous result to a new object called agg.result. Then, use basic logical indexing on columns in this dataframe!)

# Flag loans whose amount is strictly greater than 1000.
credit$amount_gt1000 <- credit$amount > 1000

# Proportion of flagged loans per employment duration. The mean of a logical
# vector is the share of TRUE values; multiply by 100 for a percentage.
# Computed once and stored, then printed, instead of running the same
# aggregation twice.
agg.result <- aggregate(amount_gt1000 ~ employment_duration,
                        data = credit, FUN = mean, na.rm = TRUE)
agg.result
##   employment_duration amount_gt1000
## 1            < 1 year     0.8604651
## 2           > 7 years     0.8616601
## 3         1 - 4 years     0.9056047
## 4         4 - 7 years     0.8850575
## 5          unemployed     0.9193548

# Logical indexing: pick the proportion for the "unemployed" row.
agg.result$amount_gt1000[agg.result$employment_duration == "unemployed"]
## [1] 0.9193548

Q6

Did loans that defaulted have a higher mean loan amount than those that did not default? (Hint: Use the default column)

Did loans that defaulted come from younger people on average or older people?

# Mean loan amount for loans that did not default vs. loans that did.
aggregate(amount ~ default, data = credit, FUN = mean, na.rm = TRUE)
##   default   amount
## 1      no 2985.457
## 2     yes 3938.127
# Mean borrower age for loans that did not default vs. loans that did.
aggregate(age ~ default, data = credit, FUN = mean, na.rm = TRUE)
##   default      age
## 1      no 36.22429
## 2     yes 33.96333

Q7

Create a new column called default.log, a logical variable indicating whether or not a loan defaulted.

Calculate the percent of loans that defaulted for every unique combination of both job type (job) and housing type (housing).

# Logical flag: TRUE when the default column equals "yes".
credit$default.log <- credit$default == "yes"

# Proportion of defaulted loans for every job x housing combination
# (the mean of a logical vector is the share of TRUE values).
aggregate(default.log ~ job + housing, data = credit, FUN = mean, na.rm = TRUE)
##           job housing default.log
## 1  management   other   0.3333333
## 2     skilled   other   0.4285714
## 3  unemployed   other   0.2500000
## 4   unskilled   other   0.6250000
## 5  management     own   0.3617021
## 6     skilled     own   0.2433628
## 7  unemployed     own   0.3076923
## 8   unskilled     own   0.2467532
## 9  management    rent   0.2857143
## 10    skilled    rent   0.4260870
## 11 unemployed    rent   0.4000000
## 12  unskilled    rent   0.3421053

Q8

For each level of savings_balance, calculate the following summary statistics of the loan amount: mean, median, sd, and max. Make sure all the results are in a single data frame! (Hint: use dplyr!)

Repeat your previous analysis, but only for those borrowers who are older than 40. (Hint: use the filter() verb in dplyr)

# Loan amount statistics (mean, median, sd, max) for each savings_balance
# level, returned as a single data frame with one row per level.
summarise(
  group_by(credit, savings_balance),
  loan.mean = as.numeric(mean(amount)),
  loan.median = as.numeric(median(amount)),
  loan.sd = as.numeric(sd(amount)),
  loan.max = as.numeric(max(amount))
)
## Source: local data frame [5 x 5]
## 
##   savings_balance loan.mean loan.median  loan.sd loan.max
## 1        < 100 DM  3187.833        2238 2787.683    18424
## 2       > 1000 DM  2573.396        1587 2174.448    10961
## 3    100 - 500 DM  3384.039        2463 2874.944    14782
## 4   500 - 1000 DM  2572.111        2326 2208.532    12749
## 5         unknown  3906.410        2859 3127.706    14555
# Same per-savings_balance loan summary, restricted to borrowers older
# than 40. Column names and the numeric coercion of median/max mirror the
# summary for all borrowers so the two tables can be compared directly.
credit %>%
  filter(age > 40) %>%
  group_by(savings_balance) %>%
  summarise(
    loan.mean = mean(amount),
    loan.median = as.numeric(median(amount)),
    loan.sd = sd(amount),
    loan.max = as.numeric(max(amount))
  )
## Source: local data frame [5 x 5]
## 
##   savings_balance loan.mean loan.median  loan.sd loan.max
## 1        < 100 DM  3455.994        2322 3084.411    15945
## 2       > 1000 DM  1928.267        1516 1256.809     4591
## 3    100 - 500 DM  3434.600        1439 4140.429    14782
## 4   500 - 1000 DM  1895.200        1287 2078.180    10127
## 5         unknown  4041.864        3342 2940.513    13756