library(yarrr)
## Loading required package: jpeg
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(dplyr)
# Read the comma-separated credit data file into the `credit` data frame
# that every later question works on.
credit <- read.csv(file = "~/Dropbox/RSeminar/credit.txt")
Q1
How many rows and columns are in the dataframe?
What are the names of the columns in the dataframe?
# Q1, part 1: number of columns, then number of rows, of the data frame.
ncol(credit)
## [1] 17
nrow(credit)
## [1] 1000
# Q1, part 2: the column names (the question asks for them explicitly).
names(credit)
Q2
The column credit_history is a character value indicating how good the credit history of the customer was.
What are the different values of the credit_history variable and how often did each occur?
What is the mean loan amount (column is called amount) for each level of credit_history?
What is the median age for each level of credit_history?
# Q2: how often each credit_history value occurs.
table(credit$credit_history)
##
## critical good perfect poor very good
## 293 530 40 88 49
# Mean loan amount for each credit_history level.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
aggregate(amount ~ credit_history, data = credit, FUN = mean, na.rm = TRUE)
## credit_history amount
## 1 critical 3088.038
## 2 good 3040.958
## 3 perfect 5305.675
## 4 poor 4302.602
## 5 very good 3344.878
# Median age for each credit_history level.
aggregate(age ~ credit_history, data = credit, FUN = median, na.rm = TRUE)
## credit_history age
## 1 critical 36
## 2 good 31
## 3 perfect 32
## 4 poor 34
## 5 very good 34
Q3
What was the purpose of the highest loan amount? (Hint: Start by answering the question: What was the maximum loan amount for each loan purpose?)
What was the purpose of the smallest loan amount?
# Q3: largest loan amount within each purpose, then the purpose(s) whose
# per-purpose maximum equals the overall maximum.
cmax <- aggregate(amount ~ purpose, data = credit, FUN = max, na.rm = TRUE)
cmax$purpose[cmax$amount == max(cmax$amount)]
## [1] car
## Levels: business car education furniture/appliances renovations
# Same approach for the smallest loan: per-purpose minimum, then the
# purpose(s) whose minimum equals the overall minimum.
cmin <- aggregate(amount ~ purpose, data = credit, FUN = min, na.rm = TRUE)
cmin$purpose[cmin$amount == min(cmin$amount)]
## [1] car
## Levels: business car education furniture/appliances renovations
Q4
Does it look like there is a relationship between a person’s housing status and their age?
# Q4: mean age per housing status, shown as a bar plot.
hous <- aggregate(age ~ housing, data = credit, FUN = mean, na.rm = TRUE)
barplot(height = hous$age, names.arg = hous$housing)
# Box plot of the full age distribution per housing status; the group
# labels are taken from the aggregated table above.
vec <- hous$housing
boxplot(
  age ~ housing,
  data = credit,
  names = vec,
  ylab = "Age",
  xlab = "Housing",
  main = "Plot 1"
)
Q5
Create a new column called amount_gt1000, a binary variable that has TRUE if the loan amount is greater than 1000 and FALSE if the loan amount is less than or equal to 1000.
Were people who were unemployed more likely to have a loan amount greater than 1000 than people with an employment duration greater than 7 years? (Hint: Calculate the percentage for all employment_duration values).
Using logical indexing, tell me the exact percentage of unemployed people who had a loan amount greater than 1000. (Hint: Assign your previous result to a new object called agg.result. Then, use basic logical indexing on columns in this dataframe!)
# Q5: flag loans whose amount exceeds 1000.
credit$amount_gt1000 <- credit$amount > 1000
# The mean of a logical column is the share of TRUE values, so this gives
# the proportion of loans above 1000 for each employment duration.
# It is computed once, stored, and then printed, instead of running the
# same aggregation twice.
agg.result <- aggregate(
  amount_gt1000 ~ employment_duration,
  data = credit,
  FUN = mean,
  na.rm = TRUE
)
agg.result
## employment_duration amount_gt1000
## 1 < 1 year 0.8604651
## 2 > 7 years 0.8616601
## 3 1 - 4 years 0.9056047
## 4 4 - 7 years 0.8850575
## 5 unemployed 0.9193548
# Logical indexing: pick the proportion from the row for unemployed people.
agg.result$amount_gt1000[agg.result$employment_duration == "unemployed"]
## [1] 0.9193548
Q6
Did loans that defaulted have a higher mean loan amount than those that did not default? (Hint: Use the default column)
Did loans that defaulted come from younger people on average or older people?
# Q6: mean loan amount for defaulted versus non-defaulted loans.
aggregate(amount ~ default, data = credit, FUN = mean, na.rm = TRUE)
## default amount
## 1 no 2985.457
## 2 yes 3938.127
# Mean borrower age for defaulted versus non-defaulted loans.
aggregate(age ~ default, data = credit, FUN = mean, na.rm = TRUE)
## default age
## 1 no 36.22429
## 2 yes 33.96333
Q7
Create a new column called default.log, a logical variable indicating whether or not a loan defaulted.
Calculate the percent of loans that defaulted for every unique combination of both job type (job) and housing type (housing).
# Q7: logical flag that is TRUE where the default column equals "yes".
credit$default.log <- credit$default == "yes"
# The mean of the flag is the proportion of defaulted loans, computed here
# for every job x housing combination present in the data.
aggregate(default.log ~ job + housing, data = credit, FUN = mean, na.rm = TRUE)
## job housing default.log
## 1 management other 0.3333333
## 2 skilled other 0.4285714
## 3 unemployed other 0.2500000
## 4 unskilled other 0.6250000
## 5 management own 0.3617021
## 6 skilled own 0.2433628
## 7 unemployed own 0.3076923
## 8 unskilled own 0.2467532
## 9 management rent 0.2857143
## 10 skilled rent 0.4260870
## 11 unemployed rent 0.4000000
## 12 unskilled rent 0.3421053
Q8
For each level of savings_balance, calculate the following summary statistics of the loan amount: mean, median, sd, and max. Make sure all the results are in a single data frame! (Hint: use dplyr!)
Repeat your previous analysis, but only for those borrowers who are older than 40. (Hint: use the filter() verb in dplyr)
# Q8, part 1: mean, median, sd and max of the loan amount for each
# savings_balance level, returned as a single data frame.
# na.rm = TRUE matches the aggregate() calls earlier in the file, so a
# missing amount cannot turn a whole group's statistics into NA.
# as.numeric() gives every summary column the same double type.
credit %>%
  group_by(savings_balance) %>%
  summarise(
    loan.mean = as.numeric(mean(amount, na.rm = TRUE)),
    loan.median = as.numeric(median(amount, na.rm = TRUE)),
    loan.sd = as.numeric(sd(amount, na.rm = TRUE)),
    loan.max = as.numeric(max(amount, na.rm = TRUE))
  )
## Source: local data frame [5 x 5]
##
## savings_balance loan.mean loan.median loan.sd loan.max
## 1 < 100 DM 3187.833 2238 2787.683 18424
## 2 > 1000 DM 2573.396 1587 2174.448 10961
## 3 100 - 500 DM 3384.039 2463 2874.944 14782
## 4 500 - 1000 DM 2572.111 2326 2208.532 12749
## 5 unknown 3906.410 2859 3127.706 14555
# Q8, part 2: the same per-savings_balance summary, restricted to
# borrowers older than 40.
# The standard-deviation column is named loan.sd to match the unfiltered
# summary, and na.rm = TRUE matches the aggregate() calls in the file.
credit %>%
  filter(age > 40) %>%
  group_by(savings_balance) %>%
  summarise(
    loan.mean = mean(amount, na.rm = TRUE),
    loan.median = median(amount, na.rm = TRUE),
    loan.sd = sd(amount, na.rm = TRUE),
    loan.max = max(amount, na.rm = TRUE)
  )
## Source: local data frame [5 x 5]
##
## savings_balance loan.mean loan.median loan.sd loan.max
## 1 < 100 DM 3455.994 2322 3084.411 15945
## 2 > 1000 DM 1928.267 1516 1256.809 4591
## 3 100 - 500 DM 3434.600 1439 4140.429 14782
## 4 500 - 1000 DM 1895.200 1287 2078.180 10127
## 5 unknown 4041.864 3342 2940.513 13756