Dataset
# Load the credit data set: a comma-separated text file with a header row.
# Character columns are kept as strings instead of being converted to factors.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
credit <- read.table(
  "http://nathanieldphillips.com/wp-content/uploads/2015/05/credit.txt",
  sep = ",",
  header = TRUE,
  stringsAsFactors = FALSE
)
Question 1
# Number of rows (one per loan record) in the data frame
NROW(credit)
## [1] 1000
# Number of columns (variables)
NCOL(credit)
## [1] 17
# Names of all columns
colnames(credit)
## [1] "checking_balance" "months_loan_duration" "credit_history"
## [4] "purpose" "amount" "savings_balance"
## [7] "employment_duration" "percent_of_income" "years_at_residence"
## [10] "age" "other_credit" "housing"
## [13] "existing_loans_count" "job" "dependents"
## [16] "phone" "default"
Question 2
# Number of loans in each credit-history category
table(credit$credit_history)
##
## critical good perfect poor very good
## 293 530 40 88 49
# Mean loan amount for each credit-history category, one subset at a time
mean(credit$amount[credit$credit_history == "critical"])
## [1] 3088.038
mean(credit$amount[credit$credit_history == "good"])
## [1] 3040.958
mean(credit$amount[credit$credit_history == "perfect"])
## [1] 5305.675
mean(credit$amount[credit$credit_history == "poor"])
## [1] 4302.602
mean(credit$amount[credit$credit_history == "very good"])
## [1] 3344.878
# Or, more simply: all group means in a single call. The formula is passed
# positionally because the name of aggregate()'s first argument differs
# between R versions ('formula' before R 4.2.0, 'x' from 4.2.0 onwards).
aggregate(amount ~ credit_history,
          data = credit,
          FUN = mean)
## credit_history amount
## 1 critical 3088.038
## 2 good 3040.958
## 3 perfect 5305.675
## 4 poor 4302.602
## 5 very good 3344.878
# Median age for each credit-history category, one subset at a time
median(credit$age[credit$credit_history == "critical"])
## [1] 36
median(credit$age[credit$credit_history == "good"])
## [1] 31
median(credit$age[credit$credit_history == "perfect"])
## [1] 32
median(credit$age[credit$credit_history == "poor"])
## [1] 34
median(credit$age[credit$credit_history == "very good"])
## [1] 34
# Or, more simply: all group medians in a single call (formula passed
# positionally, see above)
aggregate(age ~ credit_history,
          data = credit,
          FUN = median)
## credit_history age
## 1 critical 36
## 2 good 31
## 3 perfect 32
## 4 poor 34
## 5 very good 34
Question 3
# Largest loan amount within each loan purpose
max.val1 <- max(credit$amount[credit$purpose == "business"])
max.val2 <- max(credit$amount[credit$purpose == "car"])
max.val3 <- max(credit$amount[credit$purpose == "education"])
max.val4 <- max(credit$amount[credit$purpose == "furniture/appliances"])
max.val5 <- max(credit$amount[credit$purpose == "renovations"])
# Largest of the per-purpose maxima, then the purpose of the loan(s) with
# exactly that amount
max.val <- max(max.val1, max.val2, max.val3, max.val4, max.val5)
credit$purpose[credit$amount == max.val]
## [1] "car"
# Or: maximum loan amount per purpose in a single call. The formula is passed
# positionally because the name of aggregate()'s first argument differs
# between R versions ('formula' before R 4.2.0, 'x' from 4.2.0 onwards).
aggregate(amount ~ purpose,
          data = credit,
          FUN = max)
## purpose amount
## 1 business 15945
## 2 car 18424
## 3 education 12612
## 4 furniture/appliances 15653
## 5 renovations 11998
# Smallest loan amount within each loan purpose
min.val1 <- min(credit$amount[credit$purpose == "business"])
min.val2 <- min(credit$amount[credit$purpose == "car"])
min.val3 <- min(credit$amount[credit$purpose == "education"])
min.val4 <- min(credit$amount[credit$purpose == "furniture/appliances"])
min.val5 <- min(credit$amount[credit$purpose == "renovations"])
# Smallest of the per-purpose minima, then the purpose of the loan(s) with
# exactly that amount
min.val <- min(min.val1, min.val2, min.val3, min.val4, min.val5)
credit$purpose[credit$amount == min.val]
## [1] "car"
# Or: minimum loan amount per purpose in a single call (formula passed
# positionally, see above)
aggregate(amount ~ purpose,
          data = credit,
          FUN = min)
## purpose amount
## 1 business 609
## 2 car 250
## 3 education 339
## 4 furniture/appliances 338
## 5 renovations 454
Question 4
# Mean age for each housing type. The formula is passed positionally because
# the name of aggregate()'s first argument differs between R versions
# ('formula' before R 4.2.0, 'x' from 4.2.0 onwards).
aggregate(age ~ housing,
          data = credit,
          FUN = mean)
## housing age
## 1 other 43.81481
## 2 own 35.59327
## 3 rent 30.36872
# Median age for each housing type
aggregate(age ~ housing,
          data = credit,
          FUN = median)
## housing age
## 1 other 42
## 2 own 33
## 3 rent 26
# Distribution of age within each housing type
boxplot(age ~ housing,
        data = credit)

# It looks like there is a relationship between age and housing type
Question 5
# Flag loans whose amount exceeds 1000 and store the flag as a new column
logvec1 <- credit$amount > 1000
credit$amount_gt1000 <- logvec1
# The mean of a logical vector is the proportion of TRUE values, so this gives
# the share of loans above 1000 for each employment duration. The formula is
# passed positionally because the name of aggregate()'s first argument differs
# between R versions ('formula' before R 4.2.0, 'x' from 4.2.0 onwards).
aggregate(amount_gt1000 ~ employment_duration,
          data = credit,
          FUN = mean)
## employment_duration amount_gt1000
## 1 < 1 year 0.8604651
## 2 > 7 years 0.8616601
## 3 1 - 4 years 0.9056047
## 4 4 - 7 years 0.8850575
## 5 unemployed 0.9193548
# Yes, people who were unemployed were more likely to have a loan amount
# greater than 1000
# Same table stored in a variable so the "unemployed" row can be extracted
agg.result <- aggregate(amount_gt1000 ~ employment_duration,
                        data = credit,
                        FUN = mean)
mean(agg.result$amount_gt1000[agg.result$employment_duration == "unemployed"])
## [1] 0.9193548
Question 6
# Mean loan amount for loans that did and did not default. The formula is
# passed positionally because the name of aggregate()'s first argument differs
# between R versions ('formula' before R 4.2.0, 'x' from 4.2.0 onwards).
aggregate(amount ~ default,
          data = credit,
          FUN = mean)
## default amount
## 1 no 2985.457
## 2 yes 3938.127
# Loans that defaulted had a HIGHER mean loan amount (3938.127) than loans
# that did not default (2985.457).
# NOTE(review): the earlier answer here ("lower loan amount, so No!") read the
# table backwards; re-check the final yes/no against the question wording.
# Mean age for loans that did and did not default
aggregate(age ~ default,
          data = credit,
          FUN = mean)
## default age
## 1 no 36.22429
## 2 yes 33.96333
# Yes, loans that defaulted came from younger people on average
Question 7
# Logical indicator: TRUE where the loan defaulted
credit$default.log <- (credit$default == "yes")
# The mean of the indicator is the default rate for every combination of job
# and housing type. The formula is passed positionally because the name of
# aggregate()'s first argument differs between R versions ('formula' before
# R 4.2.0, 'x' from 4.2.0 onwards).
aggregate(default.log ~ job + housing,
          data = credit,
          FUN = mean)
## job housing default.log
## 1 management other 0.3333333
## 2 skilled other 0.4285714
## 3 unemployed other 0.2500000
## 4 unskilled other 0.6250000
## 5 management own 0.3617021
## 6 skilled own 0.2433628
## 7 unemployed own 0.3076923
## 8 unskilled own 0.2467532
## 9 management rent 0.2857143
## 10 skilled rent 0.4260870
## 11 unemployed rent 0.4000000
## 12 unskilled rent 0.3421053
Question 8
library(dplyr)
##
## Attaching package: 'dplyr'
##
## Die folgenden Objekte sind maskiert von 'package:stats':
##
## filter, lag
##
## Die folgenden Objekte sind maskiert von 'package:base':
##
## intersect, setdiff, setequal, union
# Mean, median, standard deviation and maximum of the loan amount for each
# savings-balance category of the supplied rows.
summarise_amount_by_savings <- function(loans) {
  loans %>%
    group_by(savings_balance) %>%
    summarise(amount.mean = mean(amount),
              amount.median = median(amount),
              amount.sd = sd(amount),
              amount.max = max(amount))
}

# Summary over all rows
credit.frame <- summarise_amount_by_savings(credit)
credit.frame
## Source: local data frame [5 x 5]
##
## savings_balance amount.mean amount.median amount.sd amount.max
## (chr) (dbl) (dbl) (dbl) (int)
## 1 < 100 DM 3187.833 2238 2787.683 18424
## 2 > 1000 DM 2573.396 1587 2174.448 10961
## 3 100 - 500 DM 3384.039 2463 2874.944 14782
## 4 500 - 1000 DM 2572.111 2326 2208.532 12749
## 5 unknown 3906.410 2859 3127.706 14555
# Same summary restricted to rows with age > 40
credit.frame2 <- summarise_amount_by_savings(filter(credit, age > 40))
credit.frame2
## Source: local data frame [5 x 5]
##
## savings_balance amount.mean amount.median amount.sd amount.max
## (chr) (dbl) (dbl) (dbl) (int)
## 1 < 100 DM 3455.994 2322 3084.411 15945
## 2 > 1000 DM 1928.267 1516 1256.809 4591
## 3 100 - 500 DM 3434.600 1439 4140.429 14782
## 4 500 - 1000 DM 1895.200 1287 2078.180 10127
## 5 unknown 4041.864 3342 2940.513 13756