Dataset

# Load the credit data set: comma-separated text with a header row.
# Text columns are kept as character vectors instead of factors.
# TRUE/FALSE are spelled out because T and F are ordinary variables that
# can be reassigned.
credit <- read.table(
  "http://nathanieldphillips.com/wp-content/uploads/2015/05/credit.txt",
  sep = ",",
  header = TRUE,
  stringsAsFactors = FALSE
)

Question 1

# Number of rows in the data frame
nrow(credit)

## [1] 1000

# Number of columns in the data frame
ncol(credit)

## [1] 17

# Names of the columns
names(credit)

##  [1] "checking_balance"     "months_loan_duration" "credit_history"      
##  [4] "purpose"              "amount"               "savings_balance"     
##  [7] "employment_duration"  "percent_of_income"    "years_at_residence"  
## [10] "age"                  "other_credit"         "housing"             
## [13] "existing_loans_count" "job"                  "dependents"          
## [16] "phone"                "default"

Question 2

# Number of loans in each credit_history category
table(credit$credit_history)

## 
##  critical      good   perfect      poor very good 
##       293       530        40        88        49

# Mean loan amount per credit_history category, one category at a time
mean(credit$amount[credit$credit_history == "critical"])

## [1] 3088.038

mean(credit$amount[credit$credit_history == "good"])

## [1] 3040.958

mean(credit$amount[credit$credit_history == "perfect"])

## [1] 5305.675

mean(credit$amount[credit$credit_history == "poor"])

## [1] 4302.602

mean(credit$amount[credit$credit_history == "very good"])

## [1] 3344.878

# Or, more simply: mean loan amount for every credit_history category in a
# single call. The formula is passed unnamed so the call works across R
# versions (the first argument of the formula method was renamed in R 4.2.0).
aggregate(amount ~ credit_history,
          data = credit,
          FUN = mean)

##   credit_history   amount
## 1       critical 3088.038
## 2           good 3040.958
## 3        perfect 5305.675
## 4           poor 4302.602
## 5      very good 3344.878

# Median age per credit_history category, one category at a time
median(credit$age[credit$credit_history == "critical"])

## [1] 36

median(credit$age[credit$credit_history == "good"])

## [1] 31

median(credit$age[credit$credit_history == "perfect"])

## [1] 32

median(credit$age[credit$credit_history == "poor"])

## [1] 34

median(credit$age[credit$credit_history == "very good"])

## [1] 34

# Or, more simply: median age for every credit_history category in a single
# call. The formula is passed unnamed so the call works across R versions
# (the first argument of the formula method was renamed in R 4.2.0).
aggregate(age ~ credit_history,
          data = credit,
          FUN = median)

##   credit_history age
## 1       critical  36
## 2           good  31
## 3        perfect  32
## 4           poor  34
## 5      very good  34

Question 3

# Largest loan amount within each purpose. tapply() covers every purpose
# present in the data, so no category has to be listed by hand (and none can
# be forgotten).
max.by.purpose <- tapply(credit$amount, credit$purpose, max)

# Overall largest loan amount
max.val <- max(max.by.purpose)

# Purpose(s) of the loan(s) with the largest amount
credit$purpose[credit$amount == max.val]

## [1] "car"

# Or: largest loan amount for every purpose in a single call.
# The formula is passed unnamed so the call works across R versions
# (the first argument of the formula method was renamed in R 4.2.0).
aggregate(amount ~ purpose,
          data = credit,
          FUN = max)

##                purpose amount
## 1             business  15945
## 2                  car  18424
## 3            education  12612
## 4 furniture/appliances  15653
## 5          renovations  11998

# Smallest loan amount within each purpose. tapply() covers every purpose
# present in the data, so no category has to be listed by hand (and none can
# be forgotten).
min.by.purpose <- tapply(credit$amount, credit$purpose, min)

# Overall smallest loan amount
min.val <- min(min.by.purpose)

# Purpose(s) of the loan(s) with the smallest amount
credit$purpose[credit$amount == min.val]

## [1] "car"

# Or: smallest loan amount for every purpose in a single call.
# The formula is passed unnamed so the call works across R versions
# (the first argument of the formula method was renamed in R 4.2.0).
aggregate(amount ~ purpose,
          data = credit,
          FUN = min)

##                purpose amount
## 1             business    609
## 2                  car    250
## 3            education    339
## 4 furniture/appliances    338
## 5          renovations    454

Question 4

# Mean age for each housing type.
# The formula is passed unnamed so the call works across R versions
# (the first argument of the formula method was renamed in R 4.2.0).
aggregate(age ~ housing,
          data = credit,
          FUN = mean)

##   housing      age
## 1   other 43.81481
## 2     own 35.59327
## 3    rent 30.36872

# Median age for each housing type.
# The formula is passed unnamed so the call works across R versions
# (the first argument of the formula method was renamed in R 4.2.0).
aggregate(age ~ housing,
          data = credit,
          FUN = median)

##   housing age
## 1   other  42
## 2     own  33
## 3    rent  26

# Box plot of the age distribution for each housing type
boxplot(age ~ housing, data = credit)

# It looks like there is a relationship between age and housing type

Question 5

# Flag the loans whose amount is greater than 1000 and store the flag as a
# new column.
logvec1 <- credit$amount > 1000
credit$amount_gt1000 <- logvec1

# The mean of a logical vector is the proportion of TRUE values, so this is
# the share of loans above 1000 for each employment duration.
# The formula is passed unnamed so the call works across R versions
# (the first argument of the formula method was renamed in R 4.2.0).
aggregate(amount_gt1000 ~ employment_duration,
          data = credit,
          FUN = mean)

##   employment_duration amount_gt1000
## 1            < 1 year     0.8604651
## 2           > 7 years     0.8616601
## 3         1 - 4 years     0.9056047
## 4         4 - 7 years     0.8850575
## 5          unemployed     0.9193548

# Yes: unemployed people were the most likely to have a loan amount greater than 1000 (about 92%)


# Store the share of loans above 1000 per employment duration, then pull out
# the value for the unemployed group.
# The formula is passed unnamed so the call works across R versions
# (the first argument of the formula method was renamed in R 4.2.0).
agg.result <- aggregate(amount_gt1000 ~ employment_duration,
                        data = credit,
                        FUN = mean)
mean(agg.result$amount_gt1000[agg.result$employment_duration == "unemployed"])

## [1] 0.9193548

Question 6

# Mean loan amount for loans that did and did not default.
# The formula is passed unnamed so the call works across R versions
# (the first argument of the formula method was renamed in R 4.2.0).
aggregate(amount ~ default,
          data = credit,
          FUN = mean)

##   default   amount
## 1      no 2985.457
## 2     yes 3938.127

# Loans that defaulted had a higher mean loan amount (3938 vs. 2985), so yes!

# Mean age for loans that did and did not default.
# The formula is passed unnamed so the call works across R versions
# (the first argument of the formula method was renamed in R 4.2.0).
aggregate(age ~ default,
          data = credit,
          FUN = mean)

##   default      age
## 1      no 36.22429
## 2     yes 33.96333

# Yes, loans that defaulted came from younger people on average (mean age 34.0 vs. 36.2)

Question 7

# Logical flag: TRUE where the loan defaulted
credit$default.log <- (credit$default == "yes")

# The mean of the flag is the default rate; compute it for every combination
# of job and housing type.
# The formula is passed unnamed so the call works across R versions
# (the first argument of the formula method was renamed in R 4.2.0).
aggregate(default.log ~ job + housing,
          data = credit,
          FUN = mean)

##           job housing default.log
## 1  management   other   0.3333333
## 2     skilled   other   0.4285714
## 3  unemployed   other   0.2500000
## 4   unskilled   other   0.6250000
## 5  management     own   0.3617021
## 6     skilled     own   0.2433628
## 7  unemployed     own   0.3076923
## 8   unskilled     own   0.2467532
## 9  management    rent   0.2857143
## 10    skilled    rent   0.4260870
## 11 unemployed    rent   0.4000000
## 12  unskilled    rent   0.3421053

Question 8

library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## Die folgenden Objekte sind maskiert von 'package:stats':
## 
##     filter, lag
## 
## Die folgenden Objekte sind maskiert von 'package:base':
## 
##     intersect, setdiff, setequal, union

# For each savings_balance group, summarise the loan amount by its mean,
# median, standard deviation and maximum.
credit.frame <- credit %>%
  group_by(savings_balance) %>%
  summarise(amount.mean = mean(amount),
            amount.median = median(amount),
            amount.sd = sd(amount),
            amount.max = max(amount))
# Print the summary table
credit.frame

## Source: local data frame [5 x 5]
## 
##   savings_balance amount.mean amount.median amount.sd amount.max
##             (chr)       (dbl)         (dbl)     (dbl)      (int)
## 1        < 100 DM    3187.833          2238  2787.683      18424
## 2       > 1000 DM    2573.396          1587  2174.448      10961
## 3    100 - 500 DM    3384.039          2463  2874.944      14782
## 4   500 - 1000 DM    2572.111          2326  2208.532      12749
## 5         unknown    3906.410          2859  3127.706      14555

# Same per-savings_balance summary of the loan amount, restricted to rows
# where age is greater than 40.
credit.frame2 <- credit %>%
  filter(age > 40) %>%
  group_by(savings_balance) %>%
  summarise(amount.mean = mean(amount),
            amount.median = median(amount),
            amount.sd = sd(amount),
            amount.max = max(amount))
# Print the summary table
credit.frame2

## Source: local data frame [5 x 5]
## 
##   savings_balance amount.mean amount.median amount.sd amount.max
##             (chr)       (dbl)         (dbl)     (dbl)      (int)
## 1        < 100 DM    3455.994          2322  3084.411      15945
## 2       > 1000 DM    1928.267          1516  1256.809       4591
## 3    100 - 500 DM    3434.600          1439  4140.429      14782
## 4   500 - 1000 DM    1895.200          1287  2078.180      10127
## 5         unknown    4041.864          3342  2940.513      13756

WPA_5

Lena Bareuther

Dezember 2015

Dataset

Question 1

Question 2

Question 3

Question 4

Question 5

Question 6

Question 7

Question 8