| Title: “Final_Project” |
| Author: “vskrelja” |
| Date: “July 26, 2015” |
| Output: html_document |
# I. Reading in data / creating data frames and subsets:
# Downloads the Lending Club loan-level CSV and derives the counts and subsets
# used by the charts and probability calculations further down:
#   numloans           - total number of loans
#   defaults           - number of loans whose loanstatus is "ChargedOff"
#   homeowner          - loans whose homeownership is "OWN" or "MORTGAGE"
#   homeownercount     - number of rows in `homeowner`
#   defaults_homeowner - number of charged-off loans among `homeowner`
#   defaults_df        - the charged-off loans themselves
#
# stringsAsFactors = TRUE is spelled out because this script plots loanstatus
# and state as categorical variables; since R 4.0 read.csv() would otherwise
# leave them as character vectors.
lendingclub_zipcodes <- read.csv(
  "https://raw.githubusercontent.com/vskrelja/Final/master/lendingclub_zipcodes.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
loans <- lendingclub_zipcodes
numloans <- nrow(loans)

# which() drops NA comparisons. Indexing rows with a logical vector that
# contains NA would add all-NA rows to the subset and inflate every count.
default_rows <- which(loans$loanstatus == "ChargedOff")
defaults_df <- loans[default_rows, ]
defaults <- length(default_rows)

# %in% never returns NA, so it can be used directly as a row mask.
homeowner <- loans[loans$homeownership %in% c("OWN", "MORTGAGE"), ]
homeownercount <- nrow(homeowner)
defaults_homeowner <- sum(homeowner$loanstatus == "ChargedOff", na.rm = TRUE)
# II. Transforming dataframe to add new column (Income Ratio vs. Defaults Chart 1):
# income_ratio = borrower's annual income / income per household for the
# borrower's zipcode.

# A zipcode income of 0 stands for a missing value; replace it with NA so the
# loan is excluded from the calculation instead of producing a division by zero.
loans$incomeperhousehold[which(loans$incomeperhousehold == 0)] <- NA
loans$income_ratio <- loans$annualinc / loans$incomeperhousehold
summary(loans$income_ratio)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0429 1.0560 1.5390 1.9020 2.2720 269.8000 464

# Ratios above this cutoff are treated as outliers and ignored.
max_income_ratio <- 5
loans$income_ratio[which(loans$income_ratio > max_income_ratio)] <- NA
summary(loans$income_ratio)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.043 1.040 1.505 1.715 2.178 5.000 7737

# as.factor() is a no-op when loanstatus is already a factor. When it is a
# character vector (the read.csv() default since R 4.0), plot() would fail;
# coercing makes it draw one box plot of income_ratio per loan status.
plot(
  as.factor(loans$loanstatus),
  loans$income_ratio,
  ylab = "Borrower Income / Avg. Income for Zipcode",
  main = "Chart 1 - Income Ratio vs. Defaults"
)

# III. Fico Score vs. Avg House Value by Zipcode for Defaults (Chart 2):
# A lastficorangelow of 0 marks a missing FICO score; flag it as NA so the
# loan is left out of the scatter plot.
is.na(defaults_df$lastficorangelow) <- defaults_df$lastficorangelow == 0
summary(defaults_df$lastficorangelow)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 500.0 530.0 555.0 565.5 590.0 840.0 725

# Scatter plot of charged-off loans: FICO on the x axis, average house value
# for the zipcode (scaled to thousands) on the y axis.
with(
  defaults_df,
  plot(
    lastficorangelow,
    averagehousevalue / 1000,
    xlab = "FICO",
    ylab = "Avg. House Value ('000s)",
    main = "Chart 2 - Default Drilldown: High Fico - Low Home Values"
  )
)

# IV. Defaults Distribution by State (Chart 3):
# Bar chart of the number of charged-off loans per state.
# as.factor() leaves an existing factor untouched. When state is a character
# vector (the read.csv() default since R 4.0), plot() cannot draw it; coercing
# makes plot() tabulate the states and draw one bar per state.
plot(
  as.factor(defaults_df$state),
  col = "blue",
  ylab = "Number of Defaults per State",
  main = "Chart 3 - Defaults by State"
)

# V. Conditional Probability Calculations:
# Every figure below is expressed as a percentage (0-100), not a 0-1 probability.

# Share of `whole` made up by `part`, as a percentage.
percent_of <- function(part, whole) {
  part / whole * 100
}

# P(default): charged-off loans out of all loans.
prob_defaults <- percent_of(defaults, numloans)
print(prob_defaults)
## [1] 2.329085

# P(homeowner): homeowner loans out of all loans.
prob_homeowner <- percent_of(homeownercount, numloans)
print(prob_homeowner)
## [1] 60.66486

# P(default | homeowner): charged-off homeowner loans out of homeowner loans.
prob_defaults_homeowner <- percent_of(defaults_homeowner, homeownercount)
print(prob_defaults_homeowner)
## [1] 2.044157