# Title: "Final_Project"
# Author: "vskrelja"
# Date: "July 26, 2015"
# Output: html_document

# I. Reading in data / creating data frames and subsets:
# Download the lendingclub_zipcodes CSV. read.csv() already returns a data
# frame, so no extra data.frame() wrapping is needed.
lendingclub_zipcodes <- read.csv(
  "https://raw.githubusercontent.com/vskrelja/Final/master/lendingclub_zipcodes.csv",
  header = TRUE,
  sep = ","
)
loans <- lendingclub_zipcodes
numloans <- nrow(loans)  # total number of loans

# Row subsetting with a logical vector that contains NA yields all-NA rows,
# which would inflate nrow() counts. which() keeps only the TRUE positions,
# and %in% never returns NA, so rows with a missing status or ownership
# value are excluded instead of being counted.
defaults_df <- loans[which(loans$loanstatus == "ChargedOff"), ]
defaults <- nrow(defaults_df)  # number of charged-off loans

# Borrowers who own their home outright or hold a mortgage.
homeowner <- loans[loans$homeownership %in% c("OWN", "MORTGAGE"), ]
homeownercount <- nrow(homeowner)

# Charged-off loans among homeowners (missing statuses are not counted).
defaults_homeowner <- sum(homeowner$loanstatus == "ChargedOff", na.rm = TRUE)
# II. Transforming dataframe to add new column (Income Ratio vs. Defaults Chart 1):
# A household income of 0 is treated as a missing value: replace it with NA so
# it is excluded from the calculation (and never used as a divisor).
loans$incomeperhousehold[loans$incomeperhousehold == 0] <- NA
# Borrower's annual income divided by incomeperhousehold; assigning to
# loans$income_ratio creates the column, so no NA placeholder step is needed.
loans$income_ratio <- loans$annualinc / loans$incomeperhousehold
summary(loans$income_ratio)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##   0.0429   1.0560   1.5390   1.9020   2.2720 269.8000      464
loans$income_ratio[loans$income_ratio > 5] <- NA  # Ignoring outliers > 5 in ratios
summary(loans$income_ratio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.043   1.040   1.505   1.715   2.178   5.000    7737
# plot(<factor>, <numeric>) draws one box plot per factor level. Since R 4.0
# read.csv() returns strings as character vectors, which plot() cannot use as
# an x axis, so convert explicitly; as.factor() leaves an existing factor
# unchanged.
plot(
  as.factor(loans$loanstatus),
  loans$income_ratio,
  ylab = "Borrower Income / Avg. Income for Zipcode",
  main = "Chart 1 - Income Ratio vs. Defaults"
)

# III. Fico Score vs. Avg House Value by Zipcode for Defaults (Chart 2):
# A FICO value of 0 stands for a missing score; blank it out with NA so the
# point is left off the scatter plot.
fico_low <- defaults_df$lastficorangelow
defaults_df$lastficorangelow <- replace(fico_low, fico_low == 0, NA)
summary(defaults_df$lastficorangelow)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   500.0   530.0   555.0   565.5   590.0   840.0     725
# Scatter plot of FICO (x) against average house value in thousands (y).
house_value_thousands <- defaults_df$averagehousevalue / 1000
plot(
  x = defaults_df$lastficorangelow,
  y = house_value_thousands,
  xlab = "FICO",
  ylab = "Avg. House Value ('000s)",
  main = "Chart 2 - Default Drilldown: High Fico - Low Home Values"
)

# IV. Defaults Distribution by State (Chart 3):
# plot(<factor>) draws a bar chart of counts per level. Since R 4.0 read.csv()
# returns strings as character vectors, for which plot() fails, so convert
# explicitly; as.factor() returns an existing factor unchanged (levels kept).
plot(
  as.factor(defaults_df$state),
  col = "blue",
  ylab = "Number of Defaults per State",
  main = "Chart 3 - Defaults by State"
)

# V. Conditional Probability Calculations:
# All three figures are expressed as percentages (ratio multiplied by 100).

# Share of all loans that defaulted.
prob_defaults <- (defaults / numloans) * 100
print(prob_defaults)
## [1] 2.329085

# Share of all loans whose borrower is a homeowner.
prob_homeowner <- (homeownercount / numloans) * 100
print(prob_homeowner)
## [1] 60.66486

# Share of homeowner loans that defaulted, i.e. default given homeownership.
prob_defaults_homeowner <- (defaults_homeowner / homeownercount) * 100
print(prob_defaults_homeowner)
## [1] 2.044157