library(tidyr)
library(dplyr)
library(stringr)
library(ggplot2)

Dataset 1: Loan Data From Lending Tree

This analysis looks at the 2012-2013 loans data from Lending Tree.

lendingdata<-read.csv(filename,header=TRUE,stringsAsFactors=FALSE,skip=1)
#dataset can be located at
lendingdata$url<-NULL
lendingdata$desc<-NULL
#clean up zip code data
lendingdata$zip_code<-str_replace(lendingdata$zip_code,"xx","")
#clean interest rate data
lendingdata$int_rate<-as.numeric(str_replace(lendingdata$int_rate,"%",""))
summary(lendingdata$int_rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    6.00   11.14   14.09   14.28   17.27   26.06       2
lendingdata$revol_util<-as.numeric(str_replace(lendingdata$revol_util,"%",""))
summary(lendingdata$revol_util)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   42.50   60.50   58.39   76.60  140.40     127
# clean loan term
lendingdata$term<-as.numeric(str_replace(lendingdata$term,"months",""))
summary(lendingdata$term)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   36.00   36.00   36.00   41.65   36.00   60.00       2
# clean emp_length
lendingdata$emp_length<-str_trim(str_replace(lendingdata$emp_length,"years",""),"both")
lendingdata$emp_length<-str_replace(lendingdata$emp_length,"\\+","")
lendingdata$emp_length<-as.numeric(lendingdata$emp_length)
lendingdata<-mutate(lendingdata,emp_lencat=ifelse(emp_length<1,"<1",ifelse(((emp_length>=1)&(emp_length<3)),"1-3",ifelse(((emp_length>=3)&(emp_length<5)),"3-5",ifelse(((emp_length>=5)&(emp_length<=9)),"5-9",ifelse(emp_length>9,"10+",NA))))))
#process dates
lendingdata<-rename(lendingdata,cr_line=earliest_cr_line)
lendingdata$cr_line<-str_c("01-",lendingdata$cr_line)
lendingdata$cr_line<-as.Date(lendingdata$cr_line,format="%d-%b-%Y")

lendingdata$last_pymnt_d<-str_c("01-",lendingdata$last_pymnt_d)
lendingdata$last_pymnt_d<-as.Date(lendingdata$last_pymnt_d,format="%d-%b-%Y")
 
lendingdata$issue_d<-str_c("01-",lendingdata$issue_d)
lendingdata$issue_d<-as.Date(lendingdata$issue_d,format="%d-%b-%Y")

#credit history
lendingdata<-mutate(lendingdata,cr_hist=difftime(issue_d,cr_line,units=c("weeks")))
lendingdata$cr_hist<-lendingdata$cr_hist/52  #convert to years
lendingdata$last_pymnt_d<-format(lendingdata$last_pymnt_d,"%b-%Y")
lendingdata$issue_d<-format(lendingdata$issue_d,"%b-%Y") 
lendingdata$cr_line<-format(lendingdata$cr_line,"%b-%Y")
#convert to factors
lendingdata$application_type<-as.factor(lendingdata$application_type)
lendingdata$home_ownership<-as.factor(lendingdata$home_ownership)
lendingdata$verification_status<-as.factor(lendingdata$verification_status)
lendingdata$loan_status<-as.factor(lendingdata$loan_status)
#select variables of interest
lendingdata<-select(lendingdata,one_of(c("id","member_id","addr_state","loan_amnt","int_rate","term","installment","emp_length","home_ownership","annual_inc","cr_hist","issue_d","cr_line","last_pymnt_d","verification_status","loan_status","emp_length","emp_lencat")))
head(lendingdata)
##         id member_id addr_state loan_amnt int_rate term installment
## 1 10159611  12011228         MA     10000     9.67   36      321.13
## 2 10129477  11981093         NC     14000    12.85   36      470.71
## 3 10149342  12000897         MI     27050    10.99   36      885.46
## 4 10139658  11991209         NM     12000    13.53   36      407.40
## 5 10159548  12011167         FL     15000     8.90   36      476.30
## 6 10119623  11971241         CO     12000    11.99   36      398.52
##   emp_length home_ownership annual_inc        cr_hist  issue_d  cr_line
## 1          7       MORTGAGE     102000 24.25000 weeks Dec-2013 Oct-1989
## 2          4           RENT      88000 25.58791 weeks Dec-2013 Jun-1988
## 3         10            OWN      55000 27.26099 weeks Dec-2013 Oct-1986
## 4         10           RENT      40000 15.21978 weeks Dec-2013 Oct-1998
## 5          2       MORTGAGE      63000 15.80769 weeks Dec-2013 Mar-1998
## 6         10       MORTGAGE     130000 16.13736 weeks Dec-2013 Nov-1997
##   last_pymnt_d verification_status loan_status emp_lencat
## 1     Sep-2016        Not Verified     Current        5-9
## 2     Sep-2016        Not Verified     Current        3-5
## 3     Jul-2016            Verified  Fully Paid        10+
## 4     Sep-2015     Source Verified  Fully Paid        10+
## 5     Sep-2016        Not Verified     Current        1-3
## 6     Sep-2016     Source Verified     Current        10+

Various summaries and charts of the clean data are presented below:

#summary of loans by state
su_total<-lendingdata %>% 
  group_by(addr_state) %>% 
  summarize(total_loans=sum(loan_amnt/1000000))
su_total<-su_total[!(su_total$addr_state==""),]
ggplot(su_total,aes(x=addr_state,y=total_loans))+geom_bar(stat="identity")+ylab("Total Loans (in Millions)")+theme(axis.text.x=element_text(size=8,angle=90))+ xlab("US State")+ggtitle("Total Loans by State (in Millions)")

a<-subset(lendingdata,((addr_state=="NY")|(addr_state=="NJ")|(addr_state=="CT")),select=c(addr_state,home_ownership,int_rate))
y <- ggplot(a, aes(y=int_rate, x=home_ownership)) + geom_boxplot(aes(color=home_ownership))
y+facet_grid(~addr_state)+ylab("Interest Rate %")+xlab("Home Ownership")+theme(legend.position="top",axis.text.x=element_text(size=5))+ggtitle("Interest Rates by Home Ownership in Tristate Area")

su_intrate<-lendingdata %>% 
  group_by(addr_state) %>% 
  summarize(avg_irate=mean(int_rate))
su_intrate<-su_intrate[!(su_intrate$addr_state==""),]
ggplot(su_intrate,aes(x=addr_state,y=avg_irate))+geom_point()+ylab("Interest Rate %")+xlab("US State")+ggtitle("Average Interest Rate by State")+theme(axis.text.x=element_text(size=8,angle=90))

b<-subset(lendingdata,((addr_state=="NY")|(addr_state=="NJ")),select=c(addr_state,emp_lencat,int_rate))
ggplot(b, aes(y=int_rate, x=emp_lencat)) + geom_boxplot(aes(color=emp_lencat))+ylab("Interest Rate %")+xlab("Employment Length")
## Warning: Removed 4104 rows containing non-finite values (stat_boxplot).

lendingdata$annual_inc<-lendingdata$annual_inc/1000
z<-ggplot(lendingdata,aes(x=annual_inc,y=int_rate))+geom_point(aes(color=emp_lencat))+ylab("Interest Rate %")+xlab("Annual Income in Ths dollars")+ggtitle("Loan Interest Rates by Employment Category and Income")
z+facet_grid(~emp_lencat)
## Warning: Removed 2 rows containing missing values (geom_point).

p<-ggplot(lendingdata,aes(x=cr_hist,y=int_rate))+geom_point(aes(color=emp_lencat))+ylab("Interest Rate %")+xlab("Credit History in Years")+ggtitle("Loan Interest Rates by Credit History and Employment")+scale_x_continuous(breaks=seq(0,75,15))
p+facet_grid(~emp_lencat)
## Warning: Removed 2 rows containing missing values (geom_point).

Dataset 2: Citibike Usage Data

datadir=file.path("C://Users//talha//Documents//Training//Datasets//Transportation//Citibike//")
filename1=file.path(datadir,"201509-citibike-tripdata.csv")
filename2=file.path(datadir,"201609-citibike-tripdata.csv")
# dataset can be located at
#load and merge the files
bikedata<-read.csv(filename1,header=TRUE,stringsAsFactors=FALSE)
sep16<-read.csv(filename2,header=TRUE,stringsAsFactors=FALSE)
head(bikedata)
##   tripduration         starttime          stoptime start.station.id
## 1          288 9/1/2015 00:00:00 9/1/2015 00:04:48              263
## 2          165 9/1/2015 00:00:00 9/1/2015 00:02:45              495
## 3          367 9/1/2015 00:00:01 9/1/2015 00:06:08             3119
## 4          926 9/1/2015 00:00:07 9/1/2015 00:15:34              536
## 5          658 9/1/2015 00:00:09 9/1/2015 00:11:07              347
## 6          392 9/1/2015 00:00:13 9/1/2015 00:06:46             2004
##            start.station.name start.station.latitude
## 1    Elizabeth St & Hester St               40.71729
## 2            W 47 St & 10 Ave               40.76270
## 3        Vernon Blvd & 50 Ave               40.74233
## 4             1 Ave & E 30 St               40.74144
## 5 Greenwich St & W Houston St               40.72885
## 6           6 Ave & Broome St               40.72440
##   start.station.longitude end.station.id           end.station.name
## 1               -73.99638            307      Canal St & Rutgers St
## 2               -73.99301            449            W 52 St & 9 Ave
## 3               -73.95412           3118 McGuinness Blvd & Eagle St
## 4               -73.97536            340    Madison St & Clinton St
## 5               -74.00859            483            E 12 St & 3 Ave
## 6               -74.00470            254            W 11 St & 6 Ave
##   end.station.latitude end.station.longitude bikeid   usertype birth.year
## 1             40.71427             -73.98990  15479 Subscriber       1989
## 2             40.76462             -73.98789  23899 Subscriber       1987
## 3             40.73555             -73.95284  23921   Customer         NA
## 4             40.71269             -73.98776  23256 Subscriber       1992
## 5             40.73223             -73.98890  17289 Subscriber       1976
## 6             40.73532             -73.99800  15438 Subscriber       1955
##   gender
## 1      1
## 2      1
## 3      0
## 4      1
## 5      1
## 6      1
head(sep16)
##   tripduration         starttime          stoptime start.station.id
## 1          975 9/1/2016 00:00:02 9/1/2016 00:16:18              312
## 2         1220 9/1/2016 00:00:04 9/1/2016 00:20:25              316
## 3          308 9/1/2016 00:00:19 9/1/2016 00:05:27              479
## 4          250 9/1/2016 00:00:25 9/1/2016 00:04:35              297
## 5          439 9/1/2016 00:00:25 9/1/2016 00:07:45              494
## 6          730 9/1/2016 00:00:39 9/1/2016 00:12:50              491
##       start.station.name start.station.latitude start.station.longitude
## 1  Allen St & Stanton St               40.72205               -73.98911
## 2 Fulton St & William St               40.70956               -74.00654
## 3        9 Ave & W 45 St               40.76019               -73.99126
## 4        E 15 St & 3 Ave               40.73423               -73.98692
## 5        W 26 St & 8 Ave               40.74735               -73.99724
## 6   E 24 St & Park Ave S               40.74096               -73.98602
##   end.station.id          end.station.name end.station.latitude
## 1            313 Washington Ave & Park Ave             40.69610
## 2            239  Willoughby St & Fleet St             40.69197
## 3            448          W 37 St & 10 Ave             40.75660
## 4            487       E 20 St & FDR Drive             40.73314
## 5            533        Broadway & W 39 St             40.75300
## 6            477           W 41 St & 8 Ave             40.75641
##   end.station.longitude bikeid   usertype birth.year gender
## 1             -73.96751  22609 Subscriber       1985      1
## 2             -73.98130  16966 Subscriber       1977      2
## 3             -73.99790  25601 Subscriber       1983      1
## 4             -73.97574  22094 Subscriber       1953      1
## 5             -73.98722  16319 Subscriber       1985      1
## 6             -73.99003  23730 Subscriber       1986      1
bikedata<-bind_rows(bikedata,sep16)
sep16<-NULL
#clean & classify date data
bikedata$starttime<-strptime(bikedata$starttime,"%m/%d/%Y %H:%M:%S")
bikedata$starttime<-as.POSIXct(bikedata$starttime,format="%m/%d/%Y %H:%M:%S")
bikedata$stday=format(as.POSIXct(bikedata$starttime, format="%m/%d/%Y %H:%M:%S"), format="%u")
bikedata$stdate=format(as.POSIXct(bikedata$starttime, format="%m/%d/%Y %H:%M:%S"), format="%F")
bikedata$sthour=format(as.POSIXct(bikedata$starttime, format="%m/%d/%Y %H:%M:%S"), format="%H")
bikedata$year=format(as.POSIXct(bikedata$starttime, format="%m/%d/%Y %H:%M:%S"), format="%Y")
bikedata$stoptime<-strptime(bikedata$stoptime,"%m/%d/%Y %H:%M:%S")
bikedata$stoptime<-as.POSIXct(bikedata$stoptime,format="%m/%d/%Y %H:%M:%S")
bikedata$endday=format(as.POSIXct(bikedata$stoptime, format="%m/%d/%Y %H:%M:%S"), format="%u")
bikedata$enddate=format(as.POSIXct(bikedata$stoptime, format="%m/%d/%Y %H:%M:%S"), format="%F")
bikedata$endhour=format(as.POSIXct(bikedata$stoptime, format="%m/%d/%Y %H:%M:%S"), format="%H")
#trip duration in minutes
bikedata$tripduration<-bikedata$tripduration/60
#age of users
bikedata$year<-as.integer(bikedata$year)
bikedata<-mutate(bikedata,age=year-birth.year)
#flag for start end location
bikedata<-mutate(bikedata,sameloc=ifelse(start.station.id==end.station.id,1,0))
#age category
bikedata<-mutate(bikedata,age_cat=ifelse(age<=21,"<21",ifelse((age>21 & age<=29),"21-29",ifelse((age>29&age<=39),"30-39",ifelse((age>39&age<=49),"40-49",ifelse(age>50,"50+",NA))))))
#weekend flag 
bikedata<-mutate(bikedata,weekend=ifelse(stday<=5,0,1))
#gender 
bikedata$gender<-as.factor(bikedata$gender)
bikedata$gender<-factor(bikedata$gender,labels=c("unknown","male","female"))
head(bikedata)
##   tripduration           starttime            stoptime start.station.id
## 1     4.800000 2015-09-01 00:00:00 2015-09-01 00:04:48              263
## 2     2.750000 2015-09-01 00:00:00 2015-09-01 00:02:45              495
## 3     6.116667 2015-09-01 00:00:01 2015-09-01 00:06:08             3119
## 4    15.433333 2015-09-01 00:00:07 2015-09-01 00:15:34              536
## 5    10.966667 2015-09-01 00:00:09 2015-09-01 00:11:07              347
## 6     6.533333 2015-09-01 00:00:13 2015-09-01 00:06:46             2004
##            start.station.name start.station.latitude
## 1    Elizabeth St & Hester St               40.71729
## 2            W 47 St & 10 Ave               40.76270
## 3        Vernon Blvd & 50 Ave               40.74233
## 4             1 Ave & E 30 St               40.74144
## 5 Greenwich St & W Houston St               40.72885
## 6           6 Ave & Broome St               40.72440
##   start.station.longitude end.station.id           end.station.name
## 1               -73.99638            307      Canal St & Rutgers St
## 2               -73.99301            449            W 52 St & 9 Ave
## 3               -73.95412           3118 McGuinness Blvd & Eagle St
## 4               -73.97536            340    Madison St & Clinton St
## 5               -74.00859            483            E 12 St & 3 Ave
## 6               -74.00470            254            W 11 St & 6 Ave
##   end.station.latitude end.station.longitude bikeid   usertype birth.year
## 1             40.71427             -73.98990  15479 Subscriber       1989
## 2             40.76462             -73.98789  23899 Subscriber       1987
## 3             40.73555             -73.95284  23921   Customer         NA
## 4             40.71269             -73.98776  23256 Subscriber       1992
## 5             40.73223             -73.98890  17289 Subscriber       1976
## 6             40.73532             -73.99800  15438 Subscriber       1955
##    gender stday     stdate sthour year endday    enddate endhour age
## 1    male     2 2015-09-01     00 2015      2 2015-09-01      00  26
## 2    male     2 2015-09-01     00 2015      2 2015-09-01      00  28
## 3 unknown     2 2015-09-01     00 2015      2 2015-09-01      00  NA
## 4    male     2 2015-09-01     00 2015      2 2015-09-01      00  23
## 5    male     2 2015-09-01     00 2015      2 2015-09-01      00  39
## 6    male     2 2015-09-01     00 2015      2 2015-09-01      00  60
##   sameloc age_cat weekend
## 1       0   21-29       0
## 2       0   21-29       0
## 3       0    <NA>       0
## 4       0   21-29       0
## 5       0   30-39       0
## 6       0     50+       0
#create summary of users by age
subike_age<-bikedata %>% 
  group_by(year,age_cat) %>% 
  count()
subike_age<-subike_age[!(is.na(subike_age$age_cat)),]
subike_age$n<-subike_age$n/1000
z<-ggplot(subike_age,aes(x=age_cat,y=n, color=age_cat,fill=age_cat))+geom_bar(stat="identity")+ylab("Users in Thousands")+theme(axis.text.x=element_text(size=8,angle=90))+ xlab("Age Category")+ggtitle("Users in Sept 2015 vs Sept 2016")
z+facet_grid(~year)

#create summary of users by sex type
subike_sex<-bikedata %>% 
  group_by(year,gender) %>% 
  count()
subike_sex$n<-subike_sex$n/1000
z<-ggplot(subike_sex,aes(x=gender,y=n, color=gender,fill=gender))+geom_bar(stat="identity")+ylab("Users in Thousands")+theme(axis.text.x=element_text(size=8,angle=90))+ xlab("User Sex")+ggtitle("Users in Sept 2015 vs Sept 2016")
z+facet_grid(~year)

#summary of users by subscriber type
subike_sub<-bikedata %>% 
  group_by(year,usertype) %>% 
  count()
subike_sub$n<-subike_sub$n/1000
z<-ggplot(subike_sub,aes(x=usertype,y=n, color=usertype,fill=usertype))+geom_bar(stat="identity")+ylab("Users in Thousands")+theme(axis.text.x=element_text(size=8,angle=90))+ xlab("User Type")+ggtitle("Users in Sept 2015 vs Sept 2016")
z+facet_grid(~year)

Dataset 3: Chicago Crime Data Set

datadir=file.path("C://Users//talha//Documents//Training//Datasets//Crime//Chicago//")
filename1=file.path(datadir,"Crimes.csv")
crimedata<-read.csv(filename1,header=TRUE,stringsAsFactors=FALSE)
#rename variables
crimedata<-rename(crimedata,id=ï..ID,casenum=Case.Number,crimedate=Date,primary=Primary.Type,area=Community.Area,loc_type=Location.Description,fbicode=FBI.Code,x_loc=X.Coordinate,y_loc=Y.Coordinate,last_update=Updated.On)
#change case of column names
names(crimedata)<-tolower(names(crimedata))
#change to NULL
crimedata$x<-NULL
crimedata<-separate(crimedata,location,c("lat","long"),sep=",")
## Warning: Too few values at 23376 locations: 7, 164, 165, 198, 246, 258,
## 281, 287, 300, 302, 306, 326, 329, 331, 335, 336, 348, 354, 425, 560, ...
crimedata$lat<-transmute(crimedata,lat=ifelse(lat=="",NA,lat))
head(crimedata)
##         id  casenum              crimedate                 block iucr
## 1 10001595 HY191041 11/02/2014 05:57:00 AM      003XX W 110TH ST 2825
## 2 10007031 HY196398 03/08/2015 09:00:00 AM     082XX S EVANS AVE 1155
## 3 10009684 HY199045 03/08/2015 04:00:00 AM 018XX N SHEFFIELD AVE 1150
## 4 10012713 HY202475 03/08/2015 08:00:00 AM  084XX S EXCHANGE AVE 5002
## 5 10033820 HY223111 11/02/2014 08:00:00 AM     030XX N DRAKE AVE 0281
## 6 10158010 HY347156 07/19/2015 03:44:00 PM       016XX W 46TH ST 143B
##               primary                         description  loc_type arrest
## 1       OTHER OFFENSE             HARASSMENT BY TELEPHONE RESIDENCE  false
## 2  DECEPTIVE PRACTICE AGGRAVATED FINANCIAL IDENTITY THEFT APARTMENT  false
## 3  DECEPTIVE PRACTICE                   CREDIT CARD FRAUD RESIDENCE  false
## 4       OTHER OFFENSE               OTHER VEHICLE OFFENSE    STREET  false
## 5 CRIM SEXUAL ASSAULT                      NON-AGGRAVATED RESIDENCE  false
## 6   WEAPONS VIOLATION         UNLAWFUL POSS OTHER FIREARM    STREET   true
##   domestic beat district ward area fbicode   x_loc   y_loc year
## 1    false  513        5   34   49      26 1176052 1831981 2014
## 2    false  631        6    6   44      11 1182660 1850549 2015
## 3    false 1813       18   43    7      11 1169343 1912524 2015
## 4     true  423        4   10   46      26 1197281 1849663 2015
## 5     true 1412       14   35   21      02 1152188 1920160 2014
## 6    false  924        9    3   61      15 1165992 1874208 2015
##              last_update latitude longitude        lat         long
## 1 02/04/2016 06:33:39 AM 41.69431 -87.63105 (41.694312  -87.631046)
## 2 08/17/2015 03:03:40 PM 41.74511 -87.60628 (41.745115  -87.606279)
## 3 08/17/2015 03:03:40 PM 41.91548 -87.65328 (41.915478  -87.653277)
## 4 08/17/2015 03:03:40 PM 41.74233 -87.55274 (41.742332  -87.552736)
## 5 02/04/2016 06:33:39 AM 41.93679 -87.71610 (41.936788  -87.716101)
## 6 04/25/2016 03:49:09 PM 41.81041 -87.66668 (41.810408  -87.666681)