library(tidyr)
library(dplyr)
library(stringr)
library(ggplot2)
This analysis looks at the 2012-2013 loans data from Lending Tree.
lendingdata<-read.csv(filename,header=TRUE,stringsAsFactors=FALSE,skip=1)
#dataset can be located at
lendingdata$url<-NULL
lendingdata$desc<-NULL
#clean up zip code data
lendingdata$zip_code<-str_replace(lendingdata$zip_code,"xx","")
#clean interest rate data
lendingdata$int_rate<-as.numeric(str_replace(lendingdata$int_rate,"%",""))
summary(lendingdata$int_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 6.00 11.14 14.09 14.28 17.27 26.06 2
lendingdata$revol_util<-as.numeric(str_replace(lendingdata$revol_util,"%",""))
summary(lendingdata$revol_util)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 42.50 60.50 58.39 76.60 140.40 127
# clean loan term
lendingdata$term<-as.numeric(str_replace(lendingdata$term,"months",""))
summary(lendingdata$term)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 36.00 36.00 36.00 41.65 36.00 60.00 2
# clean emp_length
lendingdata$emp_length<-str_trim(str_replace(lendingdata$emp_length,"years",""),"both")
lendingdata$emp_length<-str_replace(lendingdata$emp_length,"\\+","")
lendingdata$emp_length<-as.numeric(lendingdata$emp_length)
lendingdata<-mutate(lendingdata,emp_lencat=ifelse(emp_length<1,"<1",ifelse(((emp_length>=1)&(emp_length<3)),"1-3",ifelse(((emp_length>=3)&(emp_length<5)),"3-5",ifelse(((emp_length>=5)&(emp_length<=9)),"5-9",ifelse(emp_length>9,"10+",NA))))))
#process dates
lendingdata<-rename(lendingdata,cr_line=earliest_cr_line)
lendingdata$cr_line<-str_c("01-",lendingdata$cr_line)
lendingdata$cr_line<-as.Date(lendingdata$cr_line,format="%d-%b-%Y")
lendingdata$last_pymnt_d<-str_c("01-",lendingdata$last_pymnt_d)
lendingdata$last_pymnt_d<-as.Date(lendingdata$last_pymnt_d,format="%d-%b-%Y")
lendingdata$issue_d<-str_c("01-",lendingdata$issue_d)
lendingdata$issue_d<-as.Date(lendingdata$issue_d,format="%d-%b-%Y")
#credit history
lendingdata<-mutate(lendingdata,cr_hist=difftime(issue_d,cr_line,units=c("weeks")))
lendingdata$cr_hist<-lendingdata$cr_hist/52 #convert to years
lendingdata$last_pymnt_d<-format(lendingdata$last_pymnt_d,"%b-%Y")
lendingdata$issue_d<-format(lendingdata$issue_d,"%b-%Y")
lendingdata$cr_line<-format(lendingdata$cr_line,"%b-%Y")
#convert to factors
lendingdata$application_type<-as.factor(lendingdata$application_type)
lendingdata$home_ownership<-as.factor(lendingdata$home_ownership)
lendingdata$verification_status<-as.factor(lendingdata$verification_status)
lendingdata$loan_status<-as.factor(lendingdata$loan_status)
#select variables of interest
lendingdata<-select(lendingdata,one_of(c("id","member_id","addr_state","loan_amnt","int_rate","term","installment","emp_length","home_ownership","annual_inc","cr_hist","issue_d","cr_line","last_pymnt_d","verification_status","loan_status","emp_length","emp_lencat")))
head(lendingdata)
## id member_id addr_state loan_amnt int_rate term installment
## 1 10159611 12011228 MA 10000 9.67 36 321.13
## 2 10129477 11981093 NC 14000 12.85 36 470.71
## 3 10149342 12000897 MI 27050 10.99 36 885.46
## 4 10139658 11991209 NM 12000 13.53 36 407.40
## 5 10159548 12011167 FL 15000 8.90 36 476.30
## 6 10119623 11971241 CO 12000 11.99 36 398.52
## emp_length home_ownership annual_inc cr_hist issue_d cr_line
## 1 7 MORTGAGE 102000 24.25000 weeks Dec-2013 Oct-1989
## 2 4 RENT 88000 25.58791 weeks Dec-2013 Jun-1988
## 3 10 OWN 55000 27.26099 weeks Dec-2013 Oct-1986
## 4 10 RENT 40000 15.21978 weeks Dec-2013 Oct-1998
## 5 2 MORTGAGE 63000 15.80769 weeks Dec-2013 Mar-1998
## 6 10 MORTGAGE 130000 16.13736 weeks Dec-2013 Nov-1997
## last_pymnt_d verification_status loan_status emp_lencat
## 1 Sep-2016 Not Verified Current 5-9
## 2 Sep-2016 Not Verified Current 3-5
## 3 Jul-2016 Verified Fully Paid 10+
## 4 Sep-2015 Source Verified Fully Paid 10+
## 5 Sep-2016 Not Verified Current 1-3
## 6 Sep-2016 Source Verified Current 10+
Various summaries and charts of the clean data are presented below:
#summary of loans by state
su_total<-lendingdata %>%
group_by(addr_state) %>%
summarize(total_loans=sum(loan_amnt/1000000))
su_total<-su_total[!(su_total$addr_state==""),]
ggplot(su_total,aes(x=addr_state,y=total_loans))+geom_bar(stat="identity")+ylab("Total Loans (in Millions)")+theme(axis.text.x=element_text(size=8,angle=90))+ xlab("US State")+ggtitle("Total Loans by State (in Millions)")
a<-subset(lendingdata,((addr_state=="NY")|(addr_state=="NJ")|(addr_state=="CT")),select=c(addr_state,home_ownership,int_rate))
y <- ggplot(a, aes(y=int_rate, x=home_ownership)) + geom_boxplot(aes(color=home_ownership))
y+facet_grid(~addr_state)+ylab("Interest Rate %")+xlab("Home Ownership")+theme(legend.position="top",axis.text.x=element_text(size=5))+ggtitle("Interest Rates by Home Ownership in Tristate Area")
su_intrate<-lendingdata %>%
group_by(addr_state) %>%
summarize(avg_irate=mean(int_rate))
su_intrate<-su_intrate[!(su_intrate$addr_state==""),]
ggplot(su_intrate,aes(x=addr_state,y=avg_irate))+geom_point()+ylab("Interest Rate %")+xlab("US State")+ggtitle("Average Interest Rate by State")+theme(axis.text.x=element_text(size=8,angle=90))
b<-subset(lendingdata,((addr_state=="NY")|(addr_state=="NJ")),select=c(addr_state,emp_lencat,int_rate))
ggplot(b, aes(y=int_rate, x=emp_lencat)) + geom_boxplot(aes(color=emp_lencat))+ylab("Interest Rate %")+xlab("Employment Length")
## Warning: Removed 4104 rows containing non-finite values (stat_boxplot).
lendingdata$annual_inc<-lendingdata$annual_inc/1000
z<-ggplot(lendingdata,aes(x=annual_inc,y=int_rate))+geom_point(aes(color=emp_lencat))+ylab("Interest Rate %")+xlab("Annual Income in Ths dollars")+ggtitle("Loan Interest Rates by Employment Category and Income")
z+facet_grid(~emp_lencat)
## Warning: Removed 2 rows containing missing values (geom_point).
p<-ggplot(lendingdata,aes(x=cr_hist,y=int_rate))+geom_point(aes(color=emp_lencat))+ylab("Interest Rate %")+xlab("Credit History in Years")+ggtitle("Loan Interest Rates by Credit History and Employment")+scale_x_continuous(breaks=seq(0,75,15))
p+facet_grid(~emp_lencat)
## Warning: Removed 2 rows containing missing values (geom_point).
datadir=file.path("C://Users//talha//Documents//Training//Datasets//Transportation//Citibike//")
filename1=file.path(datadir,"201509-citibike-tripdata.csv")
filename2=file.path(datadir,"201609-citibike-tripdata.csv")
# dataset can be located at
#load and merge the files
bikedata<-read.csv(filename1,header=TRUE,stringsAsFactors=FALSE)
sep16<-read.csv(filename2,header=TRUE,stringsAsFactors=FALSE)
head(bikedata)
## tripduration starttime stoptime start.station.id
## 1 288 9/1/2015 00:00:00 9/1/2015 00:04:48 263
## 2 165 9/1/2015 00:00:00 9/1/2015 00:02:45 495
## 3 367 9/1/2015 00:00:01 9/1/2015 00:06:08 3119
## 4 926 9/1/2015 00:00:07 9/1/2015 00:15:34 536
## 5 658 9/1/2015 00:00:09 9/1/2015 00:11:07 347
## 6 392 9/1/2015 00:00:13 9/1/2015 00:06:46 2004
## start.station.name start.station.latitude
## 1 Elizabeth St & Hester St 40.71729
## 2 W 47 St & 10 Ave 40.76270
## 3 Vernon Blvd & 50 Ave 40.74233
## 4 1 Ave & E 30 St 40.74144
## 5 Greenwich St & W Houston St 40.72885
## 6 6 Ave & Broome St 40.72440
## start.station.longitude end.station.id end.station.name
## 1 -73.99638 307 Canal St & Rutgers St
## 2 -73.99301 449 W 52 St & 9 Ave
## 3 -73.95412 3118 McGuinness Blvd & Eagle St
## 4 -73.97536 340 Madison St & Clinton St
## 5 -74.00859 483 E 12 St & 3 Ave
## 6 -74.00470 254 W 11 St & 6 Ave
## end.station.latitude end.station.longitude bikeid usertype birth.year
## 1 40.71427 -73.98990 15479 Subscriber 1989
## 2 40.76462 -73.98789 23899 Subscriber 1987
## 3 40.73555 -73.95284 23921 Customer NA
## 4 40.71269 -73.98776 23256 Subscriber 1992
## 5 40.73223 -73.98890 17289 Subscriber 1976
## 6 40.73532 -73.99800 15438 Subscriber 1955
## gender
## 1 1
## 2 1
## 3 0
## 4 1
## 5 1
## 6 1
head(sep16)
## tripduration starttime stoptime start.station.id
## 1 975 9/1/2016 00:00:02 9/1/2016 00:16:18 312
## 2 1220 9/1/2016 00:00:04 9/1/2016 00:20:25 316
## 3 308 9/1/2016 00:00:19 9/1/2016 00:05:27 479
## 4 250 9/1/2016 00:00:25 9/1/2016 00:04:35 297
## 5 439 9/1/2016 00:00:25 9/1/2016 00:07:45 494
## 6 730 9/1/2016 00:00:39 9/1/2016 00:12:50 491
## start.station.name start.station.latitude start.station.longitude
## 1 Allen St & Stanton St 40.72205 -73.98911
## 2 Fulton St & William St 40.70956 -74.00654
## 3 9 Ave & W 45 St 40.76019 -73.99126
## 4 E 15 St & 3 Ave 40.73423 -73.98692
## 5 W 26 St & 8 Ave 40.74735 -73.99724
## 6 E 24 St & Park Ave S 40.74096 -73.98602
## end.station.id end.station.name end.station.latitude
## 1 313 Washington Ave & Park Ave 40.69610
## 2 239 Willoughby St & Fleet St 40.69197
## 3 448 W 37 St & 10 Ave 40.75660
## 4 487 E 20 St & FDR Drive 40.73314
## 5 533 Broadway & W 39 St 40.75300
## 6 477 W 41 St & 8 Ave 40.75641
## end.station.longitude bikeid usertype birth.year gender
## 1 -73.96751 22609 Subscriber 1985 1
## 2 -73.98130 16966 Subscriber 1977 2
## 3 -73.99790 25601 Subscriber 1983 1
## 4 -73.97574 22094 Subscriber 1953 1
## 5 -73.98722 16319 Subscriber 1985 1
## 6 -73.99003 23730 Subscriber 1986 1
bikedata<-bind_rows(bikedata,sep16)
sep16<-NULL
#clean & classify date data
bikedata$starttime<-strptime(bikedata$starttime,"%m/%d/%Y %H:%M:%S")
bikedata$starttime<-as.POSIXct(bikedata$starttime,format="%m/%d/%Y %H:%M:%S")
bikedata$stday=format(as.POSIXct(bikedata$starttime, format="%m/%d/%Y %H:%M:%S"), format="%u")
bikedata$stdate=format(as.POSIXct(bikedata$starttime, format="%m/%d/%Y %H:%M:%S"), format="%F")
bikedata$sthour=format(as.POSIXct(bikedata$starttime, format="%m/%d/%Y %H:%M:%S"), format="%H")
bikedata$year=format(as.POSIXct(bikedata$starttime, format="%m/%d/%Y %H:%M:%S"), format="%Y")
bikedata$stoptime<-strptime(bikedata$stoptime,"%m/%d/%Y %H:%M:%S")
bikedata$stoptime<-as.POSIXct(bikedata$stoptime,format="%m/%d/%Y %H:%M:%S")
bikedata$endday=format(as.POSIXct(bikedata$stoptime, format="%m/%d/%Y %H:%M:%S"), format="%u")
bikedata$enddate=format(as.POSIXct(bikedata$stoptime, format="%m/%d/%Y %H:%M:%S"), format="%F")
bikedata$endhour=format(as.POSIXct(bikedata$stoptime, format="%m/%d/%Y %H:%M:%S"), format="%H")
#trip duration in minutes
bikedata$tripduration<-bikedata$tripduration/60
#age of users
bikedata$year<-as.integer(bikedata$year)
bikedata<-mutate(bikedata,age=year-birth.year)
#flag for start end location
bikedata<-mutate(bikedata,sameloc=ifelse(start.station.id==end.station.id,1,0))
#age category
bikedata<-mutate(bikedata,age_cat=ifelse(age<=21,"<21",ifelse((age>21 & age<=29),"21-29",ifelse((age>29&age<=39),"30-39",ifelse((age>39&age<=49),"40-49",ifelse(age>50,"50+",NA))))))
#weekend flag
bikedata<-mutate(bikedata,weekend=ifelse(stday<=5,0,1))
#gender
bikedata$gender<-as.factor(bikedata$gender)
bikedata$gender<-factor(bikedata$gender,labels=c("unknown","male","female"))
head(bikedata)
## tripduration starttime stoptime start.station.id
## 1 4.800000 2015-09-01 00:00:00 2015-09-01 00:04:48 263
## 2 2.750000 2015-09-01 00:00:00 2015-09-01 00:02:45 495
## 3 6.116667 2015-09-01 00:00:01 2015-09-01 00:06:08 3119
## 4 15.433333 2015-09-01 00:00:07 2015-09-01 00:15:34 536
## 5 10.966667 2015-09-01 00:00:09 2015-09-01 00:11:07 347
## 6 6.533333 2015-09-01 00:00:13 2015-09-01 00:06:46 2004
## start.station.name start.station.latitude
## 1 Elizabeth St & Hester St 40.71729
## 2 W 47 St & 10 Ave 40.76270
## 3 Vernon Blvd & 50 Ave 40.74233
## 4 1 Ave & E 30 St 40.74144
## 5 Greenwich St & W Houston St 40.72885
## 6 6 Ave & Broome St 40.72440
## start.station.longitude end.station.id end.station.name
## 1 -73.99638 307 Canal St & Rutgers St
## 2 -73.99301 449 W 52 St & 9 Ave
## 3 -73.95412 3118 McGuinness Blvd & Eagle St
## 4 -73.97536 340 Madison St & Clinton St
## 5 -74.00859 483 E 12 St & 3 Ave
## 6 -74.00470 254 W 11 St & 6 Ave
## end.station.latitude end.station.longitude bikeid usertype birth.year
## 1 40.71427 -73.98990 15479 Subscriber 1989
## 2 40.76462 -73.98789 23899 Subscriber 1987
## 3 40.73555 -73.95284 23921 Customer NA
## 4 40.71269 -73.98776 23256 Subscriber 1992
## 5 40.73223 -73.98890 17289 Subscriber 1976
## 6 40.73532 -73.99800 15438 Subscriber 1955
## gender stday stdate sthour year endday enddate endhour age
## 1 male 2 2015-09-01 00 2015 2 2015-09-01 00 26
## 2 male 2 2015-09-01 00 2015 2 2015-09-01 00 28
## 3 unknown 2 2015-09-01 00 2015 2 2015-09-01 00 NA
## 4 male 2 2015-09-01 00 2015 2 2015-09-01 00 23
## 5 male 2 2015-09-01 00 2015 2 2015-09-01 00 39
## 6 male 2 2015-09-01 00 2015 2 2015-09-01 00 60
## sameloc age_cat weekend
## 1 0 21-29 0
## 2 0 21-29 0
## 3 0 <NA> 0
## 4 0 21-29 0
## 5 0 30-39 0
## 6 0 50+ 0
#create summary of users by age
subike_age<-bikedata %>%
group_by(year,age_cat) %>%
count()
subike_age<-subike_age[!(is.na(subike_age$age_cat)),]
subike_age$n<-subike_age$n/1000
z<-ggplot(subike_age,aes(x=age_cat,y=n, color=age_cat,fill=age_cat))+geom_bar(stat="identity")+ylab("Users in Thousands")+theme(axis.text.x=element_text(size=8,angle=90))+ xlab("Age Category")+ggtitle("Users in Sept 2015 vs Sept 2016")
z+facet_grid(~year)
#create summary of users by sex type
subike_sex<-bikedata %>%
group_by(year,gender) %>%
count()
subike_sex$n<-subike_sex$n/1000
z<-ggplot(subike_sex,aes(x=gender,y=n, color=gender,fill=gender))+geom_bar(stat="identity")+ylab("Users in Thousands")+theme(axis.text.x=element_text(size=8,angle=90))+ xlab("User Sex")+ggtitle("Users in Sept 2015 vs Sept 2016")
z+facet_grid(~year)
#summary of users by subscriber type
subike_sub<-bikedata %>%
group_by(year,usertype) %>%
count()
subike_sub$n<-subike_sub$n/1000
z<-ggplot(subike_sub,aes(x=usertype,y=n, color=usertype,fill=usertype))+geom_bar(stat="identity")+ylab("Users in Thousands")+theme(axis.text.x=element_text(size=8,angle=90))+ xlab("User Type")+ggtitle("Users in Sept 2015 vs Sept 2016")
z+facet_grid(~year)
datadir=file.path("C://Users//talha//Documents//Training//Datasets//Crime//Chicago//")
filename1=file.path(datadir,"Crimes.csv")
crimedata<-read.csv(filename1,header=TRUE,stringsAsFactors=FALSE)
#rename variables
crimedata<-rename(crimedata,id=ï..ID,casenum=Case.Number,crimedate=Date,primary=Primary.Type,area=Community.Area,loc_type=Location.Description,fbicode=FBI.Code,x_loc=X.Coordinate,y_loc=Y.Coordinate,last_update=Updated.On)
#change case of column names
names(crimedata)<-tolower(names(crimedata))
#change to NULL
crimedata$x<-NULL
crimedata<-separate(crimedata,location,c("lat","long"),sep=",")
## Warning: Too few values at 23376 locations: 7, 164, 165, 198, 246, 258,
## 281, 287, 300, 302, 306, 326, 329, 331, 335, 336, 348, 354, 425, 560, ...
crimedata$lat<-transmute(crimedata,lat=ifelse(lat=="",NA,lat))
head(crimedata)
## id casenum crimedate block iucr
## 1 10001595 HY191041 11/02/2014 05:57:00 AM 003XX W 110TH ST 2825
## 2 10007031 HY196398 03/08/2015 09:00:00 AM 082XX S EVANS AVE 1155
## 3 10009684 HY199045 03/08/2015 04:00:00 AM 018XX N SHEFFIELD AVE 1150
## 4 10012713 HY202475 03/08/2015 08:00:00 AM 084XX S EXCHANGE AVE 5002
## 5 10033820 HY223111 11/02/2014 08:00:00 AM 030XX N DRAKE AVE 0281
## 6 10158010 HY347156 07/19/2015 03:44:00 PM 016XX W 46TH ST 143B
## primary description loc_type arrest
## 1 OTHER OFFENSE HARASSMENT BY TELEPHONE RESIDENCE false
## 2 DECEPTIVE PRACTICE AGGRAVATED FINANCIAL IDENTITY THEFT APARTMENT false
## 3 DECEPTIVE PRACTICE CREDIT CARD FRAUD RESIDENCE false
## 4 OTHER OFFENSE OTHER VEHICLE OFFENSE STREET false
## 5 CRIM SEXUAL ASSAULT NON-AGGRAVATED RESIDENCE false
## 6 WEAPONS VIOLATION UNLAWFUL POSS OTHER FIREARM STREET true
## domestic beat district ward area fbicode x_loc y_loc year
## 1 false 513 5 34 49 26 1176052 1831981 2014
## 2 false 631 6 6 44 11 1182660 1850549 2015
## 3 false 1813 18 43 7 11 1169343 1912524 2015
## 4 true 423 4 10 46 26 1197281 1849663 2015
## 5 true 1412 14 35 21 02 1152188 1920160 2014
## 6 false 924 9 3 61 15 1165992 1874208 2015
## last_update latitude longitude lat long
## 1 02/04/2016 06:33:39 AM 41.69431 -87.63105 (41.694312 -87.631046)
## 2 08/17/2015 03:03:40 PM 41.74511 -87.60628 (41.745115 -87.606279)
## 3 08/17/2015 03:03:40 PM 41.91548 -87.65328 (41.915478 -87.653277)
## 4 08/17/2015 03:03:40 PM 41.74233 -87.55274 (41.742332 -87.552736)
## 5 02/04/2016 06:33:39 AM 41.93679 -87.71610 (41.936788 -87.716101)
## 6 04/25/2016 03:49:09 PM 41.81041 -87.66668 (41.810408 -87.666681)