Use data set of Credit Car Loans.
1. Analysis of Data
2. Visualization of Data
3. Cluster Analysis of Car Credit Loans
Setup
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(vcd)
## Loading required package: grid
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(cluster)
Functions
detect_outliers <- function(inp, na.rm=TRUE) {
i.qnt <- quantile(inp, probs=c(.25, .75), na.rm=na.rm)
i.max <- 1.5 * IQR(inp, na.rm=na.rm)
otp <- inp
otp[inp < (i.qnt[1] - i.max)] <- NA
otp[inp > (i.qnt[2] + i.max)] <- NA
#inp <- count(inp[is.na(otp)])
sum(is.na(otp))
}
Non_outliers <- function(x, na.rm = TRUE, ...) {
qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
H <- 1.5 * IQR(x, na.rm = na.rm)
y <- x
y[x < (qnt[1] - H)] <- NA
y[x > (qnt[2] + H)] <- NA
y
}
Remove_Outliers <- function ( z, na.rm = TRUE){
Out <- Non_outliers(z)
Out <-as.data.frame (Out)
z <- Out$Out[match(z, Out$Out)]
z
}
Graph_Boxplot <- function (input, na.rm = TRUE){
Plot <- ggplot(dfr_Model, aes(x="", y=input)) +
geom_boxplot(aes(fill=input), color="green") +
labs(title="Outliers")
Plot
}
detect_na <- function(inp) {
sum(is.na(inp))
}
Dataset
setwd("D:/Welingkar/Trim 4/Data and Information/Project/Data")
dfr_Model <- read.csv("Full_Clean_Car_Data.csv", header=T, stringsAsFactors=F)
intRowCount <- nrow(dfr_Model)
head(dfr_Model)
## Cust_ID Car_ID State location_code age Gender marital_status Education
## 1 10001 14 1 2 45 0 1 3
## 2 10002 19 4 2 50 0 0 3
## 3 10003 19 5 2 33 0 1 3
## 4 10004 21 2 2 27 1 1 3
## 5 10005 36 1 1 28 1 0 3
## 6 10006 35 3 1 35 0 1 3
## employment_status Home.Ownership Dependents Type.of.job Employment.Term
## 1 1 1 1 1 23
## 2 1 1 1 1 28
## 3 1 2 2 1 11
## 4 0 2 2 2 5
## 5 1 3 2 1 6
## 6 1 1 2 1 13
## Job.Gaps Annual.Income Car.Type loan_status_New Loan_Status Manufacturer
## 1 2 450991 2 1 2 3
## 2 0 351254 2 1 2 15
## 3 0 435040 2 1 2 15
## 4 1 283326 2 1 2 15
## 5 2 319140 5 1 2 15
## 6 0 454107 5 1 2 14
## Brand Price.Lakhs. new_vehicle_year Insurance.Coverage
## 1 1 4.890 2013 1
## 2 6 2.905 2010 1
## 3 6 2.905 2010 1
## 4 61 4.385 2011 1
## 5 38 2.625 2007 1
## 6 43 4.075 2012 1
## Months.Since.Last.Claim Months.Since.Policy.Inception
## 1 0 48
## 2 0 84
## 3 0 84
## 4 0 72
## 5 0 120
## 6 0 60
## Number.of.Open.Complaints Annual.Income.1 Monthly.Debt
## 1 0 450991 3758
## 2 0 351254 2927
## 3 0 435040 3625
## 4 0 283326 9123
## 5 0 319140 2508
## 6 0 454107 0
## Years.of.Credit.History Number.of.Credit.Problems Risk_Score Loan.Amount
## 1 17.2 1 693 389000
## 2 21.1 0 703 190500
## 3 14.9 1 715 240500
## 4 12.0 0 698 388500
## 5 6.1 0 509 212500
## 6 17.3 0 645 357500
## terms Interest EMI past_due_days Collection_Cost Sales.Channel
## 1 36 8 12190 0 486 1
## 2 84 12 3363 0 22 1
## 3 84 8 3748 0 433 1
## 4 36 12 12904 0 90 3
## 5 120 10 2808 0 341 1
## 6 36 10 11536 0 346 4
Observation
1. There are total ‘intRowCount’ data records in the file.
Missing Data
#sum(is.na(dfrModel$Age))
lapply(dfr_Model, FUN=detect_na)
## $Cust_ID
## [1] 0
##
## $Car_ID
## [1] 0
##
## $State
## [1] 0
##
## $location_code
## [1] 0
##
## $age
## [1] 0
##
## $Gender
## [1] 0
##
## $marital_status
## [1] 0
##
## $Education
## [1] 0
##
## $employment_status
## [1] 0
##
## $Home.Ownership
## [1] 0
##
## $Dependents
## [1] 0
##
## $Type.of.job
## [1] 0
##
## $Employment.Term
## [1] 0
##
## $Job.Gaps
## [1] 0
##
## $Annual.Income
## [1] 0
##
## $Car.Type
## [1] 0
##
## $loan_status_New
## [1] 0
##
## $Loan_Status
## [1] 0
##
## $Manufacturer
## [1] 0
##
## $Brand
## [1] 0
##
## $Price.Lakhs.
## [1] 0
##
## $new_vehicle_year
## [1] 0
##
## $Insurance.Coverage
## [1] 0
##
## $Months.Since.Last.Claim
## [1] 0
##
## $Months.Since.Policy.Inception
## [1] 0
##
## $Number.of.Open.Complaints
## [1] 0
##
## $Annual.Income.1
## [1] 0
##
## $Monthly.Debt
## [1] 0
##
## $Years.of.Credit.History
## [1] 0
##
## $Number.of.Credit.Problems
## [1] 0
##
## $Risk_Score
## [1] 0
##
## $Loan.Amount
## [1] 0
##
## $terms
## [1] 0
##
## $Interest
## [1] 0
##
## $EMI
## [1] 0
##
## $past_due_days
## [1] 0
##
## $Collection_Cost
## [1] 0
##
## $Sales.Channel
## [1] 0
Observations
There are no NA records in the data sets.
Summary
#summary(dfrModel)
str(dfr_Model)
## 'data.frame': 500 obs. of 38 variables:
## $ Cust_ID : int 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 ...
## $ Car_ID : int 14 19 19 21 36 35 15 15 20 19 ...
## $ State : int 1 4 5 2 1 3 3 4 3 3 ...
## $ location_code : int 2 2 2 2 1 1 2 3 2 3 ...
## $ age : int 45 50 33 27 28 35 29 36 28 26 ...
## $ Gender : int 0 0 0 1 1 0 0 1 1 0 ...
## $ marital_status : int 1 0 1 1 0 1 1 0 2 1 ...
## $ Education : int 3 3 3 3 3 3 2 5 3 2 ...
## $ employment_status : int 1 1 1 0 1 1 1 0 0 1 ...
## $ Home.Ownership : int 1 1 2 2 3 1 1 1 3 3 ...
## $ Dependents : int 1 1 2 2 2 2 1 1 1 1 ...
## $ Type.of.job : int 1 1 1 2 1 1 1 2 2 1 ...
## $ Employment.Term : int 23 28 11 5 6 13 7 14 6 4 ...
## $ Job.Gaps : int 2 0 0 1 2 0 2 0 0 2 ...
## $ Annual.Income : int 450991 351254 435040 283326 319140 454107 236398 216766 444254 411141 ...
## $ Car.Type : int 2 2 2 2 5 5 2 2 2 2 ...
## $ loan_status_New : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Loan_Status : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Manufacturer : int 3 15 15 15 15 14 5 5 15 15 ...
## $ Brand : int 1 6 6 61 38 43 41 41 3 6 ...
## $ Price.Lakhs. : num 4.89 2.9 2.9 4.38 2.62 ...
## $ new_vehicle_year : int 2013 2010 2010 2011 2007 2012 2011 2009 2007 2010 ...
## $ Insurance.Coverage : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Months.Since.Last.Claim : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Months.Since.Policy.Inception: int 48 84 84 72 120 60 72 96 120 84 ...
## $ Number.of.Open.Complaints : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Annual.Income.1 : int 450991 351254 435040 283326 319140 454107 236398 216766 444254 411141 ...
## $ Monthly.Debt : int 3758 2927 3625 9123 2508 0 1970 1806 3702 4029 ...
## $ Years.of.Credit.History : num 17.2 21.1 14.9 12 6.1 17.3 19.6 8.2 22.6 13.9 ...
## $ Number.of.Credit.Problems : int 1 0 1 0 0 0 1 0 0 0 ...
## $ Risk_Score : int 693 703 715 698 509 645 693 700 694 573 ...
## $ Loan.Amount : int 389000 190500 240500 388500 212500 357500 391500 341500 131000 190500 ...
## $ terms : int 36 84 84 36 120 36 36 84 120 60 ...
## $ Interest : int 8 12 8 12 10 10 8 10 10 12 ...
## $ EMI : int 12190 3363 3748 12904 2808 11536 12268 5669 1731 4238 ...
## $ past_due_days : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Collection_Cost : num 486 22 433 90 341 346 163 151 457 314 ...
## $ Sales.Channel : int 1 1 1 3 1 4 1 1 1 2 ...
lapply(dfr_Model, FUN=describe)
## $Cust_ID
## vars n mean sd median trimmed mad min max range skew
## X1 1 500 10250.5 144.48 10250.5 10250.5 185.32 10001 10500 499 0
## kurtosis se
## X1 -1.21 6.46
##
## $Car_ID
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 500 27.46 16.86 21 26.4 17.79 1 63 62 0.52 -0.86
## se
## X1 0.75
##
## $State
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 2.82 1.15 3 2.79 1.48 1 5 4 0.24 -0.77 0.05
##
## $location_code
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 1.95 0.57 2 1.94 0 1 3 2 0 0.02 0.03
##
## $age
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 500 31.12 6.08 30 30.66 5.93 18 51 33 0.72 0.37
## se
## X1 0.27
##
## $Gender
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 0.49 0.5 0 0.48 0 0 1 1 0.06 -2 0.02
##
## $marital_status
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 0.85 0.63 1 0.82 0 0 2 2 0.13 -0.57 0.03
##
## $Education
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 2.35 1.15 2 2.21 1.48 1 5 4 0.69 -0.01 0.05
##
## $employment_status
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 0.96 0.24 1 1 0 0 2 2 -2.6 13.23 0.01
##
## $Home.Ownership
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 1.92 0.96 1 1.89 0 1 4 3 0.18 -1.86 0.04
##
## $Dependents
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 1.14 0.35 1 1.05 0 1 2 1 2.02 2.09 0.02
##
## $Type.of.job
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 1.49 0.5 1 1.49 0 1 2 1 0.05 -2 0.02
##
## $Employment.Term
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 9.21 5.94 8 8.65 5.93 1 29 28 0.85 0.4 0.27
##
## $Job.Gaps
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 0.52 0.78 0 0.4 0 0 2 2 1.08 -0.51 0.03
##
## $Annual.Income
## vars n mean sd median trimmed mad min max
## X1 1 500 786157.7 473497.6 633332 721588.9 395002.5 205858 2479810
## range skew kurtosis se
## X1 2273952 1.32 1.68 21175.46
##
## $Car.Type
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 3.56 1.99 2 3.44 1.48 1 7 6 0.44 -1.34 0.09
##
## $loan_status_New
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 0.91 0.29 1 1 0 0 1 1 -2.86 6.17 0.01
##
## $Loan_Status
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 2.45 0.89 2 2.42 0 1 4 3 0.55 -0.62 0.04
##
## $Manufacturer
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 500 13.6 6.34 15 13.72 5.93 1 25 24 -0.12 -0.75
## se
## X1 0.28
##
## $Brand
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 500 33.09 18.86 35 33.37 22.24 1 63 62 -0.16 -1.11
## se
## X1 0.84
##
## $Price.Lakhs.
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 500 7.92 6.95 4.79 6.41 1.76 2.31 35 32.69 1.99 3.33
## se
## X1 0.31
##
## $new_vehicle_year
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 500 2009.97 1.92 2010 2009.95 1.48 2007 2014 7 -0.1 -0.89
## se
## X1 0.09
##
## $Insurance.Coverage
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 1.62 0.75 1 1.52 0 1 3 2 0.74 -0.85 0.03
##
## $Months.Since.Last.Claim
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 0.41 1.02 0 0.12 0 0 4 4 2.47 4.9 0.05
##
## $Months.Since.Policy.Inception
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 500 81.24 27.58 84 82.89 17.79 11 120 109 -0.41 -0.22
## se
## X1 1.23
##
## $Number.of.Open.Complaints
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 1.17 1.63 0 0.86 0 0 5 5 1.26 0.23 0.07
##
## $Annual.Income.1
## vars n mean sd median trimmed mad min max
## X1 1 500 786157.7 473497.6 633332 721588.9 395002.5 205858 2479810
## range skew kurtosis se
## X1 2273952 1.32 1.68 21175.46
##
## $Monthly.Debt
## vars n mean sd median trimmed mad min max range skew
## X1 1 500 12106.19 11699.33 8226 10195.78 8946.01 0 52945 52945 1.34
## kurtosis se
## X1 1.25 523.21
##
## $Years.of.Credit.History
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 18.2 7.1 16.75 17.38 5.56 4.9 50.1 45.2 1.37 2.66 0.32
##
## $Number.of.Credit.Problems
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 0.19 0.52 0 0.08 0 0 5 5 4.01 23.59 0.02
##
## $Risk_Score
## vars n mean sd median trimmed mad min max range skew
## X1 1 500 535.86 154.77 558.5 560.1 95.63 0 789 789 -2.31
## kurtosis se
## X1 5.84 6.92
##
## $Loan.Amount
## vars n mean sd median trimmed mad min max range
## X1 1 500 635578 581148.9 391500 506285 195703.2 131000 3300000 3169000
## skew kurtosis se
## X1 2.23 4.9 25989.77
##
## $terms
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 500 67.63 31.38 60 65.64 35.58 24 120 96 0.52 -1
## se
## X1 1.4
##
## $Interest
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 10.2 1.71 10 10.15 2.97 8 14 6 0.22 -0.98 0.08
##
## $EMI
## vars n mean sd median trimmed mad min max range
## X1 1 500 15250.39 17040.86 9333 11509.44 5961.53 1731 155342 153611
## skew kurtosis se
## X1 3.1 13.62 762.09
##
## $past_due_days
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 500 93.39 151.3 0 72.02 0 0 396 396 1.06 -0.86
## se
## X1 6.77
##
## $Collection_Cost
## vars n mean sd median trimmed mad min max range skew
## X1 1 500 1205.52 2232.76 455.5 782.19 505.57 2 25000 24998 6.56
## kurtosis se
## X1 58.15 99.85
##
## $Sales.Channel
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 500 1.95 1.06 2 1.81 1.48 1 4 3 0.72 -0.82 0.05
Box Plot
lapply(dfr_Model, FUN=Graph_Boxplot)
## $Cust_ID
##
## $Car_ID
##
## $State
##
## $location_code
##
## $age
##
## $Gender
##
## $marital_status
##
## $Education
##
## $employment_status
##
## $Home.Ownership
##
## $Dependents
##
## $Type.of.job
##
## $Employment.Term
##
## $Job.Gaps
##
## $Annual.Income
##
## $Car.Type
##
## $loan_status_New
##
## $Loan_Status
##
## $Manufacturer
##
## $Brand
##
## $Price.Lakhs.
##
## $new_vehicle_year
##
## $Insurance.Coverage
##
## $Months.Since.Last.Claim
##
## $Months.Since.Policy.Inception
##
## $Number.of.Open.Complaints
##
## $Annual.Income.1
##
## $Monthly.Debt
##
## $Years.of.Credit.History
##
## $Number.of.Credit.Problems
##
## $Risk_Score
##
## $Loan.Amount
##
## $terms
##
## $Interest
##
## $EMI
##
## $past_due_days
##
## $Collection_Cost
##
## $Sales.Channel
Observation
Here we can see that there are some outliers in the data but they may be useful while making the cluster.
Subset of Data
dfr_Model1 <- filter(dfr_Model, Cust_ID>=10001 )
No of Outliers in Each Variable
lapply(dfr_Model1, FUN=detect_outliers)
## $Cust_ID
## [1] 0
##
## $Car_ID
## [1] 0
##
## $State
## [1] 0
##
## $location_code
## [1] 165
##
## $age
## [1] 6
##
## $Gender
## [1] 0
##
## $marital_status
## [1] 0
##
## $Education
## [1] 0
##
## $employment_status
## [1] 29
##
## $Home.Ownership
## [1] 0
##
## $Dependents
## [1] 72
##
## $Type.of.job
## [1] 0
##
## $Employment.Term
## [1] 6
##
## $Job.Gaps
## [1] 0
##
## $Annual.Income
## [1] 19
##
## $Car.Type
## [1] 0
##
## $loan_status_New
## [1] 45
##
## $Loan_Status
## [1] 0
##
## $Manufacturer
## [1] 0
##
## $Brand
## [1] 0
##
## $Price.Lakhs.
## [1] 75
##
## $new_vehicle_year
## [1] 0
##
## $Insurance.Coverage
## [1] 0
##
## $Months.Since.Last.Claim
## [1] 83
##
## $Months.Since.Policy.Inception
## [1] 0
##
## $Number.of.Open.Complaints
## [1] 0
##
## $Annual.Income.1
## [1] 19
##
## $Monthly.Debt
## [1] 28
##
## $Years.of.Credit.History
## [1] 20
##
## $Number.of.Credit.Problems
## [1] 80
##
## $Risk_Score
## [1] 30
##
## $Loan.Amount
## [1] 58
##
## $terms
## [1] 0
##
## $Interest
## [1] 0
##
## $EMI
## [1] 63
##
## $past_due_days
## [1] 0
##
## $Collection_Cost
## [1] 51
##
## $Sales.Channel
## [1] 0
Outliers Obsevations
1. We can see that there are outliers in the data, still we will go with outliers.
As all the variables are having different data ranges so we are going with Data Normalization so it will be easy to find out the Euclidian Distance which is very useful while doing the cluster analysis.
Data Normalization
dfr_Model2 <- select(dfr_Model1, -c(Cust_ID))
m <- apply(dfr_Model2, 2, mean)
s <- apply(dfr_Model2, 2, sd)
dfr_Model2 <- scale(dfr_Model2,m,s)
class(dfr_Model2)
## [1] "matrix"
dfr_Model2 <- as.data.frame(dfr_Model2)
Observations
1. As all the data values are different in different variables so for Cluster analysis we need to normalize the data
2. Data is normalized successfully to get the better clusters through Euclidien distance.
Calculating the Euclidean Distance
distance <- dist(dfr_Model2)
Observation
No errors. Distance is calucalated successfully.
hc.c <- hclust(distance)
hc.c
##
## Call:
## hclust(d = distance)
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 500
#plot(hc.c, labels=dfr_Model$Player_ID)
#plot(hc.c, hang=-1)
hc.d <- hclust(distance, method="ward.D")
hc.d
##
## Call:
## hclust(d = distance, method = "ward.D")
##
## Cluster method : ward.D
## Distance : euclidean
## Number of objects: 500
Cluster Membership With 5 cluster
member.c <- cutree(hc.c, 5)
member.d <- cutree(hc.d, 5)
Cluster Membership Comparison
table(member.c, member.d)
## member.d
## member.c 1 2 3 4 5
## 1 257 22 0 104 2
## 2 0 3 37 0 0
## 3 0 0 0 4 62
## 4 0 0 5 0 0
## 5 0 0 0 0 4
Observations Complete Linkage is putting most of the data in few Cluster only.
We can see that Complete Linkage Method is not giving better results compare to Wards method.
wss <- (nrow(dfr_Model2)-1)*sum(apply(dfr_Model2,2,var))
for( i in 2:20) wss[i] <- sum(kmeans(dfr_Model2, centers=i)$withinss)
plot(1:20, wss, type="b", Xlab="Number of Cluster", ylab = "Within group SS")
## Warning in plot.window(...): "Xlab" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "Xlab" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "Xlab" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "Xlab" is not
## a graphical parameter
## Warning in box(...): "Xlab" is not a graphical parameter
## Warning in title(...): "Xlab" is not a graphical parameter
Observations
We can see that At starting withon group Sum of square changes is very high, after Cluster 3 it is going near to flat & there are no large changes.
So here 3 Cluster would be good to take.
Cluster Membership With 4 cluster
member.c <- cutree(hc.c, 3)
member.d <- cutree(hc.d, 3)
Cluster Membership Comparison
table(member.c, member.d)
## member.d
## member.c 1 2 3
## 1 282 141 2
## 2 0 4 66
## 3 0 5 0
Observations
Complete Linkage is putting most of the data in few Cluster only.
We can see that Complete Linkage Method is not giving better results compare to Wards method.
Cluster Means
aggregate(dfr_Model2, list(member.c), mean)
## Group.1 Car_ID State location_code age Gender
## 1 1 0.07206652 -0.03920719 0.001026859 -0.02486434 -0.0119927
## 2 2 -0.41357423 0.21473520 -0.012468999 0.15232376 0.0565370
## 3 3 -0.33561490 0.32631816 0.087282992 -0.01906395 0.2278612
## marital_status Education employment_status Home.Ownership
## 1 0.01825486 0.0008195823 -0.03122336 0.01396158
## 2 -0.10495214 -0.0079616562 0.17693239 -0.07614135
## 3 -0.08233315 0.0417986951 0.17693239 -0.12075542
## Dependents Type.of.job Employment.Term Job.Gaps Annual.Income
## 1 -0.001339024 -0.1805771 -0.02309196 -0.04318847 -0.3084246
## 2 -0.043900843 1.0232702 0.14272842 0.18116722 1.8005235
## 3 0.728428799 1.0232702 -0.03538124 1.13467889 1.0087609
## Car.Type loan_status_New Loan_Status Manufacturer Brand
## 1 0.02130927 -0.01437383 -0.2683768 -0.03480576 0.0270448
## 2 -0.12362757 0.31417081 1.7454402 0.17531622 -0.2199189
## 3 -0.08050167 -3.17661595 -1.6241355 0.50406227 0.7800568
## Price.Lakhs. new_vehicle_year Insurance.Coverage Months.Since.Last.Claim
## 1 -0.3733693 -0.007273602 -0.326389 0.02918946
## 2 2.1068530 -0.068134356 1.849538 -0.31731131
## 3 2.2404511 1.572137183 1.849538 1.96125397
## Months.Since.Policy.Inception Number.of.Open.Complaints Annual.Income.1
## 1 -0.001719337 -0.07563786 -0.3084246
## 2 0.168544396 0.38771619 1.8005235
## 3 -2.213477900 1.00119116 1.0087609
## Monthly.Debt Years.of.Credit.History Number.of.Credit.Problems
## 1 -0.1277716 0.001273702 -0.02472924
## 2 0.6611571 -0.024380163 0.03911951
## 3 1.6043831 0.233057624 1.55431194
## Risk_Score Loan.Amount terms Interest EMI past_due_days
## 1 0.03138842 -0.3551956 0.014127726 -0.1719535 -0.3128304 0.06327707
## 2 -0.17069887 1.9827852 0.002622435 0.9855257 1.6038157 -0.49966208
## 3 -0.27823139 2.4326330 -1.237570813 0.8186854 4.1371635 1.61671795
## Collection_Cost Sales.Channel
## 1 -0.2307256 0.002441793
## 2 1.3907540 -0.031806735
## 3 0.1411173 0.237741870
aggregate(dfr_Model2, list(member.d), mean)
## Group.1 Car_ID State location_code age Gender
## 1 1 -0.1861868 -0.04729570 0.03776073 0.03571751 0.04215872
## 2 2 0.5442521 0.04281479 -0.08728299 -0.14615693 -0.10526922
## 3 3 -0.4284284 0.10169364 0.03594006 0.17428238 0.05737682
## marital_status Education employment_status Home.Ownership Dependents
## 1 0.04904953 -0.003903188 -0.1367775 -0.001151739 0.04431598
## 2 -0.06122208 -0.010449674 0.1769324 0.038863815 -0.06829020
## 3 -0.06836259 0.039237501 0.1769324 -0.080952673 -0.03314083
## Type.of.job Employment.Term Job.Gaps Annual.Income Car.Type
## 1 -0.7910387 0.03810564 -0.02508885 -0.6410811 -0.2442883
## 2 1.0232702 -0.14657944 -0.02909433 0.4017135 0.5333236
## 3 1.0232702 0.16531069 0.16822362 1.7724684 -0.1633710
## loan_status_New Loan_Status Manufacturer Brand Price.Lakhs.
## 1 0.2770348 -0.5049265 -0.08706101 0.06405228 -0.52296727
## 2 -0.6632495 0.1579957 0.10161105 -0.02845890 0.03176565
## 3 0.3141708 1.7454402 0.13690511 -0.20285158 2.09870475
## new_vehicle_year Insurance.Coverage Months.Since.Last.Claim
## 1 -0.11177176 -0.6836199 -0.1154106
## 2 0.24903429 0.4467483 0.3596945
## 3 -0.08581628 1.8495379 -0.3148292
## Months.Since.Policy.Inception Number.of.Open.Complaints Annual.Income.1
## 1 0.1913352 -0.4141522 -0.6410811
## 2 -0.4428310 0.5881180 0.4017135
## 3 0.1833547 0.4201943 1.7724684
## Monthly.Debt Years.of.Credit.History Number.of.Credit.Problems
## 1 -0.6124938 -0.04143570 -0.04587194
## 2 0.8434724 0.06636666 0.05013909
## 3 0.6794469 0.02543925 0.07963268
## Risk_Score Loan.Amount terms Interest EMI past_due_days
## 1 0.08126691 -0.50352994 0.12157745 -0.1218325 -0.4568316 -0.5932598
## 2 -0.16826208 0.05720622 -0.24833003 -0.2168368 0.1428128 1.3421692
## 3 0.03414772 1.96197808 0.04359798 0.9835629 1.5794793 -0.5003841
## Collection_Cost Sales.Channel
## 1 -0.4081416 0.08585792
## 2 0.1329928 -0.17107352
## 3 1.3992207 0.02131020
Observation
Through Cluster means we get to know which variables are important while forming the clusters.
Variables which are having high mean difference between the the clusters are good variables to formulate the clusters.
Dendogram
plot(hc.d) # display dendogram
groups <- cutree(hc.d, k=3) # cut tree into 5 clusters
# draw dendogram with red borders around the 5 clusters
rect.hclust(hc.d, k=3, border="red")
m.a <- kmeans(dfr_Model2,3)
m.a
## K-means clustering with 3 clusters of sizes 393, 25, 82
##
## Cluster means:
## Car_ID State location_code age Gender
## 1 0.08965542 -0.05483686 0.01621287 -0.02784571 -0.02033367
## 2 -0.47085738 0.25688876 -0.05236979 0.11898533 0.06795861
## 3 -0.28613594 0.18449592 -0.06173675 0.09717964 0.07673375
## marital_status Education employment_status Home.Ownership Dependents
## 1 0.02080487 -0.01182346 0.2198095 0.01062716 0.002954029
## 2 -0.01899996 0.32045666 -4.0357436 0.00416398 -0.068290200
## 3 -0.09391848 -0.04103408 0.1769324 -0.05220209 0.006662459
## Type.of.job Employment.Term Job.Gaps Annual.Income Car.Type
## 1 -0.2786003 -0.02509226 -0.03196849 -0.2808235 0.04086535
## 2 1.0232702 0.11288302 -0.14889451 -1.0525634 -0.56351172
## 3 1.0232702 0.08584369 0.19860950 1.6668013 -0.02405233
## loan_status_New Loan_Status Manufacturer Brand Price.Lakhs.
## 1 0.01216890 -0.2551561 -0.03071585 0.02207923 -0.3851532
## 2 -0.10472360 -0.6357266 -0.07597186 0.19037416 -0.5547498
## 3 -0.02639375 1.4167011 0.17037349 -0.16385965 2.0150483
## new_vehicle_year Insurance.Coverage Months.Since.Last.Claim
## 1 -0.01689771 -0.3296389 -0.008377573
## 2 -0.09040049 -0.7773420 0.425332187
## 3 0.10854648 1.8168490 -0.089523396
## Months.Since.Policy.Inception Number.of.Open.Complaints Annual.Income.1
## 1 0.01323827 -0.06403867 -0.2808235
## 2 0.03198479 -0.39753178 -1.0525634
## 3 -0.07319831 0.42811576 1.6668013
## Monthly.Debt Years.of.Credit.History Number.of.Credit.Problems
## 1 -0.1098259 -0.00294032 -0.02572217
## 2 -0.7678892 -0.13749189 -0.06556651
## 3 0.7604733 0.05601028 0.14326800
## Risk_Score Loan.Amount terms Interest EMI past_due_days
## 1 0.05264686 -0.3594777 0.03021750 -0.1854555 -0.3234045 0.08400829
## 2 -0.25109443 -0.5352811 -0.09025547 -0.2090509 -0.4420127 -0.34755464
## 3 -0.17576651 1.8860580 -0.11730600 0.9525646 1.6847350 -0.29666332
## Collection_Cost Sales.Channel
## 1 -0.2415965 0.005847764
## 2 -0.3271882 -0.101889373
## 3 1.2576480 0.003037353
##
## Clustering vector:
## [1] 1 1 1 2 1 1 1 2 2 1 1 2 1 1 1 1 2 2 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 2 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 2 1 1 1 2 2 1 1 1 1 1 2 1
## [71] 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 2 2 1 1 1 1 1 1 1
## [106] 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [246] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1
## [281] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [316] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [351] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [386] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 3 1 1 1 1 1
## [421] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [456] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [491] 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 11601.3937 583.4032 3167.6807
## (between_SS / total_SS = 16.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
Observations
Cluster Membership Comparison
table(member.d, m.a$cluster)
##
## member.d 1 2 3
## 1 257 25 0
## 2 136 0 14
## 3 0 0 68
Cluster Plots
dfr_Model2$Player_ID <- dfr_Model1$Player_ID
plot(dfr_Model2[c("Annual.Income","Collection_Cost")], main="K Means Clustering", col=member.d)
###########End of the Project#########