1. Load necessary libraries
library(GGally)
library(knitr)
library(ggplot2)
library(dplyr)
library(corrr)
library(MASS)
library(corrgram)
# Read the raw customer table, keeping text columns as character vectors.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
Cluster <- read.csv(
  "clusteringData.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Get rid of the percent sign in both profitability columns, then convert the
# remaining text to numeric. fixed = TRUE matches a literal "%" (no regex).
Cluster$Positive_Profitability <- as.numeric(
  gsub("%", "", Cluster$Positive_Profitability, fixed = TRUE)
)
Cluster$Profitability <- as.numeric(
  gsub("%", "", Cluster$Profitability, fixed = TRUE)
)

# Verify no percent sign is left: first six rows, ordered by Profitability.
kable(head(Cluster) %>% arrange(Profitability))
Random | Customer_Index | Average_Risk_Grade | Max_of_Cards | Account_Open_Date | Years_on_Books | Gross_Revenue_Amt | Total_Expense_Amt | Net_Profit | Net_Profit_Card | Profitability | Positive_Profitability | Card_Fees_Amt | Purchase_Gallons_Qty | Average_Gallons_Card | Acquisition_Cost | Rebate_Amt | Net_Late_Fee_Count | No_Accounts_by_Customer |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0.7806251 | Company 31528 | 6 | 1 | 2012-03-21 | 4 | 9022.74 | 573.07 | 8449.67 | 8449.67 | 94 | 144 | 24 | 41383 | 41383 | 0 | 0 | 10 | 1 |
0.1023051 | Company 40586 | 1 | 1 | 2008-11-11 | 7 | 4143.15 | 223.47 | 3919.68 | 3919.68 | 95 | 145 | 26 | 21854 | 21854 | 0 | 0 | 11 | 1 |
0.5832711 | Company 14002 | 1 | 4 | 2012-08-15 | 3 | 62047.75 | 77.27 | 61970.48 | 15492.62 | 100 | 150 | 74 | 5210 | 1303 | 0 | 0 | 1 | 1 |
0.2187821 | Company 50814 | 3 | 10 | 2014-10-24 | 1 | 99575.62 | 280.61 | 99295.01 | 9929.50 | 100 | 150 | 0 | 2150 | 215 | 0 | 0 | 11 | 1 |
0.4913845 | Company 52955 | 1 | 1 | 2014-03-10 | 2 | 7879.56 | NA | 7879.56 | 7879.56 | 100 | 150 | 24 | 0 | 0 | 0 | 0 | 4 | 1 |
0.9517827 | Company 16765 | 1 | 1 | 2000-09-08 | 15 | 4272.36 | 1.04 | 4271.32 | 4271.32 | 100 | 150 | 2 | 50 | 50 | 0 | 0 | 0 | 1 |
# Coerce both profitability columns to numeric in a single pass.
Cluster[c("Positive_Profitability", "Profitability")] <- lapply(
  Cluster[c("Positive_Profitability", "Profitability")],
  as.numeric
)
# Integer alternative, kept for reference:
# Cluster$Profitability <- as.integer(Cluster$Profitability)
# Cluster$Positive_Profitability <- as.integer(Cluster$Positive_Profitability)

# Parse the account-opening date from month/day/four-digit-year text.
Cluster[["Account_Open_Date"]] <- as.Date(
  Cluster[["Account_Open_Date"]],
  format = "%m/%d/%Y"
)

# Per-column overview of the cleaned table.
summary(Cluster)
Random Customer_Index
Min. :0.0000058 Length:55898
1st Qu.:0.2480272 Class :character
Median :0.4963082 Mode :character
Mean :0.4985677
3rd Qu.:0.7492804
Max. :0.9999339
Average_Risk_Grade Max_of_Cards
Min. :0.000 Min. : 1.00
1st Qu.:1.000 1st Qu.: 4.00
Median :1.000 Median : 8.00
Mean :1.718 Mean :12.22
3rd Qu.:2.000 3rd Qu.:15.00
Max. :7.000 Max. :79.00
Account_Open_Date Years_on_Books
Min. :1983-07-01 Min. : 0.000
1st Qu.:2005-03-10 1st Qu.: 3.000
Median :2009-02-20 Median : 7.000
Mean :2008-07-17 Mean : 7.205
3rd Qu.:2012-08-30 3rd Qu.:11.000
Max. :2016-02-02 Max. :32.000
Gross_Revenue_Amt Total_Expense_Amt Net_Profit
Min. : 0.95 Min. : 0.18 Min. : -378.9
1st Qu.: 512.61 1st Qu.: 46.07 1st Qu.: 420.4
Median : 935.35 Median : 111.25 Median : 810.6
Mean : 1697.13 Mean : 205.96 Mean : 1492.3
3rd Qu.: 1944.61 3rd Qu.: 265.72 3rd Qu.: 1704.2
Max. :99575.62 Max. :6445.75 Max. :99295.0
NA's :302
Net_Profit_Card Profitability
Min. : -229.89 Min. :-50.00
1st Qu.: 67.95 1st Qu.: 85.00
Median : 103.44 Median : 89.00
Mean : 136.28 Mean : 84.77
3rd Qu.: 164.34 3rd Qu.: 93.00
Max. :15492.62 Max. :100.00
Positive_Profitability Card_Fees_Amt
Min. : 0.0 Min. :-626.0
1st Qu.:135.0 1st Qu.: 74.0
Median :139.0 Median : 150.0
Mean :134.8 Mean : 242.5
3rd Qu.:143.0 3rd Qu.: 300.0
Max. :150.0 Max. :1912.0
Purchase_Gallons_Qty Average_Gallons_Card
Min. : 0 Min. : 0
1st Qu.: 3210 1st Qu.: 591
Median : 7702 Median : 986
Mean : 15650 Mean : 1297
3rd Qu.: 17751 3rd Qu.: 1524
Max. :1850413 Max. :108848
Acquisition_Cost Rebate_Amt
Min. : 0.000 Min. :-960.000
1st Qu.: 0.000 1st Qu.: 0.000
Median : 0.000 Median : 0.000
Mean : 5.666 Mean : -1.893
3rd Qu.: 0.000 3rd Qu.: 0.000
Max. :475.000 Max. : 0.000
Net_Late_Fee_Count No_Accounts_by_Customer
Min. :-2.000 Min. :1
1st Qu.: 0.000 1st Qu.:1
Median : 2.000 Median :1
Mean : 2.915 Mean :1
3rd Qu.: 5.000 3rd Qu.:1
Max. :12.000 Max. :1
# Reproducible 30-row sample used for the exploratory plots and trial models.
set.seed(121)
sampleSet <- sample_n(Cluster, 30)

# Card fees against profitability with a straight-line fit.
ggplot(sampleSet, aes(x = Profitability, y = Card_Fees_Amt)) +
  geom_point(alpha = 0.1, color = "blue") +
  geom_smooth(method = "lm")

# The same relationship on log10 axes.
# NOTE(review): log axes cannot display zero or negative values; confirm the
# sampled rows are strictly positive on both variables.
ggplot(sampleSet, aes(x = Profitability, y = Card_Fees_Amt)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm")

# Card fees against positive profitability, coloured by risk grade:
# first in a single panel, then with one panel per grade.
risk_grade_plot <- ggplot(
  sampleSet,
  aes(
    x = Positive_Profitability,
    y = Card_Fees_Amt,
    color = factor(Average_Risk_Grade)
  )
) +
  geom_point(alpha = 0.3)
risk_grade_plot
risk_grade_plot + facet_wrap(~ Average_Risk_Grade)

# Scatterplot matrix of columns 3 through 18 of the sample.
newdata <- sampleSet[, 3:18]
plot(newdata, pch = 16, col = "blue", main = "Matrix Scatterplot of ...")

# First trial regression on the sample.
mod1 <- lm(
  formula = Positive_Profitability ~ Average_Gallons_Card +
    Net_Late_Fee_Count + Net_Profit_Card + Years_on_Books + Card_Fees_Amt +
    No_Accounts_by_Customer + Rebate_Amt + Average_Risk_Grade +
    Gross_Revenue_Amt + Purchase_Gallons_Qty,
  data = sampleSet
)
summary(mod1)
Call:
lm(formula = Positive_Profitability ~ Average_Gallons_Card +
Net_Late_Fee_Count + Net_Profit_Card + Years_on_Books + Card_Fees_Amt +
No_Accounts_by_Customer + Rebate_Amt + Average_Risk_Grade +
Gross_Revenue_Amt + Purchase_Gallons_Qty, data = sampleSet)
Residuals:
Min 1Q Median 3Q Max
-32.587 -4.390 1.088 8.801 20.728
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.343e+02 1.102e+01 12.189 1.03e-10 ***
Average_Gallons_Card -2.557e-02 8.107e-03 -3.154 0.00500 **
Net_Late_Fee_Count -1.235e+00 1.675e+00 -0.738 0.46937
Net_Profit_Card 1.706e-01 5.549e-02 3.075 0.00597 **
Years_on_Books 9.266e-01 6.299e-01 1.471 0.15686
Card_Fees_Amt 3.038e-02 3.518e-02 0.863 0.39817
No_Accounts_by_Customer NA NA NA NA
Rebate_Amt 2.088e+02 2.318e+02 0.901 0.37841
Average_Risk_Grade -1.431e+00 2.722e+00 -0.526 0.60491
Gross_Revenue_Amt -5.873e-03 1.290e-02 -0.455 0.65372
Purchase_Gallons_Qty 8.866e-04 9.550e-04 0.928 0.36428
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 15.07 on 20 degrees of freedom
Multiple R-squared: 0.6004, Adjusted R-squared: 0.4205
F-statistic: 3.338 on 9 and 20 DF, p-value: 0.01185
# Second trial regression: the mod1 predictors plus Max_of_Cards,
# Total_Expense_Amt, Net_Profit and Acquisition_Cost.
mod2 <- lm(
  formula = Positive_Profitability ~ Average_Gallons_Card + Max_of_Cards +
    Net_Late_Fee_Count + Total_Expense_Amt + Net_Profit + Net_Profit_Card +
    Years_on_Books + Card_Fees_Amt + No_Accounts_by_Customer + Rebate_Amt +
    Average_Risk_Grade + Gross_Revenue_Amt + Purchase_Gallons_Qty +
    Acquisition_Cost,
  data = sampleSet
)
summary(mod2)
Call:
lm(formula = Positive_Profitability ~ Average_Gallons_Card +
Max_of_Cards + Net_Late_Fee_Count + Total_Expense_Amt + Net_Profit +
Net_Profit_Card + Years_on_Books + Card_Fees_Amt + No_Accounts_by_Customer +
Rebate_Amt + Average_Risk_Grade + Gross_Revenue_Amt + Purchase_Gallons_Qty +
Acquisition_Cost, data = sampleSet)
Residuals:
Min 1Q Median 3Q Max
-10.2186 -3.5368 0.3706 3.7782 13.9619
Coefficients: (3 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.275e+02 5.833e+00 21.852 2.08e-14 ***
Average_Gallons_Card -2.718e-03 4.973e-03 -0.547 0.59136
Max_of_Cards 2.013e+00 6.051e-01 3.327 0.00375 **
Net_Late_Fee_Count -1.510e+00 8.435e-01 -1.790 0.09023 .
Total_Expense_Amt -1.045e-01 1.512e-02 -6.914 1.83e-06 ***
Net_Profit -1.607e-03 6.417e-03 -0.250 0.80505
Net_Profit_Card 1.015e-01 3.083e-02 3.292 0.00405 **
Years_on_Books 2.313e-01 3.378e-01 0.685 0.50223
Card_Fees_Amt 1.715e-02 2.924e-02 0.587 0.56473
No_Accounts_by_Customer NA NA NA NA
Rebate_Amt 4.079e+02 1.182e+02 3.452 0.00284 **
Average_Risk_Grade 4.954e-01 1.372e+00 0.361 0.72220
Gross_Revenue_Amt NA NA NA NA
Purchase_Gallons_Qty 1.428e-04 4.835e-04 0.295 0.77111
Acquisition_Cost NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 7.471 on 18 degrees of freedom
Multiple R-squared: 0.9116, Adjusted R-squared: 0.8575
F-statistic: 16.87 on 11 and 18 DF, p-value: 2.77e-07
# Third trial regression: a reduced set of seven predictors on the sample.
mod3 <- lm(
  formula = Positive_Profitability ~ Average_Gallons_Card + Max_of_Cards +
    Total_Expense_Amt + Net_Profit_Card + Years_on_Books +
    Purchase_Gallons_Qty + Acquisition_Cost,
  data = sampleSet
)
summary(mod3)
Call:
lm(formula = Positive_Profitability ~ Average_Gallons_Card +
Max_of_Cards + Total_Expense_Amt + Net_Profit_Card + Years_on_Books +
Purchase_Gallons_Qty + Acquisition_Cost, data = sampleSet)
Residuals:
Min 1Q Median 3Q Max
-22.5467 -3.3218 -0.1206 3.7106 21.7142
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.260e+02 6.744e+00 18.685 2.12e-15 ***
Average_Gallons_Card 4.329e-03 5.793e-03 0.747 0.462511
Max_of_Cards 1.415e+00 3.498e-01 4.046 0.000502 ***
Total_Expense_Amt -1.049e-01 1.670e-02 -6.284 2.07e-06 ***
Net_Profit_Card 5.439e-02 3.105e-02 1.752 0.093136 .
Years_on_Books 4.045e-01 4.117e-01 0.983 0.336034
Purchase_Gallons_Qty 6.039e-05 1.961e-04 0.308 0.760868
Acquisition_Cost NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9.701 on 23 degrees of freedom
Multiple R-squared: 0.8095, Adjusted R-squared: 0.7598
F-statistic: 16.29 on 6 and 23 DF, p-value: 3.006e-07
# Discard the listed column positions from the sample.
sampleSet <- sampleSet[, -c(1, 2, 3, 5, 7, 9, 11, 12, 13, 17, 18, 19)]

# Modelling table on the full data: the response, the risk grade and the
# seven predictors carried over from mod3.
Cluster1 <- dplyr::select(
  Cluster,
  Positive_Profitability,
  Average_Risk_Grade,
  Average_Gallons_Card,
  Max_of_Cards,
  Total_Expense_Amt,
  Net_Profit_Card,
  Years_on_Books,
  Purchase_Gallons_Qty,
  Acquisition_Cost
)

# All seven predictors on the full data: R-squared is about 56%.
c1 <- lm(
  formula = Positive_Profitability ~ Average_Gallons_Card + Max_of_Cards +
    Total_Expense_Amt + Net_Profit_Card + Years_on_Books +
    Purchase_Gallons_Qty + Acquisition_Cost,
  data = Cluster1
)
Dropping 302 rows with missing values
# Print the coefficient table and fit statistics for c1.
summary(c1)
Call:
lm(formula = Positive_Profitability ~ Average_Gallons_Card +
Max_of_Cards + Total_Expense_Amt + Net_Profit_Card + Years_on_Books +
Purchase_Gallons_Qty + Acquisition_Cost, data = Cluster1)
Residuals:
Min 1Q Median 3Q Max
-505.88 -1.81 1.87 4.94 154.88
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.307e+02 1.356e-01 963.66 <2e-16 ***
Average_Gallons_Card -1.764e-03 5.666e-05 -31.14 <2e-16 ***
Max_of_Cards 3.489e-01 7.134e-03 48.90 <2e-16 ***
Total_Expense_Amt -4.596e-02 3.845e-04 -119.54 <2e-16 ***
Net_Profit_Card 3.400e-02 4.330e-04 78.52 <2e-16 ***
Years_on_Books 5.453e-01 1.208e-02 45.12 <2e-16 ***
Purchase_Gallons_Qty 2.529e-04 4.193e-06 60.31 <2e-16 ***
Acquisition_Cost -1.852e-01 1.866e-03 -99.27 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 13.01 on 55588 degrees of freedom
(302 observations deleted due to missingness)
Multiple R-squared: 0.566, Adjusted R-squared: 0.5659
F-statistic: 1.036e+04 on 7 and 55588 DF, p-value: < 2.2e-16
# Acquisition_Cost dropped: R-squared falls to about 49%.
c2 <- lm(
  formula = Positive_Profitability ~ Average_Gallons_Card + Max_of_Cards +
    Total_Expense_Amt + Net_Profit_Card + Years_on_Books +
    Purchase_Gallons_Qty,
  data = Cluster1
)
Dropping 302 rows with missing values
# Print the coefficient table and fit statistics for c2.
summary(c2)
Call:
lm(formula = Positive_Profitability ~ Average_Gallons_Card +
Max_of_Cards + Total_Expense_Amt + Net_Profit_Card + Years_on_Books +
Purchase_Gallons_Qty, data = Cluster1)
Residuals:
Min 1Q Median 3Q Max
-589.13 -1.94 2.20 5.60 223.04
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.293e+02 1.464e-01 883.43 <2e-16 ***
Average_Gallons_Card -2.132e-03 6.134e-05 -34.75 <2e-16 ***
Max_of_Cards 4.666e-01 7.633e-03 61.12 <2e-16 ***
Total_Expense_Amt -6.493e-02 3.620e-04 -179.36 <2e-16 ***
Net_Profit_Card 3.950e-02 4.659e-04 84.77 <2e-16 ***
Years_on_Books 6.673e-01 1.304e-02 51.16 <2e-16 ***
Purchase_Gallons_Qty 3.576e-04 4.402e-06 81.23 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 14.12 on 55589 degrees of freedom
(302 observations deleted due to missingness)
Multiple R-squared: 0.489, Adjusted R-squared: 0.489
F-statistic: 8868 on 6 and 55589 DF, p-value: < 2.2e-16
# Acquisition_Cost and Purchase_Gallons_Qty dropped: R-squared about 43%.
c3 <- lm(
  formula = Positive_Profitability ~ Average_Gallons_Card + Max_of_Cards +
    Total_Expense_Amt + Net_Profit_Card + Years_on_Books,
  data = Cluster1
)
Dropping 302 rows with missing values
# Print the coefficient table and fit statistics for c3.
summary(c3)
Call:
lm(formula = Positive_Profitability ~ Average_Gallons_Card +
Max_of_Cards + Total_Expense_Amt + Net_Profit_Card + Years_on_Books,
data = Cluster1)
Residuals:
Min 1Q Median 3Q Max
-566.62 -2.48 2.36 6.37 286.67
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.246e+02 1.424e-01 875.45 <2e-16 ***
Average_Gallons_Card 6.815e-04 5.355e-05 12.73 <2e-16 ***
Max_of_Cards 7.784e-01 6.978e-03 111.54 <2e-16 ***
Total_Expense_Amt -5.411e-02 3.560e-04 -151.98 <2e-16 ***
Net_Profit_Card 3.807e-02 4.924e-04 77.32 <2e-16 ***
Years_on_Books 7.743e-01 1.373e-02 56.41 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 14.93 on 55590 degrees of freedom
(302 observations deleted due to missingness)
Multiple R-squared: 0.4284, Adjusted R-squared: 0.4283
F-statistic: 8333 on 5 and 55590 DF, p-value: < 2.2e-16
# Acquisition_Cost, Purchase_Gallons_Qty and Years_on_Books dropped:
# R-squared about 40%.
c4 <- lm(
  formula = Positive_Profitability ~ Average_Gallons_Card + Max_of_Cards +
    Total_Expense_Amt + Net_Profit_Card,
  data = Cluster1
)
Dropping 302 rows with missing values
# Print the coefficient table and fit statistics for the four-predictor c4.
summary(c4)
Call:
lm(formula = Positive_Profitability ~ Average_Gallons_Card +
Max_of_Cards + Total_Expense_Amt + Net_Profit_Card, data = Cluster1)
Residuals:
Min 1Q Median 3Q Max
-574.24 -0.82 2.83 5.36 311.75
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.296e+02 1.144e-01 1132.91 <2e-16 ***
Average_Gallons_Card 8.107e-04 5.501e-05 14.74 <2e-16 ***
Max_of_Cards 8.975e-01 6.839e-03 131.23 <2e-16 ***
Total_Expense_Amt -5.942e-02 3.530e-04 -168.32 <2e-16 ***
Net_Profit_Card 3.838e-02 5.063e-04 75.80 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 15.36 on 55591 degrees of freedom
(302 observations deleted due to missingness)
Multiple R-squared: 0.3957, Adjusted R-squared: 0.3956
F-statistic: 9100 on 4 and 55591 DF, p-value: < 2.2e-16
# Acquisition_Cost, Purchase_Gallons_Qty, Years_on_Books and Net_Profit_Card
# dropped: R-squared about 33%. Note that c4 is overwritten here.
c4 <- lm(
  formula = Positive_Profitability ~ Average_Gallons_Card + Max_of_Cards +
    Total_Expense_Amt,
  data = Cluster1
)
Dropping 302 rows with missing values
# Print the coefficient table and fit statistics for the three-predictor c4.
summary(c4)
Call:
lm(formula = Positive_Profitability ~ Average_Gallons_Card +
Max_of_Cards + Total_Expense_Amt, data = Cluster1)
Residuals:
Min 1Q Median 3Q Max
-303.92 -1.87 2.49 6.69 319.49
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.326e+02 1.129e-01 1174.63 <2e-16 ***
Average_Gallons_Card 2.958e-03 4.953e-05 59.71 <2e-16 ***
Max_of_Cards 8.857e-01 7.182e-03 123.33 <2e-16 ***
Total_Expense_Amt -6.134e-02 3.698e-04 -165.87 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 16.13 on 55592 degrees of freedom
(302 observations deleted due to missingness)
Multiple R-squared: 0.3332, Adjusted R-squared: 0.3332
F-statistic: 9261 on 3 and 55592 DF, p-value: < 2.2e-16
# Total_Expense_Amt removed as well, leaving only Average_Gallons_Card and
# Max_of_Cards: R-squared collapses to about 0.3%.
c4<- lm(Positive_Profitability ~ Average_Gallons_Card+ Max_of_Cards, data = Cluster1)
summary(c4)
Call:
lm(formula = Positive_Profitability ~ Average_Gallons_Card +
Max_of_Cards, data = Cluster1)
Residuals:
Min 1Q Median 3Q Max
-134.665 -0.067 4.665 8.655 16.368
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.336e+02 1.370e-01 974.613 < 2e-16 ***
Average_Gallons_Card 1.931e-04 5.700e-05 3.388 0.000705 ***
Max_of_Cards 7.931e-02 6.470e-03 12.259 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 19.7 on 55895 degrees of freedom
Multiple R-squared: 0.002871, Adjusted R-squared: 0.002835
F-statistic: 80.46 on 2 and 55895 DF, p-value: < 2.2e-16
# Single-predictor model: Net_Profit_Card only.
c4 <- lm(
  formula = Positive_Profitability ~ Net_Profit_Card,
  data = Cluster1
)
summary(c4)
Call:
lm(formula = Positive_Profitability ~ Net_Profit_Card, data = Cluster1)
Residuals:
Min 1Q Median 3Q Max
-460.15 0.47 4.55 7.81 19.41
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.306e+02 1.076e-01 1213.00 <2e-16 ***
Net_Profit_Card 3.096e-02 5.206e-04 59.46 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 19.13 on 55896 degrees of freedom
Multiple R-squared: 0.05949, Adjusted R-squared: 0.05947
F-statistic: 3536 on 1 and 55896 DF, p-value: < 2.2e-16
# Single-predictor model: Average_Gallons_Card only.
c4<- lm(Positive_Profitability ~ Average_Gallons_Card , data = Cluster1)
summary(c4)
Call:
lm(formula = Positive_Profitability ~ Average_Gallons_Card, data = Cluster1)
Residuals:
Min 1Q Median 3Q Max
-134.997 0.256 4.429 8.358 15.469
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.345e+02 1.115e-01 1206.067 < 2e-16 ***
Average_Gallons_Card 1.858e-04 5.707e-05 3.257 0.00113 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 19.73 on 55896 degrees of freedom
Multiple R-squared: 0.0001897, Adjusted R-squared: 0.0001718
F-statistic: 10.61 on 1 and 55896 DF, p-value: 0.001128
# Single-predictor model: Max_of_Cards only.
c4 <- lm(
  formula = Positive_Profitability ~ Max_of_Cards,
  data = Cluster1
)
summary(c4)
Call:
lm(formula = Positive_Profitability ~ Max_of_Cards, data = Cluster1)
Residuals:
Min 1Q Median 3Q Max
-134.834 -0.043 4.641 8.720 16.115
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 133.80564 0.11486 1164.93 <2e-16 ***
Max_of_Cards 0.07908 0.00647 12.22 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 19.7 on 55896 degrees of freedom
Multiple R-squared: 0.002666, Adjusted R-squared: 0.002648
F-statistic: 149.4 on 1 and 55896 DF, p-value: < 2.2e-16
# Single-predictor model: Years_on_Books only.
c4<- lm(Positive_Profitability ~ Years_on_Books, data = Cluster1)
summary(c4)
Call:
lm(formula = Positive_Profitability ~ Years_on_Books, data = Cluster1)
Residuals:
Min 1Q Median 3Q Max
-133.945 -3.428 3.572 9.572 23.572
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 125.08276 0.14087 887.92 <2e-16 ***
Years_on_Books 1.34481 0.01621 82.95 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 18.62 on 55896 degrees of freedom
Multiple R-squared: 0.1096, Adjusted R-squared: 0.1096
F-statistic: 6881 on 1 and 55896 DF, p-value: < 2.2e-16
# Single-predictor model: Purchase_Gallons_Qty only.
c4 <- lm(
  formula = Positive_Profitability ~ Purchase_Gallons_Qty,
  data = Cluster1
)
summary(c4)
Call:
lm(formula = Positive_Profitability ~ Purchase_Gallons_Qty, data = Cluster1)
Residuals:
Min 1Q Median 3Q Max
-134.378 -0.104 4.668 8.677 15.940
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.341e+02 9.764e-02 1373.06 <2e-16 ***
Purchase_Gallons_Qty 4.550e-05 3.253e-06 13.98 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 19.7 on 55896 degrees of freedom
Multiple R-squared: 0.003487, Adjusted R-squared: 0.003469
F-statistic: 195.6 on 1 and 55896 DF, p-value: < 2.2e-16
# Single-predictor model: Acquisition_Cost only.
c4 <- lm(
  formula = Positive_Profitability ~ Acquisition_Cost,
  data = Cluster1
)
summary(c4)
Call:
lm(formula = Positive_Profitability ~ Acquisition_Cost, data = Cluster1)
Residuals:
Min 1Q Median 3Q Max
-136.695 -1.695 3.305 6.305 154.555
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 136.695248 0.067131 2036.3 <2e-16 ***
Acquisition_Cost -0.339475 0.001877 -180.9 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 15.67 on 55896 degrees of freedom
Multiple R-squared: 0.3692, Adjusted R-squared: 0.3692
F-statistic: 3.271e+04 on 1 and 55896 DF, p-value: < 2.2e-16
# Reproducible 40-row draw from Cluster1; rows containing any NA are then
# discarded, so fewer than 40 rows may remain.
set.seed(201)
cluster1Smple <- na.omit(sample_n(Cluster1, 40))
summary(cluster1Smple)
Positive_Profitability Average_Risk_Grade Average_Gallons_Card Max_of_Cards Total_Expense_Amt
Min. :124.0 Min. :1.000 Min. : 224.0 Min. : 1.00 Min. : 23.09
1st Qu.:134.5 1st Qu.:1.000 1st Qu.: 864.5 1st Qu.: 5.50 1st Qu.: 57.66
Median :138.0 Median :1.000 Median :1272.0 Median : 8.00 Median : 130.60
Mean :137.9 Mean :1.385 Mean :1594.8 Mean :12.95 Mean : 232.18
3rd Qu.:142.5 3rd Qu.:1.000 3rd Qu.:1567.5 3rd Qu.:20.00 3rd Qu.: 317.71
Max. :146.0 Max. :5.000 Max. :6440.0 Max. :41.00 Max. :1039.41
Net_Profit_Card Years_on_Books Purchase_Gallons_Qty Acquisition_Cost
Min. : 53.79 Min. : 1.000 Min. : 1344 Min. :0
1st Qu.: 86.75 1st Qu.: 4.000 1st Qu.: 4696 1st Qu.:0
Median :105.64 Median : 7.000 Median :10952 Median :0
Mean :144.00 Mean : 7.538 Mean :19737 Mean :0
3rd Qu.:155.69 3rd Qu.:11.000 3rd Qu.:31535 3rd Qu.:0
Max. :598.36 Max. :17.000 Max. :62123 Max. :0
# Four-cluster k-means on columns 3 to 9 of the sample, using 20 random starts.
# NOTE(review): the columns are clustered on their raw scales; confirm that
# standardising them first is not wanted.
kc <- kmeans(cluster1Smple[, 3:9], centers = 4, nstart = 20)
kc[["centers"]]
Average_Gallons_Card Max_of_Cards Total_Expense_Amt Net_Profit_Card Years_on_Books
1 3233.800 20.400000 577.02400 163.8960 6.200000
2 1277.333 32.000000 581.15000 135.8300 10.666667
3 1272.750 6.583333 96.52958 135.5987 7.375000
4 1664.143 21.285714 301.37571 162.0914 7.714286
Purchase_Gallons_Qty Acquisition_Cost
1 58223.400 0
2 40548.667 0
3 6727.458 0
4 27929.857 0
# Number of sampled rows assigned to each cluster.
kc$size
[1] 5 3 24 7
# Cross-tabulate cluster membership (rows) against Average_Risk_Grade (columns).
table(kc$cluster, cluster1Smple$Average_Risk_Grade)
1 2 3 4 5
1 3 1 1 0 0
2 3 0 0 0 0
3 19 2 1 1 1
4 6 1 0 0 0
# Treat the cluster ids as categories so ggplot uses a discrete colour scale.
kc$cluster <- as.factor(kc$cluster)

# Transparency is a fixed setting, so it belongs on the geom. Inside aes() the
# constant is treated as data: it adds a meaningless legend and the points are
# not drawn at the requested opacity.
ggplot(
  cluster1Smple,
  aes(Total_Expense_Amt, Years_on_Books, color = kc$cluster)
) +
  geom_point(alpha = 0.1)
ggplot(
  cluster1Smple,
  aes(Max_of_Cards, Total_Expense_Amt, color = kc$cluster)
) +
  geom_point(alpha = 0.5)
head(cluster1Smple)
library(dendextend)
library(circlize)

# Three-cluster k-means on the same columns, for comparison with kc.
irisCluster <- kmeans(cluster1Smple[, 3:9], 3, nstart = 20)
irisCluster$centers
Average_Gallons_Card Max_of_Cards Total_Expense_Amt Net_Profit_Card Years_on_Books
1 1272.75 6.583333 96.52958 135.5987 7.375
2 1548.10 24.500000 385.30800 154.2130 8.600
3 3233.80 20.400000 577.02400 163.8960 6.200
Purchase_Gallons_Qty Acquisition_Cost
1 6727.458 0
2 31715.500 0
3 58223.400 0
# Hierarchical clustering (hclust defaults) on columns 3 to 9 of the sample,
# converted to a dendrogram.
hc <- hclust(dist(cluster1Smple[, 3:9]))
dend <- as.dendrogram(hc)

# Colour the branches for a three-cluster cut, then colour the labels.
dend <- color_labels(color_branches(dend, k = 3))

# Radial plot with all figure margins removed.
par(mar = c(0, 0, 0, 0))
# circlize_dendrogram(dend, dend_track_height = 0.8)
circlize_dendrogram(
  dend,
  labels_track_height = NA,
  dend_track_height = .3
)
# Average-linkage hierarchical clustering on every column of the sample.
# The method name is written in full rather than relying on partial matching
# of "ave".
hc <- hclust(dist(cluster1Smple), method = "average")
d <- as.dendrogram(hc)
library(dendextend)

# Colour the branches for a three-cluster cut, then colour the labels.
d <- d %>%
  color_branches(k = 3) %>%
  color_labels

# Horizontal version of the standard plot, with room on the right for labels.
par(mar = c(3, 1, 1, 7))
plot(d, horiz = TRUE)
plot(d, type = "triangle", ylab = "Height")

# Define nodePar: small labels, a filled dot on leaves only, blue points.
nodePar <- list(
  lab.cex = 0.6, pch = c(NA, 19),
  cex = 0.7, col = "blue"
)

# Customized plot; remove labels.
# TRUE is spelled out because T is an ordinary, reassignable name.
plot(d, ylab = "Height", nodePar = nodePar, leaflab = "none", horiz = TRUE)
library(ape)

# Convert the dendrogram once and reuse the phylo object for both plots.
dd <- as.phylo(d)

# Unrooted tree with small labels and no margins.
plot(dd, type = "unrooted", cex = 0.6, no.margin = TRUE)

# Fan tree, with each tip coloured by its membership in a four-cluster cut.
colors <- c("red", "blue", "green", "black")
clus4 <- cutree(d, 4)
plot(
  dd,
  type = "fan",
  tip.color = colors[clus4],
  label.offset = 1,
  cex = 0.7
)
Principal Component Analysis
# Keep columns 1-3, 6-7 and 9 of the sample for the PCA, then inspect the
# first rows and the dimensions of the result.
cluster1Smple <- cluster1Smple[, c(1, 2, 3, 6, 7, 9)]
head(cluster1Smple)
dim(cluster1Smple)
[1] 39 6
library("factoextra")
library("FactoMineR")

# A column with no variation cannot be standardised: it shows up in the PCA
# results as all-zero coordinates, NaN cos2 values and <NA> rows in dimdesc().
# Keep only columns whose standard deviation is strictly positive, for the PCA
# input only; cluster1Smple itself is left unchanged for later steps.
pca_has_variance <- vapply(
  cluster1Smple,
  function(column) isTRUE(stats::sd(column, na.rm = TRUE) > 0),
  logical(1)
)
res.pca <- PCA(cluster1Smple[, pca_has_variance, drop = FALSE], graph = FALSE)
res.pca
**Results for the Principal Component Analysis (PCA)**
The analysis was performed on 39 individuals, described by 6 variables
*The results are available in the following objects:
name description
1 "$eig" "eigenvalues"
2 "$var" "results for the variables"
3 "$var$coord" "coord. for the variables"
4 "$var$cor" "correlations variables - dimensions"
5 "$var$cos2" "cos2 for the variables"
6 "$var$contrib" "contributions of the variables"
7 "$ind" "results for the individuals"
8 "$ind$coord" "coord. for the individuals"
9 "$ind$cos2" "cos2 for the individuals"
10 "$ind$contrib" "contributions of the individuals"
11 "$call" "summary statistics"
12 "$call$centre" "mean of the variables"
13 "$call$ecart.type" "standard error of the variables"
14 "$call$row.w" "weights for the individuals"
15 "$call$col.w" "weights for the variables"
Eigenvalues: the amount of variation retained by each principal component. The first PC corresponds to the direction with the maximum amount of variation in the data set.
# Pull the eigenvalue table out of the PCA result, show its first three
# columns, and draw the scree plot.
eigenvalues <- res.pca[["eig"]]
eigenvalues[, c(1, 2, 3)]
fviz_screeplot(res.pca, ncp = 10)
The correlation circle helps to visualize the most correlated variables (i.e., variables that group together).
# Coordinates of the variables on each principal dimension.
head(res.pca$var$coord)
Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
Positive_Profitability 0.2703974 0.79072304 -0.50243193 0.11335299 0.190671397
Average_Risk_Grade 0.5188911 -0.57893025 -0.12256363 0.61206188 0.077137696
Average_Gallons_Card 0.8164903 0.08213607 0.46206813 -0.19226484 0.275906629
Net_Profit_Card 0.8594656 0.34597993 0.05814373 0.04069142 -0.369567737
Years_on_Books -0.4857148 0.57199989 0.46898556 0.46577594 -0.001590538
Acquisition_Cost 0.0000000 0.00000000 0.00000000 0.00000000 0.000000000
# Plot the variables on the first two dimensions (correlation circle).
fviz_pca_var(res.pca)
# cos2 of each variable on each dimension.
head(res.pca$var$cos2)
Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
Positive_Profitability 0.07311474 0.625242930 0.252437849 0.012848899 3.635558e-02
Average_Risk_Grade 0.26924795 0.335160240 0.015021843 0.374619742 5.950224e-03
Average_Gallons_Card 0.66665647 0.006746334 0.213506960 0.036965770 7.612447e-02
Net_Profit_Card 0.73868109 0.119702112 0.003380693 0.001655792 1.365803e-01
Years_on_Books 0.23591891 0.327183874 0.219947460 0.216947223 2.529810e-06
Acquisition_Cost NaN NaN NaN NaN NaN
# Variable plot coloured by cos2 on a white-blue-red gradient centred at 0.5.
fviz_pca_var(res.pca, col.var = "cos2") +
  scale_color_gradient2(
    low = "white",
    mid = "blue",
    high = "red",
    midpoint = 0.5
  ) +
  theme_minimal()

# Contribution of each variable to each dimension.
head(res.pca$var$contrib)
Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
Positive_Profitability 3.685926 44.2169192 35.8426396 1.9981573 1.425636e+01
Average_Risk_Grade 13.573571 23.7023924 2.1328914 58.2578443 2.333301e+00
Average_Gallons_Card 33.608088 0.4770979 30.3149986 5.7486187 2.985120e+01
Net_Profit_Card 37.239058 8.4652834 0.4800111 0.2574954 5.355815e+01
Years_on_Books 11.893357 23.1383071 31.2294593 33.7378843 9.920312e-04
Acquisition_Cost 0.000000 0.0000000 0.0000000 0.0000000 0.000000e+00
# Contributions of variables on PC1.
# fviz_pca_contrib() is deprecated; fviz_contrib() is its replacement.
fviz_contrib(res.pca, choice = "var", axes = 1)
The function fviz_pca_contrib() is deprecated. Please use the function fviz_contrib() which can handle outputs of PCA, CA and MCA functions.
# Contributions of variables on PC2.
# fviz_pca_contrib() is deprecated; fviz_contrib() is its replacement.
fviz_contrib(res.pca, choice = "var", axes = 2)
The function fviz_pca_contrib() is deprecated. Please use the function fviz_contrib() which can handle outputs of PCA, CA and MCA functions.
# Total contribution on PC1 and PC2.
# fviz_pca_contrib() is deprecated; fviz_contrib() is its replacement.
fviz_contrib(res.pca, choice = "var", axes = 1:2)
The function fviz_pca_contrib() is deprecated. Please use the function fviz_contrib() which can handle outputs of PCA, CA and MCA functions.
# Variable plot with colour controlled by each variable's contribution.
fviz_pca_var(res.pca, col.var = "contrib")

# Describe the first two dimensions by the variables most correlated with them.
res.desc <- dimdesc(res.pca, axes = c(1, 2))

# Description of dimension 1
res.desc[["Dim.1"]]
$quanti
correlation p.value
<NA> NA NA
Net_Profit_Card 0.8594656 2.480154e-12
Average_Gallons_Card 0.8164903 2.349891e-10
Average_Risk_Grade 0.5188911 7.132008e-04
Years_on_Books -0.4857148 1.721179e-03
# Description of dimension 2: the variables most correlated with it.
res.desc$Dim.2
$quanti
correlation p.value
<NA> NA NA
Positive_Profitability 0.7907230 2.112538e-09
Years_on_Books 0.5719999 1.422417e-04
Net_Profit_Card 0.3459799 3.096904e-02
Average_Risk_Grade -0.5789303 1.128856e-04
# Coordinates of the individuals (sampled rows) on each principal dimension.
head(res.pca$ind$coord)
Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
34243 -0.37589960 -0.5753707 -0.6431437 -0.8885824 0.08424434
9326 1.90290466 -3.8032097 -0.2865659 2.0146682 -0.60309285
20415 -0.08793709 -0.6820111 -0.6426179 -1.0198923 -0.29354963
2219 1.79101571 1.5519877 -0.8573422 -0.3878541 -1.06866041
35085 -0.25916907 -1.0412526 -0.8765024 0.1790552 0.07565001
7755 2.32037917 1.0636005 0.6796273 -1.1349772 1.30067377
# Plot the individuals on the first two dimensions.
fviz_pca_ind(res.pca)
# Contributions of the individuals to PC1 (top 10 only).
# fviz_pca_contrib() is deprecated; fviz_contrib() is its replacement.
fviz_contrib(res.pca, choice = "ind", axes = 1, top = 10)
The function fviz_pca_contrib() is deprecated. Please use the function fviz_contrib() which can handle outputs of PCA, CA and MCA functions.
# Add a marketing-cost column: acquisition cost plus a flat 5.
clusterZ <- cluster1Smple %>% mutate(marketingCost = Acquisition_Cost + 5)

# Standardise each variable. scale() returns a one-column matrix, so it is
# wrapped in as.numeric() to store plain numeric vectors in the data frame.
cs <- clusterZ %>%
  mutate(
    Positive_ProfitabilityZ = as.numeric(scale(Positive_Profitability)),
    AvRiskGrZ = as.numeric(scale(Average_Risk_Grade)),
    AvGallCardZ = as.numeric(scale(Average_Gallons_Card)),
    NetProfitCardZ = as.numeric(scale(Net_Profit_Card)),
    YrZ = as.numeric(scale(Years_on_Books)),
    MarketZ = as.numeric(scale(marketingCost))
  )

# Keep five of the z-score columns; MarketZ is not carried forward.
cs <- cs %>%
  dplyr::select(
    Positive_ProfitabilityZ, AvRiskGrZ, AvGallCardZ, NetProfitCardZ, YrZ
  )

# Stack the z-score columns into long (variable, value) form. gather() is a
# tidyr function; it is called through its namespace because no library(tidyr)
# call is visible in this script.
clusterLong <- tidyr::gather(cs, variable, value, Positive_ProfitabilityZ:YrZ)

# One id per long-format row, sized from the data rather than a hard-coded 195
# so the step keeps working when the sample size changes.
# NOTE(review): this numbers the stacked rows, so one customer receives a
# different id for each variable; confirm that is what is wanted.
Customer <- seq_len(nrow(clusterLong))
ClusterData <- cbind.data.frame(Customer, clusterLong)
ClusterData$variable <- as.factor(ClusterData$variable)
ClusterData$Customer <- as.character(ClusterData$Customer)
# Histogram of the standardised values, one panel per variable.
ggplot(ClusterData, aes(x = value, fill = variable)) +
  geom_histogram() +
  facet_grid(~variable) +
  theme_bw()

# Overlaid density curves, filled by variable.
ggplot(ClusterData, aes(x = value, fill = variable)) +
  geom_density() +
  theme_bw()

# Thick density outlines, one panel per variable.
ggplot(ClusterData, aes(x = value, colour = variable)) +
  geom_density(size = 2) +
  facet_wrap(~variable) +
  theme_bw()

# Density plots with semi-transparent fill
ggplot(ClusterData, aes(x = value, fill = variable)) +
  geom_density(alpha = .3)

# Black-and-white histograms with half-unit bins, one panel per variable.
ggplot(ClusterData, aes(x = value)) +
  geom_histogram(binwidth = .5, colour = "black", fill = "white") +
  facet_grid(~ variable) +
  theme_bw()

# Histogram overlaid with kernel density curve
ggplot(ClusterData, aes(x = value)) +
  # Histogram with density instead of count on y-axis
  geom_histogram(
    aes(y = ..density..),
    binwidth = .5,
    colour = "black",
    fill = "white"
  ) +
  # Overlay with transparent density plot
  geom_density(alpha = .2, fill = "#FF6666") +
  facet_grid(~ variable) +
  theme_bw()
1
# Reproducible 1000-row draw from Cluster1; rows containing any NA are dropped.
set.seed(201)
cluster2Sample <- sample_n(Cluster1, 1000) %>% na.omit()
head(cluster2Sample)
dat <- cluster2Sample[, 3:9] # without known classification
# K-means cluster analysis
clus <- kmeans(dat, centers = 3)
# Fig 01
# plotcluster() comes from the fpc package, which no library() call visible in
# this script attaches; calling it through its namespace avoids the
# "could not find function" error.
fpc::plotcluster(dat, clus$cluster)
2
# Columns 3 and 5-9, i.e. leaving column 4 out. The column positions must be
# combined with c(): written as [, 3, 5:9] the second vector is passed as the
# `drop` argument instead of selecting columns.
dat <- cluster2Sample[, c(3, 5:9)] # without known classification
# K-means cluster analysis
clus <- kmeans(dat, centers = 3)
# Fig 01
# plotcluster() comes from the fpc package, which no library() call visible in
# this script attaches; call it through its namespace.
fpc::plotcluster(dat, clus$cluster)
Error in plotcluster(dat, clus$cluster) :
could not find function "plotcluster"
3
# Columns 3-4 and 6-9, i.e. leaving column 5 out. The column positions must be
# combined with c(): written as [, 3:4, 6:9] the second vector is passed as
# the `drop` argument instead of selecting columns.
dat <- cluster2Sample[, c(3:4, 6:9)] # without known classification
# K-means cluster analysis
clus <- kmeans(dat, centers = 3)
# Fig 01
# plotcluster() comes from the fpc package, which no library() call visible in
# this script attaches; call it through its namespace.
fpc::plotcluster(dat, clus$cluster)
4
# Columns 3-5 and 7-9, i.e. leaving column 6 out. The column positions must be
# combined with c(): written as [, 3:5, 7:9] the second vector is passed as
# the `drop` argument instead of selecting columns.
dat <- cluster2Sample[, c(3:5, 7:9)] # without known classification
# K-means cluster analysis
clus <- kmeans(dat, centers = 3)
# Fig 01
# plotcluster() comes from the fpc package, which no library() call visible in
# this script attaches; call it through its namespace.
fpc::plotcluster(dat, clus$cluster)
5
# Columns 3-6 and 8-9, i.e. leaving column 7 out. The column positions must be
# combined with c(): written as [, 3:6, 8:9] the second vector is passed as
# the `drop` argument instead of selecting columns.
dat <- cluster2Sample[, c(3:6, 8:9)] # without known classification
# K-means cluster analysis
clus <- kmeans(dat, centers = 3)
# Fig 01
# plotcluster() comes from the fpc package, which no library() call visible in
# this script attaches; call it through its namespace.
fpc::plotcluster(dat, clus$cluster)
6
# Columns 3-7 and 9, i.e. leaving column 8 out. The column positions must be
# combined with c(): written as [, 3:7, 9] the trailing 9 is passed as the
# `drop` argument instead of selecting a column.
dat <- cluster2Sample[, c(3:7, 9)] # without known classification
# K-means cluster analysis
clus <- kmeans(dat, centers = 3)
# Fig 01
# plotcluster() comes from the fpc package, which no library() call visible in
# this script attaches; call it through its namespace.
fpc::plotcluster(dat, clus$cluster)
# Correlation heat map of the modelling table with coefficient labels.
# TRUE is spelled out because T is an ordinary, reassignable name.
ggcorr(Cluster1, label = TRUE)

# Shaded correlogram of the same table, keeping the original column order.
corrgram(
  as.matrix(Cluster1),
  order = NULL,
  panel = panel.shade,
  text.panel = panel.txt,
  main = "Correlogram"
)
# Regress raw Profitability on five account-level predictors (full data).
fit <- lm(
  formula = Profitability ~ Net_Late_Fee_Count + Acquisition_Cost +
    Net_Profit + Years_on_Books + Average_Risk_Grade,
  data = Cluster
)
summary(fit) # show results

# Narrow Cluster to the identifier columns plus the modelled variables, then
# convert it to a matrix.
# NOTE(review): Customer_Index is a character column, so as.matrix() yields a
# character matrix; confirm that is intended before using it numerically.
Cluster <- dplyr::select(
  Cluster,
  Random,
  Customer_Index,
  Profitability,
  Net_Late_Fee_Count,
  Acquisition_Cost,
  Net_Profit,
  Years_on_Books
)
Cluster <- as.matrix(Cluster)
# Max_of_Cards against positive profitability with a straight-line fit.
ggplot(newdata, aes(x = Positive_Profitability, y = Max_of_Cards)) +
  geom_point() +
  geom_smooth(method = "lm")
#,color = factor(Years_on_Books

# Gross revenue against positive profitability with a straight-line fit.
ggplot(newdata, aes(x = Positive_Profitability, y = Gross_Revenue_Amt)) +
  geom_point() +
  geom_smooth(method = "lm")

# Positive profitability against card fees with a straight-line fit.
ggplot(newdata, aes(y = Positive_Profitability, x = Card_Fees_Amt)) +
  geom_point() +
  geom_smooth(method = "lm")