The Dataset Posted By Valerie Briot

Load saved dataset from Github.com

Analysis:

“free” app with most downloads per year

“paid” app with most downloads per year

Average downloads for each app since the release date for each provider

Average downloads for each app since the release dates across providers

require(dplyr);
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(knitr);
## Loading required package: knitr
data <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS607/master/CUNY%20607%20-%20Week%206.csv", header = TRUE, sep = ",", skip = 2);
kable(head(data))
App.Name Type Paid.Free Release.Date X2010 X2011 X2012 X2013 X2014 X2015 Release.Date.1 X2010.1 X2011.1 X2012.1 X2013.1 X2014.1 X2015.1
Candy Crush Saga Game Free 4/12/2012 NA NA 8 56 60 76 11/4/2012 NA NA 2 53 64 72
Fruit Ninja Game Free 4/21/2010 4 8 58 102 126 148 7/10/2010 1 9 64 108 132 165
Angry Birds Game Free 12/11/2009 10 124 320 547 648 627 11/19/2010 2 108 312 538 647 656
Subway Surfers Game Free 5/24/2012 NA NA 23 123 202 303 5/24/2012 NA NA 26 128 236 329
Despicable Me: Minion Rush Game Free 6/10/2013 NA NA NA 16 58 128 6/10/2013 NA NA NA 18 64 294
Clash of Clans Game Free 8/2/2012 NA NA 24 123 234 345 10/7/2013 NA NA NA 4 143 256
names(data)
##  [1] "App.Name"       "Type"           "Paid.Free"      "Release.Date"  
##  [5] "X2010"          "X2011"          "X2012"          "X2013"         
##  [9] "X2014"          "X2015"          "Release.Date.1" "X2010.1"       
## [13] "X2011.1"        "X2012.1"        "X2013.1"        "X2014.1"       
## [17] "X2015.1"

Separating the data into two for easy analysis.

google_play <- subset(data, select=c(Type, Paid.Free, Release.Date.1, X2010.1, X2011.1, X2012.1, X2013.1, X2014.1, X2015.1));
kable(head(google_play));
Type Paid.Free Release.Date.1 X2010.1 X2011.1 X2012.1 X2013.1 X2014.1 X2015.1
Game Free 11/4/2012 NA NA 2 53 64 72
Game Free 7/10/2010 1 9 64 108 132 165
Game Free 11/19/2010 2 108 312 538 647 656
Game Free 5/24/2012 NA NA 26 128 236 329
Game Free 6/10/2013 NA NA NA 18 64 294
Game Free 10/7/2013 NA NA NA 4 143 256
apple_store <- subset(data, select=c(Type, Paid.Free, Release.Date.1, X2010, X2011, X2012, X2013, X2014, X2015));
kable(head(apple_store))
Type Paid.Free Release.Date.1 X2010 X2011 X2012 X2013 X2014 X2015
Game Free 11/4/2012 NA NA 8 56 60 76
Game Free 7/10/2010 4 8 58 102 126 148
Game Free 11/19/2010 10 124 320 547 648 627
Game Free 5/24/2012 NA NA 23 123 202 303
Game Free 6/10/2013 NA NA NA 16 58 128
Game Free 10/7/2013 NA NA 24 123 234 345

Replacing the NA’s to zero (0)

google_play <- google_play %>% mutate( X2010.1 = ifelse(is.na(X2010.1),0,X2010.1))
google_play <- google_play %>% mutate( X2011.1 = ifelse(is.na(X2011.1),0,X2011.1))
google_play <- google_play %>% mutate( X2012.1 = ifelse(is.na(X2012.1),0,X2012.1))
google_play <- google_play %>% mutate( X2013.1 = ifelse(is.na(X2013.1),0,X2013.1))
google_play <- google_play %>% mutate( X2014.1 = ifelse(is.na(X2014.1),0,X2014.1))
View(google_play)

apple_store <- apple_store %>% mutate( X2010 = ifelse(is.na(X2010),0,X2010))
apple_store <- apple_store %>% mutate( X2011 = ifelse(is.na(X2011),0,X2011))
apple_store <- apple_store %>% mutate( X2012 = ifelse(is.na(X2012),0,X2012))
apple_store <- apple_store %>% mutate( X2013 = ifelse(is.na(X2013),0,X2013))
apple_store <- apple_store %>% mutate( X2014 = ifelse(is.na(X2014),0,X2014))


kable(head(google_play))
Type Paid.Free Release.Date.1 X2010.1 X2011.1 X2012.1 X2013.1 X2014.1 X2015.1
Game Free 11/4/2012 0 0 2 53 64 72
Game Free 7/10/2010 1 9 64 108 132 165
Game Free 11/19/2010 2 108 312 538 647 656
Game Free 5/24/2012 0 0 26 128 236 329
Game Free 6/10/2013 0 0 0 18 64 294
Game Free 10/7/2013 0 0 0 4 143 256
kable(head(apple_store))
Type Paid.Free Release.Date.1 X2010 X2011 X2012 X2013 X2014 X2015
Game Free 11/4/2012 0 0 8 56 60 76
Game Free 7/10/2010 4 8 58 102 126 148
Game Free 11/19/2010 10 124 320 547 648 627
Game Free 5/24/2012 0 0 23 123 202 303
Game Free 6/10/2013 0 0 0 16 58 128
Game Free 10/7/2013 0 0 24 123 234 345

“free” app with most downloads per year Slicing the data for easy accesibilty

google_play2 <- slice(google_play, 1:10)
google_play3 <- slice(google_play, 11:12)
apple_store2 <- slice(apple_store, 1:10)
apple_store3 <- slice(apple_store, 11:12)

kable(google_play2 %>% select(Paid.Free,Release.Date.1,X2010.1, X2011.1, X2012.1, X2013.1, X2014.1, X2015.1) %>% filter(X2015.1 == max(google_play2$X2015.1)));
Paid.Free Release.Date.1 X2010.1 X2011.1 X2012.1 X2013.1 X2014.1 X2015.1
Free 11/19/2010 2 108 312 538 647 656

“paid” app with most downloads per year

kable(google_play3 %>% select(Paid.Free,Release.Date.1,X2010.1, X2011.1, X2012.1, X2013.1, X2014.1, X2015.1) %>% filter(X2015.1 == max(google_play3$X2015.1)));
Paid.Free Release.Date.1 X2010.1 X2011.1 X2012.1 X2013.1 X2014.1 X2015.1
Paid 11/7/2011 0 8 154 285 369 352

Average downloads for each app since the release date for each provider

kable(head(google_play %>% summarise_each(funs(mean), X2010.1,X2011.1,X2012.1, X2011.1,X2014.1, X2015.1)));
X2010.1 X2011.1 X2012.1 X2014.1 X2015.1
12.41667 33.83333 81.91667 251.75 322.9167

“free” app with most downloads per year

kable(apple_store2 %>% select(Paid.Free,Release.Date.1,X2010, X2011, X2012, X2013, X2014, X2015) %>% filter(X2015 == max(apple_store2$X2015)));
Paid.Free Release.Date.1 X2010 X2011 X2012 X2013 X2014 X2015
Free 11/19/2010 10 124 320 547 648 627

“paid” app with most downloads per year

kable(apple_store3 %>% select(Paid.Free,Release.Date.1,X2010, X2011, X2012, X2013, X2014, X2015) %>% filter(X2015 == max(apple_store3$X2015)));
Paid.Free Release.Date.1 X2010 X2011 X2012 X2013 X2014 X2015
Paid 11/7/2011 0 4 120 240 320 340

Average downloads for each app since the release date for each provider

kable(head(apple_store %>% summarise_each(funs(mean), X2010,X2011,X2012, X2011,X2014, X2015)))
X2010 X2011 X2012 X2014 X2015
13.08333 34.5 79.91667 243.9167 291.9167

DATASET TWO

WHAT IMPACT DOES BODY WEIGHT HAVE ON BRAIN WEIGHT?

The data records the average weight of the brain and body for a number of mammal species. There are 62 rows of data. The 3 data columns include: I, the index, A1, the brain weight; B, the body weight. We seek a model of the form: B = A1 * X1.

Load data from URL

require(knitr);
weights <- read.table("http://people.sc.fsu.edu/~jburkardt/datasets/regression/x01.txt", skip = 32, header = TRUE, sep = "")
kable(head(weights));
Body Weight
3.385 44.5
0.480 15.5
1.350 8.1
465.000 423.0
36.330 119.5
27.660 115.0
names(weights);
## [1] "Body"   "Weight"
options(warn=-1)

** Load require package.**

library(plyr);
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

Note the data columns came with Body & Weight, but its actually “BrainWeight &”BodyWeight" in the data, which needs correction.

names(weights)[names(weights)=="Body"] <- "BrainWeight";
names(weights)[names(weights)=="Weight"] <- "BodyWeight";
str(weights);
## 'data.frame':    62 obs. of  2 variables:
##  $ BrainWeight: num  3.38 0.48 1.35 465 36.33 ...
##  $ BodyWeight : num  44.5 15.5 8.1 423 119.5 ...
names(weights);
## [1] "BrainWeight" "BodyWeight"
kable(head(weights));
BrainWeight BodyWeight
3.385 44.5
0.480 15.5
1.350 8.1
465.000 423.0
36.330 119.5
27.660 115.0

Lets checkout its histogram.

hist(weights$BrainWeight);

hist(weights$BodyWeight);

We can deduce that it a Very Rightly Skewed data, an intervention is required.Its getting the Natural logarithm of the data set.

weights_log <- cbind(weights, log(weights$BrainWeight), log(weights$BodyWeight));

kable(head(weights_log));
BrainWeight BodyWeight log(weights$BrainWeight) log(weights$BodyWeight)
3.385 44.5 1.2193539 3.795489
0.480 15.5 -0.7339692 2.740840
1.350 8.1 0.3001046 2.091864
465.000 423.0 6.1420374 6.047372
36.330 119.5 3.5926438 4.783316
27.660 115.0 3.3199873 4.744932

load required package and attach the data again and plot it in scatterplot 3D.

library(scatterplot3d);

attach(weights_log);
scatterplot3d(BrainWeight, BodyWeight, pch = 20, highlight.3d = TRUE, type = "h", main = "3D ScatterPlots"); 

Removing or seperating new column from initial data set.

weights_log[,c("BrainWeight","BodyWeight")] <- list(NULL);
colnames(weights_log);
## [1] "log(weights$BrainWeight)" "log(weights$BodyWeight)"
a <- plot(weights_log, ylab="Brain Weight",
   plot.type="double", col=1:2, xlab="Body Weight")
legend("topleft", legend=c("Brain Weight","Body Weight"),
  lty=1, col=c(1,2), cex=.8)
abline(a)

Changing the column name.

names(weights_log)[names(weights_log)=="log(weights$BrainWeight)"] <- "BrainWeights";
names(weights_log)[names(weights_log)=="log(weights$BodyWeight)"] <- "BodyWeight";

kable(head(weights_log));
BrainWeights BodyWeight
1.2193539 3.795489
-0.7339692 2.740840
0.3001046 2.091864
6.1420374 6.047372
3.5926438 4.783316
3.3199873 4.744932

Correlations and covariance shows a strong positive relationship between Brain Weights and Body weight

cor(weights_log, use="complete.obs", method="kendall")
##              BrainWeights BodyWeight
## BrainWeights    1.0000000  0.8334657
## BodyWeight      0.8334657  1.0000000

Histogram for Body weight

x <- weights_log$BodyWeight;

hist(x, 
 xlim=c(min(x),max(x)), probability=T, 
   col='purple', xlab='Body Weight', ylab=' Frequency', axes=T,
   main='Natural Logarithm: Multi-modal')
lines(density(x,bw=1), col='red', lwd=2)

Getting the mode of BodyWeight

mode_1 <- table(as.vector(x));
names(mode_1)[mode_1 == max(mode_1)];
## [1] "0"                "2.50959926237837" "4.74493212836325"

Histogram for Brain weight

y <- weights_log$BrainWeight;

hist(y, 
 xlim=c(min(y),max(y)), probability=T, 
   col='purple', xlab='Brain Weight', ylab=' Frequency', axes=T,
   main='Natural Logarithm: Bi-modal')
lines(density(y,bw=1), col='red', lwd=2)

kable(summary(weights_log));
BrainWeights BodyWeight
Min. :-5.2983 Min. :-1.966
1st Qu.:-0.5203 1st Qu.: 1.442
Median : 1.2066 Median : 2.848
Mean : 1.3375 Mean : 3.140
3rd Qu.: 3.8639 3rd Qu.: 5.111
Max. : 8.8030 Max. : 8.650

Getting the mode of Brain Weight

mode_2 <- table(as.vector(y));
names(mode_2)[mode_2 == max(mode_2)];
## [1] "-3.77226106305299" "1.25276296849537"

Load required package forecast, for forecasting.

library(forecast);
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: timeDate
## This is forecast 6.2
Acf(weights_log$BodyWeight, lag.max=NULL, type=c("correlation", "partial"), plot=TRUE, main=NULL, xlim=NULL, ylim=NULL, xlab="Lag", ylab=NULL, na.action=na.contiguous); 

The Credit Card Default Dataset Posted By Keith Folsom

The UCI Machine Learning Repository has a wide dataset containing credit card payments, owed and payed, for individuals spanning several months. The dataset includes variables such as gender, marital status, education, and age.

require(knitr);
require(dplyr);
require(tidyr);
## Loading required package: tidyr
dataset1 <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS607/cd55bba4605695b2cc07793517ae61fb05032e0a/Copy%20of%20default%20of%20credit%20card%20clients.csv", header = TRUE, sep = ",", skip = 1)

kable(head(dataset1))
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default.payment.next.month
1 20000 2 2 1 24 2 2 -1 -1 -2 -2 3913 3102 689 0 0 0 0 689 0 0 0 0 1
2 120000 2 2 2 26 -1 2 0 0 0 2 2682 1725 2682 3272 3455 3261 0 1000 1000 1000 0 2000 1
3 90000 2 2 2 34 0 0 0 0 0 0 29239 14027 13559 14331 14948 15549 1518 1500 1000 1000 1000 5000 0
4 50000 2 2 1 37 0 0 0 0 0 0 46990 48233 49291 28314 28959 29547 2000 2019 1200 1100 1069 1000 0
5 50000 1 2 1 57 -1 0 -1 0 0 0 8617 5670 35835 20940 19146 19131 2000 36681 10000 9000 689 679 0
6 50000 1 1 2 37 0 0 0 0 0 0 64400 57069 57608 19394 19619 20024 2500 1815 657 1000 1000 800 0

1.) Create summary statistics on each group (age, gender, education, etc.).

summary(dataset1)
##        ID          LIMIT_BAL            SEX          EDUCATION    
##  Min.   :    1   Min.   :  10000   Min.   :1.000   Min.   :0.000  
##  1st Qu.: 7501   1st Qu.:  50000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :15000   Median : 140000   Median :2.000   Median :2.000  
##  Mean   :15000   Mean   : 167484   Mean   :1.604   Mean   :1.853  
##  3rd Qu.:22500   3rd Qu.: 240000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :30000   Max.   :1000000   Max.   :2.000   Max.   :6.000  
##     MARRIAGE          AGE            PAY_0             PAY_2        
##  Min.   :0.000   Min.   :21.00   Min.   :-2.0000   Min.   :-2.0000  
##  1st Qu.:1.000   1st Qu.:28.00   1st Qu.:-1.0000   1st Qu.:-1.0000  
##  Median :2.000   Median :34.00   Median : 0.0000   Median : 0.0000  
##  Mean   :1.552   Mean   :35.49   Mean   :-0.0167   Mean   :-0.1338  
##  3rd Qu.:2.000   3rd Qu.:41.00   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   :3.000   Max.   :79.00   Max.   : 8.0000   Max.   : 8.0000  
##      PAY_3             PAY_4             PAY_5             PAY_6        
##  Min.   :-2.0000   Min.   :-2.0000   Min.   :-2.0000   Min.   :-2.0000  
##  1st Qu.:-1.0000   1st Qu.:-1.0000   1st Qu.:-1.0000   1st Qu.:-1.0000  
##  Median : 0.0000   Median : 0.0000   Median : 0.0000   Median : 0.0000  
##  Mean   :-0.1662   Mean   :-0.2207   Mean   :-0.2662   Mean   :-0.2911  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   : 8.0000   Max.   : 8.0000   Max.   : 8.0000   Max.   : 8.0000  
##    BILL_AMT1         BILL_AMT2        BILL_AMT3         BILL_AMT4      
##  Min.   :-165580   Min.   :-69777   Min.   :-157264   Min.   :-170000  
##  1st Qu.:   3559   1st Qu.:  2985   1st Qu.:   2666   1st Qu.:   2327  
##  Median :  22382   Median : 21200   Median :  20089   Median :  19052  
##  Mean   :  51223   Mean   : 49179   Mean   :  47013   Mean   :  43263  
##  3rd Qu.:  67091   3rd Qu.: 64006   3rd Qu.:  60165   3rd Qu.:  54506  
##  Max.   : 964511   Max.   :983931   Max.   :1664089   Max.   : 891586  
##    BILL_AMT5        BILL_AMT6          PAY_AMT1         PAY_AMT2      
##  Min.   :-81334   Min.   :-339603   Min.   :     0   Min.   :      0  
##  1st Qu.:  1763   1st Qu.:   1256   1st Qu.:  1000   1st Qu.:    833  
##  Median : 18105   Median :  17071   Median :  2100   Median :   2009  
##  Mean   : 40311   Mean   :  38872   Mean   :  5664   Mean   :   5921  
##  3rd Qu.: 50191   3rd Qu.:  49198   3rd Qu.:  5006   3rd Qu.:   5000  
##  Max.   :927171   Max.   : 961664   Max.   :873552   Max.   :1684259  
##     PAY_AMT3         PAY_AMT4         PAY_AMT5           PAY_AMT6       
##  Min.   :     0   Min.   :     0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:   390   1st Qu.:   296   1st Qu.:   252.5   1st Qu.:   117.8  
##  Median :  1800   Median :  1500   Median :  1500.0   Median :  1500.0  
##  Mean   :  5226   Mean   :  4826   Mean   :  4799.4   Mean   :  5215.5  
##  3rd Qu.:  4505   3rd Qu.:  4013   3rd Qu.:  4031.5   3rd Qu.:  4000.0  
##  Max.   :896040   Max.   :621000   Max.   :426529.0   Max.   :528666.0  
##  default.payment.next.month
##  Min.   :0.0000            
##  1st Qu.:0.0000            
##  Median :0.0000            
##  Mean   :0.2212            
##  3rd Qu.:0.0000            
##  Max.   :1.0000

** Selecting and splitting the dataset for each accesibilty**

Bill_amount <- select(dataset1, num_range("BILL_AMT", 1:6))
kable(head(Bill_amount))
BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6
3913 3102 689 0 0 0
2682 1725 2682 3272 3455 3261
29239 14027 13559 14331 14948 15549
46990 48233 49291 28314 28959 29547
8617 5670 35835 20940 19146 19131
64400 57069 57608 19394 19619 20024
pay <- select(dataset1, num_range("PAY", 1:6))
kable(head(pay))
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default.payment.next.month
1 20000 2 2 1 24 2 2 -1 -1 -2 -2 3913 3102 689 0 0 0 0 689 0 0 0 0 1
2 120000 2 2 2 26 -1 2 0 0 0 2 2682 1725 2682 3272 3455 3261 0 1000 1000 1000 0 2000 1
3 90000 2 2 2 34 0 0 0 0 0 0 29239 14027 13559 14331 14948 15549 1518 1500 1000 1000 1000 5000 0
4 50000 2 2 1 37 0 0 0 0 0 0 46990 48233 49291 28314 28959 29547 2000 2019 1200 1100 1069 1000 0
5 50000 1 2 1 57 -1 0 -1 0 0 0 8617 5670 35835 20940 19146 19131 2000 36681 10000 9000 689 679 0
6 50000 1 1 2 37 0 0 0 0 0 0 64400 57069 57608 19394 19619 20024 2500 1815 657 1000 1000 800 0
pay_amount <- select(dataset1, num_range("PAY_AMT", 1:6))
kable(head(pay_amount))
PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
0 689 0 0 0 0
0 1000 1000 1000 0 2000
1518 1500 1000 1000 1000 5000
2000 2019 1200 1100 1069 1000
2000 36681 10000 9000 689 679
2500 1815 657 1000 1000 800
dataset2 <- dataset1[, c(1, 2, 3, 4, 5, 6, 25)]
kable(head(dataset2))
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE default.payment.next.month
1 20000 2 2 1 24 1
2 120000 2 2 2 26 1
3 90000 2 2 2 34 0
4 50000 2 2 1 37 0
5 50000 1 2 1 57 0
6 50000 1 1 2 37 0
**2.) Which group has t he highest/l owed credit limit ?**
kable(head(pay_amount %>% select(PAY_AMT1, PAY_AMT2, PAY_AMT3, PAY_AMT4, PAY_AMT5, PAY_AMT6) %>% filter(PAY_AMT3 == max(pay_amount$PAY_AMT3))));
PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
50784 50723 896040 50000 50000 50256
kable(head(Bill_amount %>% select(BILL_AMT1, BILL_AMT2, BILL_AMT3, BILL_AMT4, BILL_AMT5, BILL_AMT6) %>% filter(BILL_AMT3 == max(Bill_amount$BILL_AMT3))))
BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6
125 -18088 1664089 121757 97115 377217
kable(head(pay %>% select(PAY_0,PAY_2, PAY_3, PAY_4, PAY_5, PAY_6) %>% filter(PAY_6 == max(pay$PAY_6))));
PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6
1 3 7 6 7 8
2 2 8 8 8 8

Which group has the highest/lowed credit limit? Which group has the most outstanding debt?

kable(head(dataset2 %>% select(AGE, EDUCATION, SEX, LIMIT_BAL) %>% filter(LIMIT_BAL == max(dataset2$LIMIT_BAL))));
AGE EDUCATION SEX LIMIT_BAL
47 1 2 1e+06
kable(head(dataset2 %>% select(AGE, EDUCATION, SEX, LIMIT_BAL) %>% filter(LIMIT_BAL == min(dataset2$LIMIT_BAL))));
AGE EDUCATION SEX LIMIT_BAL
22 2 1 10000
56 2 1 10000
23 3 1 10000
27 2 1 10000
24 2 1 10000
22 2 2 10000

What combination of age, gender, etc. is the least likely to default?

kable(head(dataset2 %>% select(AGE, EDUCATION, SEX, LIMIT_BAL, default.payment.next.month) %>% filter(default.payment.next.month == min(dataset2$default.payment.next.month))));
AGE EDUCATION SEX LIMIT_BAL default.payment.next.month
34 2 2 9e+04 0
37 2 2 5e+04 0
57 2 1 5e+04 0
37 1 1 5e+04 0
29 1 1 5e+05 0
23 2 2 1e+05 0