IS607: Project 2

The Dataset Posted By Valerie Briot

Load saved dataset from Github.com

Analysis:

“free” app with most downloads per year

“paid” app with most downloads per year

Average downloads for each app since the release date for each provider

Average downloads for each app since the release dates across providers

require(dplyr);

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

require(knitr);

## Loading required package: knitr

data <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS607/master/CUNY%20607%20-%20Week%206.csv", header = TRUE, sep = ",", skip = 2);
kable(head(data))

App.Name	Type	Paid.Free	Release.Date	X2010	X2011	X2012	X2013	X2014	X2015	Release.Date.1	X2010.1	X2011.1	X2012.1	X2013.1	X2014.1	X2015.1
Candy Crush Saga	Game	Free	4/12/2012	NA	NA	8	56	60	76	11/4/2012	NA	NA	2	53	64	72
Fruit Ninja	Game	Free	4/21/2010	4	8	58	102	126	148	7/10/2010	1	9	64	108	132	165
Angry Birds	Game	Free	12/11/2009	10	124	320	547	648	627	11/19/2010	2	108	312	538	647	656
Subway Surfers	Game	Free	5/24/2012	NA	NA	23	123	202	303	5/24/2012	NA	NA	26	128	236	329
Despicable Me: Minion Rush	Game	Free	6/10/2013	NA	NA	NA	16	58	128	6/10/2013	NA	NA	NA	18	64	294
Clash of Clans	Game	Free	8/2/2012	NA	NA	24	123	234	345	10/7/2013	NA	NA	NA	4	143	256

names(data)

##  [1] "App.Name"       "Type"           "Paid.Free"      "Release.Date"  
##  [5] "X2010"          "X2011"          "X2012"          "X2013"         
##  [9] "X2014"          "X2015"          "Release.Date.1" "X2010.1"       
## [13] "X2011.1"        "X2012.1"        "X2013.1"        "X2014.1"       
## [17] "X2015.1"

Separating the data into two for easy analysis.

google_play <- subset(data, select=c(Type, Paid.Free, Release.Date.1, X2010.1, X2011.1, X2012.1, X2013.1, X2014.1, X2015.1));
kable(head(google_play));

Type	Paid.Free	Release.Date.1	X2010.1	X2011.1	X2012.1	X2013.1	X2014.1	X2015.1
Game	Free	11/4/2012	NA	NA	2	53	64	72
Game	Free	7/10/2010	1	9	64	108	132	165
Game	Free	11/19/2010	2	108	312	538	647	656
Game	Free	5/24/2012	NA	NA	26	128	236	329
Game	Free	6/10/2013	NA	NA	NA	18	64	294
Game	Free	10/7/2013	NA	NA	NA	4	143	256

apple_store <- subset(data, select=c(Type, Paid.Free, Release.Date.1, X2010, X2011, X2012, X2013, X2014, X2015));
kable(head(apple_store))

Type	Paid.Free	Release.Date.1	X2010	X2011	X2012	X2013	X2014	X2015
Game	Free	11/4/2012	NA	NA	8	56	60	76
Game	Free	7/10/2010	4	8	58	102	126	148
Game	Free	11/19/2010	10	124	320	547	648	627
Game	Free	5/24/2012	NA	NA	23	123	202	303
Game	Free	6/10/2013	NA	NA	NA	16	58	128
Game	Free	10/7/2013	NA	NA	24	123	234	345

Replacing the NA’s to zero (0)

google_play <- google_play %>% mutate( X2010.1 = ifelse(is.na(X2010.1),0,X2010.1))
google_play <- google_play %>% mutate( X2011.1 = ifelse(is.na(X2011.1),0,X2011.1))
google_play <- google_play %>% mutate( X2012.1 = ifelse(is.na(X2012.1),0,X2012.1))
google_play <- google_play %>% mutate( X2013.1 = ifelse(is.na(X2013.1),0,X2013.1))
google_play <- google_play %>% mutate( X2014.1 = ifelse(is.na(X2014.1),0,X2014.1))
View(google_play)

apple_store <- apple_store %>% mutate( X2010 = ifelse(is.na(X2010),0,X2010))
apple_store <- apple_store %>% mutate( X2011 = ifelse(is.na(X2011),0,X2011))
apple_store <- apple_store %>% mutate( X2012 = ifelse(is.na(X2012),0,X2012))
apple_store <- apple_store %>% mutate( X2013 = ifelse(is.na(X2013),0,X2013))
apple_store <- apple_store %>% mutate( X2014 = ifelse(is.na(X2014),0,X2014))


kable(head(google_play))

Type	Paid.Free	Release.Date.1	X2010.1	X2011.1	X2012.1	X2013.1	X2014.1	X2015.1
Game	Free	11/4/2012	0	0	2	53	64	72
Game	Free	7/10/2010	1	9	64	108	132	165
Game	Free	11/19/2010	2	108	312	538	647	656
Game	Free	5/24/2012	0	0	26	128	236	329
Game	Free	6/10/2013	0	0	0	18	64	294
Game	Free	10/7/2013	0	0	0	4	143	256

kable(head(apple_store))

Type	Paid.Free	Release.Date.1	X2010	X2011	X2012	X2013	X2014	X2015
Game	Free	11/4/2012	0	0	8	56	60	76
Game	Free	7/10/2010	4	8	58	102	126	148
Game	Free	11/19/2010	10	124	320	547	648	627
Game	Free	5/24/2012	0	0	23	123	202	303
Game	Free	6/10/2013	0	0	0	16	58	128
Game	Free	10/7/2013	0	0	24	123	234	345

“free” app with most downloads per year Slicing the data for easy accesibilty

google_play2 <- slice(google_play, 1:10)
google_play3 <- slice(google_play, 11:12)
apple_store2 <- slice(apple_store, 1:10)
apple_store3 <- slice(apple_store, 11:12)

kable(google_play2 %>% select(Paid.Free,Release.Date.1,X2010.1, X2011.1, X2012.1, X2013.1, X2014.1, X2015.1) %>% filter(X2015.1 == max(google_play2$X2015.1)));

Paid.Free	Release.Date.1	X2010.1	X2011.1	X2012.1	X2013.1	X2014.1	X2015.1
Free	11/19/2010	2	108	312	538	647	656

“paid” app with most downloads per year

kable(google_play3 %>% select(Paid.Free,Release.Date.1,X2010.1, X2011.1, X2012.1, X2013.1, X2014.1, X2015.1) %>% filter(X2015.1 == max(google_play3$X2015.1)));

Paid.Free	Release.Date.1	X2010.1	X2011.1	X2012.1	X2013.1	X2014.1	X2015.1
Paid	11/7/2011	0	8	154	285	369	352

Average downloads for each app since the release date for each provider

kable(head(google_play %>% summarise_each(funs(mean), X2010.1,X2011.1,X2012.1, X2011.1,X2014.1, X2015.1)));

X2010.1	X2011.1	X2012.1	X2014.1	X2015.1
12.41667	33.83333	81.91667	251.75	322.9167

“free” app with most downloads per year

kable(apple_store2 %>% select(Paid.Free,Release.Date.1,X2010, X2011, X2012, X2013, X2014, X2015) %>% filter(X2015 == max(apple_store2$X2015)));

Paid.Free	Release.Date.1	X2010	X2011	X2012	X2013	X2014	X2015
Free	11/19/2010	10	124	320	547	648	627

“paid” app with most downloads per year

kable(apple_store3 %>% select(Paid.Free,Release.Date.1,X2010, X2011, X2012, X2013, X2014, X2015) %>% filter(X2015 == max(apple_store3$X2015)));

Paid.Free	Release.Date.1	X2010	X2011	X2012	X2013	X2014	X2015
Paid	11/7/2011	0	4	120	240	320	340

Average downloads for each app since the release date for each provider

kable(head(apple_store %>% summarise_each(funs(mean), X2010,X2011,X2012, X2011,X2014, X2015)))

X2010	X2011	X2012	X2014	X2015
13.08333	34.5	79.91667	243.9167	291.9167

DATASET TWO

WHAT IMPACT DOES BODY WEIGHT HAVE ON BRAIN WEIGHT?

The data records the average weight of the brain and body for a number of mammal species. There are 62 rows of data. The 3 data columns include: I, the index, A1, the brain weight; B, the body weight. We seek a model of the form: B = A1 * X1.

Load data from URL

require(knitr);
weights <- read.table("http://people.sc.fsu.edu/~jburkardt/datasets/regression/x01.txt", skip = 32, header = TRUE, sep = "")
kable(head(weights));

Body	Weight
3.385	44.5
0.480	15.5
1.350	8.1
465.000	423.0
36.330	119.5
27.660	115.0

names(weights);

## [1] "Body"   "Weight"

options(warn=-1)

** Load require package.**

library(plyr);

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

Note the data columns came with Body & Weight, but its actually “BrainWeight &”BodyWeight" in the data, which needs correction.

names(weights)[names(weights)=="Body"] <- "BrainWeight";
names(weights)[names(weights)=="Weight"] <- "BodyWeight";
str(weights);

## 'data.frame':    62 obs. of  2 variables:
##  $ BrainWeight: num  3.38 0.48 1.35 465 36.33 ...
##  $ BodyWeight : num  44.5 15.5 8.1 423 119.5 ...

names(weights);

## [1] "BrainWeight" "BodyWeight"

kable(head(weights));

BrainWeight	BodyWeight
3.385	44.5
0.480	15.5
1.350	8.1
465.000	423.0
36.330	119.5
27.660	115.0

Lets checkout its histogram.

hist(weights$BrainWeight);

hist(weights$BodyWeight);

We can deduce that it a Very Rightly Skewed data, an intervention is required.Its getting the Natural logarithm of the data set.

weights_log <- cbind(weights, log(weights$BrainWeight), log(weights$BodyWeight));

kable(head(weights_log));

BrainWeight	BodyWeight	log(weights$BrainWeight)	log(weights$BodyWeight)
3.385	44.5	1.2193539	3.795489
0.480	15.5	-0.7339692	2.740840
1.350	8.1	0.3001046	2.091864
465.000	423.0	6.1420374	6.047372
36.330	119.5	3.5926438	4.783316
27.660	115.0	3.3199873	4.744932

load required package and attach the data again and plot it in scatterplot 3D.

library(scatterplot3d);

attach(weights_log);
scatterplot3d(BrainWeight, BodyWeight, pch = 20, highlight.3d = TRUE, type = "h", main = "3D ScatterPlots");

Removing or seperating new column from initial data set.

weights_log[,c("BrainWeight","BodyWeight")] <- list(NULL);
colnames(weights_log);

## [1] "log(weights$BrainWeight)" "log(weights$BodyWeight)"

a <- plot(weights_log, ylab="Brain Weight",
   plot.type="double", col=1:2, xlab="Body Weight")
legend("topleft", legend=c("Brain Weight","Body Weight"),
  lty=1, col=c(1,2), cex=.8)
abline(a)

Changing the column name.

names(weights_log)[names(weights_log)=="log(weights$BrainWeight)"] <- "BrainWeights";
names(weights_log)[names(weights_log)=="log(weights$BodyWeight)"] <- "BodyWeight";

kable(head(weights_log));

BrainWeights	BodyWeight
1.2193539	3.795489
-0.7339692	2.740840
0.3001046	2.091864
6.1420374	6.047372
3.5926438	4.783316
3.3199873	4.744932

Correlations and covariance shows a strong positive relationship between Brain Weights and Body weight

cor(weights_log, use="complete.obs", method="kendall")

##              BrainWeights BodyWeight
## BrainWeights    1.0000000  0.8334657
## BodyWeight      0.8334657  1.0000000

Histogram for Body weight

x <- weights_log$BodyWeight;

hist(x, 
 xlim=c(min(x),max(x)), probability=T, 
   col='purple', xlab='Body Weight', ylab=' Frequency', axes=T,
   main='Natural Logarithm: Multi-modal')
lines(density(x,bw=1), col='red', lwd=2)

Getting the mode of BodyWeight

mode_1 <- table(as.vector(x));
names(mode_1)[mode_1 == max(mode_1)];

## [1] "0"                "2.50959926237837" "4.74493212836325"

Histogram for Brain weight

y <- weights_log$BrainWeight;

hist(y, 
 xlim=c(min(y),max(y)), probability=T, 
   col='purple', xlab='Brain Weight', ylab=' Frequency', axes=T,
   main='Natural Logarithm: Bi-modal')
lines(density(y,bw=1), col='red', lwd=2)

kable(summary(weights_log));

	BrainWeights	BodyWeight
	Min. :-5.2983	Min. :-1.966
	1st Qu.:-0.5203	1st Qu.: 1.442
	Median : 1.2066	Median : 2.848
	Mean : 1.3375	Mean : 3.140
	3rd Qu.: 3.8639	3rd Qu.: 5.111
	Max. : 8.8030	Max. : 8.650

Getting the mode of Brain Weight

mode_2 <- table(as.vector(y));
names(mode_2)[mode_2 == max(mode_2)];

## [1] "-3.77226106305299" "1.25276296849537"

Load required package forecast, for forecasting.

library(forecast);

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: timeDate

## This is forecast 6.2

Acf(weights_log$BodyWeight, lag.max=NULL, type=c("correlation", "partial"), plot=TRUE, main=NULL, xlim=NULL, ylim=NULL, xlab="Lag", ylab=NULL, na.action=na.contiguous);

The Credit Card Default Dataset Posted By Keith Folsom

The UCI Machine Learning Repository has a wide dataset containing credit card payments, owed and payed, for individuals spanning several months. The dataset includes variables such as gender, marital status, education, and age.

require(knitr);
require(dplyr);
require(tidyr);

## Loading required package: tidyr

dataset1 <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS607/cd55bba4605695b2cc07793517ae61fb05032e0a/Copy%20of%20default%20of%20credit%20card%20clients.csv", header = TRUE, sep = ",", skip = 1)

kable(head(dataset1))

ID	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_0	PAY_2	PAY_3	PAY_4	PAY_5	PAY_6	BILL_AMT1	BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6	default.payment.next.month
1	20000	2	2	1	24	2	2	-1	-1	-2	-2	3913	3102	689	0	0	0	0	689	0	0	0	0	1
2	120000	2	2	2	26	-1	2	0	0	0	2	2682	1725	2682	3272	3455	3261	0	1000	1000	1000	0	2000	1
3	90000	2	2	2	34	0	0	0	0	0	0	29239	14027	13559	14331	14948	15549	1518	1500	1000	1000	1000	5000	0
4	50000	2	2	1	37	0	0	0	0	0	0	46990	48233	49291	28314	28959	29547	2000	2019	1200	1100	1069	1000	0
5	50000	1	2	1	57	-1	0	-1	0	0	0	8617	5670	35835	20940	19146	19131	2000	36681	10000	9000	689	679	0
6	50000	1	1	2	37	0	0	0	0	0	0	64400	57069	57608	19394	19619	20024	2500	1815	657	1000	1000	800	0

1.) Create summary statistics on each group (age, gender, education, etc.).

summary(dataset1)

##        ID          LIMIT_BAL            SEX          EDUCATION    
##  Min.   :    1   Min.   :  10000   Min.   :1.000   Min.   :0.000  
##  1st Qu.: 7501   1st Qu.:  50000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :15000   Median : 140000   Median :2.000   Median :2.000  
##  Mean   :15000   Mean   : 167484   Mean   :1.604   Mean   :1.853  
##  3rd Qu.:22500   3rd Qu.: 240000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :30000   Max.   :1000000   Max.   :2.000   Max.   :6.000  
##     MARRIAGE          AGE            PAY_0             PAY_2        
##  Min.   :0.000   Min.   :21.00   Min.   :-2.0000   Min.   :-2.0000  
##  1st Qu.:1.000   1st Qu.:28.00   1st Qu.:-1.0000   1st Qu.:-1.0000  
##  Median :2.000   Median :34.00   Median : 0.0000   Median : 0.0000  
##  Mean   :1.552   Mean   :35.49   Mean   :-0.0167   Mean   :-0.1338  
##  3rd Qu.:2.000   3rd Qu.:41.00   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   :3.000   Max.   :79.00   Max.   : 8.0000   Max.   : 8.0000  
##      PAY_3             PAY_4             PAY_5             PAY_6        
##  Min.   :-2.0000   Min.   :-2.0000   Min.   :-2.0000   Min.   :-2.0000  
##  1st Qu.:-1.0000   1st Qu.:-1.0000   1st Qu.:-1.0000   1st Qu.:-1.0000  
##  Median : 0.0000   Median : 0.0000   Median : 0.0000   Median : 0.0000  
##  Mean   :-0.1662   Mean   :-0.2207   Mean   :-0.2662   Mean   :-0.2911  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   : 8.0000   Max.   : 8.0000   Max.   : 8.0000   Max.   : 8.0000  
##    BILL_AMT1         BILL_AMT2        BILL_AMT3         BILL_AMT4      
##  Min.   :-165580   Min.   :-69777   Min.   :-157264   Min.   :-170000  
##  1st Qu.:   3559   1st Qu.:  2985   1st Qu.:   2666   1st Qu.:   2327  
##  Median :  22382   Median : 21200   Median :  20089   Median :  19052  
##  Mean   :  51223   Mean   : 49179   Mean   :  47013   Mean   :  43263  
##  3rd Qu.:  67091   3rd Qu.: 64006   3rd Qu.:  60165   3rd Qu.:  54506  
##  Max.   : 964511   Max.   :983931   Max.   :1664089   Max.   : 891586  
##    BILL_AMT5        BILL_AMT6          PAY_AMT1         PAY_AMT2      
##  Min.   :-81334   Min.   :-339603   Min.   :     0   Min.   :      0  
##  1st Qu.:  1763   1st Qu.:   1256   1st Qu.:  1000   1st Qu.:    833  
##  Median : 18105   Median :  17071   Median :  2100   Median :   2009  
##  Mean   : 40311   Mean   :  38872   Mean   :  5664   Mean   :   5921  
##  3rd Qu.: 50191   3rd Qu.:  49198   3rd Qu.:  5006   3rd Qu.:   5000  
##  Max.   :927171   Max.   : 961664   Max.   :873552   Max.   :1684259  
##     PAY_AMT3         PAY_AMT4         PAY_AMT5           PAY_AMT6       
##  Min.   :     0   Min.   :     0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:   390   1st Qu.:   296   1st Qu.:   252.5   1st Qu.:   117.8  
##  Median :  1800   Median :  1500   Median :  1500.0   Median :  1500.0  
##  Mean   :  5226   Mean   :  4826   Mean   :  4799.4   Mean   :  5215.5  
##  3rd Qu.:  4505   3rd Qu.:  4013   3rd Qu.:  4031.5   3rd Qu.:  4000.0  
##  Max.   :896040   Max.   :621000   Max.   :426529.0   Max.   :528666.0  
##  default.payment.next.month
##  Min.   :0.0000            
##  1st Qu.:0.0000            
##  Median :0.0000            
##  Mean   :0.2212            
##  3rd Qu.:0.0000            
##  Max.   :1.0000

** Selecting and splitting the dataset for each accesibilty**

Bill_amount <- select(dataset1, num_range("BILL_AMT", 1:6))
kable(head(Bill_amount))

BILL_AMT1	BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6
3913	3102	689	0	0	0
2682	1725	2682	3272	3455	3261
29239	14027	13559	14331	14948	15549
46990	48233	49291	28314	28959	29547
8617	5670	35835	20940	19146	19131
64400	57069	57608	19394	19619	20024

pay <- select(dataset1, num_range("PAY", 1:6))
kable(head(pay))

ID	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_0	PAY_2	PAY_3	PAY_4	PAY_5	PAY_6	BILL_AMT1	BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6	default.payment.next.month
1	20000	2	2	1	24	2	2	-1	-1	-2	-2	3913	3102	689	0	0	0	0	689	0	0	0	0	1
2	120000	2	2	2	26	-1	2	0	0	0	2	2682	1725	2682	3272	3455	3261	0	1000	1000	1000	0	2000	1
3	90000	2	2	2	34	0	0	0	0	0	0	29239	14027	13559	14331	14948	15549	1518	1500	1000	1000	1000	5000	0
4	50000	2	2	1	37	0	0	0	0	0	0	46990	48233	49291	28314	28959	29547	2000	2019	1200	1100	1069	1000	0
5	50000	1	2	1	57	-1	0	-1	0	0	0	8617	5670	35835	20940	19146	19131	2000	36681	10000	9000	689	679	0
6	50000	1	1	2	37	0	0	0	0	0	0	64400	57069	57608	19394	19619	20024	2500	1815	657	1000	1000	800	0

pay_amount <- select(dataset1, num_range("PAY_AMT", 1:6))
kable(head(pay_amount))

PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6
0	689	0	0	0	0
0	1000	1000	1000	0	2000
1518	1500	1000	1000	1000	5000
2000	2019	1200	1100	1069	1000
2000	36681	10000	9000	689	679
2500	1815	657	1000	1000	800

dataset2 <- dataset1[, c(1, 2, 3, 4, 5, 6, 25)]
kable(head(dataset2))

ID	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	default.payment.next.month
1	20000	2	2	1	24	1
2	120000	2	2	2	26	1
3	90000	2	2	2	34	0
4	50000	2	2	1	37	0
5	50000	1	2	1	57	0
6	50000	1	1	2	37	0
**2.)	Which group	has t	he highest/l	owed credit	limit	?**

kable(head(pay_amount %>% select(PAY_AMT1, PAY_AMT2, PAY_AMT3, PAY_AMT4, PAY_AMT5, PAY_AMT6) %>% filter(PAY_AMT3 == max(pay_amount$PAY_AMT3))));

PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6
50784	50723	896040	50000	50000	50256

kable(head(Bill_amount %>% select(BILL_AMT1, BILL_AMT2, BILL_AMT3, BILL_AMT4, BILL_AMT5, BILL_AMT6) %>% filter(BILL_AMT3 == max(Bill_amount$BILL_AMT3))))

BILL_AMT1	BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6
125	-18088	1664089	121757	97115	377217

kable(head(pay %>% select(PAY_0,PAY_2, PAY_3, PAY_4, PAY_5, PAY_6) %>% filter(PAY_6 == max(pay$PAY_6))));

PAY_0	PAY_2	PAY_3	PAY_4	PAY_5	PAY_6
1	3	7	6	7	8
2	2	8	8	8	8

Which group has the highest/lowed credit limit? Which group has the most outstanding debt?

kable(head(dataset2 %>% select(AGE, EDUCATION, SEX, LIMIT_BAL) %>% filter(LIMIT_BAL == max(dataset2$LIMIT_BAL))));

AGE	EDUCATION	SEX	LIMIT_BAL
47	1	2	1e+06

kable(head(dataset2 %>% select(AGE, EDUCATION, SEX, LIMIT_BAL) %>% filter(LIMIT_BAL == min(dataset2$LIMIT_BAL))));

AGE	EDUCATION	SEX	LIMIT_BAL
22	2	1	10000
56	2	1	10000
23	3	1	10000
27	2	1	10000
24	2	1	10000
22	2	2	10000

What combination of age, gender, etc. is the least likely to default?

kable(head(dataset2 %>% select(AGE, EDUCATION, SEX, LIMIT_BAL, default.payment.next.month) %>% filter(default.payment.next.month == min(dataset2$default.payment.next.month))));

AGE	EDUCATION	SEX	LIMIT_BAL
34	2	2	9e+04
37	2	2	5e+04
57	2	1	5e+04
37	1	1	5e+04
29	1	1	5e+05
23	2	2	1e+05

IS607: Project 2

MUSA T. GANIYU

March 12, 2016