The Dataset Posted By Valerie Briot
Load saved dataset from Github.com
Analysis:
“free” app with most downloads per year
“paid” app with most downloads per year
Average downloads for each app since the release date for each provider
Average downloads for each app since the release dates across providers
require(dplyr);
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(knitr);
## Loading required package: knitr
data <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS607/master/CUNY%20607%20-%20Week%206.csv", header = TRUE, sep = ",", skip = 2);
kable(head(data))
| App.Name | Type | Paid.Free | Release.Date | X2010 | X2011 | X2012 | X2013 | X2014 | X2015 | Release.Date.1 | X2010.1 | X2011.1 | X2012.1 | X2013.1 | X2014.1 | X2015.1 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Candy Crush Saga | Game | Free | 4/12/2012 | NA | NA | 8 | 56 | 60 | 76 | 11/4/2012 | NA | NA | 2 | 53 | 64 | 72 |
| Fruit Ninja | Game | Free | 4/21/2010 | 4 | 8 | 58 | 102 | 126 | 148 | 7/10/2010 | 1 | 9 | 64 | 108 | 132 | 165 |
| Angry Birds | Game | Free | 12/11/2009 | 10 | 124 | 320 | 547 | 648 | 627 | 11/19/2010 | 2 | 108 | 312 | 538 | 647 | 656 |
| Subway Surfers | Game | Free | 5/24/2012 | NA | NA | 23 | 123 | 202 | 303 | 5/24/2012 | NA | NA | 26 | 128 | 236 | 329 |
| Despicable Me: Minion Rush | Game | Free | 6/10/2013 | NA | NA | NA | 16 | 58 | 128 | 6/10/2013 | NA | NA | NA | 18 | 64 | 294 |
| Clash of Clans | Game | Free | 8/2/2012 | NA | NA | 24 | 123 | 234 | 345 | 10/7/2013 | NA | NA | NA | 4 | 143 | 256 |
names(data)
## [1] "App.Name" "Type" "Paid.Free" "Release.Date"
## [5] "X2010" "X2011" "X2012" "X2013"
## [9] "X2014" "X2015" "Release.Date.1" "X2010.1"
## [13] "X2011.1" "X2012.1" "X2013.1" "X2014.1"
## [17] "X2015.1"
Separating the data into two for easy analysis.
google_play <- subset(data, select=c(Type, Paid.Free, Release.Date.1, X2010.1, X2011.1, X2012.1, X2013.1, X2014.1, X2015.1));
kable(head(google_play));
| Type | Paid.Free | Release.Date.1 | X2010.1 | X2011.1 | X2012.1 | X2013.1 | X2014.1 | X2015.1 |
|---|---|---|---|---|---|---|---|---|
| Game | Free | 11/4/2012 | NA | NA | 2 | 53 | 64 | 72 |
| Game | Free | 7/10/2010 | 1 | 9 | 64 | 108 | 132 | 165 |
| Game | Free | 11/19/2010 | 2 | 108 | 312 | 538 | 647 | 656 |
| Game | Free | 5/24/2012 | NA | NA | 26 | 128 | 236 | 329 |
| Game | Free | 6/10/2013 | NA | NA | NA | 18 | 64 | 294 |
| Game | Free | 10/7/2013 | NA | NA | NA | 4 | 143 | 256 |
apple_store <- subset(data, select=c(Type, Paid.Free, Release.Date.1, X2010, X2011, X2012, X2013, X2014, X2015));
kable(head(apple_store))
| Type | Paid.Free | Release.Date.1 | X2010 | X2011 | X2012 | X2013 | X2014 | X2015 |
|---|---|---|---|---|---|---|---|---|
| Game | Free | 11/4/2012 | NA | NA | 8 | 56 | 60 | 76 |
| Game | Free | 7/10/2010 | 4 | 8 | 58 | 102 | 126 | 148 |
| Game | Free | 11/19/2010 | 10 | 124 | 320 | 547 | 648 | 627 |
| Game | Free | 5/24/2012 | NA | NA | 23 | 123 | 202 | 303 |
| Game | Free | 6/10/2013 | NA | NA | NA | 16 | 58 | 128 |
| Game | Free | 10/7/2013 | NA | NA | 24 | 123 | 234 | 345 |
Replacing the NA’s to zero (0)
google_play <- google_play %>% mutate( X2010.1 = ifelse(is.na(X2010.1),0,X2010.1))
google_play <- google_play %>% mutate( X2011.1 = ifelse(is.na(X2011.1),0,X2011.1))
google_play <- google_play %>% mutate( X2012.1 = ifelse(is.na(X2012.1),0,X2012.1))
google_play <- google_play %>% mutate( X2013.1 = ifelse(is.na(X2013.1),0,X2013.1))
google_play <- google_play %>% mutate( X2014.1 = ifelse(is.na(X2014.1),0,X2014.1))
View(google_play)
apple_store <- apple_store %>% mutate( X2010 = ifelse(is.na(X2010),0,X2010))
apple_store <- apple_store %>% mutate( X2011 = ifelse(is.na(X2011),0,X2011))
apple_store <- apple_store %>% mutate( X2012 = ifelse(is.na(X2012),0,X2012))
apple_store <- apple_store %>% mutate( X2013 = ifelse(is.na(X2013),0,X2013))
apple_store <- apple_store %>% mutate( X2014 = ifelse(is.na(X2014),0,X2014))
kable(head(google_play))
| Type | Paid.Free | Release.Date.1 | X2010.1 | X2011.1 | X2012.1 | X2013.1 | X2014.1 | X2015.1 |
|---|---|---|---|---|---|---|---|---|
| Game | Free | 11/4/2012 | 0 | 0 | 2 | 53 | 64 | 72 |
| Game | Free | 7/10/2010 | 1 | 9 | 64 | 108 | 132 | 165 |
| Game | Free | 11/19/2010 | 2 | 108 | 312 | 538 | 647 | 656 |
| Game | Free | 5/24/2012 | 0 | 0 | 26 | 128 | 236 | 329 |
| Game | Free | 6/10/2013 | 0 | 0 | 0 | 18 | 64 | 294 |
| Game | Free | 10/7/2013 | 0 | 0 | 0 | 4 | 143 | 256 |
kable(head(apple_store))
| Type | Paid.Free | Release.Date.1 | X2010 | X2011 | X2012 | X2013 | X2014 | X2015 |
|---|---|---|---|---|---|---|---|---|
| Game | Free | 11/4/2012 | 0 | 0 | 8 | 56 | 60 | 76 |
| Game | Free | 7/10/2010 | 4 | 8 | 58 | 102 | 126 | 148 |
| Game | Free | 11/19/2010 | 10 | 124 | 320 | 547 | 648 | 627 |
| Game | Free | 5/24/2012 | 0 | 0 | 23 | 123 | 202 | 303 |
| Game | Free | 6/10/2013 | 0 | 0 | 0 | 16 | 58 | 128 |
| Game | Free | 10/7/2013 | 0 | 0 | 24 | 123 | 234 | 345 |
“free” app with most downloads per year Slicing the data for easy accesibilty
google_play2 <- slice(google_play, 1:10)
google_play3 <- slice(google_play, 11:12)
apple_store2 <- slice(apple_store, 1:10)
apple_store3 <- slice(apple_store, 11:12)
kable(google_play2 %>% select(Paid.Free,Release.Date.1,X2010.1, X2011.1, X2012.1, X2013.1, X2014.1, X2015.1) %>% filter(X2015.1 == max(google_play2$X2015.1)));
| Paid.Free | Release.Date.1 | X2010.1 | X2011.1 | X2012.1 | X2013.1 | X2014.1 | X2015.1 |
|---|---|---|---|---|---|---|---|
| Free | 11/19/2010 | 2 | 108 | 312 | 538 | 647 | 656 |
“paid” app with most downloads per year
kable(google_play3 %>% select(Paid.Free,Release.Date.1,X2010.1, X2011.1, X2012.1, X2013.1, X2014.1, X2015.1) %>% filter(X2015.1 == max(google_play3$X2015.1)));
| Paid.Free | Release.Date.1 | X2010.1 | X2011.1 | X2012.1 | X2013.1 | X2014.1 | X2015.1 |
|---|---|---|---|---|---|---|---|
| Paid | 11/7/2011 | 0 | 8 | 154 | 285 | 369 | 352 |
Average downloads for each app since the release date for each provider
kable(head(google_play %>% summarise_each(funs(mean), X2010.1,X2011.1,X2012.1, X2011.1,X2014.1, X2015.1)));
| X2010.1 | X2011.1 | X2012.1 | X2014.1 | X2015.1 |
|---|---|---|---|---|
| 12.41667 | 33.83333 | 81.91667 | 251.75 | 322.9167 |
“free” app with most downloads per year
kable(apple_store2 %>% select(Paid.Free,Release.Date.1,X2010, X2011, X2012, X2013, X2014, X2015) %>% filter(X2015 == max(apple_store2$X2015)));
| Paid.Free | Release.Date.1 | X2010 | X2011 | X2012 | X2013 | X2014 | X2015 |
|---|---|---|---|---|---|---|---|
| Free | 11/19/2010 | 10 | 124 | 320 | 547 | 648 | 627 |
“paid” app with most downloads per year
kable(apple_store3 %>% select(Paid.Free,Release.Date.1,X2010, X2011, X2012, X2013, X2014, X2015) %>% filter(X2015 == max(apple_store3$X2015)));
| Paid.Free | Release.Date.1 | X2010 | X2011 | X2012 | X2013 | X2014 | X2015 |
|---|---|---|---|---|---|---|---|
| Paid | 11/7/2011 | 0 | 4 | 120 | 240 | 320 | 340 |
Average downloads for each app since the release date for each provider
kable(head(apple_store %>% summarise_each(funs(mean), X2010,X2011,X2012, X2011,X2014, X2015)))
| X2010 | X2011 | X2012 | X2014 | X2015 |
|---|---|---|---|---|
| 13.08333 | 34.5 | 79.91667 | 243.9167 | 291.9167 |
DATASET TWO
WHAT IMPACT DOES BODY WEIGHT HAVE ON BRAIN WEIGHT?
The data records the average weight of the brain and body for a number of mammal species. There are 62 rows of data. The 3 data columns include: I, the index, A1, the brain weight; B, the body weight. We seek a model of the form: B = A1 * X1.
Load data from URL
require(knitr);
weights <- read.table("http://people.sc.fsu.edu/~jburkardt/datasets/regression/x01.txt", skip = 32, header = TRUE, sep = "")
kable(head(weights));
| Body | Weight |
|---|---|
| 3.385 | 44.5 |
| 0.480 | 15.5 |
| 1.350 | 8.1 |
| 465.000 | 423.0 |
| 36.330 | 119.5 |
| 27.660 | 115.0 |
names(weights);
## [1] "Body" "Weight"
options(warn=-1)
** Load require package.**
library(plyr);
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
Note the data columns came with Body & Weight, but its actually “BrainWeight &”BodyWeight" in the data, which needs correction.
names(weights)[names(weights)=="Body"] <- "BrainWeight";
names(weights)[names(weights)=="Weight"] <- "BodyWeight";
str(weights);
## 'data.frame': 62 obs. of 2 variables:
## $ BrainWeight: num 3.38 0.48 1.35 465 36.33 ...
## $ BodyWeight : num 44.5 15.5 8.1 423 119.5 ...
names(weights);
## [1] "BrainWeight" "BodyWeight"
kable(head(weights));
| BrainWeight | BodyWeight |
|---|---|
| 3.385 | 44.5 |
| 0.480 | 15.5 |
| 1.350 | 8.1 |
| 465.000 | 423.0 |
| 36.330 | 119.5 |
| 27.660 | 115.0 |
Lets checkout its histogram.
hist(weights$BrainWeight);
hist(weights$BodyWeight);
We can deduce that it a Very Rightly Skewed data, an intervention is required.Its getting the Natural logarithm of the data set.
weights_log <- cbind(weights, log(weights$BrainWeight), log(weights$BodyWeight));
kable(head(weights_log));
| BrainWeight | BodyWeight | log(weights$BrainWeight) | log(weights$BodyWeight) |
|---|---|---|---|
| 3.385 | 44.5 | 1.2193539 | 3.795489 |
| 0.480 | 15.5 | -0.7339692 | 2.740840 |
| 1.350 | 8.1 | 0.3001046 | 2.091864 |
| 465.000 | 423.0 | 6.1420374 | 6.047372 |
| 36.330 | 119.5 | 3.5926438 | 4.783316 |
| 27.660 | 115.0 | 3.3199873 | 4.744932 |
load required package and attach the data again and plot it in scatterplot 3D.
library(scatterplot3d);
attach(weights_log);
scatterplot3d(BrainWeight, BodyWeight, pch = 20, highlight.3d = TRUE, type = "h", main = "3D ScatterPlots");
Removing or seperating new column from initial data set.
weights_log[,c("BrainWeight","BodyWeight")] <- list(NULL);
colnames(weights_log);
## [1] "log(weights$BrainWeight)" "log(weights$BodyWeight)"
a <- plot(weights_log, ylab="Brain Weight",
plot.type="double", col=1:2, xlab="Body Weight")
legend("topleft", legend=c("Brain Weight","Body Weight"),
lty=1, col=c(1,2), cex=.8)
abline(a)
Changing the column name.
names(weights_log)[names(weights_log)=="log(weights$BrainWeight)"] <- "BrainWeights";
names(weights_log)[names(weights_log)=="log(weights$BodyWeight)"] <- "BodyWeight";
kable(head(weights_log));
| BrainWeights | BodyWeight |
|---|---|
| 1.2193539 | 3.795489 |
| -0.7339692 | 2.740840 |
| 0.3001046 | 2.091864 |
| 6.1420374 | 6.047372 |
| 3.5926438 | 4.783316 |
| 3.3199873 | 4.744932 |
Correlations and covariance shows a strong positive relationship between Brain Weights and Body weight
cor(weights_log, use="complete.obs", method="kendall")
## BrainWeights BodyWeight
## BrainWeights 1.0000000 0.8334657
## BodyWeight 0.8334657 1.0000000
Histogram for Body weight
x <- weights_log$BodyWeight;
hist(x,
xlim=c(min(x),max(x)), probability=T,
col='purple', xlab='Body Weight', ylab=' Frequency', axes=T,
main='Natural Logarithm: Multi-modal')
lines(density(x,bw=1), col='red', lwd=2)
Getting the mode of BodyWeight
mode_1 <- table(as.vector(x));
names(mode_1)[mode_1 == max(mode_1)];
## [1] "0" "2.50959926237837" "4.74493212836325"
Histogram for Brain weight
y <- weights_log$BrainWeight;
hist(y,
xlim=c(min(y),max(y)), probability=T,
col='purple', xlab='Brain Weight', ylab=' Frequency', axes=T,
main='Natural Logarithm: Bi-modal')
lines(density(y,bw=1), col='red', lwd=2)
kable(summary(weights_log));
| BrainWeights | BodyWeight | |
|---|---|---|
| Min. :-5.2983 | Min. :-1.966 | |
| 1st Qu.:-0.5203 | 1st Qu.: 1.442 | |
| Median : 1.2066 | Median : 2.848 | |
| Mean : 1.3375 | Mean : 3.140 | |
| 3rd Qu.: 3.8639 | 3rd Qu.: 5.111 | |
| Max. : 8.8030 | Max. : 8.650 |
Getting the mode of Brain Weight
mode_2 <- table(as.vector(y));
names(mode_2)[mode_2 == max(mode_2)];
## [1] "-3.77226106305299" "1.25276296849537"
Load required package forecast, for forecasting.
library(forecast);
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: timeDate
## This is forecast 6.2
Acf(weights_log$BodyWeight, lag.max=NULL, type=c("correlation", "partial"), plot=TRUE, main=NULL, xlim=NULL, ylim=NULL, xlab="Lag", ylab=NULL, na.action=na.contiguous);
The Credit Card Default Dataset Posted By Keith Folsom
The UCI Machine Learning Repository has a wide dataset containing credit card payments, owed and payed, for individuals spanning several months. The dataset includes variables such as gender, marital status, education, and age.
require(knitr);
require(dplyr);
require(tidyr);
## Loading required package: tidyr
dataset1 <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS607/cd55bba4605695b2cc07793517ae61fb05032e0a/Copy%20of%20default%20of%20credit%20card%20clients.csv", header = TRUE, sep = ",", skip = 1)
kable(head(dataset1))
| ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | PAY_5 | PAY_6 | BILL_AMT1 | BILL_AMT2 | BILL_AMT3 | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | default.payment.next.month |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 20000 | 2 | 2 | 1 | 24 | 2 | 2 | -1 | -1 | -2 | -2 | 3913 | 3102 | 689 | 0 | 0 | 0 | 0 | 689 | 0 | 0 | 0 | 0 | 1 |
| 2 | 120000 | 2 | 2 | 2 | 26 | -1 | 2 | 0 | 0 | 0 | 2 | 2682 | 1725 | 2682 | 3272 | 3455 | 3261 | 0 | 1000 | 1000 | 1000 | 0 | 2000 | 1 |
| 3 | 90000 | 2 | 2 | 2 | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 29239 | 14027 | 13559 | 14331 | 14948 | 15549 | 1518 | 1500 | 1000 | 1000 | 1000 | 5000 | 0 |
| 4 | 50000 | 2 | 2 | 1 | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 46990 | 48233 | 49291 | 28314 | 28959 | 29547 | 2000 | 2019 | 1200 | 1100 | 1069 | 1000 | 0 |
| 5 | 50000 | 1 | 2 | 1 | 57 | -1 | 0 | -1 | 0 | 0 | 0 | 8617 | 5670 | 35835 | 20940 | 19146 | 19131 | 2000 | 36681 | 10000 | 9000 | 689 | 679 | 0 |
| 6 | 50000 | 1 | 1 | 2 | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 64400 | 57069 | 57608 | 19394 | 19619 | 20024 | 2500 | 1815 | 657 | 1000 | 1000 | 800 | 0 |
1.) Create summary statistics on each group (age, gender, education, etc.).
summary(dataset1)
## ID LIMIT_BAL SEX EDUCATION
## Min. : 1 Min. : 10000 Min. :1.000 Min. :0.000
## 1st Qu.: 7501 1st Qu.: 50000 1st Qu.:1.000 1st Qu.:1.000
## Median :15000 Median : 140000 Median :2.000 Median :2.000
## Mean :15000 Mean : 167484 Mean :1.604 Mean :1.853
## 3rd Qu.:22500 3rd Qu.: 240000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :30000 Max. :1000000 Max. :2.000 Max. :6.000
## MARRIAGE AGE PAY_0 PAY_2
## Min. :0.000 Min. :21.00 Min. :-2.0000 Min. :-2.0000
## 1st Qu.:1.000 1st Qu.:28.00 1st Qu.:-1.0000 1st Qu.:-1.0000
## Median :2.000 Median :34.00 Median : 0.0000 Median : 0.0000
## Mean :1.552 Mean :35.49 Mean :-0.0167 Mean :-0.1338
## 3rd Qu.:2.000 3rd Qu.:41.00 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :3.000 Max. :79.00 Max. : 8.0000 Max. : 8.0000
## PAY_3 PAY_4 PAY_5 PAY_6
## Min. :-2.0000 Min. :-2.0000 Min. :-2.0000 Min. :-2.0000
## 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000
## Median : 0.0000 Median : 0.0000 Median : 0.0000 Median : 0.0000
## Mean :-0.1662 Mean :-0.2207 Mean :-0.2662 Mean :-0.2911
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. : 8.0000 Max. : 8.0000 Max. : 8.0000 Max. : 8.0000
## BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4
## Min. :-165580 Min. :-69777 Min. :-157264 Min. :-170000
## 1st Qu.: 3559 1st Qu.: 2985 1st Qu.: 2666 1st Qu.: 2327
## Median : 22382 Median : 21200 Median : 20089 Median : 19052
## Mean : 51223 Mean : 49179 Mean : 47013 Mean : 43263
## 3rd Qu.: 67091 3rd Qu.: 64006 3rd Qu.: 60165 3rd Qu.: 54506
## Max. : 964511 Max. :983931 Max. :1664089 Max. : 891586
## BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2
## Min. :-81334 Min. :-339603 Min. : 0 Min. : 0
## 1st Qu.: 1763 1st Qu.: 1256 1st Qu.: 1000 1st Qu.: 833
## Median : 18105 Median : 17071 Median : 2100 Median : 2009
## Mean : 40311 Mean : 38872 Mean : 5664 Mean : 5921
## 3rd Qu.: 50191 3rd Qu.: 49198 3rd Qu.: 5006 3rd Qu.: 5000
## Max. :927171 Max. : 961664 Max. :873552 Max. :1684259
## PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
## Min. : 0 Min. : 0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 390 1st Qu.: 296 1st Qu.: 252.5 1st Qu.: 117.8
## Median : 1800 Median : 1500 Median : 1500.0 Median : 1500.0
## Mean : 5226 Mean : 4826 Mean : 4799.4 Mean : 5215.5
## 3rd Qu.: 4505 3rd Qu.: 4013 3rd Qu.: 4031.5 3rd Qu.: 4000.0
## Max. :896040 Max. :621000 Max. :426529.0 Max. :528666.0
## default.payment.next.month
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2212
## 3rd Qu.:0.0000
## Max. :1.0000
** Selecting and splitting the dataset for each accesibilty**
Bill_amount <- select(dataset1, num_range("BILL_AMT", 1:6))
kable(head(Bill_amount))
| BILL_AMT1 | BILL_AMT2 | BILL_AMT3 | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 |
|---|---|---|---|---|---|
| 3913 | 3102 | 689 | 0 | 0 | 0 |
| 2682 | 1725 | 2682 | 3272 | 3455 | 3261 |
| 29239 | 14027 | 13559 | 14331 | 14948 | 15549 |
| 46990 | 48233 | 49291 | 28314 | 28959 | 29547 |
| 8617 | 5670 | 35835 | 20940 | 19146 | 19131 |
| 64400 | 57069 | 57608 | 19394 | 19619 | 20024 |
pay <- select(dataset1, num_range("PAY", 1:6))
kable(head(pay))
| ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | PAY_5 | PAY_6 | BILL_AMT1 | BILL_AMT2 | BILL_AMT3 | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | default.payment.next.month |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 20000 | 2 | 2 | 1 | 24 | 2 | 2 | -1 | -1 | -2 | -2 | 3913 | 3102 | 689 | 0 | 0 | 0 | 0 | 689 | 0 | 0 | 0 | 0 | 1 |
| 2 | 120000 | 2 | 2 | 2 | 26 | -1 | 2 | 0 | 0 | 0 | 2 | 2682 | 1725 | 2682 | 3272 | 3455 | 3261 | 0 | 1000 | 1000 | 1000 | 0 | 2000 | 1 |
| 3 | 90000 | 2 | 2 | 2 | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 29239 | 14027 | 13559 | 14331 | 14948 | 15549 | 1518 | 1500 | 1000 | 1000 | 1000 | 5000 | 0 |
| 4 | 50000 | 2 | 2 | 1 | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 46990 | 48233 | 49291 | 28314 | 28959 | 29547 | 2000 | 2019 | 1200 | 1100 | 1069 | 1000 | 0 |
| 5 | 50000 | 1 | 2 | 1 | 57 | -1 | 0 | -1 | 0 | 0 | 0 | 8617 | 5670 | 35835 | 20940 | 19146 | 19131 | 2000 | 36681 | 10000 | 9000 | 689 | 679 | 0 |
| 6 | 50000 | 1 | 1 | 2 | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 64400 | 57069 | 57608 | 19394 | 19619 | 20024 | 2500 | 1815 | 657 | 1000 | 1000 | 800 | 0 |
pay_amount <- select(dataset1, num_range("PAY_AMT", 1:6))
kable(head(pay_amount))
| PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 |
|---|---|---|---|---|---|
| 0 | 689 | 0 | 0 | 0 | 0 |
| 0 | 1000 | 1000 | 1000 | 0 | 2000 |
| 1518 | 1500 | 1000 | 1000 | 1000 | 5000 |
| 2000 | 2019 | 1200 | 1100 | 1069 | 1000 |
| 2000 | 36681 | 10000 | 9000 | 689 | 679 |
| 2500 | 1815 | 657 | 1000 | 1000 | 800 |
dataset2 <- dataset1[, c(1, 2, 3, 4, 5, 6, 25)]
kable(head(dataset2))
| ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | default.payment.next.month |
|---|---|---|---|---|---|---|
| 1 | 20000 | 2 | 2 | 1 | 24 | 1 |
| 2 | 120000 | 2 | 2 | 2 | 26 | 1 |
| 3 | 90000 | 2 | 2 | 2 | 34 | 0 |
| 4 | 50000 | 2 | 2 | 1 | 37 | 0 |
| 5 | 50000 | 1 | 2 | 1 | 57 | 0 |
| 6 | 50000 | 1 | 1 | 2 | 37 | 0 |
| **2.) | Which group | has t | he highest/l | owed credit | limit | ?** |
kable(head(pay_amount %>% select(PAY_AMT1, PAY_AMT2, PAY_AMT3, PAY_AMT4, PAY_AMT5, PAY_AMT6) %>% filter(PAY_AMT3 == max(pay_amount$PAY_AMT3))));
| PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 |
|---|---|---|---|---|---|
| 50784 | 50723 | 896040 | 50000 | 50000 | 50256 |
kable(head(Bill_amount %>% select(BILL_AMT1, BILL_AMT2, BILL_AMT3, BILL_AMT4, BILL_AMT5, BILL_AMT6) %>% filter(BILL_AMT3 == max(Bill_amount$BILL_AMT3))))
| BILL_AMT1 | BILL_AMT2 | BILL_AMT3 | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 |
|---|---|---|---|---|---|
| 125 | -18088 | 1664089 | 121757 | 97115 | 377217 |
kable(head(pay %>% select(PAY_0,PAY_2, PAY_3, PAY_4, PAY_5, PAY_6) %>% filter(PAY_6 == max(pay$PAY_6))));
| PAY_0 | PAY_2 | PAY_3 | PAY_4 | PAY_5 | PAY_6 |
|---|---|---|---|---|---|
| 1 | 3 | 7 | 6 | 7 | 8 |
| 2 | 2 | 8 | 8 | 8 | 8 |
Which group has the highest/lowed credit limit? Which group has the most outstanding debt?
kable(head(dataset2 %>% select(AGE, EDUCATION, SEX, LIMIT_BAL) %>% filter(LIMIT_BAL == max(dataset2$LIMIT_BAL))));
| AGE | EDUCATION | SEX | LIMIT_BAL |
|---|---|---|---|
| 47 | 1 | 2 | 1e+06 |
kable(head(dataset2 %>% select(AGE, EDUCATION, SEX, LIMIT_BAL) %>% filter(LIMIT_BAL == min(dataset2$LIMIT_BAL))));
| AGE | EDUCATION | SEX | LIMIT_BAL |
|---|---|---|---|
| 22 | 2 | 1 | 10000 |
| 56 | 2 | 1 | 10000 |
| 23 | 3 | 1 | 10000 |
| 27 | 2 | 1 | 10000 |
| 24 | 2 | 1 | 10000 |
| 22 | 2 | 2 | 10000 |
What combination of age, gender, etc. is the least likely to default?
kable(head(dataset2 %>% select(AGE, EDUCATION, SEX, LIMIT_BAL, default.payment.next.month) %>% filter(default.payment.next.month == min(dataset2$default.payment.next.month))));
| AGE | EDUCATION | SEX | LIMIT_BAL | default.payment.next.month |
|---|---|---|---|---|
| 34 | 2 | 2 | 9e+04 | 0 |
| 37 | 2 | 2 | 5e+04 | 0 |
| 57 | 2 | 1 | 5e+04 | 0 |
| 37 | 1 | 1 | 5e+04 | 0 |
| 29 | 1 | 1 | 5e+05 | 0 |
| 23 | 2 | 2 | 1e+05 | 0 |