data(CreditCard)
diagnose(CreditCard)
## # A tibble: 12 × 6
## variables types missing_count missing_percent unique_count unique_rate
## <chr> <chr> <int> <dbl> <int> <dbl>
## 1 card factor 0 0 2 0.00152
## 2 reports numeric 0 0 13 0.00986
## 3 age numeric 0 0 418 0.317
## 4 income numeric 0 0 431 0.327
## 5 share numeric 0 0 1162 0.881
## 6 expenditure numeric 0 0 981 0.744
## 7 owner factor 0 0 2 0.00152
## 8 selfemp factor 0 0 2 0.00152
## 9 dependents numeric 0 0 7 0.00531
## 10 months numeric 0 0 193 0.146
## 11 majorcards numeric 0 0 2 0.00152
## 12 active numeric 0 0 35 0.0265
diagnose_numeric(CreditCard)
## variables min Q1 mean median Q3
## 1 reports 0.0000000000 0.000000000 0.45640637 0.00000000 0.00000000
## 2 age 0.1666667000 25.416670000 33.21310325 31.25000000 39.41667000
## 3 income 0.2100000000 2.243750000 3.36537604 2.90000000 4.00000000
## 4 share 0.0001090909 0.002315922 0.06873217 0.03882722 0.09361682
## 5 expenditure 0.0000000000 4.583333000 185.05707078 101.29830000 249.03580000
## 6 dependents 0.0000000000 0.000000000 0.99393480 1.00000000 2.00000000
## 7 months 0.0000000000 12.000000000 55.26762699 30.00000000 72.00000000
## 8 majorcards 0.0000000000 1.000000000 0.81728582 1.00000000 1.00000000
## 9 active 0.0000000000 2.000000000 6.99696740 6.00000000 11.00000000
## max zero minus outlier
## 1 14.0000000 1060 0 259
## 2 83.5000000 0 0 28
## 3 13.5000000 0 0 77
## 4 0.9063205 0 0 74
## 5 3099.5050000 317 0 75
## 6 6.0000000 659 0 7
## 7 540.0000000 3 0 95
## 8 1.0000000 241 0 241
## 9 46.0000000 219 0 18
ggplot(CreditCard, aes(x=income, y=expenditure)) +
geom_boxplot(alpha=0.7) +
stat_summary(fun="mean", geom="point", shape=20, size=5, color="red", fill="red") +
geom_jitter() +
facet_grid(~card) +
scale_fill_brewer(palette="Set1")
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
no_riskers<-filter(CreditCard, card=="no")
ggplot(no_riskers, aes(x=income, y=expenditure)) +
stat_summary(fun="median", geom="point", shape=20, size=5, color="red", fill="red") +
geom_jitter() +
scale_fill_brewer(palette="Set1")
Early faceting shows that expenditures of people with credit risk can
have crazy values, however most of them are rather low (right-side
asymetry), meanwhile expenditures of people with no risk are much lover
compared to those with risk(usualy equal 0), income is propably not
different so much. To check it I will find quantiles.
no_riskers<-filter(CreditCard, card=="no")
yes_riskers<-filter(CreditCard, card=="yes")
quantile(no_riskers$income,probs=c(0,0.1,0.25,0.5,0.75,0.95,1),na.rm=TRUE)
## 0% 10% 25% 50% 75% 95% 100%
## 0.490 1.676 2.000 2.590 3.625 6.125 11.000
quantile(yes_riskers$income,probs=c(0,0.1,0.25,0.5,0.75,0.95,1),na.rm=TRUE)
## 0% 10% 25% 50% 75% 95% 100%
## 0.21 1.90 2.35 3.00 4.00 7.00 13.50
Because Median and other quantiles are generaly a little higher for people with risk, we can assume they generaly earn a little more. Also income looks to have right-sided asymetry
diagnose_outlier(CreditCard)
## variables outliers_cnt outliers_ratio outliers_mean with_mean
## 1 reports 259 19.6360879 2.3243243 0.45640637
## 2 age 28 2.1228203 49.8065473 33.21310325
## 3 income 77 5.8377559 8.2859974 3.36537604
## 4 share 74 5.6103108 0.3540334 0.06873217
## 5 expenditure 75 5.6861259 999.3246040 185.05707078
## 6 dependents 7 0.5307051 6.0000000 0.99393480
## 7 months 95 7.2024261 237.7263158 55.26762699
## 8 majorcards 241 18.2714177 0.0000000 0.81728582
## 9 active 18 1.3646702 29.8888889 6.99696740
## without_mean
## 1 0.00000000
## 2 32.85321446
## 3 3.06031337
## 4 0.05177451
## 5 135.96537866
## 6 0.96722561
## 7 41.10620915
## 8 1.00000000
## 9 6.68024596
Q <- quantile(CreditCard$expenditure, probs=c(.25, .75), na.rm = FALSE)
iqr <- IQR(CreditCard$expenditure)
CreditCard_without_Expout<- subset(CreditCard, CreditCard$expenditure > (Q[1] - 1.5*iqr) & CreditCard$expenditure < (Q[2]+1.5*iqr))
diagnose_outlier(CreditCard_without_Expout)
## variables outliers_cnt outliers_ratio outliers_mean with_mean
## 1 reports 242 19.453376 2.4008264 0.46704180
## 2 age 28 2.250804 49.8065473 33.17597793
## 3 income 68 5.466238 7.9964765 3.27191479
## 4 share 51 4.099678 0.2510199 0.05477854
## 5 expenditure 26 2.090032 564.3096769 135.96537866
## 6 dependents 7 0.562701 6.0000000 0.97427653
## 7 months 93 7.475884 237.3225806 55.77331190
## 8 majorcards 234 18.810289 0.0000000 0.81189711
## 9 active 15 1.205788 30.0000000 6.91639871
## without_mean
## 1 0.00000000
## 2 32.79303719
## 3 2.99872585
## 4 0.04638934
## 5 126.82174011
## 6 0.94583670
## 7 41.10425717
## 8 1.00000000
## 9 6.63466233
density.i <- ggdensity(CreditCard, x = "income",
fill = "card", palette = "jco")+
stat_overlay_normal_density(color = "red", linetype = "dashed")
density.e <- ggdensity(CreditCard_without_Expout, x = "expenditure",
fill = "card", palette = "jco")+
stat_overlay_normal_density(color = "red", linetype = "dashed")
density.i
CreditCard_without_Expout_andZeros<- subset(CreditCard_without_Expout, CreditCard_without_Expout$expenditure > 0)
density.e <- ggdensity(CreditCard_without_Expout_andZeros, x = "expenditure",
fill = "card", palette = "jco")+
stat_overlay_normal_density(color = "red", linetype = "dashed")
density.e
# As expected expenditures of people with risk have rigth-sided
asymetry. Whats interesting all expenditures of people with no risk have
gone, so I guess we can assume they are equal to 0(without outliers) #
Both income and expenditure have rigth sided asymetry. Income of both
groups is quiet similar, a little bit higher for people with risk.
Meanwhile expenditures are very different: people with risk have
expenditures higher than income, with some outliers of crazy high
values; people with no risk have expenditures mostly equal to 0.
summary_list <- split(CreditCard, CreditCard$card)
lapply(summary_list, summary)
## $no
## card reports age income
## no :296 Min. : 0.000 Min. : 0.75 Min. : 0.490
## yes: 0 1st Qu.: 0.000 1st Qu.:25.65 1st Qu.: 2.000
## Median : 1.000 Median :31.83 Median : 2.590
## Mean : 1.588 Mean :33.20 Mean : 3.069
## 3rd Qu.: 2.000 3rd Qu.:38.06 3rd Qu.: 3.625
## Max. :14.000 Max. :80.17 Max. :11.000
## share expenditure owner selfemp dependents
## Min. :0.0001091 Min. :0 no :206 no :268 Min. :0.000
## 1st Qu.:0.0003311 1st Qu.:0 yes: 90 yes: 28 1st Qu.:0.000
## Median :0.0004633 Median :0 Median :1.000
## Mean :0.0004768 Mean :0 Mean :1.078
## 3rd Qu.:0.0006000 3rd Qu.:0 3rd Qu.:2.000
## Max. :0.0024490 Max. :0 Max. :6.000
## months majorcards active
## Min. : 0.0 Min. :0.0000 Min. : 0.000
## 1st Qu.: 14.0 1st Qu.:0.0000 1st Qu.: 0.000
## Median : 36.0 Median :1.0000 Median : 4.000
## Mean : 55.3 Mean :0.7399 Mean : 6.054
## 3rd Qu.: 63.0 3rd Qu.:1.0000 3rd Qu.: 9.000
## Max. :528.0 Max. :1.0000 Max. :46.000
##
## $yes
## card reports age income
## no : 0 Min. :0.000 Min. : 0.1667 Min. : 0.210
## yes:1023 1st Qu.:0.000 1st Qu.:25.3333 1st Qu.: 2.350
## Median :0.000 Median :31.0833 Median : 3.000
## Mean :0.129 Mean :33.2160 Mean : 3.451
## 3rd Qu.:0.000 3rd Qu.:39.7083 3rd Qu.: 4.000
## Max. :4.000 Max. :83.5000 Max. :13.500
## share expenditure owner selfemp dependents
## Min. :0.000186 Min. : 0.00 no :532 no :960 Min. :0.0000
## 1st Qu.:0.026635 1st Qu.: 66.52 yes:491 yes: 63 1st Qu.:0.0000
## Median :0.060209 Median : 150.18 Median :0.0000
## Mean :0.088482 Mean : 238.60 Mean :0.9697
## 3rd Qu.:0.113858 3rd Qu.: 313.96 3rd Qu.:2.0000
## Max. :0.906320 Max. :3099.51 Max. :6.0000
## months majorcards active
## Min. : 0.00 Min. :0.0000 Min. : 0.00
## 1st Qu.: 12.00 1st Qu.:1.0000 1st Qu.: 2.00
## Median : 30.00 Median :1.0000 Median : 6.00
## Mean : 55.26 Mean :0.8397 Mean : 7.27
## 3rd Qu.: 72.00 3rd Qu.:1.0000 3rd Qu.:11.00
## Max. :540.00 Max. :1.0000 Max. :31.00
#describeBy(CreditCard, group = CreditCard$status, mat = TRUE)
CreditCard %>%
group_by(card) %>%
summarize(skewness_share = skewness(share, na.rm = TRUE),
kurtosis_share = kurtosis(share, na.rm = TRUE),
skewness_expenditure = skewness(expenditure, na.rm = TRUE),
kurtosis_expenditure = kurtosis(expenditure, na.rm = TRUE))
## # A tibble: 2 × 5
## card skewness_share kurtosis_share skewness_expenditure kurtosis_expenditure
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 no 2.76 26.2 NaN NaN
## 2 yes 3.05 17.9 3.57 23.1
#.Plot Lorenz curve and report the Gini coefficient
CreditCard <- CreditCard %>% arrange(share)
CreditCard <- CreditCard %>%
mutate(cum_share = cumsum(share) / sum(share),
cum_population = seq_along(share) / n())
ggplot(CreditCard, aes(x = cum_population, y = cum_share)) +
geom_line(color = "blue") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
labs(title = "Lorenz Curve for Credit Card Expenditure",
x = "Cumulative Proportion of Population",
y = "Cumulative Share of Credit Card Expenditure") +
theme_minimal()
gini_coefficient <- 1 - 2 * sum(CreditCard$cum_share * CreditCard$cum_population)
print(paste("Gini Coefficient:", gini_coefficient))
## [1] "Gini Coefficient: -412.102560732661"
summary_stats <- CreditCard %>%
group_by(card) %>%
summarize(median_share = median(share, na.rm = TRUE),
mean_share = mean(share, na.rm = TRUE))
boxplot <- ggplot(CreditCard, aes(x = factor(card), y = share, fill = factor(card))) +
geom_boxplot() +
labs(title = "Boxplot of Credit Card Expenditure by Card Status",
x = "Card Status",
y = "Share of Credit Card Expenditure") +
theme_minimal()
strip_chart <- ggplot(CreditCard, aes(x = factor(card), y = share, color = factor(card))) +
geom_jitter(width = 0.2, alpha = 0.7) +
labs(title = "Strip Chart of Credit Card Expenditure by Card Status",
x = "Card Status",
y = "Share of Credit Card Expenditure") +
theme_minimal()
print(summary_stats)
## # A tibble: 2 × 3
## card median_share mean_share
## <fct> <dbl> <dbl>
## 1 no 0.000463 0.000477
## 2 yes 0.0602 0.0885
print(boxplot)
print(strip_chart)
#.Add kernel density estimation to your histograms
#boxplots with KDE
boxplot_kde <- ggplot(CreditCard, aes(x = factor(card), y = share, fill = factor(card))) +
geom_boxplot() +
geom_density(aes(color = factor(card)), alpha = 0.5) + # Add KDE
labs(title = "Boxplot with KDE of Credit Card Expenditure by Card Status",
x = "Card Status",
y = "Share of Credit Card Expenditure") +
theme_minimal()
#strip charts with KDE
strip_chart_kde <- ggplot(CreditCard, aes(x = factor(card), y = share, color = factor(card))) +
geom_jitter(width = 0.2, alpha = 0.7) +
geom_density(aes(fill = factor(card)), alpha = 0.5) + # Add KDE
labs(title = "Strip Chart with KDE of Credit Card Expenditure by Card Status",
x = "Card Status",
y = "Share of Credit Card Expenditure") +
theme_minimal()
print(boxplot_kde)
print(strip_chart_kde)
#.Be aware of the level of measurement of each variable, missing values and outliers
#Checking for missing values
missing_values <- sum(is.na(CreditCard))
print(paste("Missing values:", missing_values))
## [1] "Missing values: 0"
median_share <- median(CreditCard$share, na.rm = TRUE)
CreditCard$share[is.na(CreditCard$share)] <- median_share
boxplot_kde <- ggplot(CreditCard, aes(x = factor(card), y = share, fill = factor(card))) +
geom_boxplot(coef = 1.5) + # Adjust coefficient for outlier detection
geom_density(aes(color = factor(card)), alpha = 0.5) +
geom_rug(aes(color = factor(card)), sides = "b") + # Add rug plots
labs(title = "Boxplot with KDE of Credit Card Expenditure by Card Status",
x = "Card Status",
y = "Share of Credit Card Expenditure (Median Imputed)") +
theme_minimal()
#What about robust measures of skewness and kurtosis?
robust_measures <- CreditCard %>%
group_by(card) %>%
summarize(
median_share = median(share),
mean_share = mean(share),
median_skewness = median((share - median_share)^3) / (median((share - median_share)^2)^(3/2)),
robust_kurtosis = median((share - median_share)^4) / (median((share - median_share)^2)^2)
)
print(robust_measures)
## # A tibble: 2 × 5
## card median_share mean_share median_skewness robust_kurtosis
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 no 0.000463 0.000477 0 1
## 2 yes 0.0602 0.0885 0 1
boxplot_kde_robust <- ggplot(CreditCard, aes(x = factor(card), y = share, fill = factor(card))) +
geom_boxplot(coef = 1.5) + # Adjust coefficient for outlier detection
geom_density(aes(color = factor(card)), alpha = 0.5) +
geom_rug(aes(color = factor(card)), sides = "b") + # Add rug plots
geom_text(data = robust_measures, aes(label = paste("Robust Skewness:", round(median_skewness, 2),
"\nRobust Kurtosis:", round(robust_kurtosis, 2))),
y = Inf, vjust = 2, hjust = 0) +
labs(title = "Boxplot with KDE of Credit Card Expenditure by Card Status",
x = "Card Status",
y = "Share of Credit Card Expenditure") +
theme_minimal()
strip_chart_kde_robust <- ggplot(CreditCard, aes(x = factor(card), y = share, color = factor(card))) +
geom_jitter(width = 0.2, alpha = 0.7) +
geom_density(aes(fill = factor(card)), alpha = 0.5) +
geom_rug(aes(color = factor(card)), sides = "b") +
geom_text(data = robust_measures, aes(label = paste("Robust Skewness:", round(median_skewness, 2),
"\nRobust Kurtosis:", round(robust_kurtosis, 2))),
y = Inf, vjust = 2, hjust = 0) +
labs(title = "Strip Chart with KDE of Credit Card Expenditure by Card Status",
x = "Card Status",
y = "Share of Credit Card Expenditure") +
theme_minimal()
print(boxplot_kde_robust)
print(strip_chart_kde_robust)
#Prepare your own descriptive analysis for the “CreditCard” dataset (AER package). It is a cross-sectional dataframe on the credit history for a sample of applicants for a type of credit card.
summary_stats <- CreditCard %>%
summarize(
mean_age = mean(age, na.rm = TRUE),
median_age = median(age, na.rm = TRUE),
mean_income = mean(income, na.rm = TRUE),
median_income = median(income, na.rm = TRUE),
mean_share = mean(share, na.rm = TRUE),
median_share = median(share, na.rm = TRUE),
mean_expenditure = mean(expenditure, na.rm = TRUE),
median_expenditure = median(expenditure, na.rm = TRUE)
)
print(summary_stats)
## mean_age median_age mean_income median_income mean_share median_share
## 1 33.2131 31.25 3.365376 2.9 0.06873217 0.03882722
## mean_expenditure median_expenditure
## 1 185.0571 101.2983
hist_kde_age <- ggplot(CreditCard, aes(x = age)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "skyblue", color = "black") +
geom_density(alpha = 0.5, fill = "blue") +
labs(title = "Distribution of Age",
x = "Age",
y = "Density") +
theme_minimal()
hist_kde_income <- ggplot(CreditCard, aes(x = income)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "skyblue", color = "black") +
geom_density(alpha = 0.5, fill = "blue") +
labs(title = "Distribution of Income",
x = "Income",
y = "Density") +
theme_minimal()
boxplot_share_expenditure <- ggplot(CreditCard, aes(x = 1, y = share)) +
geom_boxplot(fill = "skyblue") +
labs(title = "Boxplot of Share and Expenditure",
x = "",
y = "Value") +
theme_minimal()
print(hist_kde_age)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(hist_kde_income)
print(boxplot_share_expenditure)
robust_measures_share <- CreditCard %>%
summarize(
median_share = median(share),
robust_skewness = median((share - median_share)^3) / (median((share - median_share)^2)^(3/2)),
robust_kurtosis = median((share - median_share)^4) / (median((share - median_share)^2)^2)
)
print(robust_measures_share)
## median_share robust_skewness robust_kurtosis
## 1 0.03882722 0 1
cor_matrix <- CreditCard %>%
select(age, income, share, expenditure) %>%
cor()
print(cor_matrix)
## age income share expenditure
## age 1.0000000 0.32465320 -0.11569704 0.0149477
## income 0.3246532 1.00000000 -0.05442926 0.2811040
## share -0.1156970 -0.05442926 1.00000000 0.8387793
## expenditure 0.0149477 0.28110402 0.83877932 1.0000000
#!ggcorrplot::ggcorrplot(cor_matrix, method = "circle", lab = TRUE)
#.Are the yearly incomes (in USD 10,000), credit card expenditures, age, ratio of monthly credit card expenditure to yearly income - significantly different for applicants for customers with different credit risk (“card” variable - factor)?
anova_income <- aov(income ~ card, data = CreditCard)
print(summary(anova_income))
## Df Sum Sq Mean Sq F value Pr(>F)
## card 1 34 33.63 11.82 0.000605 ***
## Residuals 1317 3748 2.85
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_expenditure <- aov(expenditure ~ card, data = CreditCard)
print(summary(anova_expenditure))
## Df Sum Sq Mean Sq F value Pr(>F)
## card 1 13069899 13069899 203.5 <2e-16 ***
## Residuals 1317 84598038 64235
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_age <- aov(age ~ card, data = CreditCard)
print(summary(anova_age))
## Df Sum Sq Mean Sq F value Pr(>F)
## card 1 0 0.04 0 0.984
## Residuals 1317 135591 102.95
anova_share <- aov(share ~ card, data = CreditCard)
print(summary(anova_share))
## Df Sum Sq Mean Sq F value Pr(>F)
## card 1 1.778 1.7780 233.4 <2e-16 ***
## Residuals 1317 10.031 0.0076
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
kruskal_income <- kruskal.test(income ~ card, data = CreditCard)
print(kruskal_income)
##
## Kruskal-Wallis rank sum test
##
## data: income by card
## Kruskal-Wallis chi-squared = 21.758, df = 1, p-value = 3.093e-06
kruskal_expenditure <- kruskal.test(expenditure ~ card, data = CreditCard)
print(kruskal_expenditure)
##
## Kruskal-Wallis rank sum test
##
## data: expenditure by card
## Kruskal-Wallis chi-squared = 669.53, df = 1, p-value < 2.2e-16
kruskal_age <- kruskal.test(age ~ card, data = CreditCard)
print(kruskal_age)
##
## Kruskal-Wallis rank sum test
##
## data: age by card
## Kruskal-Wallis chi-squared = 0.0052081, df = 1, p-value = 0.9425
kruskal_share <- kruskal.test(share ~ card, data = CreditCard)
print(kruskal_share)
##
## Kruskal-Wallis rank sum test
##
## data: share by card
## Kruskal-Wallis chi-squared = 660.43, df = 1, p-value < 2.2e-16