Univariate Analysis Report

DATA

data(CreditCard)
diagnose(CreditCard)

## # A tibble: 12 × 6
##    variables   types   missing_count missing_percent unique_count unique_rate
##    <chr>       <chr>           <int>           <dbl>        <int>       <dbl>
##  1 card        factor              0               0            2     0.00152
##  2 reports     numeric             0               0           13     0.00986
##  3 age         numeric             0               0          418     0.317  
##  4 income      numeric             0               0          431     0.327  
##  5 share       numeric             0               0         1162     0.881  
##  6 expenditure numeric             0               0          981     0.744  
##  7 owner       factor              0               0            2     0.00152
##  8 selfemp     factor              0               0            2     0.00152
##  9 dependents  numeric             0               0            7     0.00531
## 10 months      numeric             0               0          193     0.146  
## 11 majorcards  numeric             0               0            2     0.00152
## 12 active      numeric             0               0           35     0.0265

diagnose_numeric(CreditCard)

##     variables          min           Q1         mean       median           Q3
## 1     reports 0.0000000000  0.000000000   0.45640637   0.00000000   0.00000000
## 2         age 0.1666667000 25.416670000  33.21310325  31.25000000  39.41667000
## 3      income 0.2100000000  2.243750000   3.36537604   2.90000000   4.00000000
## 4       share 0.0001090909  0.002315922   0.06873217   0.03882722   0.09361682
## 5 expenditure 0.0000000000  4.583333000 185.05707078 101.29830000 249.03580000
## 6  dependents 0.0000000000  0.000000000   0.99393480   1.00000000   2.00000000
## 7      months 0.0000000000 12.000000000  55.26762699  30.00000000  72.00000000
## 8  majorcards 0.0000000000  1.000000000   0.81728582   1.00000000   1.00000000
## 9      active 0.0000000000  2.000000000   6.99696740   6.00000000  11.00000000
##            max zero minus outlier
## 1   14.0000000 1060     0     259
## 2   83.5000000    0     0      28
## 3   13.5000000    0     0      77
## 4    0.9063205    0     0      74
## 5 3099.5050000  317     0      75
## 6    6.0000000  659     0       7
## 7  540.0000000    3     0      95
## 8    1.0000000  241     0     241
## 9   46.0000000  219     0      18

ggplot(CreditCard, aes(x=income, y=expenditure)) +
    geom_boxplot(alpha=0.7) +
    stat_summary(fun="mean", geom="point", shape=20, size=5, color="red", fill="red") +
 geom_jitter() +
    facet_grid(~card) +
    scale_fill_brewer(palette="Set1")

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

no_riskers<-filter(CreditCard, card=="no")

ggplot(no_riskers, aes(x=income, y=expenditure)) +
    stat_summary(fun="median", geom="point", shape=20, size=5, color="red", fill="red") +
 geom_jitter() +
    scale_fill_brewer(palette="Set1")

Early faceting shows that expenditures of people with credit risk can have crazy values, however most of them are rather low (right-side asymetry), meanwhile expenditures of people with no risk are much lover compared to those with risk(usualy equal 0), income is propably not different so much. To check it I will find quantiles.

no_riskers<-filter(CreditCard, card=="no")
yes_riskers<-filter(CreditCard, card=="yes")
quantile(no_riskers$income,probs=c(0,0.1,0.25,0.5,0.75,0.95,1),na.rm=TRUE)

##     0%    10%    25%    50%    75%    95%   100% 
##  0.490  1.676  2.000  2.590  3.625  6.125 11.000

quantile(yes_riskers$income,probs=c(0,0.1,0.25,0.5,0.75,0.95,1),na.rm=TRUE)

##    0%   10%   25%   50%   75%   95%  100% 
##  0.21  1.90  2.35  3.00  4.00  7.00 13.50

Because Median and other quantiles are generaly a little higher for people with risk, we can assume they generaly earn a little more. Also income looks to have right-sided asymetry

Outliers

diagnose_outlier(CreditCard)

##     variables outliers_cnt outliers_ratio outliers_mean    with_mean
## 1     reports          259     19.6360879     2.3243243   0.45640637
## 2         age           28      2.1228203    49.8065473  33.21310325
## 3      income           77      5.8377559     8.2859974   3.36537604
## 4       share           74      5.6103108     0.3540334   0.06873217
## 5 expenditure           75      5.6861259   999.3246040 185.05707078
## 6  dependents            7      0.5307051     6.0000000   0.99393480
## 7      months           95      7.2024261   237.7263158  55.26762699
## 8  majorcards          241     18.2714177     0.0000000   0.81728582
## 9      active           18      1.3646702    29.8888889   6.99696740
##   without_mean
## 1   0.00000000
## 2  32.85321446
## 3   3.06031337
## 4   0.05177451
## 5 135.96537866
## 6   0.96722561
## 7  41.10620915
## 8   1.00000000
## 9   6.68024596

Q <- quantile(CreditCard$expenditure, probs=c(.25, .75), na.rm = FALSE)
iqr <- IQR(CreditCard$expenditure)
CreditCard_without_Expout<- subset(CreditCard, CreditCard$expenditure > (Q[1] - 1.5*iqr) & CreditCard$expenditure < (Q[2]+1.5*iqr))
diagnose_outlier(CreditCard_without_Expout)

##     variables outliers_cnt outliers_ratio outliers_mean    with_mean
## 1     reports          242      19.453376     2.4008264   0.46704180
## 2         age           28       2.250804    49.8065473  33.17597793
## 3      income           68       5.466238     7.9964765   3.27191479
## 4       share           51       4.099678     0.2510199   0.05477854
## 5 expenditure           26       2.090032   564.3096769 135.96537866
## 6  dependents            7       0.562701     6.0000000   0.97427653
## 7      months           93       7.475884   237.3225806  55.77331190
## 8  majorcards          234      18.810289     0.0000000   0.81189711
## 9      active           15       1.205788    30.0000000   6.91639871
##   without_mean
## 1   0.00000000
## 2  32.79303719
## 3   2.99872585
## 4   0.04638934
## 5 126.82174011
## 6   0.94583670
## 7  41.10425717
## 8   1.00000000
## 9   6.63466233

Density PLOT - “price_PLN”

density.i <- ggdensity(CreditCard, x = "income", 
                       fill = "card", palette = "jco")+
  stat_overlay_normal_density(color = "red", linetype = "dashed")

density.e <- ggdensity(CreditCard_without_Expout, x = "expenditure", 
                       fill = "card", palette = "jco")+
  stat_overlay_normal_density(color = "red", linetype = "dashed")
density.i

CreditCard_without_Expout_andZeros<- subset(CreditCard_without_Expout, CreditCard_without_Expout$expenditure > 0)

density.e <- ggdensity(CreditCard_without_Expout_andZeros, x = "expenditure", 
                       fill = "card", palette = "jco")+
  stat_overlay_normal_density(color = "red", linetype = "dashed")
density.e

# As expected expenditures of people with risk have rigth-sided asymetry. Whats interesting all expenditures of people with no risk have gone, so I guess we can assume they are equal to 0(without outliers) # Both income and expenditure have rigth sided asymetry. Income of both groups is quiet similar, a little bit higher for people with risk. Meanwhile expenditures are very different: people with risk have expenditures higher than income, with some outliers of crazy high values; people with no risk have expenditures mostly equal to 0.

ADDITIONAL ANALYSIS

summary_list <- split(CreditCard, CreditCard$card)
lapply(summary_list, summary)

## $no
##   card        reports            age            income      
##  no :296   Min.   : 0.000   Min.   : 0.75   Min.   : 0.490  
##  yes:  0   1st Qu.: 0.000   1st Qu.:25.65   1st Qu.: 2.000  
##            Median : 1.000   Median :31.83   Median : 2.590  
##            Mean   : 1.588   Mean   :33.20   Mean   : 3.069  
##            3rd Qu.: 2.000   3rd Qu.:38.06   3rd Qu.: 3.625  
##            Max.   :14.000   Max.   :80.17   Max.   :11.000  
##      share            expenditure owner     selfemp     dependents   
##  Min.   :0.0001091   Min.   :0    no :206   no :268   Min.   :0.000  
##  1st Qu.:0.0003311   1st Qu.:0    yes: 90   yes: 28   1st Qu.:0.000  
##  Median :0.0004633   Median :0                        Median :1.000  
##  Mean   :0.0004768   Mean   :0                        Mean   :1.078  
##  3rd Qu.:0.0006000   3rd Qu.:0                        3rd Qu.:2.000  
##  Max.   :0.0024490   Max.   :0                        Max.   :6.000  
##      months        majorcards         active      
##  Min.   :  0.0   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.: 14.0   1st Qu.:0.0000   1st Qu.: 0.000  
##  Median : 36.0   Median :1.0000   Median : 4.000  
##  Mean   : 55.3   Mean   :0.7399   Mean   : 6.054  
##  3rd Qu.: 63.0   3rd Qu.:1.0000   3rd Qu.: 9.000  
##  Max.   :528.0   Max.   :1.0000   Max.   :46.000  
## 
## $yes
##   card         reports           age              income      
##  no :   0   Min.   :0.000   Min.   : 0.1667   Min.   : 0.210  
##  yes:1023   1st Qu.:0.000   1st Qu.:25.3333   1st Qu.: 2.350  
##             Median :0.000   Median :31.0833   Median : 3.000  
##             Mean   :0.129   Mean   :33.2160   Mean   : 3.451  
##             3rd Qu.:0.000   3rd Qu.:39.7083   3rd Qu.: 4.000  
##             Max.   :4.000   Max.   :83.5000   Max.   :13.500  
##      share           expenditure      owner     selfemp     dependents    
##  Min.   :0.000186   Min.   :   0.00   no :532   no :960   Min.   :0.0000  
##  1st Qu.:0.026635   1st Qu.:  66.52   yes:491   yes: 63   1st Qu.:0.0000  
##  Median :0.060209   Median : 150.18                       Median :0.0000  
##  Mean   :0.088482   Mean   : 238.60                       Mean   :0.9697  
##  3rd Qu.:0.113858   3rd Qu.: 313.96                       3rd Qu.:2.0000  
##  Max.   :0.906320   Max.   :3099.51                       Max.   :6.0000  
##      months         majorcards         active     
##  Min.   :  0.00   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.: 12.00   1st Qu.:1.0000   1st Qu.: 2.00  
##  Median : 30.00   Median :1.0000   Median : 6.00  
##  Mean   : 55.26   Mean   :0.8397   Mean   : 7.27  
##  3rd Qu.: 72.00   3rd Qu.:1.0000   3rd Qu.:11.00  
##  Max.   :540.00   Max.   :1.0000   Max.   :31.00

#describeBy(CreditCard, group = CreditCard$status, mat = TRUE)
CreditCard %>%
group_by(card) %>%
summarize(skewness_share = skewness(share, na.rm = TRUE),
kurtosis_share = kurtosis(share, na.rm = TRUE),
skewness_expenditure = skewness(expenditure, na.rm = TRUE),
kurtosis_expenditure = kurtosis(expenditure, na.rm = TRUE))

## # A tibble: 2 × 5
##   card  skewness_share kurtosis_share skewness_expenditure kurtosis_expenditure
##   <fct>          <dbl>          <dbl>                <dbl>                <dbl>
## 1 no              2.76           26.2               NaN                   NaN  
## 2 yes             3.05           17.9                 3.57                 23.1

#.Plot Lorenz curve and report the Gini coefficient
CreditCard <- CreditCard %>% arrange(share)
CreditCard <- CreditCard %>%
mutate(cum_share = cumsum(share) / sum(share),
cum_population = seq_along(share) / n())
ggplot(CreditCard, aes(x = cum_population, y = cum_share)) +
geom_line(color = "blue") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
labs(title = "Lorenz Curve for Credit Card Expenditure",
x = "Cumulative Proportion of Population",
y = "Cumulative Share of Credit Card Expenditure") +
theme_minimal()

gini_coefficient <- 1 - 2 * sum(CreditCard$cum_share * CreditCard$cum_population)
print(paste("Gini Coefficient:", gini_coefficient))

## [1] "Gini Coefficient: -412.102560732661"

summary_stats <- CreditCard %>%
group_by(card) %>%
summarize(median_share = median(share, na.rm = TRUE),
mean_share = mean(share, na.rm = TRUE))

boxplot <- ggplot(CreditCard, aes(x = factor(card), y = share, fill = factor(card))) +
geom_boxplot() +
labs(title = "Boxplot of Credit Card Expenditure by Card Status",
x = "Card Status",
y = "Share of Credit Card Expenditure") +
theme_minimal()

strip_chart <- ggplot(CreditCard, aes(x = factor(card), y = share, color = factor(card))) +
geom_jitter(width = 0.2, alpha = 0.7) +
labs(title = "Strip Chart of Credit Card Expenditure by Card Status",
x = "Card Status",
y = "Share of Credit Card Expenditure") +
theme_minimal()

print(summary_stats)

## # A tibble: 2 × 3
##   card  median_share mean_share
##   <fct>        <dbl>      <dbl>
## 1 no        0.000463   0.000477
## 2 yes       0.0602     0.0885

print(boxplot)

print(strip_chart)

#.Add kernel density estimation to your histograms

#boxplots with KDE
boxplot_kde <- ggplot(CreditCard, aes(x = factor(card), y = share, fill = factor(card))) +
geom_boxplot() +
geom_density(aes(color = factor(card)), alpha = 0.5) + # Add KDE
labs(title = "Boxplot with KDE of Credit Card Expenditure by Card Status",
x = "Card Status",
y = "Share of Credit Card Expenditure") +
theme_minimal()

#strip charts with KDE
strip_chart_kde <- ggplot(CreditCard, aes(x = factor(card), y = share, color = factor(card))) +
geom_jitter(width = 0.2, alpha = 0.7) +
geom_density(aes(fill = factor(card)), alpha = 0.5) + # Add KDE
labs(title = "Strip Chart with KDE of Credit Card Expenditure by Card Status",
x = "Card Status",
y = "Share of Credit Card Expenditure") +
theme_minimal()

print(boxplot_kde)

print(strip_chart_kde)

#.Be aware of the level of measurement of each variable, missing values and outliers

#Checking for missing values
missing_values <- sum(is.na(CreditCard))
print(paste("Missing values:", missing_values))

## [1] "Missing values: 0"

median_share <- median(CreditCard$share, na.rm = TRUE)
CreditCard$share[is.na(CreditCard$share)] <- median_share

boxplot_kde <- ggplot(CreditCard, aes(x = factor(card), y = share, fill = factor(card))) +
geom_boxplot(coef = 1.5) + # Adjust coefficient for outlier detection
geom_density(aes(color = factor(card)), alpha = 0.5) +
geom_rug(aes(color = factor(card)), sides = "b") + # Add rug plots
labs(title = "Boxplot with KDE of Credit Card Expenditure by Card Status",
x = "Card Status",
y = "Share of Credit Card Expenditure (Median Imputed)") +
theme_minimal()

#What about robust measures of skewness and kurtosis?

robust_measures <- CreditCard %>%
group_by(card) %>%
summarize(
median_share = median(share),
mean_share = mean(share),
median_skewness = median((share - median_share)^3) / (median((share - median_share)^2)^(3/2)),
robust_kurtosis = median((share - median_share)^4) / (median((share - median_share)^2)^2)
)

print(robust_measures)

## # A tibble: 2 × 5
##   card  median_share mean_share median_skewness robust_kurtosis
##   <fct>        <dbl>      <dbl>           <dbl>           <dbl>
## 1 no        0.000463   0.000477               0               1
## 2 yes       0.0602     0.0885                 0               1

boxplot_kde_robust <- ggplot(CreditCard, aes(x = factor(card), y = share, fill = factor(card))) +
geom_boxplot(coef = 1.5) + # Adjust coefficient for outlier detection
geom_density(aes(color = factor(card)), alpha = 0.5) +
geom_rug(aes(color = factor(card)), sides = "b") + # Add rug plots
geom_text(data = robust_measures, aes(label = paste("Robust Skewness:", round(median_skewness, 2),
"\nRobust Kurtosis:", round(robust_kurtosis, 2))),
y = Inf, vjust = 2, hjust = 0) +
labs(title = "Boxplot with KDE of Credit Card Expenditure by Card Status",
x = "Card Status",
y = "Share of Credit Card Expenditure") +
theme_minimal()

strip_chart_kde_robust <- ggplot(CreditCard, aes(x = factor(card), y = share, color = factor(card))) +
geom_jitter(width = 0.2, alpha = 0.7) +
geom_density(aes(fill = factor(card)), alpha = 0.5) +
geom_rug(aes(color = factor(card)), sides = "b") +
geom_text(data = robust_measures, aes(label = paste("Robust Skewness:", round(median_skewness, 2),
"\nRobust Kurtosis:", round(robust_kurtosis, 2))),
y = Inf, vjust = 2, hjust = 0) +
labs(title = "Strip Chart with KDE of Credit Card Expenditure by Card Status",
x = "Card Status",
y = "Share of Credit Card Expenditure") +
theme_minimal()

print(boxplot_kde_robust)

print(strip_chart_kde_robust)

#Prepare your own descriptive analysis for the “CreditCard” dataset (AER package). It is a cross-sectional dataframe on the credit history for a sample of applicants for a type of credit card.

summary_stats <- CreditCard %>%
summarize(
mean_age = mean(age, na.rm = TRUE),
median_age = median(age, na.rm = TRUE),
mean_income = mean(income, na.rm = TRUE),
median_income = median(income, na.rm = TRUE),
mean_share = mean(share, na.rm = TRUE),
median_share = median(share, na.rm = TRUE),
mean_expenditure = mean(expenditure, na.rm = TRUE),
median_expenditure = median(expenditure, na.rm = TRUE)
)

print(summary_stats)

##   mean_age median_age mean_income median_income mean_share median_share
## 1  33.2131      31.25    3.365376           2.9 0.06873217   0.03882722
##   mean_expenditure median_expenditure
## 1         185.0571           101.2983

hist_kde_age <- ggplot(CreditCard, aes(x = age)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "skyblue", color = "black") +
geom_density(alpha = 0.5, fill = "blue") +
labs(title = "Distribution of Age",
x = "Age",
y = "Density") +
theme_minimal()

hist_kde_income <- ggplot(CreditCard, aes(x = income)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "skyblue", color = "black") +
geom_density(alpha = 0.5, fill = "blue") +
labs(title = "Distribution of Income",
x = "Income",
y = "Density") +
theme_minimal()

boxplot_share_expenditure <- ggplot(CreditCard, aes(x = 1, y = share)) +
geom_boxplot(fill = "skyblue") +
labs(title = "Boxplot of Share and Expenditure",
x = "",
y = "Value") +
theme_minimal()

print(hist_kde_age)

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

print(hist_kde_income)

print(boxplot_share_expenditure)

robust_measures_share <- CreditCard %>%
summarize(
median_share = median(share),
robust_skewness = median((share - median_share)^3) / (median((share - median_share)^2)^(3/2)),
robust_kurtosis = median((share - median_share)^4) / (median((share - median_share)^2)^2)
)

print(robust_measures_share)

##   median_share robust_skewness robust_kurtosis
## 1   0.03882722               0               1

cor_matrix <- CreditCard %>%
select(age, income, share, expenditure) %>%
cor()

print(cor_matrix)

##                    age      income       share expenditure
## age          1.0000000  0.32465320 -0.11569704   0.0149477
## income       0.3246532  1.00000000 -0.05442926   0.2811040
## share       -0.1156970 -0.05442926  1.00000000   0.8387793
## expenditure  0.0149477  0.28110402  0.83877932   1.0000000

#!ggcorrplot::ggcorrplot(cor_matrix, method = "circle", lab = TRUE)

#.Are the yearly incomes (in USD 10,000), credit card expenditures, age, ratio of monthly credit card expenditure to yearly income - significantly different for applicants for customers with different credit risk (“card” variable - factor)?

anova_income <- aov(income ~ card, data = CreditCard)
print(summary(anova_income))

##               Df Sum Sq Mean Sq F value   Pr(>F)    
## card           1     34   33.63   11.82 0.000605 ***
## Residuals   1317   3748    2.85                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova_expenditure <- aov(expenditure ~ card, data = CreditCard)
print(summary(anova_expenditure))

##               Df   Sum Sq  Mean Sq F value Pr(>F)    
## card           1 13069899 13069899   203.5 <2e-16 ***
## Residuals   1317 84598038    64235                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova_age <- aov(age ~ card, data = CreditCard)
print(summary(anova_age))

##               Df Sum Sq Mean Sq F value Pr(>F)
## card           1      0    0.04       0  0.984
## Residuals   1317 135591  102.95

anova_share <- aov(share ~ card, data = CreditCard)
print(summary(anova_share))

##               Df Sum Sq Mean Sq F value Pr(>F)    
## card           1  1.778  1.7780   233.4 <2e-16 ***
## Residuals   1317 10.031  0.0076                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

kruskal_income <- kruskal.test(income ~ card, data = CreditCard)
print(kruskal_income)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  income by card
## Kruskal-Wallis chi-squared = 21.758, df = 1, p-value = 3.093e-06

kruskal_expenditure <- kruskal.test(expenditure ~ card, data = CreditCard)
print(kruskal_expenditure)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  expenditure by card
## Kruskal-Wallis chi-squared = 669.53, df = 1, p-value < 2.2e-16

kruskal_age <- kruskal.test(age ~ card, data = CreditCard)
print(kruskal_age)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  age by card
## Kruskal-Wallis chi-squared = 0.0052081, df = 1, p-value = 0.9425

kruskal_share <- kruskal.test(share ~ card, data = CreditCard)
print(kruskal_share)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  share by card
## Kruskal-Wallis chi-squared = 660.43, df = 1, p-value < 2.2e-16

Univariate Analysis Report

Alicja(197772), Hubert(197740), Yaren(201924)

Last edited: 24.04.2024

DATA

Outliers

Density PLOT - “price_PLN”

ADDITIONAL ANALYSIS