Univariate Statistics

Aleksandra Templin & Ksawery Raupuk

2024-04-23

Data

At first we print data frame and check if it need any preprocessing, as in example with ‘apartments’. It looks quite good.

head(CreditCard)

##   card reports      age income       share expenditure owner selfemp dependents
## 1  yes       0 37.66667 4.5200 0.033269910  124.983300   yes      no          3
## 2  yes       0 33.25000 2.4200 0.005216942    9.854167    no      no          3
## 3  yes       0 33.66667 4.5000 0.004155556   15.000000   yes      no          4
## 4  yes       0 30.50000 2.5400 0.065213780  137.869200    no      no          0
## 5  yes       0 32.16667 9.7867 0.067050590  546.503300   yes      no          2
## 6  yes       0 23.25000 2.5000 0.044438400   91.996670    no      no          0
##   months majorcards active
## 1     54          1     12
## 2     34          1     13
## 3     58          1      5
## 4     25          1      7
## 5     64          1      5
## 6     54          1      1

YEARLY INCOME

YEARLY INCOME - plot

ggplot(CreditCard, aes(x = income * 10000, fill = card)) +
  geom_histogram(position = "dodge", binwidth = 5000) +
  labs(title = "Distribution of Yearly Incomes", x = "Yearly Income (USD)", y = "Frequency")

ggplot(CreditCard, aes(x = card, y = income, fill = card)) +
  geom_boxplot() +
  labs(title = "Yearly Income by Credit Risk", x = "Credit Risk", y = "Yearly Income") +
  theme_minimal()

YEARLY INCOME - calculating statistics

# Handling NA values before computing statistics
na.omit_income <- na.omit(CreditCard$income)

# Calculating and printing statistics with descriptions
# Mean of income
mean_income <- mean(na.omit_income)
cat("Mean of yearly income: ", mean_income, "\n")

## Mean of yearly income:  3.365376

# Median of income
median_income <- median(na.omit_income)
cat("Median of yearly income: ", median_income, "\n")

## Median of yearly income:  2.9

# Standard deviation of income
sd_income <- sd(na.omit_income)
cat("Standard deviation of yearly income: ", sd_income, "\n")

## Standard deviation of yearly income:  1.693902

# Variance of income
var_income <- var(na.omit_income)
cat("Variance of yearly income: ", var_income, "\n")

## Variance of yearly income:  2.869303

# Coefficient of variability for income
coeff_var_income <- sd_income / mean_income * 100
cat("Coefficient of variability for yearly income (%): ", coeff_var_income, "\n")

## Coefficient of variability for yearly income (%):  50.33321

# Interquartile range (IQR) of income
IQR_income <- IQR(na.omit_income)
cat("Interquartile range (IQR) of yearly income: ", IQR_income, "\n")

## Interquartile range (IQR) of yearly income:  1.75625

# Interquartile deviation of income
sx_income <- IQR_income / 2
cat("Interquartile deviation of yearly income: ", sx_income, "\n")

## Interquartile deviation of yearly income:  0.878125

# IQR coefficient of variability for income
coeff_varx_income <- sx_income / median_income * 100
cat("IQR coefficient of variability for yearly income (%): ", coeff_varx_income, "\n")

## IQR coefficient of variability for yearly income (%):  30.28017

# Minimum income
min_income <- min(na.omit_income)
cat("Minimum yearly income: ", min_income, "\n")

## Minimum yearly income:  0.21

# Maximum income
max_income <- max(na.omit_income)
cat("Maximum yearly income: ", max_income, "\n")

## Maximum yearly income:  13.5

# Quantiles of income
quantiles_income <- quantile(na.omit_income, probs = c(0, 0.1, 0.25, 0.5, 0.75, 0.95, 1))
cat("Quantiles of yearly income: \n")

## Quantiles of yearly income:

print(quantiles_income)

##       0%      10%      25%      50%      75%      95%     100% 
##  0.21000  1.80000  2.24375  2.90000  4.00000  6.90000 13.50000

# Summary by credit card possession
cat("Summary of yearly incomes for cardholders labeled 'yes':\n")

## Summary of yearly incomes for cardholders labeled 'yes':

print(summary(subset(CreditCard, card == 'yes')$income))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.210   2.350   3.000   3.451   4.000  13.500

cat("Summary of yearly incomes for cardholders labeled 'no':\n")

## Summary of yearly incomes for cardholders labeled 'no':

print(summary(subset(CreditCard, card == 'no')$income))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.490   2.000   2.590   3.069   3.625  11.000

CONCLUSIONS: From the yearly income data, we can observe the distribution of income across different credit risk categories. The boxplots and quantiles help identify the spread and central tendency of income. The variability measures, such as standard deviation and the coefficient of variability, indicate the extent of income disparity among cardholders. This analysis can help in assessing the financial health and risk associated with different cardholders, guiding credit risk management.

CREDIT CARD EXPENDITURES

CREDIT CARD EXPENDITURES - plot

ggplot(CreditCard, aes(x = card, y = expenditure, fill = card)) +
  geom_boxplot() +
  labs(title = "Credit Card Expenditure by Credit Risk", x = "Credit Risk", y = "Expenditure") +
  theme_minimal()

CREDIT CARD EXPENDITURES - calculating statistics

# Handling NA values before computing statistics
na.omit_expenditure <- na.omit(CreditCard$expenditure)

# Mean of expenditure
mean_expenditure <- mean(na.omit_expenditure)
cat("Mean of expenditure: ", mean_expenditure, "\n")

## Mean of expenditure:  185.0571

# Median of expenditure
median_expenditure <- median(na.omit_expenditure)
cat("Median of expenditure: ", median_expenditure, "\n")

## Median of expenditure:  101.2983

# Standard deviation of expenditure
sd_expenditure <- sd(na.omit_expenditure)
cat("Standard deviation of expenditure: ", sd_expenditure, "\n")

## Standard deviation of expenditure:  272.2189

# Variance of expenditure
var_expenditure <- var(na.omit_expenditure)
cat("Variance of expenditure: ", var_expenditure, "\n")

## Variance of expenditure:  74103.14

# Coefficient of variability for expenditure
coeff_var_expenditure <- sd_expenditure / mean_expenditure * 100
cat("Coefficient of variability for expenditure (%): ", coeff_var_expenditure, "\n")

## Coefficient of variability for expenditure (%):  147.1

# Interquartile range (IQR) of expenditure
IQR_expenditure <- IQR(na.omit_expenditure)
cat("Interquartile range (IQR) of expenditure: ", IQR_expenditure, "\n")

## Interquartile range (IQR) of expenditure:  244.4525

# Interquartile deviation of expenditure
sx_expenditure <- IQR_expenditure / 2
cat("Interquartile deviation of expenditure: ", sx_expenditure, "\n")

## Interquartile deviation of expenditure:  122.2262

# IQR coefficient of variability for expenditure
coeff_varx_expenditure <- sx_expenditure / median_expenditure * 100
cat("IQR coefficient of variability for expenditure (%): ", coeff_varx_expenditure, "\n")

## IQR coefficient of variability for expenditure (%):  120.6597

# Minimum expenditure
min_expenditure <- min(na.omit_expenditure)
cat("Minimum expenditure: ", min_expenditure, "\n")

## Minimum expenditure:  0

# Maximum expenditure
max_expenditure <- max(na.omit_expenditure)
cat("Maximum expenditure: ", max_expenditure, "\n")

## Maximum expenditure:  3099.505

# Quantiles of expenditure
quantiles_expenditure <- quantile(na.omit_expenditure, probs = c(0, 0.1, 0.25, 0.5, 0.75, 0.95, 1))
cat("Quantiles of expenditure: \n")

## Quantiles of expenditure:

print(quantiles_expenditure)

##          0%         10%         25%         50%         75%         95% 
##    0.000000    0.000000    4.583333  101.298300  249.035800  640.730130 
##        100% 
## 3099.505000

# Summaries by credit card possession
cat("Summary of expenditures for cardholders labeled 'yes':\n")

## Summary of expenditures for cardholders labeled 'yes':

print(summary(subset(CreditCard, card == 'yes')$expenditure))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   66.52  150.18  238.60  313.96 3099.51

CONCLUSIONS: The expenditure data reveals significant variability in credit card usage among cardholders. The mean and median values indicate typical spending habits, while the wide range in the interquartile and total spread highlights diversity in financial behavior. Higher coefficients of variability suggest that spending is not uniform, possibly reflecting differing financial situations or attitudes towards credit usage. Notably, the extremities in minimum and maximum expenditures, along with the distribution across quantiles, can inform risk management strategies by illustrating patterns of financial responsibility or potential risk. Analyzing expenditures by credit card possession (‘yes’ vs ‘no’) further differentiates these patterns, offering targeted insights for credit policy adjustments.

AGE

AGE - plot

ggplot(CreditCard, aes(x = card, y = age, fill = card)) +
  geom_boxplot() +
  labs(title = "Age distribution by credit risk", x = "Credit risk", y = "Age") +
  theme_minimal()

AGE - calculating statistics

# mean of age
mean_age <- mean(CreditCard$age)
print(mean_age)

## [1] 33.2131

# median of age
median_age <- median(CreditCard$age)
print(median_age)

## [1] 31.25

# standard deviation of age
sd_age <- sd(CreditCard$age)
print(sd_age)

## [1] 10.14278

# variance of age
var_age <- var(CreditCard$age)
print(var_age)

## [1] 102.8761

# coefficient of variability for age
coeff_var_age <- sd_age / mean_age * 100
print(coeff_var_age)

## [1] 30.5385

# interquartile range (IQR) of age
IQR_age <- IQR(CreditCard$age)
print(IQR_age)

## 75% 
##  14

# interquartile deviation of age
sx_age <- IQR_age / 2
print(sx_age)

## 75% 
##   7

# IQR coefficient of variability for age
coeff_varx_age <- sx_age / median_age * 100
print(coeff_varx_age)

##  75% 
## 22.4

# minimum age
min_age <- min(CreditCard$age)
print(min_age)

## [1] 0.1666667

# maximum age
max_age <- max(CreditCard$age)
print(max_age)

## [1] 83.5

# quantiles of age
quantiles_age <- quantile(CreditCard$age, probs = c(0, 0.1, 0.25, 0.5, 0.75, 0.95, 1), na.rm = TRUE)
print(quantiles_age)

##         0%        10%        25%        50%        75%        95%       100% 
##  0.1666667 22.6666700 25.4166700 31.2500000 39.4166700 52.4166700 83.5000000

CardYes <-subset(CreditCard, card == 'yes')
summary(CardYes$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1667 25.3333 31.0833 33.2160 39.7083 83.5000

CardNo <-subset(CreditCard, card == 'no')
summary(CardNo$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.75   25.65   31.83   33.20   38.06   80.17

AGE - summary table

	Table 1
	No	Yes
Min	0.7500000	0.1666667
Max	80.1666600	83.5000000
Q1	25.6458350	25.3333300
Median	31.8333350	31.0833300
Q3	38.0625000	39.7083350
Mean	33.2029842	33.2160312
Sd	9.9212871	10.2107524
IQR	12.4166650	14.3750050
Sx	6.2083325	7.1875025
Var %	0.2988071	0.3074043
IQR Var %	0.3900523	0.4624667
Skewness	1.0978851	0.7694931
Kurtosis	2.3671473	1.2094982

CONCLUSIONS:

age distribution is generally similar in ‘yes’ and ‘no’ columns,
median and mean in both categories are almost the same, which indicates diversity even between people at the same age
from plot and table we can see than there is more younger people using credit cards than not using, but also maximum age is bigger for the users
outliers are present, especially when looking at minimum age

RATIO OF MONTHLY CREDIT CARD EXPENDITURES TO YEARLY INCOME

RATIO - plot

CreditCard$expenditure_to_income_ratio <- CreditCard$expenditure / (CreditCard$income * 12)

ggplot(CreditCard, aes(x = card, y = expenditure_to_income_ratio, fill = card)) +
  geom_boxplot() +
  labs(title = "Expenditure-to-Income Ratio", x = "Credit card", y = "Expenditure-to-Income Ratio") +
  theme_minimal()

CONCLUSIONS:

we can see that there is probably no data for people that don’t use credit cards, so let’s focus on this topic generally

mean_income <- mean(CreditCard$income * 12)
mean_expenditure <- mean(CreditCard$expenditure)

mean_data <- data.frame(Category = c("Income", "Expenditure"),
                        Mean_Value = c(mean_income, mean_expenditure))

bar_plot <- ggplot(mean_data, aes(x = Category, y = Mean_Value, fill = Category)) +
  geom_bar(stat = "identity") +
  labs(title = "Comparison of mean income and expenditure",
       x = "Category", y = "Mean Value") +
  theme_minimal()

bar_plot

RATIO - calculating statistics

ratio <- (CreditCard$expenditure / (CreditCard$income * 12))
summary(ratio)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.133   2.692   4.759   6.497  62.939

RATIO - summary table

Summary of expenditure-to-income ratio
Statistics	Value
Min	0.0000000
1st Quartile	0.1329582
Median	2.6924769
Mean	4.7593645
3rd Quartile	6.4967092
Max	62.9389167

CONCLUSIONS:

there is wide range between min and max, but also probably some outliers, because min = 0
mean is greater than median, so it means more people have higher expenditure-income ratio, which can lead to some financial instability