Data
At first we print data frame and check if it need any preprocessing, as in example with ‘apartments’. It looks quite good.
head(CreditCard)
## card reports age income share expenditure owner selfemp dependents
## 1 yes 0 37.66667 4.5200 0.033269910 124.983300 yes no 3
## 2 yes 0 33.25000 2.4200 0.005216942 9.854167 no no 3
## 3 yes 0 33.66667 4.5000 0.004155556 15.000000 yes no 4
## 4 yes 0 30.50000 2.5400 0.065213780 137.869200 no no 0
## 5 yes 0 32.16667 9.7867 0.067050590 546.503300 yes no 2
## 6 yes 0 23.25000 2.5000 0.044438400 91.996670 no no 0
## months majorcards active
## 1 54 1 12
## 2 34 1 13
## 3 58 1 5
## 4 25 1 7
## 5 64 1 5
## 6 54 1 1
YEARLY INCOME
YEARLY INCOME - plot
ggplot(CreditCard, aes(x = income * 10000, fill = card)) +
geom_histogram(position = "dodge", binwidth = 5000) +
labs(title = "Distribution of Yearly Incomes", x = "Yearly Income (USD)", y = "Frequency")
ggplot(CreditCard, aes(x = card, y = income, fill = card)) +
geom_boxplot() +
labs(title = "Yearly Income by Credit Risk", x = "Credit Risk", y = "Yearly Income") +
theme_minimal()
YEARLY INCOME - calculating statistics
# Handling NA values before computing statistics
na.omit_income <- na.omit(CreditCard$income)
# Calculating and printing statistics with descriptions
# Mean of income
mean_income <- mean(na.omit_income)
cat("Mean of yearly income: ", mean_income, "\n")
## Mean of yearly income: 3.365376
# Median of income
median_income <- median(na.omit_income)
cat("Median of yearly income: ", median_income, "\n")
## Median of yearly income: 2.9
# Standard deviation of income
sd_income <- sd(na.omit_income)
cat("Standard deviation of yearly income: ", sd_income, "\n")
## Standard deviation of yearly income: 1.693902
# Variance of income
var_income <- var(na.omit_income)
cat("Variance of yearly income: ", var_income, "\n")
## Variance of yearly income: 2.869303
# Coefficient of variability for income
coeff_var_income <- sd_income / mean_income * 100
cat("Coefficient of variability for yearly income (%): ", coeff_var_income, "\n")
## Coefficient of variability for yearly income (%): 50.33321
# Interquartile range (IQR) of income
IQR_income <- IQR(na.omit_income)
cat("Interquartile range (IQR) of yearly income: ", IQR_income, "\n")
## Interquartile range (IQR) of yearly income: 1.75625
# Interquartile deviation of income
sx_income <- IQR_income / 2
cat("Interquartile deviation of yearly income: ", sx_income, "\n")
## Interquartile deviation of yearly income: 0.878125
# IQR coefficient of variability for income
coeff_varx_income <- sx_income / median_income * 100
cat("IQR coefficient of variability for yearly income (%): ", coeff_varx_income, "\n")
## IQR coefficient of variability for yearly income (%): 30.28017
# Minimum income
min_income <- min(na.omit_income)
cat("Minimum yearly income: ", min_income, "\n")
## Minimum yearly income: 0.21
# Maximum income
max_income <- max(na.omit_income)
cat("Maximum yearly income: ", max_income, "\n")
## Maximum yearly income: 13.5
# Quantiles of income
quantiles_income <- quantile(na.omit_income, probs = c(0, 0.1, 0.25, 0.5, 0.75, 0.95, 1))
cat("Quantiles of yearly income: \n")
## Quantiles of yearly income:
print(quantiles_income)
## 0% 10% 25% 50% 75% 95% 100%
## 0.21000 1.80000 2.24375 2.90000 4.00000 6.90000 13.50000
# Summary by credit card possession
cat("Summary of yearly incomes for cardholders labeled 'yes':\n")
## Summary of yearly incomes for cardholders labeled 'yes':
print(summary(subset(CreditCard, card == 'yes')$income))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.210 2.350 3.000 3.451 4.000 13.500
cat("Summary of yearly incomes for cardholders labeled 'no':\n")
## Summary of yearly incomes for cardholders labeled 'no':
print(summary(subset(CreditCard, card == 'no')$income))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.490 2.000 2.590 3.069 3.625 11.000
CONCLUSIONS: From the yearly income data, we can observe the distribution of income across different credit risk categories. The boxplots and quantiles help identify the spread and central tendency of income. The variability measures, such as standard deviation and the coefficient of variability, indicate the extent of income disparity among cardholders. This analysis can help in assessing the financial health and risk associated with different cardholders, guiding credit risk management.
CREDIT CARD EXPENDITURES
CREDIT CARD EXPENDITURES - plot
ggplot(CreditCard, aes(x = card, y = expenditure, fill = card)) +
geom_boxplot() +
labs(title = "Credit Card Expenditure by Credit Risk", x = "Credit Risk", y = "Expenditure") +
theme_minimal()
CREDIT CARD EXPENDITURES - calculating statistics
# Handling NA values before computing statistics
na.omit_expenditure <- na.omit(CreditCard$expenditure)
# Mean of expenditure
mean_expenditure <- mean(na.omit_expenditure)
cat("Mean of expenditure: ", mean_expenditure, "\n")
## Mean of expenditure: 185.0571
# Median of expenditure
median_expenditure <- median(na.omit_expenditure)
cat("Median of expenditure: ", median_expenditure, "\n")
## Median of expenditure: 101.2983
# Standard deviation of expenditure
sd_expenditure <- sd(na.omit_expenditure)
cat("Standard deviation of expenditure: ", sd_expenditure, "\n")
## Standard deviation of expenditure: 272.2189
# Variance of expenditure
var_expenditure <- var(na.omit_expenditure)
cat("Variance of expenditure: ", var_expenditure, "\n")
## Variance of expenditure: 74103.14
# Coefficient of variability for expenditure
coeff_var_expenditure <- sd_expenditure / mean_expenditure * 100
cat("Coefficient of variability for expenditure (%): ", coeff_var_expenditure, "\n")
## Coefficient of variability for expenditure (%): 147.1
# Interquartile range (IQR) of expenditure
IQR_expenditure <- IQR(na.omit_expenditure)
cat("Interquartile range (IQR) of expenditure: ", IQR_expenditure, "\n")
## Interquartile range (IQR) of expenditure: 244.4525
# Interquartile deviation of expenditure
sx_expenditure <- IQR_expenditure / 2
cat("Interquartile deviation of expenditure: ", sx_expenditure, "\n")
## Interquartile deviation of expenditure: 122.2262
# IQR coefficient of variability for expenditure
coeff_varx_expenditure <- sx_expenditure / median_expenditure * 100
cat("IQR coefficient of variability for expenditure (%): ", coeff_varx_expenditure, "\n")
## IQR coefficient of variability for expenditure (%): 120.6597
# Minimum expenditure
min_expenditure <- min(na.omit_expenditure)
cat("Minimum expenditure: ", min_expenditure, "\n")
## Minimum expenditure: 0
# Maximum expenditure
max_expenditure <- max(na.omit_expenditure)
cat("Maximum expenditure: ", max_expenditure, "\n")
## Maximum expenditure: 3099.505
# Quantiles of expenditure
quantiles_expenditure <- quantile(na.omit_expenditure, probs = c(0, 0.1, 0.25, 0.5, 0.75, 0.95, 1))
cat("Quantiles of expenditure: \n")
## Quantiles of expenditure:
print(quantiles_expenditure)
## 0% 10% 25% 50% 75% 95%
## 0.000000 0.000000 4.583333 101.298300 249.035800 640.730130
## 100%
## 3099.505000
# Summaries by credit card possession
cat("Summary of expenditures for cardholders labeled 'yes':\n")
## Summary of expenditures for cardholders labeled 'yes':
print(summary(subset(CreditCard, card == 'yes')$expenditure))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 66.52 150.18 238.60 313.96 3099.51
CONCLUSIONS: The expenditure data reveals significant variability in credit card usage among cardholders. The mean and median values indicate typical spending habits, while the wide range in the interquartile and total spread highlights diversity in financial behavior. Higher coefficients of variability suggest that spending is not uniform, possibly reflecting differing financial situations or attitudes towards credit usage. Notably, the extremities in minimum and maximum expenditures, along with the distribution across quantiles, can inform risk management strategies by illustrating patterns of financial responsibility or potential risk. Analyzing expenditures by credit card possession (‘yes’ vs ‘no’) further differentiates these patterns, offering targeted insights for credit policy adjustments.
AGE
AGE - plot
ggplot(CreditCard, aes(x = card, y = age, fill = card)) +
geom_boxplot() +
labs(title = "Age distribution by credit risk", x = "Credit risk", y = "Age") +
theme_minimal()
AGE - calculating statistics
# mean of age
mean_age <- mean(CreditCard$age)
print(mean_age)
## [1] 33.2131
# median of age
median_age <- median(CreditCard$age)
print(median_age)
## [1] 31.25
# standard deviation of age
sd_age <- sd(CreditCard$age)
print(sd_age)
## [1] 10.14278
# variance of age
var_age <- var(CreditCard$age)
print(var_age)
## [1] 102.8761
# coefficient of variability for age
coeff_var_age <- sd_age / mean_age * 100
print(coeff_var_age)
## [1] 30.5385
# interquartile range (IQR) of age
IQR_age <- IQR(CreditCard$age)
print(IQR_age)
## 75%
## 14
# interquartile deviation of age
sx_age <- IQR_age / 2
print(sx_age)
## 75%
## 7
# IQR coefficient of variability for age
coeff_varx_age <- sx_age / median_age * 100
print(coeff_varx_age)
## 75%
## 22.4
# minimum age
min_age <- min(CreditCard$age)
print(min_age)
## [1] 0.1666667
# maximum age
max_age <- max(CreditCard$age)
print(max_age)
## [1] 83.5
# quantiles of age
quantiles_age <- quantile(CreditCard$age, probs = c(0, 0.1, 0.25, 0.5, 0.75, 0.95, 1), na.rm = TRUE)
print(quantiles_age)
## 0% 10% 25% 50% 75% 95% 100%
## 0.1666667 22.6666700 25.4166700 31.2500000 39.4166700 52.4166700 83.5000000
CardYes <-subset(CreditCard, card == 'yes')
summary(CardYes$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1667 25.3333 31.0833 33.2160 39.7083 83.5000
CardNo <-subset(CreditCard, card == 'no')
summary(CardNo$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.75 25.65 31.83 33.20 38.06 80.17
AGE - summary table
|
Table 1
|
||
|---|---|---|
| No | Yes | |
| Min | 0.7500000 | 0.1666667 |
| Max | 80.1666600 | 83.5000000 |
| Q1 | 25.6458350 | 25.3333300 |
| Median | 31.8333350 | 31.0833300 |
| Q3 | 38.0625000 | 39.7083350 |
| Mean | 33.2029842 | 33.2160312 |
| Sd | 9.9212871 | 10.2107524 |
| IQR | 12.4166650 | 14.3750050 |
| Sx | 6.2083325 | 7.1875025 |
| Var % | 0.2988071 | 0.3074043 |
| IQR Var % | 0.3900523 | 0.4624667 |
| Skewness | 1.0978851 | 0.7694931 |
| Kurtosis | 2.3671473 | 1.2094982 |
CONCLUSIONS:
age distribution is generally similar in ‘yes’ and ‘no’ columns,
median and mean in both categories are almost the same, which indicates diversity even between people at the same age
from plot and table we can see than there is more younger people using credit cards than not using, but also maximum age is bigger for the users
outliers are present, especially when looking at minimum age
RATIO OF MONTHLY CREDIT CARD EXPENDITURES TO YEARLY INCOME
RATIO - plot
CreditCard$expenditure_to_income_ratio <- CreditCard$expenditure / (CreditCard$income * 12)
ggplot(CreditCard, aes(x = card, y = expenditure_to_income_ratio, fill = card)) +
geom_boxplot() +
labs(title = "Expenditure-to-Income Ratio", x = "Credit card", y = "Expenditure-to-Income Ratio") +
theme_minimal()
CONCLUSIONS:
- we can see that there is probably no data for people that don’t use credit cards, so let’s focus on this topic generally
mean_income <- mean(CreditCard$income * 12)
mean_expenditure <- mean(CreditCard$expenditure)
mean_data <- data.frame(Category = c("Income", "Expenditure"),
Mean_Value = c(mean_income, mean_expenditure))
bar_plot <- ggplot(mean_data, aes(x = Category, y = Mean_Value, fill = Category)) +
geom_bar(stat = "identity") +
labs(title = "Comparison of mean income and expenditure",
x = "Category", y = "Mean Value") +
theme_minimal()
bar_plot
RATIO - calculating statistics
ratio <- (CreditCard$expenditure / (CreditCard$income * 12))
summary(ratio)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.133 2.692 4.759 6.497 62.939
RATIO - summary table
| Statistics | Value |
|---|---|
| Min | 0.0000000 |
| 1st Quartile | 0.1329582 |
| Median | 2.6924769 |
| Mean | 4.7593645 |
| 3rd Quartile | 6.4967092 |
| Max | 62.9389167 |
CONCLUSIONS:
there is wide range between min and max, but also probably some outliers, because min = 0
mean is greater than median, so it means more people have higher expenditure-income ratio, which can lead to some financial instability