# extracting all data in Credit Card Clients datase
cl_df <- read_excel("Book1.xlsx")
View(cl_df)
Use given Dataset to display variable names, the top 5 rows, the bottom 10 rows, the data type, and the shape of the dataset. Drop duplicate rows if any, find the number of missing values in each column, and use a box plot to check if there are any outliers in the quantitative variables. Use appropriate methods to deal with missing values How to rename the columns in dataframe using the first columns of the dataframe in R and outliers, if any. ### display variable names
#Getting all columns names and store them in var_names variable
var_names <- names(cl_df)
print(var_names)
## [1] "ID" "LIMIT_BAL"
## [3] "SEX" "EDUCATION"
## [5] "MARRIAGE" "AGE"
## [7] "PAY_0" "PAY_2"
## [9] "PAY_3" "PAY_4"
## [11] "PAY_5" "PAY_6"
## [13] "BILL_AMT1" "BILL_AMT2"
## [15] "BILL_AMT3" "BILL_AMT4"
## [17] "BILL_AMT5" "BILL_AMT6"
## [19] "PAY_AMT1" "PAY_AMT2"
## [21] "PAY_AMT3" "PAY_AMT4"
## [23] "PAY_AMT5" "PAY_AMT6"
## [25] "default payment next month"
head(cl_df,5)
## # A tibble: 5 × 25
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 20000 2 2 1 24 2 2 -1 -1 -2
## 2 2 120000 2 2 2 26 -1 2 0 0 0
## 3 3 90000 2 2 2 34 0 0 0 0 0
## 4 4 50000 2 2 1 37 0 0 0 0 0
## 5 5 50000 1 2 1 57 -1 0 -1 0 0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## # BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## # PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## # PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>
tail(cl_df,10)
## # A tibble: 10 × 25
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 29991 140000 1 2 1 41 0 0 0 0 0
## 2 29992 210000 1 2 1 34 3 2 2 2 2
## 3 29993 10000 1 3 1 43 0 0 0 -2 -2
## 4 29994 100000 1 1 2 38 0 -1 -1 0 0
## 5 29995 80000 1 2 2 34 2 2 2 2 2
## 6 29996 220000 1 3 1 39 0 0 0 0 0
## 7 29997 150000 1 3 2 43 -1 -1 -1 -1 0
## 8 29998 30000 1 2 2 37 4 3 2 -1 0
## 9 29999 80000 1 3 1 41 1 -1 0 0 0
## 10 30000 50000 1 2 1 46 0 0 0 0 0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## # BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## # PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## # PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>
str(cl_df)
## tibble [30,000 × 25] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
## $ LIMIT_BAL : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ SEX : num [1:30000] 2 2 2 2 1 1 1 2 2 1 ...
## $ EDUCATION : num [1:30000] 2 2 2 2 2 1 1 2 3 3 ...
## $ MARRIAGE : num [1:30000] 1 2 2 1 1 2 2 2 1 2 ...
## $ AGE : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
## $ PAY_0 : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
## $ PAY_2 : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
## $ PAY_3 : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
## $ PAY_4 : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
## $ PAY_5 : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
## $ PAY_6 : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
## $ BILL_AMT1 : num [1:30000] 3913 2682 29239 46990 8617 ...
## $ BILL_AMT2 : num [1:30000] 3102 1725 14027 48233 5670 ...
## $ BILL_AMT3 : num [1:30000] 689 2682 13559 49291 35835 ...
## $ BILL_AMT4 : num [1:30000] 0 3272 14331 28314 20940 ...
## $ BILL_AMT5 : num [1:30000] 0 3455 14948 28959 19146 ...
## $ BILL_AMT6 : num [1:30000] 0 3261 15549 29547 19131 ...
## $ PAY_AMT1 : num [1:30000] 0 0 1518 2000 2000 ...
## $ PAY_AMT2 : num [1:30000] 689 1000 1500 2019 36681 ...
## $ PAY_AMT3 : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
## $ PAY_AMT4 : num [1:30000] 0 1000 1000 1100 9000 ...
## $ PAY_AMT5 : num [1:30000] 0 0 1000 1069 689 ...
## $ PAY_AMT6 : num [1:30000] 0 2000 5000 1000 679 ...
## $ default payment next month: num [1:30000] 1 1 0 0 0 0 0 0 0 0 ...
dim(cl_df)
## [1] 30000 25
cl_df <- unique(cl_df)
dim(cl_df)
## [1] 30000 25
sum(is.na(cl_df))
## [1] 0
boxplot(cl_df)
#Ther is no missing values found
By using labeled and appropriate graphs: Summarize the customer information from the dataset by Gender, Education Level, Age, Marital Status and Default payment status. by using your findings complete the missing information in the following statement: The EDA shows that ….% of the clients are less that 40 years. Moreover,…..out of 30,000 have university-level education. In addition, the number of credit cards issued to female is ….. times the number issued to Male. Generally, ….% of the loan were defaulted
# Gender bar graph
cl_df$SEX <- factor(cl_df$SEX, levels = c(1, 2), labels = c("Male", "Female"))
ggplot(cl_df,aes(x=SEX))+
geom_bar()+
labs(
title = "Gender",
x = "Gender",
y = "Counts"
)
# Education bar graph
ggplot(cl_df,aes(x=EDUCATION))+
geom_bar()
# Age bar graph
ggplot(cl_df,aes(x=AGE))+
geom_histogram(binwidth = 5, fill="blue", col="white") + labs(title="Distribution by Age")
# Marital status bar graph
ggplot(cl_df,aes(x=MARRIAGE))+
geom_bar(fill="blue", col="white")
# default payment status bar graph
ggplot(cl_df,aes(x=PAY_AMT2))+
geom_bar(fill="blue", col="white")
df <- cl_df
colnames(df)[ncol(df)] <- "default_status"
total_rows <- nrow(df)
# % less than 40
age_less_40 <- mean(df$AGE < 40) * 100
# University level education
# 1 = graduate school, 2 = university, 3 = high school, 4 = others, 5=unknown, 6=unknown
univ_count <- sum(df$EDUCATION == 2)
# Sex: 1 = male, 2 = female
female_count <- sum(df$SEX == 2)
male_count <- sum(df$SEX == 1)
female_to_male_ratio <- female_count / male_count
# Default rate
default_rate <- mean(df$default_status) * 100
cat(sprintf("Age < 40: %.2f%%\n", age_less_40))
## Age < 40: 69.52%
cat(sprintf("University count: %d\n", univ_count))
## University count: 14030
cat(sprintf("Female to Male ratio: %.2f\n", female_to_male_ratio))
## Female to Male ratio: NaN
cat(sprintf("Default rate: %.2f%%\n", default_rate))
## Default rate: 22.12%
## 4.3 How do default rates vary by age groups(take age difference of 5 years), education level, gender and Marital Status
level, gender and Marital Status?
df$age_group <- cut(df$AGE, breaks = seq(min(df$AGE), max(df$AGE) + 5, by = 5), right = FALSE)
age_default <- df %>% group_by(age_group) %>% summarize(rate = mean(default_status))
print(age_default)
## # A tibble: 12 × 2
## age_group rate
## <fct> <dbl>
## 1 [21,26) 0.267
## 2 [26,31) 0.201
## 3 [31,36) 0.194
## 4 [36,41) 0.216
## 5 [41,46) 0.221
## 6 [46,51) 0.250
## 7 [51,56) 0.248
## 8 [56,61) 0.264
## 9 [61,66) 0.269
## 10 [66,71) 0.254
## 11 [71,76) 0.357
## 12 [76,81) 0
# Education
edu_default <- df %>% group_by(EDUCATION) %>% summarize(rate = mean(default_status))
print(edu_default)
## # A tibble: 7 × 2
## EDUCATION rate
## <dbl> <dbl>
## 1 0 0
## 2 1 0.192
## 3 2 0.237
## 4 3 0.252
## 5 4 0.0569
## 6 5 0.0643
## 7 6 0.157
# Sex
sex_default <- df %>% group_by(SEX) %>% summarize(rate = mean(default_status))
print(sex_default)
## # A tibble: 2 × 2
## SEX rate
## <fct> <dbl>
## 1 Male 0.242
## 2 Female 0.208
# Marriage
marriage_default <- df %>% group_by(MARRIAGE) %>% summarize(rate = mean(default_status))
print(marriage_default)
## # A tibble: 4 × 2
## MARRIAGE rate
## <dbl> <dbl>
## 1 0 0.0926
## 2 1 0.235
## 3 2 0.209
## 4 3 0.260
defined by a combination of attributes or behaviors – for example, “young customers age ≤ 30 with low education level” or “customers who maxed out their credit limit and had multiple late payments.” Clearly define the segment, quantify its default rate.
df<-cl_df
colnames(df)[ncol(df)] <- "default_status"
segment <- df %>% filter(AGE <= 30, SEX == 1, EDUCATION == 3)
cat(sprintf("Segment Default Rate: %.2f%%\n", mean(segment$default_status)*100))
## Segment Default Rate: NaN%
cat(sprintf("Segment Size: %d\n", nrow(segment)))
## Segment Size: 0
segment2 <- df %>% filter(AGE <= 30, EDUCATION == 3)
cat(sprintf("Segment2 Default Rate: %.2f%%\n", mean(segment2$default_status)*100))
## Segment2 Default Rate: 27.10%