Minor Project

This document contains the code for a banking data analysis.

Getting started with loading all required libraries.

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.3.0      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 1.0.0 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(plotrix)
library(ggplot2)

Loading dataset named banking

library(readr)
# Load the banking dataset. The location can be overridden by setting the
# BANKING_CSV environment variable; otherwise the original path is used.
banking <- read_csv(
  Sys.getenv("BANKING_CSV", unset = "C:/Users/malik/Downloads/banking.csv")
)

## Rows: 1000 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): name, email, phone_number, job
## dbl (8): customer_id, age, balance, loan, day, month, year, campaign
## lgl (1): housing
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Open the data viewer only in an interactive session; when the document is
# knitted or run with Rscript there is no viewer to show the data in.
if (interactive()) {
  View(banking)
}

Performing Data Cleaning

Get dataset’s description

# dim() returns c(rows, columns); collapse it first so one sentence is printed
# instead of paste() repeating the label once per dimension.
print(paste("Dimensions of dataset:", paste(dim(banking), collapse = " x ")))

## [1] "Dimensions of dataset: 1000 x 13"

Summary of my Dataset

# Per-column summary (quartiles for numeric columns, counts for logical ones).
banking %>%
  summary()

##      name              email            customer_id          age       
##  Length:1000        Length:1000        Min.   :   1.0   Min.   :18.00  
##  Class :character   Class :character   1st Qu.: 250.8   1st Qu.:34.00  
##  Mode  :character   Mode  :character   Median : 500.5   Median :51.00  
##                                        Mean   : 500.5   Mean   :51.21  
##                                        3rd Qu.: 750.2   3rd Qu.:69.00  
##                                        Max.   :1000.0   Max.   :85.00  
##                                                         NA's   :88     
##  phone_number          balance            loan        housing       
##  Length:1000        Min.   :  5442   Min.   :10024   Mode :logical  
##  Class :character   1st Qu.:249472   1st Qu.:32650   FALSE:480      
##  Mode  :character   Median :515405   Median :53930   TRUE :486      
##                     Mean   :507163   Mean   :54886   NA's :34       
##                     3rd Qu.:761599   3rd Qu.:77593                  
##                     Max.   :999607   Max.   :99977                  
##                                      NA's   :42                     
##       day            month             year         campaign    
##  Min.   : 1.00   Min.   : 1.000   Min.   :1995   Min.   : 1.00  
##  1st Qu.: 9.00   1st Qu.: 3.000   1st Qu.:2001   1st Qu.: 6.00  
##  Median :17.00   Median : 7.000   Median :2008   Median :11.00  
##  Mean   :16.55   Mean   : 6.527   Mean   :2008   Mean   :10.61  
##  3rd Qu.:24.00   3rd Qu.:10.000   3rd Qu.:2016   3rd Qu.:16.00  
##  Max.   :31.00   Max.   :12.000   Max.   :2022   Max.   :20.00  
##                                                  NA's   :39     
##      job           
##  Length:1000       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

Number of NA values in the banking dataset

# Total number of missing cells across the whole raw dataset.
banking %>%
  is.na() %>%
  sum()

## [1] 354

Removing the rows that contain NA values

# Keep only complete rows: na.omit() drops every row with at least one NA.
bank <- na.omit(banking)

# Inspect the cleaned data only when an interactive viewer is available.
if (interactive()) {
  View(bank)
}

Verifying that no NA values remain

# Total number of missing cells left in the cleaned dataset.
bank %>%
  is.na() %>%
  sum()

## [1] 0

HISTOGRAM:-

# Histogram of the `day` column, using bins that are 5 days wide.
ggplot(bank, mapping = aes(day)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "lightgreen") +
  ggtitle("Distribution of Days") +
  xlab("Day") +
  ylab("Frequency")

PIE CHART:-

# Number of rows for each value of the `housing` flag.
housing_counts <- count(bank, housing, name = "count")

# A single full-width stacked bar drawn on a polar y-axis gives the pie chart.
ggplot(housing_counts, aes(x = "", y = count, fill = housing)) +
  geom_col(width = 1, colour = "white") +
  coord_polar(theta = "y") +
  labs(title = "Housing Loan Status", fill = "Housing", x = NULL, y = NULL) +
  theme_void()

Density Plot

# Density estimate of the `age` column, filled semi-transparent blue.
ggplot(data = bank, mapping = aes(x = age)) +
  geom_density(alpha = 0.5, fill = "blue") +
  ggtitle("Age Density Plot") +
  xlab("Age") +
  ylab("Density")

SCATTER PLOT:-

# Sort by balance, then by loan (both descending), and keep the first 20 rows.
top20 <- bank %>%
  arrange(desc(balance), desc(loan)) %>%
  head(20)

# Scatter plot of loan against balance for those 20 rows.
ggplot(data = top20, aes(x = balance, y = loan)) +
  geom_point() +
  labs(
    title = "Scatter Plot of Top 20 Balances vs. Loans",
    x = "Balance",
    y = "Loan"
  )

LINE CHART:-

# Mean balance per year, drawn as a line chart.
bank %>%
  group_by(year) %>%
  summarise(mean_balance = mean(balance)) %>%
  ggplot(mapping = aes(x = year, y = mean_balance)) +
  geom_line() +
  ggtitle("Average Balance Over Years") +
  xlab("Year") +
  ylab("Average Balance")

# Stacked Area Chart for Month and Campaign
# Count the rows for every (month, campaign) pair. `.groups = "drop"` returns
# an ungrouped tibble and avoids summarise()'s regrouping message.
month_campaign_counts <- bank %>%
  group_by(month, campaign) %>%
  summarise(count = n(), .groups = "drop")

# `campaign` is numeric, so mapping it directly yields a continuous fill and a
# single group. Converting it to a factor gives one stacked area per campaign
# value. Months run from 1 to 12, so label every month on the x-axis.
ggplot(
  month_campaign_counts,
  aes(x = month, y = count, fill = factor(campaign))
) +
  geom_area() +
  scale_x_continuous(breaks = 1:12) +
  labs(title = "Campaign Activity Over Months",
       x = "Month", y = "Count", fill = "Campaign")

Measures of Central Tendency and Dispersion:-

# Central tendency of the `balance` column.
# NOTE(review): the mode is taken as the most frequent exact value, with ties
# resolved by sort()'s ordering; for a continuous column whose values are
# mostly unique this may not be a meaningful statistic -- confirm intent.
mean_balance <- mean(bank[["balance"]])
median_balance <- median(bank[["balance"]])
mode_balance <- bank[["balance"]] %>%
  table() %>%
  sort(decreasing = TRUE) %>%
  names() %>%
  head(1) %>%
  as.numeric()

# Dispersion of the `balance` column: min/max, variance, standard deviation,
# and the interquartile range built from the 25% and 75% quantiles.
range_balance <- range(bank[["balance"]])
variance_balance <- var(bank[["balance"]])
std_deviation_balance <- sd(bank[["balance"]])
q1_balance <- quantile(bank[["balance"]], probs = 0.25)
q3_balance <- quantile(bank[["balance"]], probs = 0.75)
iqr_balance <- q3_balance - q1_balance


# Display the Central Tendency
# cat() writes the text itself; print() would show the quoted string with a
# literal "\n" instead of starting a new line.
cat("Measures of Central Tendency:\n")

## Measures of Central Tendency:

cat("Mean:", mean_balance, "\n")

## Mean: 494321.5

cat("Median:", median_balance, "\n")

## Median: 497429.6

cat("Mode:", mode_balance, "\n")

## Mode: 5442.22

# Display the Measures of Dispersion
# The leading "\n" leaves a blank line between the two sections.
cat("\nMeasures of Dispersion:\n")

## 
## Measures of Dispersion:

# range_balance holds two values (minimum, maximum); cat() prints both.
cat("Range:", range_balance, "\n")

## Range: 5442.22 999607.1

cat("Variance:", variance_balance, "\n")

## Variance: 83574460647

cat("Standard Deviation:", std_deviation_balance, "\n")

## Standard Deviation: 289092.5

cat("Interquartile Range (IQR):", iqr_balance, "\n")

## Interquartile Range (IQR): 495985.4

Conclusion

# 10 highest loan borrowers
# Sort by loan, largest first. head() caps the result at the rows available,
# whereas `[1:10, ]` indexes past the end when fewer than 10 rows exist.
highestloan_10 <- head(bank[order(-bank$loan), ], 10)
highestloan_10

## # A tibble: 10 × 13
##    name    email  custo…¹   age phone…² balance   loan housing   day month  year
##    <chr>   <chr>    <dbl> <dbl> <chr>     <dbl>  <dbl> <lgl>   <dbl> <dbl> <dbl>
##  1 Sabra   sbott…     389    82 263-91… 200990. 99977. FALSE      26     9  1995
##  2 Wit     wfree…     180    35 965-73… 183993. 99881. TRUE       21     5  2002
##  3 Minna   mmang…     149    69 129-34… 883715. 99870. TRUE       18     2  2016
##  4 Fionna  fmart…     966    73 564-36…  12440. 99853. TRUE       31     2  2004
##  5 Homerus harne…     191    34 785-16… 122084. 99778. TRUE       30     6  2001
##  6 Dugald  dmatt…     187    65 539-50… 890699. 99717. FALSE      26     5  2018
##  7 Cassie  cmcca…     335    68 186-46… 924931. 99577. FALSE      20     1  2018
##  8 Sandi   smonc…     297    73 886-52…  48548. 99478. TRUE       14    11  2018
##  9 Massimo mflam…     277    39 887-47… 536319. 99088. TRUE       26    12  2009
## 10 Kassi   kbidm…     790    32 525-84… 657573. 98964. TRUE       17     3  2018
## # … with 2 more variables: campaign <dbl>, job <chr>, and abbreviated variable
## #   names ¹customer_id, ²phone_number

# 10 highest balance customers
# Sort by balance, largest first. head() caps the result at the rows available,
# whereas `[1:10, ]` indexes past the end when fewer than 10 rows exist.
highestbal_10 <- head(bank[order(-bank$balance), ], 10)
highestbal_10

## # A tibble: 10 × 13
##    name     email custo…¹   age phone…² balance   loan housing   day month  year
##    <chr>    <chr>   <dbl> <dbl> <chr>     <dbl>  <dbl> <lgl>   <dbl> <dbl> <dbl>
##  1 Danette  dort…     469    65 406-17… 999607. 75049. TRUE        5     7  2014
##  2 Yardley  ymas…     928    35 970-31… 991077. 76478. TRUE        5     9  2001
##  3 Antonie… acoc…     549    27 992-57… 990204. 57300. FALSE       7     5  2002
##  4 Hilly    hsze…     513    68 294-72… 990097. 48897. FALSE      17     7  2015
##  5 Elonore  eyou…     398    74 785-60… 989874. 37838. TRUE       13     4  2019
##  6 Pam      psie…     862    53 409-90… 989630. 17295. TRUE       19     8  2017
##  7 Jackque… jwad…      31    28 507-53… 986247. 80061. FALSE      31     3  1996
##  8 Corinne  cpen…      39    36 488-41… 985569. 81940. TRUE        8     8  2019
##  9 Ranna    rrym…      79    52 945-87… 984917. 52647. FALSE       8     8  2011
## 10 Mechelle msta…     574    19 256-16… 983253. 76958. TRUE        4     7  2000
## # … with 2 more variables: campaign <dbl>, job <chr>, and abbreviated variable
## #   names ¹customer_id, ²phone_number

Through the above analysis and visualizations, we learned about patterns and trends in the data: how many people are customers of the bank, which age groups are engaged, how many customers participate in campaign activities, who holds the highest balances and loans, and whether customers have a housing loan.

Minor Project

Arsh Malik

2023-09-15