Minor Project

This is a banking data analysis script.

Getting started with loading all required libraries.

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.3.0      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 1.0.0 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(plotrix)
library(ggplot2)

Loading dataset named banking

library(readr)
# Read the banking CSV into a tibble (hard-coded absolute Windows path)
banking <- readr::read_csv(
  file = "C:/Users/malik/Downloads/banking.csv"
)

## Rows: 1000 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): name, email, phone_number, job
## dbl (8): customer_id, age, balance, loan, day, month, year, campaign
## lgl (1): housing
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Open the data viewer only in interactive sessions; View() is skipped when
# the script is knitted or run non-interactively (e.g. with Rscript).
if (interactive()) {
  View(banking)
}

Performing Data Cleaning

Get dataset’s description

# dim() returns c(rows, cols); collapse it first so that one string is printed
# instead of one string per dimension.
print(paste("Dimensions of dataset:", paste(dim(banking), collapse = " x ")))

## [1] "Dimensions of dataset: 1000 x 13"

Summary of my Dataset

# Per-column summary statistics of the raw dataset
summary(object = banking)

##      name              email            customer_id          age       
##  Length:1000        Length:1000        Min.   :   1.0   Min.   :18.00  
##  Class :character   Class :character   1st Qu.: 250.8   1st Qu.:34.00  
##  Mode  :character   Mode  :character   Median : 500.5   Median :51.00  
##                                        Mean   : 500.5   Mean   :51.21  
##                                        3rd Qu.: 750.2   3rd Qu.:69.00  
##                                        Max.   :1000.0   Max.   :85.00  
##                                                         NA's   :88     
##  phone_number          balance            loan        housing       
##  Length:1000        Min.   :  5442   Min.   :10024   Mode :logical  
##  Class :character   1st Qu.:249472   1st Qu.:32650   FALSE:480      
##  Mode  :character   Median :515405   Median :53930   TRUE :486      
##                     Mean   :507163   Mean   :54886   NA's :34       
##                     3rd Qu.:761599   3rd Qu.:77593                  
##                     Max.   :999607   Max.   :99977                  
##                                      NA's   :42                     
##       day            month             year         campaign    
##  Min.   : 1.00   Min.   : 1.000   Min.   :1995   Min.   : 1.00  
##  1st Qu.: 9.00   1st Qu.: 3.000   1st Qu.:2001   1st Qu.: 6.00  
##  Median :17.00   Median : 7.000   Median :2008   Median :11.00  
##  Mean   :16.55   Mean   : 6.527   Mean   :2008   Mean   :10.61  
##  3rd Qu.:24.00   3rd Qu.:10.000   3rd Qu.:2016   3rd Qu.:16.00  
##  Max.   :31.00   Max.   :12.000   Max.   :2022   Max.   :20.00  
##                                                  NA's   :39     
##      job           
##  Length:1000       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

Number of NA values in the banking dataset

# Total number of missing cells across all columns of the raw dataset
banking %>%
  is.na() %>%
  sum()

## [1] 354

Removing rows that contain NA values

# Drop every row that has at least one NA; `bank` is the cleaned dataset used
# by all plots and statistics below.
bank <- na.omit(banking)

# Open the data viewer only in interactive sessions
if (interactive()) {
  View(bank)
}

Verifying whether any NA values remain

# Check the cleaned data frame (`bank`), not the original (`banking`)
sum(is.na(bank))

## [1] 0

HISTOGRAM:-

# Histogram of the `day` column, grouped into bins 5 days wide
bank %>%
  ggplot(aes(x = day)) +
  geom_histogram(binwidth = 5, fill = "lightgreen", colour = "black") +
  ggtitle("Distribution of Days") +
  xlab("Day") +
  ylab("Frequency")

PIE CHART:-

# Number of rows for each value of `housing`
housing_counts <- count(bank, housing, name = "count")

# Pie chart: a single stacked bar wrapped around a circle via polar coordinates
ggplot(housing_counts, aes(x = "", y = count, fill = housing)) +
  geom_col(width = 1, colour = "white") +
  coord_polar(theta = "y") +
  labs(
    title = "Housing Loan Status",
    fill = "Housing",
    x = NULL,
    y = NULL
  ) +
  theme_void()

Density Plot

# Kernel density estimate of the `age` column, drawn semi-transparent
bank %>%
  ggplot(aes(x = age)) +
  geom_density(alpha = 0.5, fill = "blue") +
  ggtitle("Age Density Plot") +
  xlab("Age") +
  ylab("Density")

SCATTER PLOT:-

# Sort by balance, then by loan (both descending) and keep the top 20 rows
top20 <- head(bank[order(-bank$balance, -bank$loan), ], 20)

# Scatter plot of loan against balance for those 20 rows
ggplot(data = top20, aes(x = balance, y = loan)) +
  geom_point() +
  labs(
    title = "Scatter Plot of Top 20 Balances vs. Loans",
    x = "Balance",
    y = "Loan"
  )

LINE CHART:-

# Line chart of the mean balance for each year
ggplot(
  data = bank %>%
    group_by(year) %>%
    summarise(mean_balance = mean(balance)),
  mapping = aes(x = year, y = mean_balance)
) +
  geom_line() +
  ggtitle("Average Balance Over Years") +
  xlab("Year") +
  ylab("Average Balance")

# Stacked Area Chart for Month and Campaign
# Count rows per (month, campaign) pair. `.groups = "drop"` returns an
# ungrouped tibble and silences the summarise() grouping message.
month_campaign_counts <- bank %>%
  group_by(month, campaign) %>%
  summarise(count = n(), .groups = "drop")

# `campaign` is numeric, so it is converted to a factor: a continuous fill does
# not split the data into groups, and an area's fill cannot vary within one
# group. As a factor, each campaign value becomes its own stacked band.
ggplot(
  month_campaign_counts,
  aes(x = month, y = count, fill = factor(campaign))
) +
  geom_area() +
  labs(title = "Campaign Activity Over Months",
       x = "Month", y = "Count", fill = "Campaign")

Computing Measures of Central Tendency and Dispersion:-

# Central tendency of the balance column
mean_balance <- mean(bank$balance)
median_balance <- median(bank$balance)
# Mode = the most frequent value (the smallest one when several tie).
# NOTE(review): if no balance value repeats, this is simply the minimum and is
# not a meaningful mode -- confirm against the data.
mode_balance <- as.numeric(names(which.max(table(bank$balance))))

# Dispersion of the balance column
range_balance <- range(bank$balance) # c(min, max), not max - min
variance_balance <- var(bank$balance)
std_deviation_balance <- sqrt(variance_balance)
q1_balance <- quantile(bank$balance, probs = 0.25)
q3_balance <- quantile(bank$balance, probs = 0.75)
iqr_balance <- q3_balance - q1_balance



# Display the Central Tendency
# cat() writes "\n" as a real newline; print() would show it as literal text
cat("Measures of Central Tendency:\n")

## Measures of Central Tendency:

cat("Mean:", mean_balance, "\n")

## Mean: 494321.5

cat("Median:", median_balance, "\n")

## Median: 497429.6

cat("Mode:", mode_balance, "\n")

## Mode: 5442.22

# Display the Measures of Dispersion
# cat() writes "\n" as a real newline; print() would show it as literal text
cat("\nMeasures of Dispersion:\n")

## 
## Measures of Dispersion:

cat("Range:", range_balance, "\n")

## Range: 5442.22 999607.1

cat("Variance:", variance_balance, "\n")

## Variance: 83574460647

cat("Standard Deviation:", std_deviation_balance, "\n")

## Standard Deviation: 289092.5

cat("Interquartile Range (IQR):", iqr_balance, "\n")

## Interquartile Range (IQR): 495985.4

Minor Project

Arsh Malik

2023-09-15