Processing math: 100%

Banking customers' dataset (CSV file)

reading data into R

getting current path

# Show the directory R currently resolves relative file paths against.
getwd()

method 1 to read file and set working directory

setting path

# Point the session at the folder holding the data, confirm the change,
# then read the CSV by its bare file name (resolved against that folder).
# NOTE(review): setwd() with an absolute path ties the script to one machine.
setwd("C:/XLRI/R data set")
getwd()
data <- read.csv("DATASET.csv")

method 2

# Read the CSV through its full path; the working directory is not touched.
data <- read.csv("C:/XLRI/R data set/DATASET.csv")

method 3

# Pick the file through an interactive dialog, then open the result in the
# spreadsheet-style data viewer.
data <- read.csv(file.choose())
View(data)

descriptive statistics

head(data)     # first rows
tail(data)     # last rows
str(data)      # column types and overall structure
summary(data)  # per-column summary statistics

missing data

is.na() - displays boolean values

# Logical matrix: TRUE wherever a cell is missing.
is.na(data)

# Number of missing values in each column.
colSums(is.na(data))

histogram to confirm mean or median for imputation of missing values

# Distribution shape of the two numeric columns being checked.
hist(data$Age)
hist(data$SCR)

# Numeric summaries (mean and median included) for the same columns.
summary(data$Age)
summary(data$SCR)

converting the Gender to factor (factor treat them as categorical data)

# Store Gender as a factor so it is handled as a categorical variable,
# then re-inspect the structure to confirm the new column type.
data$Gender <- factor(data$Gender)
str(data)

Use the mean to fill in missing records, since the mean and median are close to each other

# Column means with missing values excluded from the calculation.
mean(data$Age, na.rm = TRUE)
mean(data$SCR, na.rm = TRUE)

indexing of na values from Age column

# Select the Age entries that are currently NA.
data$Age[is.na(data$Age)]

Replacing na values of Age column with calculated value

# Overwrite every missing Age with the mean of the non-missing Age values.
data$Age[is.na(data$Age)] <- mean(data$Age, na.rm = TRUE)

Replacing NA values of the SCR column with the calculated mean

# Overwrite every missing SCR with the mean of the non-missing SCR values.
data$SCR[is.na(data$SCR)] <- mean(data$SCR, na.rm = TRUE)

Confirming that the missing values have been replaced with the mean

# Cell-level missingness, followed by the per-column count of missing values.
is.na(data)
colSums(is.na(data))

EDA - using ggplot2

library(ggplot2)

Different Layers in ggplot2

Layer 1 - Data

Layer 2 - Aesthetics - which column used for plot

Layer 3 - Geometries - type of plot

Layer 4 - Statistics

Layer 5 - Facets

Layer 6 - Coordinates

Layer 7 - Theme

# Largest value in the Age column.
max(data$Age)

# Histogram of Age with blue bars. The constant colour is set on the geom:
# inside aes() the string "blue" would be treated as a data value, which
# yields a legend entry labelled "blue" drawn in the default palette colour.
ggplot(data, aes(Age)) + geom_histogram(fill = "blue")

to differentiate genders - Male, female, others with different colours

# Age histogram with the bar fill mapped to the Gender column.
ggplot(data = data, mapping = aes(x = Age, fill = Gender)) +
  geom_histogram()

Facets - creating matrix of panels

based on Gender

# One histogram panel per Gender value, stacked as rows.
ggplot(data, aes(Age)) +
  geom_histogram() +
  facet_grid(Gender ~ .)

# based on Occupation
ggplot(data, aes(Age)) +
  geom_histogram() +
  facet_grid(Occupation ~ .)

Adjusting coordinates of axes

# Occupation-wise Age histograms with the visible y range set to 0-500.
ggplot(data = data, mapping = aes(x = Age)) +
  geom_histogram() +
  facet_grid(Occupation ~ .) +
  coord_cartesian(ylim = c(0, 500))

Theme - to give label names

# Same faceted histogram as before, now with explicit axis titles.
ggplot(data, aes(Age)) +
  geom_histogram() +
  facet_grid(Occupation ~ .) +
  coord_cartesian(ylim = c(0, 500)) +
  xlab("Age of Customers") +
  ylab("No. of Customers")

Count plots - shows count of categorical variables

# Bar chart of row counts per Occupation, with bars filled by Gender.
ggplot(data = data, mapping = aes(x = Occupation, fill = Gender)) +
  geom_bar()

boxplot - aes(xvalue, yvalue)

# SCR distribution for each Occupation.
ggplot(data = data, mapping = aes(x = Occupation, y = SCR)) +
  geom_boxplot()

# SCR distribution for each Gender.
ggplot(data = data, mapping = aes(x = Gender, y = SCR)) +
  geom_boxplot()

# SCR distribution for each Occupation, with boxes filled by Gender.
ggplot(data = data, mapping = aes(x = Occupation, y = SCR, fill = Gender)) +
  geom_boxplot()