This brief exploratory data analysis (EDA) gives an overview of the consumer complaints database provided by the Consumer Financial Protection Bureau (CFPB). The dataset can be downloaded [here](http://catalog.data.gov/dataset/consumer-complaint-database).
Load the required libraries
# NOTE(review): warn = -1 silences every warning for the rest of the session,
# so problems raised by the code below will not be shown. Consider removing it
# or suppressing only the specific, known-harmless warnings.
options(warn=-1)
# Plotting (ggplot2), colour palettes (RColorBrewer) and multi-plot layout
# (gridExtra, used for grid.arrange below).
library(ggplot2); library(RColorBrewer); library(gridExtra)
# wordcloud() is used for the product word cloud.
library(wordcloud)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(devtools)
# NOTE(review): presumably the source of NofRow(), which is called later but
# not defined in this file - confirm.
library(skimthru)
# The complaints CSV is read once, in the "Load the dataset" step that follows;
# the duplicate read.csv() call that used to sit here parsed the file twice.
Load the dataset
# Read the complaints export, keeping text columns as character vectors.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
df <- read.csv("Consumer_Complaints.csv", header = TRUE, stringsAsFactors = FALSE)
# Both date columns are parsed from month/day/four-digit-year text into Date.
df$Date.received <- as.Date(df$Date.received, format = "%m/%d/%Y")
df$Date.sent.to.company <- as.Date(df$Date.sent.to.company, format = "%m/%d/%Y")
Exploratory Analysis
Create a word cloud that shows the products with the most complaints
# Fix the RNG seed before drawing so the word cloud is reproducible.
set.seed(1)
# NofRow(df, 2) is an external helper; judging by the output printed further
# down it returns one row per distinct value of column 2 (Product) with its
# count - TODO confirm against the helper's definition.
clouddf1 <- NofRow(df, 2)
# Column 1 holds the words, column 2 their frequencies.
# The "Accent" palette has at most 8 colours; requesting 12 only produced a
# (suppressed) warning and fell back to those same 8, so ask for 8 directly.
wordcloud(clouddf1[, 1], clouddf1[, 2], scale = c(3, .8),
          colors = brewer.pal(8, "Accent"))
Create a bar chart that shows the products with the most complaints
# Number of distinct products = number of fill colours needed.
colcount.product <- length(unique(df$Product))
# Interpolates the 8-colour "Set2" palette to any requested number of colours.
getPalette <- colorRampPalette(brewer.pal(8, "Set2"))

# Complaint count per product. The x-axis labels are hidden, so the fill
# legend is what identifies each bar.
ggplot(df, aes(x = Product)) +
  geom_bar(aes(fill = factor(Product))) +
  scale_fill_manual(values = getPalette(colcount.product)) +
  theme(axis.text.x = element_blank())
Create a polar bar (coxcomb) chart that shows the products with the most complaints
# Same per-product counts as the bar chart, drawn in polar coordinates.
# Axis titles, ticks and the radial (y) labels are removed; the product names
# around the circle are kept at size 12.
ggplot(df, aes(x = Product)) +
  geom_bar(aes(fill = factor(Product))) +
  coord_polar() +
  scale_fill_manual(values = getPalette(colcount.product)) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x  = element_text(size = 12),
    axis.text.y  = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank()
  )
Overview of all complaints by submission method, company response to consumer, timely response, and consumer disputes
# Panel 1: number of complaints per submission channel.
p2 <- ggplot(df, aes(x = Submitted.via)) +
  geom_bar(aes(fill = Submitted.via)) +
  scale_fill_brewer(palette = "Accent") +
  theme(axis.text.x = element_blank())

# Panel 2: number of complaints per type of company response.
p3 <- ggplot(df, aes(x = Company.response.to.consumer)) +
  geom_bar(aes(fill = Company.response.to.consumer)) +
  scale_fill_brewer(palette = "Dark2") +
  theme(axis.text.x = element_blank())

# Panel 3: pie chart (a single stacked bar in polar coordinates) of the
# products among complaints whose timely-response flag is "No".
p4 <- ggplot(df[df$Timely.response. %in% "No", ],
             aes(x = factor(1), fill = Product)) +
  geom_bar(width = 1) +
  coord_polar(theta = "y") +
  scale_fill_manual(values = getPalette(colcount.product)) +
  labs(title = "Products that failed to provide timely response") +
  theme(
    axis.text.x  = element_blank(),
    axis.text.y  = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )

# Panel 4: the same pie layout for complaints the consumer disputed.
# NOTE(review): Timely.response. ends in a dot but Consumer.disputed does not;
# if the real column is "Consumer.disputed." this only works through `$`
# partial matching - confirm the column name.
p5 <- ggplot(df[df$Consumer.disputed %in% "Yes", ],
             aes(x = factor(1), fill = Product)) +
  geom_bar(width = 1) +
  coord_polar(theta = "y") +
  scale_fill_manual(values = getPalette(colcount.product)) +
  labs(title = "Products that customers disputed") +
  theme(
    axis.text.x  = element_blank(),
    axis.text.y  = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )

# Lay the four panels out on a 2 x 2 grid.
grid.arrange(p2, p3, p4, p5, nrow = 2, ncol = 2)
Create a function that shows, for a single product, an overview of the submission method, company response to consumer, timely response, and consumer disputes.
#' Draw an overview of the complaints filed for a single product
#'
#' Always draws two bar charts (submission method and company response) for
#' the selected product. When the product has at least one non-empty
#' sub-product, two pie charts are added that break down by sub-product the
#' complaints without a timely response and the complaints the consumer
#' disputed.
#'
#' @param dataframe Data frame of complaints. The columns `Product`,
#'   `Sub.product`, `Submitted.via`, `Company.response.to.consumer`,
#'   `Timely.response.` and `Consumer.disputed` are read.
#' @param prod A single product name, compared against `dataframe$Product`.
#' @return The value of `grid.arrange()`; the function is called for the
#'   plot it draws.
EDA.Sub.product <- function(dataframe, prod) {
  stopifnot(is.data.frame(dataframe), length(prod) == 1)

  # Filter the data frame that was passed in. The previous version ignored
  # its argument and always read the global `df`. `%in%` also avoids the
  # all-NA rows that `==` yields when Product is missing.
  EDAdf <- dataframe[dataframe$Product %in% prod, ]

  # Theme shared by both pie charts: no axis text or titles.
  blank_axes <- theme(
    axis.text.x = element_blank(), axis.text.y = element_blank(),
    axis.title.x = element_blank(), axis.title.y = element_blank()
  )

  p2.1 <- ggplot(EDAdf, aes(x = Submitted.via)) +
    geom_bar(aes(fill = Submitted.via)) +
    theme(axis.text.x = element_blank()) +
    scale_fill_brewer(palette = "Accent") +
    labs(title = paste("Submission Method for", prod))

  p3.1 <- ggplot(EDAdf, aes(x = Company.response.to.consumer)) +
    geom_bar(aes(fill = Company.response.to.consumer)) +
    theme(axis.text.x = element_blank()) +
    scale_fill_brewer(palette = "Dark2") +
    labs(title = paste("Company Response to Complaints regarding", prod))

  # Pie chart: sub-products among complaints flagged as not answered timely.
  p4.1 <- ggplot(EDAdf[EDAdf$Timely.response. %in% "No", ],
                 aes(x = factor(1), fill = Sub.product)) +
    geom_bar(width = 1) +
    coord_polar(theta = "y") +
    blank_axes +
    scale_fill_brewer(palette = "Set3") +
    labs(title = paste0(prod, " failed to respond timely"))

  # Pie chart: sub-products among complaints the consumer disputed.
  p5.1 <- ggplot(EDAdf[EDAdf$Consumer.disputed %in% "Yes", ],
                 aes(x = factor(1), fill = Sub.product)) +
    geom_bar(width = 1) +
    coord_polar(theta = "y") +
    blank_axes +
    scale_fill_brewer(palette = "Set3") +
    labs(title = paste0(prod, " Complaints that Consumer Disputed"))

  # `if` needs a single TRUE/FALSE. The old test compared the whole
  # Sub.product column with "", which is an error in R >= 4.2 and looked only
  # at the first row in older versions. Reduce the column to one flag instead.
  sub_products <- as.character(EDAdf$Sub.product)
  has_sub_products <- any(!is.na(sub_products) & sub_products != "")

  if (has_sub_products) {
    grid.arrange(p2.1, p3.1, p4.1, p5.1, nrow = 2, ncol = 2)
  } else {
    # No sub-product breakdown is possible, so show only the two bar charts.
    grid.arrange(p2.1, p3.1, nrow = 1, ncol = 2)
  }
}
Show the five products with the most complaints and display an overview of each product
# Complaint counts per value of column 2 (Product), as produced by the
# external helper NofRow().
topprod <- NofRow(df, 2)
# Sort rows so the largest count comes first (ascending order of -count).
topprod <- topprod[order(-topprod[["count"]]), ]
# Print the first five rows of the ranking.
topprod[seq_len(5), ]
## uniqueV count
## 1 Mortgage 176980
## 2 Debt collection 92458
## 5 Credit reporting 82599
## 6 Credit card 62326
## 3 Bank account or service 58731
# Draw the per-product overview for each of the five highest-ranked products,
# in ranking order. The first column of topprod holds the product name.
invisible(lapply(seq_len(5), function(rank) {
  EDA.Sub.product(df, topprod[rank, 1])
}))