Introduction

This brief Exploratory Data Analysis (EDA) gives an overview of the consumer complaints database provided by the Consumer Financial Protection Bureau (CFPB). The dataset can be downloaded [here](http://catalog.data.gov/dataset/consumer-complaint-database).

Exploratory Data Analysis

Load required library

# NOTE(review): warn = -1 silences every warning for the rest of the session,
# which can hide real problems; kept so the rendered output stays unchanged.
options(warn = -1)

# Plotting, colour palettes and plot layout
library(ggplot2)
library(RColorBrewer)
library(gridExtra)
library(wordcloud)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(devtools)
# NofRow() is used later but not defined in this file; presumably it comes
# from skimthru -- TODO confirm.
library(skimthru)
# The CSV is read once, in the "Load the dataset" step below; the duplicate
# read that used to sit here only doubled the load time.

Load the dataset

# Read the complaints export, keeping text columns as character vectors.
# FALSE is spelled out because the alias `F` is an ordinary variable that
# can be reassigned.
df <- read.csv("Consumer_Complaints.csv", header = TRUE, stringsAsFactors = FALSE)

# Both date columns are parsed as month/day/four-digit-year into Date objects.
df$Date.received <- as.Date(df$Date.received, format = "%m/%d/%Y")
df$Date.sent.to.company <- as.Date(df$Date.sent.to.company, format = "%m/%d/%Y")

Exploratory Analysis

Create a word cloud that shows the products with the most complaints

# Fix the RNG seed so the word cloud layout is reproducible between runs.
set.seed(1)

# NofRow() is not defined in this file (presumably from skimthru -- TODO
# confirm). Output printed later in this document shows NofRow(df, 2)
# yielding a `uniqueV` column of product names and a `count` column.
clouddf1 <- NofRow(df, 2)

# "Accent" only has 8 colours; asking for 12 made brewer.pal() warn and fall
# back to the 8-colour palette, so request 8 directly (same colours).
wordcloud(
    clouddf1[, 1],
    clouddf1[, 2],
    scale = c(3, .8),
    colors = brewer.pal(8, "Accent")
)

Create a bar chart that shows the products with the most complaints

# Number of distinct products, and a palette generator that interpolates the
# 8-colour "Set2" palette to any requested number of colours.
colcount.product <- length(unique(df$Product))
getPalette <- colorRampPalette(brewer.pal(8, "Set2"))

# Complaint count per product, one fill colour per product. The x-axis tick
# labels are blanked out.
ggplot(df, aes(x = Product)) +
    geom_bar(aes(fill = factor(Product))) +
    scale_fill_manual(values = getPalette(colcount.product)) +
    theme(axis.text.x = element_blank())

Create a polar (coxcomb) bar chart that shows the products with the most complaints

# Same per-product counts as a bar chart, drawn in polar coordinates.
# Axis titles, ticks and the radial (y) labels are removed; the product
# labels around the circle are kept at size 12.
ggplot(df, aes(x = Product)) +
    geom_bar(aes(fill = factor(Product))) +
    coord_polar() +
    scale_fill_manual(values = getPalette(colcount.product)) +
    theme(
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(size = 12),
        axis.text.y = element_blank(),
        axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank()
    )

Overview of all data by submission method, company response to consumer, timely response, and consumer dispute

# p2: number of complaints per submission channel.
p2 <- ggplot(df, aes(x = Submitted.via)) +
    geom_bar(aes(fill = Submitted.via)) +
    scale_fill_brewer(palette = "Accent") +
    theme(axis.text.x = element_blank())

# p3: number of complaints per type of company response.
p3 <- ggplot(df, aes(x = Company.response.to.consumer)) +
    geom_bar(aes(fill = Company.response.to.consumer)) +
    scale_fill_brewer(palette = "Dark2") +
    theme(axis.text.x = element_blank())

# p4: pie chart (single stacked bar in polar coordinates) of the product mix
# among complaints whose Timely.response. value is "No".
p4 <- ggplot(
    df[df$Timely.response. %in% "No", ],
    aes(x = factor(1), fill = Product)
) +
    geom_bar(width = 1) +
    coord_polar(theta = "y") +
    scale_fill_manual(values = getPalette(colcount.product)) +
    labs(title = "Products that failed to provide timely response") +
    theme(
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank()
    )

# p5: the same pie layout for complaints whose Consumer.disputed value is "Yes".
p5 <- ggplot(
    df[df$Consumer.disputed %in% "Yes", ],
    aes(x = factor(1), fill = Product)
) +
    geom_bar(width = 1) +
    coord_polar(theta = "y") +
    scale_fill_manual(values = getPalette(colcount.product)) +
    labs(title = "Products that customers disputed") +
    theme(
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank()
    )

# Lay the four panels out on a 2 x 2 grid.
grid.arrange(p2, p3, p4, p5, nrow = 2, ncol = 2)

Create a function that shows, for a single product, an overview of submission method, company response to consumer, timely response, and consumer disputes.

# Draw an overview of the complaints filed about a single product.
#
# Args:
#   dataframe: complaints data frame. The columns read here are Product,
#              Sub.product, Submitted.via, Company.response.to.consumer,
#              Timely.response. and Consumer.disputed.
#   prod:      product name to match against dataframe$Product.
#
# Draws a 1 x 2 grid (submission method, company response) when none of the
# product's rows names a sub-product; otherwise a 2 x 2 grid that adds two
# sub-product pie charts (untimely responses, disputed complaints).
# Returns whatever grid.arrange() returns.
EDA.Sub.product <- function(dataframe, prod){
    # Filter the data frame that was passed in rather than a global `df`.
    # %in% never yields NA, so rows with a missing Product are simply dropped.
    EDAdf <- dataframe[dataframe$Product %in% prod, ]

    p2.1 <- ggplot(EDAdf, aes(x = Submitted.via)) +
        geom_bar(aes(fill = Submitted.via)) +
        theme(axis.text.x = element_blank()) +
        scale_fill_brewer(palette = "Accent") +
        labs(title = paste("Submission Method for ", prod))

    p3.1 <- ggplot(EDAdf, aes(x = Company.response.to.consumer)) +
        geom_bar(aes(fill = Company.response.to.consumer)) +
        theme(axis.text.x = element_blank()) +
        scale_fill_brewer(palette = "Dark2") +
        labs(title = paste("Company Response to Complaints regarding ", prod))

    # `if` needs a single TRUE/FALSE: a whole column as the condition is an
    # error in R >= 4.2 and, before that, silently looked at the first row
    # only. Treat the product as having no sub-products when every row is
    # blank (or missing).
    if (all(EDAdf$Sub.product %in% c("", NA))) {
        grid.arrange(p2.1, p3.1, nrow = 1, ncol = 2)
    } else {
        # Sub-product pies are only built when they are going to be drawn.
        p4.1 <- ggplot(EDAdf[EDAdf$Timely.response. %in% "No", ],
                       aes(x = factor(1), fill = Sub.product)) +
            geom_bar(width = 1) +
            coord_polar(theta = "y") +
            theme(axis.text.x = element_blank(), axis.text.y = element_blank(),
                  axis.title.y = element_blank(), axis.title.x = element_blank()) +
            scale_fill_brewer(palette = "Set3") +
            labs(title = paste0(prod, " failed to responde timely"))

        p5.1 <- ggplot(EDAdf[EDAdf$Consumer.disputed %in% "Yes", ],
                       aes(x = factor(1), fill = Sub.product)) +
            geom_bar(width = 1) +
            coord_polar(theta = "y") +
            theme(axis.text.x = element_blank(), axis.text.y = element_blank(),
                  axis.title.y = element_blank(), axis.title.x = element_blank()) +
            scale_fill_brewer(palette = "Set3") +
            labs(title = paste0(prod, " Complaints that Consumer Disputed"))

        grid.arrange(p2.1, p3.1, p4.1, p5.1, nrow = 2, ncol = 2)
    }
}

Show the five products with the most complaints and display an overview of each product

# Count complaints per product (NofRow() is not defined in this file;
# presumably from skimthru -- TODO confirm) and sort by count, largest first.
topprod <- NofRow(df, 2)
topprod <- topprod[order(topprod$count, decreasing = TRUE), ]

# Print the five most complained-about products.
topprod[seq_len(5), ]
##                   uniqueV  count
## 1                Mortgage 176980
## 2         Debt collection  92458
## 5        Credit reporting  82599
## 6             Credit card  62326
## 3 Bank account or service  58731

# Draw the per-product overview for each of those five, in rank order.
invisible(lapply(seq_len(5), function(i) EDA.Sub.product(df, topprod[i, 1])))