Kiva_EDA

# install.packages("tidyverse")
# install.packages("stringr")
# install.packages("lubridate")
# install.packages("wordcloud")
# install.packages("tidytext")
# install.packages("DT")
# install.packages("leaflet")
# install.packages("igraph")
# install.packages("ggraph")
# install.packages("topicmodels")
# install.packages("SnowballC")
# install.packages("tm")
library(tidyverse) #  data manipulation and graphs

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.2     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.3.0
## v readr   1.1.1     v forcats 0.3.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(stringr) #  string manipulation
library(lubridate) #  date manipulation

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

library(wordcloud) #  wordcloud

## Loading required package: RColorBrewer

#library(tidytext) # tidy implementation of NLP methods
library(DT)       # table format display of data
library(leaflet) # maps
library(igraph) #  graphs

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:lubridate':
## 
##     %--%, union

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:purrr':
## 
##     compose, simplify

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following object is masked from 'package:tibble':
## 
##     as_data_frame

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

#library(ggraph) #  graphs
library(topicmodels) # for LDA topic modelling 
library(tm) # general text mining functions, making document term matrixes

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(SnowballC) # for stemming

# Data location ----
# The Kiva CSV extracts are read from `data_dir`. Set the KIVA_DATA_DIR
# environment variable to read them from somewhere else; when it is unset the
# original hard-coded folder is used. The working directory is not changed and
# the workspace is not cleared, so sourcing this script leaves the session
# as it was apart from the objects created below.
data_dir <- Sys.getenv(
  "KIVA_DATA_DIR",
  unset = "C:/Users/v-vyupad/Desktop/Analytics"
)

# Fill colours used by the bar charts and histogram further down.
fillColor <- "#FFA07A"
fillColor2 <- "#F1C40F"

# Input tables ----
loans <- read.csv(file.path(data_dir, "kiva_loans.csv"))
regions <- read.csv(file.path(data_dir, "kiva_mpi_region_locations.csv"))
themes <- read.csv(file.path(data_dir, "loan_theme_ids.csv"))
themes_region <- read.csv(file.path(data_dir, "loan_themes_by_region.csv"))

# Loan data ----
# Show the first rows of `loans` as a horizontally scrollable bootstrap table.
loans %>%
  head() %>%
  datatable(
    style = "bootstrap",
    class = "table-condensed",
    options = list(dom = "tp", scrollX = TRUE)
  )

## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

# Region data ----
# First rows of the MPI region locations table.
regions %>%
  head() %>%
  datatable(
    style = "bootstrap",
    class = "table-condensed",
    options = list(dom = "tp", scrollX = TRUE)
  )

# Themes data ----
# First rows of the loan theme id table.
themes %>%
  head() %>%
  datatable(
    style = "bootstrap",
    class = "table-condensed",
    options = list(dom = "tp", scrollX = TRUE)
  )

# Loan themes by region data ----
# First rows of the themes-by-region table.
themes_region %>%
  head() %>%
  datatable(
    style = "bootstrap",
    class = "table-condensed",
    options = list(dom = "tp", scrollX = TRUE)
  )

## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

# Most popular themes ----
# Horizontal bar chart of the ten most frequent loan theme types in `themes`.
themes %>%
  rename(themeType = `Loan.Theme.Type`) %>%
  # read.csv() keeps blank text fields as "" rather than NA, so an is.na()
  # check on its own would let untyped loans through as an unnamed category.
  filter(!is.na(themeType), themeType != "") %>%
  group_by(themeType) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  ungroup() %>%
  # Order the factor levels by frequency so the bars come out sorted.
  mutate(themeType = reorder(themeType, Count)) %>%
  head(10) %>%
  ggplot(aes(x = themeType, y = Count)) +
  geom_col(colour = "white", fill = fillColor) +
  # Print each count at the base of its bar (y = 1, left-aligned).
  geom_text(
    aes(y = 1, label = paste0("(", Count, ")")),
    hjust = 0, vjust = 0.5, size = 4, colour = "black", fontface = "bold"
  ) +
  labs(
    x = "Type of Theme",
    y = "Count",
    title = "Type of Theme and Count"
  ) +
  coord_flip() +
  theme_bw()

# Most popular countries ----
# Horizontal bar chart of the ten countries with the most rows in
# `themes_region`.
themes_region %>%
  group_by(country) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  ungroup() %>%
  # Order the factor levels by frequency so the bars come out sorted.
  mutate(country = reorder(country, Count)) %>%
  head(10) %>%
  ggplot(aes(x = country, y = Count)) +
  geom_col(colour = "white", fill = fillColor2) +
  # Print each count at the base of its bar (y = 1, left-aligned).
  geom_text(
    aes(y = 1, label = paste0("(", Count, ")")),
    hjust = 0, vjust = 0.5, size = 4, colour = "black", fontface = "bold"
  ) +
  labs(
    x = "Country",
    y = "Count",
    title = "Country and Count"
  ) +
  coord_flip() +
  theme_bw()

# Most popular sectors ----
# Horizontal bar chart of the ten sectors with the most rows in
# `themes_region`.
themes_region %>%
  group_by(sector) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  ungroup() %>%
  # Order the factor levels by frequency so the bars come out sorted.
  mutate(sector = reorder(sector, Count)) %>%
  head(10) %>%
  ggplot(aes(x = sector, y = Count)) +
  geom_col(colour = "white", fill = fillColor2) +
  # Print each count at the base of its bar (y = 1, left-aligned).
  geom_text(
    aes(y = 1, label = paste0("(", Count, ")")),
    hjust = 0, vjust = 0.5, size = 4, colour = "black", fontface = "bold"
  ) +
  labs(
    x = "Sector",
    y = "Count",
    title = "Sector and Count"
  ) +
  coord_flip() +
  theme_bw()

# World map of loans ----
# One red circle per row of `themes_region`, with radius equal to a tenth of
# `amount`, drawn on the National Geographic base map and opened on a
# whole-world view.
themes_region %>%
  leaflet() %>%
  addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(
    lng = ~lon,
    lat = ~lat,
    radius = ~ amount / 10,
    color = "red"
  ) %>%
  setView(lng = 0, lat = 0, zoom = 2)

## Warning in validateCoords(lng, lat, funcName): Data contains 2074 rows with
## either missing or invalid lat/lon values and will be ignored

# Loans in the Philippines ----
# Subset of `themes_region` for the Philippines with the theme column renamed
# to `themeType` and stored as a factor.
country_loans <- themes_region %>%
  filter(country == "Philippines") %>%
  rename(themeType = `Loan.Theme.Type`) %>%
  # When read.csv() has already produced a factor, as.factor() keeps the
  # levels of every country; droplevels() trims them to the themes actually
  # present here so a palette built on this column only spans those themes.
  mutate(themeType = droplevels(as.factor(themeType)))

# Palette of 22 colours used for the loan themes.
colorsList <- c(
  "#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4",
  "#46f0f0", "#f032e6", "#d2f53c", "#fabebe", "#008080", "#e6beff",
  "#aa6e28", "#fffac8", "#800000", "#aaffc3", "#808000", "#ffd8b1",
  "#000080", "#808080", "#FFFFFF", "#000000"
)

# Colour-mapping function from theme type to a palette colour.
factpal <- colorFactor(
  palette = colorsList,
  domain = country_loans$themeType
)


# Centre the view on the median coordinates, ignoring missing values.
center_lon <- median(country_loans$lon, na.rm = TRUE)
center_lat <- median(country_loans$lat, na.rm = TRUE)


# Circles are sized by a tenth of `amount` and coloured by theme type; the
# legend in the bottom-right corner uses the same colour mapping.
country_loans %>%
  leaflet() %>%
  addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(
    lng = ~lon,
    lat = ~lat,
    radius = ~ amount / 10,
    color = ~ factpal(themeType)
  ) %>%
  setView(lng = center_lon, lat = center_lat, zoom = 6) %>%
  addLegend(
    position = "bottomright",
    pal = factpal,
    values = ~themeType,
    title = "Loans Themes",
    opacity = 1
  )

## Warning in validateCoords(lng, lat, funcName): Data contains 384 rows with
## either missing or invalid lat/lon values and will be ignored

# Distribution of funded loan amount ----
# 50-bin histogram of `funded_amount` with both axes on a log10 scale and
# tick labels written as powers of ten.
ggplot(loans, aes(x = funded_amount)) +
  geom_histogram(fill = fillColor2, bins = 50) +
  scale_x_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  scale_y_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  labs(
    x = "Funded Loan Amount",
    y = "Count",
    title = paste("Distribution of", "Funded Loan Amount")
  ) +
  theme_bw()

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Removed 3383 rows containing non-finite values (stat_bin).

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 6 rows containing missing values (geom_bar).

# Distribution of funded loan amount by sector ----
# One box per sector, filled by sector, with `funded_amount` on a log10 axis
# labelled in powers of ten. Amounts of zero have no finite log10 value, so
# ggplot2 drops them with a warning.
loans %>%
  ggplot(aes(x = sector, y = funded_amount, fill = sector)) +
  geom_boxplot() +
  scale_y_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  labs(
    x = "Sector Type",
    y = "Funded Amount",
    title = "Distribution of Funded Amount by Sector"
  ) +
  theme_bw() +
  # Rotate the sector names so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 3383 rows containing non-finite values (stat_boxplot).

# Distribution of loans by gender ----
# Horizontal bar chart of the ten most frequent `borrower_genders` values in
# `loans`.
loans %>%
  # read.csv() keeps blank text fields as "" rather than NA, so an is.na()
  # check on its own would let loans with no recorded gender through as an
  # unnamed category.
  filter(!is.na(borrower_genders), borrower_genders != "") %>%
  group_by(borrower_genders) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  ungroup() %>%
  # Order the factor levels by frequency so the bars come out sorted.
  mutate(borrower_genders = reorder(borrower_genders, Count)) %>%
  head(10) %>%
  ggplot(aes(x = borrower_genders, y = Count)) +
  geom_col(colour = "white", fill = fillColor2) +
  # Print each count at the base of its bar (y = 1, left-aligned).
  geom_text(
    aes(y = 1, label = paste0("(", Count, ")")),
    hjust = 0, vjust = 0.5, size = 4, colour = "black", fontface = "bold"
  ) +
  labs(
    x = "Gender",
    y = "Count",
    title = "Gender and Count"
  ) +
  coord_flip() +
  theme_bw()

Kiva_EDA

Vyomesh Upadhyay

February 27, 2018