Project2

Data set - loan data (modified by me)

Loading necessary libraries

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0

## ── Conflicts ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Reading csv file

loan <- read_csv("https://raw.githubusercontent.com/olgashiligin/project2_607/master/loan.csv")

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   country = col_character(),
##   loan_amount = col_integer(),
##   gender = col_character(),
##   date = col_character()
## )

loan

## # A tibble: 10 x 5
##       X1 country loan_amount gender date      
##    <int> <chr>         <int> <chr>  <chr>     
##  1     1 Italy            57 F      12-01-2017
##  2     2 France          125 F      12.02.2018
##  3     3 England         275 F      01/06/2018
##  4     4 England          86 M      10 06 2016
##  5     5 Italy            57 F      20-03-2016
##  6     6 France          125 M      12.02.2018
##  7     7 Russia          675 M      01/06/2018
##  8     8 Russia          167 M      11 04 2018
##  9     9 Russia          186 F      09 06 2018
## 10    10 Italy           574 M      28-01-2018

Cleaning date

clean_date<-loan %>% 
  separate(date,c("Year","Month","Day"),sep=" |/|\\.|-|\\,") %>% 
  unite("new_date",Year, Month, Day, sep="/")

clean_date

## # A tibble: 10 x 5
##       X1 country loan_amount gender new_date  
##    <int> <chr>         <int> <chr>  <chr>     
##  1     1 Italy            57 F      12/01/2017
##  2     2 France          125 F      12/02/2018
##  3     3 England         275 F      01/06/2018
##  4     4 England          86 M      10/06/2016
##  5     5 Italy            57 F      20/03/2016
##  6     6 France          125 M      12/02/2018
##  7     7 Russia          675 M      01/06/2018
##  8     8 Russia          167 M      11/04/2018
##  9     9 Russia          186 F      09/06/2018
## 10    10 Italy           574 M      28/01/2018

Analysis (loan percentage by gender and by country)

analysis<-clean_date %>% 
  group_by(country, gender) %>% 
  summarise(total_loan_amount =sum(loan_amount)) %>% 
  spread(gender, total_loan_amount) %>% 
  ungroup() %>%
  transmute(country = country,  female_percent = F / (F+M), male_percent = M /(F+M))

analysis

## # A tibble: 4 x 3
##   country female_percent male_percent
##   <chr>            <dbl>        <dbl>
## 1 England          0.762        0.238
## 2 France           0.5          0.5  
## 3 Italy            0.166        0.834
## 4 Russia           0.181        0.819

Male percentage of total loan amount for France is equal to female percentage of total loan amount. Whereas in Italy and Russian male percentage of loan amount is significantly greater than female percentage of total loan amount (0.82 and 0.18 respectively).

What month is the most “busy” in giving loans?

busy_month<-clean_date %>% 
  separate(new_date,c("Year","Month","Day"),sep="/") %>% 
  group_by(Month) %>% 
  count(country) %>% 
  summarise(sum_count= sum(n)) %>% 
  arrange(desc(sum_count))
  
busy_month

## # A tibble: 5 x 2
##   Month sum_count
##   <chr>     <int>
## 1 06            4
## 2 01            2
## 3 02            2
## 4 03            1
## 5 04            1

The most “busy” month is June.

Project2_Loan

Olga Shiligin

07/10/2018