Libraries Used in Project #2

library(dplyr)
library(stringr)
library(tidyr)
library(knitr)
library(kableExtra)
library(ggplot2)

Dataset #1 - Pokemon Dataset

Comparing two different types of pokemon

Found a useful pokemon dataset here -> https://gist.githubusercontent.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv

I will save this in my own github and use it from there.

I decided to work on this dataset because I recently started playing Pokemon Shield on my Nintendo Switch and am on a mission to capture them all to fill my Pokedex (a dictionary with useful Pokemon stats). This sparked my interest in searching for such data: since it is in the game, it must be on the internet.

Reading the data from csv into R

# Original gist location of the dataset, kept for reference:
# pokemon_data <- read.csv('https://gist.githubusercontent.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv')

# Read the copy of the CSV stored in my own GitHub repository and preview
# the first 25 rows.
pokemon_url <- "https://raw.githubusercontent.com/Eperez54/Dat-607/main/Project%202/pokemon.csv"
pokemon_data <- read.csv(file = pokemon_url)
head(pokemon_data, n = 25)

##    X.                      Name Type.1 Type.2 Total HP Attack Defense Sp..Atk
## 1   1                 Bulbasaur  Grass Poison   318 45     49      49      65
## 2   2                   Ivysaur  Grass Poison   405 60     62      63      80
## 3   3                  Venusaur  Grass Poison   525 80     82      83     100
## 4   3     VenusaurMega Venusaur  Grass Poison   625 80    100     123     122
## 5   4                Charmander   Fire          309 39     52      43      60
## 6   5                Charmeleon   Fire          405 58     64      58      80
## 7   6                 Charizard   Fire Flying   534 78     84      78     109
## 8   6 CharizardMega Charizard X   Fire Dragon   634 78    130     111     130
## 9   6 CharizardMega Charizard Y   Fire Flying   634 78    104      78     159
## 10  7                  Squirtle  Water          314 44     48      65      50
## 11  8                 Wartortle  Water          405 59     63      80      65
## 12  9                 Blastoise  Water          530 79     83     100      85
## 13  9   BlastoiseMega Blastoise  Water          630 79    103     120     135
## 14 10                  Caterpie    Bug          195 45     30      35      20
## 15 11                   Metapod    Bug          205 50     20      55      25
## 16 12                Butterfree    Bug Flying   395 60     45      50      90
## 17 13                    Weedle    Bug Poison   195 40     35      30      20
## 18 14                    Kakuna    Bug Poison   205 45     25      50      25
## 19 15                  Beedrill    Bug Poison   395 65     90      40      45
## 20 15     BeedrillMega Beedrill    Bug Poison   495 65    150      40      15
## 21 16                    Pidgey Normal Flying   251 40     45      40      35
## 22 17                 Pidgeotto Normal Flying   349 63     60      55      50
## 23 18                   Pidgeot Normal Flying   479 83     80      75      70
## 24 18       PidgeotMega Pidgeot Normal Flying   579 83     80      80     135
## 25 19                   Rattata Normal          253 30     56      35      25
##    Sp..Def Speed Generation Legendary
## 1       65    45          1     False
## 2       80    60          1     False
## 3      100    80          1     False
## 4      120    80          1     False
## 5       50    65          1     False
## 6       65    80          1     False
## 7       85   100          1     False
## 8       85   100          1     False
## 9      115   100          1     False
## 10      64    43          1     False
## 11      80    58          1     False
## 12     105    78          1     False
## 13     115    78          1     False
## 14      20    45          1     False
## 15      25    30          1     False
## 16      80    70          1     False
## 17      20    50          1     False
## 18      25    35          1     False
## 19      80    75          1     False
## 20      80   145          1     False
## 21      35    56          1     False
## 22      50    71          1     False
## 23      70   101          1     False
## 24      80   121          1     False
## 25      35    72          1     False

Some Pokemon were duplicated – mostly those that were split into multiple Mega versions of themselves. To remove these, I will use a regular expression.

# Drop the Mega-evolution rows so each Pokemon appears only in its base form.
# In the listing above the Mega rows are named "<Name>Mega <Name>" (e.g.
# "VenusaurMega Venusaur"), so the pattern keeps the trailing space: a bare
# "Mega" would also match any ordinary name that merely contains those
# letters (e.g. "Meganium", if it is present in the data). The previous
# pattern '\\Mega' also carried a stray backslash escape in front of the M.
pokemon_data <- pokemon_data %>%
  filter(!str_detect(Name, "Mega "))

head(pokemon_data, 15)

##    X.       Name Type.1 Type.2 Total HP Attack Defense Sp..Atk Sp..Def Speed
## 1   1  Bulbasaur  Grass Poison   318 45     49      49      65      65    45
## 2   2    Ivysaur  Grass Poison   405 60     62      63      80      80    60
## 3   3   Venusaur  Grass Poison   525 80     82      83     100     100    80
## 4   4 Charmander   Fire          309 39     52      43      60      50    65
## 5   5 Charmeleon   Fire          405 58     64      58      80      65    80
## 6   6  Charizard   Fire Flying   534 78     84      78     109      85   100
## 7   7   Squirtle  Water          314 44     48      65      50      64    43
## 8   8  Wartortle  Water          405 59     63      80      65      80    58
## 9   9  Blastoise  Water          530 79     83     100      85     105    78
## 10 10   Caterpie    Bug          195 45     30      35      20      20    45
## 11 11    Metapod    Bug          205 50     20      55      25      25    30
## 12 12 Butterfree    Bug Flying   395 60     45      50      90      80    70
## 13 13     Weedle    Bug Poison   195 40     35      30      20      20    50
## 14 14     Kakuna    Bug Poison   205 45     25      50      25      25    35
## 15 15   Beedrill    Bug Poison   395 65     90      40      45      80    75
##    Generation Legendary
## 1           1     False
## 2           1     False
## 3           1     False
## 4           1     False
## 5           1     False
## 6           1     False
## 7           1     False
## 8           1     False
## 9           1     False
## 10          1     False
## 11          1     False
## 12          1     False
## 13          1     False
## 14          1     False
## 15          1     False

By comparing both data frames we can indeed see that the mega Pokemon were removed

For this comparison, it would be useful to select the columns that are relevant, and to group them by their type and total HP. In order to calculate maximum and minimum HP per Pokemon type, I used group by multiple times as well as mutate a few times.

# For every primary type (Type.1), keep only the Pokemon whose `Total` is the
# largest or the smallest within that type, tag each kept row as 'High HP' or
# 'Low HP', and present the result from highest to lowest total.
# NOTE(review): in the printed data `Total` equals the sum of the six stat
# columns (e.g. Bulbasaur: 45 + 49 + 49 + 65 + 65 + 45 = 318), not the `HP`
# column, even though the labels below call it HP -- confirm this is intended.
lowandHighHp <- pokemon_data %>%
  select(X., Name, Type.1, Total) %>%
  arrange(Type.1, Total) %>%
  group_by(Type.1) %>%
  mutate(High_HP = max(Total), Low_HP = min(Total)) %>%
  ungroup() %>%
  # Only the per-type extremes survive this step.
  filter(Total == High_HP | Total == Low_HP) %>%
  mutate(HP_Stats = ifelse(Total == High_HP, "High HP", "Low HP")) %>%
  arrange(desc(Total)) %>%
  # Select and give the columns their display names in one step.
  select(
    "Pokemon #" = X.,
    "Pokemon" = Name,
    "Type" = Type.1,
    HP_Stats,
    "Total HP" = Total
  )

# Split the combined table into its two halves.
highestHp <- filter(lowandHighHp, HP_Stats == "High HP")
lowestHp <- filter(lowandHighHp, HP_Stats == "Low HP")

highestHp

## # A tibble: 29 x 5
##    `Pokemon #` Pokemon               Type   HP_Stats `Total HP`
##          <int> <chr>                 <chr>  <chr>         <int>
##  1         383 GroudonPrimal Groudon Ground High HP         770
##  2         382 KyogrePrimal Kyogre   Water  High HP         770
##  3         493 Arceus                Normal High HP         720
##  4         646 KyuremBlack Kyurem    Dragon High HP         700
##  5         646 KyuremWhite Kyurem    Dragon High HP         700
##  6         717 Yveltal               Dark   High HP         680
##  7         716 Xerneas               Fairy  High HP         680
##  8         250 Ho-oh                 Fire   High HP         680
##  9         487 GiratinaAltered Forme Ghost  High HP         680
## 10         487 GiratinaOrigin Forme  Ghost  High HP         680
## # ... with 19 more rows

# Print the rows tagged 'Low HP' (smallest Total within each type).
lowestHp

## # A tibble: 24 x 5
##    `Pokemon #` Pokemon    Type   HP_Stats `Total HP`
##          <int> <chr>      <chr>  <chr>         <int>
##  1         147 Dratini    Dragon Low HP          300
##  2         371 Bagon      Dragon Low HP          300
##  3         443 Gible      Dragon Low HP          300
##  4         704 Goomy      Dragon Low HP          300
##  5         374 Beldum     Steel  Low HP          300
##  6         436 Bronzor    Steel  Low HP          300
##  7         599 Klink      Steel  Low HP          300
##  8         524 Roggenrola Rock   Low HP          280
##  9         607 Litwick    Ghost  Low HP          275
## 10          50 Diglett    Ground Low HP          265
## # ... with 14 more rows

Printing the tables using knitr's kable() with kableExtra styling

# Render the 'High HP' table with striped rows, not stretched to full width.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
kable(highestHp, align = 'clc', caption = 'Pokemon w/ Highest HP in Group') %>%
    kable_styling(bootstrap_options = c('striped'), full_width = FALSE)

Pokemon w/ Highest HP in Group
Pokemon #	Pokemon	Type	HP_Stats	Total HP
383	GroudonPrimal Groudon	Ground	High HP	770
382	KyogrePrimal Kyogre	Water	High HP	770
493	Arceus	Normal	High HP	720
646	KyuremBlack Kyurem	Dragon	High HP	700
646	KyuremWhite Kyurem	Dragon	High HP	700
717	Yveltal	Dark	High HP	680
716	Xerneas	Fairy	High HP	680
250	Ho-oh	Fire	High HP	680
487	GiratinaAltered Forme	Ghost	High HP	680
487	GiratinaOrigin Forme	Ghost	High HP	680
150	Mewtwo	Psychic	High HP	680
249	Lugia	Psychic	High HP	680
720	HoopaHoopa Unbound	Psychic	High HP	680
483	Dialga	Steel	High HP	680
649	Genesect	Bug	High HP	600
492	ShayminLand Forme	Grass	High HP	600
492	ShayminSky Forme	Grass	High HP	600
248	Tyranitar	Rock	High HP	600
719	Diancie	Rock	High HP	600
145	Zapdos	Electric	High HP	580
243	Raikou	Electric	High HP	580
642	ThundurusIncarnate Forme	Electric	High HP	580
642	ThundurusTherian Forme	Electric	High HP	580
641	TornadusIncarnate Forme	Flying	High HP	580
641	TornadusTherian Forme	Flying	High HP	580
144	Articuno	Ice	High HP	580
378	Regice	Ice	High HP	580
169	Crobat	Poison	High HP	535
448	Lucario	Fighting	High HP	525

# Render the 'Low HP' table with striped rows, not stretched to full width.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
kable(lowestHp, align = 'clc', caption = 'Pokemon w/ Lowest HP in Group') %>%
    kable_styling(bootstrap_options = c('striped'), full_width = FALSE)

Pokemon w/ Lowest HP in Group
Pokemon #	Pokemon	Type	HP_Stats	Total HP
147	Dratini	Dragon	Low HP	300
371	Bagon	Dragon	Low HP	300
443	Gible	Dragon	Low HP	300
704	Goomy	Dragon	Low HP	300
374	Beldum	Steel	Low HP	300
436	Bronzor	Steel	Low HP	300
599	Klink	Steel	Low HP	300
524	Roggenrola	Rock	Low HP	280
607	Litwick	Ghost	Low HP	275
50	Diglett	Ground	Low HP	265
218	Slugma	Fire	Low HP	250
220	Swinub	Ice	Low HP	250
714	Noibat	Flying	Low HP	245
41	Zubat	Poison	Low HP	245
261	Poochyena	Dark	Low HP	220
173	Cleffa	Fairy	Low HP	218
236	Tyrogue	Fighting	Low HP	210
172	Pichu	Electric	Low HP	205
129	Magikarp	Water	Low HP	200
349	Feebas	Water	Low HP	200
280	Ralts	Psychic	Low HP	198
401	Kricketot	Bug	Low HP	194
298	Azurill	Normal	Low HP	190
191	Sunkern	Grass	Low HP	180

Those Pokemon that have the highest HP for their group are evolved forms of less evolved Pokemon, as we can see from the list above. Similarly, Pokemon with the lowest HP for their group tend to be non-evolved forms. The goal as a trainer in the game is to increase your Pokemon’s hit points (HP), as well as to evolve them to be more effective against other Pokemon; you do this by battling wild Pokemon or fighting against other trainers. For the purpose of answering this question, we will compare Water type Pokemon to Grass type Pokemon.

# Keep only the Grass and Water rows of the high/low table, ordered by total.
filter_pokemon_type <- lowandHighHp %>%
  filter(Type %in% c("Grass", "Water")) %>%
  arrange(`Total HP`)

# One bar per Pokemon, filled by its High/Low tag, with a separate panel (and
# its own x axis) for each type.
# NOTE(review): the object is called `plot`, the same name as base R's plot()
# function; kept as-is in case later code refers to it.
plot <- ggplot(
  filter_pokemon_type,
  aes(x = Pokemon, y = `Total HP`, fill = HP_Stats)
) +
  geom_col(position = "stack", color = "black") +
  scale_y_continuous() +
  facet_wrap(Type ~ ., scales = "free_x") +
  theme(legend.position = "right")
plot

### Conclusion

When we compare the Pokemon that have the highest and lowest hit points from the Grass and Water types, we can see that Sunkern has a lower hit point total than Feebas or Magikarp, which share the lowest hit point total for a Water type. Also, KyogrePrimal Kyogre has a higher HP than ShayminLand Forme and ShayminSky Forme (same number), which have the highest HP of all Grass type Pokemon. Consequently, when we compare the Pokemon with the highest and lowest HP between Water type and Grass type, we see that Water types have a higher HP for both their lowest and highest rated Pokemon compared to Grass type Pokemon.

Dataset 2: Lending Club Accepted Loans from 2018

https://www.kaggle.com/wordsforthewise/lending-club?select=accepted_2007_to_2018Q4.csv.gz I deleted everything before 2018, as this list was extremely long; the original can be found at the link above.

Question: is there a direct relationship between these fields (interest rates, grade, loan amount and income)?

# Read the 2018 Lending Club extract stored in my GitHub repository.
loan_url <- "https://raw.githubusercontent.com/Eperez54/Dat-607/main/Project%202/accepted2018Q4.csv"
loan <- read.csv(file = loan_url)

# summary(loan) too long

# List the field names instead.
names(loan)

##  [1] "loan_amnt"                   "funded_amnt"                
##  [3] "funded_amnt_inv"             "term"                       
##  [5] "int_rate"                    "installment"                
##  [7] "grade"                       "sub_grade"                  
##  [9] "emp_title"                   "emp_length"                 
## [11] "home_ownership"              "annual_inc"                 
## [13] "verification_status"         "issue_d"                    
## [15] "loan_status"                 "pymnt_plan"                 
## [17] "url"                         "desc"                       
## [19] "purpose"                     "title"                      
## [21] "zip_code"                    "addr_state"                 
## [23] "dti"                         "delinq_2yrs"                
## [25] "earliest_cr_line"            "fico_range_low"             
## [27] "fico_range_high"             "inq_last_6mths"             
## [29] "mths_since_last_delinq"      "mths_since_last_record"     
## [31] "open_acc"                    "pub_rec"                    
## [33] "revol_bal"                   "revol_util"                 
## [35] "total_acc"                   "initial_list_status"        
## [37] "out_prncp"                   "out_prncp_inv"              
## [39] "total_pymnt"                 "total_pymnt_inv"            
## [41] "total_rec_prncp"             "total_rec_int"              
## [43] "total_rec_late_fee"          "recoveries"                 
## [45] "collection_recovery_fee"     "last_pymnt_d"               
## [47] "last_pymnt_amnt"             "next_pymnt_d"               
## [49] "last_credit_pull_d"          "last_fico_range_high"       
## [51] "last_fico_range_low"         "collections_12_mths_ex_med" 
## [53] "mths_since_last_major_derog" "policy_code"                
## [55] "application_type"            "annual_inc_joint"           
## [57] "dti_joint"                   "verification_status_joint"  
## [59] "acc_now_delinq"              "tot_coll_amt"               
## [61] "tot_cur_bal"                 "open_acc_6m"                
## [63] "open_act_il"                 "open_il_12m"                
## [65] "open_il_24m"                 "mths_since_rcnt_il"         
## [67] "total_bal_il"

Selection of fields I need

# Reduce the data frame to the four fields the question is about, then
# confirm which columns remain.
loan <- select(loan, annual_inc, loan_amnt, int_rate, grade)
names(loan)

## [1] "annual_inc" "loan_amnt"  "int_rate"   "grade"

# Preview the first rows of the reduced data frame.
head(loan)

##   annual_inc loan_amnt int_rate grade
## 1      50000      5000    20.39     D
## 2     196000     15000     9.92     B
## 3      44000     11200    30.79     G
## 4      65000     25000    21.85     D
## 5      52000      3000     7.34     A
## 6      52000     17000    20.39     D

It will be easier for me to create a field and categorize the loan, interest rates and the customer annual salary

# Turn a numeric vector into a labelled factor of bins.
#   x      - numeric vector to bin
#   breaks - increasing lower bounds; findInterval() assigns bin i when
#            breaks[i] <= x < breaks[i + 1], and the last bin when
#            x >= the last break
#   labels - one label per break, in the same order
# Returns a factor whose levels are `labels`. Values below the first break
# (bin 0) and NA inputs come back as NA.
bin_labels <- function(x, breaks, labels) {
  stopifnot(length(breaks) == length(labels))
  factor(findInterval(x, breaks), levels = seq_along(labels), labels = labels)
}

# Loan amount group
loan$loangroup <- bin_labels(
  loan$loan_amnt,
  breaks = seq(0, 30000, 10000),
  labels = c("<$10,000", "<$20,000", "<$30,000", "<=$40,000")
)

# Loan interest rate group, also known as APR
loan$aprgroup <- bin_labels(
  loan$int_rate,
  breaks = seq(0, 30, 10),
  labels = c("0-10%", "10-20%", "20-30%", "30-40%")
)

# Grouping customer annual salary by every $25k.
# The previous level vector spelled the second level " <$50,000" with a
# leading space, which did not match the assigned value "<$50,000" and so
# turned every row in that bracket into NA.
loan$salarygroup <- bin_labels(
  loan$annual_inc,
  breaks = seq(0, 100000, 25000),
  labels = c("<$25,000", "<$50,000", "<$75,000", "<$100,000", ">=$100,000")
)

tail(loan)

##        annual_inc loan_amnt int_rate grade loangroup aprgroup salarygroup
## 236053      57000      5000    16.46     C  <$10,000   10-20%    <$75,000
## 236054      53414     12000    14.03     C  <$20,000   10-20%    <$75,000
## 236055      22000      5000     6.19     A  <$10,000    0-10%    <$25,000
## 236056      80000     20000    15.49     C  <$30,000   10-20%   <$100,000
## 236057      60000     10000    11.05     B  <$20,000   10-20%    <$75,000
## 236058       9600      4000    16.46     C  <$10,000   10-20%    <$25,000

# Histogram of loan amounts; each bar is shaded by the number of loans in
# its bin. after_stat(count) replaces the `..count..` notation, which
# ggplot2 has deprecated.
# NOTE(review): the title mentions Grade, but grade is not mapped in this plot.
ggplot(loan, aes(x = loan_amnt)) +
  geom_histogram(aes(fill = after_stat(count))) +
  labs(title = "Loan distribution Amounts based on Grade")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Box plots of loan amount per income bracket, one panel per loan grade.
# NOTE(review): the title mentions Interest Rate, but int_rate is not mapped
# in this plot.
ggplot(
  data = loan,
  mapping = aes(
    x = factor(salarygroup),
    y = loan_amnt,
    fill = factor(salarygroup)
  )
) +
  geom_boxplot() +
  facet_grid(cols = vars(grade)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Boxplot for Loan based on Income, Grade, and Interest Rate",
    x = "Income",
    y = "Loan Amount"
  )

# Violin of interest rate for each loan-amount group (one panel per group),
# overlaid with jittered points coloured by grade.
# NOTE(review): the title mentions Income, but no income field is mapped in
# this plot.
ggplot(
  data = loan,
  mapping = aes(x = factor(loangroup), y = int_rate)
) +
  geom_violin(alpha = 0.6, color = "red") +
  geom_jitter(alpha = 0.5, aes(color = grade)) +
  facet_grid(cols = vars(loangroup)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Violin Plot for APR based on Loan Amount, Income, and Grade",
    x = "Loan Amount",
    y = "APR"
  )

I am able to visualize the relationships between variables by grouping them into categories.

Project 2

Eddie Perez

2022-03-4