Libraries Used in Project #2

library(dplyr)
library(stringr)
library(tidyr)
library(knitr)
library(kableExtra)
library(ggplot2)

Dataset #1 - Pokemon Dataset

Comparing two different types of pokemon

Found a useful pokemon dataset here -> https://gist.githubusercontent.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv

I will save this in my own github and use it from there.

I decided to work on this dataset because I recently started playing Pokemon Shield on my Nintendo Switch and am on a mission to capture every Pokemon to fill my Pokedex (an in-game dictionary of useful Pokemon stats). This sparked my interest in searching for such data: since it is in the game, it must be on the internet.

Reading the data from csv into R

# Upstream location of the data, kept for reference:
#pokemon_data <- read.csv('https://gist.githubusercontent.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv')

# Load the Pokemon table from the copy stored in my own GitHub repository
# and preview its first 25 rows.
pokemon_csv_url <- 'https://raw.githubusercontent.com/Eperez54/Dat-607/main/Project%202/pokemon.csv'
pokemon_data <- read.csv(file = pokemon_csv_url)
head(pokemon_data, n = 25)
##    X.                      Name Type.1 Type.2 Total HP Attack Defense Sp..Atk
## 1   1                 Bulbasaur  Grass Poison   318 45     49      49      65
## 2   2                   Ivysaur  Grass Poison   405 60     62      63      80
## 3   3                  Venusaur  Grass Poison   525 80     82      83     100
## 4   3     VenusaurMega Venusaur  Grass Poison   625 80    100     123     122
## 5   4                Charmander   Fire          309 39     52      43      60
## 6   5                Charmeleon   Fire          405 58     64      58      80
## 7   6                 Charizard   Fire Flying   534 78     84      78     109
## 8   6 CharizardMega Charizard X   Fire Dragon   634 78    130     111     130
## 9   6 CharizardMega Charizard Y   Fire Flying   634 78    104      78     159
## 10  7                  Squirtle  Water          314 44     48      65      50
## 11  8                 Wartortle  Water          405 59     63      80      65
## 12  9                 Blastoise  Water          530 79     83     100      85
## 13  9   BlastoiseMega Blastoise  Water          630 79    103     120     135
## 14 10                  Caterpie    Bug          195 45     30      35      20
## 15 11                   Metapod    Bug          205 50     20      55      25
## 16 12                Butterfree    Bug Flying   395 60     45      50      90
## 17 13                    Weedle    Bug Poison   195 40     35      30      20
## 18 14                    Kakuna    Bug Poison   205 45     25      50      25
## 19 15                  Beedrill    Bug Poison   395 65     90      40      45
## 20 15     BeedrillMega Beedrill    Bug Poison   495 65    150      40      15
## 21 16                    Pidgey Normal Flying   251 40     45      40      35
## 22 17                 Pidgeotto Normal Flying   349 63     60      55      50
## 23 18                   Pidgeot Normal Flying   479 83     80      75      70
## 24 18       PidgeotMega Pidgeot Normal Flying   579 83     80      80     135
## 25 19                   Rattata Normal          253 30     56      35      25
##    Sp..Def Speed Generation Legendary
## 1       65    45          1     False
## 2       80    60          1     False
## 3      100    80          1     False
## 4      120    80          1     False
## 5       50    65          1     False
## 6       65    80          1     False
## 7       85   100          1     False
## 8       85   100          1     False
## 9      115   100          1     False
## 10      64    43          1     False
## 11      80    58          1     False
## 12     105    78          1     False
## 13     115    78          1     False
## 14      20    45          1     False
## 15      25    30          1     False
## 16      80    70          1     False
## 17      20    50          1     False
## 18      25    35          1     False
## 19      80    75          1     False
## 20      80   145          1     False
## 21      35    56          1     False
## 22      50    71          1     False
## 23      70   101          1     False
## 24      80   121          1     False
## 25      35    72          1     False

Some Pokemon were duplicated – mostly those that were split into multiple Mega versions of themselves. To remove these, I will use a regular expression.

# Drop the Mega-form rows, whose names look like "VenusaurMega Venusaur" or
# "CharizardMega Charizard X".
# The pattern keeps the space that follows "Mega" so that only the form marker
# is matched; a bare "Mega" would also remove any ordinary name that merely
# contains those letters (e.g. "Meganium", if it is in the data).
# No backslash is needed: every character in the pattern is a literal.
pokemon_data <- pokemon_data %>% 
  filter(!str_detect(Name, 'Mega '))

head(pokemon_data, 15)
##    X.       Name Type.1 Type.2 Total HP Attack Defense Sp..Atk Sp..Def Speed
## 1   1  Bulbasaur  Grass Poison   318 45     49      49      65      65    45
## 2   2    Ivysaur  Grass Poison   405 60     62      63      80      80    60
## 3   3   Venusaur  Grass Poison   525 80     82      83     100     100    80
## 4   4 Charmander   Fire          309 39     52      43      60      50    65
## 5   5 Charmeleon   Fire          405 58     64      58      80      65    80
## 6   6  Charizard   Fire Flying   534 78     84      78     109      85   100
## 7   7   Squirtle  Water          314 44     48      65      50      64    43
## 8   8  Wartortle  Water          405 59     63      80      65      80    58
## 9   9  Blastoise  Water          530 79     83     100      85     105    78
## 10 10   Caterpie    Bug          195 45     30      35      20      20    45
## 11 11    Metapod    Bug          205 50     20      55      25      25    30
## 12 12 Butterfree    Bug Flying   395 60     45      50      90      80    70
## 13 13     Weedle    Bug Poison   195 40     35      30      20      20    50
## 14 14     Kakuna    Bug Poison   205 45     25      50      25      25    35
## 15 15   Beedrill    Bug Poison   395 65     90      40      45      80    75
##    Generation Legendary
## 1           1     False
## 2           1     False
## 3           1     False
## 4           1     False
## 5           1     False
## 6           1     False
## 7           1     False
## 8           1     False
## 9           1     False
## 10          1     False
## 11          1     False
## 12          1     False
## 13          1     False
## 14          1     False
## 15          1     False

By comparing both data frames we can indeed see that the mega Pokemon were removed

For this comparison, it would be useful to select the columns that are relevant, and to group them by their type and total HP. In order to calculate maximum and minimum HP per Pokemon type, I used group by multiple times as well as mutate a few times.

# For every primary type, keep only the Pokemon whose `Total` equals that
# type's maximum or minimum and tag each kept row as 'High HP' or 'Low HP'.
# Rows are pre-sorted by type so that ties in the final descending sort stay
# in type order.
# NOTE(review): `Total` appears to be the sum of the six stat columns (e.g.
# 45 + 49 + 49 + 65 + 65 + 45 = 318 for Bulbasaur) rather than the `HP`
# column, even though it is labelled "Total HP" here -- confirm this is the
# intended measure.
lowandHighHp <- pokemon_data %>%
  select(X., Name, Type.1, Total) %>%
  arrange(Type.1, Total) %>%
  group_by(Type.1) %>%
  filter(Total == max(Total) | Total == min(Total)) %>%
  mutate(HP_Stats = ifelse(Total == max(Total), 'High HP', 'Low HP')) %>%
  ungroup() %>%
  arrange(desc(Total)) %>%
  select("Pokemon #" = X., "Pokemon" = Name, "Type" = Type.1, HP_Stats,
         "Total HP" = Total)

# Split the combined table by tag.
highestHp <- filter(lowandHighHp, HP_Stats == 'High HP')

lowestHp <- filter(lowandHighHp, HP_Stats == 'Low HP')

highestHp
## # A tibble: 29 x 5
##    `Pokemon #` Pokemon               Type   HP_Stats `Total HP`
##          <int> <chr>                 <chr>  <chr>         <int>
##  1         383 GroudonPrimal Groudon Ground High HP         770
##  2         382 KyogrePrimal Kyogre   Water  High HP         770
##  3         493 Arceus                Normal High HP         720
##  4         646 KyuremBlack Kyurem    Dragon High HP         700
##  5         646 KyuremWhite Kyurem    Dragon High HP         700
##  6         717 Yveltal               Dark   High HP         680
##  7         716 Xerneas               Fairy  High HP         680
##  8         250 Ho-oh                 Fire   High HP         680
##  9         487 GiratinaAltered Forme Ghost  High HP         680
## 10         487 GiratinaOrigin Forme  Ghost  High HP         680
## # ... with 19 more rows
# Display the rows tagged 'Low HP' (the per-type minimums).
lowestHp
## # A tibble: 24 x 5
##    `Pokemon #` Pokemon    Type   HP_Stats `Total HP`
##          <int> <chr>      <chr>  <chr>         <int>
##  1         147 Dratini    Dragon Low HP          300
##  2         371 Bagon      Dragon Low HP          300
##  3         443 Gible      Dragon Low HP          300
##  4         704 Goomy      Dragon Low HP          300
##  5         374 Beldum     Steel  Low HP          300
##  6         436 Bronzor    Steel  Low HP          300
##  7         599 Klink      Steel  Low HP          300
##  8         524 Roggenrola Rock   Low HP          280
##  9         607 Litwick    Ghost  Low HP          275
## 10          50 Diglett    Ground Low HP          265
## # ... with 14 more rows

Printing the tables using kable and kableExtra

# Render the per-type maximum rows as a captioned, striped table that does not
# stretch to the full page width. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
kable(highestHp, align = 'clc', caption = 'Pokemon w/ Highest HP in Group') %>%
    kable_styling(bootstrap_options = c('striped'), full_width = FALSE)
Pokemon w/ Highest HP in Group
Pokemon # Pokemon Type HP_Stats Total HP
383 GroudonPrimal Groudon Ground High HP 770
382 KyogrePrimal Kyogre Water High HP 770
493 Arceus Normal High HP 720
646 KyuremBlack Kyurem Dragon High HP 700
646 KyuremWhite Kyurem Dragon High HP 700
717 Yveltal Dark High HP 680
716 Xerneas Fairy High HP 680
250 Ho-oh Fire High HP 680
487 GiratinaAltered Forme Ghost High HP 680
487 GiratinaOrigin Forme Ghost High HP 680
150 Mewtwo Psychic High HP 680
249 Lugia Psychic High HP 680
720 HoopaHoopa Unbound Psychic High HP 680
483 Dialga Steel High HP 680
649 Genesect Bug High HP 600
492 ShayminLand Forme Grass High HP 600
492 ShayminSky Forme Grass High HP 600
248 Tyranitar Rock High HP 600
719 Diancie Rock High HP 600
145 Zapdos Electric High HP 580
243 Raikou Electric High HP 580
642 ThundurusIncarnate Forme Electric High HP 580
642 ThundurusTherian Forme Electric High HP 580
641 TornadusIncarnate Forme Flying High HP 580
641 TornadusTherian Forme Flying High HP 580
144 Articuno Ice High HP 580
378 Regice Ice High HP 580
169 Crobat Poison High HP 535
448 Lucario Fighting High HP 525
# Render the per-type minimum rows as a captioned, striped table that does not
# stretch to the full page width. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
kable(lowestHp, align = 'clc', caption = 'Pokemon w/ Lowest HP in Group') %>%
    kable_styling(bootstrap_options = c('striped'), full_width = FALSE)
Pokemon w/ Lowest HP in Group
Pokemon # Pokemon Type HP_Stats Total HP
147 Dratini Dragon Low HP 300
371 Bagon Dragon Low HP 300
443 Gible Dragon Low HP 300
704 Goomy Dragon Low HP 300
374 Beldum Steel Low HP 300
436 Bronzor Steel Low HP 300
599 Klink Steel Low HP 300
524 Roggenrola Rock Low HP 280
607 Litwick Ghost Low HP 275
50 Diglett Ground Low HP 265
218 Slugma Fire Low HP 250
220 Swinub Ice Low HP 250
714 Noibat Flying Low HP 245
41 Zubat Poison Low HP 245
261 Poochyena Dark Low HP 220
173 Cleffa Fairy Low HP 218
236 Tyrogue Fighting Low HP 210
172 Pichu Electric Low HP 205
129 Magikarp Water Low HP 200
349 Feebas Water Low HP 200
280 Ralts Psychic Low HP 198
401 Kricketot Bug Low HP 194
298 Azurill Normal Low HP 190
191 Sunkern Grass Low HP 180

Those Pokemon that have the highest HP for their group are evolved forms of less evolved Pokemon, as we can see from the list above. Similarly, Pokemon with the lowest HP for their group tend to be non-evolved forms. The goal as a trainer in the game is to increase your Pokemon’s hit points (HP), as well as to evolve them to be more effective against other Pokemon; you do this by battling wild Pokemon or fighting other trainers. For the purpose of answering this question, we will compare Water type Pokemon to Grass type Pokemon.

# Keep only the Grass and Water rows of the high/low table, lowest total first.
filter_pokemon_type <- lowandHighHp %>%
  filter(Type %in% c('Grass', 'Water')) %>%
  arrange(`Total HP`)

# One black-outlined bar per Pokemon, filled by its High/Low tag, with a
# separate panel (and its own x axis) for each type.
plot <- ggplot(filter_pokemon_type,
               aes(x = Pokemon, y = `Total HP`, fill = HP_Stats)) +
  geom_bar(stat = "identity", position = "stack", color = "black") +
  facet_wrap(Type ~ ., scales = "free_x") +
  scale_y_continuous() +
  theme(legend.position = "right")
plot

### Conclusion

When we compare the Pokemon that have the highest and lowest hit points from the Grass and Water types, we can see that Sunkern has a lower hit point total than Feebas or Magikarp, which share the lowest hit point total among Water types. Also, KyogrePrimal Kyogre has a higher HP than ShayminLand Forme and ShayminSky Forme (same number), which have the highest HP of all Grass type Pokemon. Consequently, when we compare the Pokemon with the highest and lowest HP between Water type and Grass type, we see that Water types have a higher HP for both their lowest and highest rated Pokemon compared to Grass type Pokemon.

Dataset 2: Lending Club Accepted Loans from 2018

https://www.kaggle.com/wordsforthewise/lending-club?select=accepted_2007_to_2018Q4.csv.gz I deleted everything before 2018, as the list was extremely long; the original can be found at the link above.

Question: is there a direct relationship between these fields (interest rates, grade, loan amount and income)?

# Read the 2018 accepted-loans extract from GitHub.
loan_csv_url <- "https://raw.githubusercontent.com/Eperez54/Dat-607/main/Project%202/accepted2018Q4.csv"
loan <- read.csv(file = loan_csv_url)

# summary(loan) is too long to print, so list the column names instead.
names(loan)
##  [1] "loan_amnt"                   "funded_amnt"                
##  [3] "funded_amnt_inv"             "term"                       
##  [5] "int_rate"                    "installment"                
##  [7] "grade"                       "sub_grade"                  
##  [9] "emp_title"                   "emp_length"                 
## [11] "home_ownership"              "annual_inc"                 
## [13] "verification_status"         "issue_d"                    
## [15] "loan_status"                 "pymnt_plan"                 
## [17] "url"                         "desc"                       
## [19] "purpose"                     "title"                      
## [21] "zip_code"                    "addr_state"                 
## [23] "dti"                         "delinq_2yrs"                
## [25] "earliest_cr_line"            "fico_range_low"             
## [27] "fico_range_high"             "inq_last_6mths"             
## [29] "mths_since_last_delinq"      "mths_since_last_record"     
## [31] "open_acc"                    "pub_rec"                    
## [33] "revol_bal"                   "revol_util"                 
## [35] "total_acc"                   "initial_list_status"        
## [37] "out_prncp"                   "out_prncp_inv"              
## [39] "total_pymnt"                 "total_pymnt_inv"            
## [41] "total_rec_prncp"             "total_rec_int"              
## [43] "total_rec_late_fee"          "recoveries"                 
## [45] "collection_recovery_fee"     "last_pymnt_d"               
## [47] "last_pymnt_amnt"             "next_pymnt_d"               
## [49] "last_credit_pull_d"          "last_fico_range_high"       
## [51] "last_fico_range_low"         "collections_12_mths_ex_med" 
## [53] "mths_since_last_major_derog" "policy_code"                
## [55] "application_type"            "annual_inc_joint"           
## [57] "dti_joint"                   "verification_status_joint"  
## [59] "acc_now_delinq"              "tot_coll_amt"               
## [61] "tot_cur_bal"                 "open_acc_6m"                
## [63] "open_act_il"                 "open_il_12m"                
## [65] "open_il_24m"                 "mths_since_rcnt_il"         
## [67] "total_bal_il"

Selection of fields I need

# Narrow the table to the four fields under study, then confirm the result.
loan <- select(loan, annual_inc, loan_amnt, int_rate, grade)
names(loan)
head(loan, n = 6)
##   annual_inc loan_amnt int_rate grade
## 1      50000      5000    20.39     D
## 2     196000     15000     9.92     B
## 3      44000     11200    30.79     G
## 4      65000     25000    21.85     D
## 5      52000      3000     7.34     A
## 6      52000     17000    20.39     D

It will be easier for me to create a field and categorize the loan, interest rates and the customer annual salary

# Map each value of `x` to the label of the interval it falls in.
#   x      numeric vector to bucket
#   breaks ascending lower bounds; interval i is [breaks[i], breaks[i + 1])
#          and the last interval has no upper bound
#   labels one label per break, in the same order
# Returns a factor whose levels are `labels`; values below breaks[1] and NA
# values become NA.
# Building the factor straight from the interval index means each label is
# written once, so a value label and its factor level cannot drift apart
# (a level spelled " <$50,000" would turn every "<$50,000" row into NA).
label_intervals <- function(x, breaks, labels) {
  stopifnot(length(breaks) == length(labels))
  factor(findInterval(x, breaks), levels = seq_along(labels), labels = labels)
}

#loan amount group
loan$loangroup <- label_intervals(
  loan$loan_amnt,
  seq(0, 30000, 10000),
  c("<$10,000", "<$20,000", "<$30,000", "<=$40,000")
)

#Loan interest rates group also know as APR
loan$aprgroup <- label_intervals(
  loan$int_rate,
  seq(0, 30, 10),
  c("0-10%", "10-20%", "20-30%", "30-40%")
)

# Grouping customer annual salary by every $25k
loan$salarygroup <- label_intervals(
  loan$annual_inc,
  seq(0, 100000, 25000),
  c("<$25,000", "<$50,000", "<$75,000", "<$100,000", ">=$100,000")
)

tail(loan)
##        annual_inc loan_amnt int_rate grade loangroup aprgroup salarygroup
## 236053      57000      5000    16.46     C  <$10,000   10-20%    <$75,000
## 236054      53414     12000    14.03     C  <$20,000   10-20%    <$75,000
## 236055      22000      5000     6.19     A  <$10,000    0-10%    <$25,000
## 236056      80000     20000    15.49     C  <$30,000   10-20%   <$100,000
## 236057      60000     10000    11.05     B  <$20,000   10-20%    <$75,000
## 236058       9600      4000    16.46     C  <$10,000   10-20%    <$25,000
# Histogram of loan amounts; each bar is shaded by its own count.
ggplot(data = loan, mapping = aes(x = loan_amnt)) +
  geom_histogram(mapping = aes(fill = ..count..)) +
  ggtitle("Loan distribution Amounts based on Grade")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Loan amount by income bracket, one panel per grade; x labels are rotated
# so the bracket names do not overlap.
ggplot(
  data = loan,
  mapping = aes(
    x = factor(salarygroup),
    y = loan_amnt,
    fill = factor(salarygroup)
  )
) +
  geom_boxplot() +
  facet_grid(. ~ grade) +
  ggtitle("Boxplot for Loan based on Income, Grade, and Interest Rate") +
  xlab("Income") +
  ylab("Loan Amount") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Interest rate by loan-amount bracket: a red-outlined violin per bracket with
# the individual loans jittered on top and coloured by grade, one panel per
# bracket.
ggplot(data = loan, mapping = aes(x = factor(loangroup), y = int_rate)) +
  geom_violin(alpha = 0.6, color = "red") +
  geom_jitter(mapping = aes(color = grade), alpha = 0.5) +
  facet_grid(. ~ loangroup) +
  ggtitle("Violin Plot for APR based on Loan Amount, Income, and Grade") +
  xlab("Loan Amount") +
  ylab("APR") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

I am able to visualize the relationships between variables by grouping them into categories.