library(dplyr)
library(stringr)
library(tidyr)
library(knitr)
library(kableExtra)
library(ggplot2)
Found a useful pokemon dataset here -> https://gist.githubusercontent.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv
I will save this in my own github and use it from there.
I decided to work on this dataset because I recently started playing Pokemon Shield on my Nintendo Switch and am on a mission to capture every Pokemon to fill my Pokedex (an in-game dictionary of useful Pokemon stats). This sparked my interest in searching for such data: since it is in the game, it must be available on the internet.
# Upstream copy of the dataset, kept for reference only:
# pokemon_data <- read.csv('https://gist.githubusercontent.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv')

# Read the copy mirrored in my own GitHub repository, then preview the
# first 25 rows.
pokemon_data <- read.csv(
  file = "https://raw.githubusercontent.com/Eperez54/Dat-607/main/Project%202/pokemon.csv"
)
head(pokemon_data, n = 25)
## X. Name Type.1 Type.2 Total HP Attack Defense Sp..Atk
## 1 1 Bulbasaur Grass Poison 318 45 49 49 65
## 2 2 Ivysaur Grass Poison 405 60 62 63 80
## 3 3 Venusaur Grass Poison 525 80 82 83 100
## 4 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 122
## 5 4 Charmander Fire 309 39 52 43 60
## 6 5 Charmeleon Fire 405 58 64 58 80
## 7 6 Charizard Fire Flying 534 78 84 78 109
## 8 6 CharizardMega Charizard X Fire Dragon 634 78 130 111 130
## 9 6 CharizardMega Charizard Y Fire Flying 634 78 104 78 159
## 10 7 Squirtle Water 314 44 48 65 50
## 11 8 Wartortle Water 405 59 63 80 65
## 12 9 Blastoise Water 530 79 83 100 85
## 13 9 BlastoiseMega Blastoise Water 630 79 103 120 135
## 14 10 Caterpie Bug 195 45 30 35 20
## 15 11 Metapod Bug 205 50 20 55 25
## 16 12 Butterfree Bug Flying 395 60 45 50 90
## 17 13 Weedle Bug Poison 195 40 35 30 20
## 18 14 Kakuna Bug Poison 205 45 25 50 25
## 19 15 Beedrill Bug Poison 395 65 90 40 45
## 20 15 BeedrillMega Beedrill Bug Poison 495 65 150 40 15
## 21 16 Pidgey Normal Flying 251 40 45 40 35
## 22 17 Pidgeotto Normal Flying 349 63 60 55 50
## 23 18 Pidgeot Normal Flying 479 83 80 75 70
## 24 18 PidgeotMega Pidgeot Normal Flying 579 83 80 80 135
## 25 19 Rattata Normal 253 30 56 35 25
## Sp..Def Speed Generation Legendary
## 1 65 45 1 False
## 2 80 60 1 False
## 3 100 80 1 False
## 4 120 80 1 False
## 5 50 65 1 False
## 6 65 80 1 False
## 7 85 100 1 False
## 8 85 100 1 False
## 9 115 100 1 False
## 10 64 43 1 False
## 11 80 58 1 False
## 12 105 78 1 False
## 13 115 78 1 False
## 14 20 45 1 False
## 15 25 30 1 False
## 16 80 70 1 False
## 17 20 50 1 False
## 18 25 35 1 False
## 19 80 75 1 False
## 20 80 145 1 False
## 21 35 56 1 False
## 22 50 71 1 False
## 23 70 101 1 False
## 24 80 121 1 False
## 25 35 72 1 False
Some Pokemon were duplicated – mostly those that were split into multiple Mega versions of themselves. To remove these, I will use a regular expression.
# Drop the Mega-evolution rows. In this dataset a Mega form is named
# "<Name>Mega <Name>" (e.g. "VenusaurMega Venusaur"), so we match "Mega"
# followed by a space. Matching the bare substring "Mega" would also discard
# ordinary Pokemon whose name merely starts with it (e.g. "Meganium").
# The previous pattern '\\Mega' contained a meaningless `\M` escape.
pokemon_data <- pokemon_data %>%
  filter(!str_detect(Name, "Mega "))

# Preview the first 15 rows to confirm the Mega forms are gone.
head(pokemon_data, 15)
## X. Name Type.1 Type.2 Total HP Attack Defense Sp..Atk Sp..Def Speed
## 1 1 Bulbasaur Grass Poison 318 45 49 49 65 65 45
## 2 2 Ivysaur Grass Poison 405 60 62 63 80 80 60
## 3 3 Venusaur Grass Poison 525 80 82 83 100 100 80
## 4 4 Charmander Fire 309 39 52 43 60 50 65
## 5 5 Charmeleon Fire 405 58 64 58 80 65 80
## 6 6 Charizard Fire Flying 534 78 84 78 109 85 100
## 7 7 Squirtle Water 314 44 48 65 50 64 43
## 8 8 Wartortle Water 405 59 63 80 65 80 58
## 9 9 Blastoise Water 530 79 83 100 85 105 78
## 10 10 Caterpie Bug 195 45 30 35 20 20 45
## 11 11 Metapod Bug 205 50 20 55 25 25 30
## 12 12 Butterfree Bug Flying 395 60 45 50 90 80 70
## 13 13 Weedle Bug Poison 195 40 35 30 20 20 50
## 14 14 Kakuna Bug Poison 205 45 25 50 25 25 35
## 15 15 Beedrill Bug Poison 395 65 90 40 45 80 75
## Generation Legendary
## 1 1 False
## 2 1 False
## 3 1 False
## 4 1 False
## 5 1 False
## 6 1 False
## 7 1 False
## 8 1 False
## 9 1 False
## 10 1 False
## 11 1 False
## 12 1 False
## 13 1 False
## 14 1 False
## 15 1 False
By comparing both data frames, we can see that the Mega Pokemon were indeed removed.
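For this comparison, it is useful to select only the relevant columns and to group the Pokemon by their type. To find the maximum and minimum value per Pokemon type, I used `group_by` together with several `mutate` steps. Note that the value compared is the `Total` column (the sum of the six individual stats, e.g. 318 for Bulbasaur), which is labelled "Total HP" in the tables below.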
# For every primary type (Type.1), keep only the Pokemon whose `Total` equals
# that type's maximum or minimum, and tag each kept row as "High HP" or
# "Low HP". Rows are pre-sorted by type and Total so that ties in the final
# descending sort keep a stable, type-alphabetical order.
# NOTE(review): the value compared is the `Total` column, not the `HP`
# column, even though it is presented as "Total HP" -- confirm this is
# intended.
lowandHighHp <- pokemon_data %>%
  select(X., Name, Type.1, Total) %>%
  arrange(Type.1, Total) %>%
  group_by(Type.1) %>%
  mutate(High_HP = max(Total), Low_HP = min(Total)) %>%
  ungroup() %>%
  # Keep only the per-type extremes.
  filter(Total == High_HP | Total == Low_HP) %>%
  # A row that is both the max and the min of its type is labelled "High HP".
  mutate(HP_Stats = ifelse(Total == High_HP, "High HP", "Low HP")) %>%
  # Pick the presentation columns and give them display names in one step.
  select(
    "Pokemon #" = X.,
    "Pokemon" = Name,
    "Type" = Type.1,
    HP_Stats,
    "Total HP" = Total
  ) %>%
  arrange(desc(`Total HP`))

# Split the combined table into its two halves.
highestHp <- filter(lowandHighHp, HP_Stats == "High HP")
lowestHp <- filter(lowandHighHp, HP_Stats == "Low HP")

highestHp
## # A tibble: 29 x 5
## `Pokemon #` Pokemon Type HP_Stats `Total HP`
## <int> <chr> <chr> <chr> <int>
## 1 383 GroudonPrimal Groudon Ground High HP 770
## 2 382 KyogrePrimal Kyogre Water High HP 770
## 3 493 Arceus Normal High HP 720
## 4 646 KyuremBlack Kyurem Dragon High HP 700
## 5 646 KyuremWhite Kyurem Dragon High HP 700
## 6 717 Yveltal Dark High HP 680
## 7 716 Xerneas Fairy High HP 680
## 8 250 Ho-oh Fire High HP 680
## 9 487 GiratinaAltered Forme Ghost High HP 680
## 10 487 GiratinaOrigin Forme Ghost High HP 680
## # ... with 19 more rows
# Print the per-type minimum rows (those tagged "Low HP" in lowandHighHp).
lowestHp
## # A tibble: 24 x 5
## `Pokemon #` Pokemon Type HP_Stats `Total HP`
## <int> <chr> <chr> <chr> <int>
## 1 147 Dratini Dragon Low HP 300
## 2 371 Bagon Dragon Low HP 300
## 3 443 Gible Dragon Low HP 300
## 4 704 Goomy Dragon Low HP 300
## 5 374 Beldum Steel Low HP 300
## 6 436 Bronzor Steel Low HP 300
## 7 599 Klink Steel Low HP 300
## 8 524 Roggenrola Rock Low HP 280
## 9 607 Litwick Ghost Low HP 275
## 10 50 Diglett Ground Low HP 265
## # ... with 14 more rows
# Render the per-type maximum rows as a striped, content-width table.
highestHp %>%
  kable(align = "clc", caption = "Pokemon w/ Highest HP in Group") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Pokemon # | Pokemon | Type | HP_Stats | Total HP |
|---|---|---|---|---|
| 383 | GroudonPrimal Groudon | Ground | High HP | 770 |
| 382 | KyogrePrimal Kyogre | Water | High HP | 770 |
| 493 | Arceus | Normal | High HP | 720 |
| 646 | KyuremBlack Kyurem | Dragon | High HP | 700 |
| 646 | KyuremWhite Kyurem | Dragon | High HP | 700 |
| 717 | Yveltal | Dark | High HP | 680 |
| 716 | Xerneas | Fairy | High HP | 680 |
| 250 | Ho-oh | Fire | High HP | 680 |
| 487 | GiratinaAltered Forme | Ghost | High HP | 680 |
| 487 | GiratinaOrigin Forme | Ghost | High HP | 680 |
| 150 | Mewtwo | Psychic | High HP | 680 |
| 249 | Lugia | Psychic | High HP | 680 |
| 720 | HoopaHoopa Unbound | Psychic | High HP | 680 |
| 483 | Dialga | Steel | High HP | 680 |
| 649 | Genesect | Bug | High HP | 600 |
| 492 | ShayminLand Forme | Grass | High HP | 600 |
| 492 | ShayminSky Forme | Grass | High HP | 600 |
| 248 | Tyranitar | Rock | High HP | 600 |
| 719 | Diancie | Rock | High HP | 600 |
| 145 | Zapdos | Electric | High HP | 580 |
| 243 | Raikou | Electric | High HP | 580 |
| 642 | ThundurusIncarnate Forme | Electric | High HP | 580 |
| 642 | ThundurusTherian Forme | Electric | High HP | 580 |
| 641 | TornadusIncarnate Forme | Flying | High HP | 580 |
| 641 | TornadusTherian Forme | Flying | High HP | 580 |
| 144 | Articuno | Ice | High HP | 580 |
| 378 | Regice | Ice | High HP | 580 |
| 169 | Crobat | Poison | High HP | 535 |
| 448 | Lucario | Fighting | High HP | 525 |
# Render the per-type minimum rows as a striped, content-width table.
lowestHp %>%
  kable(align = "clc", caption = "Pokemon w/ Lowest HP in Group") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Pokemon # | Pokemon | Type | HP_Stats | Total HP |
|---|---|---|---|---|
| 147 | Dratini | Dragon | Low HP | 300 |
| 371 | Bagon | Dragon | Low HP | 300 |
| 443 | Gible | Dragon | Low HP | 300 |
| 704 | Goomy | Dragon | Low HP | 300 |
| 374 | Beldum | Steel | Low HP | 300 |
| 436 | Bronzor | Steel | Low HP | 300 |
| 599 | Klink | Steel | Low HP | 300 |
| 524 | Roggenrola | Rock | Low HP | 280 |
| 607 | Litwick | Ghost | Low HP | 275 |
| 50 | Diglett | Ground | Low HP | 265 |
| 218 | Slugma | Fire | Low HP | 250 |
| 220 | Swinub | Ice | Low HP | 250 |
| 714 | Noibat | Flying | Low HP | 245 |
| 41 | Zubat | Poison | Low HP | 245 |
| 261 | Poochyena | Dark | Low HP | 220 |
| 173 | Cleffa | Fairy | Low HP | 218 |
| 236 | Tyrogue | Fighting | Low HP | 210 |
| 172 | Pichu | Electric | Low HP | 205 |
| 129 | Magikarp | Water | Low HP | 200 |
| 349 | Feebas | Water | Low HP | 200 |
| 280 | Ralts | Psychic | Low HP | 198 |
| 401 | Kricketot | Bug | Low HP | 194 |
| 298 | Azurill | Normal | Low HP | 190 |
| 191 | Sunkern | Grass | Low HP | 180 |
The Pokemon that have the highest HP for their group are evolved forms of less evolved Pokemon, as we can see from the list above. Similarly, Pokemon with the lowest HP for their group tend to be non-evolved forms. The goal as a trainer in the game is to increase your Pokemon's hit points (HP), as well as to evolve them so they are more effective against other Pokemon; you do this by battling wild Pokemon or fighting other trainers. For the purpose of answering this question, we will compare Water type Pokemon to Grass type Pokemon.
# Restrict the high/low table to the two types being compared, ordered by
# ascending total.
filter_pokemon_type <- lowandHighHp %>%
  filter(Type %in% c("Grass", "Water")) %>%
  arrange(`Total HP`)

# One bar per Pokemon, coloured by whether it is its type's high or low
# extreme, with one panel per type (each panel has its own x axis).
plot <- ggplot(
  filter_pokemon_type,
  aes(x = Pokemon, y = `Total HP`, fill = HP_Stats)
) +
  geom_bar(stat = "identity", position = "stack", color = "black") +
  facet_wrap(Type ~ ., scales = "free_x") +
  scale_y_continuous() +
  theme(legend.position = "right")

plot
### Conclusion
When we compare the Pokemon that have the highest and lowest hit points from the Grass and Water types, we can see that Sunkern has a lower hit point total than Feebas or Magikarp, which share the lowest hit point total among Water types. Also, KyogrePrimal Kyogre has a higher HP than ShayminLand Forme and ShayminSky Forme (which have the same total), the Grass type Pokemon with the highest HP. Consequently, when we compare the Pokemon with the highest and lowest HP between the Water and Grass types, we see that Water types have a higher HP for both their lowest and highest rated Pokemon compared to Grass type Pokemon.
https://www.kaggle.com/wordsforthewise/lending-club?select=accepted_2007_to_2018Q4.csv.gz – I deleted everything before 2018 because the full list was extremely long; the original can be found at the link above.
Question: is there a direct relationship between these fields (interest rates, grade, loan amount and income)?
# Load the trimmed Lending Club extract from my GitHub repository.
loan <- read.csv(
  file = "https://raw.githubusercontent.com/Eperez54/Dat-607/main/Project%202/accepted2018Q4.csv"
)
# summary(loan) is skipped because its output is too long; list the column
# names instead.
colnames(loan)
## [1] "loan_amnt" "funded_amnt"
## [3] "funded_amnt_inv" "term"
## [5] "int_rate" "installment"
## [7] "grade" "sub_grade"
## [9] "emp_title" "emp_length"
## [11] "home_ownership" "annual_inc"
## [13] "verification_status" "issue_d"
## [15] "loan_status" "pymnt_plan"
## [17] "url" "desc"
## [19] "purpose" "title"
## [21] "zip_code" "addr_state"
## [23] "dti" "delinq_2yrs"
## [25] "earliest_cr_line" "fico_range_low"
## [27] "fico_range_high" "inq_last_6mths"
## [29] "mths_since_last_delinq" "mths_since_last_record"
## [31] "open_acc" "pub_rec"
## [33] "revol_bal" "revol_util"
## [35] "total_acc" "initial_list_status"
## [37] "out_prncp" "out_prncp_inv"
## [39] "total_pymnt" "total_pymnt_inv"
## [41] "total_rec_prncp" "total_rec_int"
## [43] "total_rec_late_fee" "recoveries"
## [45] "collection_recovery_fee" "last_pymnt_d"
## [47] "last_pymnt_amnt" "next_pymnt_d"
## [49] "last_credit_pull_d" "last_fico_range_high"
## [51] "last_fico_range_low" "collections_12_mths_ex_med"
## [53] "mths_since_last_major_derog" "policy_code"
## [55] "application_type" "annual_inc_joint"
## [57] "dti_joint" "verification_status_joint"
## [59] "acc_now_delinq" "tot_coll_amt"
## [61] "tot_cur_bal" "open_acc_6m"
## [63] "open_act_il" "open_il_12m"
## [65] "open_il_24m" "mths_since_rcnt_il"
## [67] "total_bal_il"
# Keep only the four fields the question is about, then confirm the result.
loan <- select(loan, annual_inc, loan_amnt, int_rate, grade)
colnames(loan)
## [1] "annual_inc" "loan_amnt" "int_rate" "grade"
# Preview the first rows of the reduced loan data frame.
head(loan)
## annual_inc loan_amnt int_rate grade
## 1 50000 5000 20.39 D
## 2 196000 15000 9.92 B
## 3 44000 11200 30.79 G
## 4 65000 25000 21.85 D
## 5 52000 3000 7.34 A
## 6 52000 17000 20.39 D
It will be easier for me to create new fields that categorize the loan amount, the interest rate, and the customer's annual salary.
# Bucket the three numeric fields into ordered categories.
#
# findInterval(x, breaks) returns, for each value, the 1-based index of the
# interval [breaks[i], breaks[i + 1]) it falls into; values at or above the
# last break get the final index. factor() then maps those indices directly
# onto the labels, so no intermediate numeric-to-character overwriting is
# needed. Values below the first break (index 0) and NAs become NA.

# Loan amount group (breaks at 0, 10k, 20k, 30k)
loan$loangroup <- factor(
  findInterval(loan$loan_amnt, seq(0, 30000, 10000)),
  levels = 1:4,
  labels = c("<$10,000", "<$20,000", "<$30,000", "<=$40,000")
)

# Loan interest rate group, also known as APR (breaks at 0, 10, 20, 30)
loan$aprgroup <- factor(
  findInterval(loan$int_rate, seq(0, 30, 10)),
  levels = 1:4,
  labels = c("0-10%", "10-20%", "20-30%", "30-40%")
)

# Customer annual salary, grouped in $25k steps (breaks at 0, 25k, ..., 100k).
# The earlier version listed the second level as " <$50,000" (leading space),
# which turned every "<$50,000" salary into NA; the label is now exact.
loan$salarygroup <- factor(
  findInterval(loan$annual_inc, seq(0, 100000, 25000)),
  levels = 1:5,
  labels = c("<$25,000", "<$50,000", "<$75,000", "<$100,000", ">=$100,000")
)

# Show the last rows to check the new columns.
tail(loan)
## annual_inc loan_amnt int_rate grade loangroup aprgroup salarygroup
## 236053 57000 5000 16.46 C <$10,000 10-20% <$75,000
## 236054 53414 12000 14.03 C <$20,000 10-20% <$75,000
## 236055 22000 5000 6.19 A <$10,000 0-10% <$25,000
## 236056 80000 20000 15.49 C <$30,000 10-20% <$100,000
## 236057 60000 10000 11.05 B <$20,000 10-20% <$75,000
## 236058 9600 4000 16.46 C <$10,000 10-20% <$25,000
# Histogram of loan amounts; each bar is shaded by its own bin count.
# `after_stat(count)` replaces the deprecated `..count..` notation, and
# `bins = 30` states the previously implicit default explicitly (same binning,
# no stat_bin() message).
# NOTE(review): the title mentions "Grade" but grade is not mapped in this
# plot -- confirm the intended title.
ggplot(loan, aes(x = loan_amnt)) +
  geom_histogram(aes(fill = after_stat(count)), bins = 30) +
  labs(title = "Loan distribution Amounts based on Grade")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Loan amount by salary bracket, one panel per loan grade. The salary bracket
# drives both the x position and the fill colour of each box.
ggplot(
  data = loan,
  mapping = aes(
    x = factor(salarygroup),
    y = loan_amnt,
    fill = factor(salarygroup)
  )
) +
  geom_boxplot() +
  facet_grid(cols = vars(grade)) +
  labs(
    title = "Boxplot for Loan based on Income, Grade, and Interest Rate",
    x = "Income",
    y = "Loan Amount"
  ) +
  # Rotate the bracket labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Interest rate distribution per loan-amount bracket: a red-outlined violin
# with the individual loans jittered on top and coloured by grade. Each
# bracket also gets its own panel.
ggplot(
  data = loan,
  mapping = aes(x = factor(loangroup), y = int_rate)
) +
  geom_violin(colour = "red", alpha = 0.6) +
  geom_jitter(mapping = aes(colour = grade), alpha = 0.5) +
  facet_grid(cols = vars(loangroup)) +
  labs(
    title = "Violin Plot for APR based on Loan Amount, Income, and Grade",
    x = "Loan Amount",
    y = "APR"
  ) +
  # Rotate the bracket labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
I am able to visualize the relationships between variables by grouping them into categories.