#Read Files
library(readr)
df <- read_csv("match.csv")
## Rows: 50000 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (12): match_id, start_time, duration, tower_status_radiant, tower_status...
## lgl (1): radiant_win
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 13
## match_id start_time duration tower_s…¹ tower…² barra…³ barra…⁴ first…⁵ game_…⁶
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1446750112 2375 1982 4 3 63 1 22
## 2 1 1446753078 2582 0 1846 63 0 221 22
## 3 2 1446764586 2716 256 1972 63 48 190 22
## 4 3 1446765723 3085 4 1924 51 3 40 22
## 5 4 1446796385 1887 2047 0 0 63 58 22
## 6 5 1446798766 1574 2047 4 3 63 113 22
## # … with 4 more variables: radiant_win <lgl>, negative_votes <dbl>,
## # positive_votes <dbl>, cluster <dbl>, and abbreviated variable names
## # ¹tower_status_radiant, ²tower_status_dire, ³barracks_status_dire,
## # ⁴barracks_status_radiant, ⁵first_blood_time, ⁶game_mode
str(df)
## spc_tbl_ [50,000 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ match_id : num [1:50000] 0 1 2 3 4 5 6 7 8 9 ...
## $ start_time : num [1:50000] 1.45e+09 1.45e+09 1.45e+09 1.45e+09 1.45e+09 ...
## $ duration : num [1:50000] 2375 2582 2716 3085 1887 ...
## $ tower_status_radiant : num [1:50000] 1982 0 256 4 2047 ...
## $ tower_status_dire : num [1:50000] 4 1846 1972 1924 0 ...
## $ barracks_status_dire : num [1:50000] 3 63 63 51 0 3 3 0 63 63 ...
## $ barracks_status_radiant: num [1:50000] 63 0 48 3 63 63 63 63 0 0 ...
## $ first_blood_time : num [1:50000] 1 221 190 40 58 113 4 255 4 85 ...
## $ game_mode : num [1:50000] 22 22 22 22 22 22 22 22 22 22 ...
## $ radiant_win : logi [1:50000] TRUE FALSE FALSE FALSE TRUE TRUE ...
## $ negative_votes : num [1:50000] 0 0 0 0 0 0 0 0 0 0 ...
## $ positive_votes : num [1:50000] 1 2 0 0 0 0 0 0 0 0 ...
## $ cluster : num [1:50000] 155 154 132 191 156 155 151 138 182 133 ...
## - attr(*, "spec")=
## .. cols(
## .. match_id = col_double(),
## .. start_time = col_double(),
## .. duration = col_double(),
## .. tower_status_radiant = col_double(),
## .. tower_status_dire = col_double(),
## .. barracks_status_dire = col_double(),
## .. barracks_status_radiant = col_double(),
## .. first_blood_time = col_double(),
## .. game_mode = col_double(),
## .. radiant_win = col_logical(),
## .. negative_votes = col_double(),
## .. positive_votes = col_double(),
## .. cluster = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
#check missing val
sum(is.na(df))
## [1] 0
#Duration still in seconds so change it to min
df$duration <- df$duration/60
#drop unnecessary columns to make our analysis tight
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df <- select(df, -c("start_time", "tower_status_radiant", "tower_status_dire", "barracks_status_dire", "game_mode", "negative_votes", "positive_votes", "cluster", "barracks_status_radiant"))
str(df)
## tibble [50,000 × 4] (S3: tbl_df/tbl/data.frame)
## $ match_id : num [1:50000] 0 1 2 3 4 5 6 7 8 9 ...
## $ duration : num [1:50000] 39.6 43 45.3 51.4 31.4 ...
## $ first_blood_time: num [1:50000] 1 221 190 40 58 113 4 255 4 85 ...
## $ radiant_win : logi [1:50000] TRUE FALSE FALSE FALSE TRUE TRUE ...
#so if radiant lose means dire win rigth? so we make another column to include that info
df$dire_win <- ifelse(df$radiant_win == "True", "0", "1")
head(df)
## # A tibble: 6 × 5
## match_id duration first_blood_time radiant_win dire_win
## <dbl> <dbl> <dbl> <lgl> <chr>
## 1 0 39.6 1 TRUE 1
## 2 1 43.0 221 FALSE 1
## 3 2 45.3 190 FALSE 1
## 4 3 51.4 40 FALSE 1
## 5 4 31.4 58 TRUE 1
## 6 5 26.2 113 TRUE 1
#move on to the maestro of the game
df2 <- read_csv("players.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 500000 Columns: 73
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): stuns
## dbl (64): match_id, account_id, hero_id, player_slot, gold, gold_spent, gold...
## lgl (8): unit_order_none, unit_order_taunt, unit_order_cast_rune, unit_orde...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df2)
## # A tibble: 6 × 73
## match_id account_id hero_id playe…¹ gold gold_…² gold_…³ xp_pe…⁴ kills deaths
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0 86 0 3261 10960 347 362 9 3
## 2 0 1 51 1 2954 17760 494 659 13 3
## 3 0 0 83 2 110 12195 350 385 0 4
## 4 0 2 11 3 1179 22505 599 605 8 4
## 5 0 3 67 4 3307 23825 613 762 20 3
## 6 0 4 106 128 476 12285 397 524 5 6
## # … with 63 more variables: assists <dbl>, denies <dbl>, last_hits <dbl>,
## # stuns <chr>, hero_damage <dbl>, hero_healing <dbl>, tower_damage <dbl>,
## # item_0 <dbl>, item_1 <dbl>, item_2 <dbl>, item_3 <dbl>, item_4 <dbl>,
## # item_5 <dbl>, level <dbl>, leaver_status <dbl>, xp_hero <dbl>,
## # xp_creep <dbl>, xp_roshan <dbl>, xp_other <dbl>, gold_other <dbl>,
## # gold_death <dbl>, gold_buyback <dbl>, gold_abandon <dbl>, gold_sell <dbl>,
## # gold_destroying_structure <dbl>, gold_killing_heros <dbl>, …
#remove unnecessary columns again
df2 <- select(df2, -c("item_0", "item_1", "item_2", "item_3","item_4", "item_5", "level", "leaver_status", "xp_hero", "xp_creep","xp_roshan", "xp_other", "gold_other", "gold_death", "gold_buyback", "gold_abandon", "gold_sell", "gold_destroying_structure", "gold_killing_heros", "gold_killing_creeps", "gold_killing_roshan",
"gold_killing_couriers", "unit_order_none","unit_order_move_to_position", "unit_order_move_to_target","unit_order_attack_move", "unit_order_attack_target",
"unit_order_cast_position", "unit_order_cast_target","unit_order_cast_target_tree", "unit_order_cast_no_target",
"unit_order_cast_toggle", "unit_order_hold_position","unit_order_train_ability","unit_order_drop_item","unit_order_give_item", "unit_order_pickup_item",
"unit_order_pickup_rune", "unit_order_purchase_item","unit_order_sell_item", "unit_order_disassemble_item","unit_order_move_item", "unit_order_cast_toggle_auto",
"unit_order_stop", "unit_order_taunt", "unit_order_buyback","unit_order_glyph","unit_order_eject_item_from_stash",
"unit_order_cast_rune", "unit_order_ping_ability","unit_order_move_to_direction", "unit_order_patrol",
"unit_order_vector_target_position", "unit_order_radar","unit_order_set_item_combine_lock", "unit_order_continue"))
# better
head(df2)
## # A tibble: 6 × 17
## match_id account_id hero_id playe…¹ gold gold_…² gold_…³ xp_pe…⁴ kills deaths
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0 86 0 3261 10960 347 362 9 3
## 2 0 1 51 1 2954 17760 494 659 13 3
## 3 0 0 83 2 110 12195 350 385 0 4
## 4 0 2 11 3 1179 22505 599 605 8 4
## 5 0 3 67 4 3307 23825 613 762 20 3
## 6 0 4 106 128 476 12285 397 524 5 6
## # … with 7 more variables: assists <dbl>, denies <dbl>, last_hits <dbl>,
## # stuns <chr>, hero_damage <dbl>, hero_healing <dbl>, tower_damage <dbl>, and
## # abbreviated variable names ¹player_slot, ²gold_spent, ³gold_per_min,
## # ⁴xp_per_min
#merge/join the data
dota <- left_join(df2, df, by = c("match_id" ="match_id"))
dota
## # A tibble: 500,000 × 21
## match_id account…¹ hero_id playe…² gold gold_…³ gold_…⁴ xp_pe…⁵ kills deaths
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0 86 0 3261 10960 347 362 9 3
## 2 0 1 51 1 2954 17760 494 659 13 3
## 3 0 0 83 2 110 12195 350 385 0 4
## 4 0 2 11 3 1179 22505 599 605 8 4
## 5 0 3 67 4 3307 23825 613 762 20 3
## 6 0 4 106 128 476 12285 397 524 5 6
## 7 0 0 102 129 317 10355 303 369 4 13
## 8 0 5 46 130 2390 13395 452 517 4 8
## 9 0 0 7 131 475 5035 189 223 1 14
## 10 0 6 73 132 60 17550 496 456 1 11
## # … with 499,990 more rows, 11 more variables: assists <dbl>, denies <dbl>,
## # last_hits <dbl>, stuns <chr>, hero_damage <dbl>, hero_healing <dbl>,
## # tower_damage <dbl>, duration <dbl>, first_blood_time <dbl>,
## # radiant_win <lgl>, dire_win <chr>, and abbreviated variable names
## # ¹account_id, ²player_slot, ³gold_spent, ⁴gold_per_min, ⁵xp_per_min
df3 <- read_csv("hero_names.csv")
## Rows: 112 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, localized_name
## dbl (1): hero_id
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#drop unnecessary columns
df3 <- select(df3, -c("name"))
#merge the data again
dota <- left_join(dota, df3, by = c("hero_id" ="hero_id"))
dota
## # A tibble: 500,000 × 22
## match_id account…¹ hero_id playe…² gold gold_…³ gold_…⁴ xp_pe…⁵ kills deaths
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0 86 0 3261 10960 347 362 9 3
## 2 0 1 51 1 2954 17760 494 659 13 3
## 3 0 0 83 2 110 12195 350 385 0 4
## 4 0 2 11 3 1179 22505 599 605 8 4
## 5 0 3 67 4 3307 23825 613 762 20 3
## 6 0 4 106 128 476 12285 397 524 5 6
## 7 0 0 102 129 317 10355 303 369 4 13
## 8 0 5 46 130 2390 13395 452 517 4 8
## 9 0 0 7 131 475 5035 189 223 1 14
## 10 0 6 73 132 60 17550 496 456 1 11
## # … with 499,990 more rows, 12 more variables: assists <dbl>, denies <dbl>,
## # last_hits <dbl>, stuns <chr>, hero_damage <dbl>, hero_healing <dbl>,
## # tower_damage <dbl>, duration <dbl>, first_blood_time <dbl>,
## # radiant_win <lgl>, dire_win <chr>, localized_name <chr>, and abbreviated
## # variable names ¹account_id, ²player_slot, ³gold_spent, ⁴gold_per_min,
## # ⁵xp_per_min
df4 <- read_csv("player_ratings.csv")
## Rows: 834226 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): account_id, total_wins, total_matches, trueskill_mu, trueskill_sigma
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#merge.. again.. :')
dota <- left_join(dota, df4, by = c("account_id" = "account_id"))
head(dota)
## # A tibble: 6 × 26
## match_id account_id hero_id playe…¹ gold gold_…² gold_…³ xp_pe…⁴ kills deaths
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0 86 0 3261 10960 347 362 9 3
## 2 0 1 51 1 2954 17760 494 659 13 3
## 3 0 0 83 2 110 12195 350 385 0 4
## 4 0 2 11 3 1179 22505 599 605 8 4
## 5 0 3 67 4 3307 23825 613 762 20 3
## 6 0 4 106 128 476 12285 397 524 5 6
## # … with 16 more variables: assists <dbl>, denies <dbl>, last_hits <dbl>,
## # stuns <chr>, hero_damage <dbl>, hero_healing <dbl>, tower_damage <dbl>,
## # duration <dbl>, first_blood_time <dbl>, radiant_win <lgl>, dire_win <chr>,
## # localized_name <chr>, total_wins <dbl>, total_matches <dbl>,
## # trueskill_mu <dbl>, trueskill_sigma <dbl>, and abbreviated variable names
## # ¹player_slot, ²gold_spent, ³gold_per_min, ⁴xp_per_min
dota$win_rate <- dota$total_wins/dota$total_matches*100
dota %>% arrange(desc(dota$win_rate))
## # A tibble: 500,000 × 27
## match_id account…¹ hero_id playe…² gold gold_…³ gold_…⁴ xp_pe…⁵ kills deaths
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 11 109 1 781 12515 322 358 6 11
## 2 2 12 9 2 640 13845 355 425 10 6
## 3 2 13 41 3 667 13260 328 345 0 9
## 4 3 17 78 128 1807 20350 459 632 13 8
## 5 8 45 28 1 1043 9555 352 410 3 9
## 6 9 50 26 129 3522 16475 404 615 11 6
## 7 11 61 50 4 3832 12075 310 430 5 9
## 8 15 83 93 3 2297 12770 384 616 7 13
## 9 22 129 39 128 48 16795 422 538 12 10
## 10 24 145 98 130 3819 15890 439 454 11 12
## # … with 499,990 more rows, 17 more variables: assists <dbl>, denies <dbl>,
## # last_hits <dbl>, stuns <chr>, hero_damage <dbl>, hero_healing <dbl>,
## # tower_damage <dbl>, duration <dbl>, first_blood_time <dbl>,
## # radiant_win <lgl>, dire_win <chr>, localized_name <chr>, total_wins <dbl>,
## # total_matches <dbl>, trueskill_mu <dbl>, trueskill_sigma <dbl>,
## # win_rate <dbl>, and abbreviated variable names ¹account_id, ²player_slot,
## # ³gold_spent, ⁴gold_per_min, ⁵xp_per_min
#checking obs
sum(unique(dota$match_id))
## [1] 1249975000
sum(unique(dota$account_id))
## [1] 12539023980
#missing val
sum(is.na(dota))
## [1] 170647
dota <- na.omit(dota)
#for dota players you might ban this hero
dota %>% group_by(localized_name) %>% summarise(average = mean(win_rate, na.rm = T)) %>% arrange(desc(average))
## # A tibble: 110 × 2
## localized_name average
## <chr> <dbl>
## 1 Elder Titan 52.3
## 2 Io 51.8
## 3 Bane 51.8
## 4 Earth Spirit 51.6
## 5 Phoenix 51.6
## 6 Dazzle 51.5
## 7 Terrorblade 51.4
## 8 Oracle 51.4
## 9 Visage 51.3
## 10 Timbersaw 51.3
## # … with 100 more rows
interesting, so support role is very important huh
#for dota developers you might want to give this hero some epic skin
dota %>% group_by(localized_name) %>% summarise(frequency = n()) %>% arrange(desc(frequency))
## # A tibble: 110 × 2
## localized_name frequency
## <chr> <int>
## 1 Windranger 19456
## 2 Shadow Fiend 15926
## 3 Invoker 10859
## 4 Slardar 10487
## 5 Earthshaker 10436
## 6 Queen of Pain 9877
## 7 Tusk 9668
## 8 Juggernaut 9615
## 9 Alchemist 9182
## 10 Pudge 8836
## # … with 100 more rows
mostly they already have arcana (dota2 hero epic skin)
dota %>% group_by(localized_name) %>% summarise(frequency = n()) %>% arrange(frequency)
## # A tibble: 110 × 2
## localized_name frequency
## <chr> <int>
## 1 Chen 545
## 2 Elder Titan 792
## 3 Brewmaster 875
## 4 Lone Druid 919
## 5 Naga Siren 929
## 6 Lycan 932
## 7 Visage 940
## 8 Enchantress 944
## 9 Oracle 970
## 10 Batrider 980
## # … with 100 more rows
that’s why most of support hero got the biggest winrate
dota %>% group_by(account_id) %>% summarise(kills = sum(kills),assists = sum(assists),deaths = sum(deaths) ,matchplayed = n(), winrate = mean(win_rate)) %>% arrange(desc(kills))
## # A tibble: 131,387 × 6
## account_id kills assists deaths matchplayed winrate
## <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 0 1347261 2109685 1454732 181162 48.5
## 2 2962 595 712 464 57 53.0
## 3 732 545 415 263 40 52
## 4 4709 517 418 333 43 49.0
## 5 22106 517 258 133 25 84.4
## 6 170 494 358 282 38 60
## 7 23079 488 430 264 38 56.2
## 8 2543 475 334 293 42 55.6
## 9 12823 457 795 498 54 43.9
## 10 4638 439 265 152 25 75
## # … with 131,377 more rows
#guys forget my first analysis that tells you which heroes to ban, ban this heroes instead
dota %>% group_by(localized_name) %>% summarise(kills = sum(kills),assists = sum(assists),deaths = sum(deaths) ,matchplayed = n(), winrate = mean(win_rate)) %>% arrange(desc(kills))
## # A tibble: 110 × 6
## localized_name kills assists deaths matchplayed winrate
## <chr> <dbl> <dbl> <dbl> <int> <dbl>
## 1 Windranger 176647 193832 133420 19456 50.2
## 2 Shadow Fiend 164284 158819 115260 15926 50.4
## 3 Queen of Pain 106847 114564 68827 9877 50.3
## 4 Invoker 99487 130484 76199 10859 49.9
## 5 Juggernaut 91856 80523 61681 9615 50.3
## 6 Slark 83746 78349 55855 7888 50.3
## 7 Pudge 77699 107402 85291 8836 49.4
## 8 Tusk 77453 134714 87820 9668 50.8
## 9 Lina 68635 76691 65817 7638 49.9
## 10 Ember Spirit 68464 88231 47045 7028 51.2
## # … with 100 more rows
now i dont see any support heroes
#are all of that carry hero have a lot of kills because they earn gold and exp easily?
dota %>% group_by(localized_name) %>% summarise(gold = mean(gold),gold_per_min = sum(gold_per_min),exp_per_min = mean(xp_per_min)) %>% arrange(desc(gold))
## # A tibble: 110 × 4
## localized_name gold gold_per_min exp_per_min
## <chr> <dbl> <dbl> <dbl>
## 1 Alchemist 2764. 6524297 578.
## 2 Medusa 2503. 1102734 532.
## 3 Terrorblade 2495. 724807 494.
## 4 Luna 2462. 1169611 545.
## 5 Spectre 2462. 3082588 540.
## 6 Anti-Mage 2416. 4900909 582.
## 7 Templar Assassin 2348. 2954490 575.
## 8 Meepo 2336. 831054 677.
## 9 Invoker 2322. 4889102 497.
## 10 Wraith King 2294. 3508868 524.
## # … with 100 more rows
that’s not like what i expected maybe the hero above have a lot of kills because they have usefull skill to do ks? (ks in dota2 means “kill secure”, you know like your team mates do the hard work and u just need a little touch up and do the kill) i mean windranger is that kind of hero for sure. Annoying right
# let's change out point of view a little
dota %>% group_by(localized_name) %>% summarise(gold = mean(gold),gold_per_min = sum(gold_per_min),exp_per_min = mean(xp_per_min)) %>% arrange(desc(exp_per_min))
## # A tibble: 110 × 4
## localized_name gold gold_per_min exp_per_min
## <chr> <dbl> <dbl> <dbl>
## 1 Meepo 2336. 831054 677.
## 2 Anti-Mage 2416. 4900909 582.
## 3 Alchemist 2764. 6524297 578.
## 4 Templar Assassin 2348. 2954490 575.
## 5 Shadow Fiend 2251. 8793962 573.
## 6 Juggernaut 2226. 5011800 550.
## 7 Ursa 2168. 1913598 549.
## 8 Slark 2037. 3777711 548.
## 9 Luna 2462. 1169611 545.
## 10 Ember Spirit 2027. 3490635 543.
## # … with 100 more rows
okay let’s see, i see slark, shadow fiend, and ursa here. Okay so maybe getting lot of exp is a lot usefull than farm all the time to get golds? you decide
# lets change our point of view again, so in dota2 the player that get kills is the player that get the last hit right, and we get gold from killing creeps and monster also if we get the last hit. OKay let's check it out peeps
dota %>% group_by(localized_name) %>% summarise(last_hits = sum(last_hits)) %>% arrange(desc(last_hits))
## # A tibble: 110 × 2
## localized_name last_hits
## <chr> <dbl>
## 1 Shadow Fiend 3940258
## 2 Windranger 2710820
## 3 Anti-Mage 2640646
## 4 Alchemist 2438584
## 5 Juggernaut 2123373
## 6 Queen of Pain 1586999
## 7 Ember Spirit 1571429
## 8 Invoker 1567701
## 9 Gyrocopter 1427665
## 10 Legion Commander 1393329
## # … with 100 more rows
I see windranger hereee O_O
okay for my opinion shadow fiend is the most terrfying heroes because he can get exp easily, last hit easily and also kills. Disclaimer i only just talking about stats here okay, if you use shadow fiend and still lose the game, Improve your skill mate! cheers
Okay That’s it for my EDA, thank you for reading