#Read Files
library(readr)
df <- read_csv("match.csv")
## Rows: 50000 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (12): match_id, start_time, duration, tower_status_radiant, tower_status...
## lgl  (1): radiant_win
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 13
##   match_id start_time duration tower_s…¹ tower…² barra…³ barra…⁴ first…⁵ game_…⁶
##      <dbl>      <dbl>    <dbl>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1        0 1446750112     2375      1982       4       3      63       1      22
## 2        1 1446753078     2582         0    1846      63       0     221      22
## 3        2 1446764586     2716       256    1972      63      48     190      22
## 4        3 1446765723     3085         4    1924      51       3      40      22
## 5        4 1446796385     1887      2047       0       0      63      58      22
## 6        5 1446798766     1574      2047       4       3      63     113      22
## # … with 4 more variables: radiant_win <lgl>, negative_votes <dbl>,
## #   positive_votes <dbl>, cluster <dbl>, and abbreviated variable names
## #   ¹​tower_status_radiant, ²​tower_status_dire, ³​barracks_status_dire,
## #   ⁴​barracks_status_radiant, ⁵​first_blood_time, ⁶​game_mode
str(df)
## spc_tbl_ [50,000 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ match_id               : num [1:50000] 0 1 2 3 4 5 6 7 8 9 ...
##  $ start_time             : num [1:50000] 1.45e+09 1.45e+09 1.45e+09 1.45e+09 1.45e+09 ...
##  $ duration               : num [1:50000] 2375 2582 2716 3085 1887 ...
##  $ tower_status_radiant   : num [1:50000] 1982 0 256 4 2047 ...
##  $ tower_status_dire      : num [1:50000] 4 1846 1972 1924 0 ...
##  $ barracks_status_dire   : num [1:50000] 3 63 63 51 0 3 3 0 63 63 ...
##  $ barracks_status_radiant: num [1:50000] 63 0 48 3 63 63 63 63 0 0 ...
##  $ first_blood_time       : num [1:50000] 1 221 190 40 58 113 4 255 4 85 ...
##  $ game_mode              : num [1:50000] 22 22 22 22 22 22 22 22 22 22 ...
##  $ radiant_win            : logi [1:50000] TRUE FALSE FALSE FALSE TRUE TRUE ...
##  $ negative_votes         : num [1:50000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ positive_votes         : num [1:50000] 1 2 0 0 0 0 0 0 0 0 ...
##  $ cluster                : num [1:50000] 155 154 132 191 156 155 151 138 182 133 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   match_id = col_double(),
##   ..   start_time = col_double(),
##   ..   duration = col_double(),
##   ..   tower_status_radiant = col_double(),
##   ..   tower_status_dire = col_double(),
##   ..   barracks_status_dire = col_double(),
##   ..   barracks_status_radiant = col_double(),
##   ..   first_blood_time = col_double(),
##   ..   game_mode = col_double(),
##   ..   radiant_win = col_logical(),
##   ..   negative_votes = col_double(),
##   ..   positive_votes = col_double(),
##   ..   cluster = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
#check missing val
sum(is.na(df))
## [1] 0
#Duration still in seconds so change it to min
df$duration <- df$duration/60
#drop unnecessary columns to make our analysis tight
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df <- select(df, -c("start_time", "tower_status_radiant", "tower_status_dire", "barracks_status_dire", "game_mode", "negative_votes", "positive_votes", "cluster", "barracks_status_radiant"))

str(df)
## tibble [50,000 × 4] (S3: tbl_df/tbl/data.frame)
##  $ match_id        : num [1:50000] 0 1 2 3 4 5 6 7 8 9 ...
##  $ duration        : num [1:50000] 39.6 43 45.3 51.4 31.4 ...
##  $ first_blood_time: num [1:50000] 1 221 190 40 58 113 4 255 4 85 ...
##  $ radiant_win     : logi [1:50000] TRUE FALSE FALSE FALSE TRUE TRUE ...
#so if radiant lose means dire win rigth? so we make another column to include that info
df$dire_win <- ifelse(df$radiant_win == "True", "0", "1")
head(df)
## # A tibble: 6 × 5
##   match_id duration first_blood_time radiant_win dire_win
##      <dbl>    <dbl>            <dbl> <lgl>       <chr>   
## 1        0     39.6                1 TRUE        1       
## 2        1     43.0              221 FALSE       1       
## 3        2     45.3              190 FALSE       1       
## 4        3     51.4               40 FALSE       1       
## 5        4     31.4               58 TRUE        1       
## 6        5     26.2              113 TRUE        1
#move on to the maestro of the game
df2 <- read_csv("players.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 500000 Columns: 73
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): stuns
## dbl (64): match_id, account_id, hero_id, player_slot, gold, gold_spent, gold...
## lgl  (8): unit_order_none, unit_order_taunt, unit_order_cast_rune, unit_orde...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df2)
## # A tibble: 6 × 73
##   match_id account_id hero_id playe…¹  gold gold_…² gold_…³ xp_pe…⁴ kills deaths
##      <dbl>      <dbl>   <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dbl>  <dbl>
## 1        0          0      86       0  3261   10960     347     362     9      3
## 2        0          1      51       1  2954   17760     494     659    13      3
## 3        0          0      83       2   110   12195     350     385     0      4
## 4        0          2      11       3  1179   22505     599     605     8      4
## 5        0          3      67       4  3307   23825     613     762    20      3
## 6        0          4     106     128   476   12285     397     524     5      6
## # … with 63 more variables: assists <dbl>, denies <dbl>, last_hits <dbl>,
## #   stuns <chr>, hero_damage <dbl>, hero_healing <dbl>, tower_damage <dbl>,
## #   item_0 <dbl>, item_1 <dbl>, item_2 <dbl>, item_3 <dbl>, item_4 <dbl>,
## #   item_5 <dbl>, level <dbl>, leaver_status <dbl>, xp_hero <dbl>,
## #   xp_creep <dbl>, xp_roshan <dbl>, xp_other <dbl>, gold_other <dbl>,
## #   gold_death <dbl>, gold_buyback <dbl>, gold_abandon <dbl>, gold_sell <dbl>,
## #   gold_destroying_structure <dbl>, gold_killing_heros <dbl>, …
#remove unnecessary columns again
df2 <- select(df2, -c("item_0", "item_1", "item_2", "item_3","item_4", "item_5", "level", "leaver_status", "xp_hero", "xp_creep","xp_roshan", "xp_other", "gold_other", "gold_death", "gold_buyback",       "gold_abandon", "gold_sell", "gold_destroying_structure",       "gold_killing_heros", "gold_killing_creeps", "gold_killing_roshan",
       "gold_killing_couriers", "unit_order_none","unit_order_move_to_position", "unit_order_move_to_target","unit_order_attack_move", "unit_order_attack_target",
       "unit_order_cast_position", "unit_order_cast_target","unit_order_cast_target_tree", "unit_order_cast_no_target",
       "unit_order_cast_toggle", "unit_order_hold_position","unit_order_train_ability","unit_order_drop_item","unit_order_give_item", "unit_order_pickup_item",
       "unit_order_pickup_rune", "unit_order_purchase_item","unit_order_sell_item", "unit_order_disassemble_item","unit_order_move_item", "unit_order_cast_toggle_auto",
       "unit_order_stop", "unit_order_taunt", "unit_order_buyback","unit_order_glyph","unit_order_eject_item_from_stash",
       "unit_order_cast_rune", "unit_order_ping_ability","unit_order_move_to_direction", "unit_order_patrol",
       "unit_order_vector_target_position", "unit_order_radar","unit_order_set_item_combine_lock", "unit_order_continue"))
# better
head(df2)
## # A tibble: 6 × 17
##   match_id account_id hero_id playe…¹  gold gold_…² gold_…³ xp_pe…⁴ kills deaths
##      <dbl>      <dbl>   <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dbl>  <dbl>
## 1        0          0      86       0  3261   10960     347     362     9      3
## 2        0          1      51       1  2954   17760     494     659    13      3
## 3        0          0      83       2   110   12195     350     385     0      4
## 4        0          2      11       3  1179   22505     599     605     8      4
## 5        0          3      67       4  3307   23825     613     762    20      3
## 6        0          4     106     128   476   12285     397     524     5      6
## # … with 7 more variables: assists <dbl>, denies <dbl>, last_hits <dbl>,
## #   stuns <chr>, hero_damage <dbl>, hero_healing <dbl>, tower_damage <dbl>, and
## #   abbreviated variable names ¹​player_slot, ²​gold_spent, ³​gold_per_min,
## #   ⁴​xp_per_min
#merge/join the data

dota <- left_join(df2, df, by = c("match_id" ="match_id"))
dota
## # A tibble: 500,000 × 21
##    match_id account…¹ hero_id playe…²  gold gold_…³ gold_…⁴ xp_pe…⁵ kills deaths
##       <dbl>     <dbl>   <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dbl>  <dbl>
##  1        0         0      86       0  3261   10960     347     362     9      3
##  2        0         1      51       1  2954   17760     494     659    13      3
##  3        0         0      83       2   110   12195     350     385     0      4
##  4        0         2      11       3  1179   22505     599     605     8      4
##  5        0         3      67       4  3307   23825     613     762    20      3
##  6        0         4     106     128   476   12285     397     524     5      6
##  7        0         0     102     129   317   10355     303     369     4     13
##  8        0         5      46     130  2390   13395     452     517     4      8
##  9        0         0       7     131   475    5035     189     223     1     14
## 10        0         6      73     132    60   17550     496     456     1     11
## # … with 499,990 more rows, 11 more variables: assists <dbl>, denies <dbl>,
## #   last_hits <dbl>, stuns <chr>, hero_damage <dbl>, hero_healing <dbl>,
## #   tower_damage <dbl>, duration <dbl>, first_blood_time <dbl>,
## #   radiant_win <lgl>, dire_win <chr>, and abbreviated variable names
## #   ¹​account_id, ²​player_slot, ³​gold_spent, ⁴​gold_per_min, ⁵​xp_per_min
df3 <- read_csv("hero_names.csv")
## Rows: 112 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, localized_name
## dbl (1): hero_id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#drop unnecessary columns 
df3 <- select(df3, -c("name"))
#merge the data again
dota <- left_join(dota, df3, by = c("hero_id" ="hero_id"))
dota
## # A tibble: 500,000 × 22
##    match_id account…¹ hero_id playe…²  gold gold_…³ gold_…⁴ xp_pe…⁵ kills deaths
##       <dbl>     <dbl>   <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dbl>  <dbl>
##  1        0         0      86       0  3261   10960     347     362     9      3
##  2        0         1      51       1  2954   17760     494     659    13      3
##  3        0         0      83       2   110   12195     350     385     0      4
##  4        0         2      11       3  1179   22505     599     605     8      4
##  5        0         3      67       4  3307   23825     613     762    20      3
##  6        0         4     106     128   476   12285     397     524     5      6
##  7        0         0     102     129   317   10355     303     369     4     13
##  8        0         5      46     130  2390   13395     452     517     4      8
##  9        0         0       7     131   475    5035     189     223     1     14
## 10        0         6      73     132    60   17550     496     456     1     11
## # … with 499,990 more rows, 12 more variables: assists <dbl>, denies <dbl>,
## #   last_hits <dbl>, stuns <chr>, hero_damage <dbl>, hero_healing <dbl>,
## #   tower_damage <dbl>, duration <dbl>, first_blood_time <dbl>,
## #   radiant_win <lgl>, dire_win <chr>, localized_name <chr>, and abbreviated
## #   variable names ¹​account_id, ²​player_slot, ³​gold_spent, ⁴​gold_per_min,
## #   ⁵​xp_per_min
df4 <- read_csv("player_ratings.csv")
## Rows: 834226 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): account_id, total_wins, total_matches, trueskill_mu, trueskill_sigma
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#merge.. again.. :')
dota <- left_join(dota, df4, by = c("account_id" = "account_id"))
head(dota)
## # A tibble: 6 × 26
##   match_id account_id hero_id playe…¹  gold gold_…² gold_…³ xp_pe…⁴ kills deaths
##      <dbl>      <dbl>   <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dbl>  <dbl>
## 1        0          0      86       0  3261   10960     347     362     9      3
## 2        0          1      51       1  2954   17760     494     659    13      3
## 3        0          0      83       2   110   12195     350     385     0      4
## 4        0          2      11       3  1179   22505     599     605     8      4
## 5        0          3      67       4  3307   23825     613     762    20      3
## 6        0          4     106     128   476   12285     397     524     5      6
## # … with 16 more variables: assists <dbl>, denies <dbl>, last_hits <dbl>,
## #   stuns <chr>, hero_damage <dbl>, hero_healing <dbl>, tower_damage <dbl>,
## #   duration <dbl>, first_blood_time <dbl>, radiant_win <lgl>, dire_win <chr>,
## #   localized_name <chr>, total_wins <dbl>, total_matches <dbl>,
## #   trueskill_mu <dbl>, trueskill_sigma <dbl>, and abbreviated variable names
## #   ¹​player_slot, ²​gold_spent, ³​gold_per_min, ⁴​xp_per_min
dota$win_rate <- dota$total_wins/dota$total_matches*100
dota %>% arrange(desc(dota$win_rate))
## # A tibble: 500,000 × 27
##    match_id account…¹ hero_id playe…²  gold gold_…³ gold_…⁴ xp_pe…⁵ kills deaths
##       <dbl>     <dbl>   <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dbl>  <dbl>
##  1        2        11     109       1   781   12515     322     358     6     11
##  2        2        12       9       2   640   13845     355     425    10      6
##  3        2        13      41       3   667   13260     328     345     0      9
##  4        3        17      78     128  1807   20350     459     632    13      8
##  5        8        45      28       1  1043    9555     352     410     3      9
##  6        9        50      26     129  3522   16475     404     615    11      6
##  7       11        61      50       4  3832   12075     310     430     5      9
##  8       15        83      93       3  2297   12770     384     616     7     13
##  9       22       129      39     128    48   16795     422     538    12     10
## 10       24       145      98     130  3819   15890     439     454    11     12
## # … with 499,990 more rows, 17 more variables: assists <dbl>, denies <dbl>,
## #   last_hits <dbl>, stuns <chr>, hero_damage <dbl>, hero_healing <dbl>,
## #   tower_damage <dbl>, duration <dbl>, first_blood_time <dbl>,
## #   radiant_win <lgl>, dire_win <chr>, localized_name <chr>, total_wins <dbl>,
## #   total_matches <dbl>, trueskill_mu <dbl>, trueskill_sigma <dbl>,
## #   win_rate <dbl>, and abbreviated variable names ¹​account_id, ²​player_slot,
## #   ³​gold_spent, ⁴​gold_per_min, ⁵​xp_per_min

#checking obs

sum(unique(dota$match_id))
## [1] 1249975000
sum(unique(dota$account_id))
## [1] 12539023980

#missing val

sum(is.na(dota))
## [1] 170647
dota <- na.omit(dota)
#for dota players you might ban this hero 
dota %>% group_by(localized_name) %>% summarise(average = mean(win_rate, na.rm = T)) %>% arrange(desc(average))
## # A tibble: 110 × 2
##    localized_name average
##    <chr>            <dbl>
##  1 Elder Titan       52.3
##  2 Io                51.8
##  3 Bane              51.8
##  4 Earth Spirit      51.6
##  5 Phoenix           51.6
##  6 Dazzle            51.5
##  7 Terrorblade       51.4
##  8 Oracle            51.4
##  9 Visage            51.3
## 10 Timbersaw         51.3
## # … with 100 more rows

interesting, so support role is very important huh

#for dota developers you might want to give this hero some epic skin
dota %>% group_by(localized_name) %>% summarise(frequency = n()) %>% arrange(desc(frequency))
## # A tibble: 110 × 2
##    localized_name frequency
##    <chr>              <int>
##  1 Windranger         19456
##  2 Shadow Fiend       15926
##  3 Invoker            10859
##  4 Slardar            10487
##  5 Earthshaker        10436
##  6 Queen of Pain       9877
##  7 Tusk                9668
##  8 Juggernaut          9615
##  9 Alchemist           9182
## 10 Pudge               8836
## # … with 100 more rows

mostly they already have arcana (dota2 hero epic skin)

dota %>% group_by(localized_name) %>% summarise(frequency = n()) %>% arrange(frequency)
## # A tibble: 110 × 2
##    localized_name frequency
##    <chr>              <int>
##  1 Chen                 545
##  2 Elder Titan          792
##  3 Brewmaster           875
##  4 Lone Druid           919
##  5 Naga Siren           929
##  6 Lycan                932
##  7 Visage               940
##  8 Enchantress          944
##  9 Oracle               970
## 10 Batrider             980
## # … with 100 more rows

that’s why most of support hero got the biggest winrate

dota %>% group_by(account_id) %>% summarise(kills = sum(kills),assists = sum(assists),deaths = sum(deaths) ,matchplayed = n(), winrate = mean(win_rate)) %>% arrange(desc(kills))
## # A tibble: 131,387 × 6
##    account_id   kills assists  deaths matchplayed winrate
##         <dbl>   <dbl>   <dbl>   <dbl>       <int>   <dbl>
##  1          0 1347261 2109685 1454732      181162    48.5
##  2       2962     595     712     464          57    53.0
##  3        732     545     415     263          40    52  
##  4       4709     517     418     333          43    49.0
##  5      22106     517     258     133          25    84.4
##  6        170     494     358     282          38    60  
##  7      23079     488     430     264          38    56.2
##  8       2543     475     334     293          42    55.6
##  9      12823     457     795     498          54    43.9
## 10       4638     439     265     152          25    75  
## # … with 131,377 more rows
#guys forget my first analysis that tells you which heroes to ban, ban this heroes instead
dota %>% group_by(localized_name) %>% summarise(kills = sum(kills),assists = sum(assists),deaths = sum(deaths) ,matchplayed = n(), winrate = mean(win_rate)) %>% arrange(desc(kills))
## # A tibble: 110 × 6
##    localized_name  kills assists deaths matchplayed winrate
##    <chr>           <dbl>   <dbl>  <dbl>       <int>   <dbl>
##  1 Windranger     176647  193832 133420       19456    50.2
##  2 Shadow Fiend   164284  158819 115260       15926    50.4
##  3 Queen of Pain  106847  114564  68827        9877    50.3
##  4 Invoker         99487  130484  76199       10859    49.9
##  5 Juggernaut      91856   80523  61681        9615    50.3
##  6 Slark           83746   78349  55855        7888    50.3
##  7 Pudge           77699  107402  85291        8836    49.4
##  8 Tusk            77453  134714  87820        9668    50.8
##  9 Lina            68635   76691  65817        7638    49.9
## 10 Ember Spirit    68464   88231  47045        7028    51.2
## # … with 100 more rows

now i dont see any support heroes

#are all of that carry hero have a lot of kills because they earn gold and exp easily?
dota %>% group_by(localized_name) %>% summarise(gold = mean(gold),gold_per_min = sum(gold_per_min),exp_per_min = mean(xp_per_min)) %>% arrange(desc(gold))
## # A tibble: 110 × 4
##    localized_name    gold gold_per_min exp_per_min
##    <chr>            <dbl>        <dbl>       <dbl>
##  1 Alchemist        2764.      6524297        578.
##  2 Medusa           2503.      1102734        532.
##  3 Terrorblade      2495.       724807        494.
##  4 Luna             2462.      1169611        545.
##  5 Spectre          2462.      3082588        540.
##  6 Anti-Mage        2416.      4900909        582.
##  7 Templar Assassin 2348.      2954490        575.
##  8 Meepo            2336.       831054        677.
##  9 Invoker          2322.      4889102        497.
## 10 Wraith King      2294.      3508868        524.
## # … with 100 more rows

that’s not like what i expected maybe the hero above have a lot of kills because they have usefull skill to do ks? (ks in dota2 means “kill secure”, you know like your team mates do the hard work and u just need a little touch up and do the kill) i mean windranger is that kind of hero for sure. Annoying right

# let's change out point of view a little
dota %>% group_by(localized_name) %>% summarise(gold = mean(gold),gold_per_min = sum(gold_per_min),exp_per_min = mean(xp_per_min)) %>% arrange(desc(exp_per_min))
## # A tibble: 110 × 4
##    localized_name    gold gold_per_min exp_per_min
##    <chr>            <dbl>        <dbl>       <dbl>
##  1 Meepo            2336.       831054        677.
##  2 Anti-Mage        2416.      4900909        582.
##  3 Alchemist        2764.      6524297        578.
##  4 Templar Assassin 2348.      2954490        575.
##  5 Shadow Fiend     2251.      8793962        573.
##  6 Juggernaut       2226.      5011800        550.
##  7 Ursa             2168.      1913598        549.
##  8 Slark            2037.      3777711        548.
##  9 Luna             2462.      1169611        545.
## 10 Ember Spirit     2027.      3490635        543.
## # … with 100 more rows

okay let’s see, i see slark, shadow fiend, and ursa here. Okay so maybe getting lot of exp is a lot usefull than farm all the time to get golds? you decide

# lets change our point of view again, so in dota2 the player that get kills is the player that get the last hit right, and we get gold from killing creeps and monster also if we get the last hit. OKay let's check it out peeps

dota %>% group_by(localized_name) %>% summarise(last_hits = sum(last_hits)) %>% arrange(desc(last_hits))
## # A tibble: 110 × 2
##    localized_name   last_hits
##    <chr>                <dbl>
##  1 Shadow Fiend       3940258
##  2 Windranger         2710820
##  3 Anti-Mage          2640646
##  4 Alchemist          2438584
##  5 Juggernaut         2123373
##  6 Queen of Pain      1586999
##  7 Ember Spirit       1571429
##  8 Invoker            1567701
##  9 Gyrocopter         1427665
## 10 Legion Commander   1393329
## # … with 100 more rows

I see windranger hereee O_O

okay for my opinion shadow fiend is the most terrfying heroes because he can get exp easily, last hit easily and also kills. Disclaimer i only just talking about stats here okay, if you use shadow fiend and still lose the game, Improve your skill mate! cheers

Okay That’s it for my EDA, thank you for reading