Tidyverse ftw! (solution_libor)

Hacky convert to Rmd

# Build an Rmd skeleton from the notebook: keep only the lines containing
# '###' (the exercise headings), strip the JSON string wrapping around them
# (the leading `    "`, a literal `\n"`, and a trailing comma), then append an
# empty ```{r} chunk after every heading and write it to solution_libor.Rmd.
# NOTE(review): `\n` inside the last replacement is presumably relying on
# GNU sed semantics -- confirm before running with another sed.
<Template_Solution.ipynb grep '###' | 
  sed -e 's/    "//'\
    -e 's/\\n"//'\
    -e 's/,$//'\
    -e 's/$/\n```{r}\n\n```/'\
> solution_libor.Rmd

`kkable`

The construct

xxx %>%
  kkable

is only used to display a nice table in markdown.

One click publish to web from R Studio

The rendered document ends up on RPubs.

1. Read CSV files (~3 lines)

# Virus family / type labels, one row per sample.
dfam <- read_csv("data/families.csv")

## Parsed with column specification:
## cols(
##   sha256 = col_character(),
##   virus_type = col_character(),
##   virus_family = col_character(),
##   magic = col_character()
## )

# Numeric PE-file features, one row per sample.
dfeat <- read_csv("data/features.csv")

## Parsed with column specification:
## cols(
##   max_section_entropy = col_double(),
##   num_sections = col_integer(),
##   min_section_entropy = col_double(),
##   image_base = col_integer(),
##   import_table_size = col_integer(),
##   compile_timestamp = col_integer(),
##   file_size = col_integer(),
##   sha256 = col_character(),
##   non_standard_section_names = col_character()
## )

# PEiD signature per sample.
dpeid <- read_csv("data/peid.csv")

## Parsed with column specification:
## cols(
##   sha256 = col_character(),
##   peid = col_character()
## )

2. Number of rows and number of columns (~3 lines)

# Rows and columns of the families table.
dfam %>% dim()

## [1] 115   4

# Rows and columns of the features table.
dfeat %>% dim()

## [1] 115   9

# Rows and columns of the PEiD table.
dpeid %>% dim()

## [1] 115   2

3. Display first several rows (~3 lines)

# First six rows of the families table.
dfam %>% head()

## # A tibble: 6 x 4
##                                                             sha256
##                                                              <chr>
## 1 336C46AEC4D01E3E44388AB8A537474D056FA79B1F00025999E514E1B4B3FD2B
## 2 E07FF347A27EB18A9863FAB951D218DCFEC84E19623D8C2826BC0268FDB79450
## 3 01C225263D70D9DE177B31C450F5E21E5703E93BE2F8C5D7AD3F17539E143C58
## 4 0FD2A07237A41F6FC64C6D243D9C5DD269848120B46B489BF5054CAD0CB8D04F
## 5 E225839539E22AC7FB975E622F3FE0B4E4D2D040AA539D10B756FC91D92E1E07
## 6 5B75A8F0DA6021F1B41BDC7D23CFE63A25FFAEACB8C8F9940FB0603AD9848CF1
## # ... with 3 more variables: virus_type <chr>, virus_family <chr>,
## #   magic <chr>

4. Display list of column names for every DF (~3 lines)

# Column names of the families table.
names(dfam)

## [1] "sha256"       "virus_type"   "virus_family" "magic"

5. Merge 3 DFs into one DF (~1 line)

# Attach features and PEiD signatures to the family labels, matching on the
# sample hash; every row of `dfam` is kept.
d <- left_join(
  left_join(dfam, dfeat, by = "sha256"),
  dpeid,
  by = "sha256"
)

6. For every feature (column) calculate descriptive statistics (~1 line)

# summary() of every column, returned as a list named by column.
lapply(d, summary)

## $sha256
##    Length     Class      Mode 
##       115 character character 
## 
## $virus_type
##    Length     Class      Mode 
##       115 character character 
## 
## $virus_family
##    Length     Class      Mode 
##       115 character character 
## 
## $magic
##    Length     Class      Mode 
##       115 character character 
## 
## $max_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.706   5.894   6.635   6.491   7.914   7.993 
## 
## $num_sections
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   4.000   4.000   4.383   5.000   8.000 
## 
## $min_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.2008  1.4183  3.4726  5.4093 
## 
## $image_base
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 4.194e+06 4.194e+06 4.194e+06 1.323e+08 2.684e+08 1.997e+09 
## 
## $import_table_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   6.000   4.791   6.000  22.000 
## 
## $compile_timestamp
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 7.090e+08 1.158e+09 1.165e+09 1.198e+09 1.302e+09 1.506e+09 
## 
## $file_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3584  160690  200936  699532  561152 6752860 
## 
## $non_standard_section_names
##    Length     Class      Mode 
##       115 character character 
## 
## $peid
##    Length     Class      Mode 
##       115 character character

7. Check the type of every column and if the dataset has null values (~1 line)

# One row per column: whether it contains any NA, and its class.
# The second output column is deliberately named `class(.x)`.
d %>%
  map_dfr(
    function(column) {
      tibble(hasNA = anyNA(column), `class(.x)` = class(column))
    },
    .id = "col"
  ) %>%
  kkable()

col	hasNA	class(.x)
sha256	FALSE	character
virus_type	FALSE	character
virus_family	FALSE	character
magic	FALSE	character
max_section_entropy	FALSE	numeric
num_sections	FALSE	integer
min_section_entropy	FALSE	numeric
image_base	FALSE	integer
import_table_size	FALSE	integer
compile_timestamp	FALSE	integer
file_size	FALSE	integer
non_standard_section_names	FALSE	character
peid	FALSE	character

8. Select all rows where virus_type = ‘ransomware’ with 3 different ways (~3 lines)

# Keep only the ransomware samples.
filter(d, virus_type == "ransomware")

## # A tibble: 29 x 13
##                                                              sha256
##                                                               <chr>
##  1 336C46AEC4D01E3E44388AB8A537474D056FA79B1F00025999E514E1B4B3FD2B
##  2 E07FF347A27EB18A9863FAB951D218DCFEC84E19623D8C2826BC0268FDB79450
##  3 01C225263D70D9DE177B31C450F5E21E5703E93BE2F8C5D7AD3F17539E143C58
##  4 0FD2A07237A41F6FC64C6D243D9C5DD269848120B46B489BF5054CAD0CB8D04F
##  5 E225839539E22AC7FB975E622F3FE0B4E4D2D040AA539D10B756FC91D92E1E07
##  6 5B75A8F0DA6021F1B41BDC7D23CFE63A25FFAEACB8C8F9940FB0603AD9848CF1
##  7 6B8946A144F05C56C81A77DF3B15C68D1344771AAC812BFB80CD199F058D13CA
##  8 78AAF6530035F26B6DF645E908C10E41692351E7302304A2B35ADCB73D916432
##  9 AC99BA525C11C399BF977E2084E7E21591181F01102C22F82B83CEE7D6CABAE5
## 10 BCDF7A4F4E0EEFD55EC0A814B382559C815106CB7820C93E7BB8A8E216E8C78D
## # ... with 19 more rows, and 12 more variables: virus_type <chr>,
## #   virus_family <chr>, magic <chr>, max_section_entropy <dbl>,
## #   num_sections <int>, min_section_entropy <dbl>, image_base <int>,
## #   import_table_size <int>, compile_timestamp <int>, file_size <int>,
## #   non_standard_section_names <chr>, peid <chr>

9. Draw histogram of the `file_size` for `virus_type` == "infector" (~ 1 line)

# Distribution of file sizes among the infector samples (default binning).
ggplot(filter(d, virus_type == "infector"), aes(x = file_size)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

10. Select rows where `virus_type` is "infector" and `file_size` > 2MB

# Infector samples larger than 2 MB (2 * 1024^2 bytes).
filter(d, virus_type == "infector" & file_size > 2 * 1024^2)

## # A tibble: 10 x 13
##                                                              sha256
##                                                               <chr>
##  1 2C3F5105B5E29460731B2E22DED720908A537017EDC0E6C9802D152427295143
##  2 2FBAA0DE5541A3BE9C73D49CEA29FF0CA90BCADED3D0F6620B65A479F4E5031A
##  3 0BCFB7AB7132E16B93BD19A3009A6CF7AAC4F61A80506FD948DEAF7AD35DA9C8
##  4 3ABBECADD592F277E6FDF1A22853DBADE6A6E5DDB6AB1C27D2E68971AB70E5AB
##  5 0E99B2AE0D118C7D453B9F98EA6504034820AEB799983801DFEB71A02B93BD70
##  6 2C40BF34F5258A5CE974349E7A47BBEB672BCD9205439BFA819F59195183E43A
##  7 2EB1E8760F783C0175A2B9665EE7CACD017F37F3DBCCCC6A299EA152BF8E1E34
##  8 0A95E06C9C01378F5EDC8A542E9C1B69CC439CEF64BB5D67A3A15D3C5C7FFDBF
##  9 2A017D1086E410E6C47E27A82A2A009F38C30040EB8AAF348BFB08D568E58A18
## 10 3DBC24323C9CF88B670F3BDE57331A4545A5FFEAB2278318BF8A8BA276E79517
## # ... with 12 more variables: virus_type <chr>, virus_family <chr>,
## #   magic <chr>, max_section_entropy <dbl>, num_sections <int>,
## #   min_section_entropy <dbl>, image_base <int>, import_table_size <int>,
## #   compile_timestamp <int>, file_size <int>,
## #   non_standard_section_names <chr>, peid <chr>

11. How many rows have ‘DLL’ substring in the textual column `magic`? (~2 lines)

# Number of rows whose `magic` description mentions "DLL".
nrow(filter(d, grepl("DLL", magic)))

## [1] 42

12. Calculate frequency of all unique values in the column virus_type (~1 line)

# Number of samples per virus type, rendered as a markdown table.
kkable(count(d, virus_type))

virus_type	n
infector	46
ransomware	29
worm	40

13. Replace values in the column `file_size` such that they reflect MB instead of bytes (~1 line)

# Overwriting a source column is (almost) never a good idea, so the size in
# MB goes into a new column and the byte count stays untouched.
dfix <- mutate(d, file_size_mb = file_size / 2^20)

14. Calculate the number of unique pairs (`virus_family`, `virus_type`)

# Number of samples for every (virus_type, virus_family) pair present.
kkable(count(d, virus_type, virus_family))

virus_type	virus_family	n
infector	lamer	23
infector	ramnit	23
ransomware	cerber	5
ransomware	cryptolocker	15
ransomware	cryptxxx	9
worm	conficker	15
worm	xindl	25

15. Calculate frequency of all unique values in `virus_family` and % from the total number of values (~ 3 lines)

# Samples per virus family, plus each family's share of all rows in percent.
kkable(
  mutate(
    count(d, virus_family),
    perc = 100 * n / sum(n)
  )
)

virus_family	n	perc
cerber	5	4.347826
conficker	15	13.043478
cryptolocker	15	13.043478
cryptxxx	9	7.826087
lamer	23	20.000000
ramnit	23	20.000000
xindl	25	21.739130

16. Create cross-tab for `peid` and `virus_family` (~1 line)

# Long-format cross-tabulation: one row per observed (peid, virus_family)
# combination with its count.
kkable(count(d, peid, virus_family))

peid	virus_family	n
ASPack	ramnit	1
Borland Delphi	cryptxxx	2
CAB self-extract	lamer	1
Custom packer	cerber	2
Custom packer	ramnit	9
Microsoft Visual Basic	xindl	15
Microsoft Visual C++	conficker	7
Microsoft Visual C++	cryptolocker	1
Microsoft Visual C++	cryptxxx	2
Nothing found	cerber	1
Nothing found	conficker	8
Nothing found	cryptolocker	14
Nothing found	cryptxxx	5
Nothing found	lamer	13
Nothing found	ramnit	13
RAR self-extract	lamer	1
UPX	xindl	10
ZIP self-extract	cerber	2
ZIP self-extract	lamer	8

17. Unique values of textual column ‘magic’ with 3 ways (~3 lines)

# Distinct `magic` values, in the order count() emits its groups.
d %>%
  count(magic) %>%
  pull(magic)

## [1] "PE32 executable (DLL) (console) Intel 80386, for MS Windows"                       
## [2] "PE32 executable (DLL) (GUI) Intel 80386 (stripped to external PDB), for MS Windows"
## [3] "PE32 executable (DLL) (GUI) Intel 80386, for MS Windows"                           
## [4] "PE32 executable (GUI) Intel 80386, for MS Windows"                                 
## [5] "PE32 executable (GUI) Intel 80386, for MS Windows, UPX compressed"

# Distinct `magic` values, in order of first appearance.
unique(d$magic)

## [1] "PE32 executable (DLL) (GUI) Intel 80386, for MS Windows"                           
## [2] "PE32 executable (DLL) (GUI) Intel 80386 (stripped to external PDB), for MS Windows"
## [3] "PE32 executable (GUI) Intel 80386, for MS Windows, UPX compressed"                 
## [4] "PE32 executable (GUI) Intel 80386, for MS Windows"                                 
## [5] "PE32 executable (DLL) (console) Intel 80386, for MS Windows"

18. Find a row with the max `file_size` (~1 line)

# Row(s) whose `file_size` ranks first in descending order; ties are kept.
d %>%
  filter(min_rank(desc(file_size)) <= 1) %>%
  kkable()

sha256	virus_type	virus_family	magic	max_section_entropy	num_sections	min_section_entropy	image_base	import_table_size	compile_timestamp	file_size	non_standard_section_names	peid
0A95E06C9C01378F5EDC8A542E9C1B69CC439CEF64BB5D67A3A15D3C5C7FFDBF	infector	lamer	PE32 executable (GUI) Intel 80386, for MS Windows	5.894445	5	0.1841507	4194304	6	1301791803	6752860	True	ZIP self-extract

19. Descriptive statistics for all numeric features grouped by `virus_family`

# For every virus family, summary() of each numeric column; the result is a
# list named by family whose elements are lists named by column.
lapply(split(d, d$virus_family), function(family_rows) {
  lapply(Filter(is.numeric, as.list(family_rows)), summary)
})

## $cerber
## $cerber$max_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.225   6.246   6.411   6.681   6.619   7.903 
## 
## $cerber$num_sections
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     4.0     4.0     4.0     4.2     4.0     5.0 
## 
## $cerber$min_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2404  0.2444  1.5118  1.8988  3.3178  4.1794 
## 
## $cerber$image_base
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 4194304 4194304 4194304 4194304 4194304 4194304 
## 
## $cerber$import_table_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     3.0     4.0     4.0     5.8     7.0    11.0 
## 
## $cerber$compile_timestamp
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 1.099e+09 1.190e+09 1.456e+09 1.332e+09 1.457e+09 1.459e+09 
## 
## $cerber$file_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  122292  123913  176128  215129  322048  331264 
## 
## 
## $conficker
## $conficker$max_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   7.965   7.973   7.979   7.979   7.984   7.993 
## 
## $conficker$num_sections
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       4       4       4       4       4       4 
## 
## $conficker$min_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.285   4.477   4.563   4.599   4.736   4.973 
## 
## $conficker$image_base
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 268435456 268435456 268435456 268435456 268435456 268435456 
## 
## $conficker$import_table_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.000   7.000   7.000   6.933   7.000   7.000 
## 
## $conficker$compile_timestamp
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 8.695e+08 9.401e+08 1.008e+09 1.007e+09 1.095e+09 1.161e+09 
## 
## $conficker$file_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  115856  159165  163729  158742  165412  167764 
## 
## 
## $cryptolocker
## $cryptolocker$max_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.711   6.798   6.904   6.912   6.938   7.698 
## 
## $cryptolocker$num_sections
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.000   4.000   4.000   4.067   4.000   5.000 
## 
## $cryptolocker$min_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.257   3.447   3.493   3.455   3.512   3.526 
## 
## $cryptolocker$image_base
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 4194304 4194304 4194304 4194304 4194304 4194304 
## 
## $cryptolocker$import_table_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.000   5.000   6.000   5.867   6.000   7.000 
## 
## $cryptolocker$compile_timestamp
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 1.081e+09 1.145e+09 1.160e+09 1.177e+09 1.197e+09 1.454e+09 
## 
## $cryptolocker$file_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   98304  186368  192512  285355  382976  581632 
## 
## 
## $cryptxxx
## $cryptxxx$max_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.443   6.635   6.913   6.864   6.981   7.345 
## 
## $cryptxxx$num_sections
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.000   7.000   7.000   7.222   8.000   8.000 
## 
## $cryptxxx$min_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4904  0.7957  2.2006 
## 
## $cryptxxx$image_base
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   4194304 184549376 268435456 191073849 268435456 268435456 
## 
## $cryptxxx$import_table_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00    5.00    8.00   10.11   12.00   22.00 
## 
## $cryptxxx$compile_timestamp
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 7.090e+08 1.120e+09 1.145e+09 1.145e+09 1.342e+09 1.466e+09 
## 
## $cryptxxx$file_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   69120  188416  242176  275285  398848  465920 
## 
## 
## $lamer
## $lamer$max_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.894   5.894   5.894   6.002   6.142   6.142 
## 
## $lamer$num_sections
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       5       5       5       5       5       5 
## 
## $lamer$min_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1842  0.1842  0.1842  0.1842  0.1842  0.1842 
## 
## $lamer$image_base
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 4194304 4194304 4194304 4194304 4194304 4194304 
## 
## $lamer$import_table_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       6       6       6       6       6       6 
## 
## $lamer$compile_timestamp
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 1.302e+09 1.302e+09 1.302e+09 1.302e+09 1.302e+09 1.302e+09 
## 
## $lamer$file_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  276106 1314298 1923221 2525708 3105400 6752860 
## 
## 
## $ramnit
## $ramnit$max_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.706   3.706   3.706   5.416   7.583   7.972 
## 
## $ramnit$num_sections
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.000   4.000   4.000   4.652   5.500   7.000 
## 
## $ramnit$min_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.4244  0.4244  1.0497  0.5116  5.4093 
## 
## $ramnit$image_base
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 4.194e+06 2.684e+08 2.684e+08 3.992e+08 2.684e+08 1.997e+09 
## 
## $ramnit$import_table_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00    1.00    2.87    4.00   10.00 
## 
## $ramnit$compile_timestamp
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 8.851e+08 1.205e+09 1.294e+09 1.258e+09 1.294e+09 1.506e+09 
## 
## $ramnit$file_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3584    3584    3584  277798  233430 1839104 
## 
## 
## $xindl
## $xindl$max_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.718   5.718   5.718   6.612   7.953   7.954 
## 
## $xindl$num_sections
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       3       3       3       3       3       3 
## 
## $xindl$min_section_entropy
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0 
## 
## $xindl$image_base
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 4194304 4194304 4194304 4194304 4194304 4194304 
## 
## $xindl$import_table_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0     1.0     1.0     1.4     2.0     2.0 
## 
## $xindl$compile_timestamp
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 1.165e+09 1.165e+09 1.165e+09 1.165e+09 1.165e+09 1.165e+09 
## 
## $xindl$file_size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  144049  200936  229376  230033  278528  326888

# can be polished more to produce a table

20. Create a new column in which you will have an average `file_size` for current `virus_family` (~2 lines)

# Add each family's mean file size to every row of that family, then show the
# family and its mean first, followed by the remaining columns from
# `virus_type` through `peid` (the hash column is left out of the display).
# The result stays grouped by `virus_family`.
d %>%
  group_by(virus_family) %>%
  mutate(avg_file_size = mean(file_size)) %>%
  select(virus_family, avg_file_size, virus_type:peid)

## # A tibble: 115 x 13
## # Groups:   virus_family [7]
##    virus_family avg_file_size virus_type
##           <chr>         <dbl>      <chr>
##  1     cryptxxx      275285.3 ransomware
##  2     cryptxxx      275285.3 ransomware
##  3     cryptxxx      275285.3 ransomware
##  4     cryptxxx      275285.3 ransomware
##  5     cryptxxx      275285.3 ransomware
##  6     cryptxxx      275285.3 ransomware
##  7     cryptxxx      275285.3 ransomware
##  8     cryptxxx      275285.3 ransomware
##  9     cryptxxx      275285.3 ransomware
## 10       cerber      215129.0 ransomware
## # ... with 105 more rows, and 10 more variables: magic <chr>,
## #   max_section_entropy <dbl>, num_sections <int>,
## #   min_section_entropy <dbl>, image_base <int>, import_table_size <int>,
## #   compile_timestamp <int>, file_size <int>,
## #   non_standard_section_names <chr>, peid <chr>

21. Draw histogram of `file_size` for every `virus_family` in one plot (~1 line)

# Overlaying the families in a single panel would be unreadable (too many
# distinct values), so each family gets its own stacked panel instead.
ggplot(d, aes(x = file_size)) +
  geom_histogram() +
  facet_wrap(~ virus_family, ncol = 1, strip.position = "left")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

22. Draw boxplot for `max_section_entropy` (~1 line)

# Boxplots normally take a categorical variable on the x axis; there is none
# here, so the constant 1 serves as a placeholder x value.
ggplot(d, aes(x = 1, y = max_section_entropy)) +
  geom_boxplot()

23. Draw boxplots of `import_table_size` for each `virus_type` (3 boxplots in one plot, ~1 line)

# One boxplot of import-table size per virus type, side by side.
ggplot(d, aes(x = virus_type, y = import_table_size)) +
  geom_boxplot()

24. Create a new DataFrame with one-hot encoded `peid` column (~1 line)

# One-hot encode `peid`: one logical column per distinct `peid` value, one row
# per row of `d`, in the same row order as `d`.
#
# spread() returns its rows sorted by the identifying column(s). Keying on
# `sha256` therefore reordered the rows relative to `d`, so binding the result
# column-wise to other frames derived from `d` paired each sample with another
# sample's dummies. Keying on the row number keeps the original order and also
# works if a hash occurs more than once.
peid_dummies <- d %>%
  transmute(row_id = row_number(), peid, one = TRUE) %>%
  spread(peid, one, fill = FALSE) %>%
  select(-row_id)

25. Create a DF with only numeric features from `data` (~1 line)

# Keep only the numeric (integer or double) columns of `d`.
data_num <- d[vapply(d, is.numeric, logical(1))]

26. Concatenate two DataFrames into one: `data_num` and `peid_dummies` (stack horizontally, ~1 line)

# Place the one-hot columns to the right of the numeric features; rows are
# matched by position.
data_num %>%
  bind_cols(peid_dummies)

## # A tibble: 115 x 17
##    max_section_entropy num_sections min_section_entropy image_base
##                  <dbl>        <int>               <dbl>      <int>
##  1            6.635195            7           0.0000000    4194304
##  2            6.442698            7           1.4170280  184549376
##  3            7.171786            7           2.2005675  184549376
##  4            6.789311            8           0.0000000  268435456
##  5            6.934068            8           0.0000000  268435456
##  6            6.981491            8           0.0000000  268435456
##  7            7.345267            5           0.7957287  268435456
##  8            6.913324            8           0.0000000  268435456
##  9            6.566401            7           0.0000000    4194304
## 10            6.246415            4           0.2443513    4194304
## # ... with 105 more rows, and 13 more variables: import_table_size <int>,
## #   compile_timestamp <int>, file_size <int>, ASPack <lgl>, `Borland
## #   Delphi` <lgl>, `CAB self-extract` <lgl>, `Custom packer` <lgl>,
## #   `Microsoft Visual Basic` <lgl>, `Microsoft Visual C++` <lgl>, `Nothing
## #   found` <lgl>, `RAR self-extract` <lgl>, UPX <lgl>, `ZIP
## #   self-extract` <lgl>

27. Save dataset X to a CSV file `X.csv`

# Write the combined feature matrix (numeric features + one-hot `peid`) to disk.
write_csv(bind_cols(data_num, peid_dummies), "data/X.csv")

28. Drop columns `compile_timestamp` and `image_base`

# Everything except the two dropped columns.
select(d, -c(compile_timestamp, image_base))

## # A tibble: 115 x 11
##                                                              sha256
##                                                               <chr>
##  1 336C46AEC4D01E3E44388AB8A537474D056FA79B1F00025999E514E1B4B3FD2B
##  2 E07FF347A27EB18A9863FAB951D218DCFEC84E19623D8C2826BC0268FDB79450
##  3 01C225263D70D9DE177B31C450F5E21E5703E93BE2F8C5D7AD3F17539E143C58
##  4 0FD2A07237A41F6FC64C6D243D9C5DD269848120B46B489BF5054CAD0CB8D04F
##  5 E225839539E22AC7FB975E622F3FE0B4E4D2D040AA539D10B756FC91D92E1E07
##  6 5B75A8F0DA6021F1B41BDC7D23CFE63A25FFAEACB8C8F9940FB0603AD9848CF1
##  7 6B8946A144F05C56C81A77DF3B15C68D1344771AAC812BFB80CD199F058D13CA
##  8 78AAF6530035F26B6DF645E908C10E41692351E7302304A2B35ADCB73D916432
##  9 AC99BA525C11C399BF977E2084E7E21591181F01102C22F82B83CEE7D6CABAE5
## 10 BCDF7A4F4E0EEFD55EC0A814B382559C815106CB7820C93E7BB8A8E216E8C78D
## # ... with 105 more rows, and 10 more variables: virus_type <chr>,
## #   virus_family <chr>, magic <chr>, max_section_entropy <dbl>,
## #   num_sections <int>, min_section_entropy <dbl>,
## #   import_table_size <int>, file_size <int>,
## #   non_standard_section_names <chr>, peid <chr>

29. Create a Series `y` from the column `virus_type`

# no Series in R ;)

30. Create a simple Random Forest classifier for X and y

# no more time ;)

Tidyverse ftw! (solution_libor)

liborm

2/14/2018

Hacky convert to Rmd

`kkable`

One click publish to web from R Studio

1. Read CSV files (~3 lines)

2. Number of rows and number of columns (~3 lines)

3. Display first several rows (~3 lines)

4. Display list of column names for every DF (~3 lines)

5. Merge 3 DFs into one DF (~1 line)

6. For every feature (column) calculate descriptive statistics (~1 line)

7. Check the type of every column and if the dataset has null values (~1 line)

8. Select all rows where virus_type = ‘ransomware’ with 3 different ways (~3 lines)

9. Draw histogram of the `file_size` for `virus_type` == "infector" (~ 1 line)

10. Select rows where `virus_type` is "infector" and `file_size` > 2MB

11. How many rows have ‘DLL’ substring in the textual column `magic`? (~2 lines)

12. Calculate frequency of all unique values in the column virus_type (~1 line)

13. Replace values in the column `file_size` such that they reflect MB instead of bytes (~1 line)

14. Calculate the number of unique pairs (`virus_family`, `virus_type`)

15. Calculate frequency of all unique values in `virus_family` and % from the total number of values (~ 3 lines)

16. Create cross-tab for `peid` and `virus_family` (~1 line)

17. Unique values of textual column ‘magic’ with 3 ways (~3 lines)

18. Find a row with the max `file_size` (~1 line)

19. Descriptive statistics for all numeric features grouped by `virus_family`

20. Create a new column in which you will have an average `file_size` for current `virus_family` (~2 lines)

21. Draw histogram of `file_size` for every `virus_family` in one plot (~1 line)

22. Draw boxplot for `max_section_entropy` (~1 line)

23. Draw boxplots of `import_table_size` for each `virus_type` (3 boxplots in one plot, ~1 line)

24. Create a new DataFrame with one-hot encoded `peid` column (~1 line)

25. Create a DF with only numeric features from `data` (~1 line)

26. Concatenate two DataFrames into one: `data_num` and `peid_dummies` (stack horizontally, ~1 line)

27. Save dataset X to a CSV file `X.csv`

28. Drop columns `compile_timestamp` and `image_base`

29. Create a Series `y` from the column `virus_type`

30. Create a simple Random Forest classifier for X and y

Tidyverse ftw! (solution_libor)

liborm

2/14/2018

Hacky convert to Rmd

kkable

One click publish to web from R Studio

1. Read CSV files (~3 lines)

2. Number of rows and number of columns (~3 lines)

3. Display first several rows (~3 lines)

4. Display list of column names for every DF (~3 lines)

5. Merge 3 DFs into one DF (~1 line)

6. For every feature (column) calculate descriptive statistics (~1 line)

7. Check the type of every column and if the dataset has null values (~1 line)

8. Select all rows where virus_type = ‘ransomware’ with 3 different ways (~3 lines)

9. Draw histogram of the file_size for virus_type == "infector" (~ 1 line)

10. Select rows where virus_type is "infector" and file_size > 2MB

11. How many rows have ‘DLL’ substring in the textual column magic? (~2 lines)

12. Calculate frequency of all unique values in the column virus_type (~1 line)

13. Replace values in the column file_size such that they reflect MB instead of bytes (~1 line)

14. Calculate the number of unique pairs (virus_family, virus_type)

15. Calculate frequency of all unique values in virus_family and % from the total number of values (~ 3 lines)

16. Create cross-tab for peid and virus_family (~1 line)

17. Unique values of textual column ‘magic’ with 3 ways (~3 lines)

18. Find a row with the max file_size (~1 line)

19. Descriptive statistics for all numeric features grouped by virus_family

20. Create a new column in which you will have an average file_size for current virus_family (~2 lines)

21. Draw histogram of file_size for every virus_family in one plot (~1 line)

22. Draw boxplot for max_section_entropy (~1 line)

23. Draw boxplots of import_table_size for each virus_type (3 boxplots in one plot, ~1 line)

24. Create a new DataFrame with one-hot encoded peid column (~1 line)

25. Create a DF with only numeric features from data (~1 line)

26. Concatenate two DataFrames into one: data_num and peid_dummies (stack horizontally, ~1 line)

27. Save dataset X to a CSV file X.csv

28. Drop columns compile_timestamp and image_base

29. Create a Series y from the column virus_type

30. Create a simple Random Forest classifier for X and y

`kkable`

9. Draw histogram of the `file_size` for `virus_type` == "infector" (~ 1 line)

10. Select rows where `virus_type` is "infector" and `file_size` > 2MB

11. How many rows have ‘DLL’ substring in the textual column `magic`? (~2 lines)

13. Replace values in the column `file_size` such that they reflect MB instead of bytes (~1 line)

14. Calculate the number of unique pairs (`virus_family`, `virus_type`)

15. Calculate frequency of all unique values in `virus_family` and % from the total number of values (~ 3 lines)

16. Create cross-tab for `peid` and `virus_family` (~1 line)

18. Find a row with the max `file_size` (~1 line)

19. Descriptive statistics for all numeric features grouped by `virus_family`

20. Create a new column in which you will have an average `file_size` for current `virus_family` (~2 lines)

21. Draw histogram of `file_size` for every `virus_family` in one plot (~1 line)

22. Draw boxplot for `max_section_entropy` (~1 line)

23. Draw boxplots of `import_table_size` for each `virus_type` (3 boxplots in one plot, ~1 line)

24. Create a new DataFrame with one-hot encoded `peid` column (~1 line)

25. Create a DF with only numeric features from `data` (~1 line)

26. Concatenate two DataFrames into one: `data_num` and `peid_dummies` (stack horizontally, ~1 line)

27. Save dataset X to a CSV file `X.csv`

28. Drop columns `compile_timestamp` and `image_base`

29. Create a Series `y` from the column `virus_type`