Hacky convert to Rmd
# Keep only the notebook lines containing '###' (the exercise headings),
# strip the JSON string residue around them (the first ` "`, the literal
# `\n"` sequence, a trailing comma), then append an empty ```{r} chunk
# after every heading and write the result to solution_libor.Rmd.
# NOTE(review): `\n` inside the sed replacement text presumably relies on
# GNU sed semantics -- confirm before running with another sed.
<Template_Solution.ipynb grep '###' |
sed -e 's/ "//'\
-e 's/\\n"//'\
-e 's/,$//'\
-e 's/$/\n```{r}\n\n```/'\
> solution_libor.Rmd
kkable
The construct
xxx %>%
kkable
is only used to display a nice table in markdown.
One click publish to web from R Studio
The rendered document ends up on RPubs.
1. Read CSV files (~3 lines)
# Load data/families.csv into `dfam`.
dfam <- read_csv("data/families.csv")
## Parsed with column specification:
## cols(
## sha256 = col_character(),
## virus_type = col_character(),
## virus_family = col_character(),
## magic = col_character()
## )
# Load data/features.csv into `dfeat`.
dfeat <- read_csv("data/features.csv")
## Parsed with column specification:
## cols(
## max_section_entropy = col_double(),
## num_sections = col_integer(),
## min_section_entropy = col_double(),
## image_base = col_integer(),
## import_table_size = col_integer(),
## compile_timestamp = col_integer(),
## file_size = col_integer(),
## sha256 = col_character(),
## non_standard_section_names = col_character()
## )
# Load data/peid.csv into `dpeid`.
dpeid <- read_csv("data/peid.csv")
## Parsed with column specification:
## cols(
## sha256 = col_character(),
## peid = col_character()
## )
2. Number of rows and number of columns (~3 lines)
# Rows and columns of the families table.
dfam %>% dim()
## [1] 115 4
# Rows and columns of the features table.
dfeat %>% dim()
## [1] 115 9
# Rows and columns of the PEiD table.
dpeid %>% dim()
## [1] 115 2
3. Display first several rows (~3 lines)
# First rows of the families table (head() default row count).
dfam %>% head()
## # A tibble: 6 x 4
## sha256
## <chr>
## 1 336C46AEC4D01E3E44388AB8A537474D056FA79B1F00025999E514E1B4B3FD2B
## 2 E07FF347A27EB18A9863FAB951D218DCFEC84E19623D8C2826BC0268FDB79450
## 3 01C225263D70D9DE177B31C450F5E21E5703E93BE2F8C5D7AD3F17539E143C58
## 4 0FD2A07237A41F6FC64C6D243D9C5DD269848120B46B489BF5054CAD0CB8D04F
## 5 E225839539E22AC7FB975E622F3FE0B4E4D2D040AA539D10B756FC91D92E1E07
## 6 5B75A8F0DA6021F1B41BDC7D23CFE63A25FFAEACB8C8F9940FB0603AD9848CF1
## # ... with 3 more variables: virus_type <chr>, virus_family <chr>,
## # magic <chr>
4. Display list of column names for every DF (~3 lines)
# Column names of the families table.
names(dfam)
## [1] "sha256" "virus_type" "virus_family" "magic"
5. Merge 3 DFs into one DF (~1 line)
# Attach the feature and PEiD tables to the families table, matching rows
# on the sha256 key; the combined table is stored as `d`.
d <- dfam %>%
  left_join(dfeat, by = "sha256") %>%
  left_join(dpeid, by = "sha256")
6. For every feature (column) calculate descriptive statistics (~1 line)
# Turn the table into a list of columns and run summary() on each one.
d %>%
  as.list() %>%
  map(summary)
## $sha256
## Length Class Mode
## 115 character character
##
## $virus_type
## Length Class Mode
## 115 character character
##
## $virus_family
## Length Class Mode
## 115 character character
##
## $magic
## Length Class Mode
## 115 character character
##
## $max_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.706 5.894 6.635 6.491 7.914 7.993
##
## $num_sections
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 4.000 4.000 4.383 5.000 8.000
##
## $min_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.2008 1.4183 3.4726 5.4093
##
## $image_base
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.194e+06 4.194e+06 4.194e+06 1.323e+08 2.684e+08 1.997e+09
##
## $import_table_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 6.000 4.791 6.000 22.000
##
## $compile_timestamp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.090e+08 1.158e+09 1.165e+09 1.198e+09 1.302e+09 1.506e+09
##
## $file_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3584 160690 200936 699532 561152 6752860
##
## $non_standard_section_names
## Length Class Mode
## 115 character character
##
## $peid
## Length Class Mode
## 115 character character
7. Check the type of every column and whether the dataset has null values (~1 line)
# One row per column: whether it contains any NA, and its class.
# The class column is intentionally left unnamed so the table header
# stays `class(.x)`.
d %>%
  as.list() %>%
  map_dfr(~ tibble(hasNA = anyNA(.x), class(.x)), .id = "col") %>%
  kkable()
|
col
|
hasNA
|
class(.x)
|
|
sha256
|
FALSE
|
character
|
|
virus_type
|
FALSE
|
character
|
|
virus_family
|
FALSE
|
character
|
|
magic
|
FALSE
|
character
|
|
max_section_entropy
|
FALSE
|
numeric
|
|
num_sections
|
FALSE
|
integer
|
|
min_section_entropy
|
FALSE
|
numeric
|
|
image_base
|
FALSE
|
integer
|
|
import_table_size
|
FALSE
|
integer
|
|
compile_timestamp
|
FALSE
|
integer
|
|
file_size
|
FALSE
|
integer
|
|
non_standard_section_names
|
FALSE
|
character
|
|
peid
|
FALSE
|
character
|
8. Select all rows where virus_type = ‘ransomware’ in 3 different ways (~3 lines)
# Keep only the rows whose virus_type is "ransomware".
d %>%
  filter(virus_type == "ransomware")
## # A tibble: 29 x 13
## sha256
## <chr>
## 1 336C46AEC4D01E3E44388AB8A537474D056FA79B1F00025999E514E1B4B3FD2B
## 2 E07FF347A27EB18A9863FAB951D218DCFEC84E19623D8C2826BC0268FDB79450
## 3 01C225263D70D9DE177B31C450F5E21E5703E93BE2F8C5D7AD3F17539E143C58
## 4 0FD2A07237A41F6FC64C6D243D9C5DD269848120B46B489BF5054CAD0CB8D04F
## 5 E225839539E22AC7FB975E622F3FE0B4E4D2D040AA539D10B756FC91D92E1E07
## 6 5B75A8F0DA6021F1B41BDC7D23CFE63A25FFAEACB8C8F9940FB0603AD9848CF1
## 7 6B8946A144F05C56C81A77DF3B15C68D1344771AAC812BFB80CD199F058D13CA
## 8 78AAF6530035F26B6DF645E908C10E41692351E7302304A2B35ADCB73D916432
## 9 AC99BA525C11C399BF977E2084E7E21591181F01102C22F82B83CEE7D6CABAE5
## 10 BCDF7A4F4E0EEFD55EC0A814B382559C815106CB7820C93E7BB8A8E216E8C78D
## # ... with 19 more rows, and 12 more variables: virus_type <chr>,
## # virus_family <chr>, magic <chr>, max_section_entropy <dbl>,
## # num_sections <int>, min_section_entropy <dbl>, image_base <int>,
## # import_table_size <int>, compile_timestamp <int>, file_size <int>,
## # non_standard_section_names <chr>, peid <chr>
9. Draw a histogram of the file_size for virus_type == "infector" (~1 line)
# Histogram of file_size restricted to the "infector" rows.
d %>%
  filter(virus_type == "infector") %>%
  ggplot(aes(x = file_size)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

10. Select rows where virus_type is "infector" and file size > 2MB
# "infector" rows larger than 2 MB; 2 * 1024^2 is 2 MB expressed in bytes.
d %>%
  filter(virus_type == "infector", file_size > 2 * 1024^2)
## # A tibble: 10 x 13
## sha256
## <chr>
## 1 2C3F5105B5E29460731B2E22DED720908A537017EDC0E6C9802D152427295143
## 2 2FBAA0DE5541A3BE9C73D49CEA29FF0CA90BCADED3D0F6620B65A479F4E5031A
## 3 0BCFB7AB7132E16B93BD19A3009A6CF7AAC4F61A80506FD948DEAF7AD35DA9C8
## 4 3ABBECADD592F277E6FDF1A22853DBADE6A6E5DDB6AB1C27D2E68971AB70E5AB
## 5 0E99B2AE0D118C7D453B9F98EA6504034820AEB799983801DFEB71A02B93BD70
## 6 2C40BF34F5258A5CE974349E7A47BBEB672BCD9205439BFA819F59195183E43A
## 7 2EB1E8760F783C0175A2B9665EE7CACD017F37F3DBCCCC6A299EA152BF8E1E34
## 8 0A95E06C9C01378F5EDC8A542E9C1B69CC439CEF64BB5D67A3A15D3C5C7FFDBF
## 9 2A017D1086E410E6C47E27A82A2A009F38C30040EB8AAF348BFB08D568E58A18
## 10 3DBC24323C9CF88B670F3BDE57331A4545A5FFEAB2278318BF8A8BA276E79517
## # ... with 12 more variables: virus_type <chr>, virus_family <chr>,
## # magic <chr>, max_section_entropy <dbl>, num_sections <int>,
## # min_section_entropy <dbl>, image_base <int>, import_table_size <int>,
## # compile_timestamp <int>, file_size <int>,
## # non_standard_section_names <chr>, peid <chr>
11. How many rows have the ‘DLL’ substring in the textual column magic? (~2 lines)
# Number of rows whose `magic` text contains "DLL".
d %>%
  filter(grepl("DLL", magic)) %>%
  nrow()
## [1] 42
12. Calculate frequency of all unique values in the column virus_type (~1 line)
# Row count per distinct virus_type, rendered as a table.
d %>%
  count(virus_type) %>%
  kkable()
|
virus_type
|
n
|
|
infector
|
46
|
|
ransomware
|
29
|
|
worm
|
40
|
13. Replace values in the column file_size such that they reflect MB instead of bytes (~1 line)
# (almost) never replace columns in your dataset!
# The size in MB goes into a new column; file_size (bytes) is kept as is.
dfix <- d %>%
  mutate(file_size_mb = file_size / 1024^2)
14. Calculate the number of unique pairs (virus_family, virus_type)
# One row per (virus_type, virus_family) pair with its row count.
d %>%
  count(virus_type, virus_family) %>%
  kkable()
|
virus_type
|
virus_family
|
n
|
|
infector
|
lamer
|
23
|
|
infector
|
ramnit
|
23
|
|
ransomware
|
cerber
|
5
|
|
ransomware
|
cryptolocker
|
15
|
|
ransomware
|
cryptxxx
|
9
|
|
worm
|
conficker
|
15
|
|
worm
|
xindl
|
25
|
15. Calculate frequency of all unique values in virus_family and % from the total number of values (~3 lines)
# Row count per virus_family plus its share of all rows, in percent.
d %>%
  count(virus_family) %>%
  mutate(perc = 100 * n / sum(n)) %>%
  kkable()
|
virus_family
|
n
|
perc
|
|
cerber
|
5
|
4.347826
|
|
conficker
|
15
|
13.043478
|
|
cryptolocker
|
15
|
13.043478
|
|
cryptxxx
|
9
|
7.826087
|
|
lamer
|
23
|
20.000000
|
|
ramnit
|
23
|
20.000000
|
|
xindl
|
25
|
21.739130
|
16. Create cross-tab for peid and virus_family (~1 line)
# Row count for every (peid, virus_family) combination present in the data
# (long format: one row per combination).
d %>%
  count(peid, virus_family) %>%
  kkable()
|
peid
|
virus_family
|
n
|
|
ASPack
|
ramnit
|
1
|
|
Borland Delphi
|
cryptxxx
|
2
|
|
CAB self-extract
|
lamer
|
1
|
|
Custom packer
|
cerber
|
2
|
|
Custom packer
|
ramnit
|
9
|
|
Microsoft Visual Basic
|
xindl
|
15
|
|
Microsoft Visual C++
|
conficker
|
7
|
|
Microsoft Visual C++
|
cryptolocker
|
1
|
|
Microsoft Visual C++
|
cryptxxx
|
2
|
|
Nothing found
|
cerber
|
1
|
|
Nothing found
|
conficker
|
8
|
|
Nothing found
|
cryptolocker
|
14
|
|
Nothing found
|
cryptxxx
|
5
|
|
Nothing found
|
lamer
|
13
|
|
Nothing found
|
ramnit
|
13
|
|
RAR self-extract
|
lamer
|
1
|
|
UPX
|
xindl
|
10
|
|
ZIP self-extract
|
cerber
|
2
|
|
ZIP self-extract
|
lamer
|
8
|
17. Unique values of textual column ‘magic’ with 3 ways (~3 lines)
# Distinct `magic` values, taken from the key column of a count table.
d %>%
  count(magic) %>%
  .[["magic"]]
## [1] "PE32 executable (DLL) (console) Intel 80386, for MS Windows"
## [2] "PE32 executable (DLL) (GUI) Intel 80386 (stripped to external PDB), for MS Windows"
## [3] "PE32 executable (DLL) (GUI) Intel 80386, for MS Windows"
## [4] "PE32 executable (GUI) Intel 80386, for MS Windows"
## [5] "PE32 executable (GUI) Intel 80386, for MS Windows, UPX compressed"
# Distinct `magic` values in order of first appearance.
unique(d$magic)
## [1] "PE32 executable (DLL) (GUI) Intel 80386, for MS Windows"
## [2] "PE32 executable (DLL) (GUI) Intel 80386 (stripped to external PDB), for MS Windows"
## [3] "PE32 executable (GUI) Intel 80386, for MS Windows, UPX compressed"
## [4] "PE32 executable (GUI) Intel 80386, for MS Windows"
## [5] "PE32 executable (DLL) (console) Intel 80386, for MS Windows"
18. Find a row with the max file_size (~1 line)
# Row(s) with the largest file_size, rendered as a table.
d %>%
  top_n(n = 1, wt = file_size) %>%
  kkable()
|
sha256
|
virus_type
|
virus_family
|
magic
|
max_section_entropy
|
num_sections
|
min_section_entropy
|
image_base
|
import_table_size
|
compile_timestamp
|
file_size
|
non_standard_section_names
|
peid
|
|
0A95E06C9C01378F5EDC8A542E9C1B69CC439CEF64BB5D67A3A15D3C5C7FFDBF
|
infector
|
lamer
|
PE32 executable (GUI) Intel 80386, for MS Windows
|
5.894445
|
5
|
0.1841507
|
4194304
|
6
|
1301791803
|
6752860
|
True
|
ZIP self-extract
|
19. Descriptive statistics for all numeric features grouped by virus_family
# Split the table by virus_family, then within each piece run summary()
# on every numeric column.
d %>%
  split(.$virus_family) %>%
  map(~ .x %>%
    as.list() %>%
    keep(is.numeric) %>%
    map(summary))
## $cerber
## $cerber$max_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.225 6.246 6.411 6.681 6.619 7.903
##
## $cerber$num_sections
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.0 4.0 4.0 4.2 4.0 5.0
##
## $cerber$min_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2404 0.2444 1.5118 1.8988 3.3178 4.1794
##
## $cerber$image_base
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4194304 4194304 4194304 4194304 4194304 4194304
##
## $cerber$import_table_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.0 4.0 4.0 5.8 7.0 11.0
##
## $cerber$compile_timestamp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.099e+09 1.190e+09 1.456e+09 1.332e+09 1.457e+09 1.459e+09
##
## $cerber$file_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 122292 123913 176128 215129 322048 331264
##
##
## $conficker
## $conficker$max_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.965 7.973 7.979 7.979 7.984 7.993
##
## $conficker$num_sections
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4 4 4 4 4 4
##
## $conficker$min_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.285 4.477 4.563 4.599 4.736 4.973
##
## $conficker$image_base
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 268435456 268435456 268435456 268435456 268435456 268435456
##
## $conficker$import_table_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.000 7.000 7.000 6.933 7.000 7.000
##
## $conficker$compile_timestamp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.695e+08 9.401e+08 1.008e+09 1.007e+09 1.095e+09 1.161e+09
##
## $conficker$file_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 115856 159165 163729 158742 165412 167764
##
##
## $cryptolocker
## $cryptolocker$max_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.711 6.798 6.904 6.912 6.938 7.698
##
## $cryptolocker$num_sections
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.000 4.000 4.000 4.067 4.000 5.000
##
## $cryptolocker$min_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.257 3.447 3.493 3.455 3.512 3.526
##
## $cryptolocker$image_base
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4194304 4194304 4194304 4194304 4194304 4194304
##
## $cryptolocker$import_table_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.000 5.000 6.000 5.867 6.000 7.000
##
## $cryptolocker$compile_timestamp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.081e+09 1.145e+09 1.160e+09 1.177e+09 1.197e+09 1.454e+09
##
## $cryptolocker$file_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 98304 186368 192512 285355 382976 581632
##
##
## $cryptxxx
## $cryptxxx$max_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.443 6.635 6.913 6.864 6.981 7.345
##
## $cryptxxx$num_sections
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.000 7.000 7.000 7.222 8.000 8.000
##
## $cryptxxx$min_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4904 0.7957 2.2006
##
## $cryptxxx$image_base
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4194304 184549376 268435456 191073849 268435456 268435456
##
## $cryptxxx$import_table_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 5.00 8.00 10.11 12.00 22.00
##
## $cryptxxx$compile_timestamp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.090e+08 1.120e+09 1.145e+09 1.145e+09 1.342e+09 1.466e+09
##
## $cryptxxx$file_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 69120 188416 242176 275285 398848 465920
##
##
## $lamer
## $lamer$max_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.894 5.894 5.894 6.002 6.142 6.142
##
## $lamer$num_sections
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5 5 5 5 5 5
##
## $lamer$min_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1842 0.1842 0.1842 0.1842 0.1842 0.1842
##
## $lamer$image_base
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4194304 4194304 4194304 4194304 4194304 4194304
##
## $lamer$import_table_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6 6 6 6 6 6
##
## $lamer$compile_timestamp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.302e+09 1.302e+09 1.302e+09 1.302e+09 1.302e+09 1.302e+09
##
## $lamer$file_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 276106 1314298 1923221 2525708 3105400 6752860
##
##
## $ramnit
## $ramnit$max_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.706 3.706 3.706 5.416 7.583 7.972
##
## $ramnit$num_sections
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.000 4.000 4.000 4.652 5.500 7.000
##
## $ramnit$min_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.4244 0.4244 1.0497 0.5116 5.4093
##
## $ramnit$image_base
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.194e+06 2.684e+08 2.684e+08 3.992e+08 2.684e+08 1.997e+09
##
## $ramnit$import_table_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 1.00 2.87 4.00 10.00
##
## $ramnit$compile_timestamp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.851e+08 1.205e+09 1.294e+09 1.258e+09 1.294e+09 1.506e+09
##
## $ramnit$file_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3584 3584 3584 277798 233430 1839104
##
##
## $xindl
## $xindl$max_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.718 5.718 5.718 6.612 7.953 7.954
##
## $xindl$num_sections
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3 3 3 3 3 3
##
## $xindl$min_section_entropy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
##
## $xindl$image_base
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4194304 4194304 4194304 4194304 4194304 4194304
##
## $xindl$import_table_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 1.0 1.0 1.4 2.0 2.0
##
## $xindl$compile_timestamp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.165e+09 1.165e+09 1.165e+09 1.165e+09 1.165e+09 1.165e+09
##
## $xindl$file_size
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 144049 200936 229376 230033 278528 326888
# can be polished more to produce a table
20. Create a new column holding the average file_size of the row's virus_family (~2 lines)
# Add avg_file_size: the mean file_size within each row's virus_family.
# The result is still grouped by virus_family; the final select() puts
# virus_family and avg_file_size first, followed by columns 2 to 13.
d %>%
  group_by(virus_family) %>%
  mutate(avg_file_size = mean(file_size)) %>%
  select(virus_family, avg_file_size, 2:13)
## # A tibble: 115 x 13
## # Groups: virus_family [7]
## virus_family avg_file_size virus_type
## <chr> <dbl> <chr>
## 1 cryptxxx 275285.3 ransomware
## 2 cryptxxx 275285.3 ransomware
## 3 cryptxxx 275285.3 ransomware
## 4 cryptxxx 275285.3 ransomware
## 5 cryptxxx 275285.3 ransomware
## 6 cryptxxx 275285.3 ransomware
## 7 cryptxxx 275285.3 ransomware
## 8 cryptxxx 275285.3 ransomware
## 9 cryptxxx 275285.3 ransomware
## 10 cerber 215129.0 ransomware
## # ... with 105 more rows, and 10 more variables: magic <chr>,
## # max_section_entropy <dbl>, num_sections <int>,
## # min_section_entropy <dbl>, image_base <int>, import_table_size <int>,
## # compile_timestamp <int>, file_size <int>,
## # non_standard_section_names <chr>, peid <chr>
21. Draw histogram of file_size for every virus_family in one plot (~1 line)
# too many unique values so the overlapping plot does not make sense!
# Instead: one histogram panel per virus_family, stacked in one column.
d %>%
  ggplot(aes(x = file_size)) +
  geom_histogram() +
  facet_wrap(~virus_family, ncol = 1, strip.position = "left")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

22. Draw boxplot for max_section_entropy (~1 line)
# it's normal to use categorical x for all boxplots, we'll use 1 as fake here
d %>%
  ggplot(aes(x = 1, y = max_section_entropy)) +
  geom_boxplot()

23. Draw boxplots of import_table_size for each virus_type (3 boxplots in one plot, ~1 line)
# One import_table_size boxplot per virus_type, side by side.
d %>%
  ggplot(aes(x = virus_type, y = import_table_size)) +
  geom_boxplot()

24. Create a new DataFrame with one-hot encoded peid column (~1 line)
# One-hot encode `peid`: one logical column per distinct peid value,
# TRUE where the row has that value and FALSE elsewhere.
#
# Rows are keyed by their position in `d` instead of by sha256. spread()
# returns its rows ordered by the remaining id column(s), so keying on
# sha256 yields a table sorted by hash whose rows no longer line up with
# `d` (or with anything derived from `d` in its original order) when the
# tables are stacked side by side. A positional key also keeps spread()
# from failing should a sha256 ever appear twice.
peid_dummies <- d %>%
  transmute(row = row_number(), peid, one = TRUE) %>%
  spread(peid, one, fill = FALSE) %>%
  arrange(row) %>% # restore the row order of `d` explicitly
  select(-row)
25. Create a DF with only numeric features from data (~1 line)
# Keep only the numeric columns of `d` and rebuild them as a tibble.
data_num <- d %>%
  as.list() %>%
  keep(is.numeric) %>%
  do.call(tibble, .)
26. Concatenate two DataFrames into one: data_num and peid_dummies (stack horizontally, ~1 line)
# Put the numeric features and the peid dummy columns side by side.
data_num %>%
  bind_cols(peid_dummies)
## # A tibble: 115 x 17
## max_section_entropy num_sections min_section_entropy image_base
## <dbl> <int> <dbl> <int>
## 1 6.635195 7 0.0000000 4194304
## 2 6.442698 7 1.4170280 184549376
## 3 7.171786 7 2.2005675 184549376
## 4 6.789311 8 0.0000000 268435456
## 5 6.934068 8 0.0000000 268435456
## 6 6.981491 8 0.0000000 268435456
## 7 7.345267 5 0.7957287 268435456
## 8 6.913324 8 0.0000000 268435456
## 9 6.566401 7 0.0000000 4194304
## 10 6.246415 4 0.2443513 4194304
## # ... with 105 more rows, and 13 more variables: import_table_size <int>,
## # compile_timestamp <int>, file_size <int>, ASPack <lgl>, `Borland
## # Delphi` <lgl>, `CAB self-extract` <lgl>, `Custom packer` <lgl>,
## # `Microsoft Visual Basic` <lgl>, `Microsoft Visual C++` <lgl>, `Nothing
## # found` <lgl>, `RAR self-extract` <lgl>, UPX <lgl>, `ZIP
## # self-extract` <lgl>
27. Save dataset X to a CSV file X.csv
# Combine numeric features with the peid dummies and write them to disk.
data_num %>%
  bind_cols(peid_dummies) %>%
  write_csv("data/X.csv")
28. Drop columns compile_timestamp and image_base
# Show `d` without the compile_timestamp and image_base columns
# (`d` itself is not modified).
d %>%
  select(-c(compile_timestamp, image_base))
## # A tibble: 115 x 11
## sha256
## <chr>
## 1 336C46AEC4D01E3E44388AB8A537474D056FA79B1F00025999E514E1B4B3FD2B
## 2 E07FF347A27EB18A9863FAB951D218DCFEC84E19623D8C2826BC0268FDB79450
## 3 01C225263D70D9DE177B31C450F5E21E5703E93BE2F8C5D7AD3F17539E143C58
## 4 0FD2A07237A41F6FC64C6D243D9C5DD269848120B46B489BF5054CAD0CB8D04F
## 5 E225839539E22AC7FB975E622F3FE0B4E4D2D040AA539D10B756FC91D92E1E07
## 6 5B75A8F0DA6021F1B41BDC7D23CFE63A25FFAEACB8C8F9940FB0603AD9848CF1
## 7 6B8946A144F05C56C81A77DF3B15C68D1344771AAC812BFB80CD199F058D13CA
## 8 78AAF6530035F26B6DF645E908C10E41692351E7302304A2B35ADCB73D916432
## 9 AC99BA525C11C399BF977E2084E7E21591181F01102C22F82B83CEE7D6CABAE5
## 10 BCDF7A4F4E0EEFD55EC0A814B382559C815106CB7820C93E7BB8A8E216E8C78D
## # ... with 105 more rows, and 10 more variables: virus_type <chr>,
## # virus_family <chr>, magic <chr>, max_section_entropy <dbl>,
## # num_sections <int>, min_section_entropy <dbl>,
## # import_table_size <int>, file_size <int>,
## # non_standard_section_names <chr>, peid <chr>
29. Create a Series y from the column virus_type
# no Series in R ;)
30. Create a simple Random Forest classifier for X and y
# no more time ;)