library(data.table)

Why consider data.table: http://www.analyticsvidhya.com/blog/2016/05/data-table-data-frame-work-large-data-sets/

# Time the import done by import.format.ats() (defined elsewhere; the
# surrounding text says this version reads the file with read.csv()).
system.time(applicant_tracking_steps <- import.format.ats())
user system elapsed
3884.43 16.06 3904.17

That’s about 65 minutes using read.csv()

# Time the same import done by import.format.ats.dt() (defined elsewhere; the
# surrounding text says this version reads the file with data.table's fread()).
system.time(applicant_tracking_steps <- import.format.ats.dt())
Read 35409294 rows and 10 (of 10) columns from 7.892 GB file in 00:10:08
user system elapsed
762.75 11.04 820.06

That’s a little over 13 min using fread() from the data.table package.

If you are interested in learning more about data.table: https://www.datacamp.com/courses/data-table-data-manipulation-r-tutorial.

Or if you just want a quick reference: https://s3.amazonaws.com/assets.datacamp.com/img/blog/data+table+cheat+sheet.pdf

# Load the Alzheimer's data set and convert it to a data.table.
# stringsAsFactors = TRUE is spelled out because the default became FALSE in
# R 4.0.0, while the later steps (per-level counts in summary(), the gender
# clean-up) expect the text columns to be factors.
Alzheimers <- read.csv("~/AlzheimersTest/Alzheimers.csv",
                       header = TRUE,
                       stringsAsFactors = TRUE)
Alzheimers <- data.table(Alzheimers)

Let’s check and make sure the data looks like we would expect it to.

# Print the first six rows (all columns) as a quick sanity check of the import.
head(Alzheimers)
##    ACE_CD143_Angiotensin_Converti ACTH_Adrenocorticotropic_Hormon
## 1:                      2.0031003                       -1.386294
## 2:                      1.5618560                       -1.386294
## 3:                      1.5206598                       -1.714798
## 4:                      1.6808260                       -1.609438
## 5:                      2.4009308                       -0.967584
## 6:                      0.4311565                       -1.272966
##           AXL Adiponectin Alpha_1_Antichymotrypsin Alpha_1_Antitrypsin
## 1:  1.0983867   -5.360193                 1.740466           -12.63136
## 2:  0.6832816   -5.020686                 1.458615           -11.90988
## 3: -0.1452763   -5.809143                 1.193922           -13.64296
## 4:  0.6832816   -5.115996                 1.280934           -15.52356
## 5:  0.1908902   -4.779524                 2.128232           -11.13306
## 6: -0.2223611   -5.221356                 1.308333           -12.13464
##    Alpha_1_Microglobulin Alpha_2_Macroglobulin Angiopoietin_2_ANG_2
## 1:             -2.577022             -72.65029           1.06471074
## 2:             -3.244194            -154.61228           0.74193734
## 3:             -2.882404            -136.52918           0.83290912
## 4:             -3.170086             -98.36175           0.91629073
## 5:             -2.343407            -144.94460           0.95551144
## 6:             -2.551046            -154.61228          -0.05129329
##    Angiotensinogen Apolipoprotein_A_IV Apolipoprotein_A1 Apolipoprotein_A2
## 1:        2.510547           -1.427116         -7.402052       -0.26136476
## 2:        2.457283           -1.660731         -7.047017       -0.86750057
## 3:        1.976365           -1.660731         -7.684284       -0.65392647
## 4:        2.376085           -2.120264         -8.047190       -1.23787436
## 5:        2.862219           -1.171183         -6.725434        0.09531018
## 6:        2.524026           -1.386294         -7.402052       -0.27443685
##    Apolipoprotein_B Apolipoprotein_CI Apolipoprotein_CIII Apolipoprotein_D
## 1:        -4.624044        -1.2729657           -2.312635         2.079442
## 2:        -6.747507        -1.2729657           -2.343407         1.335001
## 3:        -3.976069        -1.7147984           -2.748872         1.335001
## 4:        -6.517424        -1.9661129           -2.995732         1.435085
## 5:        -3.378594        -0.7550226           -1.514128         1.629241
## 6:        -2.963532        -1.6607312           -2.312635         1.916923
##    Apolipoprotein_E Apolipoprotein_H B_Lymphocyte_Chemoattractant_BL
## 1:        3.7545215       -0.1573491                        2.296982
## 2:        3.0971187       -0.5753962                        1.673121
## 3:        2.7530556       -0.3448394                        1.673121
## 4:        2.3713615       -0.5317281                        1.980509
## 5:        3.0671471        0.6626345                        2.296982
## 6:        0.5911464        0.0971503                        2.479838
##        BMP_6 Beta_2_Microglobulin Betacellulin C_Reactive_Protein
## 1: -2.200744            0.6931472           34          -4.074542
## 2: -1.728053            0.4700036           53          -6.645391
## 3: -2.062421            0.3364722           49          -8.047190
## 4: -1.982912            0.6418539           52          -6.214608
## 5: -1.241520            0.3364722           67          -4.342806
## 6: -1.877412           -0.5447272           51          -7.561682
##          CD40        CD5L Calbindin Calcitonin      CgA Clusterin_Apo_J
## 1: -0.7964147  0.09531018  33.21363  1.3862944 397.6536        3.555348
## 2: -1.2733760 -0.67334455  25.27636  3.6109179 465.6759        3.044522
## 3: -1.2415199  0.09531018  22.16609  2.1162555 347.8639        2.772589
## 4: -1.1238408 -0.32850407  23.45584 -0.1508229 334.2346        2.833213
## 5: -0.9240345  0.36331197  21.83275  1.3083328 442.8046        3.044522
## 6: -1.7844998  0.40546511  13.23155  1.6292405 137.9473        2.564949
##    Complement_3 Complement_Factor_H Connective_Tissue_Growth_Factor
## 1:    -10.36305            3.573725                       0.5306283
## 2:    -16.10824            3.600047                       0.5877867
## 3:    -16.10824            4.474569                       0.6418539
## 4:    -13.20556            3.097119                       0.5306283
## 5:    -12.81314            7.245150                       0.9162907
## 6:    -11.98323            3.573725                       0.9932518
##    Cortisol Creatine_Kinase_MB Cystatin_C      EGF_R   EN_RAGE    ENA_78
## 1:       10          -1.710172   9.041922 -0.1354543 -3.688879 -1.349543
## 2:       12          -1.751002   9.067624 -0.3700474 -3.816713 -1.356595
## 3:       10          -1.383559   8.954157 -0.7329871 -4.755993 -1.390672
## 4:       14          -1.647864   9.581904 -0.4218532 -2.937463 -1.367775
## 5:       11          -1.625834   8.977146 -0.6206034 -2.364460 -1.339440
## 6:       13          -1.671366   7.835975 -1.1112274 -3.442019 -1.363957
##    Eotaxin_3         FAS FSH_Follicle_Stimulation_Hormon Fas_Ligand
## 1:        53 -0.08338161                      -0.6516715   3.101492
## 2:        62 -0.52763274                      -1.6272839   2.978813
## 3:        62 -0.63487827                      -1.5630004   1.360010
## 4:        44 -0.47803580                      -0.5902871   2.537220
## 5:        64 -0.12783337                      -0.9763009   4.037285
## 6:        57 -0.32850407                      -1.6832823   2.407182
##    Fatty_Acid_Binding_Protein Ferritin  Fetuin_A Fibrinogen GRO_alpha
## 1:                  2.5208712 3.329165 1.2809338  -7.035589  1.381830
## 2:                  2.2477966 3.932959 1.1939225  -8.047190  1.372438
## 3:                  0.9063009 3.176872 1.4109870  -7.195437  1.412679
## 4:                  0.6237306 3.138093 0.7419373  -7.799353  1.372438
## 5:                  2.6345883 2.690416 2.1517622  -6.980326  1.398431
## 6:                  0.6237306 1.847077 1.4816045  -6.437752  1.398431
##    Gamma_Interferon_induced_Monokin Glutathione_S_Transferase_alpha
## 1:                         2.949822                       1.0641271
## 2:                         2.721793                       0.8670202
## 3:                         2.762231                       0.8890150
## 4:                         2.885476                       0.7083677
## 5:                         2.851987                       1.2358607
## 6:                         2.822442                       1.1538270
##      HB_EGF     HCC_4 Hepatocyte_Growth_Factor_HGF    I_309     ICAM_1
## 1: 6.559746 -3.036554                   0.58778666 3.433987 -0.1907787
## 2: 8.754531 -4.074542                   0.53062825 3.135494 -0.4620172
## 3: 7.745463 -3.649659                   0.09531018 2.397895 -0.4620172
## 4: 5.949436 -3.816713                   0.40546511 3.367296 -0.8572661
## 5: 7.245150 -3.146555                   0.53062825 3.761200  0.0971503
## 6: 6.413012 -3.079114                   0.09531018 2.708050 -0.9351069
##    IGF_BP_2    IL_11    IL_13    IL_16   IL_17E IL_1alpha      IL_3
## 1: 5.609472 5.121987 1.282549 4.192081 5.731246 -6.571283 -3.244194
## 2: 5.347108 4.936704 1.269463 2.876338 6.705891 -8.047190 -3.912023
## 3: 5.181784 4.665910 1.274133 2.616102 4.149327 -8.180721 -4.645992
## 4: 5.424950 6.223931 1.307549 2.441056 4.695848 -7.600902 -4.268698
## 5: 5.420535 7.070709 1.309980 4.736472 4.204987 -6.943657 -2.995732
## 6: 5.056246 6.103215 1.282549 2.671032 3.637051 -8.180721 -3.863233
##        IL_4       IL_5        IL_6 IL_6_Receptor     IL_7     IL_8
## 1: 2.484907  1.0986123  0.26936976    0.64279595 4.805045 1.711325
## 2: 2.397895  0.6931472  0.09622438    0.43115645 3.705506 1.675557
## 3: 1.824549 -0.2484614  0.18568645    0.09668586 1.005622 1.691393
## 4: 1.481605  0.7884574 -0.37116408    0.57519641 2.336211 1.719944
## 5: 2.708050  1.1631508 -0.07204658    0.09668586 4.287562 1.764298
## 6: 1.208960 -0.4004776  0.18568645   -0.51727788 2.776394 1.708270
##    IP_10_Inducible_Protein_10       IgA    Insulin
## 1:                   6.242223 -6.812445 -0.6258253
## 2:                   5.686975 -6.377127 -0.9431406
## 3:                   5.049856 -6.319969 -1.4466191
## 4:                   5.602119 -7.621105 -1.4852687
## 5:                   6.369901 -4.645992 -0.3003110
## 6:                   5.480639 -5.809143 -1.3405481
##    Kidney_Injury_Molecule_1_KIM_1     LOX_1     Leptin Lipoprotein_a
## 1:                      -1.204295 1.7047481 -1.5290628     -4.268698
## 2:                      -1.197703 1.5260563 -1.4660558     -4.933674
## 3:                      -1.191191 1.1631508 -1.6622675     -5.843045
## 4:                      -1.231557 1.2237754 -1.2693924     -4.990833
## 5:                      -1.163800 1.3609766 -0.9151068     -2.937463
## 6:                      -1.123868 0.6418539 -1.3613475     -4.509860
##       MCP_1     MCP_2       MIF MIP_1alpha MIP_1beta    MMP_2     MMP_3
## 1: 6.740519 1.9805094 -1.237874   4.968453  3.258097 4.478566 -2.207275
## 2: 6.849066 1.8088944 -1.897120   3.690160  3.135494 3.781473 -2.465104
## 3: 6.767343 0.4005958 -2.302585   4.049508  2.397895 2.866631 -2.302585
## 4: 6.781058 1.9805094 -1.660731   4.928562  3.218876 2.968511 -1.771957
## 5: 6.722630 2.2208309 -1.897120   6.452764  3.526361 3.690160 -1.560648
## 6: 6.541030 2.3343863 -2.040221   4.603421  2.890372 2.917760 -3.036554
##        MMP10       MMP7  Myoglobin NT_proBNP    NrCAM Osteopontin
## 1: -3.270169 -3.7735027 -1.8971200  4.553877 5.003946    5.356586
## 2: -3.649659 -5.9681907 -0.7550226  4.219508 5.209486    6.003887
## 3: -2.733368 -4.0302269 -1.3862944  4.248495 4.744932    5.017280
## 4: -4.074542 -6.8561489 -1.1394343  4.110874 4.969813    5.768321
## 5: -2.617296 -0.2222222 -1.7719568  4.465908 5.198497    5.693732
## 6: -3.324236 -1.9223227 -1.1394343  4.189655 3.258097    4.736198
##         PAI_1    PAPP_A     PLGF      PYY Pancreatic_polypeptide
## 1:  1.0035016 -2.902226 4.442651 3.218876              0.5787809
## 2: -0.0305988 -2.813276 4.025352 3.135494              0.3364722
## 3:  0.4383721 -2.935541 4.510860 2.890372             -0.8915981
## 4:  0.0000000 -2.786601 3.433987 2.833213             -0.8209806
## 5:  0.2523047 -2.935541 4.795791 3.663562              0.2623643
## 6:  0.4383721 -2.935541 4.394449 3.332205             -0.4780358
##      Prolactin Prostatic_Acid_Phosphatase Protein_S
## 1:  0.00000000                  -1.620527 -1.784998
## 2: -0.51082562                  -1.739232 -2.463991
## 3: -0.13926207                  -1.636682 -2.259135
## 4: -0.04082199                  -1.739232 -2.703458
## 5:  0.18232156                  -1.696685 -1.659842
## 6: -0.15082289                  -1.755051 -2.357788
##    Pulmonary_and_Activation_Regulat    RANTES  Resistin    S100b
## 1:                       -0.8439701 -6.214608 -16.47532 1.561856
## 2:                       -2.3025851 -6.938214 -16.02528 1.756621
## 3:                       -1.6607312 -6.645391 -16.47532 1.435728
## 4:                       -1.1086626 -5.991465 -13.50124 1.254400
## 5:                       -0.5621189 -6.319969 -11.09284 1.301297
## 6:                       -1.1711830 -6.502290 -11.29137 1.054607
##           SGOT      SHBG      SOD Serum_Amyloid_P Sortilin
## 1: -0.94160854 -1.897120 5.609472       -5.599422 4.908629
## 2: -0.65392647 -1.560648 5.814131       -6.119298 5.478731
## 3:  0.33647224 -2.207275 5.723585       -5.381699 3.810182
## 4: -0.19845094 -3.146555 5.771441       -6.645391 3.402176
## 5:  0.09531018 -2.430418 5.655992       -5.203007 3.402176
## 6: -0.31471075 -2.645075 4.543295       -5.115996 2.978813
##    Stem_Cell_Factor TGF_alpha   TIMP_1    TNF_RII   TRAIL_R3
## 1:         4.174387  8.649098 15.20465 -0.0618754 -0.1829004
## 2:         3.713572 11.331619 11.26650 -0.3285041 -0.5007471
## 3:         3.433987 10.858497 12.28286 -0.4155154 -0.9240345
## 4:         3.951244  9.454406 11.11488 -0.3424903 -0.3848591
## 5:         4.060443  8.323453 13.74802 -0.3424903 -0.8582591
## 6:         2.564949 10.008788 11.26650 -0.9416085 -0.7380092
##    TTR_prealbumin Tamm_Horsfall_Protein_THP Thrombomodulin Thrombopoietin
## 1:       2.944439                 -3.095810      -1.340566     -0.1026334
## 2:       2.833213                 -3.111190      -1.675252     -0.6733501
## 3:       2.944439                 -3.166721      -1.534276     -0.9229670
## 4:       2.944439                 -3.155652      -1.975407     -0.7510004
## 5:       3.044522                 -3.038017      -1.210709      0.0976177
## 6:       3.044522                 -3.125574      -1.451666     -1.0000000
##    Thymus_Expressed_Chemokine_TECK Thyroid_Stimulating_Hormone
## 1:                        4.149327                   -3.863233
## 2:                        3.810182                   -4.828314
## 3:                        2.791992                   -4.990833
## 4:                        4.037285                   -4.892852
## 5:                        4.534163                   -4.645992
## 6:                        4.534163                   -4.422849
##    Thyroxine_Binding_Globulin Tissue_Factor Transferrin
## 1:                 -1.4271164    2.04122033    3.332205
## 2:                 -1.6094379    2.02814825    2.890372
## 3:                 -1.8971200    1.43508452    2.890372
## 4:                 -2.0402208    2.02814825    2.890372
## 5:                 -0.4780358    1.98787435    3.496508
## 6:                 -1.2378744   -0.01005034    2.995732
##    Trefoil_Factor_3_TFF3   VCAM_1     VEGF Vitronectin
## 1:             -3.381395 3.258097 22.03456 -0.04082199
## 2:             -3.912023 2.708050 18.60184 -0.38566248
## 3:             -3.729701 2.639057 17.47619 -0.22314355
## 4:             -3.816713 2.772589 17.54560 -0.65392647
## 5:             -3.442019 3.044522 20.77860  0.16621555
## 6:             -4.342806 2.208274 13.19761  0.26236426
##    von_Willebrand_Factor      age      tau    p_tau    Ab_42 male Genotype
## 1:             -3.146555 88.52057 6.297754 4.348108 12.01968    0     E3E3
## 2:             -3.863233 80.33331 6.659294 4.859967 11.01576    0     E3E4
## 3:             -3.540459 83.20507 6.270988 4.400247 12.30227    1     E3E4
## 4:             -3.863233 83.40201 6.152733 4.494886 12.39814    0     E3E4
## 5:             -3.816713 85.96176 6.623707 4.524589 11.02411    0     E3E3
## 6:             -4.509860 69.37921 5.361292 3.465736 11.22671    1     E4E4
##       response gender
## 1: NotImpaired Female
## 2: NotImpaired Female
## 3: NotImpaired      M
## 4: NotImpaired Female
## 5: NotImpaired Female
## 6:    Impaired   Male

Wow, that’s a lot of data! Let’s start by focusing on a few columns that we think are interesting. Since I use data.table, there are some differences in syntax compared to using a data.frame, and I just want to point those out to avoid confusion.

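Some differences between data.frame and data.table syntax: inside the square brackets, columns are referred to by their bare names (no quotes and no `Alzheimers$` prefix), and `.()` is shorthand for `list()`, so the call below returns a new data.table containing just the listed columns.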

# Keep only the five columns explored in the rest of the analysis.
# Inside a data.table's j argument, list() (alias .()) returns a data.table
# made of the named columns.
Alzheimers_small <- Alzheimers[, list(response, Genotype, gender, age, tau)]

Another quick data check.

# Per-column summary: level counts for factor columns, quartiles for numerics.
summary(Alzheimers_small)
##         response   Genotype      gender         age        
##  Impaired   : 91   E2E2:  2   female:  2   Min.   : 65.00  
##  NotImpaired:242   E2E3: 37   Female:202   1st Qu.: 73.99  
##                    E2E4:  8   M     :  3   Median : 80.23  
##                    E3E3:167   male  :  2   Mean   : 79.99  
##                    E3E4:106   Male  :124   3rd Qu.: 86.05  
##                    E4E4: 13                Max.   :100.00  
##       tau       
##  Min.   :4.535  
##  1st Qu.:5.365  
##  Median :5.754  
##  Mean   :5.778  
##  3rd Qu.:6.175  
##  Max.   :7.172

The data looks good, but the coding of the gender variable is a little messy. Let’s clean that up. You will notice that data.table allows variables to be updated in place (by reference) using :=.

# Collapse the inconsistent gender spellings into "Female" / "Male".
# Work on a character copy of the column so the recode behaves the same
# whether gender was read in as a factor or as plain text.
Alzheimers_small[, gender := as.character(gender)]

# Sub-assign by reference only on the rows that need fixing; every other value
# (including NA) is left untouched.
Alzheimers_small[gender %in% c("M", "male"), gender := "Male"]
Alzheimers_small[gender == "female", gender := "Female"]

# Rebuild the factor so that only the levels still in use remain.
Alzheimers_small[, gender := factor(gender)]

Another convenient feature of data.table is the special symbol .N, which holds the number of rows. You can pair it with the by argument to get row counts for each group.

# Count rows per gender level; after the clean-up only "Female" and "Male"
# should appear.
Alzheimers_small[, .N, by = gender] # check to make sure it is right
##    gender   N
## 1: Female 204
## 2:   Male 129

Next, I just want to visualize the distributions of the data using the ggplot2 package.

library(ggplot2)

I usually use ggplot2 for data explorations and visualizations. One of my favorite ggplot2 references is http://zevross.com/blog/2014/08/04/beautiful-plotting-in-r-a-ggplot2-cheatsheet-3/

First, I will visualize the distributions of the data with geom_bar().

# Bar chart of the number of rows in each response class.
ggplot(Alzheimers_small, aes(response)) +
  geom_bar()

I could copy and paste that code over and over for each variable or use a for loop to go through each column. First I will create a vector of the column names and determine the number of columns.

# Character vector of the column names to loop over, and how many there are.
cols <- colnames(Alzheimers_small)
num_cols <- length(cols)

# Show the names that the plotting loop will iterate over.
print(cols)
## [1] "response" "Genotype" "gender"   "age"      "tau"

The column names are now stored as character strings, and by default ggplot expects a bare column name rather than a string. Therefore, in the loop, the column has to be looked up from its string name (for example with aes_string()) rather than written directly inside aes(). Side note: aes stands for aesthetic, which defines the visual mappings between the data and the visualization.

# Draw one distribution plot per column of Alzheimers_small.
# Iterating over the names directly (instead of 1:num_cols) is also safe when
# there are no columns, where 1:0 would run the body twice with bad indices.
for (col in cols) {
  # geom_bar() counts each distinct value, which only makes sense for the
  # discrete columns; continuous columns are binned with geom_histogram()
  # (default binning, so ggplot2 still prints its bin-width message).
  is_continuous <- is.numeric(Alzheimers_small[[col]])
  distribution_layer <- if (is_continuous) geom_histogram() else geom_bar()

  # `col` is a string, so the column is looked up through the `.data` pronoun;
  # this replaces aes_string(), which is deprecated in ggplot2 >= 3.0.0.
  # Plots are not auto-printed inside a loop, hence the explicit print().
  print(
    ggplot(Alzheimers_small, aes(x = .data[[col]])) +
      distribution_layer +
      labs(x = col)
  )
}

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Now, let’s take a look at how some of these variables interact. For example, I might want to understand how age and genotype relate to impairment.

# Box plots of age for each genotype, split (and coloured) by response.
ggplot(Alzheimers_small, aes(Genotype, age, colour = response)) +
  geom_boxplot()

This could be interesting. Maybe we want to share it with someone or include it in a report. Let’s add some titles to our plot and make it more readable.

# Same box plots, now with a title, a capitalised y-axis label and readable
# legend entries.
ggplot(Alzheimers_small, aes(Genotype, age, colour = response)) +
  geom_boxplot(position = position_dodge(width = 0.9)) +
  # Labels are matched to the response levels by position, in level order.
  scale_color_discrete(labels = c("Impaired", "Not Impaired")) +
  ggtitle("Alzheimers Impairment by Age and Genotype") +
  ylab("Age")

The default styling isn’t bad, but I think I could do better.

# Titled box plots with hand-tuned text styling and the legend moved below
# the panel.
ggplot(Alzheimers_small, aes(Genotype, age, colour = response)) +
  geom_boxplot(position = position_dodge(width = 0.9)) +
  scale_color_discrete(labels = c("Impaired", "Not Impaired")) +
  ggtitle("Alzheimers Impairment by Age and Genotype") +
  ylab("Age") +
  theme(
    # Large, dark, bold plot title.
    plot.title = element_text(
      size = 16, face = "bold", vjust = 2, color = "grey25"
    ),
    # Axis titles: bold and a lighter grey than the plot title.
    axis.title.y = element_text(
      size = 14, face = "bold", color = "grey42", vjust = 0.5
    ),
    axis.title.x = element_text(
      size = 14, face = "bold", color = "grey42", vjust = -0.5
    ),
    axis.text = element_text(size = 12),
    # Legend under the plot, without a legend title.
    legend.position = "bottom",
    legend.title = element_blank()
  )

Now I like the way that looks a lot better. I just need to copy that code to the 10 other graphs I want to include in my report. But wait, why copy code when you don’t have to? I can store my theme in an object and just add the object to all my plots. Then, if I decide I want to change one of the settings across the board, I only have to do it in one place.

# Reusable report theme: theme_grey() plus the text and legend styling shared
# by all plots. Add it to a plot with `+ my_theme`; individual plots can
# still override single elements with a further theme() call.
my_theme <- theme_grey() +
  theme(
    plot.title = element_text(
      size = 16, face = "bold", vjust = 2, color = "grey25"
    ),
    axis.title.y = element_text(
      size = 14, face = "bold", color = "grey42", vjust = 0.5
    ),
    axis.title.x = element_text(
      size = 14, face = "bold", color = "grey42", vjust = -0.5
    ),
    axis.text = element_text(size = 12),
    legend.position = "bottom",
    legend.title = element_text(size = 12, color = "grey42")
  )

# Box plots styled with the shared theme; the legend title is removed for
# this plot only.
ggplot(Alzheimers_small, aes(Genotype, age, colour = response)) +
  geom_boxplot(
    fill = "white",
    outlier.colour = NA,  # NA colour: outlier points are not drawn
    position = position_dodge(width = 0.9)
  ) +
  scale_color_discrete(labels = c("Impaired", "Not Impaired")) +
  ggtitle("Alzheimers Impairment by Age and Genotype") +
  ylab("Age") +
  my_theme +
  theme(legend.title = element_blank())

Now I am ready to crank out all those graphs for my report, but there is always one thing that has bothered me about a boxplot. You can’t tell how many observations are in each group. Therefore, I like to add the points to the graph using position_jitterdodge so that they aren’t all overlapping one another on the center line.

# Box plots of age by genotype and response, with the individual observations
# overlaid so the number of patients in each group is visible.
ggplot(data = Alzheimers_small,
       aes(x = Genotype,
           y = age,
           color = response,
           fill = response)) +
  geom_boxplot(fill = "white", outlier.colour = "grey42",
               position = position_dodge(width = 0.9)) +
  # Jitter the points within each dodged group; the dodge width matches the
  # box plots so every point cloud sits on top of its own box.
  geom_point(position = position_jitterdodge(dodge.width = 0.9)) +
  scale_color_discrete(labels = c("Impaired", "Not Impaired")) +
  # Hide the fill legend (it would duplicate the colour legend).
  # guide = "none" replaces the logical `guide = F`, which is deprecated in
  # ggplot2 and relied on the reassignable shorthand `F`.
  scale_fill_discrete(guide = "none") +
  labs(title = "Alzheimers Impairment by Age and Genotype",
       y = "Age") +
  my_theme + theme(legend.title = element_blank())

Let’s look at one other type of graph: a scatter plot of age against the tau protein level, with the dots color-coded by Genotype. We can then layer on a smoothing function to help see if there are any trends worth investigating.

# Scatter plot of tau level against age, coloured by genotype, with one
# linear fit (and confidence band) per genotype.
ggplot(Alzheimers_small, aes(age, tau, colour = Genotype)) +
  geom_point(size = 3) +
  # method = "lm" fits a separate straight line for every colour group;
  # alpha controls the transparency of the confidence band.
  geom_smooth(size = 2, alpha = 0.15, method = "lm") +
  scale_color_discrete("Genotype") +
  labs(
    # plotmath: atop() stacks the main title over an italic subtitle.
    title = expression(atop("Tau Protein Levels Relative to Patient Age",
                            atop(italic("Grouped by Genotype")))),
    x = "Age",
    y = "Tau Level"
  ) +
  my_theme +
  theme(plot.title = element_text(vjust = 0))
## Warning in qt((1 - level)/2, df): NaNs produced