library(data.table)
Why consider data.table: http://www.analyticsvidhya.com/blog/2016/05/data-table-data-frame-work-large-data-sets/
# Time the import and keep its result in applicant_tracking_steps.
# NOTE(review): import.format.ats() is not defined in this excerpt; the note
# that follows the timing output says this run used read.csv().
system.time(applicant_tracking_steps <- import.format.ats())
user system elapsed
3884.43 16.06 3904.17
That’s about 65 minutes using read.csv()
# Time the data.table-based import and keep its result in
# applicant_tracking_steps.
# NOTE(review): import.format.ats.dt() is not defined in this excerpt; the note
# that follows the timing output says this run used fread().
system.time(applicant_tracking_steps <- import.format.ats.dt())
Read 35409294 rows and 10 (of 10) columns from 7.892 GB file in 00:10:08
user system elapsed
762.75 11.04 820.06
That’s a little over 13 min using fread() from the data.table package.
If you are interested in learning more about data.table: https://www.datacamp.com/courses/data-table-data-manipulation-r-tutorial.
Or if you just want a quick reference: https://s3.amazonaws.com/assets.datacamp.com/img/blog/data+table+cheat+sheet.pdf
# Load the data set and convert it to a data.table.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() keeps
# strings as character by default, but the summary() output shown further down
# reports response, Genotype and gender as factors with level counts.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
Alzheimers <- read.csv("~/AlzheimersTest/Alzheimers.csv",
                       header = TRUE,
                       stringsAsFactors = TRUE)
Alzheimers <- data.table(Alzheimers)
Let’s check and make sure the data looks like we would expect it to.
# Print the first rows of the table as a quick sanity check of the import.
head(Alzheimers)
## ACE_CD143_Angiotensin_Converti ACTH_Adrenocorticotropic_Hormon
## 1: 2.0031003 -1.386294
## 2: 1.5618560 -1.386294
## 3: 1.5206598 -1.714798
## 4: 1.6808260 -1.609438
## 5: 2.4009308 -0.967584
## 6: 0.4311565 -1.272966
## AXL Adiponectin Alpha_1_Antichymotrypsin Alpha_1_Antitrypsin
## 1: 1.0983867 -5.360193 1.740466 -12.63136
## 2: 0.6832816 -5.020686 1.458615 -11.90988
## 3: -0.1452763 -5.809143 1.193922 -13.64296
## 4: 0.6832816 -5.115996 1.280934 -15.52356
## 5: 0.1908902 -4.779524 2.128232 -11.13306
## 6: -0.2223611 -5.221356 1.308333 -12.13464
## Alpha_1_Microglobulin Alpha_2_Macroglobulin Angiopoietin_2_ANG_2
## 1: -2.577022 -72.65029 1.06471074
## 2: -3.244194 -154.61228 0.74193734
## 3: -2.882404 -136.52918 0.83290912
## 4: -3.170086 -98.36175 0.91629073
## 5: -2.343407 -144.94460 0.95551144
## 6: -2.551046 -154.61228 -0.05129329
## Angiotensinogen Apolipoprotein_A_IV Apolipoprotein_A1 Apolipoprotein_A2
## 1: 2.510547 -1.427116 -7.402052 -0.26136476
## 2: 2.457283 -1.660731 -7.047017 -0.86750057
## 3: 1.976365 -1.660731 -7.684284 -0.65392647
## 4: 2.376085 -2.120264 -8.047190 -1.23787436
## 5: 2.862219 -1.171183 -6.725434 0.09531018
## 6: 2.524026 -1.386294 -7.402052 -0.27443685
## Apolipoprotein_B Apolipoprotein_CI Apolipoprotein_CIII Apolipoprotein_D
## 1: -4.624044 -1.2729657 -2.312635 2.079442
## 2: -6.747507 -1.2729657 -2.343407 1.335001
## 3: -3.976069 -1.7147984 -2.748872 1.335001
## 4: -6.517424 -1.9661129 -2.995732 1.435085
## 5: -3.378594 -0.7550226 -1.514128 1.629241
## 6: -2.963532 -1.6607312 -2.312635 1.916923
## Apolipoprotein_E Apolipoprotein_H B_Lymphocyte_Chemoattractant_BL
## 1: 3.7545215 -0.1573491 2.296982
## 2: 3.0971187 -0.5753962 1.673121
## 3: 2.7530556 -0.3448394 1.673121
## 4: 2.3713615 -0.5317281 1.980509
## 5: 3.0671471 0.6626345 2.296982
## 6: 0.5911464 0.0971503 2.479838
## BMP_6 Beta_2_Microglobulin Betacellulin C_Reactive_Protein
## 1: -2.200744 0.6931472 34 -4.074542
## 2: -1.728053 0.4700036 53 -6.645391
## 3: -2.062421 0.3364722 49 -8.047190
## 4: -1.982912 0.6418539 52 -6.214608
## 5: -1.241520 0.3364722 67 -4.342806
## 6: -1.877412 -0.5447272 51 -7.561682
## CD40 CD5L Calbindin Calcitonin CgA Clusterin_Apo_J
## 1: -0.7964147 0.09531018 33.21363 1.3862944 397.6536 3.555348
## 2: -1.2733760 -0.67334455 25.27636 3.6109179 465.6759 3.044522
## 3: -1.2415199 0.09531018 22.16609 2.1162555 347.8639 2.772589
## 4: -1.1238408 -0.32850407 23.45584 -0.1508229 334.2346 2.833213
## 5: -0.9240345 0.36331197 21.83275 1.3083328 442.8046 3.044522
## 6: -1.7844998 0.40546511 13.23155 1.6292405 137.9473 2.564949
## Complement_3 Complement_Factor_H Connective_Tissue_Growth_Factor
## 1: -10.36305 3.573725 0.5306283
## 2: -16.10824 3.600047 0.5877867
## 3: -16.10824 4.474569 0.6418539
## 4: -13.20556 3.097119 0.5306283
## 5: -12.81314 7.245150 0.9162907
## 6: -11.98323 3.573725 0.9932518
## Cortisol Creatine_Kinase_MB Cystatin_C EGF_R EN_RAGE ENA_78
## 1: 10 -1.710172 9.041922 -0.1354543 -3.688879 -1.349543
## 2: 12 -1.751002 9.067624 -0.3700474 -3.816713 -1.356595
## 3: 10 -1.383559 8.954157 -0.7329871 -4.755993 -1.390672
## 4: 14 -1.647864 9.581904 -0.4218532 -2.937463 -1.367775
## 5: 11 -1.625834 8.977146 -0.6206034 -2.364460 -1.339440
## 6: 13 -1.671366 7.835975 -1.1112274 -3.442019 -1.363957
## Eotaxin_3 FAS FSH_Follicle_Stimulation_Hormon Fas_Ligand
## 1: 53 -0.08338161 -0.6516715 3.101492
## 2: 62 -0.52763274 -1.6272839 2.978813
## 3: 62 -0.63487827 -1.5630004 1.360010
## 4: 44 -0.47803580 -0.5902871 2.537220
## 5: 64 -0.12783337 -0.9763009 4.037285
## 6: 57 -0.32850407 -1.6832823 2.407182
## Fatty_Acid_Binding_Protein Ferritin Fetuin_A Fibrinogen GRO_alpha
## 1: 2.5208712 3.329165 1.2809338 -7.035589 1.381830
## 2: 2.2477966 3.932959 1.1939225 -8.047190 1.372438
## 3: 0.9063009 3.176872 1.4109870 -7.195437 1.412679
## 4: 0.6237306 3.138093 0.7419373 -7.799353 1.372438
## 5: 2.6345883 2.690416 2.1517622 -6.980326 1.398431
## 6: 0.6237306 1.847077 1.4816045 -6.437752 1.398431
## Gamma_Interferon_induced_Monokin Glutathione_S_Transferase_alpha
## 1: 2.949822 1.0641271
## 2: 2.721793 0.8670202
## 3: 2.762231 0.8890150
## 4: 2.885476 0.7083677
## 5: 2.851987 1.2358607
## 6: 2.822442 1.1538270
## HB_EGF HCC_4 Hepatocyte_Growth_Factor_HGF I_309 ICAM_1
## 1: 6.559746 -3.036554 0.58778666 3.433987 -0.1907787
## 2: 8.754531 -4.074542 0.53062825 3.135494 -0.4620172
## 3: 7.745463 -3.649659 0.09531018 2.397895 -0.4620172
## 4: 5.949436 -3.816713 0.40546511 3.367296 -0.8572661
## 5: 7.245150 -3.146555 0.53062825 3.761200 0.0971503
## 6: 6.413012 -3.079114 0.09531018 2.708050 -0.9351069
## IGF_BP_2 IL_11 IL_13 IL_16 IL_17E IL_1alpha IL_3
## 1: 5.609472 5.121987 1.282549 4.192081 5.731246 -6.571283 -3.244194
## 2: 5.347108 4.936704 1.269463 2.876338 6.705891 -8.047190 -3.912023
## 3: 5.181784 4.665910 1.274133 2.616102 4.149327 -8.180721 -4.645992
## 4: 5.424950 6.223931 1.307549 2.441056 4.695848 -7.600902 -4.268698
## 5: 5.420535 7.070709 1.309980 4.736472 4.204987 -6.943657 -2.995732
## 6: 5.056246 6.103215 1.282549 2.671032 3.637051 -8.180721 -3.863233
## IL_4 IL_5 IL_6 IL_6_Receptor IL_7 IL_8
## 1: 2.484907 1.0986123 0.26936976 0.64279595 4.805045 1.711325
## 2: 2.397895 0.6931472 0.09622438 0.43115645 3.705506 1.675557
## 3: 1.824549 -0.2484614 0.18568645 0.09668586 1.005622 1.691393
## 4: 1.481605 0.7884574 -0.37116408 0.57519641 2.336211 1.719944
## 5: 2.708050 1.1631508 -0.07204658 0.09668586 4.287562 1.764298
## 6: 1.208960 -0.4004776 0.18568645 -0.51727788 2.776394 1.708270
## IP_10_Inducible_Protein_10 IgA Insulin
## 1: 6.242223 -6.812445 -0.6258253
## 2: 5.686975 -6.377127 -0.9431406
## 3: 5.049856 -6.319969 -1.4466191
## 4: 5.602119 -7.621105 -1.4852687
## 5: 6.369901 -4.645992 -0.3003110
## 6: 5.480639 -5.809143 -1.3405481
## Kidney_Injury_Molecule_1_KIM_1 LOX_1 Leptin Lipoprotein_a
## 1: -1.204295 1.7047481 -1.5290628 -4.268698
## 2: -1.197703 1.5260563 -1.4660558 -4.933674
## 3: -1.191191 1.1631508 -1.6622675 -5.843045
## 4: -1.231557 1.2237754 -1.2693924 -4.990833
## 5: -1.163800 1.3609766 -0.9151068 -2.937463
## 6: -1.123868 0.6418539 -1.3613475 -4.509860
## MCP_1 MCP_2 MIF MIP_1alpha MIP_1beta MMP_2 MMP_3
## 1: 6.740519 1.9805094 -1.237874 4.968453 3.258097 4.478566 -2.207275
## 2: 6.849066 1.8088944 -1.897120 3.690160 3.135494 3.781473 -2.465104
## 3: 6.767343 0.4005958 -2.302585 4.049508 2.397895 2.866631 -2.302585
## 4: 6.781058 1.9805094 -1.660731 4.928562 3.218876 2.968511 -1.771957
## 5: 6.722630 2.2208309 -1.897120 6.452764 3.526361 3.690160 -1.560648
## 6: 6.541030 2.3343863 -2.040221 4.603421 2.890372 2.917760 -3.036554
## MMP10 MMP7 Myoglobin NT_proBNP NrCAM Osteopontin
## 1: -3.270169 -3.7735027 -1.8971200 4.553877 5.003946 5.356586
## 2: -3.649659 -5.9681907 -0.7550226 4.219508 5.209486 6.003887
## 3: -2.733368 -4.0302269 -1.3862944 4.248495 4.744932 5.017280
## 4: -4.074542 -6.8561489 -1.1394343 4.110874 4.969813 5.768321
## 5: -2.617296 -0.2222222 -1.7719568 4.465908 5.198497 5.693732
## 6: -3.324236 -1.9223227 -1.1394343 4.189655 3.258097 4.736198
## PAI_1 PAPP_A PLGF PYY Pancreatic_polypeptide
## 1: 1.0035016 -2.902226 4.442651 3.218876 0.5787809
## 2: -0.0305988 -2.813276 4.025352 3.135494 0.3364722
## 3: 0.4383721 -2.935541 4.510860 2.890372 -0.8915981
## 4: 0.0000000 -2.786601 3.433987 2.833213 -0.8209806
## 5: 0.2523047 -2.935541 4.795791 3.663562 0.2623643
## 6: 0.4383721 -2.935541 4.394449 3.332205 -0.4780358
## Prolactin Prostatic_Acid_Phosphatase Protein_S
## 1: 0.00000000 -1.620527 -1.784998
## 2: -0.51082562 -1.739232 -2.463991
## 3: -0.13926207 -1.636682 -2.259135
## 4: -0.04082199 -1.739232 -2.703458
## 5: 0.18232156 -1.696685 -1.659842
## 6: -0.15082289 -1.755051 -2.357788
## Pulmonary_and_Activation_Regulat RANTES Resistin S100b
## 1: -0.8439701 -6.214608 -16.47532 1.561856
## 2: -2.3025851 -6.938214 -16.02528 1.756621
## 3: -1.6607312 -6.645391 -16.47532 1.435728
## 4: -1.1086626 -5.991465 -13.50124 1.254400
## 5: -0.5621189 -6.319969 -11.09284 1.301297
## 6: -1.1711830 -6.502290 -11.29137 1.054607
## SGOT SHBG SOD Serum_Amyloid_P Sortilin
## 1: -0.94160854 -1.897120 5.609472 -5.599422 4.908629
## 2: -0.65392647 -1.560648 5.814131 -6.119298 5.478731
## 3: 0.33647224 -2.207275 5.723585 -5.381699 3.810182
## 4: -0.19845094 -3.146555 5.771441 -6.645391 3.402176
## 5: 0.09531018 -2.430418 5.655992 -5.203007 3.402176
## 6: -0.31471075 -2.645075 4.543295 -5.115996 2.978813
## Stem_Cell_Factor TGF_alpha TIMP_1 TNF_RII TRAIL_R3
## 1: 4.174387 8.649098 15.20465 -0.0618754 -0.1829004
## 2: 3.713572 11.331619 11.26650 -0.3285041 -0.5007471
## 3: 3.433987 10.858497 12.28286 -0.4155154 -0.9240345
## 4: 3.951244 9.454406 11.11488 -0.3424903 -0.3848591
## 5: 4.060443 8.323453 13.74802 -0.3424903 -0.8582591
## 6: 2.564949 10.008788 11.26650 -0.9416085 -0.7380092
## TTR_prealbumin Tamm_Horsfall_Protein_THP Thrombomodulin Thrombopoietin
## 1: 2.944439 -3.095810 -1.340566 -0.1026334
## 2: 2.833213 -3.111190 -1.675252 -0.6733501
## 3: 2.944439 -3.166721 -1.534276 -0.9229670
## 4: 2.944439 -3.155652 -1.975407 -0.7510004
## 5: 3.044522 -3.038017 -1.210709 0.0976177
## 6: 3.044522 -3.125574 -1.451666 -1.0000000
## Thymus_Expressed_Chemokine_TECK Thyroid_Stimulating_Hormone
## 1: 4.149327 -3.863233
## 2: 3.810182 -4.828314
## 3: 2.791992 -4.990833
## 4: 4.037285 -4.892852
## 5: 4.534163 -4.645992
## 6: 4.534163 -4.422849
## Thyroxine_Binding_Globulin Tissue_Factor Transferrin
## 1: -1.4271164 2.04122033 3.332205
## 2: -1.6094379 2.02814825 2.890372
## 3: -1.8971200 1.43508452 2.890372
## 4: -2.0402208 2.02814825 2.890372
## 5: -0.4780358 1.98787435 3.496508
## 6: -1.2378744 -0.01005034 2.995732
## Trefoil_Factor_3_TFF3 VCAM_1 VEGF Vitronectin
## 1: -3.381395 3.258097 22.03456 -0.04082199
## 2: -3.912023 2.708050 18.60184 -0.38566248
## 3: -3.729701 2.639057 17.47619 -0.22314355
## 4: -3.816713 2.772589 17.54560 -0.65392647
## 5: -3.442019 3.044522 20.77860 0.16621555
## 6: -4.342806 2.208274 13.19761 0.26236426
## von_Willebrand_Factor age tau p_tau Ab_42 male Genotype
## 1: -3.146555 88.52057 6.297754 4.348108 12.01968 0 E3E3
## 2: -3.863233 80.33331 6.659294 4.859967 11.01576 0 E3E4
## 3: -3.540459 83.20507 6.270988 4.400247 12.30227 1 E3E4
## 4: -3.863233 83.40201 6.152733 4.494886 12.39814 0 E3E4
## 5: -3.816713 85.96176 6.623707 4.524589 11.02411 0 E3E3
## 6: -4.509860 69.37921 5.361292 3.465736 11.22671 1 E4E4
## response gender
## 1: NotImpaired Female
## 2: NotImpaired Female
## 3: NotImpaired M
## 4: NotImpaired Female
## 5: NotImpaired Female
## 6: Impaired Male
Wow that’s a lot of data! Let’s start by focusing on some that we think are interesting. Since I use data.table there are some differences in syntax compared to using a data.frame, and I just want to point those out to avoid confusion.
Some differences between data.frame and data.table syntax:
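Both follow the mydata[row, column] convention, but if you include only one argument, as in mydata[x], a data.frame treats x as a column selection while a data.table treats x as a row selection.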
# Keep only the five columns that the rest of the walkthrough explores.
# list() is the long form of data.table's .() alias.
Alzheimers_small <- Alzheimers[, list(response, Genotype, gender, age, tau)]
Another quick data check.
# Per-column summary: level counts for factor columns, quartiles for numeric ones.
summary(Alzheimers_small)
## response Genotype gender age
## Impaired : 91 E2E2: 2 female: 2 Min. : 65.00
## NotImpaired:242 E2E3: 37 Female:202 1st Qu.: 73.99
## E2E4: 8 M : 3 Median : 80.23
## E3E3:167 male : 2 Mean : 79.99
## E3E4:106 Male :124 3rd Qu.: 86.05
## E4E4: 13 Max. :100.00
## tau
## Min. :4.535
## 1st Qu.:5.365
## Median :5.754
## Mean :5.778
## 3rd Qu.:6.175
## Max. :7.172
The summary looks reasonable, but the coding of the gender variable is a little messy. Let’s clean that up. You will notice that data.table allows columns to be updated in place (by reference) using :=.
# Normalise the inconsistent gender codes ("M", "male", "female") to
# "Male" / "Female".
# The column is converted to character first so the recode works whether it
# was read in as a factor or as plain character; a levels(gender)[gender]
# lookup only works on a factor and fails on a character column.
Alzheimers_small[, gender := as.character(gender)]
Alzheimers_small[gender %in% c("M", "male"), gender := "Male"]
Alzheimers_small[gender == "female", gender := "Female"]
# Rebuild the factor so that only the cleaned-up levels remain.
Alzheimers_small[, gender := factor(gender)]
Another convenient feature of data.table is the special symbol .N, which holds the number of rows. You can pair it with the by argument to get row counts per group.
# Count rows per gender value to confirm the recode left only the cleaned-up codes.
Alzheimers_small[, .N, by = gender] # check to make sure it is right
## gender N
## 1: Female 204
## 2: Male 129
Next, I just want to visualize the distributions of the data using the ggplot2 package.
library(ggplot2)
I usually use ggplot2 for data explorations and visualizations. One of my favorite ggplot2 references is http://zevross.com/blog/2014/08/04/beautiful-plotting-in-r-a-ggplot2-cheatsheet-3/
First, I will visualize the distributions of the data with geom_bar().
# Bar chart of the number of rows in each response class.
ggplot(data = Alzheimers_small, mapping = aes(x = response)) +
  geom_bar()
I could copy and paste that code over and over for each variable or use a for loop to go through each column. First I will create a vector of the column names and determine the number of columns.
# Column names as a character vector, plus how many columns there are;
# both are used by the plotting loop.
cols <- colnames(Alzheimers_small)
num_cols <- ncol(Alzheimers_small)
print(cols)
## [1] "response" "Genotype" "gender" "age" "tau"
The column names are now stored as strings, and aes() by default expects an unquoted column name rather than a string. Therefore, in the loop we need to use aes_string() instead of the default aes(). Side note: aes stands for aesthetic, which creates the visual mappings between the data and the visualization.
# Draw one bar chart per column of Alzheimers_small.
# seq_len() replaces 1:num_cols so that the loop body is skipped when there
# are no columns; 1:0 would instead iterate over c(1, 0).
for (i in seq_len(num_cols)) {
  # aes_string() takes the column name as a string. print() is needed because
  # plots are not auto-printed inside a for loop.
  print(
    ggplot(Alzheimers_small, aes_string(x = cols[i])) +
      geom_bar()
  )
}
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Now, let’s take a look at how some of these variables interact. For example, I might want to understand how age and genotype relate to impairment.
# Box plots of age for each genotype, one box per response class.
ggplot(Alzheimers_small, aes(Genotype, age, colour = response)) +
  geom_boxplot()
This could be interesting. Maybe we want to share it with someone or include it in a report. Let’s add some titles to our plot and make it more readable.
# Same box plot, now with a title, a capitalised y-axis label and
# human-readable legend labels.
ggplot(Alzheimers_small, aes(Genotype, age, colour = response)) +
  geom_boxplot(position = position_dodge(width = 0.9)) +
  scale_colour_discrete(labels = c("Impaired", "Not Impaired")) +
  ggtitle("Alzheimers Impairment by Age and Genotype") +
  ylab("Age")
The default styling isn’t bad, but I think I could do better.
# Titled box plot with hand-tuned text styling and the legend moved below
# the panel (legend title removed).
ggplot(Alzheimers_small, aes(Genotype, age, colour = response)) +
  geom_boxplot(position = position_dodge(width = 0.9)) +
  scale_colour_discrete(labels = c("Impaired", "Not Impaired")) +
  ggtitle("Alzheimers Impairment by Age and Genotype") +
  ylab("Age") +
  theme(
    plot.title = element_text(size = 16, face = "bold", vjust = 2,
                              colour = "grey25"),
    axis.title.x = element_text(size = 14, face = "bold", vjust = -.5,
                                colour = "grey42"),
    axis.title.y = element_text(size = 14, face = "bold", vjust = .5,
                                colour = "grey42"),
    axis.text = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "bottom"
  )
Now I like the way that looks a lot better. I just need to copy that code to the 10 other graphs I want to include in my report. But wait, why copy code when you don’t have to? I can store my theme in an object and just add the object to all my plots. Then, if I decide I want to change one of the settings across the board, I only have to do it in one place.
# Reusable theme: theme_grey() plus the custom title, axis and legend styling,
# stored once so every plot can add it with `+ my_theme`.
my_theme <- theme_grey() +
  theme(
    plot.title = element_text(size = 16, face = "bold", vjust = 2,
                              colour = "grey25"),
    axis.title.x = element_text(size = 14, face = "bold", vjust = -.5,
                                colour = "grey42"),
    axis.title.y = element_text(size = 14, face = "bold", vjust = .5,
                                colour = "grey42"),
    axis.text = element_text(size = 12),
    legend.title = element_text(size = 12, colour = "grey42"),
    legend.position = "bottom"
  )
# Box plot styled with the shared theme; outliers are not drawn
# (outlier.colour = NA) and the legend title is dropped for this plot only.
ggplot(Alzheimers_small, aes(Genotype, age, colour = response)) +
  geom_boxplot(
    position = position_dodge(width = 0.9),
    fill = "white",
    outlier.colour = NA
  ) +
  scale_colour_discrete(labels = c("Impaired", "Not Impaired")) +
  ggtitle("Alzheimers Impairment by Age and Genotype") +
  ylab("Age") +
  my_theme +
  theme(legend.title = element_blank())
Now I am ready to crank out all those graphs for my report, but there is always one thing that has bothered me about a boxplot. You can’t tell how many observations are in each group. Therefore, I like to add the points to the graph using position_jitterdodge so that they aren’t all overlapping one another on the center line.
# Box plot with the individual observations overlaid, so the number of
# patients in each Genotype/response group is visible.
ggplot(data = Alzheimers_small,
       aes(x = Genotype,
           y = age,
           color = response,
           fill = response)) +
  geom_boxplot(fill = "white", outlier.colour = "grey42",
               position = position_dodge(width = 0.9)) +
  # Jitter the points within each dodged box so they do not all sit on the
  # centre line.
  geom_point(position = position_jitterdodge(dodge.width = 0.9)) +
  scale_color_discrete(labels = c("Impaired", "Not Impaired")) +
  # Hide the fill legend. "none" replaces `guide = F`: F is a reassignable
  # variable, and a logical guide value is deprecated in current ggplot2.
  scale_fill_discrete(guide = "none") +
  labs(title = "Alzheimers Impairment by Age and Genotype",
       y = "Age") +
  my_theme + theme(legend.title = element_blank())
Let’s look at one other type of graph: a scatter plot of age compared to the tau protein level, with the dots color coded by Genotype. We can then layer on a smoothing function to help see if there are any trends worth investigating.
# Scatter plot of tau level against age, coloured by genotype, with a linear
# fit (method = "lm") drawn per genotype and a two-line plotmath title.
ggplot(Alzheimers_small, aes(age, tau, colour = Genotype)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", size = 2, alpha = .15) +
  scale_colour_discrete(name = "Genotype") +
  xlab("Age") +
  ylab("Tau Level") +
  ggtitle(expression(atop("Tau Protein Levels Relative to Patient Age",
                          atop(italic("Grouped by Genotype"))))) +
  my_theme +
  theme(plot.title = element_text(vjust = 0))
## Warning in qt((1 - level)/2, df): NaNs produced