Data Source and Description
Kaggle conducted an industry-wide survey to establish a comprehensive view of the state of data science and machine learning. The survey received over 16,000 responses and we learned a ton about who is working with data, what’s happening at the cutting edge of machine learning across industries, and how new data scientists can best break into the field.
You can download the data here.
Data Wrangling May Be The Most Time-Consuming Phase

R code for this graph:
# Setup ----
# tidyverse supplies the dplyr / tidyr / stringr / ggplot2 functions used
# below; hrbrthemes supplies theme_ft_rc() and the scale_*_ft() palettes.
#
# The workspace is intentionally NOT cleared with rm(list = ls()): that call
# deletes the user's objects without resetting attached packages or options,
# so it does not give a clean session. Restart R for a fresh run instead.
library(tidyverse)
library(hrbrthemes)

# Load the survey's multiple-choice responses, keeping text columns as
# character vectors rather than factors.
# NOTE(review): this is a machine-specific absolute Windows path; point it at
# your local copy of multipleChoiceResponses.csv before running.
df_survey <- read.csv(
  "E:\\R_project\\Kaggle\\ds_survey\\multipleChoiceResponses.csv",
  stringsAsFactors = FALSE
)
# Build the long-format table of time variables ----
# Result: df_time with two columns
#   a - source column name with the "Time" text removed
#   b - the numeric value reported in that column
df_time <- df_survey %>%
  # Keep every column whose name contains "Time" (contains() ignores case
  # by default, so this can also pick up non-numeric columns; those are
  # dropped by select_if() further down).
  select(contains("Time")) %>%
  # Trim extreme answers. Conditions given to one filter() call are combined
  # with AND, and rows where any of these four columns is NA are dropped.
  filter(
    TimeFindingInsights <= 75,
    TimeOtherSelect <= 50,
    TimeProduction <= 75,
    TimeVisualizing <= 75
  ) %>%
  dplyr::select_if(is.numeric) %>%
  # Stack all remaining columns into key/value pairs.
  gather(key = "a", value = "b") %>%
  mutate(a = str_replace_all(a, "Time", ""))
# Median per category ----
# df_time_med: one row per category `a` with med_time, the median of `b`
# (missing values ignored).
df_time_med <- df_time %>%
  group_by(a) %>%
  summarise(med_time = median(b, na.rm = TRUE)) %>%
  ungroup()

# Attach each category's median to its observations and fold it into the
# label ("<category>: <median>"), so the facet strips built from `a` show it.
# The join key is spelled out instead of relying on a natural join: this
# silences the "Joining, by = ..." message and keeps the join from silently
# widening its key if another shared column is ever added.
# NOTE: this overwrites df_time$a, so the block is not meant to be re-run on
# its own output.
df_time <- df_time %>%
  right_join(df_time_med, by = "a") %>%
  mutate(a = paste0(a, ": ", med_time))
# Draft plot ----
# One density panel per category (independent axes per panel), with a dashed
# vertical line at the category median drawn in the panel's own colour.
ggplot(df_time, aes(b, fill = a, color = a)) +
  geom_density(alpha = 0.15, show.legend = FALSE) +
  facet_wrap(~ a, scales = "free") +
  geom_vline(
    # Relabel the medians the same way df_time$a was relabelled so each line
    # lands in the matching facet.
    data = mutate(df_time_med, a = paste0(a, ": ", med_time)),
    aes(xintercept = med_time, color = a),
    show.legend = FALSE,
    linetype = "dashed",
    size = 1
  ) +
  theme_ft_rc() +
  scale_fill_ft() +
  scale_color_ft()
# Final plot ----
# Same faceted densities as the draft, but with white dashed median lines,
# no grid or y-axis labels, larger text, and title/subtitle/caption added.
# NOTE(review): the subtitle hard-codes the median (35) and its unit; confirm
# both against df_time_med and the survey codebook. The label text also has
# typos ("Phrase", "wragling") that are left untouched here.
ggplot(data = df_time, mapping = aes(b, fill = a, color = a)) +
  geom_density(alpha = 0.15, show.legend = FALSE) +
  facet_wrap(~ a, scales = "free") +
  geom_vline(
    # Medians relabelled to match the facet labels stored in df_time$a.
    data = mutate(df_time_med, a = paste0(a, ": ", med_time)),
    aes(xintercept = med_time),
    color = "white",
    linetype = "dashed"
  ) +
  theme_ft_rc() +
  scale_fill_ft() +
  scale_color_ft() +
  # All theme overrides in one call; they must come after theme_ft_rc(),
  # which would otherwise reset them.
  theme(
    panel.grid = element_blank(),
    axis.text.y = element_blank(),
    strip.text.x = element_text(color = "white", size = 15),
    plot.title = element_text(size = 22),
    plot.subtitle = element_text(size = 14, color = "grey90"),
    plot.caption = element_text(size = 14, face = "italic"),
    axis.text.x = element_text(size = 14)
  ) +
  labs(
    x = NULL,
    y = NULL,
    title = "Time Distribution for Data Wrangling Phrase of Data Projects",
    subtitle = "Data scientists spend most of their time gathering/wragling rather than modeling data. Median time for this stage\nis 35 (vertical line). In other words, about 50% of them devotes 35 hours per month on this stage.",
    caption = "Data Source: Kaggle Data Science Survey"
  )
LS0tDQp0aXRsZTogIkthZ2dsZSBEYXRhIFNjaWVuY2UgU3VydmV5IChEYXRhIFdyYW5nbGluZyBNYXkgQmUgVGhlIE1vc3QgVGltZS1Db25zdW1pbmcpIg0KYXV0aG9yOiAiTmd1eWVuIENoaSBEdW5nIg0Kc3VidGl0bGU6ICJEYWlseSBHcmFwaCBTZXJpZXMiDQpvdXRwdXQ6DQogIGh0bWxfZG9jdW1lbnQ6DQogICAgY29kZV9kb3dubG9hZDogeWVzDQogICAgIyBjb2RlX2ZvbGRpbmc6IGhpZGUNCiAgICBoaWdobGlnaHQ6IHplbmJ1cm4NCiAgICB0aGVtZTogZmxhdGx5DQogICAgdG9jOiB5ZXMNCiAgICB0b2NfZmxvYXQ6IHllcw0KICB3b3JkX2RvY3VtZW50Og0KICAgIHRvYzogeWVzDQotLS0NCg0KYGBge3Igc2V0dXAsaW5jbHVkZT1GQUxTRX0NCmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSwgd2FybmluZyA9IEZBTFNFLCBtZXNzYWdlID0gRkFMU0UsIGZpZy5yZXRpbmE9MikNCmBgYA0KDQojIERhdGEgU291cmNlIGFuZCBEZXNjcmlwdGlvbg0KDQpLYWdnbGUgY29uZHVjdGVkIGFuIGluZHVzdHJ5LXdpZGUgc3VydmV5IHRvIGVzdGFibGlzaCBhIGNvbXByZWhlbnNpdmUgdmlldyBvZiB0aGUgc3RhdGUgb2YgZGF0YSBzY2llbmNlIGFuZCBtYWNoaW5lIGxlYXJuaW5nLiBUaGUgc3VydmV5IHJlY2VpdmVkIG92ZXIgMTYsMDAwIHJlc3BvbnNlcyBhbmQgd2UgbGVhcm5lZCBhIHRvbiBhYm91dCB3aG8gaXMgd29ya2luZyB3aXRoIGRhdGEsIHdoYXTigJlzIGhhcHBlbmluZyBhdCB0aGUgY3V0dGluZyBlZGdlIG9mIG1hY2hpbmUgbGVhcm5pbmcgYWNyb3NzIGluZHVzdHJpZXMsIGFuZCBob3cgbmV3IGRhdGEgc2NpZW50aXN0cyBjYW4gYmVzdCBicmVhayBpbnRvIHRoZSBmaWVsZC4NCg0KWW91IGNhbiBkb3dsb2FkIGRhdGEgW2hlcmVdKGh0dHBzOi8vd3d3LmthZ2dsZS5jb20va2FnZ2xlL2thZ2dsZS1zdXJ2ZXktMjAxNy9kYXRhKS4gDQoNCiMgRGF0YSBXcmFuZ2xpbmcgTWF5IEJlIFRoZSBNb3N0IFRpbWUtQ29uc3VtaW5nIFBocmFzZQ0KDQohW10oQzpcVXNlcnNcWmJvb2tcRGVza3RvcFxwaWNccDEzLmpwZykNCg0KUiBjb2RlcyBmb3IgdGhpcyBncmFwaDogDQoNCmBgYHtyLCBldmFsPUZBTFNFfQ0KDQpybShsaXN0ID0gbHMoKSkNCmxpYnJhcnkodGlkeXZlcnNlKQ0KbGlicmFyeShocmJydGhlbWVzKQ0KDQpkZl9zdXJ2ZXkgPC0gcmVhZC5jc3YoIkU6XFxSX3Byb2plY3RcXEthZ2dsZVxcZHNfc3VydmV5XFxtdWx0aXBsZUNob2ljZVJlc3BvbnNlcy5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzID0gRkFMU0UpDQoNCg0KZGZfc3VydmV5ICU+JSANCiAgc2VsZWN0KGNvbnRhaW5zKCJUaW1lIikpIC0+IGRmX3RpbWUNCg0KDQpkZl90aW1lICU+JSANCiAgZmlsdGVyKFRpbWVGaW5kaW5nSW5zaWdodHMgPD0gNzUpICU+JSANCiAgZmlsdGVyKFRpbWVPdGhlclNlbGVjdCA8PSA1MCkgJT4lIA0KICBmaWx0ZXIoVGltZVByb2R1Y3Rpb24gPD0gNzUpICU+JSANCiAgZmlsdGVyKFRpbWVWaXN1YWxpemluZyA8PSA3NSkgLT4gZGZf
dGltZQ0KDQoNCmRmX3RpbWUgJT4lIA0KICBkcGx5cjo6c2VsZWN0X2lmKGlzLm51bWVyaWMpICU+JSANCiAgZ2F0aGVyKGEsIGIpICU+JSANCiAgbXV0YXRlKGEgPSBzdHJfcmVwbGFjZV9hbGwoYSwgIlRpbWUiLCAiIikpIC0+IGRmX3RpbWUNCiAgDQogIA0KZGZfdGltZSAlPiUgICANCiAgZ3JvdXBfYnkoYSkgJT4lIA0KICBzdW1tYXJpc2UobWVkX3RpbWUgPSBtZWRpYW4oYiwgbmEucm0gPSBUUlVFKSkgJT4lIA0KICB1bmdyb3VwKCkgLT4gZGZfdGltZV9tZWQNCg0KDQpyaWdodF9qb2luKGRmX3RpbWUsIGRmX3RpbWVfbWVkKSAlPiUgDQogIG11dGF0ZShhID0gcGFzdGUwKGEsICI6ICIsIG1lZF90aW1lKSkgLT4gZGZfdGltZQ0KDQoNCmRmX3RpbWUgJT4lIA0KICBnZ3Bsb3QoYWVzKGIsIGZpbGwgPSBhLCBjb2xvciA9IGEpKSArIA0KICBnZW9tX2RlbnNpdHkoYWxwaGEgPSAwLjE1LCBzaG93LmxlZ2VuZCA9IEZBTFNFKSArIA0KICBmYWNldF93cmFwKH4gYSwgc2NhbGVzID0gImZyZWUiKSArIA0KICBnZW9tX3ZsaW5lKGRhdGEgPSBkZl90aW1lX21lZCAlPiUgbXV0YXRlKGEgPSBwYXN0ZTAoYSwgIjogIiwgbWVkX3RpbWUpKSwgYWVzKHhpbnRlcmNlcHQgPSBtZWRfdGltZSwgY29sb3IgPSBhKSwgc2hvdy5sZWdlbmQgPSBGQUxTRSwgbGluZXR5cGUgPSAiZGFzaGVkIiwgc2l6ZSA9IDEpICsgDQogIHRoZW1lX2Z0X3JjKCkgKyANCiAgc2NhbGVfZmlsbF9mdCgpICsgDQogIHNjYWxlX2NvbG9yX2Z0KCkNCg0KDQpkZl90aW1lICU+JSANCiAgZ2dwbG90KGFlcyhiLCBmaWxsID0gYSwgY29sb3IgPSBhKSkgKyANCiAgZ2VvbV9kZW5zaXR5KGFscGhhID0gMC4xNSwgc2hvdy5sZWdlbmQgPSBGQUxTRSkgKyANCiAgZmFjZXRfd3JhcCh+IGEsIHNjYWxlcyA9ICJmcmVlIikgKyANCiAgZ2VvbV92bGluZShkYXRhID0gZGZfdGltZV9tZWQgJT4lIG11dGF0ZShhID0gcGFzdGUwKGEsICI6ICIsIG1lZF90aW1lKSksIA0KICAgICAgICAgICAgIGFlcyh4aW50ZXJjZXB0ID0gbWVkX3RpbWUpLCBjb2xvciA9ICJ3aGl0ZSIsIGxpbmV0eXBlID0gImRhc2hlZCIpICsgDQogIHRoZW1lX2Z0X3JjKCkgKyANCiAgc2NhbGVfZmlsbF9mdCgpICsgDQogIHNjYWxlX2NvbG9yX2Z0KCkgKyANCiAgdGhlbWUocGFuZWwuZ3JpZCA9IGVsZW1lbnRfYmxhbmsoKSkgKyANCiAgdGhlbWUoYXhpcy50ZXh0LnkgPSBlbGVtZW50X2JsYW5rKCkpICsgDQogIHRoZW1lKHN0cmlwLnRleHQueCA9IGVsZW1lbnRfdGV4dChjb2xvciA9ICJ3aGl0ZSIsIHNpemUgPSAxNSkpICsgDQogIHRoZW1lKHBsb3QudGl0bGUgPSBlbGVtZW50X3RleHQoc2l6ZSA9IDIyKSkgKyANCiAgdGhlbWUocGxvdC5zdWJ0aXRsZSA9IGVsZW1lbnRfdGV4dChzaXplID0gMTQsIGNvbG9yID0gImdyZXk5MCIpKSArIA0KICB0aGVtZShwbG90LmNhcHRpb24gPSBlbGVtZW50X3RleHQoc2l6ZSA9IDE0LCBmYWNlID0gIml0YWxpYyIpKSArIA0KICB0aGVtZShheGlzLnRleHQueCA9IGVsZW1lbnRfdGV4
dChzaXplID0gMTQpKSArIA0KICBsYWJzKHggPSBOVUxMLCB5ID0gTlVMTCwgDQogICAgICAgdGl0bGUgPSAiVGltZSBEaXN0cmlidXRpb24gZm9yIERhdGEgV3JhbmdsaW5nIFBocmFzZSBvZiBEYXRhIFByb2plY3RzIiwgDQogICAgICAgc3VidGl0bGUgPSAiRGF0YSBzY2llbnRpc3RzIHNwZW5kIG1vc3Qgb2YgdGhlaXIgdGltZSBnYXRoZXJpbmcvd3JhZ2xpbmcgcmF0aGVyIHRoYW4gbW9kZWxpbmcgZGF0YS4gTWVkaWFuIHRpbWUgZm9yIHRoaXMgc3RhZ2VcbmlzIDM1ICh2ZXJ0aWNhbCBsaW5lKS4gSW4gb3RoZXIgd29yZHMsIGFib3V0IDUwJSBvZiB0aGVtIGRldm90ZXMgMzUgaG91cnMgcGVyIG1vbnRoIG9uIHRoaXMgc3RhZ2UuIiwgDQogICAgICAgY2FwdGlvbiA9ICJEYXRhIFNvdXJjZTogS2FnZ2xlIERhdGEgU2NpZW5jZSBTdXJ2ZXkiKQ0KYGBgDQoNCg==