Tools and programming languages used by Data Scientists in the United States of America (Project 3 Part)

# Location of the Kaggle 2018 multiple-choice survey responses (CSV).
url <- "https://raw.githubusercontent.com/baruab/msdsrepo/main/Project_3_607/kaggle-survey-2018/multipleChoiceResponses.csv"

# Read text columns as character explicitly. Under stringsAsFactors = TRUE
# (the default before R 4.0) the header merge below would paste factor codes
# instead of the question text.
survey_tib <- read.csv(file = url(url), sep = ",", stringsAsFactors = FALSE)

# The first data row is pasted onto the column names, giving names of the
# form "<code>_<row-1 text>"; later steps select columns by these names.
# unlist() turns the one-row data frame into a plain character vector so
# paste() does not depend on implicit list coercion.
names(survey_tib) <- paste(
  names(survey_tib),
  unlist(survey_tib[1, ], use.names = FALSE),
  sep = "_"
)

# Remove row 1 (now folded into the header).
survey_tib <- survey_tib[-1, ]

Select only Data Scientists in the United States of America

# Keep only respondents who reside in the United States of America and whose
# selected title is "Data Scientist". Conditions separated by commas in
# filter() are combined with AND.
survey_tib <- filter(
  survey_tib,
  `Q3_In which country do you currently reside?` == "United States of America",
  `Q6_Select the title most similar to your current role (or most recent title if retired): - Selected Choice` == "Data Scientist"
)
#write.csv(survey_tib,"C:/Users/newma/OneDrive/Desktop/MSDS Fall 2021/DATA 607 - Data Acquisition and Mgt/Project 3/surv.csv", row.names = FALSE)

Select the columns required for the analysis

# Narrow the data to the columns whose names begin with one of the question
# prefixes used in the sections below.
survey_tib <- select(
  survey_tib,
  starts_with(
    c("Q1_", "Q2_", "Q3_", "Q6_", "Q7_", "Q13", "Q16", "Q21", "Q33")
  )
)
# names(survey_tib)

Where do you find public datasets?

# Reshape the Q33 answer columns to long form: one row per respondent and
# answer column. The free-text "OTHER_TEXT" column is excluded.
surv_dataset <- survey_tib %>%
  select(
    starts_with(c("Q1_", "Q2_", "Q3_", "Q7_", "Q33")),
    -contains("OTHER_TEXT")
  ) %>%
  pivot_longer(
    starts_with("Q33"),
    names_to = "Dataset",
    values_to = "Dataset_type"
  )

# Tally each data source, most frequent first. Blank/missing answers
# (presumably the options a respondent did not tick) are removed by value
# rather than by dropping the top row of the sorted tally, so the result
# does not depend on blanks being the most frequent value.
surv_dataset_tib <- surv_dataset %>%
  filter(!is.na(Dataset_type), Dataset_type != "") %>%
  count(Dataset_type, name = "Count") %>%
  rename(DataSource = Dataset_type) %>%
  arrange(desc(Count))

# Share of each source among all recorded selections, in percent.
surv_dataset_tib2 <- surv_dataset_tib %>%
  mutate(proportion = round(Count / sum(Count) * 100, 2))

# Horizontal bar chart of the percentage share per data source, ordered by
# share, with the value printed on each bar.
ggplot(
  surv_dataset_tib2,
  aes(x = reorder(DataSource, proportion), y = proportion)
) +
  geom_col(fill = "#06272C") +
  coord_flip() +
  geom_text(aes(label = proportion), color = "red") +
  labs(x = "Data Sources") +
  theme_bw()

# The same figures as a styled table with hover highlighting.
surv_dataset_tib2 %>%
  kbl() %>%
  kable_styling() %>%
  kable_paper("hover", full_width = FALSE)
DataSource Count proportion
Dataset aggregator/platform (Socrata, Kaggle Public Datasets Platform, etc.) 369 14.51
Google Search 367 14.43
Government websites 338 13.29
I collect my own data (web-scraping, etc.) 302 11.88
GitHub 280 11.01
University research group websites 214 8.42
Google Dataset Search 190 7.47
Publicly released data from private companies 171 6.72
Non-profit research group websites 149 5.86
None/I do not work with public data 138 5.43
Other 25 0.98

What data visualization libraries or tools have you used in the past 5 years?

# Reshape the Q21 answer columns to long form: one row per respondent and
# answer column. The free-text "OTHER_TEXT" column is excluded.
surv_viz_tool <- survey_tib %>%
  select(
    starts_with(c("Q1_", "Q2_", "Q3_", "Q7_", "Q21")),
    -contains("OTHER_TEXT")
  ) %>%
  pivot_longer(
    starts_with("Q21"),
    names_to = "ToolName",
    values_to = "Tool"
  )

# Tally each visualization tool, most frequent first. Blank/missing answers
# (presumably the options a respondent did not tick) are removed by value
# rather than by dropping the top row of the sorted tally, so the result
# does not depend on blanks being the most frequent value.
surv_viz_tool_tib <- surv_viz_tool %>%
  filter(!is.na(Tool), Tool != "") %>%
  count(Tool, name = "Count") %>%
  rename(VisualTools = Tool) %>%
  arrange(desc(Count))

# Share of each tool among all recorded selections, in percent.
surv_viz_tool_tib2 <- surv_viz_tool_tib %>%
  mutate(proportion = round(Count / sum(Count) * 100, 2)) %>%
  arrange(desc(proportion))
  
# Horizontal bar chart of the percentage share per visualization tool,
# ordered by share, with the value printed on each bar. A single geom_col()
# layer is used: the earlier unstyled layer was completely covered by the
# filled one and only drew every bar twice.
surv_viz_tool_tib2 %>%
  ggplot(aes(x = reorder(VisualTools, proportion), y = proportion)) +
  geom_col(fill = "#A7ADBE") +
  coord_flip() +
  geom_text(aes(label = proportion), color = "red") +
  labs(x = "Visual Tools") +
  theme_bw()

# The same figures as a styled table with hover highlighting.
kbl(surv_viz_tool_tib2) %>%
  kable_styling() %>%
  kable_paper("hover", full_width = FALSE)
VisualTools Count proportion
Matplotlib 689 21.46
ggplot2 565 17.60
Seaborn 527 16.42
Plotly 401 12.49
Shiny 308 9.60
Bokeh 208 6.48
D3 207 6.45
Leaflet 104 3.24
Lattice 75 2.34
Geoplotlib 49 1.53
Altair 33 1.03
Other 31 0.97
None 13 0.40

Which of the following integrated development environments (IDE’s) have you used at work or school in the last 5 years?

# Reshape the Q13 answer columns to long form: one row per respondent and
# answer column. The free-text "OTHER_TEXT" column is excluded.
ide_tool <- survey_tib %>%
  select(
    starts_with(c("Q1_", "Q2_", "Q3_", "Q7_", "Q13")),
    -contains("OTHER_TEXT")
  ) %>%
  pivot_longer(
    starts_with("Q13"),
    names_to = "IDEName",
    values_to = "IDEUsed"
  )

# Tally each IDE, most frequent first. Blank/missing answers (presumably the
# options a respondent did not tick) are removed by value rather than by
# dropping the top row of the sorted tally, so the result does not depend on
# blanks being the most frequent value.
ide_tool_tib <- ide_tool %>%
  filter(!is.na(IDEUsed), IDEUsed != "") %>%
  count(IDEUsed, name = "Count") %>%
  rename(IDE = IDEUsed) %>%
  arrange(desc(Count))

# Share of each IDE among all recorded selections, in percent.
ide_tool_tib2 <- ide_tool_tib %>%
  mutate(proportion = round(Count / sum(Count) * 100, 2)) %>%
  arrange(desc(proportion))

# Horizontal bar chart of the percentage share per IDE, ordered by share,
# with the value printed on each bar.
ggplot(
  ide_tool_tib2,
  aes(x = reorder(IDE, proportion), y = proportion)
) +
  geom_col(fill = "#18213A") +
  coord_flip() +
  geom_text(aes(label = proportion), color = "red") +
  labs(x = "IDE") +
  theme_bw()

# The same figures as a styled table with hover highlighting.
ide_tool_tib2 %>%
  kbl() %>%
  kable_styling() %>%
  kable_paper("hover", full_width = FALSE)
IDE Count proportion
Jupyter/IPython 763 19.36
RStudio 586 14.87
Sublime Text 344 8.73
PyCharm 335 8.50
Notepad++ 302 7.66
Spyder 298 7.56
Vim 287 7.28
Atom 251 6.37
MATLAB 203 5.15
Visual Studio Code 193 4.90
Visual Studio 166 4.21
IntelliJ 144 3.65
Other 52 1.32
nteract 16 0.41
None 2 0.05

What programming languages do you use on a regular basis?

# Reshape the Q16 answer columns to long form: one row per respondent and
# answer column. The free-text "OTHER_TEXT" column is excluded.
lang_tool <- survey_tib %>%
  select(
    starts_with(c("Q1_", "Q2_", "Q3_", "Q7_", "Q16")),
    -contains("OTHER_TEXT")
  ) %>%
  pivot_longer(
    starts_with("Q16"),
    names_to = "LangName",
    values_to = "LangUsed"
  )

# Tally each language, most frequent first. Blank/missing answers (presumably
# the options a respondent did not tick) are removed by value rather than by
# dropping the top row of the sorted tally, so the result does not depend on
# blanks being the most frequent value.
Lang_tool_tib <- lang_tool %>%
  filter(!is.na(LangUsed), LangUsed != "") %>%
  count(LangUsed, name = "Count") %>%
  rename(Language = LangUsed) %>%
  arrange(desc(Count))

# Share of each language among all recorded selections, in percent.
Lang_tool_tib2 <- Lang_tool_tib %>%
  mutate(proportion = round(Count / sum(Count) * 100, 2)) %>%
  arrange(desc(proportion))

# Horizontal bar chart of the percentage share per language, ordered by
# share, with the value printed on each bar.
ggplot(
  Lang_tool_tib2,
  aes(x = reorder(Language, proportion), y = proportion)
) +
  geom_col(fill = "#633974") +
  coord_flip() +
  geom_text(aes(label = proportion), color = "green") +
  labs(x = "Language") +
  theme_bw()

# The same figures as a styled table with hover highlighting.
Lang_tool_tib2 %>%
  kbl() %>%
  kable_styling() %>%
  kable_paper("hover", full_width = FALSE)
Language Count proportion
Python 777 30.53
SQL 562 22.08
R 452 17.76
Bash 207 8.13
Javascript/Typescript 89 3.50
SAS/STATA 72 2.83
Java 64 2.51
MATLAB 64 2.51
Scala 62 2.44
C/C++ 55 2.16
Visual Basic/VBA 43 1.69
C#/.NET 25 0.98
Other 23 0.90
Go 17 0.67
Julia 17 0.67
PHP 9 0.35
Ruby 5 0.20
None 2 0.08