# Load the Kaggle 2018 survey multiple-choice responses from GitHub.
url <- "https://raw.githubusercontent.com/baruab/msdsrepo/main/Project_3_607/kaggle-survey-2018/multipleChoiceResponses.csv"

# read.csv() accepts a URL string directly, so there is no need to call the
# base `url()` function through a variable that shadows its name.
# stringsAsFactors = FALSE keeps every column as plain character regardless of
# the R version, which the header construction below relies on.
survey_tib <- read.csv(url, sep = ",", stringsAsFactors = FALSE)

# The first data row carries the full question text. Flatten it to a character
# vector explicitly (instead of relying on paste() coercing a one-row data
# frame) and append it to the short column code, e.g. "Q3" becomes
# "Q3_In which country do you currently reside?".
question_text <- unlist(survey_tib[1, ], use.names = FALSE)
names(survey_tib) <- paste(names(survey_tib), question_text, sep = "_")

# Remove row 1 (the question text), leaving only respondent rows.
survey_tib <- survey_tib[-1, ]
Select only Data Scientists in the United States of America
# Keep only respondents who reside in the United States of America and whose
# selected job title is Data Scientist. Conditions passed to filter() as
# separate arguments are combined with a logical AND.
survey_tib <- filter(
  survey_tib,
  `Q3_In which country do you currently reside?` == "United States of America",
  `Q6_Select the title most similar to your current role (or most recent title if retired): - Selected Choice` == "Data Scientist"
)
#write.csv(survey_tib,"C:/Users/newma/OneDrive/Desktop/MSDS Fall 2021/DATA 607 - Data Acquisition and Mgt/Project 3/surv.csv", row.names = FALSE)
Select required columns for analysis
# Narrow the survey to the question columns used by the analyses: every column
# whose name begins with one of the listed question prefixes.
survey_tib <- select(
  survey_tib,
  starts_with(
    c("Q1_", "Q2_", "Q3_", "Q6_", "Q7_", "Q13", "Q16", "Q21", "Q33")
  )
)
#names(survey_tib)
Where do you find public datasets?
# Q33: tabulate where respondents find public datasets.
#
# Reshape the Q33 columns to long form: one row per respondent per Q33 column,
# with the column name in `Dataset` and the cell value in `Dataset_type`.
# Free-text "OTHER_TEXT" columns are excluded.
surv_dataset <- survey_tib %>%
  select(
    starts_with(c("Q1_", "Q2_", "Q3_", "Q7_", "Q33")),
    -contains("OTHER_TEXT")
  ) %>%
  pivot_longer(
    starts_with("Q33"),
    names_to = "Dataset",
    values_to = "Dataset_type"
  )

# Count how often each data source was chosen, most frequent first.
# Blank/missing cells (presumably options the respondent did not select) are
# removed by value. Previously the first row of the sorted table was dropped
# on the assumption that the blank bucket is always the largest; that would
# silently discard a real answer whenever the assumption does not hold.
surv_dataset_tib <- surv_dataset %>%
  filter(!is.na(Dataset_type), trimws(Dataset_type) != "") %>%
  count(Dataset_type, name = "Count") %>%
  rename(DataSource = Dataset_type) %>%
  arrange(desc(Count))

# Share of all selections, as a percentage rounded to two decimals.
surv_dataset_tib2 <- surv_dataset_tib %>%
  mutate(proportion = round(Count / sum(Count) * 100, 2))

# Horizontal bar chart, bars ordered by percentage and labelled with it.
surv_dataset_tib2 %>%
  ggplot(aes(reorder(DataSource, proportion), proportion)) +
  geom_col(fill = "#06272C") +
  coord_flip() +
  geom_text(aes(label = proportion), color = "red") +
  labs(x = "Data Sources") +
  theme_bw()

# Same figures as a styled table.
kbl(surv_dataset_tib2) %>%
  kable_styling() %>%
  kable_paper("hover", full_width = FALSE)
| DataSource | Count | proportion |
|---|---|---|
| Dataset aggregator/platform (Socrata, Kaggle Public Datasets Platform, etc.) | 369 | 14.51 |
| Google Search | 367 | 14.43 |
| Government websites | 338 | 13.29 |
| I collect my own data (web-scraping, etc.) | 302 | 11.88 |
| GitHub | 280 | 11.01 |
| University research group websites | 214 | 8.42 |
| Google Dataset Search | 190 | 7.47 |
| Publicly released data from private companies | 171 | 6.72 |
| Non-profit research group websites | 149 | 5.86 |
| None/I do not work with public data | 138 | 5.43 |
| Other | 25 | 0.98 |
Which of the following integrated development environments (IDE’s) have you used at work or school in the last 5 years?
# Q13: tabulate which IDEs respondents have used.
#
# Reshape the Q13 columns to long form: one row per respondent per Q13 column,
# with the column name in `IDEName` and the cell value in `IDEUsed`.
# Free-text "OTHER_TEXT" columns are excluded.
ide_tool <- survey_tib %>%
  select(
    starts_with(c("Q1_", "Q2_", "Q3_", "Q7_", "Q13")),
    -contains("OTHER_TEXT")
  ) %>%
  pivot_longer(
    starts_with("Q13"),
    names_to = "IDEName",
    values_to = "IDEUsed"
  )

# Count how often each IDE was chosen, most frequent first.
# Blank/missing cells (presumably options the respondent did not select) are
# removed by value. Previously the first row of the sorted table was dropped
# on the assumption that the blank bucket is always the largest; that would
# silently discard a real answer whenever the assumption does not hold.
ide_tool_tib <- ide_tool %>%
  filter(!is.na(IDEUsed), trimws(IDEUsed) != "") %>%
  count(IDEUsed, name = "Count") %>%
  rename(IDE = IDEUsed) %>%
  arrange(desc(Count))

# Share of all selections, as a percentage rounded to two decimals.
ide_tool_tib2 <- ide_tool_tib %>%
  mutate(proportion = round(Count / sum(Count) * 100, 2)) %>%
  arrange(desc(proportion))

# Horizontal bar chart, bars ordered by percentage and labelled with it.
ide_tool_tib2 %>%
  ggplot(aes(reorder(IDE, proportion), proportion)) +
  geom_col(fill = "#18213A") +
  coord_flip() +
  geom_text(aes(label = proportion), color = "red") +
  labs(x = "IDE") +
  theme_bw()

# Same figures as a styled table.
kbl(ide_tool_tib2) %>%
  kable_styling() %>%
  kable_paper("hover", full_width = FALSE)
| IDE | Count | proportion |
|---|---|---|
| Jupyter/IPython | 763 | 19.36 |
| RStudio | 586 | 14.87 |
| Sublime Text | 344 | 8.73 |
| PyCharm | 335 | 8.50 |
| Notepad++ | 302 | 7.66 |
| Spyder | 298 | 7.56 |
| Vim | 287 | 7.28 |
| Atom | 251 | 6.37 |
| MATLAB | 203 | 5.15 |
| Visual Studio Code | 193 | 4.90 |
| Visual Studio | 166 | 4.21 |
| IntelliJ | 144 | 3.65 |
| Other | 52 | 1.32 |
| nteract | 16 | 0.41 |
| None | 2 | 0.05 |
What programming languages do you use on a regular basis?
# Q16: tabulate which programming languages respondents use regularly.
#
# Reshape the Q16 columns to long form: one row per respondent per Q16 column,
# with the column name in `LangName` and the cell value in `LangUsed`.
# Free-text "OTHER_TEXT" columns are excluded.
lang_tool <- survey_tib %>%
  select(
    starts_with(c("Q1_", "Q2_", "Q3_", "Q7_", "Q16")),
    -contains("OTHER_TEXT")
  ) %>%
  pivot_longer(
    starts_with("Q16"),
    names_to = "LangName",
    values_to = "LangUsed"
  )

# Count how often each language was chosen, most frequent first.
# Blank/missing cells (presumably options the respondent did not select) are
# removed by value. Previously the first row of the sorted table was dropped
# on the assumption that the blank bucket is always the largest; that would
# silently discard a real answer whenever the assumption does not hold.
Lang_tool_tib <- lang_tool %>%
  filter(!is.na(LangUsed), trimws(LangUsed) != "") %>%
  count(LangUsed, name = "Count") %>%
  rename(Language = LangUsed) %>%
  arrange(desc(Count))

# Share of all selections, as a percentage rounded to two decimals.
Lang_tool_tib2 <- Lang_tool_tib %>%
  mutate(proportion = round(Count / sum(Count) * 100, 2)) %>%
  arrange(desc(proportion))

# Horizontal bar chart, bars ordered by percentage and labelled with it.
Lang_tool_tib2 %>%
  ggplot(aes(reorder(Language, proportion), proportion)) +
  geom_col(fill = "#633974") +
  coord_flip() +
  geom_text(aes(label = proportion), color = "green") +
  labs(x = "Language") +
  theme_bw()

# Same figures as a styled table.
kbl(Lang_tool_tib2) %>%
  kable_styling() %>%
  kable_paper("hover", full_width = FALSE)
| Language | Count | proportion |
|---|---|---|
| Python | 777 | 30.53 |
| SQL | 562 | 22.08 |
| R | 452 | 17.76 |
| Bash | 207 | 8.13 |
| Javascript/Typescript | 89 | 3.50 |
| SAS/STATA | 72 | 2.83 |
| Java | 64 | 2.51 |
| MATLAB | 64 | 2.51 |
| Scala | 62 | 2.44 |
| C/C++ | 55 | 2.16 |
| Visual Basic/VBA | 43 | 1.69 |
| C#/.NET | 25 | 0.98 |
| Other | 23 | 0.90 |
| Go | 17 | 0.67 |
| Julia | 17 | 0.67 |
| PHP | 9 | 0.35 |
| Ruby | 5 | 0.20 |
| None | 2 | 0.08 |