Library
#install.packages("janitor")
library(janitor)
library(dplyr)
library(psych)
library(readr)
library(skimr)
library(jmv)
library(tidyr)
library(gsheet)
library(stringr)
Data
# Download the raw survey responses from the shared Google Sheet.
raw_data <- gsheet2tbl(
  "https://docs.google.com/spreadsheets/d/19AdprBOX8Y0ZTly_EgFcZYi4ay4pGFHXmLRCWsUv4Oc/edit?usp=sharing"
)
Data Column Name
# Show the first ten column names exactly as they were imported.
head(names(raw_data), n = 10)
## [1] "...1"
## [2] "Timestamp"
## [3] "How.would.you.rate.your.level.of.experience.using.R."
## [4] "Compared.with.other.technical.topics.you.ve.learned.in.school.and.on.the.job..on.a.scale.of.1.to.5..how.difficult.do.you.expect.learning.R.to.be."
## [5] "From.what.you.know.about.R..how.long.do.you.expect.that.it.will.take.for.you.to.learn.enough.to.use.R.productively."
## [6] "How.do.you.think.you.would.go.about.the.process.of.learning.R."
## [7] "Which.statement.most.closely.reflects.the.primary.reason.why.you.are.interested.in.learning.R."
## [8] "If.you.were.to.learn.R..what.would.do.you.think.you.would.use.it.for...check.all.that.apply."
## [9] "Which.analytical.tools.do.you.use.today.for.the.functions.that.you.might.learn.R.for...please.check.all.that.apply."
## [10] "What.do.you.think.is.the.biggest.obstacle.you.must.overcome.in.trying.to.learn.R..The.choices.below.are.only.suggestions..if.we.haven.t.listed.your.obstacle..please.choose..Other..and.add.your.obstacle.in.the.text.."
Data value
# Peek at the first rows of the raw responses.
head(raw_data)
## # A tibble: 6 × 53
## ...1 Timest…¹ How.w…² Compa…³ From.…⁴ How.d…⁵ Which…⁶ If.yo…⁷ Which…⁸ What.…⁹
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 12/13/2… Expert NA <NA> <NA> <NA> <NA> <NA> <NA>
## 2 2 12/13/2… Beginn… NA <NA> <NA> <NA> <NA> <NA> <NA>
## 3 3 12/13/2… Interm… NA <NA> <NA> <NA> <NA> <NA> <NA>
## 4 4 12/13/2… Interm… NA <NA> <NA> <NA> <NA> <NA> <NA>
## 5 5 12/13/2… Interm… NA <NA> <NA> <NA> <NA> <NA> <NA>
## 6 6 12/13/2… Expert NA <NA> <NA> <NA> <NA> <NA> <NA>
## # … with 43 more variables: What.year.did.you.first.start.learning.R. <dbl>,
## # How.did.you.learn.R..If.you.used.multiple.methods..please.select.the.one.you.used.the.most. <chr>,
## # Compared.with.other.technical.topics.you.ve.learned.in.school.and.on.the.job..on.a.scale.of.1.to.5..how.difficult.has.it.been.for.you.to.learn.R. <dbl>,
## # Roughly.how.long.did.it.take.you.to.achieve.proficiency.in.R. <chr>,
## # Which.statement.most.closely.reflects.the.primary.reason.why.you.learned.R. <chr>,
## # What.do.you.think.was.the.biggest.obstacle.you.had.to.overcome.in.learning.R..The.choices.below.are.only.suggestions..if.we.haven.t.listed.your.obstacle..please.choose..Other..and.add.your.obstacle.in.the.text.. <chr>,
## # How.often.do.you.use.R.today..either.for.professional.or.personal.projects. <chr>, …
Column Rename
# Give the first four survey questions short names.
# Column 1 is the row index ("...1") and column 2 is "Timestamp", so the
# questions start at position 3. The previous positions 2-5 were shifted by
# one and renamed the timestamp column to "Qr.experience".
renamed <- raw_data %>%
  rename(
    # How.would.you.rate.your.level.of.experience.using.R.
    "Qr.experience" = 3,
    # Compared.with.other.technical.topics ... how.difficult.do.you.expect ...
    "Qr.difficulties" = 4,
    # From.what.you.know.about.R..how.long.do.you.expect ...
    "Qr.knowledge" = 5,
    # How.do.you.think.you.would.go.about.the.process.of.learning.R.
    "Qr.progress" = 6
  )
Change column name
# A second sheet supplies the short question names as its column headers.
qnames <- gsheet2tbl(
  "https://docs.google.com/spreadsheets/d/1YVL5FGAyO-pBXS-8T2mNOw6mEoLBau2JBQbBeI7wv7s/edit?usp=sharing"
)

# Keep a local copy of that sheet.
# NOTE(review): the destination is a hard-coded Windows path.
write.csv(qnames, file = "D:\\PMP\\clean2.csv")

# Copy the raw responses and replace their column names, by position,
# with the headers of the names sheet.
rsurvey <- setNames(raw_data, names(qnames))
names(rsurvey)
## [1] "...1" "Qtime"
## [3] "Qr_experience" "Qr_difficulty"
## [5] "Qr_length_to_success" "Qhow_to_learn_r"
## [7] "Qreason_to_learn" "Qr_use"
## [9] "Qtools" "Qobstacles_to_starting"
## [11] "Qr_year" "Qr_learning_path"
## [13] "Qr_difficulty_experienced" "Qtime_to_proficiency"
## [15] "Qreason_experienced" "Qmost_difficult_aspect"
## [17] "Qr_how_often_used" "Qused_for"
## [19] "Qr_enjoyment" "Qrecommend"
## [21] "Qr_tools" "Qtidyverse_learning"
## [23] "Qtidyverse_today" "Qlike_best"
## [25] "Qlike_least" "Qr_problems"
## [27] "Qr_discover_packages" "Qr_share"
## [29] "Qr_change" "Qrobot_test"
## [31] "Qrmarkdown" "Qrmarkdown_apps"
## [33] "Qrmarkdown_change" "Qshiny"
## [35] "Qshiny_change" "Qpython_use"
## [37] "Qpython_apps" "Qpython_enjoy"
## [39] "Qpython_recommend" "Qpython_change"
## [41] "Qlanguages" "Qfirst_language"
## [43] "Qyear_born" "Qgender"
## [45] "Qethnicity" "Qdegree"
## [47] "Qcountry" "Qindustry"
## [49] "Qtitle" "Qwork_title"
## [51] "Qpeople" "Qevents"
## [53] "Qhear"
Clean variable names
# Normalise the column names with janitor, then list the result.
rsurvey <- janitor::clean_names(rsurvey)
names(rsurvey)
## [1] "x1" "qtime"
## [3] "qr_experience" "qr_difficulty"
## [5] "qr_length_to_success" "qhow_to_learn_r"
## [7] "qreason_to_learn" "qr_use"
## [9] "qtools" "qobstacles_to_starting"
## [11] "qr_year" "qr_learning_path"
## [13] "qr_difficulty_experienced" "qtime_to_proficiency"
## [15] "qreason_experienced" "qmost_difficult_aspect"
## [17] "qr_how_often_used" "qused_for"
## [19] "qr_enjoyment" "qrecommend"
## [21] "qr_tools" "qtidyverse_learning"
## [23] "qtidyverse_today" "qlike_best"
## [25] "qlike_least" "qr_problems"
## [27] "qr_discover_packages" "qr_share"
## [29] "qr_change" "qrobot_test"
## [31] "qrmarkdown" "qrmarkdown_apps"
## [33] "qrmarkdown_change" "qshiny"
## [35] "qshiny_change" "qpython_use"
## [37] "qpython_apps" "qpython_enjoy"
## [39] "qpython_recommend" "qpython_change"
## [41] "qlanguages" "qfirst_language"
## [43] "qyear_born" "qgender"
## [45] "qethnicity" "qdegree"
## [47] "qcountry" "qindustry"
## [49] "qtitle" "qwork_title"
## [51] "qpeople" "qevents"
## [53] "qhear"
Describe variables
# Descriptive statistics for every column.
psych::describe(rsurvey)
## vars n mean sd median trimmed mad min
## x1 1 1838 919.50 530.73 919.5 919.50 681.25 1
## qtime* 2 1838 913.24 526.89 912.5 913.05 676.07 1
## qr_experience* 3 1807 2.45 0.72 3.0 2.56 0.00 1
## qr_difficulty 4 8 3.50 0.53 3.5 3.50 0.74 3
## qr_length_to_success* 5 8 1.62 0.92 1.0 1.62 0.00 1
## qhow_to_learn_r* 6 8 1.75 0.89 1.5 1.75 0.74 1
## qreason_to_learn* 7 8 3.62 1.69 3.5 3.62 2.22 1
## qr_use* 8 8 4.50 2.45 4.5 4.50 2.97 1
## qtools* 9 8 4.00 1.85 4.5 4.00 2.22 1
## qobstacles_to_starting* 10 8 2.50 1.51 2.5 2.50 2.22 1
## qr_year 11 1676 2007.75 107.35 2015.0 2014.22 2.97 2
## qr_learning_path* 12 1794 19.89 9.70 20.0 19.37 13.34 1
## qr_difficulty_experienced 13 1793 2.89 0.90 3.0 2.89 1.48 1
## qtime_to_proficiency* 14 1797 3.60 1.11 3.0 3.64 1.48 1
## qreason_experienced* 15 1798 4.24 1.98 4.0 4.26 1.48 1
## qmost_difficult_aspect* 16 1783 63.12 29.62 51.0 60.84 31.13 1
## qr_how_often_used* 17 1798 3.39 1.86 5.0 3.49 0.00 1
## qused_for* 18 1797 172.02 96.24 154.0 168.63 93.40 1
## qr_enjoyment 19 1798 4.65 0.61 5.0 4.76 0.00 1
## qrecommend 20 1795 9.24 1.22 10.0 9.49 0.00 1
## qr_tools* 21 1783 364.84 165.40 356.0 373.51 212.01 1
## qtidyverse_learning* 22 1796 2.70 0.48 3.0 2.76 0.00 1
## qtidyverse_today* 23 1797 4.66 0.81 5.0 4.88 0.00 1
## qlike_best* 24 1453 621.46 365.18 629.0 622.56 481.84 1
## qlike_least* 25 1335 618.51 360.39 625.0 618.08 472.95 1
## qr_problems* 26 1795 56.26 41.19 42.0 53.24 47.44 1
## qr_discover_packages* 27 1791 204.35 98.58 252.0 210.34 99.33 1
## qr_share* 28 1723 153.39 102.14 154.0 152.20 139.36 1
## qr_change* 29 1798 2.38 0.59 2.0 2.42 0.00 1
## qrobot_test* 30 1807 16.66 4.49 16.0 16.39 0.00 1
## qrmarkdown* 31 1807 3.23 1.44 3.0 3.28 1.48 1
## qrmarkdown_apps* 32 1499 219.71 110.57 220.0 219.52 118.61 1
## qrmarkdown_change* 33 1794 2.38 0.58 2.0 2.40 0.00 1
## qshiny* 34 1805 3.43 1.26 4.0 3.54 1.48 1
## qshiny_change* 35 1775 2.43 0.58 2.0 2.45 0.00 1
## qpython_use* 36 1805 2.95 1.17 3.0 2.89 1.48 1
## qpython_apps* 37 1000 153.38 85.91 153.0 152.53 105.26 1
## qpython_enjoy 38 1019 3.25 1.08 3.0 3.27 1.48 1
## qpython_recommend 39 1015 7.37 2.17 8.0 7.61 2.97 1
## qpython_change* 40 1784 2.42 0.55 2.0 2.43 0.00 1
## qlanguages* 41 1797 492.39 237.25 536.0 503.43 269.83 1
## qfirst_language* 42 1801 84.88 40.32 81.0 84.12 53.37 1
## qyear_born 43 1731 1983.39 10.89 1985.0 1984.68 8.90 1878
## qgender* 44 1685 24.30 7.21 27.0 24.92 0.00 1
## qethnicity* 45 1711 51.38 16.91 60.0 55.04 0.00 1
## qdegree* 46 1784 6.50 3.03 8.0 6.31 4.45 1
## qcountry* 47 1690 62.52 30.87 84.0 65.96 7.41 1
## qindustry* 48 1794 61.87 30.54 48.0 60.75 29.65 1
## qtitle* 49 1498 341.40 212.54 317.5 335.57 243.89 1
## qwork_title* 50 1787 22.33 15.78 10.0 20.92 5.93 1
## qpeople 51 1646 7.35 48.26 2.0 2.31 2.97 0
## qevents* 52 694 16.03 17.16 13.0 13.65 17.79 1
## qhear* 53 1791 48.84 22.22 63.0 52.27 0.00 1
## max range skew kurtosis se
## x1 1838 1837 0.00 -1.20 12.38
## qtime* 1827 1826 0.00 -1.20 12.29
## qr_experience* 4 3 -0.85 -0.49 0.02
## qr_difficulty 4 1 0.00 -2.23 0.19
## qr_length_to_success* 3 2 0.66 -1.59 0.32
## qhow_to_learn_r* 3 2 0.40 -1.75 0.31
## qreason_to_learn* 6 5 -0.11 -1.55 0.60
## qr_use* 8 7 0.00 -1.65 0.87
## qtools* 6 5 -0.35 -1.60 0.65
## qobstacles_to_starting* 5 4 0.33 -1.58 0.53
## qr_year 2019 2017 -18.20 330.62 2.62
## qr_learning_path* 52 51 0.41 -1.37 0.23
## qr_difficulty_experienced 5 4 0.01 -0.18 0.02
## qtime_to_proficiency* 5 4 0.03 -1.13 0.03
## qreason_experienced* 8 7 0.02 -1.00 0.05
## qmost_difficult_aspect* 168 167 0.70 -0.08 0.70
## qr_how_often_used* 5 4 -0.37 -1.79 0.04
## qused_for* 368 367 0.30 -0.97 2.27
## qr_enjoyment 5 4 -1.96 4.58 0.01
## qrecommend 10 9 -2.20 6.80 0.03
## qr_tools* 627 626 -0.28 -0.95 3.92
## qtidyverse_learning* 3 2 -1.13 -0.05 0.01
## qtidyverse_today* 5 4 -2.47 5.54 0.02
## qlike_best* 1237 1236 -0.02 -1.29 9.58
## qlike_least* 1245 1244 0.00 -1.22 9.86
## qr_problems* 160 159 0.52 -0.77 0.97
## qr_discover_packages* 340 339 -0.46 -1.11 2.33
## qr_share* 348 347 0.02 -1.34 2.46
## qr_change* 4 3 -0.31 -0.68 0.01
## qrobot_test* 45 44 3.56 19.02 0.11
## qrmarkdown* 5 4 -0.24 -1.29 0.03
## qrmarkdown_apps* 463 462 -0.05 -0.69 2.86
## qrmarkdown_change* 4 3 -0.13 -0.58 0.01
## qshiny* 5 4 -0.56 -0.56 0.03
## qshiny_change* 4 3 -0.21 -0.66 0.01
## qpython_use* 5 4 0.24 -1.22 0.03
## qpython_apps* 323 322 0.01 -1.12 2.72
## qpython_enjoy 5 4 -0.13 -0.57 0.03
## qpython_recommend 10 9 -0.79 0.20 0.07
## qpython_change* 4 3 -0.07 -0.87 0.01
## qlanguages* 867 866 -0.35 -1.02 5.60
## qfirst_language* 157 156 0.13 -1.17 0.95
## qyear_born 2001 123 -2.14 11.44 0.26
## qgender* 53 52 -0.16 2.53 0.18
## qethnicity* 74 73 -1.56 0.78 0.41
## qdegree* 17 16 0.39 0.07 0.07
## qcountry* 92 91 -0.62 -1.25 0.75
## qindustry* 126 125 0.30 -1.18 0.72
## qtitle* 748 747 0.19 -1.22 5.49
## qwork_title* 56 55 0.54 -1.43 0.37
## qpeople 1000 1000 17.42 337.35 1.19
## qevents* 60 59 0.91 -0.48 0.65
## qhear* 68 67 -1.06 -0.71 0.53
Summary variables
# Base-R summary of every column.
summary(rsurvey)
## x1 qtime qr_experience qr_difficulty
## Min. : 1.0 Length:1838 Length:1838 Min. :3.0
## 1st Qu.: 460.2 Class :character Class :character 1st Qu.:3.0
## Median : 919.5 Mode :character Mode :character Median :3.5
## Mean : 919.5 Mean :3.5
## 3rd Qu.:1378.8 3rd Qu.:4.0
## Max. :1838.0 Max. :4.0
## NA's :1830
## qr_length_to_success qhow_to_learn_r qreason_to_learn qr_use
## Length:1838 Length:1838 Length:1838 Length:1838
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## qtools qobstacles_to_starting qr_year qr_learning_path
## Length:1838 Length:1838 Min. : 2 Length:1838
## Class :character Class :character 1st Qu.:2012 Class :character
## Mode :character Mode :character Median :2015 Mode :character
## Mean :2008
## 3rd Qu.:2017
## Max. :2019
## NA's :162
## qr_difficulty_experienced qtime_to_proficiency qreason_experienced
## Min. :1.000 Length:1838 Length:1838
## 1st Qu.:2.000 Class :character Class :character
## Median :3.000 Mode :character Mode :character
## Mean :2.887
## 3rd Qu.:3.000
## Max. :5.000
## NA's :45
## qmost_difficult_aspect qr_how_often_used qused_for qr_enjoyment
## Length:1838 Length:1838 Length:1838 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:4.000
## Mode :character Mode :character Mode :character Median :5.000
## Mean :4.647
## 3rd Qu.:5.000
## Max. :5.000
## NA's :40
## qrecommend qr_tools qtidyverse_learning qtidyverse_today
## Min. : 1.00 Length:1838 Length:1838 Length:1838
## 1st Qu.: 9.00 Class :character Class :character Class :character
## Median :10.00 Mode :character Mode :character Mode :character
## Mean : 9.24
## 3rd Qu.:10.00
## Max. :10.00
## NA's :43
## qlike_best qlike_least qr_problems qr_discover_packages
## Length:1838 Length:1838 Length:1838 Length:1838
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## qr_share qr_change qrobot_test qrmarkdown
## Length:1838 Length:1838 Length:1838 Length:1838
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## qrmarkdown_apps qrmarkdown_change qshiny qshiny_change
## Length:1838 Length:1838 Length:1838 Length:1838
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## qpython_use qpython_apps qpython_enjoy qpython_recommend
## Length:1838 Length:1838 Min. :1.000 Min. : 1.000
## Class :character Class :character 1st Qu.:3.000 1st Qu.: 6.000
## Mode :character Mode :character Median :3.000 Median : 8.000
## Mean :3.253 Mean : 7.374
## 3rd Qu.:4.000 3rd Qu.: 9.000
## Max. :5.000 Max. :10.000
## NA's :819 NA's :823
## qpython_change qlanguages qfirst_language qyear_born
## Length:1838 Length:1838 Length:1838 Min. :1878
## Class :character Class :character Class :character 1st Qu.:1979
## Mode :character Mode :character Mode :character Median :1985
## Mean :1983
## 3rd Qu.:1991
## Max. :2001
## NA's :107
## qgender qethnicity qdegree qcountry
## Length:1838 Length:1838 Length:1838 Length:1838
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## qindustry qtitle qwork_title qpeople
## Length:1838 Length:1838 Length:1838 Min. : 0.000
## Class :character Class :character Class :character 1st Qu.: 0.000
## Mode :character Mode :character Mode :character Median : 2.000
## Mean : 7.348
## 3rd Qu.: 5.000
## Max. :1000.000
## NA's :192
## qevents qhear
## Length:1838 Length:1838
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
Data completeness
# Per-column completeness and distribution overview from skimr.
# The data frame is piped in rather than passed by name; the summary printed
# below labels its input "Piped data".
rsurvey %>%
skimr::skim()
Data summary
Name |
Piped data |
Number of rows |
1838 |
Number of columns |
53 |
_______________________ |
|
Column type frequency: |
|
character |
43 |
numeric |
10 |
________________________ |
|
Group variables |
None |
Variable type: character
qtime |
0 |
1.00 |
16 |
19 |
0 |
1827 |
0 |
qr_experience |
31 |
0.98 |
4 |
12 |
0 |
4 |
0 |
qr_length_to_success |
1830 |
0.00 |
5 |
6 |
0 |
3 |
0 |
qhow_to_learn_r |
1830 |
0.00 |
67 |
96 |
0 |
3 |
0 |
qreason_to_learn |
1830 |
0.00 |
25 |
95 |
0 |
6 |
0 |
qr_use |
1830 |
0.00 |
35 |
86 |
0 |
8 |
0 |
qtools |
1830 |
0.00 |
6 |
31 |
0 |
6 |
0 |
qobstacles_to_starting |
1830 |
0.00 |
4 |
96 |
0 |
5 |
0 |
qr_learning_path |
44 |
0.98 |
4 |
128 |
0 |
52 |
0 |
qtime_to_proficiency |
41 |
0.98 |
4 |
33 |
0 |
5 |
0 |
qreason_experienced |
40 |
0.98 |
27 |
70 |
0 |
8 |
0 |
qmost_difficult_aspect |
55 |
0.97 |
4 |
1057 |
0 |
168 |
0 |
qr_how_often_used |
40 |
0.98 |
20 |
36 |
0 |
5 |
0 |
qused_for |
41 |
0.98 |
8 |
248 |
0 |
368 |
0 |
qr_tools |
55 |
0.97 |
4 |
262 |
0 |
627 |
0 |
qtidyverse_learning |
42 |
0.98 |
2 |
34 |
0 |
3 |
0 |
qtidyverse_today |
41 |
0.98 |
5 |
30 |
0 |
5 |
0 |
qlike_best |
385 |
0.79 |
1 |
519 |
0 |
1237 |
0 |
qlike_least |
503 |
0.73 |
1 |
882 |
0 |
1245 |
0 |
qr_problems |
43 |
0.98 |
7 |
407 |
0 |
160 |
0 |
qr_discover_packages |
47 |
0.97 |
7 |
469 |
0 |
340 |
0 |
qr_share |
115 |
0.94 |
1 |
383 |
0 |
348 |
0 |
qr_change |
40 |
0.98 |
23 |
28 |
0 |
4 |
0 |
qrobot_test |
31 |
0.98 |
1 |
60 |
0 |
45 |
0 |
qrmarkdown |
31 |
0.98 |
22 |
47 |
0 |
5 |
0 |
qrmarkdown_apps |
339 |
0.82 |
3 |
384 |
0 |
463 |
0 |
qrmarkdown_change |
44 |
0.98 |
23 |
28 |
0 |
4 |
0 |
qshiny |
33 |
0.98 |
27 |
71 |
0 |
5 |
0 |
qshiny_change |
63 |
0.97 |
23 |
28 |
0 |
4 |
0 |
qpython_use |
33 |
0.98 |
18 |
47 |
0 |
5 |
0 |
qpython_apps |
838 |
0.54 |
3 |
175 |
0 |
323 |
0 |
qpython_change |
54 |
0.97 |
23 |
28 |
0 |
4 |
0 |
qlanguages |
41 |
0.98 |
3 |
140 |
0 |
867 |
0 |
qfirst_language |
37 |
0.98 |
1 |
96 |
0 |
157 |
0 |
qgender |
153 |
0.92 |
1 |
46 |
0 |
53 |
0 |
qethnicity |
127 |
0.93 |
3 |
251 |
0 |
74 |
0 |
qdegree |
54 |
0.97 |
3 |
152 |
0 |
17 |
0 |
qcountry |
148 |
0.92 |
4 |
33 |
0 |
92 |
0 |
qindustry |
44 |
0.98 |
3 |
87 |
0 |
126 |
0 |
qtitle |
340 |
0.82 |
2 |
74 |
0 |
748 |
0 |
qwork_title |
51 |
0.97 |
4 |
104 |
0 |
56 |
0 |
qevents |
1144 |
0.38 |
13 |
195 |
0 |
60 |
0 |
qhear |
47 |
0.97 |
2 |
40 |
0 |
68 |
0 |
Variable type: numeric
x1 |
0 |
1.00 |
919.50 |
530.73 |
1 |
460.25 |
919.5 |
1378.75 |
1838 |
▇▇▇▇▇ |
qr_difficulty |
1830 |
0.00 |
3.50 |
0.53 |
3 |
3.00 |
3.5 |
4.00 |
4 |
▇▁▁▁▇ |
qr_year |
162 |
0.91 |
2007.75 |
107.35 |
2 |
2012.00 |
2015.0 |
2017.00 |
2019 |
▁▁▁▁▇ |
qr_difficulty_experienced |
45 |
0.98 |
2.89 |
0.90 |
1 |
2.00 |
3.0 |
3.00 |
5 |
▁▅▇▃▁ |
qr_enjoyment |
40 |
0.98 |
4.65 |
0.61 |
1 |
4.00 |
5.0 |
5.00 |
5 |
▁▁▁▃▇ |
qrecommend |
43 |
0.98 |
9.24 |
1.22 |
1 |
9.00 |
10.0 |
10.00 |
10 |
▁▁▁▂▇ |
qpython_enjoy |
819 |
0.55 |
3.25 |
1.08 |
1 |
3.00 |
3.0 |
4.00 |
5 |
▁▃▇▆▃ |
qpython_recommend |
823 |
0.55 |
7.37 |
2.17 |
1 |
6.00 |
8.0 |
9.00 |
10 |
▁▂▅▇▇ |
qyear_born |
107 |
0.94 |
1983.39 |
10.89 |
1878 |
1979.00 |
1985.0 |
1991.00 |
2001 |
▁▁▁▂▇ |
qpeople |
192 |
0.90 |
7.35 |
48.26 |
0 |
0.00 |
2.0 |
5.00 |
1000 |
▇▁▁▁▁ |
Check data values ‘qr_year’
# List the ten smallest values of qr_year to spot implausible entries.
rsurvey %>%
  arrange(qr_year) %>%
  select(qr_year) %>%
  head(n = 10)
## # A tibble: 10 × 1
## qr_year
## <dbl>
## 1 2
## 2 6
## 3 13
## 4 18
## 5 207
## 6 1977
## 7 1985
## 8 1989
## 9 1989
## 10 1990
Modifying Data - mutate() - ‘qr_year’
# qr_year2 is qr_year with every value before 1977 blanked out to NA.
# which() skips rows where qr_year is already NA, so they simply stay NA.
rsurvey <- rsurvey %>%
  mutate(qr_year2 = replace(qr_year, which(qr_year < 1977), NA))

# Compare the original and cleaned columns on the ten smallest years.
rsurvey %>%
  arrange(qr_year) %>%
  select(qr_year, qr_year2) %>%
  head(n = 10)
## # A tibble: 10 × 2
## qr_year qr_year2
## <dbl> <dbl>
## 1 2 NA
## 2 6 NA
## 3 13 NA
## 4 18 NA
## 5 207 NA
## 6 1977 1977
## 7 1985 1985
## 8 1989 1989
## 9 1989 1989
## 10 1990 1990
Check data values ‘qr_experience’
# Frequency of each self-rated experience level, including missing answers.
count(rsurvey, qr_experience)
## # A tibble: 5 × 2
## qr_experience n
## <chr> <int>
## 1 Beginner 233
## 2 Expert 529
## 3 Intermediate 1037
## 4 None 8
## 5 <NA> 31
Reordering Categories - factor() - ‘qr_experience’
# Store experience as a factor whose levels run from least to most
# experienced, instead of the alphabetical order of the raw strings.
# NA is not listed as a level: factor() drops it through its default
# `exclude = NA`, so naming it had no effect. Missing answers stay NA.
rsurvey <- rsurvey %>%
  mutate(
    qr_experience2 = factor(
      qr_experience,
      levels = c("None", "Beginner", "Intermediate", "Expert")
    )
  )

# Cross-check that every raw value maps to the matching factor level.
rsurvey %>%
  count(qr_experience, qr_experience2)
## # A tibble: 5 × 3
## qr_experience qr_experience2 n
## <chr> <fct> <int>
## 1 Beginner Beginner 233
## 2 Expert Expert 529
## 3 Intermediate Intermediate 1037
## 4 None None 8
## 5 <NA> <NA> 31
Check data values ‘qindustry’
# Frequency of each raw industry answer.
count(rsurvey, qindustry)
## # A tibble: 127 × 2
## qindustry n
## <chr> <int>
## 1 Academia 2
## 2 Accommodation and Food Services 12
## 3 Advertising 1
## 4 Aerospace 2
## 5 Agriculture 5
## 6 Agriculture and animal science 1
## 7 Agrifood 1
## 8 Analytics Consulting Company 1
## 9 Any 1
## 10 Arts and Entertainment 14
## # … with 117 more rows
Convert values to lowercase
# qindustry2: lower-case copy of qindustry with surrounding and repeated
# whitespace collapsed by str_squish().
rsurvey <- rsurvey %>%
  mutate(
    qindustry2 = qindustry %>%
      tolower() %>%
      str_squish()
  )

# Show each raw value next to its normalised form.
rsurvey %>%
  count(qindustry, qindustry2)
## # A tibble: 127 × 3
## qindustry qindustry2 n
## <chr> <chr> <int>
## 1 Academia academia 2
## 2 Accommodation and Food Services accommodation and food services 12
## 3 Advertising advertising 1
## 4 Aerospace aerospace 2
## 5 Agriculture agriculture 5
## 6 Agriculture and animal science agriculture and animal science 1
## 7 Agrifood agrifood 1
## 8 Analytics Consulting Company analytics consulting company 1
## 9 Any any 1
## 10 Arts and Entertainment arts and entertainment 14
## # … with 117 more rows
Agriculture, agriculture and animal science, and agrifood can all
be combined into a single “agriculture” category
# Any answer containing "agri" becomes "agriculture"; everything else,
# including missing answers, is left as it was.
rsurvey <- rsurvey %>%
  mutate(
    qindustry2 = if_else(
      str_detect(qindustry2, "agri"),
      "agriculture",
      qindustry2
    )
  )

# Show each raw value next to its current group.
rsurvey %>%
  count(qindustry, qindustry2)
## # A tibble: 127 × 3
## qindustry qindustry2 n
## <chr> <chr> <int>
## 1 Academia academia 2
## 2 Accommodation and Food Services accommodation and food services 12
## 3 Advertising advertising 1
## 4 Aerospace aerospace 2
## 5 Agriculture agriculture 5
## 6 Agriculture and animal science agriculture 1
## 7 Agrifood agriculture 1
## 8 Analytics Consulting Company analytics consulting company 1
## 9 Any any 1
## 10 Arts and Entertainment arts and entertainment 14
## # … with 117 more rows
Combine into common values
# Collapse the normalised industry answers into a small set of groups.
# Rules are tried from top to bottom and the first match wins, so an answer
# that contains keywords from several groups lands in the earliest one.
rsurvey <- rsurvey %>%
  mutate(
    qindustry2 = case_when(
      str_detect(qindustry2, "agri") ~ "agriculture",
      str_detect(qindustry2, "health") ~ "health",
      str_detect(
        qindustry2,
        "education|academia|university|research"
      ) ~ "education and research",
      str_detect(
        qindustry2,
        "marketing|business|trade|ecommerce"
      ) ~ "business",
      str_detect(
        qindustry2,
        "information|analytics|software|cybersecurity|digital|telec"
      ) ~ "information technologies",
      str_detect(
        qindustry2,
        "envi|forest|geo|natural|wildlife|sustain"
      ) ~ "environment",
      str_detect(qindustry2, "law|legal") ~ "law",
      str_detect(qindustry2, "media|journalism") ~ "media",
      # Catch-all: every remaining non-missing answer becomes "others".
      # This used to be the pattern "profit|", whose empty alternative
      # matches any string; the rule now says so explicitly and gives the
      # same result.
      !is.na(qindustry2) ~ "others",
      # Only missing answers reach this point, and they stay NA.
      TRUE ~ qindustry2
    )
  )

# Show each raw value next to its final group.
rsurvey %>%
  count(qindustry, qindustry2)
## # A tibble: 127 × 3
## qindustry qindustry2 n
## <chr> <chr> <int>
## 1 Academia education and research 2
## 2 Accommodation and Food Services others 12
## 3 Advertising others 1
## 4 Aerospace others 2
## 5 Agriculture agriculture 5
## 6 Agriculture and animal science agriculture 1
## 7 Agrifood agriculture 1
## 8 Analytics Consulting Company information technologies 1
## 9 Any others 1
## 10 Arts and Entertainment others 14
## # … with 117 more rows
Distinct rows/ values - 1
# Number of different industry groups (a missing value counts as one).
rsurvey %>%
  distinct(qindustry2) %>% # keep one row per group
  tally()
## # A tibble: 1 × 1
## n
## <int>
## 1 10
Distinct rows/ values - 2
# Number of rows left after dropping rows that are exact duplicates
# across all columns.
rsurvey %>%
  distinct() %>%
  summarise(n = n())
## # A tibble: 1 × 1
## n
## <int>
## 1 1838
Check data values ‘qused_for’
# First few "used for" answers; each one is a comma-separated list.
rsurvey %>%
  pull(qused_for) %>%
  head()
## [1] "Statistical analysis, Data transformation, Modeling, Visualization, Machine learning, Text processing"
## [2] "Statistical analysis, Data transformation, Visualization"
## [3] "Statistical analysis, Data transformation, Visualization"
## [4] "Data transformation"
## [5] "Statistical analysis, Data transformation, Modeling, Visualization"
## [6] "Statistical analysis, Data transformation, Modeling, Visualization, Machine learning, Text processing"
Highest number of commas in a cell
# Largest number of commas in any single answer.
# str_count() gives NA for unanswered rows, and one NA is enough to make
# max() return NA, which is why the result printed below (from the earlier
# run) is NA. na.rm = TRUE ignores those rows so the real maximum is
# reported.
rsurvey$qused_for %>%
  str_count(pattern = ",") %>%
  max(na.rm = TRUE)
## [1] NA
Separate values
# Split the comma-separated "used for" answers into one column per use.
# The separator also swallows any whitespace after the comma, so the pieces
# no longer begin with a stray space (the output printed below, from the
# earlier run, still shows values such as " Da…").
# fill = "right" pads shorter answers with NA on the right. The default
# pads the same way but emits a warning as well.
rsurvey %>%
  select(qused_for) %>%
  separate(
    qused_for,
    into = paste0("use_", 1:11),
    sep = ",\\s*",
    fill = "right"
  )
## # A tibble: 1,838 × 11
## use_1 use_2 use_3 use_4 use_5 use_6 use_7 use_8 use_9 use_10 use_11
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Statistical an… " Da… " Mo… " Vi… " Ma… " Te… <NA> <NA> <NA> <NA> <NA>
## 2 Statistical an… " Da… " Vi… <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 3 Statistical an… " Da… " Vi… <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 4 Data transform… <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 5 Statistical an… " Da… " Mo… " Vi… <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 6 Statistical an… " Da… " Mo… " Vi… " Ma… " Te… <NA> <NA> <NA> <NA> <NA>
## 7 Statistical an… " Da… " Mo… " Vi… <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 8 Statistical an… " Da… " Mo… " Vi… " Ma… " Te… " Su… <NA> <NA> <NA> <NA>
## 9 Statistical an… " Da… " Mo… " Vi… " Ma… " Te… <NA> <NA> <NA> <NA> <NA>
## 10 Statistical an… " Da… " Mo… " Vi… <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## # … with 1,828 more rows
Check data values ‘qr_difficulty_experienced’
# One-row overview of qr_difficulty_experienced:
#   n   - number of rows
#   nd  - number of distinct values (a missing value counts as one)
#   na  - number of missing values
#   med - median of the non-missing values
with(
  rsurvey,
  tibble(
    n = length(qr_difficulty_experienced),
    nd = n_distinct(qr_difficulty_experienced),
    na = sum(is.na(qr_difficulty_experienced)),
    med = median(qr_difficulty_experienced, na.rm = TRUE)
  )
)
## # A tibble: 1 × 4
## n nd na med
## <int> <int> <int> <dbl>
## 1 1838 6 45 3