Data Cleaning with R

Library

#install.packages("janitor")
library(janitor)
library(dplyr)
library(psych)
library(readr)
library(skimr)
library(jmv)
library(tidyr)
library(gsheet)
library(stringr)

Data

# Download the raw survey responses from the shared Google Sheet as a tibble.
raw_data <-
  "https://docs.google.com/spreadsheets/d/19AdprBOX8Y0ZTly_EgFcZYi4ay4pGFHXmLRCWsUv4Oc/edit?usp=sharing" %>%
  gsheet2tbl()

Data Column Name

# Show the first ten column names of the raw data.
head(names(raw_data), n = 10)

##  [1] "...1"                                                                                                                                                                                                                   
##  [2] "Timestamp"                                                                                                                                                                                                              
##  [3] "How.would.you.rate.your.level.of.experience.using.R."                                                                                                                                                                   
##  [4] "Compared.with.other.technical.topics.you.ve.learned.in.school.and.on.the.job..on.a.scale.of.1.to.5..how.difficult.do.you.expect.learning.R.to.be."                                                                      
##  [5] "From.what.you.know.about.R..how.long.do.you.expect.that.it.will.take.for.you.to.learn.enough.to.use.R.productively."                                                                                                    
##  [6] "How.do.you.think.you.would.go.about.the.process.of.learning.R."                                                                                                                                                         
##  [7] "Which.statement.most.closely.reflects.the.primary.reason.why.you.are.interested.in.learning.R."                                                                                                                         
##  [8] "If.you.were.to.learn.R..what.would.do.you.think.you.would.use.it.for...check.all.that.apply."                                                                                                                           
##  [9] "Which.analytical.tools.do.you.use.today.for.the.functions.that.you.might.learn.R.for...please.check.all.that.apply."                                                                                                    
## [10] "What.do.you.think.is.the.biggest.obstacle.you.must.overcome.in.trying.to.learn.R..The.choices.below.are.only.suggestions..if.we.haven.t.listed.your.obstacle..please.choose..Other..and.add.your.obstacle.in.the.text.."

Data value

# Preview the first rows of the raw data.
head(raw_data)

## # A tibble: 6 × 53
##    ...1 Timest…¹ How.w…² Compa…³ From.…⁴ How.d…⁵ Which…⁶ If.yo…⁷ Which…⁸ What.…⁹
##   <dbl> <chr>    <chr>     <dbl> <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
## 1     1 12/13/2… Expert       NA <NA>    <NA>    <NA>    <NA>    <NA>    <NA>   
## 2     2 12/13/2… Beginn…      NA <NA>    <NA>    <NA>    <NA>    <NA>    <NA>   
## 3     3 12/13/2… Interm…      NA <NA>    <NA>    <NA>    <NA>    <NA>    <NA>   
## 4     4 12/13/2… Interm…      NA <NA>    <NA>    <NA>    <NA>    <NA>    <NA>   
## 5     5 12/13/2… Interm…      NA <NA>    <NA>    <NA>    <NA>    <NA>    <NA>   
## 6     6 12/13/2… Expert       NA <NA>    <NA>    <NA>    <NA>    <NA>    <NA>   
## # … with 43 more variables: What.year.did.you.first.start.learning.R. <dbl>,
## #   How.did.you.learn.R..If.you.used.multiple.methods..please.select.the.one.you.used.the.most. <chr>,
## #   Compared.with.other.technical.topics.you.ve.learned.in.school.and.on.the.job..on.a.scale.of.1.to.5..how.difficult.has.it.been.for.you.to.learn.R. <dbl>,
## #   Roughly.how.long.did.it.take.you.to.achieve.proficiency.in.R. <chr>,
## #   Which.statement.most.closely.reflects.the.primary.reason.why.you.learned.R. <chr>,
## #   What.do.you.think.was.the.biggest.obstacle.you.had.to.overcome.in.learning.R..The.choices.below.are.only.suggestions..if.we.haven.t.listed.your.obstacle..please.choose..Other..and.add.your.obstacle.in.the.text.. <chr>,
## #   How.often.do.you.use.R.today..either.for.professional.or.personal.projects. <chr>, …

Column Rename

# Give the first four survey questions short names.
# Column 1 is the row index ("...1") and column 2 is "Timestamp", so the
# questions start at column 3. The previous indices (2 to 5) were shifted by
# one and renamed the timestamp column instead of the experience question.
renamed <- raw_data %>%
  rename(
    # How.would.you.rate.your.level.of.experience.using.R.
    "Qr.experience" = names(raw_data)[3],
    # Compared.with.other.technical.topics ... how.difficult.do.you.expect ...
    "Qr.difficulties" = names(raw_data)[4],
    # From.what.you.know.about.R..how.long.do.you.expect ...
    "Qr.knowledge" = names(raw_data)[5],
    # How.do.you.think.you.would.go.about.the.process.of.learning.R.
    "Qr.progress" = names(raw_data)[6]
  )

Change column name

# Fetch the sheet whose column headers are the short question names.
qnames <-
  "https://docs.google.com/spreadsheets/d/1YVL5FGAyO-pBXS-8T2mNOw6mEoLBau2JBQbBeI7wv7s/edit?usp=sharing" %>%
  gsheet2tbl()
# Save a local copy of that sheet.
write.csv(qnames, file = "D:\\PMP\\clean2.csv")
# Copy the raw data and replace its column names, position by position,
# with the names taken from `qnames`; then list the new names.
rsurvey <- setNames(raw_data, names(qnames))
names(rsurvey)

##  [1] "...1"                      "Qtime"                    
##  [3] "Qr_experience"             "Qr_difficulty"            
##  [5] "Qr_length_to_success"      "Qhow_to_learn_r"          
##  [7] "Qreason_to_learn"          "Qr_use"                   
##  [9] "Qtools"                    "Qobstacles_to_starting"   
## [11] "Qr_year"                   "Qr_learning_path"         
## [13] "Qr_difficulty_experienced" "Qtime_to_proficiency"     
## [15] "Qreason_experienced"       "Qmost_difficult_aspect"   
## [17] "Qr_how_often_used"         "Qused_for"                
## [19] "Qr_enjoyment"              "Qrecommend"               
## [21] "Qr_tools"                  "Qtidyverse_learning"      
## [23] "Qtidyverse_today"          "Qlike_best"               
## [25] "Qlike_least"               "Qr_problems"              
## [27] "Qr_discover_packages"      "Qr_share"                 
## [29] "Qr_change"                 "Qrobot_test"              
## [31] "Qrmarkdown"                "Qrmarkdown_apps"          
## [33] "Qrmarkdown_change"         "Qshiny"                   
## [35] "Qshiny_change"             "Qpython_use"              
## [37] "Qpython_apps"              "Qpython_enjoy"            
## [39] "Qpython_recommend"         "Qpython_change"           
## [41] "Qlanguages"                "Qfirst_language"          
## [43] "Qyear_born"                "Qgender"                  
## [45] "Qethnicity"                "Qdegree"                  
## [47] "Qcountry"                  "Qindustry"                
## [49] "Qtitle"                    "Qwork_title"              
## [51] "Qpeople"                   "Qevents"                  
## [53] "Qhear"

Clean variable names

# Normalise the column names with janitor, then list the result.
rsurvey <- janitor::clean_names(rsurvey)
names(rsurvey)

##  [1] "x1"                        "qtime"                    
##  [3] "qr_experience"             "qr_difficulty"            
##  [5] "qr_length_to_success"      "qhow_to_learn_r"          
##  [7] "qreason_to_learn"          "qr_use"                   
##  [9] "qtools"                    "qobstacles_to_starting"   
## [11] "qr_year"                   "qr_learning_path"         
## [13] "qr_difficulty_experienced" "qtime_to_proficiency"     
## [15] "qreason_experienced"       "qmost_difficult_aspect"   
## [17] "qr_how_often_used"         "qused_for"                
## [19] "qr_enjoyment"              "qrecommend"               
## [21] "qr_tools"                  "qtidyverse_learning"      
## [23] "qtidyverse_today"          "qlike_best"               
## [25] "qlike_least"               "qr_problems"              
## [27] "qr_discover_packages"      "qr_share"                 
## [29] "qr_change"                 "qrobot_test"              
## [31] "qrmarkdown"                "qrmarkdown_apps"          
## [33] "qrmarkdown_change"         "qshiny"                   
## [35] "qshiny_change"             "qpython_use"              
## [37] "qpython_apps"              "qpython_enjoy"            
## [39] "qpython_recommend"         "qpython_change"           
## [41] "qlanguages"                "qfirst_language"          
## [43] "qyear_born"                "qgender"                  
## [45] "qethnicity"                "qdegree"                  
## [47] "qcountry"                  "qindustry"                
## [49] "qtitle"                    "qwork_title"              
## [51] "qpeople"                   "qevents"                  
## [53] "qhear"

Describe variables

# Descriptive statistics for every column of the survey data.
psych::describe(rsurvey)

##                           vars    n    mean     sd median trimmed    mad  min
## x1                           1 1838  919.50 530.73  919.5  919.50 681.25    1
## qtime*                       2 1838  913.24 526.89  912.5  913.05 676.07    1
## qr_experience*               3 1807    2.45   0.72    3.0    2.56   0.00    1
## qr_difficulty                4    8    3.50   0.53    3.5    3.50   0.74    3
## qr_length_to_success*        5    8    1.62   0.92    1.0    1.62   0.00    1
## qhow_to_learn_r*             6    8    1.75   0.89    1.5    1.75   0.74    1
## qreason_to_learn*            7    8    3.62   1.69    3.5    3.62   2.22    1
## qr_use*                      8    8    4.50   2.45    4.5    4.50   2.97    1
## qtools*                      9    8    4.00   1.85    4.5    4.00   2.22    1
## qobstacles_to_starting*     10    8    2.50   1.51    2.5    2.50   2.22    1
## qr_year                     11 1676 2007.75 107.35 2015.0 2014.22   2.97    2
## qr_learning_path*           12 1794   19.89   9.70   20.0   19.37  13.34    1
## qr_difficulty_experienced   13 1793    2.89   0.90    3.0    2.89   1.48    1
## qtime_to_proficiency*       14 1797    3.60   1.11    3.0    3.64   1.48    1
## qreason_experienced*        15 1798    4.24   1.98    4.0    4.26   1.48    1
## qmost_difficult_aspect*     16 1783   63.12  29.62   51.0   60.84  31.13    1
## qr_how_often_used*          17 1798    3.39   1.86    5.0    3.49   0.00    1
## qused_for*                  18 1797  172.02  96.24  154.0  168.63  93.40    1
## qr_enjoyment                19 1798    4.65   0.61    5.0    4.76   0.00    1
## qrecommend                  20 1795    9.24   1.22   10.0    9.49   0.00    1
## qr_tools*                   21 1783  364.84 165.40  356.0  373.51 212.01    1
## qtidyverse_learning*        22 1796    2.70   0.48    3.0    2.76   0.00    1
## qtidyverse_today*           23 1797    4.66   0.81    5.0    4.88   0.00    1
## qlike_best*                 24 1453  621.46 365.18  629.0  622.56 481.84    1
## qlike_least*                25 1335  618.51 360.39  625.0  618.08 472.95    1
## qr_problems*                26 1795   56.26  41.19   42.0   53.24  47.44    1
## qr_discover_packages*       27 1791  204.35  98.58  252.0  210.34  99.33    1
## qr_share*                   28 1723  153.39 102.14  154.0  152.20 139.36    1
## qr_change*                  29 1798    2.38   0.59    2.0    2.42   0.00    1
## qrobot_test*                30 1807   16.66   4.49   16.0   16.39   0.00    1
## qrmarkdown*                 31 1807    3.23   1.44    3.0    3.28   1.48    1
## qrmarkdown_apps*            32 1499  219.71 110.57  220.0  219.52 118.61    1
## qrmarkdown_change*          33 1794    2.38   0.58    2.0    2.40   0.00    1
## qshiny*                     34 1805    3.43   1.26    4.0    3.54   1.48    1
## qshiny_change*              35 1775    2.43   0.58    2.0    2.45   0.00    1
## qpython_use*                36 1805    2.95   1.17    3.0    2.89   1.48    1
## qpython_apps*               37 1000  153.38  85.91  153.0  152.53 105.26    1
## qpython_enjoy               38 1019    3.25   1.08    3.0    3.27   1.48    1
## qpython_recommend           39 1015    7.37   2.17    8.0    7.61   2.97    1
## qpython_change*             40 1784    2.42   0.55    2.0    2.43   0.00    1
## qlanguages*                 41 1797  492.39 237.25  536.0  503.43 269.83    1
## qfirst_language*            42 1801   84.88  40.32   81.0   84.12  53.37    1
## qyear_born                  43 1731 1983.39  10.89 1985.0 1984.68   8.90 1878
## qgender*                    44 1685   24.30   7.21   27.0   24.92   0.00    1
## qethnicity*                 45 1711   51.38  16.91   60.0   55.04   0.00    1
## qdegree*                    46 1784    6.50   3.03    8.0    6.31   4.45    1
## qcountry*                   47 1690   62.52  30.87   84.0   65.96   7.41    1
## qindustry*                  48 1794   61.87  30.54   48.0   60.75  29.65    1
## qtitle*                     49 1498  341.40 212.54  317.5  335.57 243.89    1
## qwork_title*                50 1787   22.33  15.78   10.0   20.92   5.93    1
## qpeople                     51 1646    7.35  48.26    2.0    2.31   2.97    0
## qevents*                    52  694   16.03  17.16   13.0   13.65  17.79    1
## qhear*                      53 1791   48.84  22.22   63.0   52.27   0.00    1
##                            max range   skew kurtosis    se
## x1                        1838  1837   0.00    -1.20 12.38
## qtime*                    1827  1826   0.00    -1.20 12.29
## qr_experience*               4     3  -0.85    -0.49  0.02
## qr_difficulty                4     1   0.00    -2.23  0.19
## qr_length_to_success*        3     2   0.66    -1.59  0.32
## qhow_to_learn_r*             3     2   0.40    -1.75  0.31
## qreason_to_learn*            6     5  -0.11    -1.55  0.60
## qr_use*                      8     7   0.00    -1.65  0.87
## qtools*                      6     5  -0.35    -1.60  0.65
## qobstacles_to_starting*      5     4   0.33    -1.58  0.53
## qr_year                   2019  2017 -18.20   330.62  2.62
## qr_learning_path*           52    51   0.41    -1.37  0.23
## qr_difficulty_experienced    5     4   0.01    -0.18  0.02
## qtime_to_proficiency*        5     4   0.03    -1.13  0.03
## qreason_experienced*         8     7   0.02    -1.00  0.05
## qmost_difficult_aspect*    168   167   0.70    -0.08  0.70
## qr_how_often_used*           5     4  -0.37    -1.79  0.04
## qused_for*                 368   367   0.30    -0.97  2.27
## qr_enjoyment                 5     4  -1.96     4.58  0.01
## qrecommend                  10     9  -2.20     6.80  0.03
## qr_tools*                  627   626  -0.28    -0.95  3.92
## qtidyverse_learning*         3     2  -1.13    -0.05  0.01
## qtidyverse_today*            5     4  -2.47     5.54  0.02
## qlike_best*               1237  1236  -0.02    -1.29  9.58
## qlike_least*              1245  1244   0.00    -1.22  9.86
## qr_problems*               160   159   0.52    -0.77  0.97
## qr_discover_packages*      340   339  -0.46    -1.11  2.33
## qr_share*                  348   347   0.02    -1.34  2.46
## qr_change*                   4     3  -0.31    -0.68  0.01
## qrobot_test*                45    44   3.56    19.02  0.11
## qrmarkdown*                  5     4  -0.24    -1.29  0.03
## qrmarkdown_apps*           463   462  -0.05    -0.69  2.86
## qrmarkdown_change*           4     3  -0.13    -0.58  0.01
## qshiny*                      5     4  -0.56    -0.56  0.03
## qshiny_change*               4     3  -0.21    -0.66  0.01
## qpython_use*                 5     4   0.24    -1.22  0.03
## qpython_apps*              323   322   0.01    -1.12  2.72
## qpython_enjoy                5     4  -0.13    -0.57  0.03
## qpython_recommend           10     9  -0.79     0.20  0.07
## qpython_change*              4     3  -0.07    -0.87  0.01
## qlanguages*                867   866  -0.35    -1.02  5.60
## qfirst_language*           157   156   0.13    -1.17  0.95
## qyear_born                2001   123  -2.14    11.44  0.26
## qgender*                    53    52  -0.16     2.53  0.18
## qethnicity*                 74    73  -1.56     0.78  0.41
## qdegree*                    17    16   0.39     0.07  0.07
## qcountry*                   92    91  -0.62    -1.25  0.75
## qindustry*                 126   125   0.30    -1.18  0.72
## qtitle*                    748   747   0.19    -1.22  5.49
## qwork_title*                56    55   0.54    -1.43  0.37
## qpeople                   1000  1000  17.42   337.35  1.19
## qevents*                    60    59   0.91    -0.48  0.65
## qhear*                      68    67  -1.06    -0.71  0.53

Summary variables

# Base-R summary of every column (quantiles and NA counts for numeric
# columns, length/class/mode for character columns).
summary(rsurvey)

##        x1            qtime           qr_experience      qr_difficulty 
##  Min.   :   1.0   Length:1838        Length:1838        Min.   :3.0   
##  1st Qu.: 460.2   Class :character   Class :character   1st Qu.:3.0   
##  Median : 919.5   Mode  :character   Mode  :character   Median :3.5   
##  Mean   : 919.5                                         Mean   :3.5   
##  3rd Qu.:1378.8                                         3rd Qu.:4.0   
##  Max.   :1838.0                                         Max.   :4.0   
##                                                         NA's   :1830  
##  qr_length_to_success qhow_to_learn_r    qreason_to_learn      qr_use         
##  Length:1838          Length:1838        Length:1838        Length:1838       
##  Class :character     Class :character   Class :character   Class :character  
##  Mode  :character     Mode  :character   Mode  :character   Mode  :character  
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##     qtools          qobstacles_to_starting    qr_year     qr_learning_path  
##  Length:1838        Length:1838            Min.   :   2   Length:1838       
##  Class :character   Class :character       1st Qu.:2012   Class :character  
##  Mode  :character   Mode  :character       Median :2015   Mode  :character  
##                                            Mean   :2008                     
##                                            3rd Qu.:2017                     
##                                            Max.   :2019                     
##                                            NA's   :162                      
##  qr_difficulty_experienced qtime_to_proficiency qreason_experienced
##  Min.   :1.000             Length:1838          Length:1838        
##  1st Qu.:2.000             Class :character     Class :character   
##  Median :3.000             Mode  :character     Mode  :character   
##  Mean   :2.887                                                     
##  3rd Qu.:3.000                                                     
##  Max.   :5.000                                                     
##  NA's   :45                                                        
##  qmost_difficult_aspect qr_how_often_used   qused_for          qr_enjoyment  
##  Length:1838            Length:1838        Length:1838        Min.   :1.000  
##  Class :character       Class :character   Class :character   1st Qu.:4.000  
##  Mode  :character       Mode  :character   Mode  :character   Median :5.000  
##                                                               Mean   :4.647  
##                                                               3rd Qu.:5.000  
##                                                               Max.   :5.000  
##                                                               NA's   :40     
##    qrecommend      qr_tools         qtidyverse_learning qtidyverse_today  
##  Min.   : 1.00   Length:1838        Length:1838         Length:1838       
##  1st Qu.: 9.00   Class :character   Class :character    Class :character  
##  Median :10.00   Mode  :character   Mode  :character    Mode  :character  
##  Mean   : 9.24                                                            
##  3rd Qu.:10.00                                                            
##  Max.   :10.00                                                            
##  NA's   :43                                                               
##   qlike_best        qlike_least        qr_problems        qr_discover_packages
##  Length:1838        Length:1838        Length:1838        Length:1838         
##  Class :character   Class :character   Class :character   Class :character    
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character    
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##    qr_share          qr_change         qrobot_test         qrmarkdown       
##  Length:1838        Length:1838        Length:1838        Length:1838       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  qrmarkdown_apps    qrmarkdown_change     qshiny          qshiny_change     
##  Length:1838        Length:1838        Length:1838        Length:1838       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  qpython_use        qpython_apps       qpython_enjoy   qpython_recommend
##  Length:1838        Length:1838        Min.   :1.000   Min.   : 1.000   
##  Class :character   Class :character   1st Qu.:3.000   1st Qu.: 6.000   
##  Mode  :character   Mode  :character   Median :3.000   Median : 8.000   
##                                        Mean   :3.253   Mean   : 7.374   
##                                        3rd Qu.:4.000   3rd Qu.: 9.000   
##                                        Max.   :5.000   Max.   :10.000   
##                                        NA's   :819     NA's   :823      
##  qpython_change      qlanguages        qfirst_language      qyear_born  
##  Length:1838        Length:1838        Length:1838        Min.   :1878  
##  Class :character   Class :character   Class :character   1st Qu.:1979  
##  Mode  :character   Mode  :character   Mode  :character   Median :1985  
##                                                           Mean   :1983  
##                                                           3rd Qu.:1991  
##                                                           Max.   :2001  
##                                                           NA's   :107   
##    qgender           qethnicity          qdegree            qcountry        
##  Length:1838        Length:1838        Length:1838        Length:1838       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   qindustry            qtitle          qwork_title           qpeople        
##  Length:1838        Length:1838        Length:1838        Min.   :   0.000  
##  Class :character   Class :character   Class :character   1st Qu.:   0.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :   2.000  
##                                                           Mean   :   7.348  
##                                                           3rd Qu.:   5.000  
##                                                           Max.   :1000.000  
##                                                           NA's   :192       
##    qevents             qhear          
##  Length:1838        Length:1838       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
##

Data completeness

# Per-column completeness overview: missing counts and complete rate for
# every column, reported separately for character and numeric columns.
# NOTE(review): the recorded summary labels the input "Piped data", so the
# piped call form is presumably reflected in the output; left as is.
rsurvey %>% 
  skimr::skim()

Data summary
Name	Piped data
Number of rows	1838
Number of columns	53
_______________________
Column type frequency:
character	43
numeric	10
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
qtime	0	1.00	16	19	1827
qr_experience	31	0.98	4	12	4
qr_length_to_success	1830	0.00	5	6	3
qhow_to_learn_r	1830	0.00	67	96	3
qreason_to_learn	1830	0.00	25	95	6
qr_use	1830	0.00	35	86	8
qtools	1830	0.00	6	31	6
qobstacles_to_starting	1830	0.00	4	96	5
qr_learning_path	44	0.98	4	128	52
qtime_to_proficiency	41	0.98	4	33	5
qreason_experienced	40	0.98	27	70	8
qmost_difficult_aspect	55	0.97	4	1057	168
qr_how_often_used	40	0.98	20	36	5
qused_for	41	0.98	8	248	368
qr_tools	55	0.97	4	262	627
qtidyverse_learning	42	0.98	2	34	3
qtidyverse_today	41	0.98	5	30	5
qlike_best	385	0.79	1	519	1237
qlike_least	503	0.73	1	882	1245
qr_problems	43	0.98	7	407	160
qr_discover_packages	47	0.97	7	469	340
qr_share	115	0.94	1	383	348
qr_change	40	0.98	23	28	4
qrobot_test	31	0.98	1	60	45
qrmarkdown	31	0.98	22	47	5
qrmarkdown_apps	339	0.82	3	384	463
qrmarkdown_change	44	0.98	23	28	4
qshiny	33	0.98	27	71	5
qshiny_change	63	0.97	23	28	4
qpython_use	33	0.98	18	47	5
qpython_apps	838	0.54	3	175	323
qpython_change	54	0.97	23	28	4
qlanguages	41	0.98	3	140	867
qfirst_language	37	0.98	1	96	157
qgender	153	0.92	1	46	53
qethnicity	127	0.93	3	251	74
qdegree	54	0.97	3	152	17
qcountry	148	0.92	4	33	92
qindustry	44	0.98	3	87	126
qtitle	340	0.82	2	74	748
qwork_title	51	0.97	4	104	56
qevents	1144	0.38	13	195	60
qhear	47	0.97	2	40	68

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
x1	0	1.00	919.50	530.73	1	460.25	919.5	1378.75	1838	▇▇▇▇▇
qr_difficulty	1830	0.00	3.50	0.53	3	3.00	3.5	4.00	4	▇▁▁▁▇
qr_year	162	0.91	2007.75	107.35	2	2012.00	2015.0	2017.00	2019	▁▁▁▁▇
qr_difficulty_experienced	45	0.98	2.89	0.90	1	2.00	3.0	3.00	5	▁▅▇▃▁
qr_enjoyment	40	0.98	4.65	0.61	1	4.00	5.0	5.00	5	▁▁▁▃▇
qrecommend	43	0.98	9.24	1.22	1	9.00	10.0	10.00	10	▁▁▁▂▇
qpython_enjoy	819	0.55	3.25	1.08	1	3.00	3.0	4.00	5	▁▃▇▆▃
qpython_recommend	823	0.55	7.37	2.17	1	6.00	8.0	9.00	10	▁▂▅▇▇
qyear_born	107	0.94	1983.39	10.89	1878	1979.00	1985.0	1991.00	2001	▁▁▁▂▇
qpeople	192	0.90	7.35	48.26	0	0.00	2.0	5.00	1000	▇▁▁▁▁

Check data values ‘qr_year’

# Inspect the ten smallest `qr_year` values to spot implausible years.
rsurvey %>%
  arrange(qr_year) %>%
  select(qr_year) %>%
  head(10)

## # A tibble: 10 × 1
##    qr_year
##      <dbl>
##  1       2
##  2       6
##  3      13
##  4      18
##  5     207
##  6    1977
##  7    1985
##  8    1989
##  9    1989
## 10    1990

Modifying Data - mutate() - ‘qr_year’

# Add `qr_year2`: a copy of `qr_year` in which years before 1977 are
# replaced by NA. The original column is kept for comparison.
rsurvey <- rsurvey %>%
  mutate(
    qr_year2 = if_else(qr_year < 1977, NA_real_, qr_year)
  )
# Show the ten smallest original years next to their cleaned values.
rsurvey %>%
  arrange(qr_year) %>%
  select(qr_year, qr_year2) %>%
  head(10)

## # A tibble: 10 × 2
##    qr_year qr_year2
##      <dbl>    <dbl>
##  1       2       NA
##  2       6       NA
##  3      13       NA
##  4      18       NA
##  5     207       NA
##  6    1977     1977
##  7    1985     1985
##  8    1989     1989
##  9    1989     1989
## 10    1990     1990

Check data values ‘qr_experience’

# Frequency of each distinct `qr_experience` answer (NA included).
count(rsurvey, qr_experience)

## # A tibble: 5 × 2
##   qr_experience     n
##   <chr>         <int>
## 1 Beginner        233
## 2 Expert          529
## 3 Intermediate   1037
## 4 None              8
## 5 <NA>             31

Reordering Categories - factor() - ‘qr_experience’

# Add `qr_experience2`: the experience answer as a factor whose levels run
# from least to most experienced instead of alphabetically.
# Missing answers stay NA. NA is not listed in `levels` because factor()
# drops it from the level set by default (exclude = NA), so listing it had
# no effect.
rsurvey <- rsurvey %>%
  mutate(
    qr_experience2 = factor(
      qr_experience,
      levels = c("None", "Beginner", "Intermediate", "Expert")
    )
  )
# Cross-tabulate the original and recoded columns to confirm they agree.
rsurvey %>%
  count(qr_experience, qr_experience2)

## # A tibble: 5 × 3
##   qr_experience qr_experience2     n
##   <chr>         <fct>          <int>
## 1 Beginner      Beginner         233
## 2 Expert        Expert           529
## 3 Intermediate  Intermediate    1037
## 4 None          None               8
## 5 <NA>          <NA>              31

Check data values ‘qindustry’

# Frequency of each distinct free-text `qindustry` answer.
count(rsurvey, qindustry)

## # A tibble: 127 × 2
##    qindustry                           n
##    <chr>                           <int>
##  1 Academia                            2
##  2 Accommodation and Food Services    12
##  3 Advertising                         1
##  4 Aerospace                           2
##  5 Agriculture                         5
##  6 Agriculture and animal science      1
##  7 Agrifood                            1
##  8 Analytics Consulting Company        1
##  9 Any                                 1
## 10 Arts and Entertainment             14
## # … with 117 more rows

Convert values to lowercase

# Add `qindustry2`: the industry answer in lower case with leading, trailing
# and repeated inner whitespace removed.
rsurvey <- rsurvey %>%
  mutate(
    qindustry2 = qindustry %>%
      tolower() %>%
      str_squish()
  )
# Compare original and normalised values side by side.
rsurvey %>%
  count(qindustry, qindustry2)

## # A tibble: 127 × 3
##    qindustry                       qindustry2                          n
##    <chr>                           <chr>                           <int>
##  1 Academia                        academia                            2
##  2 Accommodation and Food Services accommodation and food services    12
##  3 Advertising                     advertising                         1
##  4 Aerospace                       aerospace                           2
##  5 Agriculture                     agriculture                         5
##  6 Agriculture and animal science  agriculture and animal science      1
##  7 Agrifood                        agrifood                            1
##  8 Analytics Consulting Company    analytics consulting company        1
##  9 Any                             any                                 1
## 10 Arts and Entertainment          arts and entertainment             14
## # … with 117 more rows

“Agriculture”, “agriculture and animal science”, and “agrifood” can all be combined into a single “agriculture” category.

# Map every answer containing "agri" to "agriculture"; all other answers
# (and missing values) are left unchanged.
rsurvey <- rsurvey %>%
  mutate(
    qindustry2 = if_else(
      str_detect(qindustry2, "agri"),
      "agriculture",
      qindustry2
    )
  )
# Compare original and grouped values side by side.
rsurvey %>%
  count(qindustry, qindustry2)

## # A tibble: 127 × 3
##    qindustry                       qindustry2                          n
##    <chr>                           <chr>                           <int>
##  1 Academia                        academia                            2
##  2 Accommodation and Food Services accommodation and food services    12
##  3 Advertising                     advertising                         1
##  4 Aerospace                       aerospace                           2
##  5 Agriculture                     agriculture                         5
##  6 Agriculture and animal science  agriculture                         1
##  7 Agrifood                        agriculture                         1
##  8 Analytics Consulting Company    analytics consulting company        1
##  9 Any                             any                                 1
## 10 Arts and Entertainment          arts and entertainment             14
## # … with 117 more rows

Combine into common values

# Collapse the normalised industry answers into a few broad groups.
# case_when() applies the first rule that matches, so the order of the rules
# matters when an answer contains keywords from more than one group.
rsurvey <- rsurvey %>%
  mutate(
    qindustry2 = case_when(
      str_detect(qindustry2, "agri") ~ "agriculture",
      str_detect(qindustry2, "health") ~ "health",
      str_detect(qindustry2, "education|academia|university|research") ~ "education and research",
      str_detect(qindustry2, "marketing|business|trade|ecommerce") ~ "business",
      str_detect(qindustry2, "information|analytics|software|cybersecurity|digital|telec") ~ "information technologies",
      str_detect(qindustry2, "envi|forest|geo|natural|wildlife|sustain") ~ "environment",
      str_detect(qindustry2, "law|legal") ~ "law",
      str_detect(qindustry2, "media|journalism") ~ "media",
      # Catch-all: every remaining non-missing answer becomes "others".
      # This rule used to be str_detect(qindustry2, "profit|"); the empty
      # alternative after "|" matches any string, so it already sent every
      # remaining answer to "others". The condition is now stated explicitly.
      # NOTE(review): if only answers containing "profit" were meant to become
      # "others", use str_detect(qindustry2, "profit") here instead - confirm.
      !is.na(qindustry2) ~ "others",
      # Only missing answers reach this point; they stay NA.
      TRUE ~ qindustry2
    )
  )
# Compare original and grouped values side by side.
rsurvey %>%
  count(qindustry, qindustry2)

## # A tibble: 127 × 3
##    qindustry                       qindustry2                   n
##    <chr>                           <chr>                    <int>
##  1 Academia                        education and research       2
##  2 Accommodation and Food Services others                      12
##  3 Advertising                     others                       1
##  4 Aerospace                       others                       2
##  5 Agriculture                     agriculture                  5
##  6 Agriculture and animal science  agriculture                  1
##  7 Agrifood                        agriculture                  1
##  8 Analytics Consulting Company    information technologies     1
##  9 Any                             others                       1
## 10 Arts and Entertainment          others                      14
## # … with 117 more rows

Distinct rows/ values - 1

# Number of distinct values in qindustry2: distinct() keeps one row per
# unique value, tally() then counts those rows.
rsurvey %>%
  distinct(qindustry2) %>%
  tally()

## # A tibble: 1 × 1
##       n
##   <int>
## 1    10

Distinct rows/ values - 2

# Number of rows that remain once fully duplicated rows are dropped.
unique_rows <- distinct(rsurvey)
tally(unique_rows)

## # A tibble: 1 × 1
##       n
##   <int>
## 1  1838

Check data values ‘qused_for’

# Show the first few raw multi-choice answers of qused_for.
rsurvey %>%
  pull(qused_for) %>%
  head()

## [1] "Statistical analysis, Data transformation, Modeling, Visualization, Machine learning, Text processing"
## [2] "Statistical analysis, Data transformation, Visualization"                                             
## [3] "Statistical analysis, Data transformation, Visualization"                                             
## [4] "Data transformation"                                                                                  
## [5] "Statistical analysis, Data transformation, Modeling, Visualization"                                   
## [6] "Statistical analysis, Data transformation, Modeling, Visualization, Machine learning, Text processing"

Highest number of commas in a cell

# Largest number of commas found in any single qused_for answer.
# str_count() returns NA for missing answers, and max() propagates NA unless
# told otherwise, so missing values are dropped before taking the maximum.
rsurvey$qused_for %>%
  str_count(pattern = ",") %>%
  max(na.rm = TRUE)

## [1] NA

Separate values

# Split the comma-separated qused_for answers into the columns use_1..use_11.
# The separator also consumes any whitespace after the comma, so the pieces
# do not carry a leading space (" Data transformation").
# fill = "right" pads answers with fewer than 11 pieces with NA on the right,
# which is the default result but without a warning for every short row.
rsurvey %>%
  select(qused_for) %>%
  separate(qused_for,
           sep = ",\\s*",
           into = paste0("use_", 1:11),
           fill = "right")

## # A tibble: 1,838 × 11
##    use_1           use_2 use_3 use_4 use_5 use_6 use_7 use_8 use_9 use_10 use_11
##    <chr>           <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>  <chr> 
##  1 Statistical an… " Da… " Mo… " Vi… " Ma… " Te…  <NA> <NA>  <NA>  <NA>   <NA>  
##  2 Statistical an… " Da… " Vi…  <NA>  <NA>  <NA>  <NA> <NA>  <NA>  <NA>   <NA>  
##  3 Statistical an… " Da… " Vi…  <NA>  <NA>  <NA>  <NA> <NA>  <NA>  <NA>   <NA>  
##  4 Data transform…  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> <NA>  <NA>  <NA>   <NA>  
##  5 Statistical an… " Da… " Mo… " Vi…  <NA>  <NA>  <NA> <NA>  <NA>  <NA>   <NA>  
##  6 Statistical an… " Da… " Mo… " Vi… " Ma… " Te…  <NA> <NA>  <NA>  <NA>   <NA>  
##  7 Statistical an… " Da… " Mo… " Vi…  <NA>  <NA>  <NA> <NA>  <NA>  <NA>   <NA>  
##  8 Statistical an… " Da… " Mo… " Vi… " Ma… " Te… " Su… <NA>  <NA>  <NA>   <NA>  
##  9 Statistical an… " Da… " Mo… " Vi… " Ma… " Te…  <NA> <NA>  <NA>  <NA>   <NA>  
## 10 Statistical an… " Da… " Mo… " Vi…  <NA>  <NA>  <NA> <NA>  <NA>  <NA>   <NA>  
## # … with 1,828 more rows

Check data values ‘qr_difficulty_experienced’

# Overview of qr_difficulty_experienced:
#   n   - number of rows
#   nd  - number of distinct values (n_distinct() counts NA as a value)
#   na  - number of missing values
#   med - median of the non-missing values
summarise(
  rsurvey,
  n = n(),
  nd = n_distinct(qr_difficulty_experienced),
  na = sum(is.na(qr_difficulty_experienced)),
  med = median(qr_difficulty_experienced, na.rm = TRUE)
)

## # A tibble: 1 × 4
##       n    nd    na   med
##   <int> <int> <int> <dbl>
## 1  1838     6    45     3

Replace NA with median value

# Median imputation: copy qr_difficulty_experienced into a new column and
# fill its missing values with the median of the observed values.
rsurvey <- mutate(
  rsurvey,
  qr_difficulty_experienced1 = coalesce(
    qr_difficulty_experienced,
    median(qr_difficulty_experienced, na.rm = TRUE)
  )
)
# Show the original and the imputed column side by side.
select(rsurvey, qr_difficulty_experienced, qr_difficulty_experienced1)

## # A tibble: 1,838 × 2
##    qr_difficulty_experienced qr_difficulty_experienced1
##                        <dbl>                      <dbl>
##  1                         5                          5
##  2                         3                          3
##  3                         3                          3
##  4                         4                          4
##  5                         3                          3
##  6                         3                          3
##  7                         3                          3
##  8                         2                          2
##  9                         2                          2
## 10                         2                          2
## # … with 1,828 more rows

Reference

https://bookdown.org/aschmi11/RESMHandbook/data-preparation-and-cleaning-in-r.html

Data Cleaning by R

Naimul Islam

2022-10-10

Library

Data

Data Column Name

Data value

Column Rename

Change column name

Clean variable names

Describe variables

Summary variables

Data completeness

Check data values ‘qr_year’

Modifying Data - mutate() - ‘qr_year’

Check data values ‘qr_experience’

Reordering Categories - factor() - ‘qr_experience’

Check data values ‘qindustry’

Convert to lower value

Agriculture, agriculture and animal science, and agrifood can all be combined into a single “agriculture” category

Combine into common values

Distinct rows/ values - 1

Distinct rows/ values - 2

Check data values ‘qused_for’

Highest number of commas in a cell

Separate values

Check data values ‘qr_difficulty_experienced’

Replace NA with median value

Reference