Data Wrangling using R

Alok Pratap Singh

16 September, 2020

Packages

# library(readxl)  # reading .xlsx file
library(dplyr, warn.conflicts = F)   # Data Wrangling

Loding Data

readxl::read_xlsx("C:/Users/Asus/Documents/R Pubs/R Pubs/df.xlsx") -> df
as_tibble(df) -> df   # Modern Data frame
knitr::kable(head(df,3))
…1 Sr. No. Age Gender Socioeconomic status Income Education Residence Cause of visinting How many days परिवहन आवास भोजन खरीददारी फोटोग्राफी स्वास्थ्य सेवाएँ पथ प्रदर्शक अन्य व्यय
1 1 45 male m2 40000 PG delhi trekking 5 15000 10000 3000 2000 500 200 2000 1000
2 2 40 male m2 50000 graduate delhi trekking 5 16000 12500 3000 1000 500 100 2000 800
3 3 35 male m1 30000 graduate haryana r. t. 2 5000 2000 1000 500 200 100 0 500
# removing 1st column
df[,2:ncol(df)] ->df
# df <- df %>% select(-1)

Renaming the variable names

# Writing column names and saving to vector "n"
n <- c("id", "age", "gen", "soci_s", "inc", "edu", "resi", "vis_c", "days", "travel", "hotel", 
       "food","market", "photo_g", "health_s", "guide", "other")
# changing column names to vector "n"
colnames(df) <- n
# checking again column names
colnames(df)
##  [1] "id"       "age"      "gen"      "soci_s"   "inc"      "edu"     
##  [7] "resi"     "vis_c"    "days"     "travel"   "hotel"    "food"    
## [13] "market"   "photo_g"  "health_s" "guide"    "other"

Checking Uniques Cases

lapply(df[c("gen","soci_s", "edu", "resi", "vis_c")], unique)
## $gen
## [1] "male"   "female" "FEMALE" "MALE"  
## 
## $soci_s
##  [1] "m2"   "m1"   "m3"   "m4"   "M4"   "M3"   "MONK" "M2"   "M1"   "monk"
## 
## $edu
##  [1] "PG"              "graduate"        "12th"            "pg"             
##  [5] "doctorate"       "intermediate"    "HS"              "illiterate"     
##  [9] "INtermediate UP" "hs"              "jhs"             "5th"            
## [13] "docterate"       "GRADUATE"        "INTERMEDIATE"    "ILLITERATE"     
## [17] "GRADUATEUP"      "GRADUATEFOREIGN" "DOCTRATE"       
## 
## $resi
##  [1] "delhi"          "haryana"        "u. p."          "m. p."         
##  [5] "rajasthan"      "maharashtra WS" "karnataka"      "tamilnadu"     
##  [9] "mp"             "up"             "punjab"         "maharashtra"   
## [13] "Andhra p"       "telangana"      "Bihar"          "UK"            
## [17] "jharkhand"      "odisha"         "chhattisgarh"   "kerala"        
## [21] "WB"             "foreign"        "BIHAR"          NA              
## [25] "MP"             "DELHI"          "GUJRAT"         "AP"            
## [29] "Telangana"      "uk"             "gujrat"         "UP"            
## [33] "RAJSTHAN"       "PUNJAB"         "HARIYANART"     "HARIYANA"      
## [37] "HP"             "RAJASTHAN"      "FOREIGN"        "KERLA"         
## [41] "CHHATTISGARH"   "JHARKHAND"      "ODISA"          "MAHARASHTRA"   
## [45] "KARNATAKA"      "TAMILNADU"      "GOA"            "ASSAM"         
## [49] "HARYANA"       
## 
## $vis_c
##  [1] "trekking" "r. t."    NA         "WS"       "ws"       "sv"      
##  [7] "rt"       "ET"       "RT"       "AS"       "research" "TREKKING"
## [13] "RURAL"    "RESEARCH"

Objectives

1. gen : To lower case

2. soci_s: Convert in one format, lower case

3. edu: Need to merge categories

4. resi: To unique names, i.e. “u. p.”, “UP”, “up” must be in single format.

5. vis_c: uniform format.

Objectie 1

df$gen <- tolower((df$gen))
# checking
unique(df$gen)
## [1] "male"   "female"

Objective 2

df$soci_s <- tolower(df$soci_s)
# checking
unique(df$soci_s)
## [1] "m2"   "m1"   "m3"   "m4"   "monk"

Objective 3

Converting all the unique cases of education to lower case and then merging all the level of education into 5 to 6 categories.

  1. blw_highscl

  2. highscl

  3. intermed

  4. graduate

  5. postgrad

  6. doctorate

df$edu <- tolower(df$edu)  # converting to lower case

# correcting "doctorate"

df %>% mutate(edu= replace(edu, edu== "doctrate", "doctorate"),
              edu= replace(edu, edu=="docterate", "doctorate")) %>%
  as_tibble() -> df

# merging for blw_highsch

df %>% mutate(edu= replace(edu, edu== "illiterate", "blw_highscl"),
              edu= replace(edu, edu== "5th", "blw_highscl"), 
              edu= replace(edu, edu== "jhs", "blw_highscl"))-> df

# merging for graduate

df %>% mutate(edu= replace(edu, edu== "graduateup", "graduate"),
              edu= replace(edu, edu== "graduateforeign", "graduate"))-> df

# merging for intermed

df %>% mutate(edu= replace(edu, edu== "12th", "intermed"), 
              edu= replace(edu, edu== "intermediate", "intermed"),
              edu= replace(edu, edu=="intermediate up", "intermed"))-> df

# hs to highscl

df %>% mutate(edu= replace(edu, edu== "hs", "highscl")) ->df

# checking
unique(df$edu)
## [1] "pg"          "graduate"    "intermed"    "doctorate"   "highscl"    
## [6] "blw_highscl"

Objective 4

df$resi <- tolower(df$resi)
df$resi <- replace(df$resi, df$resi== "hariyana", "haryana")
df$resi <- replace(df$resi, df$resi== "hariyanart", "haryana")
df %>% mutate(resi= replace(resi, resi== "u. p.", "up"))-> df
df$resi <- replace(df$resi, df$resi== "m. p.", "mp")
df$resi <- replace(df$resi, df$resi == "maharashtra ws", "maharashtra")
df$resi <- replace(df$resi, df$resi== "andhra p", "andhra")
df$resi <- replace(df$resi, df$resi== "odisa", "odisha")
df$resi <- replace(df$resi, df$resi== "chhattisgarh", "chh")

# checking
unique(df$resi)
##  [1] "delhi"       "haryana"     "up"          "mp"          "rajasthan"  
##  [6] "maharashtra" "karnataka"   "tamilnadu"   "punjab"      "andhra"     
## [11] "telangana"   "bihar"       "uk"          "jharkhand"   "odisha"     
## [16] "chh"         "kerala"      "wb"          "foreign"     NA           
## [21] "gujrat"      "ap"          "rajsthan"    "hp"          "kerla"      
## [26] "goa"         "assam"

Objective 5

# solving comment 5
df$vis_c <- tolower(df$vis_c)
replace(df$vis_c, df$vis_c=="r. t.", "rt") -> df$vis_c
unique(df$vis_c)
## [1] "trekking" "rt"       NA         "ws"       "sv"       "et"       "as"      
## [8] "research" "rural"

checking all together

lapply(df[c("gen","soci_s", "edu", "resi", "vis_c")], unique)
## $gen
## [1] "male"   "female"
## 
## $soci_s
## [1] "m2"   "m1"   "m3"   "m4"   "monk"
## 
## $edu
## [1] "pg"          "graduate"    "intermed"    "doctorate"   "highscl"    
## [6] "blw_highscl"
## 
## $resi
##  [1] "delhi"       "haryana"     "up"          "mp"          "rajasthan"  
##  [6] "maharashtra" "karnataka"   "tamilnadu"   "punjab"      "andhra"     
## [11] "telangana"   "bihar"       "uk"          "jharkhand"   "odisha"     
## [16] "chh"         "kerala"      "wb"          "foreign"     NA           
## [21] "gujrat"      "ap"          "rajsthan"    "hp"          "kerla"      
## [26] "goa"         "assam"      
## 
## $vis_c
## [1] "trekking" "rt"       NA         "ws"       "sv"       "et"       "as"      
## [8] "research" "rural"

Now checking the structure of data.

str(df)
## tibble [401 x 17] (S3: tbl_df/tbl/data.frame)
##  $ id      : num [1:401] 1 2 3 4 5 6 7 8 9 10 ...
##  $ age     : num [1:401] 45 40 35 25 24 22 30 32 30 28 ...
##  $ gen     : chr [1:401] "male" "male" "male" "male" ...
##  $ soci_s  : chr [1:401] "m2" "m2" "m1" "m1" ...
##  $ inc     : num [1:401] 40000 50000 30000 0 0 0 80000 100000 150000 200000 ...
##  $ edu     : chr [1:401] "pg" "graduate" "graduate" "graduate" ...
##  $ resi    : chr [1:401] "delhi" "delhi" "haryana" "up" ...
##  $ vis_c   : chr [1:401] "trekking" "trekking" "rt" "trekking" ...
##  $ days    : num [1:401] 5 5 2 3 3 3 4 4 4 2 ...
##  $ travel  : num [1:401] 15000 16000 5000 5000 5000 5000 10000 10000 10000 10000 ...
##  $ hotel   : num [1:401] 10000 12500 2000 3000 3000 2800 10000 10000 10000 5000 ...
##  $ food    : num [1:401] 3000 3000 1000 1500 1200 1400 3000 2500 2000 1000 ...
##  $ market  : num [1:401] 2000 1000 500 500 400 500 2000 1000 1200 500 ...
##  $ photo_g : num [1:401] 500 500 200 0 0 200 500 500 500 200 ...
##  $ health_s: num [1:401] 200 100 100 100 0 0 500 500 500 800 ...
##  $ guide   : num [1:401] 2000 2000 0 0 1000 1000 2000 2000 2000 1500 ...
##  $ other   : num [1:401] 1000 800 500 500 1000 500 2000 1000 1500 1000 ...
  • gen, soci_s, and edu should be in factor.

All the problems are solved. Now the factor is to be done in the correct form (tranforming from character to factor format.

  1. 2 levels are to be made for gender (gen) 1 for female and 2 for male.

  2. 5 levels of socioeconomic status (soci_s). Which will go from Monk to m1, m2, m3, m4.

  3. 6 levels of education (edu)- blw_highsch, highscl, intermed, graduate, pg, and doctorate.

df <- df %>% mutate(
  gen = factor(gen, levels = c("female", "male")),
  soci_s = factor(soci_s, levels = c("monk", "m1", "m2", "m3", "m4")),
  edu = factor(
    edu,
    levels = c(
      "blw_highscl",
      "highscl",
      "intermed",
      "graduate",
      "pg",
      "doctorate"
    )
  )
)

checking the final structure

str(df)
## tibble [401 x 17] (S3: tbl_df/tbl/data.frame)
##  $ id      : num [1:401] 1 2 3 4 5 6 7 8 9 10 ...
##  $ age     : num [1:401] 45 40 35 25 24 22 30 32 30 28 ...
##  $ gen     : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 1 1 1 1 ...
##  $ soci_s  : Factor w/ 5 levels "monk","m1","m2",..: 3 3 2 2 2 4 4 4 5 4 ...
##  $ inc     : num [1:401] 40000 50000 30000 0 0 0 80000 100000 150000 200000 ...
##  $ edu     : Factor w/ 6 levels "blw_highscl",..: 5 4 4 4 4 4 5 5 5 4 ...
##  $ resi    : chr [1:401] "delhi" "delhi" "haryana" "up" ...
##  $ vis_c   : chr [1:401] "trekking" "trekking" "rt" "trekking" ...
##  $ days    : num [1:401] 5 5 2 3 3 3 4 4 4 2 ...
##  $ travel  : num [1:401] 15000 16000 5000 5000 5000 5000 10000 10000 10000 10000 ...
##  $ hotel   : num [1:401] 10000 12500 2000 3000 3000 2800 10000 10000 10000 5000 ...
##  $ food    : num [1:401] 3000 3000 1000 1500 1200 1400 3000 2500 2000 1000 ...
##  $ market  : num [1:401] 2000 1000 500 500 400 500 2000 1000 1200 500 ...
##  $ photo_g : num [1:401] 500 500 200 0 0 200 500 500 500 200 ...
##  $ health_s: num [1:401] 200 100 100 100 0 0 500 500 500 800 ...
##  $ guide   : num [1:401] 2000 2000 0 0 1000 1000 2000 2000 2000 1500 ...
##  $ other   : num [1:401] 1000 800 500 500 1000 500 2000 1000 1500 1000 ...

Top 6 rows of our final data:

knitr::kable(head(df))
id age gen soci_s inc edu resi vis_c days travel hotel food market photo_g health_s guide other
1 45 male m2 40000 pg delhi trekking 5 15000 10000 3000 2000 500 200 2000 1000
2 40 male m2 50000 graduate delhi trekking 5 16000 12500 3000 1000 500 100 2000 800
3 35 male m1 30000 graduate haryana rt 2 5000 2000 1000 500 200 100 0 500
4 25 male m1 0 graduate up trekking 3 5000 3000 1500 500 0 100 0 500
5 24 male m1 0 graduate up trekking 3 5000 3000 1200 400 0 0 1000 1000
6 22 male m3 0 graduate up trekking 3 5000 2800 1400 500 200 0 1000 500

Saving the files in spss (.sav) and excel (.xlsx) format

# haven::write_sav(df, "C:/Users/Alok Pratap Singh/Documents/R Pubs/R Pubs/df.sav")
# xlsx::write.xlsx(df, "C:/Users/Alok Pratap Singh/Documents/R Pubs/R Pubs/df.xlsx")

Missing Cases

colSums(is.na(df))
##       id      age      gen   soci_s      inc      edu     resi    vis_c 
##        0        0        0        0        0        0        5        4 
##     days   travel    hotel     food   market  photo_g health_s    guide 
##        0        0        0        0        0        2        0        0 
##    other 
##        0

We can remove incomplete cases

na.omit(df)->df
colSums(is.na(df))
##       id      age      gen   soci_s      inc      edu     resi    vis_c 
##        0        0        0        0        0        0        0        0 
##     days   travel    hotel     food   market  photo_g health_s    guide 
##        0        0        0        0        0        0        0        0 
##    other 
##        0
dim(df)
## [1] 390  17

Thank You

Regards

Please visit my profile

Alok Pratap Singh (Research Scholar)

Linkedin (Open in New TAB)

Department of Psychology

University of Allahabad

Without data you’re just another person with an opinion

.