R Notebook

Data Cleaning Part

*There are few stage in data cleaning process as follow:

Exploring the raw data
Tidying data
Preparing data for analysis
Explorig raw data

df<-read.csv("https://raw.githubusercontent.com/tuyenhavan/Statistics/Dataset/Movie-Ratings.csv")

head(df,4)

##                    Film     Genre Rotten.Tomatoes.Ratings..
## 1 (500) Days of Summer     Comedy                        87
## 2           10,000 B.C. Adventure                         9
## 3            12 Rounds     Action                        30
## 4             127 Hours Adventure                        93
##   Audience.Ratings.. Budget..million... Year.of.release
## 1                 81                  8            2009
## 2                 44                105            2008
## 3                 52                 20            2009
## 4                 84                 18            2010

Investigate glimse and the internal structure of data like names, class, tail, str,dim,summary

# Internal structure of the dataset

str(df)

## 'data.frame':    562 obs. of  6 variables:
##  $ Film                     : Factor w/ 562 levels "(500) Days of Summer ",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Genre                    : Factor w/ 7 levels "Action","Adventure",..: 3 2 1 2 3 1 3 5 3 3 ...
##  $ Rotten.Tomatoes.Ratings..: int  87 9 30 93 55 39 40 50 43 93 ...
##  $ Audience.Ratings..       : int  81 44 52 84 70 63 71 57 48 93 ...
##  $ Budget..million...       : int  8 105 20 18 20 200 30 32 28 8 ...
##  $ Year.of.release          : int  2009 2008 2009 2010 2009 2009 2008 2007 2011 2011 ...

# Collum names

names(df)

## [1] "Film"                      "Genre"                    
## [3] "Rotten.Tomatoes.Ratings.." "Audience.Ratings.."       
## [5] "Budget..million..."        "Year.of.release"

colnames(df) # alternative

## [1] "Film"                      "Genre"                    
## [3] "Rotten.Tomatoes.Ratings.." "Audience.Ratings.."       
## [5] "Budget..million..."        "Year.of.release"

# Summary 

summary(df)

##                     Film           Genre     Rotten.Tomatoes.Ratings..
##  (500) Days of Summer :  1   Action   :154   Min.   : 0.0             
##  10,000 B.C.          :  1   Adventure: 29   1st Qu.:25.0             
##  12 Rounds            :  1   Comedy   :172   Median :46.0             
##  127 Hours            :  1   Drama    :101   Mean   :47.4             
##  17 Again             :  1   Horror   : 49   3rd Qu.:70.0             
##  2012                 :  1   Romance  : 21   Max.   :97.0             
##  (Other)              :556   Thriller : 36                            
##  Audience.Ratings.. Budget..million... Year.of.release
##  Min.   : 0.00      Min.   :  0.0      Min.   :2007   
##  1st Qu.:47.00      1st Qu.: 20.0      1st Qu.:2008   
##  Median :58.00      Median : 35.0      Median :2009   
##  Mean   :58.83      Mean   : 50.1      Mean   :2009   
##  3rd Qu.:72.00      3rd Qu.: 65.0      3rd Qu.:2010   
##  Max.   :96.00      Max.   :300.0      Max.   :2011   
##

# See number of rows and collumns

dim(df)

## [1] 562   6

ncol(df) # Alternative

## [1] 6

nrow(df) # Alternative

## [1] 562

# Internal structure of the dataset using glimse from dplyr package

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

glimpse(df)

## Observations: 562
## Variables: 6
## $ Film                      <fctr> (500) Days of Summer , 10,000 B.C.,...
## $ Genre                     <fctr> Comedy, Adventure, Action, Adventur...
## $ Rotten.Tomatoes.Ratings.. <int> 87, 9, 30, 93, 55, 39, 40, 50, 43, 9...
## $ Audience.Ratings..        <int> 81, 44, 52, 84, 70, 63, 71, 57, 48, ...
## $ Budget..million...        <int> 8, 105, 20, 18, 20, 200, 30, 32, 28,...
## $ Year.of.release           <int> 2009, 2008, 2009, 2010, 2009, 2009, ...

*Tidy Data

We should care about the position of variables and rows. Sometimes rows should be column and opposite.

attribute=variable=column=feature depending on field of study

tidyr package

df4<-data.frame(X=c("a","b","c"),A=c(1:3),B=c(3:5),C=c(5:7))

df4

##   X A B C
## 1 a 1 3 5
## 2 b 2 4 6
## 3 c 3 5 7

Converting wide data into long data

library(tidyr)

df_long<- df4 %>% tidyr::gather(Category, Values,-X) # gather takes 3 arguments 'key', 'value' and columns to exclude or include. Key is a new column, while value is all values to form a new column

head(df_long)

##   X Category Values
## 1 a        A      1
## 2 b        A      2
## 3 c        A      3
## 4 a        B      3
## 5 b        B      4
## 6 c        B      5

Converting long data to wide data

df_wide<- df_long %>% tidyr::spread(Category, Values)

head(df_wide)

##   X A B C
## 1 a 1 3 5
## 2 b 2 4 6
## 3 c 3 5 7

Unite multiple columns to one columns

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

df5<-ToothGrowth

head(df5)

##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5
## 5  6.4   VC  0.5
## 6 10.0   VC  0.5

df5_uni<- df5 %>% tidyr::unite(var1,supp,dose,sep="_")

head(df5_uni)

##    len   var1
## 1  4.2 VC_0.5
## 2 11.5 VC_0.5
## 3  7.3 VC_0.5
## 4  5.8 VC_0.5
## 5  6.4 VC_0.5
## 6 10.0 VC_0.5

Separating one column into two or more variables

separate(dataset,var1,c("var2","var3","var4"),sep="_") where var1 is old column, var2,3,4 is ew columns and sep=“_" makes thing more specific

df5_sep<- df5_uni %>% tidyr::separate(var1, c("Supplement","Dose"),sep="_")

head(df5_sep)

##    len Supplement Dose
## 1  4.2         VC  0.5
## 2 11.5         VC  0.5
## 3  7.3         VC  0.5
## 4  5.8         VC  0.5
## 5  6.4         VC  0.5
## 6 10.0         VC  0.5

If working with date time data, lubridate can be a useful package

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

dmy("17 Sep 2015")

## [1] "2015-09-17"

mdy_hm("July 15, 2012 12:56")

## [1] "2012-07-15 12:56:00 UTC"

String manipulation

library(stringr)

myname<-c("    Ha Van Tuyen    ")

# trim leading and trailing whitespace

str_trim(myname)

## [1] "Ha Van Tuyen"

# adding values to first or last 

str_pad("12345",width=8,side = "left",pad="0")

## [1] "00012345"

# create friend lists

friends<-c("Ha","Ban","Hu")

str_detect(friends,"Ha")

## [1]  TRUE FALSE FALSE

toupper(friends) # convert lowercase to uppercase

## [1] "HA"  "BAN" "HU"

tolower(friends) # Convert uppercase to lowercase

## [1] "ha"  "ban" "hu"

# replace Ha into Huong

str_replace(friends,"Ha","Huong")

## [1] "Huong" "Ban"   "Hu"

# Adding a certain value to a variable

head(cars)

##   speed dist
## 1     4    2
## 2     4   10
## 3     7    4
## 4     7   22
## 5     8   16
## 6     9   10

cars$dist<-str_pad(cars$dist,width = 5,side="right",pad = "h")

head(cars)

##   speed  dist
## 1     4 2hhhh
## 2     4 10hhh
## 3     7 4hhhh
## 4     7 22hhh
## 5     8 16hhh
## 6     9 10hhh

-Checking missing data

df6<-read.csv("https://raw.githubusercontent.com/tuyenhavan/Statistics/Dataset/does_vn07.csv")

head(df6)

##   id gender anyfx timeany priofx_any hipfx timehip priofx_hip fnbmd lsbmd
## 1  3   Male     0    0.55          0     0    0.55          0  1.08  1.46
## 2  8 Female     0   15.38          1     0   15.38          0  0.97  1.33
## 3  9   Male     0    5.06          0     0    5.06          0  1.01  1.49
## 4 10 Female     0   14.25          0     0   14.25          0  0.84  1.21
## 5 23   Male     0   15.07          0     0   15.07          0  0.81  1.14
## 6 24 Female     0   12.30          0     0   12.30          0  0.74  0.98
##    sway quad smoking caintake   pal age wt  ht
## 1  1426   NA       1      467 163.5  73 98 175
## 2   992   18       0      543  41.2  68 72 166
## 3  2376   36       0      384  68.3  68 87 184
## 4 21700    3       1      125  71.9  62 72 173
## 5  2193   44       1      463  31.0  61 72 173
## 6  2240   15       0      471    NA  76 57 156

Checking missing data

# yes or no NA
any(is.na(df6))

## [1] TRUE

# How many NA in the dataset

sum(is.na(df6))

## [1] 1186

# Alternative

summary(df6)

##        id          gender         anyfx           timeany      
##  Min.   :   3   Female:1358   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.:1452   Male  : 858   1st Qu.:0.0000   1st Qu.: 4.625  
##  Median :3296                 Median :0.0000   Median :10.785  
##  Mean   :3992                 Mean   :0.2595   Mean   : 9.163  
##  3rd Qu.:6177                 3rd Qu.:1.0000   3rd Qu.:13.562  
##  Max.   :9797                 Max.   :1.0000   Max.   :15.660  
##                                                                
##    priofx_any         hipfx           timehip         priofx_hip    
##  Min.   :0.0000   Min.   :0.0000   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 6.715   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :12.440   Median :0.0000  
##  Mean   :0.1476   Mean   :0.0718   Mean   :10.239   Mean   :0.1494  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:13.860   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :15.660   Max.   :1.0000  
##                   NA's   :448                                       
##      fnbmd            lsbmd            sway            quad      
##  Min.   :0.2800   Min.   :0.480   Min.   :  120   Min.   : 3.00  
##  1st Qu.:0.7300   1st Qu.:0.950   1st Qu.:  714   1st Qu.:16.00  
##  Median :0.8200   Median :1.090   Median : 1152   Median :23.00  
##  Mean   :0.8287   Mean   :1.111   Mean   : 3797   Mean   :24.43  
##  3rd Qu.:0.9300   3rd Qu.:1.250   3rd Qu.: 2150   3rd Qu.:32.00  
##  Max.   :1.5100   Max.   :2.090   Max.   :21700   Max.   :50.00  
##  NA's   :89       NA's   :87      NA's   :71      NA's   :127    
##     smoking          caintake           pal              age       
##  Min.   :0.0000   Min.   :   0.0   Min.   :  2.40   Min.   :57.00  
##  1st Qu.:0.0000   1st Qu.: 403.0   1st Qu.: 55.15   1st Qu.:65.00  
##  Median :0.0000   Median : 572.0   Median : 75.95   Median :70.00  
##  Mean   :0.4176   Mean   : 633.4   Mean   : 80.52   Mean   :70.89  
##  3rd Qu.:1.0000   3rd Qu.: 796.5   3rd Qu.:101.85   3rd Qu.:76.00  
##  Max.   :1.0000   Max.   :2943.0   Max.   :296.20   Max.   :96.00  
##  NA's   :1        NA's   :124      NA's   :132                     
##        wt               ht       
##  Min.   : 34.00   Min.   :136.0  
##  1st Qu.: 60.00   1st Qu.:158.0  
##  Median : 69.00   Median :164.0  
##  Mean   : 70.14   Mean   :164.9  
##  3rd Qu.: 79.00   3rd Qu.:171.0  
##  Max.   :133.00   Max.   :196.0  
##  NA's   :53       NA's   :54

# Get complete rows without NA

df6_no<-df6[complete.cases(df6),]

head(df6_no)

##   id gender anyfx timeany priofx_any hipfx timehip priofx_hip fnbmd lsbmd
## 2  8 Female     0   15.38          1     0   15.38          0  0.97  1.33
## 3  9   Male     0    5.06          0     0    5.06          0  1.01  1.49
## 4 10 Female     0   14.25          0     0   14.25          0  0.84  1.21
## 5 23   Male     0   15.07          0     0   15.07          0  0.81  1.14
## 7 26   Male     0   11.47          0     0   11.47          0  1.01  1.38
## 8 27 Female     0   15.13          0     0   15.13          0  0.86  1.07
##    sway quad smoking caintake  pal age wt  ht
## 2   992   18       0      543 41.2  68 72 166
## 3  2376   36       0      384 68.3  68 87 184
## 4 21700    3       1      125 71.9  62 72 173
## 5  2193   44       1      463 31.0  61 72 173
## 7  1404   26       1      647 78.8  63 97 173
## 8  2268   27       0      847 84.2  64 85 167

# Alternatively using na.omit(df6)

any(is.na(df6_no))

## [1] FALSE

Dealing with missing data

# Replace NA values by mean of qud

df6$quad[which(is.na(df6$quad))]<-mean(df6$quad,na.rm = T)

head(df6)

##   id gender anyfx timeany priofx_any hipfx timehip priofx_hip fnbmd lsbmd
## 1  3   Male     0    0.55          0     0    0.55          0  1.08  1.46
## 2  8 Female     0   15.38          1     0   15.38          0  0.97  1.33
## 3  9   Male     0    5.06          0     0    5.06          0  1.01  1.49
## 4 10 Female     0   14.25          0     0   14.25          0  0.84  1.21
## 5 23   Male     0   15.07          0     0   15.07          0  0.81  1.14
## 6 24 Female     0   12.30          0     0   12.30          0  0.74  0.98
##    sway     quad smoking caintake   pal age wt  ht
## 1  1426 24.42508       1      467 163.5  73 98 175
## 2   992 18.00000       0      543  41.2  68 72 166
## 3  2376 36.00000       0      384  68.3  68 87 184
## 4 21700  3.00000       1      125  71.9  62 72 173
## 5  2193 44.00000       1      463  31.0  61 72 173
## 6  2240 15.00000       0      471    NA  76 57 156

# if any empty values, the same function can be used to replace empty values to NA

# df6$quad[which(df6$quad=="")]<-NA