Load Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Read Data

# inline comments
dfrPatient <- read.csv("D:/R-BA/R-Scripts/data/patient-data.csv", header=T, stringsAsFactors=F)
intRowCount <- nrow(dfrPatient)
head(dfrPatient)
##          ID      Name  Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius White   Male  False      182.87       76.57
## 2 AC/AH/017   Rosario White   Male  False      179.12       80.43
## 3 AC/AH/020     Julio Black   Male  False      169.15       75.48
## 4 AC/AH/022      Lupe White   Male  False      175.66       94.54
## 5 AC/AH/029    Lavern White Female  False      164.47       71.78
## 6 AC/AH/033    Bernie   Dog Female   True      158.27       69.90
##    BirthDate        State  Pet HealthGrade  Died RecordDate
## 1 31-01-1972  Georgia,xxx  Dog           2 False 25-11-2015
## 2 09-06-1972     Missouri  Dog           2 False 25-11-2015
## 3 03-07-1972 Pennsylvania None           2 False 25-11-2015
## 4 11-08-1972      Florida  Cat           1 False 25-11-2015
## 5 06-06-1973         Iowa NULL           2  True 25-11-2015
## 6 25-06-1973     Maryland  Dog           2 False 25-11-2015

Total Rows Of Patient File: 100

**Add column BMI* #Provide new column BMI

dfrPatient<- mutate(dfrPatient,
                      
                       BMI=((WeightInKgs)/(HeightInCms/100)^2))
head(dfrPatient)
##          ID      Name  Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius White   Male  False      182.87       76.57
## 2 AC/AH/017   Rosario White   Male  False      179.12       80.43
## 3 AC/AH/020     Julio Black   Male  False      169.15       75.48
## 4 AC/AH/022      Lupe White   Male  False      175.66       94.54
## 5 AC/AH/029    Lavern White Female  False      164.47       71.78
## 6 AC/AH/033    Bernie   Dog Female   True      158.27       69.90
##    BirthDate        State  Pet HealthGrade  Died RecordDate      BMI
## 1 31-01-1972  Georgia,xxx  Dog           2 False 25-11-2015 22.89674
## 2 09-06-1972     Missouri  Dog           2 False 25-11-2015 25.06859
## 3 03-07-1972 Pennsylvania None           2 False 25-11-2015 26.38080
## 4 11-08-1972      Florida  Cat           1 False 25-11-2015 30.63867
## 5 06-06-1973         Iowa NULL           2  True 25-11-2015 26.53567
## 6 25-06-1973     Maryland  Dog           2 False 25-11-2015 27.90487

Add column BMILabel

dfrPatient <- mutate(
  dfrPatient
  ,
  BMILabel = ifelse (
    dfrPatient$BMI < 18.5,
    "Underweight",
    ifelse (
      dfrPatient$BMI >= 18.5 &
        dfrPatient$BMI < 25,
      "Normal",
      ifelse (
        dfrPatient$BMI >= 25   &
          dfrPatient$BMI < 30,
        "Overweight",
        "Obese"
      )
    )
  )
)
head(dfrPatient)
##          ID      Name  Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius White   Male  False      182.87       76.57
## 2 AC/AH/017   Rosario White   Male  False      179.12       80.43
## 3 AC/AH/020     Julio Black   Male  False      169.15       75.48
## 4 AC/AH/022      Lupe White   Male  False      175.66       94.54
## 5 AC/AH/029    Lavern White Female  False      164.47       71.78
## 6 AC/AH/033    Bernie   Dog Female   True      158.27       69.90
##    BirthDate        State  Pet HealthGrade  Died RecordDate      BMI
## 1 31-01-1972  Georgia,xxx  Dog           2 False 25-11-2015 22.89674
## 2 09-06-1972     Missouri  Dog           2 False 25-11-2015 25.06859
## 3 03-07-1972 Pennsylvania None           2 False 25-11-2015 26.38080
## 4 11-08-1972      Florida  Cat           1 False 25-11-2015 30.63867
## 5 06-06-1973         Iowa NULL           2  True 25-11-2015 26.53567
## 6 25-06-1973     Maryland  Dog           2 False 25-11-2015 27.90487
##     BMILabel
## 1     Normal
## 2 Overweight
## 3 Overweight
## 4      Obese
## 5 Overweight
## 6 Overweight

Covert health grades of the records

dfrPatient$HealthGrade <- ifelse(dfrPatient$HealthGrade==1,
                                          "GoodHealth",
                                          ifelse(
                                            dfrPatient$HealthGrade==2,
                                            "Normal",
                                            "BadHealth" ))
head(dfrPatient)
##          ID      Name  Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius White   Male  False      182.87       76.57
## 2 AC/AH/017   Rosario White   Male  False      179.12       80.43
## 3 AC/AH/020     Julio Black   Male  False      169.15       75.48
## 4 AC/AH/022      Lupe White   Male  False      175.66       94.54
## 5 AC/AH/029    Lavern White Female  False      164.47       71.78
## 6 AC/AH/033    Bernie   Dog Female   True      158.27       69.90
##    BirthDate        State  Pet HealthGrade  Died RecordDate      BMI
## 1 31-01-1972  Georgia,xxx  Dog      Normal False 25-11-2015 22.89674
## 2 09-06-1972     Missouri  Dog      Normal False 25-11-2015 25.06859
## 3 03-07-1972 Pennsylvania None      Normal False 25-11-2015 26.38080
## 4 11-08-1972      Florida  Cat  GoodHealth False 25-11-2015 30.63867
## 5 06-06-1973         Iowa NULL      Normal  True 25-11-2015 26.53567
## 6 25-06-1973     Maryland  Dog      Normal False 25-11-2015 27.90487
##     BMILabel
## 1     Normal
## 2 Overweight
## 3 Overweight
## 4      Obese
## 5 Overweight
## 6 Overweight

**summarise to find errors and missing data*

summarise(group_by(dfrPatient, BMILabel), n())
## # A tibble: 3 × 2
##     BMILabel `n()`
##        <chr> <int>
## 1     Normal    23
## 2      Obese     6
## 3 Overweight    71
summarise(group_by(dfrPatient, Gender), n())
## # A tibble: 6 × 2
##    Gender `n()`
##     <chr> <int>
## 1  Female     6
## 2    Male     3
## 3  Female    45
## 4 Female      4
## 5    Male    40
## 6   Male      2
summarise(group_by(dfrPatient, Race), n())
## # A tibble: 6 × 2
##        Race `n()`
##       <chr> <int>
## 1     Asian     5
## 2 Bi-Racial     1
## 3     Black     8
## 4       Dog     1
## 5  Hispanic    17
## 6     White    68
summarise(group_by(dfrPatient, Died), n())
## # A tibble: 2 × 2
##    Died `n()`
##   <chr> <int>
## 1 False    46
## 2  True    54
summarise(group_by(dfrPatient, Pet), n())
## # A tibble: 10 × 2
##      Pet `n()`
##    <chr> <int>
## 1   Bird     9
## 2    Cat    24
## 3    CAT     5
## 4    Dog    28
## 5    DOG     4
## 6  Horse     1
## 7   None    23
## 8   NONE     1
## 9   NULL     3
## 10  <NA>     2
summarise(group_by(dfrPatient, Smokes), n())
## # A tibble: 4 × 2
##   Smokes `n()`
##    <chr> <int>
## 1  False    72
## 2     No     6
## 3   True    18
## 4    Yes     4
summarise(group_by(dfrPatient, HealthGrade), n())
## # A tibble: 3 × 2
##   HealthGrade `n()`
##         <chr> <int>
## 1   BadHealth    41
## 2  GoodHealth    29
## 3      Normal    30
summarise(group_by(dfrPatient, State), n())
## # A tibble: 34 × 2
##          State `n()`
##          <chr> <int>
## 1      Alabama     2
## 2      Arizona     2
## 3   California    13
## 4     Colorado     1
## 5  Connecticut     1
## 6      Florida     8
## 7      Georgia     3
## 8  Georgia,xxx     1
## 9       Hawaii     2
## 10    Illinois     4
## # ... with 24 more rows

Error handling in gender

summarise(group_by(dfrPatient, Gender), n())
## # A tibble: 6 × 2
##    Gender `n()`
##     <chr> <int>
## 1  Female     6
## 2    Male     3
## 3  Female    45
## 4 Female      4
## 5    Male    40
## 6   Male      2
dfrPatient$Gender <- trimws(toupper(dfrPatient$Gender))
summarise(group_by(dfrPatient, Gender), n())
## # A tibble: 2 × 2
##   Gender `n()`
##    <chr> <int>
## 1 FEMALE    55
## 2   MALE    45

Error handling in state

summarise(group_by(dfrPatient, State), n())
## # A tibble: 34 × 2
##          State `n()`
##          <chr> <int>
## 1      Alabama     2
## 2      Arizona     2
## 3   California    13
## 4     Colorado     1
## 5  Connecticut     1
## 6      Florida     8
## 7      Georgia     3
## 8  Georgia,xxx     1
## 9       Hawaii     2
## 10    Illinois     4
## # ... with 24 more rows
dfrPatient$States[dfrPatient$State=="Georgia,xxx"] <- "Georgia"
summarise(group_by(dfrPatient, State), n())
## # A tibble: 34 × 2
##          State `n()`
##          <chr> <int>
## 1      Alabama     2
## 2      Arizona     2
## 3   California    13
## 4     Colorado     1
## 5  Connecticut     1
## 6      Florida     8
## 7      Georgia     3
## 8  Georgia,xxx     1
## 9       Hawaii     2
## 10    Illinois     4
## # ... with 24 more rows
head(dfrPatient)
##          ID      Name  Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius White   MALE  False      182.87       76.57
## 2 AC/AH/017   Rosario White   MALE  False      179.12       80.43
## 3 AC/AH/020     Julio Black   MALE  False      169.15       75.48
## 4 AC/AH/022      Lupe White   MALE  False      175.66       94.54
## 5 AC/AH/029    Lavern White FEMALE  False      164.47       71.78
## 6 AC/AH/033    Bernie   Dog FEMALE   True      158.27       69.90
##    BirthDate        State  Pet HealthGrade  Died RecordDate      BMI
## 1 31-01-1972  Georgia,xxx  Dog      Normal False 25-11-2015 22.89674
## 2 09-06-1972     Missouri  Dog      Normal False 25-11-2015 25.06859
## 3 03-07-1972 Pennsylvania None      Normal False 25-11-2015 26.38080
## 4 11-08-1972      Florida  Cat  GoodHealth False 25-11-2015 30.63867
## 5 06-06-1973         Iowa NULL      Normal  True 25-11-2015 26.53567
## 6 25-06-1973     Maryland  Dog      Normal False 25-11-2015 27.90487
##     BMILabel  States
## 1     Normal Georgia
## 2 Overweight    <NA>
## 3 Overweight    <NA>
## 4      Obese    <NA>
## 5 Overweight    <NA>
## 6 Overweight    <NA>

Error handling in health grade

summarise(group_by(dfrPatient, HealthGrade), n())
## # A tibble: 3 × 2
##   HealthGrade `n()`
##         <chr> <int>
## 1   BadHealth    41
## 2  GoodHealth    29
## 3      Normal    30
class(dfrPatient$HealthGrade)
## [1] "character"
dfrPatient$HealthGrade[dfrPatient$HealthGrade==1] <- "GOOD"
dfrPatient$HealthGrade[dfrPatient$HealthGrade==2] <- "NORMAL"
dfrPatient$HealthGrade[dfrPatient$HealthGrade==3] <- "BAD"
dfrPatient$HealthGrade[dfrPatient$HealthGrade==99] <- NA
class(dfrPatient$HealthGrade)
## [1] "character"
summarise(group_by(dfrPatient, HealthGrade), n())
## # A tibble: 3 × 2
##   HealthGrade `n()`
##         <chr> <int>
## 1   BadHealth    41
## 2  GoodHealth    29
## 3      Normal    30

Error handling in smokes

summarise(group_by(dfrPatient, Smokes), n())
## # A tibble: 4 × 2
##   Smokes `n()`
##    <chr> <int>
## 1  False    72
## 2     No     6
## 3   True    18
## 4    Yes     4
class(dfrPatient$Smokes)
## [1] "character"
dfrPatient$Smokes <- as.logical(dfrPatient$Smokes)
class(dfrPatient$Smokes)
## [1] "logical"
summarise(group_by(dfrPatient, Smokes), n())
## # A tibble: 3 × 2
##   Smokes `n()`
##    <lgl> <int>
## 1  FALSE    72
## 2   TRUE    18
## 3     NA    10

Error handling in Pet

summarise(group_by(dfrPatient, Pet), n())
## # A tibble: 10 × 2
##      Pet `n()`
##    <chr> <int>
## 1   Bird     9
## 2    Cat    24
## 3    CAT     5
## 4    Dog    28
## 5    DOG     4
## 6  Horse     1
## 7   None    23
## 8   NONE     1
## 9   NULL     3
## 10  <NA>     2
dfrPatient$Pet <- trimws(toupper(dfrPatient$Pet))
dfrPatient$Pet[dfrPatient$Pet=="NONE"] <- NA
dfrPatient$Pet[dfrPatient$Pet=="NULL"] <- NA
summarise(group_by(dfrPatient, Pet), n())
## # A tibble: 5 × 2
##     Pet `n()`
##   <chr> <int>
## 1  BIRD     9
## 2   CAT    29
## 3   DOG    32
## 4 HORSE     1
## 5  <NA>    29

Error handling in Died

summarise(group_by(dfrPatient, Died), n())
## # A tibble: 2 × 2
##    Died `n()`
##   <chr> <int>
## 1 False    46
## 2  True    54
class(dfrPatient$Died)
## [1] "character"
dfrPatient$Died <- as.logical(dfrPatient$Died)
class(dfrPatient$Died)
## [1] "logical"
summarise(group_by(dfrPatient, Died), n())
## # A tibble: 2 × 2
##    Died `n()`
##   <lgl> <int>
## 1 FALSE    46
## 2  TRUE    54

Error handing in race

summarise(group_by(dfrPatient, Race), n())
## # A tibble: 6 × 2
##        Race `n()`
##       <chr> <int>
## 1     Asian     5
## 2 Bi-Racial     1
## 3     Black     8
## 4       Dog     1
## 5  Hispanic    17
## 6     White    68
dfrPatient$Race <- trimws(toupper(dfrPatient$Race))
dfrPatient$Race[dfrPatient$Race=="DOG"] <- NA
dfrPatient$Race[dfrPatient$Race=="BI-RACIAL"] <- NA
summarise(group_by(dfrPatient, Race), n())
## # A tibble: 5 × 2
##       Race `n()`
##      <chr> <int>
## 1    ASIAN     5
## 2    BLACK     8
## 3 HISPANIC    17
## 4    WHITE    68
## 5     <NA>     2

Removing all records with NA in any columns

vclComplete <- complete.cases(dfrPatient)
dfrPatient <- dfrPatient[vclComplete, ]
nrow(dfrPatient)
## [1] 1
head(dfrPatient)
##          ID      Name  Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius WHITE   MALE  FALSE      182.87       76.57
##    BirthDate       State Pet HealthGrade  Died RecordDate      BMI
## 1 31-01-1972 Georgia,xxx DOG      Normal FALSE 25-11-2015 22.89674
##   BMILabel  States
## 1   Normal Georgia

Reporting

Display top 10 recs of BMI

head(arrange(dfrPatient, desc(BMI)), 10)
##          ID      Name  Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius WHITE   MALE  FALSE      182.87       76.57
##    BirthDate       State Pet HealthGrade  Died RecordDate      BMI
## 1 31-01-1972 Georgia,xxx DOG      Normal FALSE 25-11-2015 22.89674
##   BMILabel  States
## 1   Normal Georgia
head(dfrPatient)
##          ID      Name  Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius WHITE   MALE  FALSE      182.87       76.57
##    BirthDate       State Pet HealthGrade  Died RecordDate      BMI
## 1 31-01-1972 Georgia,xxx DOG      Normal FALSE 25-11-2015 22.89674
##   BMILabel  States
## 1   Normal Georgia

Display bottom 10 recs of BMI

head(arrange(dfrPatient, BMI), 10)
##          ID      Name  Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius WHITE   MALE  FALSE      182.87       76.57
##    BirthDate       State Pet HealthGrade  Died RecordDate      BMI
## 1 31-01-1972 Georgia,xxx DOG      Normal FALSE 25-11-2015 22.89674
##   BMILabel  States
## 1   Normal Georgia

Display all people who are dead

dfrPatient <-
  filter(dfrPatient, Died == "True")

Display all Hispanic female records

dfrPatient<-
  filter(dfrPatient, Race == "Hispanic", Gender == "Female")

Gender>Race frequency/count

summarise(group_by(dfrPatient, Gender, Race), n())
## Source: local data frame [0 x 3]
## Groups: Gender [?]
## 
## # ... with 3 variables: Gender <chr>, Race <chr>, n() <int>

Race>Gender max,min and average for BMI

summarise(group_by(dfrPatient, Race, Gender), min(BMIValue), mean(BMIValue), max(BMIValue))
## Source: local data frame [0 x 5]
## Groups: Race [?]
## 
## # ... with 5 variables: Race <chr>, Gender <chr>, min(BMIValue) <lgl>,
## #   mean(BMIValue) <lgl>, max(BMIValue) <lgl>
head(dfrPatient)
##  [1] ID          Name        Race        Gender      Smokes     
##  [6] HeightInCms WeightInKgs BirthDate   State       Pet        
## [11] HealthGrade Died        RecordDate  BMI         BMILabel   
## [16] States     
## <0 rows> (or 0-length row.names)