Load Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Read Data
# inline comments
dfrPatient <- read.csv("D:/R-BA/R-Scripts/data/patient-data.csv", header=T, stringsAsFactors=F)
intRowCount <- nrow(dfrPatient)
head(dfrPatient)
## ID Name Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius White Male False 182.87 76.57
## 2 AC/AH/017 Rosario White Male False 179.12 80.43
## 3 AC/AH/020 Julio Black Male False 169.15 75.48
## 4 AC/AH/022 Lupe White Male False 175.66 94.54
## 5 AC/AH/029 Lavern White Female False 164.47 71.78
## 6 AC/AH/033 Bernie Dog Female True 158.27 69.90
## BirthDate State Pet HealthGrade Died RecordDate
## 1 31-01-1972 Georgia,xxx Dog 2 False 25-11-2015
## 2 09-06-1972 Missouri Dog 2 False 25-11-2015
## 3 03-07-1972 Pennsylvania None 2 False 25-11-2015
## 4 11-08-1972 Florida Cat 1 False 25-11-2015
## 5 06-06-1973 Iowa NULL 2 True 25-11-2015
## 6 25-06-1973 Maryland Dog 2 False 25-11-2015
Total Rows Of Patient File: 100
**Add column BMI* #Provide new column BMI
dfrPatient<- mutate(dfrPatient,
BMI=((WeightInKgs)/(HeightInCms/100)^2))
head(dfrPatient)
## ID Name Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius White Male False 182.87 76.57
## 2 AC/AH/017 Rosario White Male False 179.12 80.43
## 3 AC/AH/020 Julio Black Male False 169.15 75.48
## 4 AC/AH/022 Lupe White Male False 175.66 94.54
## 5 AC/AH/029 Lavern White Female False 164.47 71.78
## 6 AC/AH/033 Bernie Dog Female True 158.27 69.90
## BirthDate State Pet HealthGrade Died RecordDate BMI
## 1 31-01-1972 Georgia,xxx Dog 2 False 25-11-2015 22.89674
## 2 09-06-1972 Missouri Dog 2 False 25-11-2015 25.06859
## 3 03-07-1972 Pennsylvania None 2 False 25-11-2015 26.38080
## 4 11-08-1972 Florida Cat 1 False 25-11-2015 30.63867
## 5 06-06-1973 Iowa NULL 2 True 25-11-2015 26.53567
## 6 25-06-1973 Maryland Dog 2 False 25-11-2015 27.90487
Add column BMILabel
dfrPatient <- mutate(
dfrPatient
,
BMILabel = ifelse (
dfrPatient$BMI < 18.5,
"Underweight",
ifelse (
dfrPatient$BMI >= 18.5 &
dfrPatient$BMI < 25,
"Normal",
ifelse (
dfrPatient$BMI >= 25 &
dfrPatient$BMI < 30,
"Overweight",
"Obese"
)
)
)
)
head(dfrPatient)
## ID Name Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius White Male False 182.87 76.57
## 2 AC/AH/017 Rosario White Male False 179.12 80.43
## 3 AC/AH/020 Julio Black Male False 169.15 75.48
## 4 AC/AH/022 Lupe White Male False 175.66 94.54
## 5 AC/AH/029 Lavern White Female False 164.47 71.78
## 6 AC/AH/033 Bernie Dog Female True 158.27 69.90
## BirthDate State Pet HealthGrade Died RecordDate BMI
## 1 31-01-1972 Georgia,xxx Dog 2 False 25-11-2015 22.89674
## 2 09-06-1972 Missouri Dog 2 False 25-11-2015 25.06859
## 3 03-07-1972 Pennsylvania None 2 False 25-11-2015 26.38080
## 4 11-08-1972 Florida Cat 1 False 25-11-2015 30.63867
## 5 06-06-1973 Iowa NULL 2 True 25-11-2015 26.53567
## 6 25-06-1973 Maryland Dog 2 False 25-11-2015 27.90487
## BMILabel
## 1 Normal
## 2 Overweight
## 3 Overweight
## 4 Obese
## 5 Overweight
## 6 Overweight
Covert health grades of the records
dfrPatient$HealthGrade <- ifelse(dfrPatient$HealthGrade==1,
"GoodHealth",
ifelse(
dfrPatient$HealthGrade==2,
"Normal",
"BadHealth" ))
head(dfrPatient)
## ID Name Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius White Male False 182.87 76.57
## 2 AC/AH/017 Rosario White Male False 179.12 80.43
## 3 AC/AH/020 Julio Black Male False 169.15 75.48
## 4 AC/AH/022 Lupe White Male False 175.66 94.54
## 5 AC/AH/029 Lavern White Female False 164.47 71.78
## 6 AC/AH/033 Bernie Dog Female True 158.27 69.90
## BirthDate State Pet HealthGrade Died RecordDate BMI
## 1 31-01-1972 Georgia,xxx Dog Normal False 25-11-2015 22.89674
## 2 09-06-1972 Missouri Dog Normal False 25-11-2015 25.06859
## 3 03-07-1972 Pennsylvania None Normal False 25-11-2015 26.38080
## 4 11-08-1972 Florida Cat GoodHealth False 25-11-2015 30.63867
## 5 06-06-1973 Iowa NULL Normal True 25-11-2015 26.53567
## 6 25-06-1973 Maryland Dog Normal False 25-11-2015 27.90487
## BMILabel
## 1 Normal
## 2 Overweight
## 3 Overweight
## 4 Obese
## 5 Overweight
## 6 Overweight
**summarise to find errors and missing data*
summarise(group_by(dfrPatient, BMILabel), n())
## # A tibble: 3 × 2
## BMILabel `n()`
## <chr> <int>
## 1 Normal 23
## 2 Obese 6
## 3 Overweight 71
summarise(group_by(dfrPatient, Gender), n())
## # A tibble: 6 × 2
## Gender `n()`
## <chr> <int>
## 1 Female 6
## 2 Male 3
## 3 Female 45
## 4 Female 4
## 5 Male 40
## 6 Male 2
summarise(group_by(dfrPatient, Race), n())
## # A tibble: 6 × 2
## Race `n()`
## <chr> <int>
## 1 Asian 5
## 2 Bi-Racial 1
## 3 Black 8
## 4 Dog 1
## 5 Hispanic 17
## 6 White 68
summarise(group_by(dfrPatient, Died), n())
## # A tibble: 2 × 2
## Died `n()`
## <chr> <int>
## 1 False 46
## 2 True 54
summarise(group_by(dfrPatient, Pet), n())
## # A tibble: 10 × 2
## Pet `n()`
## <chr> <int>
## 1 Bird 9
## 2 Cat 24
## 3 CAT 5
## 4 Dog 28
## 5 DOG 4
## 6 Horse 1
## 7 None 23
## 8 NONE 1
## 9 NULL 3
## 10 <NA> 2
summarise(group_by(dfrPatient, Smokes), n())
## # A tibble: 4 × 2
## Smokes `n()`
## <chr> <int>
## 1 False 72
## 2 No 6
## 3 True 18
## 4 Yes 4
summarise(group_by(dfrPatient, HealthGrade), n())
## # A tibble: 3 × 2
## HealthGrade `n()`
## <chr> <int>
## 1 BadHealth 41
## 2 GoodHealth 29
## 3 Normal 30
summarise(group_by(dfrPatient, State), n())
## # A tibble: 34 × 2
## State `n()`
## <chr> <int>
## 1 Alabama 2
## 2 Arizona 2
## 3 California 13
## 4 Colorado 1
## 5 Connecticut 1
## 6 Florida 8
## 7 Georgia 3
## 8 Georgia,xxx 1
## 9 Hawaii 2
## 10 Illinois 4
## # ... with 24 more rows
Error handling in gender
summarise(group_by(dfrPatient, Gender), n())
## # A tibble: 6 × 2
## Gender `n()`
## <chr> <int>
## 1 Female 6
## 2 Male 3
## 3 Female 45
## 4 Female 4
## 5 Male 40
## 6 Male 2
dfrPatient$Gender <- trimws(toupper(dfrPatient$Gender))
summarise(group_by(dfrPatient, Gender), n())
## # A tibble: 2 × 2
## Gender `n()`
## <chr> <int>
## 1 FEMALE 55
## 2 MALE 45
Error handling in state
summarise(group_by(dfrPatient, State), n())
## # A tibble: 34 × 2
## State `n()`
## <chr> <int>
## 1 Alabama 2
## 2 Arizona 2
## 3 California 13
## 4 Colorado 1
## 5 Connecticut 1
## 6 Florida 8
## 7 Georgia 3
## 8 Georgia,xxx 1
## 9 Hawaii 2
## 10 Illinois 4
## # ... with 24 more rows
dfrPatient$States[dfrPatient$State=="Georgia,xxx"] <- "Georgia"
summarise(group_by(dfrPatient, State), n())
## # A tibble: 34 × 2
## State `n()`
## <chr> <int>
## 1 Alabama 2
## 2 Arizona 2
## 3 California 13
## 4 Colorado 1
## 5 Connecticut 1
## 6 Florida 8
## 7 Georgia 3
## 8 Georgia,xxx 1
## 9 Hawaii 2
## 10 Illinois 4
## # ... with 24 more rows
head(dfrPatient)
## ID Name Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius White MALE False 182.87 76.57
## 2 AC/AH/017 Rosario White MALE False 179.12 80.43
## 3 AC/AH/020 Julio Black MALE False 169.15 75.48
## 4 AC/AH/022 Lupe White MALE False 175.66 94.54
## 5 AC/AH/029 Lavern White FEMALE False 164.47 71.78
## 6 AC/AH/033 Bernie Dog FEMALE True 158.27 69.90
## BirthDate State Pet HealthGrade Died RecordDate BMI
## 1 31-01-1972 Georgia,xxx Dog Normal False 25-11-2015 22.89674
## 2 09-06-1972 Missouri Dog Normal False 25-11-2015 25.06859
## 3 03-07-1972 Pennsylvania None Normal False 25-11-2015 26.38080
## 4 11-08-1972 Florida Cat GoodHealth False 25-11-2015 30.63867
## 5 06-06-1973 Iowa NULL Normal True 25-11-2015 26.53567
## 6 25-06-1973 Maryland Dog Normal False 25-11-2015 27.90487
## BMILabel States
## 1 Normal Georgia
## 2 Overweight <NA>
## 3 Overweight <NA>
## 4 Obese <NA>
## 5 Overweight <NA>
## 6 Overweight <NA>
Error handling in health grade
summarise(group_by(dfrPatient, HealthGrade), n())
## # A tibble: 3 × 2
## HealthGrade `n()`
## <chr> <int>
## 1 BadHealth 41
## 2 GoodHealth 29
## 3 Normal 30
class(dfrPatient$HealthGrade)
## [1] "character"
dfrPatient$HealthGrade[dfrPatient$HealthGrade==1] <- "GOOD"
dfrPatient$HealthGrade[dfrPatient$HealthGrade==2] <- "NORMAL"
dfrPatient$HealthGrade[dfrPatient$HealthGrade==3] <- "BAD"
dfrPatient$HealthGrade[dfrPatient$HealthGrade==99] <- NA
class(dfrPatient$HealthGrade)
## [1] "character"
summarise(group_by(dfrPatient, HealthGrade), n())
## # A tibble: 3 × 2
## HealthGrade `n()`
## <chr> <int>
## 1 BadHealth 41
## 2 GoodHealth 29
## 3 Normal 30
Error handling in smokes
summarise(group_by(dfrPatient, Smokes), n())
## # A tibble: 4 × 2
## Smokes `n()`
## <chr> <int>
## 1 False 72
## 2 No 6
## 3 True 18
## 4 Yes 4
class(dfrPatient$Smokes)
## [1] "character"
dfrPatient$Smokes <- as.logical(dfrPatient$Smokes)
class(dfrPatient$Smokes)
## [1] "logical"
summarise(group_by(dfrPatient, Smokes), n())
## # A tibble: 3 × 2
## Smokes `n()`
## <lgl> <int>
## 1 FALSE 72
## 2 TRUE 18
## 3 NA 10
Error handling in Pet
summarise(group_by(dfrPatient, Pet), n())
## # A tibble: 10 × 2
## Pet `n()`
## <chr> <int>
## 1 Bird 9
## 2 Cat 24
## 3 CAT 5
## 4 Dog 28
## 5 DOG 4
## 6 Horse 1
## 7 None 23
## 8 NONE 1
## 9 NULL 3
## 10 <NA> 2
dfrPatient$Pet <- trimws(toupper(dfrPatient$Pet))
dfrPatient$Pet[dfrPatient$Pet=="NONE"] <- NA
dfrPatient$Pet[dfrPatient$Pet=="NULL"] <- NA
summarise(group_by(dfrPatient, Pet), n())
## # A tibble: 5 × 2
## Pet `n()`
## <chr> <int>
## 1 BIRD 9
## 2 CAT 29
## 3 DOG 32
## 4 HORSE 1
## 5 <NA> 29
Error handling in Died
summarise(group_by(dfrPatient, Died), n())
## # A tibble: 2 × 2
## Died `n()`
## <chr> <int>
## 1 False 46
## 2 True 54
class(dfrPatient$Died)
## [1] "character"
dfrPatient$Died <- as.logical(dfrPatient$Died)
class(dfrPatient$Died)
## [1] "logical"
summarise(group_by(dfrPatient, Died), n())
## # A tibble: 2 × 2
## Died `n()`
## <lgl> <int>
## 1 FALSE 46
## 2 TRUE 54
Error handing in race
summarise(group_by(dfrPatient, Race), n())
## # A tibble: 6 × 2
## Race `n()`
## <chr> <int>
## 1 Asian 5
## 2 Bi-Racial 1
## 3 Black 8
## 4 Dog 1
## 5 Hispanic 17
## 6 White 68
dfrPatient$Race <- trimws(toupper(dfrPatient$Race))
dfrPatient$Race[dfrPatient$Race=="DOG"] <- NA
dfrPatient$Race[dfrPatient$Race=="BI-RACIAL"] <- NA
summarise(group_by(dfrPatient, Race), n())
## # A tibble: 5 × 2
## Race `n()`
## <chr> <int>
## 1 ASIAN 5
## 2 BLACK 8
## 3 HISPANIC 17
## 4 WHITE 68
## 5 <NA> 2
Removing all records with NA in any columns
vclComplete <- complete.cases(dfrPatient)
dfrPatient <- dfrPatient[vclComplete, ]
nrow(dfrPatient)
## [1] 1
head(dfrPatient)
## ID Name Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius WHITE MALE FALSE 182.87 76.57
## BirthDate State Pet HealthGrade Died RecordDate BMI
## 1 31-01-1972 Georgia,xxx DOG Normal FALSE 25-11-2015 22.89674
## BMILabel States
## 1 Normal Georgia
Display top 10 recs of BMI
head(arrange(dfrPatient, desc(BMI)), 10)
## ID Name Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius WHITE MALE FALSE 182.87 76.57
## BirthDate State Pet HealthGrade Died RecordDate BMI
## 1 31-01-1972 Georgia,xxx DOG Normal FALSE 25-11-2015 22.89674
## BMILabel States
## 1 Normal Georgia
head(dfrPatient)
## ID Name Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius WHITE MALE FALSE 182.87 76.57
## BirthDate State Pet HealthGrade Died RecordDate BMI
## 1 31-01-1972 Georgia,xxx DOG Normal FALSE 25-11-2015 22.89674
## BMILabel States
## 1 Normal Georgia
Display bottom 10 recs of BMI
head(arrange(dfrPatient, BMI), 10)
## ID Name Race Gender Smokes HeightInCms WeightInKgs
## 1 AC/AH/001 Demetrius WHITE MALE FALSE 182.87 76.57
## BirthDate State Pet HealthGrade Died RecordDate BMI
## 1 31-01-1972 Georgia,xxx DOG Normal FALSE 25-11-2015 22.89674
## BMILabel States
## 1 Normal Georgia
Display all people who are dead
dfrPatient <-
filter(dfrPatient, Died == "True")
Display all Hispanic female records
dfrPatient<-
filter(dfrPatient, Race == "Hispanic", Gender == "Female")
Gender>Race frequency/count
summarise(group_by(dfrPatient, Gender, Race), n())
## Source: local data frame [0 x 3]
## Groups: Gender [?]
##
## # ... with 3 variables: Gender <chr>, Race <chr>, n() <int>
Race>Gender max,min and average for BMI
summarise(group_by(dfrPatient, Race, Gender), min(BMIValue), mean(BMIValue), max(BMIValue))
## Source: local data frame [0 x 5]
## Groups: Race [?]
##
## # ... with 5 variables: Race <chr>, Gender <chr>, min(BMIValue) <lgl>,
## # mean(BMIValue) <lgl>, max(BMIValue) <lgl>
head(dfrPatient)
## [1] ID Name Race Gender Smokes
## [6] HeightInCms WeightInKgs BirthDate State Pet
## [11] HealthGrade Died RecordDate BMI BMILabel
## [16] States
## <0 rows> (or 0-length row.names)