title: “Assignment 3” author: “Sneha Kshatriya” date: “28th March 2017” output: html_document

Analysis Of Patient Data

Objectives

  • analyse patient data
  • data manipulation -data cleaning -undertstanding the concept of R Mardown
  • learning to publish on rpubs

Probem Definition

-carry out data manipulation -create columns as required -cleaning the data and removing errors from the data -remvoving missing values -filtering the data as per det requirements

Code & Output

knitr Global Options

# for development
knitr::opts_chunk$set(echo=TRUE, eval=TRUE, error=TRUE, warning=TRUE, message=TRUE, cache=FALSE, tidy=FALSE, fig.path='figures/')
# for production
#knitr::opts_chunk$set(echo=TRUE, eval=TRUE, error=FALSE, warning=FALSE, message=FALSE, cache=FALSE, tidy=FALSE, fig.path='figures/')

Load Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Read Data

# inline comments
setwd("/Users/snehakshatriya/Desktop/R-BA/R-Scripts")
dfrPatient <- read.csv("./data/patient-data.csv", header=T, stringsAsFactors=F)
intRowCount <- nrow(dfrPatient)
head(dfrPatient)
##          ID      Name  Race Gender Smokes HeightInCms HeightInmeters
## 1 AC/AH/001 Demetrius White   Male  FALSE      182.87         1.8287
## 2 AC/AH/017   Rosario White   Male  FALSE      179.12         1.7912
## 3 AC/AH/020     Julio Black   Male  FALSE      169.15         1.6915
## 4 AC/AH/022      Lupe White   Male  FALSE      175.66         1.7566
## 5 AC/AH/029    Lavern White Female  FALSE      164.47         1.6447
## 6 AC/AH/033    Bernie   Dog Female   TRUE      158.27         1.5827
##   WeightInKgs BirthDate        State  Pet HealthGrade  Died RecordDate
## 1       76.57  31/01/72  Georgia,xxx  Dog           2 FALSE   25/11/15
## 2       80.43  09/06/72     Missouri  Dog           2 FALSE   25/11/15
## 3       75.48  03/07/72 Pennsylvania None           2 FALSE   25/11/15
## 4       94.54  11/08/72      Florida  Cat           1 FALSE   25/11/15
## 5       71.78  06/06/73         Iowa NULL           2  TRUE   25/11/15
## 6       69.90  25/06/73     Maryland  Dog           2 FALSE   25/11/15

Total Rows Of Patient File: 100

Add coloumn BMI-Value

# inline comments
dfrPatient <- mutate(dfrPatient, BMIValue=(WeightInKgs/(HeightInCms/100)^2))
head(dfrPatient)
##          ID      Name  Race Gender Smokes HeightInCms HeightInmeters
## 1 AC/AH/001 Demetrius White   Male  FALSE      182.87         1.8287
## 2 AC/AH/017   Rosario White   Male  FALSE      179.12         1.7912
## 3 AC/AH/020     Julio Black   Male  FALSE      169.15         1.6915
## 4 AC/AH/022      Lupe White   Male  FALSE      175.66         1.7566
## 5 AC/AH/029    Lavern White Female  FALSE      164.47         1.6447
## 6 AC/AH/033    Bernie   Dog Female   TRUE      158.27         1.5827
##   WeightInKgs BirthDate        State  Pet HealthGrade  Died RecordDate
## 1       76.57  31/01/72  Georgia,xxx  Dog           2 FALSE   25/11/15
## 2       80.43  09/06/72     Missouri  Dog           2 FALSE   25/11/15
## 3       75.48  03/07/72 Pennsylvania None           2 FALSE   25/11/15
## 4       94.54  11/08/72      Florida  Cat           1 FALSE   25/11/15
## 5       71.78  06/06/73         Iowa NULL           2  TRUE   25/11/15
## 6       69.90  25/06/73     Maryland  Dog           2 FALSE   25/11/15
##   BMIValue
## 1 22.89674
## 2 25.06859
## 3 26.38080
## 4 30.63867
## 5 26.53567
## 6 27.90487

Add column BMI-Label

# inline comments
dfrPatient <- mutate(dfrPatient, BMILabel=NA)
dfrPatient$BMILabel <- ifelse(dfrPatient$BMIValue < 18.50,"UNDERWEIGHT",
                         ifelse(dfrPatient$BMIValue > 18.50 & dfrPatient$BMIValue < 25.00, "NORMAL",
                         ifelse(dfrPatient$BMIValue > 25.00 & dfrPatient$BMIValue < 30.00, "OVERWEIGHT",
                         ifelse(dfrPatient$BMIValue > 30.00,"Obese", NA))))
head(dfrPatient)
##          ID      Name  Race Gender Smokes HeightInCms HeightInmeters
## 1 AC/AH/001 Demetrius White   Male  FALSE      182.87         1.8287
## 2 AC/AH/017   Rosario White   Male  FALSE      179.12         1.7912
## 3 AC/AH/020     Julio Black   Male  FALSE      169.15         1.6915
## 4 AC/AH/022      Lupe White   Male  FALSE      175.66         1.7566
## 5 AC/AH/029    Lavern White Female  FALSE      164.47         1.6447
## 6 AC/AH/033    Bernie   Dog Female   TRUE      158.27         1.5827
##   WeightInKgs BirthDate        State  Pet HealthGrade  Died RecordDate
## 1       76.57  31/01/72  Georgia,xxx  Dog           2 FALSE   25/11/15
## 2       80.43  09/06/72     Missouri  Dog           2 FALSE   25/11/15
## 3       75.48  03/07/72 Pennsylvania None           2 FALSE   25/11/15
## 4       94.54  11/08/72      Florida  Cat           1 FALSE   25/11/15
## 5       71.78  06/06/73         Iowa NULL           2  TRUE   25/11/15
## 6       69.90  25/06/73     Maryland  Dog           2 FALSE   25/11/15
##   BMIValue   BMILabel
## 1 22.89674     NORMAL
## 2 25.06859 OVERWEIGHT
## 3 26.38080 OVERWEIGHT
## 4 30.63867      Obese
## 5 26.53567 OVERWEIGHT
## 6 27.90487 OVERWEIGHT

Healthgrade

summarise(group_by(dfrPatient, HealthGrade), n())
## # A tibble: 4 <U+00D7> 2
##   HealthGrade `n()`
##         <int> <int>
## 1           1    29
## 2           2    30
## 3           3    34
## 4          99     7
class(dfrPatient$HealthGrade)
## [1] "integer"
dfrPatient$HealthGrade[dfrPatient$HealthGrade==1] <- "GOOD"
dfrPatient$HealthGrade[dfrPatient$HealthGrade==2] <- "NORMAL"
dfrPatient$HealthGrade[dfrPatient$HealthGrade==3] <- "BAD"
dfrPatient$HealthGrade[dfrPatient$HealthGrade==99] <- NA
class(dfrPatient$HealthGrade)
## [1] "character"
summarise(group_by(dfrPatient, HealthGrade), n())
## # A tibble: 4 <U+00D7> 2
##   HealthGrade `n()`
##         <chr> <int>
## 1         BAD    34
## 2        GOOD    29
## 3      NORMAL    30
## 4        <NA>     7

Error Handling

#inline comments
summarise(group_by(dfrPatient, BMILabel), n())
## # A tibble: 3 <U+00D7> 2
##     BMILabel `n()`
##        <chr> <int>
## 1     NORMAL    23
## 2 OVERWEIGHT    71
## 3      Obese     6
cat("\014")

summarise(group_by(dfrPatient, Gender), n())
## # A tibble: 3 <U+00D7> 2
##    Gender `n()`
##     <chr> <int>
## 1  Female    54
## 2 Female      1
## 3    Male    45
cat("\014")

summarise(group_by(dfrPatient, Race), n())
## # A tibble: 6 <U+00D7> 2
##        Race `n()`
##       <chr> <int>
## 1     Asian     5
## 2 Bi-Racial     1
## 3     Black     8
## 4       Dog     1
## 5  Hispanic    17
## 6     White    68
cat("\014")

summarise(group_by(dfrPatient, Died), n())
## # A tibble: 2 <U+00D7> 2
##    Died `n()`
##   <lgl> <int>
## 1 FALSE    46
## 2  TRUE    54
cat("\014")

summarise(group_by(dfrPatient, Pet), n())
## # A tibble: 10 <U+00D7> 2
##      Pet `n()`
##    <chr> <int>
## 1   Bird     9
## 2    CAT     5
## 3    Cat    24
## 4    DOG     4
## 5    Dog    28
## 6  Horse     1
## 7   NONE     1
## 8   NULL     3
## 9   None    23
## 10  <NA>     2
cat("\014")

summarise(group_by(dfrPatient, Smokes), n())
## # A tibble: 4 <U+00D7> 2
##   Smokes `n()`
##    <chr> <int>
## 1  FALSE    72
## 2     No     6
## 3   TRUE    18
## 4    Yes     4
cat("\014")

summarise(group_by(dfrPatient, HealthGrade), n())
## # A tibble: 4 <U+00D7> 2
##   HealthGrade `n()`
##         <chr> <int>
## 1         BAD    34
## 2        GOOD    29
## 3      NORMAL    30
## 4        <NA>     7
cat("\014")

summarise(group_by(dfrPatient, State), n())
## # A tibble: 34 <U+00D7> 2
##          State `n()`
##          <chr> <int>
## 1      Alabama     2
## 2      Arizona     2
## 3   California    13
## 4     Colorado     1
## 5  Connecticut     1
## 6      Florida     8
## 7      Georgia     3
## 8  Georgia,xxx     1
## 9       Hawaii     2
## 10    Illinois     4
## # ... with 24 more rows

Error handling in gender

#inline comments
summarise(group_by(dfrPatient, Gender), n())
## # A tibble: 3 <U+00D7> 2
##    Gender `n()`
##     <chr> <int>
## 1  Female    54
## 2 Female      1
## 3    Male    45
dfrPatient$Gender <- trimws(toupper(dfrPatient$Gender))
summarise(group_by(dfrPatient, Gender), n())
## # A tibble: 2 <U+00D7> 2
##   Gender `n()`
##    <chr> <int>
## 1 FEMALE    55
## 2   MALE    45

Error handling in race

summarise(group_by(dfrPatient, Race), n())
## # A tibble: 6 <U+00D7> 2
##        Race `n()`
##       <chr> <int>
## 1     Asian     5
## 2 Bi-Racial     1
## 3     Black     8
## 4       Dog     1
## 5  Hispanic    17
## 6     White    68
dfrPatient$Race <- trimws(toupper(dfrPatient$Race))
dfrPatient$Race[dfrPatient$Race=="DOG"] <- NA
dfrPatient$Race[dfrPatient$Race=="BI-RACIAL"] <- NA
summarise(group_by(dfrPatient, Race), n())
## # A tibble: 5 <U+00D7> 2
##       Race `n()`
##      <chr> <int>
## 1    ASIAN     5
## 2    BLACK     8
## 3 HISPANIC    17
## 4    WHITE    68
## 5     <NA>     2

Error handling in died

summarise(group_by(dfrPatient, Died), n())
## # A tibble: 2 <U+00D7> 2
##    Died `n()`
##   <lgl> <int>
## 1 FALSE    46
## 2  TRUE    54
class(dfrPatient$Died)
## [1] "logical"
dfrPatient$Died <- as.logical(dfrPatient$Died)
class(dfrPatient$Died)
## [1] "logical"
summarise(group_by(dfrPatient, Died), n())
## # A tibble: 2 <U+00D7> 2
##    Died `n()`
##   <lgl> <int>
## 1 FALSE    46
## 2  TRUE    54

Error handling in pet

summarise(group_by(dfrPatient, Pet), n())
## # A tibble: 10 <U+00D7> 2
##      Pet `n()`
##    <chr> <int>
## 1   Bird     9
## 2    CAT     5
## 3    Cat    24
## 4    DOG     4
## 5    Dog    28
## 6  Horse     1
## 7   NONE     1
## 8   NULL     3
## 9   None    23
## 10  <NA>     2
dfrPatient$Pet <- trimws(toupper(dfrPatient$Pet))
dfrPatient$Pet[dfrPatient$Pet=="NONE"] <- NA
dfrPatient$Pet[dfrPatient$Pet=="NULL"] <- NA
summarise(group_by(dfrPatient, Pet), n())
## # A tibble: 5 <U+00D7> 2
##     Pet `n()`
##   <chr> <int>
## 1  BIRD     9
## 2   CAT    29
## 3   DOG    32
## 4 HORSE     1
## 5  <NA>    29

Error handling in smokes

summarise(group_by(dfrPatient, Smokes), n())
## # A tibble: 4 <U+00D7> 2
##   Smokes `n()`
##    <chr> <int>
## 1  FALSE    72
## 2     No     6
## 3   TRUE    18
## 4    Yes     4
class(dfrPatient$Smokes)
## [1] "character"
dfrPatient$Smokes <- as.logical(dfrPatient$Smokes)
class(dfrPatient$Smokes)
## [1] "logical"
summarise(group_by(dfrPatient, Smokes), n())
## # A tibble: 3 <U+00D7> 2
##   Smokes `n()`
##    <lgl> <int>
## 1  FALSE    72
## 2   TRUE    18
## 3     NA    10

Error handling in State

summarise(group_by(dfrPatient, State), n())
## # A tibble: 34 <U+00D7> 2
##          State `n()`
##          <chr> <int>
## 1      Alabama     2
## 2      Arizona     2
## 3   California    13
## 4     Colorado     1
## 5  Connecticut     1
## 6      Florida     8
## 7      Georgia     3
## 8  Georgia,xxx     1
## 9       Hawaii     2
## 10    Illinois     4
## # ... with 24 more rows
dfrPatient$States[dfrPatient$State=="Georgia,xxx"] <- "Georgia"
summarise(group_by(dfrPatient, State), n())
## # A tibble: 34 <U+00D7> 2
##          State `n()`
##          <chr> <int>
## 1      Alabama     2
## 2      Arizona     2
## 3   California    13
## 4     Colorado     1
## 5  Connecticut     1
## 6      Florida     8
## 7      Georgia     3
## 8  Georgia,xxx     1
## 9       Hawaii     2
## 10    Illinois     4
## # ... with 24 more rows

Complete cases

cat("\014")

vclComplete <- complete.cases(dfrPatient)
vclComplete[is.true(vclComplete)]
## Error in eval(expr, envir, enclos): could not find function "is.true"
dfrPatient <- dfrPatient[vclComplete, ]
nrow(dfrPatient)
## [1] 1

Reporting

Display top 10 records based on BMI-Value.

head(arrange(dfrPatient, desc(BMIValue)), 10)
##          ID      Name  Race Gender Smokes HeightInCms HeightInmeters
## 1 AC/AH/001 Demetrius WHITE   MALE  FALSE      182.87         1.8287
##   WeightInKgs BirthDate       State Pet HealthGrade  Died RecordDate
## 1       76.57  31/01/72 Georgia,xxx DOG      NORMAL FALSE   25/11/15
##   BMIValue BMILabel  States
## 1 22.89674   NORMAL Georgia

Display bottom 10 records based on BMI-Value.

head(arrange(dfrPatient, BMIValue), 10)
##          ID      Name  Race Gender Smokes HeightInCms HeightInmeters
## 1 AC/AH/001 Demetrius WHITE   MALE  FALSE      182.87         1.8287
##   WeightInKgs BirthDate       State Pet HealthGrade  Died RecordDate
## 1       76.57  31/01/72 Georgia,xxx DOG      NORMAL FALSE   25/11/15
##   BMIValue BMILabel  States
## 1 22.89674   NORMAL Georgia

Gender > Race - frequency / counts

summarise(group_by(dfrPatient, Gender, Race), n())
## Source: local data frame [1 x 3]
## Groups: Gender [?]
## 
##   Gender  Race `n()`
##    <chr> <chr> <int>
## 1   MALE WHITE     1

Race > Gender - max, min and average values for BMI-Values

summarise(group_by(dfrPatient, Race, Gender), min(BMIValue), mean(BMIValue), max(BMIValue))
## Source: local data frame [1 x 5]
## Groups: Race [?]
## 
##    Race Gender `min(BMIValue)` `mean(BMIValue)` `max(BMIValue)`
##   <chr>  <chr>           <dbl>            <dbl>           <dbl>
## 1 WHITE   MALE        22.89674         22.89674        22.89674

All dead people

filter(dfrPatient, Died==TRUE)
##  [1] ID             Name           Race           Gender        
##  [5] Smokes         HeightInCms    HeightInmeters WeightInKgs   
##  [9] BirthDate      State          Pet            HealthGrade   
## [13] Died           RecordDate     BMIValue       BMILabel      
## [17] States        
## <0 rows> (or 0-length row.names)
nrow(filter(dfrPatient, Died==TRUE))
## [1] 0

Hispanic Females

filter(dfrPatient, Race=="HISPANIC" & Gender=="FEMALE")
##  [1] ID             Name           Race           Gender        
##  [5] Smokes         HeightInCms    HeightInmeters WeightInKgs   
##  [9] BirthDate      State          Pet            HealthGrade   
## [13] Died           RecordDate     BMIValue       BMILabel      
## [17] States        
## <0 rows> (or 0-length row.names)
nrow(filter(dfrPatient, Race=="HISPANIC" & Gender=="FEMALE"))
## [1] 0

7 sample records from the dataset using seed(707)

set.seed(707)
sample_n(dfrPatient, 10)
## Error: Sample size (10) greater than population size (1). Do you want replace = TRUE?

Note for R-BA students:
now start the same with your code

Summary

Note
Patient-data gives us information about the patients in a particular hospital ward

Objectives
The objectives of analyis of data, study of dplyr package, working of rmarkdown and publishing an html document on rpubs was successfully met.