title: “Assignment 3” author: “Sneha Kshatriya” date: “28th March 2017” output: html_document
-carry out data manipulation -create columns as required -cleaning the data and removing errors from the data -removing missing values -filtering the data as per the requirements
knitr Global Options
# Development settings: echo the code and surface errors, warnings and
# messages in the rendered document; figures are written under figures/.
knitr::opts_chunk$set(
  echo = TRUE,
  eval = TRUE,
  error = TRUE,
  warning = TRUE,
  message = TRUE,
  cache = FALSE,
  tidy = FALSE,
  fig.path = 'figures/'
)
# Production settings (disabled): same as above but with errors, warnings
# and messages turned off.
#knitr::opts_chunk$set(echo=TRUE, eval=TRUE, error=FALSE, warning=FALSE, message=FALSE, cache=FALSE, tidy=FALSE, fig.path='figures/')
Load Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Read Data
# Read the patient data set into dfrPatient.
# The file location is assembled with file.path() instead of calling setwd(),
# so loading the data does not change the session's working directory.
# NOTE(review): the directory is machine-specific; adjust it when running elsewhere.
strDataDir <- "/Users/snehakshatriya/Desktop/R-BA/R-Scripts/data"
dfrPatient <- read.csv(
  file.path(strDataDir, "patient-data.csv"),
  header = TRUE,            # spelled out: T can be reassigned, TRUE cannot
  stringsAsFactors = FALSE  # keep text columns as character for later cleaning
)
# Row count of the raw file, before any cleaning or filtering.
intRowCount <- nrow(dfrPatient)
head(dfrPatient)
## ID Name Race Gender Smokes HeightInCms HeightInmeters
## 1 AC/AH/001 Demetrius White Male FALSE 182.87 1.8287
## 2 AC/AH/017 Rosario White Male FALSE 179.12 1.7912
## 3 AC/AH/020 Julio Black Male FALSE 169.15 1.6915
## 4 AC/AH/022 Lupe White Male FALSE 175.66 1.7566
## 5 AC/AH/029 Lavern White Female FALSE 164.47 1.6447
## 6 AC/AH/033 Bernie Dog Female TRUE 158.27 1.5827
## WeightInKgs BirthDate State Pet HealthGrade Died RecordDate
## 1 76.57 31/01/72 Georgia,xxx Dog 2 FALSE 25/11/15
## 2 80.43 09/06/72 Missouri Dog 2 FALSE 25/11/15
## 3 75.48 03/07/72 Pennsylvania None 2 FALSE 25/11/15
## 4 94.54 11/08/72 Florida Cat 1 FALSE 25/11/15
## 5 71.78 06/06/73 Iowa NULL 2 TRUE 25/11/15
## 6 69.90 25/06/73 Maryland Dog 2 FALSE 25/11/15
Total Rows Of Patient File: 100
Add column BMI-Value
# BMI = weight (kg) / height (m)^2; the height in centimetres is divided
# by 100 to obtain metres.
dfrPatient <- dfrPatient %>%
  mutate(BMIValue = WeightInKgs / (HeightInCms / 100)^2)
head(dfrPatient)
## ID Name Race Gender Smokes HeightInCms HeightInmeters
## 1 AC/AH/001 Demetrius White Male FALSE 182.87 1.8287
## 2 AC/AH/017 Rosario White Male FALSE 179.12 1.7912
## 3 AC/AH/020 Julio Black Male FALSE 169.15 1.6915
## 4 AC/AH/022 Lupe White Male FALSE 175.66 1.7566
## 5 AC/AH/029 Lavern White Female FALSE 164.47 1.6447
## 6 AC/AH/033 Bernie Dog Female TRUE 158.27 1.5827
## WeightInKgs BirthDate State Pet HealthGrade Died RecordDate
## 1 76.57 31/01/72 Georgia,xxx Dog 2 FALSE 25/11/15
## 2 80.43 09/06/72 Missouri Dog 2 FALSE 25/11/15
## 3 75.48 03/07/72 Pennsylvania None 2 FALSE 25/11/15
## 4 94.54 11/08/72 Florida Cat 1 FALSE 25/11/15
## 5 71.78 06/06/73 Iowa NULL 2 TRUE 25/11/15
## 6 69.90 25/06/73 Maryland Dog 2 FALSE 25/11/15
## BMIValue
## 1 22.89674
## 2 25.06859
## 3 26.38080
## 4 30.63867
## 5 26.53567
## 6 27.90487
Add column BMI-Label
# Classify each patient's BMI into a weight category.
# Intervals are closed on the left so that boundary values receive a label;
# strict </> comparisons would leave a BMI of exactly 18.5, 25 or 30 as NA:
#   [-Inf, 18.5) UNDERWEIGHT | [18.5, 25) NORMAL | [25, 30) OVERWEIGHT | [30, Inf) Obese
# A missing BMIValue still yields NA. cut() returns a factor, so it is
# converted back to character to keep the column type unchanged.
dfrPatient$BMILabel <- as.character(cut(
  dfrPatient$BMIValue,
  breaks = c(-Inf, 18.5, 25, 30, Inf),
  labels = c("UNDERWEIGHT", "NORMAL", "OVERWEIGHT", "Obese"),
  right = FALSE
))
head(dfrPatient)
## ID Name Race Gender Smokes HeightInCms HeightInmeters
## 1 AC/AH/001 Demetrius White Male FALSE 182.87 1.8287
## 2 AC/AH/017 Rosario White Male FALSE 179.12 1.7912
## 3 AC/AH/020 Julio Black Male FALSE 169.15 1.6915
## 4 AC/AH/022 Lupe White Male FALSE 175.66 1.7566
## 5 AC/AH/029 Lavern White Female FALSE 164.47 1.6447
## 6 AC/AH/033 Bernie Dog Female TRUE 158.27 1.5827
## WeightInKgs BirthDate State Pet HealthGrade Died RecordDate
## 1 76.57 31/01/72 Georgia,xxx Dog 2 FALSE 25/11/15
## 2 80.43 09/06/72 Missouri Dog 2 FALSE 25/11/15
## 3 75.48 03/07/72 Pennsylvania None 2 FALSE 25/11/15
## 4 94.54 11/08/72 Florida Cat 1 FALSE 25/11/15
## 5 71.78 06/06/73 Iowa NULL 2 TRUE 25/11/15
## 6 69.90 25/06/73 Maryland Dog 2 FALSE 25/11/15
## BMIValue BMILabel
## 1 22.89674 NORMAL
## 2 25.06859 OVERWEIGHT
## 3 26.38080 OVERWEIGHT
## 4 30.63867 Obese
## 5 26.53567 OVERWEIGHT
## 6 27.90487 OVERWEIGHT
summarise(group_by(dfrPatient, HealthGrade), n())
## # A tibble: 4 <U+00D7> 2
## HealthGrade `n()`
## <int> <int>
## 1 1 29
## 2 2 30
## 3 3 34
## 4 99 7
class(dfrPatient$HealthGrade)
## [1] "integer"
# Recode the integer health grades into text labels through a lookup:
# 1 -> GOOD, 2 -> NORMAL, 3 -> BAD, 99 -> NA (presumably a "not recorded"
# sentinel -- confirm against the data dictionary).
# Any code not in the lookup is kept as its character representation.
vinGradeCodes <- c(1L, 2L, 3L, 99L)
vchGradeNames <- c("GOOD", "NORMAL", "BAD", NA_character_)
vinGradeIdx <- match(dfrPatient$HealthGrade, vinGradeCodes)
vlgKnownGrade <- !is.na(vinGradeIdx)
vchGrade <- as.character(dfrPatient$HealthGrade)
vchGrade[vlgKnownGrade] <- vchGradeNames[vinGradeIdx[vlgKnownGrade]]
dfrPatient$HealthGrade <- vchGrade
class(dfrPatient$HealthGrade)
## [1] "character"
summarise(group_by(dfrPatient, HealthGrade), n())
## # A tibble: 4 <U+00D7> 2
## HealthGrade `n()`
## <chr> <int>
## 1 BAD 34
## 2 GOOD 29
## 3 NORMAL 30
## 4 <NA> 7
#inline comments
summarise(group_by(dfrPatient, BMILabel), n())
## # A tibble: 3 <U+00D7> 2
## BMILabel `n()`
## <chr> <int>
## 1 NORMAL 23
## 2 OVERWEIGHT 71
## 3 Obese 6
cat("\014")
summarise(group_by(dfrPatient, Gender), n())
## # A tibble: 3 <U+00D7> 2
## Gender `n()`
## <chr> <int>
## 1 Female 54
## 2 Female 1
## 3 Male 45
cat("\014")
summarise(group_by(dfrPatient, Race), n())
## # A tibble: 6 <U+00D7> 2
## Race `n()`
## <chr> <int>
## 1 Asian 5
## 2 Bi-Racial 1
## 3 Black 8
## 4 Dog 1
## 5 Hispanic 17
## 6 White 68
cat("\014")
summarise(group_by(dfrPatient, Died), n())
## # A tibble: 2 <U+00D7> 2
## Died `n()`
## <lgl> <int>
## 1 FALSE 46
## 2 TRUE 54
cat("\014")
summarise(group_by(dfrPatient, Pet), n())
## # A tibble: 10 <U+00D7> 2
## Pet `n()`
## <chr> <int>
## 1 Bird 9
## 2 CAT 5
## 3 Cat 24
## 4 DOG 4
## 5 Dog 28
## 6 Horse 1
## 7 NONE 1
## 8 NULL 3
## 9 None 23
## 10 <NA> 2
cat("\014")
summarise(group_by(dfrPatient, Smokes), n())
## # A tibble: 4 <U+00D7> 2
## Smokes `n()`
## <chr> <int>
## 1 FALSE 72
## 2 No 6
## 3 TRUE 18
## 4 Yes 4
cat("\014")
summarise(group_by(dfrPatient, HealthGrade), n())
## # A tibble: 4 <U+00D7> 2
## HealthGrade `n()`
## <chr> <int>
## 1 BAD 34
## 2 GOOD 29
## 3 NORMAL 30
## 4 <NA> 7
cat("\014")
summarise(group_by(dfrPatient, State), n())
## # A tibble: 34 <U+00D7> 2
## State `n()`
## <chr> <int>
## 1 Alabama 2
## 2 Arizona 2
## 3 California 13
## 4 Colorado 1
## 5 Connecticut 1
## 6 Florida 8
## 7 Georgia 3
## 8 Georgia,xxx 1
## 9 Hawaii 2
## 10 Illinois 4
## # ... with 24 more rows
#inline comments
summarise(group_by(dfrPatient, Gender), n())
## # A tibble: 3 <U+00D7> 2
## Gender `n()`
## <chr> <int>
## 1 Female 54
## 2 Female 1
## 3 Male 45
# Normalise Gender: upper-case the text and strip surrounding whitespace so
# that variants differing only in case or padding fall into one group.
dfrPatient <- dfrPatient %>%
  mutate(Gender = trimws(toupper(Gender)))
dfrPatient %>%
  group_by(Gender) %>%
  summarise(n())
## # A tibble: 2 <U+00D7> 2
## Gender `n()`
## <chr> <int>
## 1 FEMALE 55
## 2 MALE 45
summarise(group_by(dfrPatient, Race), n())
## # A tibble: 6 <U+00D7> 2
## Race `n()`
## <chr> <int>
## 1 Asian 5
## 2 Bi-Racial 1
## 3 Black 8
## 4 Dog 1
## 5 Hispanic 17
## 6 White 68
# Normalise Race (upper-case, trimmed), then blank out the two values this
# analysis treats as invalid: "DOG" and "BI-RACIAL" both become NA.
dfrPatient <- dfrPatient %>%
  mutate(Race = trimws(toupper(Race)))
dfrPatient$Race[dfrPatient$Race %in% c("DOG", "BI-RACIAL")] <- NA
dfrPatient %>%
  group_by(Race) %>%
  summarise(n())
## # A tibble: 5 <U+00D7> 2
## Race `n()`
## <chr> <int>
## 1 ASIAN 5
## 2 BLACK 8
## 3 HISPANIC 17
## 4 WHITE 68
## 5 <NA> 2
summarise(group_by(dfrPatient, Died), n())
## # A tibble: 2 <U+00D7> 2
## Died `n()`
## <lgl> <int>
## 1 FALSE 46
## 2 TRUE 54
class(dfrPatient$Died)
## [1] "logical"
# Coerce Died to logical (leaves the values unchanged when the column is
# already logical) and confirm the resulting type.
dfrPatient <- dfrPatient %>%
  mutate(Died = as.logical(Died))
class(dfrPatient$Died)
## [1] "logical"
summarise(group_by(dfrPatient, Died), n())
## # A tibble: 2 <U+00D7> 2
## Died `n()`
## <lgl> <int>
## 1 FALSE 46
## 2 TRUE 54
summarise(group_by(dfrPatient, Pet), n())
## # A tibble: 10 <U+00D7> 2
## Pet `n()`
## <chr> <int>
## 1 Bird 9
## 2 CAT 5
## 3 Cat 24
## 4 DOG 4
## 5 Dog 28
## 6 Horse 1
## 7 NONE 1
## 8 NULL 3
## 9 None 23
## 10 <NA> 2
# Normalise Pet (upper-case, trimmed), then treat the placeholder values
# "NONE" and "NULL" as missing.
# NOTE(review): rows with Pet == NA are later dropped by complete.cases();
# confirm that "no pet" is really meant to count as missing data.
dfrPatient <- dfrPatient %>%
  mutate(Pet = trimws(toupper(Pet)))
dfrPatient$Pet[dfrPatient$Pet %in% c("NONE", "NULL")] <- NA
dfrPatient %>%
  group_by(Pet) %>%
  summarise(n())
## # A tibble: 5 <U+00D7> 2
## Pet `n()`
## <chr> <int>
## 1 BIRD 9
## 2 CAT 29
## 3 DOG 32
## 4 HORSE 1
## 5 <NA> 29
summarise(group_by(dfrPatient, Smokes), n())
## # A tibble: 4 <U+00D7> 2
## Smokes `n()`
## <chr> <int>
## 1 FALSE 72
## 2 No 6
## 3 TRUE 18
## 4 Yes 4
class(dfrPatient$Smokes)
## [1] "character"
# Convert Smokes to logical. The column mixes "TRUE"/"FALSE" with "Yes"/"No";
# as.logical() alone only understands the former and would turn every
# "Yes"/"No" answer into NA. Both spellings are mapped explicitly
# (case-insensitive, whitespace-trimmed); anything else becomes NA.
vchSmokes <- trimws(toupper(dfrPatient$Smokes))
dfrPatient$Smokes <- ifelse(
  vchSmokes %in% c("TRUE", "YES"), TRUE,
  ifelse(vchSmokes %in% c("FALSE", "NO"), FALSE, NA)
)
class(dfrPatient$Smokes)
## [1] "logical"
summarise(group_by(dfrPatient, Smokes), n())
## # A tibble: 2 <U+00D7> 2
## Smokes `n()`
## <lgl> <int>
## 1 FALSE 78
## 2 TRUE 22
summarise(group_by(dfrPatient, State), n())
## # A tibble: 34 <U+00D7> 2
## State `n()`
## <chr> <int>
## 1 Alabama 2
## 2 Arizona 2
## 3 California 13
## 4 Colorado 1
## 5 Connecticut 1
## 6 Florida 8
## 7 Georgia 3
## 8 Georgia,xxx 1
## 9 Hawaii 2
## 10 Illinois 4
## # ... with 24 more rows
# Repair the malformed state name in place. The assignment must target the
# existing State column: writing to `States` would create a new column that
# is NA for every other row, leave "Georgia,xxx" untouched, and make
# complete.cases() discard almost the whole data set.
# NOTE: printed output captured before this fix must be regenerated by re-knitting.
dfrPatient$State[dfrPatient$State %in% "Georgia,xxx"] <- "Georgia"
summarise(group_by(dfrPatient, State), n())
## # A tibble: 34 <U+00D7> 2
## State `n()`
## <chr> <int>
## 1 Alabama 2
## 2 Arizona 2
## 3 California 13
## 4 Colorado 1
## 5 Connecticut 1
## 6 Florida 8
## 7 Georgia 3
## 8 Georgia,xxx 1
## 9 Hawaii 2
## 10 Illinois 4
## # ... with 24 more rows
cat("\014")
# Flag the rows that have no missing value in any column.
vclComplete <- complete.cases(dfrPatient)
# Number of complete rows. R has no is.true(); summing the logical vector
# counts the TRUE entries directly.
sum(vclComplete)
# Keep only the complete rows for the analysis that follows.
dfrPatient <- dfrPatient[vclComplete, ]
nrow(dfrPatient)
## [1] 1
head(arrange(dfrPatient, desc(BMIValue)), 10)
## ID Name Race Gender Smokes HeightInCms HeightInmeters
## 1 AC/AH/001 Demetrius WHITE MALE FALSE 182.87 1.8287
## WeightInKgs BirthDate State Pet HealthGrade Died RecordDate
## 1 76.57 31/01/72 Georgia,xxx DOG NORMAL FALSE 25/11/15
## BMIValue BMILabel States
## 1 22.89674 NORMAL Georgia
head(arrange(dfrPatient, BMIValue), 10)
## ID Name Race Gender Smokes HeightInCms HeightInmeters
## 1 AC/AH/001 Demetrius WHITE MALE FALSE 182.87 1.8287
## WeightInKgs BirthDate State Pet HealthGrade Died RecordDate
## 1 76.57 31/01/72 Georgia,xxx DOG NORMAL FALSE 25/11/15
## BMIValue BMILabel States
## 1 22.89674 NORMAL Georgia
summarise(group_by(dfrPatient, Gender, Race), n())
## Source: local data frame [1 x 3]
## Groups: Gender [?]
##
## Gender Race `n()`
## <chr> <chr> <int>
## 1 MALE WHITE 1
summarise(group_by(dfrPatient, Race, Gender), min(BMIValue), mean(BMIValue), max(BMIValue))
## Source: local data frame [1 x 5]
## Groups: Race [?]
##
## Race Gender `min(BMIValue)` `mean(BMIValue)` `max(BMIValue)`
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 WHITE MALE 22.89674 22.89674 22.89674
filter(dfrPatient, Died==TRUE)
## [1] ID Name Race Gender
## [5] Smokes HeightInCms HeightInmeters WeightInKgs
## [9] BirthDate State Pet HealthGrade
## [13] Died RecordDate BMIValue BMILabel
## [17] States
## <0 rows> (or 0-length row.names)
nrow(filter(dfrPatient, Died==TRUE))
## [1] 0
filter(dfrPatient, Race=="HISPANIC" & Gender=="FEMALE")
## [1] ID Name Race Gender
## [5] Smokes HeightInCms HeightInmeters WeightInKgs
## [9] BirthDate State Pet HealthGrade
## [13] Died RecordDate BMIValue BMILabel
## [17] States
## <0 rows> (or 0-length row.names)
nrow(filter(dfrPatient, Race=="HISPANIC" & Gender=="FEMALE"))
## [1] 0
# Draw a reproducible random sample of up to 10 patients.
# The size is capped at the number of available rows because sample_n()
# raises an error when asked for more rows than the data frame contains.
set.seed(707)
sample_n(dfrPatient, min(10, nrow(dfrPatient)))
Note for R-BA students:
now start the same with your code
Note
Patient-data gives us information about the patients in a particular hospital ward
Objectives
The objectives of analysing the data, studying the dplyr package, working with rmarkdown and publishing an html document on rpubs were successfully met.