R Week 2 Assignment:

Here is a list of data sets: http://vincentarelbundock.github.io/Rdatasets/ (click on the csv index for a list)

1. Use the summary function to gain an overview of the data set. Then display the mean and median for at least 2 attributes.

Data is for a sample of 200 patients at an Intensive Care Unit.

# Read the ICU csv file straight from the Rdatasets site.
# read.csv() is read.table() preconfigured for CSV input: only the double
# quote acts as a quote character and "#" is not treated as a comment marker,
# so an apostrophe or "#" inside a field cannot corrupt the parse.
theURL <- "https://vincentarelbundock.github.io/Rdatasets/csv/Stat2Data/ICU.csv"
ICU_patients_DF <- read.csv(file = theURL, header = TRUE)

# First look at the freshly loaded data frame. Each call below is followed by
# its captured console output (the lines starting with "##").

# Number of rows and columns.
dim(ICU_patients_DF)
## [1] 200  10
# First six rows.
head(ICU_patients_DF)
##   X ID Survive Age AgeGroup Sex Infection SysBP Pulse Emergency
## 1 1  4       0  87        3   1         1    80    96         1
## 2 2  8       1  27        1   1         1   142    88         1
## 3 3 12       1  59        2   0         0   112    80         1
## 4 4 14       1  77        3   0         0   100    70         0
## 5 5 27       0  76        3   1         1   128    90         1
## 6 6 28       1  54        2   0         1   142   103         1
# Last six rows.
tail(ICU_patients_DF)
##       X  ID Survive Age AgeGroup Sex Infection SysBP Pulse Emergency
## 195 195 915       1  67        2   0         0   152    78         0
## 196 196 921       0  50        2   1         0   256    64         1
## 197 197 923       1  20        1   0         0   104    83         1
## 198 198 924       1  73        3   1         0   162   100         1
## 199 199 925       1  59        2   0         0   100    88         1
## 200 200 929       1  42        1   0         0   122    84         1
# Column names.
names (ICU_patients_DF)
##  [1] "X"         "ID"        "Survive"   "Age"       "AgeGroup"  "Sex"      
##  [7] "Infection" "SysBP"     "Pulse"     "Emergency"
# Disabled type checks on AgeGroup, kept for reference.
#is.character(ICU_patients_DF$AgeGroup)
#is.numeric(ICU_patients_DF$AgeGroup)

# Per-column types and a preview of the values; every column is read as int.
str(ICU_patients_DF)   # examine data.frame structure
## 'data.frame':    200 obs. of  10 variables:
##  $ X        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ID       : int  4 8 12 14 27 28 32 38 40 41 ...
##  $ Survive  : int  0 1 1 1 0 1 1 1 1 1 ...
##  $ Age      : int  87 27 59 77 76 54 87 69 63 30 ...
##  $ AgeGroup : int  3 1 2 3 3 2 3 2 2 1 ...
##  $ Sex      : int  1 1 0 0 1 0 1 0 0 1 ...
##  $ Infection: int  1 1 0 0 1 1 1 1 0 0 ...
##  $ SysBP    : int  80 142 112 100 128 142 110 110 104 144 ...
##  $ Pulse    : int  96 88 80 70 90 103 154 132 66 110 ...
##  $ Emergency: int  1 1 1 0 1 1 1 1 0 1 ...
# Confirm the object is a plain data.frame.
class(ICU_patients_DF)
## [1] "data.frame"
# load packages
library(plyr)  # load this before dplyr
library(dplyr) # load plyr first then this pkg
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)
# library(tidyverse) # includes stringr ?incl in other pkgs??
library(stringr)  # for string manipulation

# Report how many patients are in the data set. nrow() is used rather than
# as.character(count(...)): count() exists in both plyr and dplyr with
# different meanings, so the old form only worked while dplyr's version
# happened to mask plyr's.
cat("Number of ICU patients: ", nrow(ICU_patients_DF))
## Number of ICU patients:  200
# Use the summary function to gain an overview of the data set.
# The pipeline below reports the mean systolic blood pressure per age group.
# NOTE(review): the assignment asks for summary() itself; consider adding
# summary(ICU_patients_DF) here and re-knitting to capture its output.

ICU_patients_DF %>%
  group_by(AgeGroup) %>%
  summarize(AvgBloodPress = mean(SysBP))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
##   AgeGroup AvgBloodPress
##      <int>         <dbl>
## 1        1          130.
## 2        2          133.
## 3        3          134.
# Then display the mean and median for at least 2 attributes
# NOTE(review): only SysBP is reported here; a second attribute (e.g. Age)
# is still needed to satisfy "at least 2 attributes".

cat("Mean Systolic BP of ICU patients: ", mean(ICU_patients_DF$SysBP))
## Mean Systolic BP of ICU patients:  132.28
cat("Median Systolic BP of ICU patients: ", median(ICU_patients_DF$SysBP))
## Median Systolic BP of ICU patients:  130

2. Create a new data frame with a subset of the columns and rows. Make sure to rename it.

Select a list of ICU patients with a [High] Systolic Blood Pressure > 120 mmHg

# create a subset of patients with high systolic blood pressure:
# keep the rows with SysBP > 120 and only the columns Age through Pulse
HBP_patients_DF <- subset(ICU_patients_DF, SysBP > 120, select = Age:Pulse)

#HBP_patients_DF
head(HBP_patients_DF)
##    Age AgeGroup Sex Infection SysBP Pulse
## 2   27        1   1         1   142    88
## 5   76        3   1         1   128    90
## 6   54        2   0         1   142   103
## 10  30        1   1         0   144   110
## 12  78        3   0         1   130   132
## 13  70        3   1         0   138   103
tail(HBP_patients_DF)
##     Age AgeGroup Sex Infection SysBP Pulse
## 192  46        1   0         1   142    89
## 194  71        3   0         1   124   124
## 195  67        2   0         0   152    78
## 196  50        2   1         0   256    64
## 198  73        3   1         0   162   100
## 200  42        1   0         0   122    84
# Row count of the subset. nrow() avoids coercing the data frame returned by
# count() to text and does not depend on which package's count() is unmasked.
cat("Number of ICU patients with High Systolic Blood Pressure > 120 mmHg: ",
    nrow(HBP_patients_DF))
## Number of ICU patients with High Systolic Blood Pressure > 120 mmHg:  127

4. Use the summary function to create an overview of your new data frame. Then print the mean and median for the same two attributes. Please compare.

Notes:

# Size of the high-BP subset, absolute and as a share of all ICU patients.
# nrow() replaces as.character(count(...)), which relied on dplyr's count()
# masking plyr's and on data-frame arithmetic/coercion.
cat("Number of patients with High BP: ", nrow(HBP_patients_DF))
## Number of patients with High BP:  127
cat("Percent of ICU patients with high Systolic Blood Pressure: ",
    as.character(nrow(HBP_patients_DF) / nrow(ICU_patients_DF) * 100))
## Percent of ICU patients with high Systolic Blood Pressure:  63.5
# Mean and median of the same attribute (SysBP), plus Age, for the subset.
cat("Mean Systolic Blood Pressure of patients with high BP: ",
    mean(HBP_patients_DF$SysBP))
## Mean Systolic Blood Pressure of patients with high BP:  150.6063
cat("Median Systolic Blood Pressure of patients with high BP: ",
    median(HBP_patients_DF$SysBP))
## Median Systolic Blood Pressure of patients with high BP:  142
cat("Mean Age of patients with high Systolic Blood Pressure: ",
    mean(HBP_patients_DF$Age))
## Mean Age of patients with high Systolic Blood Pressure:  58.55118
cat("Median Age of patients with high Systolic Blood Pressure: ",
    median(HBP_patients_DF$Age))
## Median Age of patients with high Systolic Blood Pressure:  64
#compare
# The subset is compared against ALL ICU patients (ICU_patients_DF), so the
# "lower" message names that reference group; an explicit "same" branch
# guarantees a message is always printed.
if (mean(HBP_patients_DF$SysBP) > mean(ICU_patients_DF$SysBP)) {
  print("There is a higher mean Systolic BP for high BP patients")
} else if (mean(HBP_patients_DF$SysBP) < mean(ICU_patients_DF$SysBP)) {
  print(
    "There is a lower mean Systolic BP for high BP patients compared to all ICU patients"
  )
} else {
  print(
    "The mean Systolic BP of high BP patients equals that of all ICU patients"
  )
}
## [1] "There is a higher mean Systolic BP for high BP patients"
if (median(HBP_patients_DF$SysBP) > median(ICU_patients_DF$SysBP)) {
  print("There is a higher median Systolic BP for high BP patients")
} else if (median(HBP_patients_DF$SysBP) < median(ICU_patients_DF$SysBP)) {
  print(
    "There is a lower median Systolic BP for high BP patients compared to all ICU patients"
  )
} else {
  print(
    "The median Systolic BP of high BP patients equals that of all ICU patients"
  )
}
## [1] "There is a higher median Systolic BP for high BP patients"

5. For at least 3 values in a column, please rename them so that every value in the column is renamed.

Notes:

# Display labels used when recoding AgeGroup: one per code 1, 2 and 3, plus a
# fallback for any other value.
age_young   <- "Young (under 50)"
age_middle  <- "Middle (50-69)"
age_old     <- "Old (70+)"
age_unknown <- "Age Group Unknown"

# Inspect the AgeGroup column of the high-BP subset before recoding it.
# Raw codes as stored (integers 1, 2 and 3):
HBP_patients_DF$AgeGroup
##   [1] 1 3 2 1 3 3 2 1 2 2 2 2 2 2 3 2 1 1 3 2 2 2 3 2 2 3 3 3 3 2 3 2 2 3 2 1 1
##  [38] 2 1 2 1 2 2 2 3 3 2 2 1 3 3 3 2 3 3 2 1 1 2 1 1 3 1 2 2 3 3 2 2 2 2 3 2 1
##  [75] 1 3 3 3 3 1 1 3 1 1 3 3 3 3 3 2 1 2 1 3 3 1 3 1 3 1 3 2 2 3 2 2 1 3 1 2 1
## [112] 3 1 1 1 1 1 2 2 3 3 1 3 2 2 3 1
# Type checks: the column is numeric (integer) inside a plain data.frame.
is.numeric(HBP_patients_DF$AgeGroup)
## [1] TRUE
class(HBP_patients_DF)
## [1] "data.frame"
class(HBP_patients_DF$AgeGroup)
## [1] "integer"
# One code per row of the subset.
length(HBP_patients_DF$AgeGroup)
## [1] 127
#print (HBP_patients_DF$AgeGroup)

# Copy of the coded column taken before it is overwritten.
vec <- HBP_patients_DF$AgeGroup
#vec
#class(vec)
#print ("start")

# Recode age-group codes into descriptive labels.
#
# Arguments:
#   vec     - vector of age-group codes (e.g. the integers 1, 2, 3). Codes are
#             compared by their character form, so 1L, 1 and "1" all match.
#   labels  - named character vector mapping a code (the name) to its label.
#             Defaults to the script-level age_young / age_middle / age_old.
#   unknown - label for any code not found in `labels`, including NA.
#             Defaults to the script-level age_unknown.
#
# Returns a character vector the same length as `vec` (character(0) for empty
# input), carrying over any names `vec` has.
#
# The lookup is vectorised with match(), so empty input and NA codes are
# handled instead of failing inside an element-by-element if() chain.
ChgAgeGroup <- function(vec,
                        labels = c("1" = age_young,
                                   "2" = age_middle,
                                   "3" = age_old),
                        unknown = age_unknown) {
  # Position of each code within the label table; NA where there is no match.
  idx <- match(as.character(vec), names(labels))
  newvec <- unname(labels[idx])
  newvec[is.na(idx)] <- unknown
  names(newvec) <- names(vec)
  newvec
}

# Overwrite the coded AgeGroup column with the descriptive labels produced
# from the saved codes in `vec`, then spot-check both ends of the data frame.
HBP_patients_DF$AgeGroup <- ChgAgeGroup(vec)

head(HBP_patients_DF)
##    Age         AgeGroup Sex Infection SysBP Pulse
## 2   27 Young (under 50)   1         1   142    88
## 5   76        Old (70+)   1         1   128    90
## 6   54   Middle (50-69)   0         1   142   103
## 10  30 Young (under 50)   1         0   144   110
## 12  78        Old (70+)   0         1   130   132
## 13  70        Old (70+)   1         0   138   103
tail(HBP_patients_DF)
##     Age         AgeGroup Sex Infection SysBP Pulse
## 192  46 Young (under 50)   0         1   142    89
## 194  71        Old (70+)   0         1   124   124
## 195  67   Middle (50-69)   0         0   152    78
## 196  50   Middle (50-69)   1         0   256    64
## 198  73        Old (70+)   1         0   162   100
## 200  42 Young (under 50)   0         0   122    84

3. Create new column names for the new data frame.

# Replace the column headings of the subset; the new names are assigned by
# position, in the existing column order.
names(HBP_patients_DF) <- c(
  "Pt Age", "AgeGroup", "Gender", "Infection Pt", "Systolic BP", "HR"
)
head(HBP_patients_DF)
##    Pt Age         AgeGroup Gender Infection Pt Systolic BP  HR
## 2      27 Young (under 50)      1            1         142  88
## 5      76        Old (70+)      1            1         128  90
## 6      54   Middle (50-69)      0            1         142 103
## 10     30 Young (under 50)      1            0         144 110
## 12     78        Old (70+)      0            1         130 132
## 13     70        Old (70+)      1            0         138 103

Looking forward to your feedback! Thank you, Rick