library(tidyverse)

# Read the heart data set (CSV) from the course GitHub repository.
heart <- read_csv("https://raw.githubusercontent.com/javernw/DATA607GroupProjects/master/heart.csv")
# as_tibble() replaces the deprecated as.tibble() spelling.
heart <- as_tibble(heart)
head(heart)## # A tibble: 6 x 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 63 1 3 145 233 1 0 150 0 2.3
## 2 37 1 2 130 250 0 1 187 0 3.5
## 3 41 0 1 130 204 0 0 172 0 1.4
## 4 56 1 1 120 236 0 1 178 0 0.8
## 5 57 0 0 120 354 0 1 163 1 0.6
## 6 57 1 0 140 192 0 1 148 0 0.4
## # … with 4 more variables: slope <dbl>, ca <dbl>, thal <dbl>, target <dbl>
The selected package I want to use is dplyr.
slice capability tutorial
Using slice, we can select rows by specifying their row numbers.
slice(.data, …)
To select rows 6 to 12
slice(heart, 6:12)## # A tibble: 7 x 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 57 1 0 140 192 0 1 148 0 0.4
## 2 56 0 1 140 294 0 0 153 0 1.3
## 3 44 1 1 120 263 0 1 173 0 0
## 4 52 1 2 172 199 1 1 162 0 0.5
## 5 57 1 2 150 168 0 1 174 0 1.6
## 6 54 1 0 140 239 0 1 160 0 1.2
## 7 48 0 2 130 275 0 1 139 0 0.2
## # … with 4 more variables: slope <dbl>, ca <dbl>, thal <dbl>, target <dbl>
OR
To select rows 10 to 15, 18 and 299 to 302
heart %>% slice(c(10:15, 18, 299:302)) #passed as a vector## # A tibble: 11 x 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 57 1 2 150 168 0 1 174 0 1.6
## 2 54 1 0 140 239 0 1 160 0 1.2
## 3 48 0 2 130 275 0 1 139 0 0.2
## 4 49 1 1 130 266 0 1 171 0 0.6
## 5 64 1 3 110 211 0 0 144 1 1.8
## 6 58 0 3 150 283 1 0 162 0 1
## 7 66 0 3 150 226 0 1 114 0 2.6
## 8 57 0 0 140 241 0 1 123 1 0.2
## 9 45 1 3 110 264 0 1 132 0 1.2
## 10 68 1 0 144 193 1 1 141 0 3.4
## 11 57 1 0 130 131 0 1 115 1 1.2
## # … with 4 more variables: slope <dbl>, ca <dbl>, thal <dbl>, target <dbl>
mutate capability tutorial
Creates new columns based on existing ones.
mutate(.data, …)
Let’s look at a ratio of resting blood pressure to cholesterol under a new column name “Ratio”
#heart %>% mutate(Ratio = trestbps/chol)
heart %>% select(trestbps, chol) %>% mutate(Ratio = trestbps/chol)## # A tibble: 303 x 3
## trestbps chol Ratio
## <dbl> <dbl> <dbl>
## 1 145 233 0.622
## 2 130 250 0.52
## 3 130 204 0.637
## 4 120 236 0.508
## 5 120 354 0.339
## 6 140 192 0.729
## 7 140 294 0.476
## 8 120 263 0.456
## 9 172 199 0.864
## 10 150 168 0.893
## # … with 293 more rows
mutate_if() capability tutorial
Edits the columns selected by a predicate function.
mutate_if(.data, .predicate, .funs, …)
Let’s add 10 to every variable that is a double.
# Shift every element of `x` up by 10.
# `na.rm` is accepted so callers may pass it, but it is not used.
add.10 <- function(x, na.rm = FALSE) {
  x + 10
}
heart %>% mutate_if(is.double, add.10, na.rm = TRUE)## # A tibble: 303 x 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 73 11 13 155 243 11 10 160 10 12.3
## 2 47 11 12 140 260 10 11 197 10 13.5
## 3 51 10 11 140 214 10 10 182 10 11.4
## 4 66 11 11 130 246 10 11 188 10 10.8
## 5 67 10 10 130 364 10 11 173 11 10.6
## 6 67 11 10 150 202 10 11 158 10 10.4
## 7 66 10 11 150 304 10 10 163 10 11.3
## 8 54 11 11 130 273 10 11 183 10 10
## 9 62 11 12 182 209 11 11 172 10 10.5
## 10 67 11 12 160 178 10 11 184 10 11.6
## # … with 293 more rows, and 4 more variables: slope <dbl>, ca <dbl>,
## # thal <dbl>, target <dbl>
mutate_at() capability tutorial
Edits specific columns selected with a character vector or vars().
mutate_at(.data, .vars, .funs, …, .cols = NULL)
Let’s add 10 to the age and sex variables.
# Helper for the mutate_at() example: returns `x + 10`.
# The `na.rm` argument is part of the signature but is ignored.
add.10 <- function(x, na.rm = FALSE) {
  shifted <- x + 10
  shifted
}
heart %>% mutate_at(c("age","sex"), add.10, na.rm = TRUE)## # A tibble: 303 x 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 73 11 3 145 233 1 0 150 0 2.3
## 2 47 11 2 130 250 0 1 187 0 3.5
## 3 51 10 1 130 204 0 0 172 0 1.4
## 4 66 11 1 120 236 0 1 178 0 0.8
## 5 67 10 0 120 354 0 1 163 1 0.6
## 6 67 11 0 140 192 0 1 148 0 0.4
## 7 66 10 1 140 294 0 0 153 0 1.3
## 8 54 11 1 120 263 0 1 173 0 0
## 9 62 11 2 172 199 1 1 162 0 0.5
## 10 67 11 2 150 168 0 1 174 0 1.6
## # … with 293 more rows, and 4 more variables: slope <dbl>, ca <dbl>,
## # thal <dbl>, target <dbl>
summarise capability tutorial
Applies functions that return results of length 1. Can perform multiple calculations in the same call.
summarise(data, …)
Let’s see the mean and median cholesterol along with the mean and median maximum heart rate achieved.
heart %>% summarise(Avg_cholesteral = mean(chol), Median_cholesteral = median(chol), Avg_heartrate = mean(thalach), Median_heartrate = median(thalach))## # A tibble: 1 x 4
## Avg_cholesteral Median_cholesteral Avg_heartrate Median_heartrate
## <dbl> <dbl> <dbl> <dbl>
## 1 246. 240 150. 153
do capability tutorial (do anything)
Performs arbitrary computations on the data.
do(.data, …)
We can create a function that sorts the data by age in descending order and returns the first 3 rows, then apply it to each age group.
# Number of rows to keep.
x <- 3

# Return the first `x` rows of `t` after ordering by `age`, largest first.
top <- function(t, x) {
  by_age_desc <- arrange(t, desc(age))
  head(by_age_desc, x)
}
heart %>% group_by(age) %>% do(top(., x))## # A tibble: 113 x 14
## # Groups: age [41]
## age sex cp trestbps chol fbs restecg thalach exang oldpeak
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 29 1 1 130 204 0 0 202 0 0
## 2 34 1 3 118 182 0 0 174 0 0
## 3 34 0 1 118 210 0 1 192 0 0.7
## 4 35 0 0 138 183 0 1 182 0 1.4
## 5 35 1 1 122 192 0 1 174 0 0
## 6 35 1 0 120 198 0 1 130 1 1.6
## 7 37 1 2 130 250 0 1 187 0 3.5
## 8 37 0 2 120 215 0 1 170 0 0
## 9 38 1 2 138 175 0 1 173 0 0
## 10 38 1 2 138 175 0 1 173 0 0
## # … with 103 more rows, and 4 more variables: slope <dbl>, ca <dbl>,
## # thal <dbl>, target <dbl>