Tutorial

In-depth tutorial

## Fetch the practice data set and unpack it into ./diet_data.
## The download is skipped when the archive is already on disk, so re-running
## the script does not hit the network again.
dataset_url <- "http://s3.amazonaws.com/practice_assignment/diet_data.zip"
if (!file.exists("diet_data.zip")) {
  ## mode = "wb": the archive is binary and must not be written in text mode
  download.file(dataset_url, "diet_data.zip", mode = "wb")
}
## Extract (overwriting any earlier extraction) and list what was unpacked
unzip("diet_data.zip", exdir = "diet_data")
list.files("diet_data")

## [1] "Andy.csv"  "David.csv" "John.csv"  "Mike.csv"  "Steve.csv"

## Read Andy's file into a data frame and preview its first rows.
## NOTE(review): the recorded str() output below shows Patient.Name as a
## Factor; since R 4.0 read.csv() keeps strings as character by default, so a
## fresh run may print chr instead.
andy <- read.csv("diet_data/Andy.csv")
head(andy)

##   Patient.Name Age Weight Day
## 1         Andy  30    140   1
## 2         Andy  30    140   2
## 3         Andy  30    140   3
## 4         Andy  30    139   4
## 5         Andy  30    138   5
## 6         Andy  30    138   6

## Number of rows and columns
dim(andy)

## [1] 30  4

## Compact overview: the type and first few values of every column
str(andy)

## 'data.frame':    30 obs. of  4 variables:
##  $ Patient.Name: Factor w/ 1 level "Andy": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age         : int  30 30 30 30 30 30 30 30 30 30 ...
##  $ Weight      : int  140 140 140 139 138 138 138 138 138 138 ...
##  $ Day         : int  1 2 3 4 5 6 7 8 9 10 ...

## Per-column summary statistics (counts for factors, quantiles for numbers)
summary(andy)

##  Patient.Name      Age         Weight           Day       
##  Andy:30      Min.   :30   Min.   :135.0   Min.   : 1.00  
##               1st Qu.:30   1st Qu.:137.0   1st Qu.: 8.25  
##               Median :30   Median :137.5   Median :15.50  
##               Mean   :30   Mean   :137.3   Mean   :15.50  
##               3rd Qu.:30   3rd Qu.:138.0   3rd Qu.:22.75  
##               Max.   :30   Max.   :140.0   Max.   :30.00

## Column names of the data frame
names(andy)

## [1] "Patient.Name" "Age"          "Weight"       "Day"

## Index by row number and column name. This works here because the recorded
## output shows row 1 holding Day 1 and row 30 holding Day 30.
andy[1, "Weight"]   ## Andy's first weigh-in

## [1] 140

andy[30, "Weight"]  ## Andy's last weigh-in

## [1] 135

## Alternatively, look the row up by the value of Day instead of its position
andy[which(andy$Day == 1), "Weight"]

## [1] 140

## Same lookup, selecting the Day column with [, "Day"] rather than $
andy[which(andy[,"Day"] == 30), "Weight"]

## [1] 135

## subset() on a data frame with select = returns a data frame, not a vector
subset(andy, Day == 30, select = Weight)

##    Weight
## 30    135

## subset() on a vector keeps the elements for which the condition is TRUE
subset(andy$Weight, andy$Day==30)

## [1] 135

## Store the first and last weigh-ins and their difference under names.
## Each object is a data frame with the single column Weight (not a bare
## number), so the subtraction is done column-wise on data frames.
andy_first <- andy[!is.na(andy$Day) & andy$Day == 1, "Weight", drop = FALSE]
andy_last <- andy[!is.na(andy$Day) & andy$Day == 30, "Weight", drop = FALSE]
andy_loss <- andy_first - andy_last
## Keep the file names found in the data directory in a character vector.
## NOTE(review): the name `list` masks base::list() as a variable. Calls such
## as list(...) still work because R skips non-function objects when it looks
## up a function, but a name like `files` would be clearer.
list <- list.files("diet_data")
list  ## Now we can call a specific file by subsetting it

## [1] "Andy.csv"  "David.csv" "John.csv"  "Mike.csv"  "Steve.csv"

## A single index picks one file name
list[1]

## [1] "Andy.csv"

list[2]

## [1] "David.csv"

## A range of indices picks several file names at once
list[3:5]

## [1] "John.csv"  "Mike.csv"  "Steve.csv"

## Let's take a look at John.
## Prefix every file name with its directory so the files can be read
## without changing the working directory.
list_full <- file.path("diet_data", list.files("diet_data"))
list_full

## [1] "diet_data/Andy.csv"  "diet_data/David.csv" "diet_data/John.csv"
## [4] "diet_data/Mike.csv"  "diet_data/Steve.csv"

## John's file is the third path; read it and show its first rows
head(read.csv(list_full[3]))

##   Patient.Name Age Weight Day
## 1         John  22    175   1
## 2         John  22    175   2
## 3         John  22    175   3
## 4         John  22    175   4
## 5         John  22    175   5
## 6         John  22    175   6

## Now let's create a dataframe for Andy and David
## rbind() stacks the rows of the second file (David.csv) underneath Andy's
andy_david <- rbind(andy, read.csv(list_full[2]))
## The first rows still come from Andy ...
head(andy_david)

##   Patient.Name Age Weight Day
## 1         Andy  30    140   1
## 2         Andy  30    140   2
## 3         Andy  30    140   3
## 4         Andy  30    139   4
## 5         Andy  30    138   5
## 6         Andy  30    138   6

## ... and the last rows come from David
tail(andy_david)

##    Patient.Name Age Weight Day
## 55        David  35    203  25
## 56        David  35    203  26
## 57        David  35    202  27
## 58        David  35    202  28
## 59        David  35    202  29
## 60        David  35    201  30

## Keep only the rows recorded on day 25, with all columns
day_25 <- subset(andy_david, Day == 25)
## We can append everyone manually, but a for loop handles such a repetitive
## job better. This loop gives i the values 1, 2, ..., 5, one per iteration,
## prints each of them, then exits.
for (i in seq_len(5)) {
  print(i)
}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5

## Combine every patient's file into one data frame.
## The accumulator has to be created before the loop, and each iteration has
## to assign back to that same object. Previously the empty frame was named
## `dat` while the loop assigned to `data`, so every pass started again from
## the empty frame and only the last file (Steve) survived.
data <- data.frame()  ## First we create an empty data frame
for (i in seq_along(list_full)) {
  ## rbind() returns a new data frame; keep it so the rows carry forward
  data <- rbind(data, read.csv(list_full[i]))
}
## If the data frame were created inside the loop, it would be reset on every
## iteration and we'd end up with only the last file.
str(data)

## NOTE(review): output not re-recorded after the fix. The earlier transcript
## showed 30 obs. of a single patient ("Steve"); the combined frame should now
## contain the rows of every file in list_full.

head(data)

## NOTE(review): expected to show the first rows of the first file (Andy.csv)
## instead of Steve's; re-run to confirm.

median(data$Weight)

## NOTE(review): the previously recorded 220.5 was the median of Steve's rows
## only. median() returns NA when any Weight is missing; if that happens, use
## median(data$Weight, na.rm = TRUE).