## In-depth tutorial
## Fetch the zipped practice data set, unpack it into the "diet_data"
## directory, and show which files the archive contained.
dataset_url <- "http://s3.amazonaws.com/practice_assignment/diet_data.zip"
download.file(url = dataset_url, destfile = "diet_data.zip")
unzip(zipfile = "diet_data.zip", exdir = "diet_data")
list.files(path = "diet_data")
## [1] "Andy.csv" "David.csv" "John.csv" "Mike.csv" "Steve.csv"
## Load Andy's measurements and get a first impression of the table:
## leading rows, dimensions, column types, per-column summary, column names.
andy <- read.csv(file = file.path("diet_data", "Andy.csv"))
head(andy, n = 6L)
## Patient.Name Age Weight Day
## 1 Andy 30 140 1
## 2 Andy 30 140 2
## 3 Andy 30 140 3
## 4 Andy 30 139 4
## 5 Andy 30 138 5
## 6 Andy 30 138 6
c(nrow(andy), ncol(andy)) ## rows, then columns
## [1] 30 4
## The recorded output below shows Patient.Name as a Factor; from R 4.0 on,
## read.csv() keeps strings as character by default, so it would print as chr.
str(object = andy)
## 'data.frame': 30 obs. of 4 variables:
## $ Patient.Name: Factor w/ 1 level "Andy": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : int 30 30 30 30 30 30 30 30 30 30 ...
## $ Weight : int 140 140 140 139 138 138 138 138 138 138 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
summary(object = andy)
## Patient.Name Age Weight Day
## Andy:30 Min. :30 Min. :135.0 Min. : 1.00
## 1st Qu.:30 1st Qu.:137.0 1st Qu.: 8.25
## Median :30 Median :137.5 Median :15.50
## Mean :30 Mean :137.3 Mean :15.50
## 3rd Qu.:30 3rd Qu.:138.0 3rd Qu.:22.75
## Max. :30 Max. :140.0 Max. :30.00
colnames(andy)
## [1] "Patient.Name" "Age" "Weight" "Day"
## Several equivalent ways to pull one weight out of `andy`.
## By position: [row, column]. In the recorded output row 1 holds Day 1
## and row 30 holds the same weight that Day == 30 selects below.
andy[1, "Weight"] ## Andy's first weigh-in
## [1] 140
andy[30, "Weight"] ## Andy's last weigh-in
## [1] 135
## Alternatively, pick the row by the value of Day instead of by position.
## which() turns the logical comparison into the matching row indices.
andy[which(andy$Day == 1), "Weight"]
## [1] 140
## Same idea, with the Day column reached through [ , "Day"] instead of $.
andy[which(andy[,"Day"] == 30), "Weight"]
## [1] 135
## subset() on the data frame: rows where Day == 30, Weight column only.
## The result is still a data frame (note the header and row label below).
subset(andy, Day == 30, select = Weight)
## Weight
## 30 135
## subset() on the Weight vector itself returns a plain vector.
subset(andy$Weight, andy$Day==30)
## [1] 135
## Name the first and last weigh-ins and take their difference.
## drop = FALSE keeps each piece a one-column data frame, so the
## difference is a data frame as well.
andy_first <- andy[which(andy$Day == 1), "Weight", drop = FALSE]
andy_last <- andy[which(andy$Day == 30), "Weight", drop = FALSE]
andy_loss <- andy_first - andy_last
## Keep the bare file names in a vector so a file can be picked by index.
## NOTE: the variable shares its name with base::list(); calls such as
## list(...) still work because R skips non-function bindings when it
## looks up a function.
list <- list.files(path = "diet_data")
list ## the whole vector; single files are selected by subsetting it
## [1] "Andy.csv" "David.csv" "John.csv" "Mike.csv" "Steve.csv"
list[1L]
## [1] "Andy.csv"
list[2L]
## [1] "David.csv"
list[seq(3, 5)]
## [1] "John.csv" "Mike.csv" "Steve.csv"
## Look at John's file. full.names = TRUE prefixes each name with its
## directory, so the entries can be handed to read.csv() directly.
list_full <- list.files(path = "diet_data", full.names = TRUE)
list_full
## [1] "diet_data/Andy.csv" "diet_data/David.csv" "diet_data/John.csv"
## [4] "diet_data/Mike.csv" "diet_data/Steve.csv"
## John.csv is the third path in the listing above.
head(read.csv(file = list_full[3]), n = 6L)
## Patient.Name Age Weight Day
## 1 John 22 175 1
## 2 John 22 175 2
## 3 John 22 175 3
## 4 John 22 175 4
## 5 John 22 175 5
## 6 John 22 175 6
## Stack Andy's rows on top of David's (the second path in list_full)
## to get one data frame holding both patients.
andy_david <- rbind(andy, read.csv(file = list_full[2]))
head(andy_david, n = 6L)
## Patient.Name Age Weight Day
## 1 Andy 30 140 1
## 2 Andy 30 140 2
## 3 Andy 30 140 3
## 4 Andy 30 139 4
## 5 Andy 30 138 5
## 6 Andy 30 138 6
tail(andy_david, n = 6L)
## Patient.Name Age Weight Day
## 55 David 35 203 25
## 56 David 35 203 26
## 57 David 35 202 27
## 58 David 35 202 28
## 59 David 35 202 29
## 60 David 35 201 30
## Keep only the rows recorded on day 25, all columns.
day_25 <- subset(andy_david, Day == 25)
## Appending every file by hand does not scale; a loop can do the repetition.
## Here `i` takes the values 1, 2, ..., 5 in turn, is printed on each pass,
## and the loop exits after the fifth pass.
for (i in seq_len(5)) {
  print(i)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## Read every file in list_full and stack the results into one data frame.
##
## The previous version assigned rbind()'s result to `data` while always
## binding onto the still-empty `dat`, so every pass overwrote the one before
## and only the last file (Steve) survived. Re-creating the accumulator inside
## a loop has the same effect. Reading all files first and binding once avoids
## both mistakes and does not grow a data frame pass by pass.
stopifnot(length(list_full) > 0) ## nothing to combine without input files
dat <- do.call(rbind, lapply(list_full, read.csv))
data <- dat ## alias kept so code that refers to `data` keeps working
str(dat)
## Expect one row per weigh-in across all files, with Patient.Name covering
## every patient (150 obs. if each file has 30 rows like Andy's -- TODO
## confirm; output not re-recorded after the fix).
head(dat)
## The first rows are Andy's, because Andy.csv is the first path in list_full:
## Patient.Name Age Weight Day
## 1 Andy 30 140 1
## 2 Andy 30 140 2
## 3 Andy 30 140 3
## 4 Andy 30 139 4
## 5 Andy 30 138 5
## 6 Andy 30 138 6
## Median weight over all patients; na.rm = TRUE so a missing weigh-in in any
## file does not turn the result into NA (output not re-recorded).
median(dat$Weight, na.rm = TRUE)