This assignment's data come from a personal activity monitoring device that collects data at 5-minute intervals throughout the day. The data cover two months (October and November 2012) from an anonymous individual and record the number of steps taken in each 5-minute interval of every day.
The data consist of the following variables: * steps: Number of steps taken in a 5-minute interval (missing values are coded as NA) * date: The date on which the measurement was taken, in YYYY-MM-DD format. * interval: Identifier for the 5-minute interval in which the measurement was taken.
The dataset is stored as a CSV file containing a total of 17,568 observations. Please refer to the dataset link to download it.
1: Set up the environment and the data-gathering steps before downloading and transforming the data.
2: Calculate the total number of steps taken per day. This requires filtering the data and shaping it into the desired form before the calculation; functions such as subsetting and lapply are essential for extracting from the data table.
3: Replace the missing values in the dataset, then export the result to a completed CSV file before reloading it as the updated dataset for comparison analysis.
4: Transform the date column into POSIXct dates and add a factor so that weekdays and weekends can be separated.
5: Draw the respective plots and examine the resulting distributions before drawing valid conclusions.
## Load necessary libraries
library(data.table)
library(ggplot2)
## Set URL and download file into designated directory.
## Each step is skipped when its output already exists on disk, so re-running
## the script does not download or extract the archive again.
PA1URL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
zip_file <- "JHU_05_PA01.zip"
if (!file.exists(zip_file)) {
  ## download.file() returns 0 on success; with an external method such as
  ## "curl" a failure is reported through this status code, so check it and
  ## remove any partial file instead of carrying on with a broken archive.
  status <- download.file(PA1URL, zip_file, method = "curl")
  if (status != 0) {
    unlink(zip_file)
    stop("Download of ", PA1URL, " failed with status ", status, call. = FALSE)
  }
}
if (!file.exists("PA01_Data/activity.csv")) {
  unzip(zip_file, exdir = "PA01_Data")
}
## Read csv file into data.table
activity <- fread(input = "PA01_Data/activity.csv")
## pre-check the data.table structure
str(activity)
## Classes 'data.table' and 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : IDate, format: "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
## - attr(*, ".internal.selfref")=<externalptr>
## Total steps for each date. sum() keeps its default na.rm = FALSE, so a date
## containing missing readings gets an NA total.
Total_S <- activity[, .(steps = sum(steps)), by = .(date)]
## Show the first 10 rows of the daily totals.
head(Total_S, n = 10)
## date steps
## 1: 2012-10-01 NA
## 2: 2012-10-02 126
## 3: 2012-10-03 11352
## 4: 2012-10-04 12116
## 5: 2012-10-05 13294
## 6: 2012-10-06 15420
## 7: 2012-10-07 11015
## 8: 2012-10-08 NA
## 9: 2012-10-09 12811
## 10: 2012-10-10 9900
## Open the PNG device that receives the histogram.
png("JHU_DS_05_PA1_1.png")
## Histogram of the daily step totals in bins of 1000 steps.
## Axis titles are passed to labs() as `x` and `y`; `xlab` is not an aesthetic
## name, so using it leaves the x axis with its default title.
ggplot(Total_S, aes(x = steps)) +
  geom_histogram(fill = "#BB0000", binwidth = 1000) +
  labs(title = "Daily Steps Distribution", x = "Steps", y = "Frequency")
## Warning: Removed 8 rows containing non-finite values (stat_bin).
## close png write in.
dev.off()
## png
## 2
## Mean and median of the daily totals, leaving out dates whose total is NA.
Total_S[!is.na(steps), .(Steps_Avg = mean(steps), Steps_Med = median(steps))]
## Steps_Avg Steps_Med
## 1: 10766.19 10765
## Print the structure of `activity` (column names, types and leading values)
## again before it is aggregated by interval.
str(activity)
## Classes 'data.table' and 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : IDate, format: "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
## - attr(*, ".internal.selfref")=<externalptr>
## Mean number of steps for each 5-minute interval, averaged over all dates
## with missing readings ignored (na.rm = TRUE).
Inter <- activity[, .(steps = mean(steps, na.rm = TRUE)), by = .(interval)]
## Open the PNG device that receives the line plot.
png("JHU_DS_05_PA1_2.png")
## Line plot of the interval averages.
## Axis titles are passed to labs() as `x` and `y`; `xlab` is not an aesthetic
## name, so using it leaves the x axis with its default title.
ggplot(Inter, aes(x = interval, y = steps)) +
  geom_line(color = "#BB0000", size = 1) +
  labs(title = "Average Daily Steps", x = "Interval", y = "Avg. Steps per Day")
## close png write in.
dev.off()
## png
## 2
## Show the first 10 rows of the daily totals table.
head(x = Total_S, n = 10)
## date steps
## 1: 2012-10-01 NA
## 2: 2012-10-02 126
## 3: 2012-10-03 11352
## 4: 2012-10-04 12116
## 5: 2012-10-05 13294
## 6: 2012-10-06 15420
## 7: 2012-10-07 11015
## 8: 2012-10-08 NA
## 9: 2012-10-09 12811
## 10: 2012-10-10 9900
## Interval(s) whose average step count equals the largest average.
peak_steps <- Inter[, max(steps)]
Inter[steps == peak_steps, .(max_inter = interval)]
## max_inter
## 1: 835
## Number of rows whose `steps` value is missing.
activity[is.na(steps), .N]
## [1] 2304
## Fill the missing values with the overall mean of the observed steps.
## `steps` is read as an integer column, so it is converted to double first:
## assigning the mean (about 37.38) into an integer column truncates it to 37
## and triggers a "precision lost" warning.
## The mean is computed up front because inside `activity[is.na(steps), ...]`
## only the rows with missing steps are visible.
## NOTE: summary figures recorded further down were produced with the truncated
## values and will change slightly when the script is re-run.
steps_mean <- activity[, mean(steps, na.rm = TRUE)]
activity[, steps := as.numeric(steps)]
activity[is.na(steps), steps := steps_mean]
## Save the filled-in data set to a separate CSV file.
fwrite(activity, file = "PA01_Data/fixed.csv", quote = FALSE)
## Daily step totals computed from the filled-in data.
Total_S2 <- activity[, .(steps = sum(steps)), by = .(date)]
## Mean and median of the daily totals based on the updated data.
Total_S2[, .(Steps_Avg = mean(steps), Steps_Med = median(steps))]
## Steps_Avg Steps_Med
## 1: 10751.74 10656
## Open the PNG device that receives the histogram.
png("JHU_DS_05_PA1_3.png")
## Histogram of the daily step totals after imputation, in bins of 1000 steps.
## Axis titles are passed to labs() as `x` and `y`; `xlab` is not an aesthetic
## name, so using it leaves the x axis with its default title.
ggplot(Total_S2, aes(x = steps)) +
  geom_histogram(fill = "#BB0000", binwidth = 1000) +
  labs(title = "Daily Steps Distribution", x = "Steps", y = "Frequency")
## close png write in.
dev.off()
## png
## 2
## Load a fresh copy of the original CSV so the next steps start from the data
## as stored on disk.
active2 <- fread("PA01_Data/activity.csv")
## Print the structure of the freshly loaded table.
str(object = active2)
## Classes 'data.table' and 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : IDate, format: "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
## - attr(*, ".internal.selfref")=<externalptr>
## Numeric day of the week for every row, taken while `date` is still the
## IDate column produced by fread. data.table's wday() returns 1 (Sunday) to
## 7 (Saturday) and does not depend on the language of the current locale.
day_index <- wday(active2$date)
## Reconstruct the data format for designated setup (in this case POSIXct date)
## and keep the weekday name of each date for display.
active2[, date := as.POSIXct(date, format = "%Y-%m-%d")]
active2[, `Day of Week` := weekdays(x = date)]
## Classify each row as weekday or weekend from the numeric day index.
## weekdays() returns names in the locale's language, so matching on those
## names only works on a machine that uses that same locale.
active2[, `weekday or weekend` := factor(
  ifelse(day_index %in% c(1L, 7L), "weekend", "weekday"),
  levels = c("weekday", "weekend")
)]
head(active2, 10)
## steps date interval Day of Week weekday or weekend
## 1: NA 2012-10-01 0 星期一 weekday
## 2: NA 2012-10-01 5 星期一 weekday
## 3: NA 2012-10-01 10 星期一 weekday
## 4: NA 2012-10-01 15 星期一 weekday
## 5: NA 2012-10-01 20 星期一 weekday
## 6: NA 2012-10-01 25 星期一 weekday
## 7: NA 2012-10-01 30 星期一 weekday
## 8: NA 2012-10-01 35 星期一 weekday
## 9: NA 2012-10-01 40 星期一 weekday
## 10: NA 2012-10-01 45 星期一 weekday
## Replace the missing steps with the overall mean of the observed steps.
## `steps` is read as an integer column, so it is converted to double first:
## assigning the mean (about 37.38) into an integer column truncates it to 37
## and triggers a "precision lost" warning.
## The mean is computed up front because inside `active2[is.na(steps), ...]`
## only the rows with missing steps are visible.
active2_steps_mean <- active2[, mean(steps, na.rm = TRUE)]
active2[, steps := as.numeric(steps)]
active2[is.na(steps), steps := active2_steps_mean]
## Mean steps for every interval, computed separately for each level of
## `weekday or weekend`.
Inter2 <- active2[
  ,
  .(steps = mean(steps, na.rm = TRUE)),
  by = .(interval, `weekday or weekend`)
]
## Open the PNG device that receives the faceted line plot.
png("JHU_DS_05_PA1_4.png")
## One panel per day type, stacked in a single column.
ggplot(Inter2, aes(x = interval, y = steps, color = `weekday or weekend`)) +
  geom_line() +
  labs(
    title = "Avg. Daily Steps by Week Type",
    x = "Interval",
    y = "No. of Steps"
  ) +
  facet_wrap(~ `weekday or weekend`, ncol = 1, nrow = 2)
## Close the PNG device so the file is written.
dev.off()
## png
## 2
## Trigger a garbage collection to release unused memory; called at top level,
## the memory-usage summary it returns is printed.
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1002286 53.6 2020770 108 1363982 72.9
## Vcells 1907666 14.6 8388608 64 3478639 26.6