Suleman Wadur
Loading data file
## Turns off exponential notation of numeric values such as when using the mean function
## (a large scipen value makes R prefer fixed notation when printing numbers,
## which keeps the figures quoted in the report text readable)
options(scipen = 999)
## Load needed libraries
## NOTE(review): no dplyr function appears to be called in the visible part of
## this report - confirm whether this dependency is still required
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Directory that holds the input data. It defaults to the current working
## directory (for a knitted report this is normally the folder containing the
## document), so the analysis is not tied to one machine's absolute path and
## the session's working directory is never changed. Point `workdir` elsewhere
## if the data lives in another folder.
workdir <- "."
zip_path <- file.path(workdir, "activity.zip")
csv_path <- file.path(workdir, "activity.csv")
## Extract the archive only when the CSV has not been unpacked yet
if (file.exists(zip_path) && !file.exists(csv_path)) {
  unzip(zipfile = zip_path, exdir = workdir)
}
## Stop early if the CSV is still unavailable after the extraction attempt
if (!file.exists(csv_path)) {
  stop("One of the input files is missing!")
}
## Read the activity data set into a data frame
activitydata <- read.csv(csv_path)
# Keep only the observations that have a recorded step count
data0 <- activitydata[!is.na(activitydata$steps), ]
# Sum the steps within each date, then label the two resulting columns
TotalStepsByDay <- aggregate(data0$steps, by = list(data0$date), FUN = sum)
names(TotalStepsByDay) <- c("Day", "Total.Steps")
# Display the daily totals
TotalStepsByDay
## Day Total.Steps
## 1 2012-10-02 126
## 2 2012-10-03 11352
## 3 2012-10-04 12116
## 4 2012-10-05 13294
## 5 2012-10-06 15420
## 6 2012-10-07 11015
## 7 2012-10-09 12811
## 8 2012-10-10 9900
## 9 2012-10-11 10304
## 10 2012-10-12 17382
## 11 2012-10-13 12426
## 12 2012-10-14 15098
## 13 2012-10-15 10139
## 14 2012-10-16 15084
## 15 2012-10-17 13452
## 16 2012-10-18 10056
## 17 2012-10-19 11829
## 18 2012-10-20 10395
## 19 2012-10-21 8821
## 20 2012-10-22 13460
## 21 2012-10-23 8918
## 22 2012-10-24 8355
## 23 2012-10-25 2492
## 24 2012-10-26 6778
## 25 2012-10-27 10119
## 26 2012-10-28 11458
## 27 2012-10-29 5018
## 28 2012-10-30 9819
## 29 2012-10-31 15414
## 30 2012-11-02 10600
## 31 2012-11-03 10571
## 32 2012-11-05 10439
## 33 2012-11-06 8334
## 34 2012-11-07 12883
## 35 2012-11-08 3219
## 36 2012-11-11 12608
## 37 2012-11-12 10765
## 38 2012-11-13 7336
## 39 2012-11-15 41
## 40 2012-11-16 5441
## 41 2012-11-17 14339
## 42 2012-11-18 15110
## 43 2012-11-19 8841
## 44 2012-11-20 4472
## 45 2012-11-21 12787
## 46 2012-11-22 20427
## 47 2012-11-23 21194
## 48 2012-11-24 14478
## 49 2012-11-25 11834
## 50 2012-11-26 11162
## 51 2012-11-27 13646
## 52 2012-11-28 10183
## 53 2012-11-29 7047
Histogram: total number of steps taken per day
# Distribution of the total number of steps recorded per day
hist(
  TotalStepsByDay$Total.Steps,
  col = "lightblue",
  xlab = "Total Steps per day",
  main = "Histogram: Total Number of Steps per Day"
)
Mean and Median of Steps Taken per day
# Mean and median of the per-day step totals
meanOfSteps <- with(TotalStepsByDay, mean(Total.Steps))
medianOfSteps <- with(TotalStepsByDay, median(Total.Steps))
Mean is 10766.1886792 and median is 10765
# Average the step counts over all days for each 5-minute interval
ActivtyPattern <- aggregate(steps ~ interval, data = data0, FUN = mean)
# Line plot of the averaged profile across the intervals of a day
plot(
  ActivtyPattern$interval,
  ActivtyPattern$steps,
  type = "l",
  xlab = "5-minute Interval",
  ylab = "Average Daily Steps",
  main = "Average Daily Activity Pattern"
)
# Row (interval and its mean) holding the highest average step count
maxSteps <- ActivtyPattern[which.max(ActivtyPattern[["steps"]]), , drop = FALSE]
The 5-minute interval with the highest average number of steps is 835, with an average of 206.1698113 steps
# Flag each row that contains at least one missing value (TRUE = incomplete)
y <- !complete.cases(activitydata)
# Number of flagged rows, i.e. records with missing data
mrecords <- length(which(y))
There are a total of 2304 records with missing values
A new variable, new.steps, is added in which each missing step count is replaced by the mean number of steps for its 5-minute interval…
# Create a new variable holding a copy of the steps data; the missing entries
# of this copy are filled in below while the original column is left untouched
activitydata$new.steps <- activitydata$steps
# Average steps per 5-minute interval (the formula interface drops NA rows)
meansByInterval <- aggregate(steps ~ interval, data = activitydata, FUN = mean)
# Rows whose step count is missing. Only the `steps` column is inspected, so
# each affected row is handled exactly once.
missing_rows <- which(is.na(activitydata$steps))
# For every missing row, find the position of ITS OWN interval in the table of
# interval means. (The previous loop looked the interval up with the loop
# counter rather than the missing row's index, which only gives the right
# value when missing data happens to cover whole days.)
mean_position <- match(
  activitydata$interval[missing_rows],
  meansByInterval$interval
)
# Replace the missing values with the mean of the matching interval
activitydata$new.steps[missing_rows] <- meansByInterval$steps[mean_position]
Calculating summary on updated data…
# Recompute the daily totals, this time from the imputed step counts
TotalStepsByDay <- setNames(
  aggregate(activitydata$new.steps, list(activitydata$date), FUN = sum),
  c("Day", "Total.Steps")
)
# Distribution of the daily totals after the missing values were filled in
hist(
  TotalStepsByDay$Total.Steps,
  main = "Histogram: Total Number of Steps per Day: Missing Data corrected",
  col = "lightblue",
  xlab = "Total Steps per day"
)
# The column is named "Total.Steps" (capital S). Referencing the misspelled
# `Total.steps` made `$` return NULL, so no usable mean or median was
# produced; the correct column name is used here.
NewMeanOfSteps <- mean(TotalStepsByDay$Total.Steps)
NewMedianOfSteps <- median(TotalStepsByDay$Total.Steps)
Old Mean: 10766.1886792 and Old median: 10765
New Mean: 10766.1886792 and New median: 10766.1886792
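From the histogram above, the frequency of the central bin has clearly increased, because every previously missing day now carries an imputed total. The mean total daily steps is unchanged; only the median total daily steps seems to have changed when the missing data got updated.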
# Derive the weekend/weekday label for every record in one vectorised step.
# The numeric day of the week from POSIXlt (0 = Sunday, 6 = Saturday) is used
# instead of weekdays(), whose day names depend on the session locale and
# would not match "Saturday"/"Sunday" on a non-English system.
day_number <- as.POSIXlt(as.Date(activitydata$date))$wday
activitydata$weekday <- ifelse(day_number %in% c(0L, 6L), "weekend", "weekday")
# Average the imputed steps for each combination of day type and interval,
# then give the result columns descriptive names
TotalStepsByWeekDay <- setNames(
  aggregate(new.steps ~ weekday + interval, data = activitydata, FUN = mean),
  c("Weekday", "Interval", "Average.Steps")
)
#TotalStepsByWeekDay
# Define a 2-row, 1-column panel plot area, remembering the previous graphics
# settings so they can be restored once both panels are drawn
old_par <- par(mfrow = c(2, 1))
# Subset the averages by day type and plot one time-series panel for each
dday <- subset(TotalStepsByWeekDay, Weekday == "weekday")
dend <- subset(TotalStepsByWeekDay, Weekday == "weekend")
plot(
  dday$Interval, dday$Average.Steps,
  type = "l",
  main = "Activity Pattern - Weekday",
  ylab = "Average number of Steps",
  xlab = "5-minute Interval",
  col = "lightblue"
)
plot(
  dend$Interval, dend$Average.Steps,
  type = "l",
  main = "Activity Pattern - weekend",
  ylab = "Average number of Steps",
  xlab = "5-minute Interval",
  col = "lightblue"
)
# Restore the original layout so later plots are not forced into two panels
par(old_par)