R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

set the working directory

# Switch to the author's project folder only when it exists. An unconditional
# setwd() on a missing path stops the script with "cannot change working
# directory"; on other machines the current working directory is kept instead.
projectDir <- "/Users/Mohamed/DS/datasciencecoursera/reproducible-research/reproducible-research-Week2-Project"
if (dir.exists(projectDir)) {
  setwd(projectDir)
}

unzip the input data file

# Extract the archive only when activity.csv is not already present in the
# working directory.
if (!file.exists("activity.csv")) {
  unzip(zipfile = "repdata%2Fdata%2Factivity.zip")
}

read the activity file into a data frame

# Load the raw activity table, then parse the `date` column from
# "YYYY-MM-DD" text into Date values.
activityData <- read.csv("activity.csv", header = TRUE)
activityData[["date"]] <- as.Date(activityData[["date"]], format = "%Y-%m-%d")

get info on the activityData data frame

str(activityData)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : Date, format: "2012-10-01" "2012-10-01" ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...

Get a summary of the data

summary(activityData)
##      steps             date               interval     
##  Min.   :  0.00   Min.   :2012-10-01   Min.   :   0.0  
##  1st Qu.:  0.00   1st Qu.:2012-10-16   1st Qu.: 588.8  
##  Median :  0.00   Median :2012-10-31   Median :1177.5  
##  Mean   : 37.38   Mean   :2012-10-31   Mean   :1177.5  
##  3rd Qu.: 12.00   3rd Qu.:2012-11-15   3rd Qu.:1766.2  
##  Max.   :806.00   Max.   :2012-11-30   Max.   :2355.0  
##  NA's   :2304

Convert the date column to the Date class and check its class

# Re-applies the same "%Y-%m-%d" conversion that was already done right after
# read.csv(); the class() call then reports the column's class.
activityData$date <-  as.Date(activityData$date, format = "%Y-%m-%d")
class(activityData$date)
## [1] "Date"
# Total steps for each calendar day. Because na.rm = TRUE is passed to sum(),
# a day whose readings are all NA gets a total of 0 rather than NA.
dailyStepSum <- with(activityData, tapply(steps, date, sum, na.rm = TRUE))
# histData holds the same per-day totals and is the object that is plotted.
histData <- dailyStepSum
hist1 <- hist(
  histData,
  breaks = 19,
  col = "blue",
  main = "Frequency of Total number of steps per day",
  xlab = "Total steps per day",
  ylab = "Number of days"
)

### mean and median

# Mean and median of the per-day step totals held in histData (despite the
# "ByInterval" suffix, these are statistics over days).
meanDataByInterval <- mean(x = histData, na.rm = TRUE)
medianDataByInterval <- median(x = histData, na.rm = TRUE)

# Average steps for each 5-minute interval, then a preview of the first rows.
fiveMinSteps <- aggregate(steps ~ interval, activityData, mean)
head(fiveMinSteps)
##   interval     steps
## 1        0 1.7169811
## 2        5 0.3396226
## 3       10 0.1320755
## 4       15 0.1509434
## 5       20 0.0754717
## 6       25 2.0943396

Average daily activity pattern - time series plot

5-minute interval, on average across all the days in the dataset, that contains the maximum number of steps?

# Mean steps for each 5-minute interval across all days, ignoring NA readings.
stepsIntervalMean <- with(
  activityData,
  tapply(steps, as.factor(interval), mean, na.rm = TRUE)
)

# Line plot of the interval means; the x values are the interval ids, taken
# from the names of the tapply() result and converted to numbers.
plot(
  x = as.numeric(names(stepsIntervalMean)),
  y = stepsIntervalMean,
  type = "l",
  col = "red",
  main = "Mean Number of Steps based on 5 Minute Intervals",
  xlab = "Five Minute Interval",
  ylab = "Number of Steps"
)

# Position of the largest interval mean, then the interval id stored as the
# name of that position.
stepsIntervalMax <- which.max(stepsIntervalMean)
intervalMax <- as.numeric(names(stepsIntervalMean)[stepsIntervalMax])
intervalMax
## [1] 835

Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?

# Mean step count of the interval located by which.max(); as.numeric() drops
# the interval label so only the plain number is kept and printed.
stepsMax <- as.numeric(stepsIntervalMean[stepsIntervalMax])
stepsMax
## [1] 206.1698

What is the average daily activity pattern?

Imputing missing values

Calculate and report the total number of missing values in the dataset (i.e. the total number of rows with NAs)

# Count the rows that have an NA in at least one column.
nbrNAs <- sum(!complete.cases(activityData))
nbrNAs
## [1] 2304

Clean the data

# Drop every row that contains an NA in any column.
cleanData <- na.omit(activityData)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Group the NA-free rows by calendar day.
dataByDay <- cleanData %>% group_by(date)

Get a summary of the data

summary(dataByDay)
##      steps             date               interval     
##  Min.   :  0.00   Min.   :2012-10-02   Min.   :   0.0  
##  1st Qu.:  0.00   1st Qu.:2012-10-16   1st Qu.: 588.8  
##  Median :  0.00   Median :2012-10-29   Median :1177.5  
##  Mean   : 37.38   Mean   :2012-10-30   Mean   :1177.5  
##  3rd Qu.: 12.00   3rd Qu.:2012-11-16   3rd Qu.:1766.2  
##  Max.   :806.00   Max.   :2012-11-29   Max.   :2355.0

Devise a strategy for filling in all of the missing values in the dataset.

library(dplyr)
# Return `num` with every NA entry replaced by the mean of its non-missing
# entries; vectors without NA come back unchanged.
replaceWithMean <- function(num) {
  isMissing <- is.na(num)
  num[isMissing] <- mean(num, na.rm = TRUE)
  num
}
# Impute per interval: each NA step count becomes the mean of the observed
# steps for the same 5-minute interval. The result stays grouped by interval.
dataComplete <- activityData %>%
  group_by(interval) %>%
  mutate(steps = replaceWithMean(steps))
head(dataComplete)
## # A tibble: 6 x 3
## # Groups:   interval [6]
##       steps       date interval
##       <dbl>     <date>    <int>
## 1 1.7169811 2012-10-01        0
## 2 0.3396226 2012-10-01        5
## 3 0.1320755 2012-10-01       10
## 4 0.1509434 2012-10-01       15
## 5 0.0754717 2012-10-01       20
## 6 2.0943396 2012-10-01       25

Verify no more NA in dataComplete

sum(is.na(dataComplete))
## [1] 0

Create a new dataset that is equal to the original dataset but with the missing data filled in.

# Per-day step totals computed from the imputed data set, shown as a histogram.
cleanDataComplete <- with(dataComplete, tapply(steps, date, sum, na.rm = TRUE))
hist(
  cleanDataComplete,
  breaks = 19,
  col = "green",
  main = "Frequency of Total number of complete steps per day",
  xlab = "Total steps per day",
  ylab = "Number of days"
)

### Calculate the mean and median total number of steps taken per day of the new complete clean data

# Mean of the per-day totals in cleanDataComplete (built from the imputed data).
stepsCompleteMean <- mean(cleanDataComplete, na.rm = TRUE)
stepsCompleteMean
## [1] 10766.19
# Median of the per-day totals in cleanDataComplete (built from the imputed data).
stepsCompleteMedian <- median(cleanDataComplete, na.rm = TRUE)
stepsCompleteMedian
## [1] 10766.19

Do these values differ from the estimates from the first part of the assignment?

Before cleaning

meanDataByInterval
## [1] 9354.23
medianDataByInterval
## [1] 10395

After cleaning

stepsCompleteMean
## [1] 10766.19
stepsCompleteMedian
## [1] 10766.19

We see a difference in the mean and median before and after the data cleaning: the mean rises from 9354.23 to 10766.19 and the median from 10395 to 10766.19.

Are there differences in activity patterns between weekdays and weekends?

Create a new factor variable in the dataset with two levels – “weekday” and “weekend” indicating whether a given date is a weekday or weekend day.

# Label each row "weekend" or "weekday". The day is identified by the POSIXlt
# `wday` number (0 = Sunday, 6 = Saturday) instead of weekdays(), whose names
# depend on the session locale: with non-English day names the comparison
# against "Saturday"/"Sunday" never matches and every row becomes "weekday".
dataComplete$day <- ifelse(as.POSIXlt(dataComplete$date)$wday %in% c(0L, 6L), "weekend", "weekday")

display

head(dataComplete)
## # A tibble: 6 x 4
## # Groups:   interval [6]
##       steps       date interval     day
##       <dbl>     <date>    <int>   <chr>
## 1 1.7169811 2012-10-01        0 weekday
## 2 0.3396226 2012-10-01        5 weekday
## 3 0.1320755 2012-10-01       10 weekday
## 4 0.1509434 2012-10-01       15 weekday
## 5 0.0754717 2012-10-01       20 weekday
## 6 2.0943396 2012-10-01       25 weekday

Make a panel plot containing a time series plot (i.e. type = "l") of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis).

# Mean steps per 5-minute interval for the rows labelled "weekday".
wday <- subset(dataComplete, day == "weekday")
wday.steps <- setNames(
  aggregate(wday$steps, by = list(wday$interval), FUN = mean),
  c("Interval", "Mean.Steps")
)
wday.steps$day <- "Weekday"

# Same summary for the rows labelled "weekend".
wend <- subset(dataComplete, day == "weekend")
wend.steps <- setNames(
  aggregate(wend$steps, by = list(wend$interval), FUN = mean),
  c("Interval", "Mean.Steps")
)
wend.steps$day <- "Weekend"

# Stack both summaries; the `day` column tells the two panels apart.
activity.data.weekday <- rbind(wday.steps, wend.steps)

The two Time Series plots

library(lattice)
# One line panel per `day` value, stacked in a single column of two rows.
xyplot(
  Mean.Steps ~ Interval | day,
  data = activity.data.weekday,
  type = "l",
  layout = c(1, 2),
  col = "green",
  ylab = "Average Number of Steps",
  main = "Average Number of Steps in 5 minute intervals Wday vs Wend"
)

There are some differences at the beginning of the graphs between weekdays and weekends.