This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load the activity data set, extracting it from the downloaded archive first
# when the CSV is not already present.
#
# Both paths are resolved against the current working directory instead of a
# machine-specific absolute path, so the document can be run from any checkout
# that contains the data (or its archive) next to it.
activityCsv <- "activity.csv"
activityZip <- "repdata%2Fdata%2Factivity.zip"
if (!file.exists(activityCsv)) {
  if (!file.exists(activityZip)) {
    # Fail with a clear message rather than letting read.csv() report a
    # missing file after a silent unzip() failure.
    stop(
      "Neither '", activityCsv, "' nor '", activityZip,
      "' was found in ", getwd(),
      call. = FALSE
    )
  }
  unzip(activityZip)
}
activityData <- read.csv(file = activityCsv, header = TRUE)
# The date column is read as "YYYY-MM-DD" text; convert it to Date once here.
activityData$date <- as.Date(activityData$date, "%Y-%m-%d")
str(activityData)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Date, format: "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
# Column-wise summary of the raw data, including the count of missing steps.
summary(activityData)
## steps date interval
## Min. : 0.00 Min. :2012-10-01 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.:2012-10-16 1st Qu.: 588.8
## Median : 0.00 Median :2012-10-31 Median :1177.5
## Mean : 37.38 Mean :2012-10-31 Mean :1177.5
## 3rd Qu.: 12.00 3rd Qu.:2012-11-15 3rd Qu.:1766.2
## Max. :806.00 Max. :2012-11-30 Max. :2355.0
## NA's :2304
# The date column was already converted to Date when the data was loaded, so
# it is not parsed a second time; assert the class and display it instead.
stopifnot(inherits(activityData$date, "Date"))
class(activityData$date)
## [1] "Date"
# Total number of steps recorded on each date.
# NOTE(review): sum(..., na.rm = TRUE) yields 0 for a date whose readings are
# all NA, so such dates enter the histogram and the statistics below as
# zero-step days rather than being left out -- confirm this is intended.
dailyStepSum <- tapply(
  X = activityData$steps,
  INDEX = activityData$date,
  FUN = sum,
  na.rm = TRUE
)
# The same per-date totals, under the name used for plotting and statistics.
histData <- dailyStepSum
# hist1 keeps the object returned by hist() in addition to drawing the plot.
hist1 <- hist(
  histData,
  breaks = 19,
  col = "blue",
  xlab = "Total steps per day",
  ylab = "Number of days",
  main = "Frequency of Total number of steps per day"
)
### Mean and median of the per-day totals
meanDataByInterval <- mean(histData, na.rm = TRUE)
medianDataByInterval <- median(histData, na.rm = TRUE)
# Mean number of steps in each 5-minute interval, as a data frame with one row
# per interval.
fiveMinSteps <- aggregate(
  steps ~ interval,
  data = activityData,
  FUN = mean
)
head(fiveMinSteps)
## interval steps
## 1 0 1.7169811
## 2 5 0.3396226
## 3 10 0.1320755
## 4 15 0.1509434
## 5 20 0.0754717
## 6 25 2.0943396
# Interval identifiers as text labels, in factor-level order.
intervalLabels <- levels(as.factor(activityData$interval))
# Per-interval means again, this time indexed by those interval labels.
stepsIntervalMean <- tapply(
  activityData$steps,
  as.factor(activityData$interval),
  mean,
  na.rm = TRUE
)
plot(
  intervalLabels,
  stepsIntervalMean,
  type = "l",
  xlab = "Five Minute Interval",
  ylab = "Number of Steps",
  main = "Mean Number of Steps based on 5 Minute Intervals",
  col = "red"
)
# Position of the largest mean, then the interval it belongs to.
stepsIntervalMax <- which.max(stepsIntervalMean)
intervalMax <- as.numeric(intervalLabels[stepsIntervalMax])
intervalMax
## [1] 835
# The mean step count at that interval, as a bare number.
stepsMax <- stepsIntervalMean[[stepsIntervalMax]]
stepsMax
## [1] 206.1698
# Number of rows that contain at least one missing value.
nbrNAs <- sum(!complete.cases(activityData))
nbrNAs
## [1] 2304
# Copy of the data with every incomplete row removed.
cleanData <- na.omit(activityData)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Group the NA-free observations by date and print a summary of the columns.
dataByDay <- cleanData %>%
  group_by(date)
summary(dataByDay)
## steps date interval
## Min. : 0.00 Min. :2012-10-02 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.:2012-10-16 1st Qu.: 588.8
## Median : 0.00 Median :2012-10-29 Median :1177.5
## Mean : 37.38 Mean :2012-10-30 Mean :1177.5
## 3rd Qu.: 12.00 3rd Qu.:2012-11-16 3rd Qu.:1766.2
## Max. :806.00 Max. :2012-11-29 Max. :2355.0
library(dplyr)
#' Replace missing entries of a numeric vector with the mean of the rest
#'
#' @param num A numeric vector, possibly containing `NA`.
#' @return `num` with every `NA` replaced by `mean(num, na.rm = TRUE)`;
#'   returned unchanged when it has no missing entries.
replaceWithMean <- function(num) {
  missing_idx <- is.na(num)
  num[missing_idx] <- mean(num, na.rm = TRUE)
  num
}
# Fill in missing step counts: within each 5-minute interval, every NA is
# replaced by the mean of that interval's observed values.
# The result stays grouped by interval, as the printed header below shows.
dataComplete <- activityData %>%
  group_by(interval) %>%
  mutate(steps = replaceWithMean(steps))
head(dataComplete)
## # A tibble: 6 x 3
## # Groups: interval [6]
## steps date interval
## <dbl> <date> <int>
## 1 1.7169811 2012-10-01 0
## 2 0.3396226 2012-10-01 5
## 3 0.1320755 2012-10-01 10
## 4 0.1509434 2012-10-01 15
## 5 0.0754717 2012-10-01 20
## 6 2.0943396 2012-10-01 25
# Number of missing cells left after the replacement.
sum(is.na(dataComplete))
## [1] 0
# Per-date step totals computed from the completed data.
cleanDataComplete <- tapply(
  X = dataComplete[["steps"]],
  INDEX = dataComplete[["date"]],
  FUN = sum,
  na.rm = TRUE
)
hist(
  cleanDataComplete,
  breaks = 19,
  col = "green",
  xlab = "Total steps per day",
  ylab = "Number of days",
  main = "Frequency of Total number of complete steps per day"
)
### Mean and median of the total steps per day in the completed data
stepsCompleteMean <- mean(cleanDataComplete, na.rm = TRUE)
stepsCompleteMean
## [1] 10766.19
stepsCompleteMedian <- median(cleanDataComplete, na.rm = TRUE)
stepsCompleteMedian
## [1] 10766.19
# Print the per-day statistics side by side for comparison.
# Mean of the per-day totals from the original data (NAs dropped while summing):
meanDataByInterval
## [1] 9354.23
# Median of the per-day totals from the original data:
medianDataByInterval
## [1] 10395
# Mean of the per-day totals after NAs were replaced by interval means:
stepsCompleteMean
## [1] 10766.19
# Median of the per-day totals after NAs were replaced by interval means:
stepsCompleteMedian
## [1] 10766.19
# Label every observation as "weekend" or "weekday".
# as.POSIXlt()$wday numbers the days 0 (Sunday) to 6 (Saturday) independently
# of the session locale, whereas weekdays() returns translated day names and
# would never match "Saturday"/"Sunday" under a non-English LC_TIME setting.
dataComplete$day <- ifelse(
  as.POSIXlt(dataComplete$date)$wday %in% c(0L, 6L),
  "weekend",
  "weekday"
)
head(dataComplete)
## # A tibble: 6 x 4
## # Groups: interval [6]
## steps date interval day
## <dbl> <date> <int> <chr>
## 1 1.7169811 2012-10-01 0 weekday
## 2 0.3396226 2012-10-01 5 weekday
## 3 0.1320755 2012-10-01 10 weekday
## 4 0.1509434 2012-10-01 15 weekday
## 5 0.0754717 2012-10-01 20 weekday
## 6 2.0943396 2012-10-01 25 weekday
# Mean steps per interval for one subset of the observations, tagged with a
# display label and given the column names used by the panel plot.
meanStepsByInterval <- function(observations, label) {
  means <- aggregate(observations$steps, list(observations$interval), mean)
  means$day <- label
  colnames(means) <- c("Interval", "Mean.Steps", "day")
  means
}

# Split the completed data by day type and average each part separately.
wday <- subset(dataComplete, day == "weekday")
wday.steps <- meanStepsByInterval(wday, "Weekday")
wend <- subset(dataComplete, day == "weekend")
wend.steps <- meanStepsByInterval(wend, "Weekend")
# Stack both results into one long table: weekday rows first, then weekend.
activity.data.weekday <- rbind(wday.steps, wend.steps)
library(lattice)
# Line plot of mean steps against interval, one panel per day type, arranged
# in a single column of two rows.
xyplot(
  Mean.Steps ~ Interval | day,
  data = activity.data.weekday,
  type = "l",
  layout = c(1, 2),
  ylab = "Average Number of Steps",
  main = "Average Number of Steps in 5 minute intervals Wday vs Wend",
  col = "green"
)