Begin by loading the data and converting the steps variable to numeric.
## Read the raw activity data; the first row of the CSV supplies column names.
activity <- read.csv("activity.csv", header = TRUE)
## Coerce the steps column to numeric (double).
activity$steps <- as.numeric(activity$steps)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Total steps per day. sum() is called without na.rm, so any day that
## contains a missing step count ends up with an NA total.
activityDF <- activity %>%
  group_by(date) %>%
  summarize(dateSums = sum(steps))
## Histogram of the daily totals, in bins that are 2500 steps wide.
g <- ggplot(activityDF, aes(x = dateSums))
p <- g + geom_histogram(binwidth = 2500, fill = "white", colour = "black") +
  labs(title = "Distribution of daily activities", x = "steps per day")
print(p)
## Mean and median of the daily totals; na.rm = TRUE skips NA totals.
mean(activityDF$dateSums, na.rm = TRUE)
## [1] 10766.19
median(activityDF$dateSums, na.rm = TRUE)
## [1] 10765
## Average number of steps in each interval, computed after dropping every
## row that contains an NA. The pipe already supplies the data frame as the
## first argument of na.omit(), so it is not passed a second time.
dailyPattern <- activity %>%
  na.omit() %>%
  group_by(interval) %>%
  summarize(averageSteps = mean(steps))
## Line plot of the interval averages.
gAverage <- ggplot(dailyPattern, aes(x = interval, y = averageSteps))
pAverage <- gAverage + geom_line() +
  labs(title = "Average number of steps per time interval",
       y = "average steps")
print(pAverage)
## Keep the row(s) of dailyPattern whose averageSteps equals the column
## maximum, then display the result.
maxAverageSteps <- dailyPattern[
  with(dailyPattern, averageSteps == max(averageSteps)),
]
maxAverageSteps
## Source: local data frame [1 x 2]
##
## interval averageSteps
## 1 835 206.1698
To analyse missing values, I created a logical vector that flags each NA in the steps variable. Tabulating it shows that 2304 rows have missing step counts.
## Missing values
## TRUE marks a row whose step count is NA; table() counts both cases.
NAvector <- is.na(activity[["steps"]])
table(NAvector)
## NAvector
## FALSE TRUE
## 15264 2304
I then replaced every NA value in the steps variable with the average number of steps for the corresponding interval.
## Replace each NA in the steps variable with the average number of steps
## recorded for the corresponding interval (taken from dailyPattern).
dailyPatternDF <- data.frame(dailyPattern) ## plain data frame for base indexing
## Start from the observed values and overwrite only the missing ones.
adjustedSteps <- activity$steps
naRows <- which(NAvector)
## match() locates, for every NA row, the dailyPatternDF row with the same
## interval, so all replacements happen in a single vectorized assignment
## instead of growing a vector one element at a time inside a loop.
## An interval with no entry in dailyPatternDF stays NA.
adjustedSteps[naRows] <- dailyPatternDF$averageSteps[
  match(activity$interval[naRows], dailyPatternDF$interval)
]
## Combine the filled-in steps with the date and interval columns, selected
## by name rather than by position.
activityCleaned <- data.frame(adjustedSteps,
                              activity[, c("date", "interval")])
## Total steps per day, using the step counts in which NAs were replaced by
## interval averages.
activityCleanedDF <- activityCleaned %>%
  group_by(date) %>%
  summarize(dateSums = sum(adjustedSteps))
## Histogram of the daily totals, in bins that are 2500 steps wide. The title
## says "imputed" because missing values were filled in, not dropped.
g2 <- ggplot(activityCleanedDF, aes(x = dateSums))
p2 <- g2 + geom_histogram(binwidth = 2500, fill = "white", colour = "black") +
  labs(title = "Distribution of daily activities, NAs imputed",
       x = "steps per day")
print(p2)
## Mean and median of the daily totals after imputation (no na.rm is used).
mean(activityCleanedDF[["dateSums"]])
## [1] 10766.19
median(activityCleanedDF[["dateSums"]])
## [1] 10766.19
## Differences in activity patterns between weekdays and weekends
## Extract date information and create a logical vector flagging weekends
Day <- as.Date(activityCleaned$date)
WeekDay <- weekdays(Day) ## localized day names; not used for classification
## Classify by numeric day of the week (0 = Sunday, 6 = Saturday) instead of
## comparing against English day names, which fails under non-English locales.
## An NA date yields NA here and therefore NA in the factor below.
dayNumber <- as.POSIXlt(Day)$wday
WeekEnd <- dayNumber == 0 | dayNumber == 6
## Make a factor variable labeling each date as weekday or weekend
WeekDayFactor <- factor(ifelse(WeekEnd, "Weekend", "Weekday"),
                        levels = c("Weekday", "Weekend"))
activityCleaned$WeekDay <- WeekDayFactor
## Average imputed step count for every combination of day type (WeekDay)
## and interval.
dailyPatternCleaned <- summarize(
  group_by(activityCleaned, WeekDay, interval),
  averageSteps = mean(adjustedSteps)
)
library(lattice)
## Warning: package 'lattice' was built under R version 3.1.3
## Average steps against interval, conditioned on WeekDay and laid out as
## one column by two rows.
xyplot(
  x = averageSteps ~ interval | WeekDay,
  data = dailyPatternCleaned,
  type = "a",
  layout = c(1, 2),
  main = "Average number of steps per time interval",
  ylab = "Number of steps"
)