First, to make sure that everyone can see the R code, we set `echo = TRUE` for the whole document.
# Document-wide chunk defaults: echo the source code of every chunk and emit
# chunk results as-is.
knitr::opts_chunk$set(
  echo = TRUE,
  results = "asis"
)
# Fetch the activity data set and load it into `Activity`.
# The archive is downloaded and unpacked only when the CSV is not already in
# the working directory, so re-running the document does not need the network.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
if (!file.exists("activity.csv")) {
  if (!file.exists("activity.zip")) {
    # mode = "wb" transfers the zip archive in binary mode.
    download.file(url, "activity.zip", mode = "wb")
  }
  # The archive is expected to contain activity.csv, which is read below.
  unzip("./activity.zip", exdir = getwd())
}
Activity <- read.csv("activity.csv")
# Total number of steps per day. Missing values are dropped inside sum(), so a
# day whose step counts are all NA gets a total of 0.
ActivitybyDay <- setNames(
  aggregate(Activity$steps, by = list(Activity$date), FUN = sum, na.rm = TRUE),
  c("Date", "Steps")
)
hist(
  ActivitybyDay$Steps,
  main = "Histogram of the total number of steps taken each day",
  xlab = "Steps",
  ylim = c(0, 30)
)
# The results are stored under the names `mean` and `median`, which shadow the
# base functions as variables; the functions are therefore called with an
# explicit namespace here.
mean <- round(base::mean(ActivitybyDay$Steps))
median <- stats::median(ActivitybyDay$Steps)
The average number of steps taken per day is 9354. The median number of steps taken per day is 10395.
library(ggplot2)
# Mean number of steps for each interval, averaged over all days and ignoring
# missing values.
ActivitybyInterval <- setNames(
  aggregate(Activity$steps, by = list(Activity$interval), FUN = mean, na.rm = TRUE),
  c("Interval", "Steps")
)
# Line plot of the average steps against the interval identifier.
g <- ggplot(ActivitybyInterval, aes(Interval, Steps)) +
  geom_line() +
  labs(title = "Average daily activity pattern")
g
# Interval(s) whose average step count equals the overall maximum.
MaxInterval <- with(ActivitybyInterval, Interval[Steps == max(Steps)])
Interval 835 contains the maximum number of steps, averaged across all days.
# Count the rows whose step count is missing.
missing_steps <- is.na(Activity[["steps"]])
NAs <- sum(missing_steps)
The total number of missing values in the dataset is 2304
# Impute missing step counts with the average for the same interval, then
# recompute the daily totals.
# Columns are renamed by position (assumes the CSV order is steps, date, interval).
colnames(Activity) <- c("Steps", "Date", "Interval")
NewActivity <- merge(x = Activity, y = ActivitybyInterval, by = "Interval")
# Both inputs carry a "Steps" column; give the four merged columns explicit
# names so the observed and the average values can be told apart.
colnames(NewActivity) <- c("Interval", "Steps", "Date", "AverageSteps")
steps_missing <- is.na(NewActivity$Steps)
NewActivity$Steps[steps_missing] <- NewActivity$AverageSteps[steps_missing]
# Drop the helper column now that the gaps are filled.
NewActivity <- NewActivity[, c("Interval", "Steps", "Date")]
NewActivitybyDate <- setNames(
  aggregate(NewActivity$Steps, by = list(NewActivity$Date), FUN = sum),
  c("Date", "Steps")
)
hist(
  NewActivitybyDate$Steps,
  main = "New Histogram of the Total number of steps taken each day",
  xlab = "Steps",
  ylim = c(0, 35)
)
Newmean <- base::mean(NewActivitybyDate$Steps)
Newmedian <- stats::median(NewActivitybyDate$Steps)
The new mean and median values are bigger than the original mean and median values. Imputing missing data increases the estimates of the total daily number of steps. The new mean is $1.0766189 \times 10^{4}$ (about 10766.19). The new median is 10395.
# Compare the average activity pattern on weekends with that on weekdays.
NewActivity$Date <- as.Date(NewActivity$Date)

# Label each row "Weekend" or "Weekday" from the numeric day of the week
# (POSIXlt wday: 0 = Sunday, 6 = Saturday). Unlike weekdays(), which returns
# day names in the session's locale, this does not depend on an English locale.
# The classification is vectorized, so no row-by-row loop is needed.
is_weekend <- as.POSIXlt(NewActivity$Date)$wday %in% c(0, 6)
NewActivity$DayofWeek <- ifelse(is_weekend, "Weekend", "Weekday")

# Average steps per interval, separately for weekend and weekday rows.
weekend <- NewActivity[NewActivity$DayofWeek == "Weekend", ]
weekday <- NewActivity[NewActivity$DayofWeek == "Weekday", ]
weekend <- setNames(
  aggregate(weekend$Steps, list(weekend$Interval), FUN = mean),
  c("Interval", "Steps")
)
weekday <- setNames(
  aggregate(weekday$Steps, list(weekday$Interval), FUN = mean),
  c("Interval", "Steps")
)

# Stack the two line plots vertically; restore the previous graphics
# parameters afterwards so later plots are not affected by the 2 x 1 layout.
old_par <- par(mfrow = c(2, 1))
plot(weekend$Interval, weekend$Steps, type = "l",
     xlab = "Interval", ylab = "Steps", main = "Weekend")
plot(weekday$Interval, weekday$Steps, type = "l",
     xlab = "Interval", ylab = "Steps", main = "Weekday")
par(old_par)