# Download the activity archive, unpack it and load the CSV into `data`.
# `zip` keeps the status code returned by download.file() (0 means success);
# `files` keeps the path(s) that unzip() extracted.
zip <- download.file(
  "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip",
  destfile = "raw_data.zip",
  method = "curl"
)
# With method = "curl" a failed transfer does not raise an R error; it is
# only visible through the non-zero status code. Stop here instead of letting
# unzip()/read.csv() fail later on a missing or truncated archive.
if (zip != 0) {
  stop("Download of raw_data.zip failed with status ", zip, call. = FALSE)
}
files <- unzip("raw_data.zip")
# Load data and assign it to variable "data"
data <- read.csv(files)
# Clean data
# Convert the date column to Date so it can be grouped and plotted as a date.
data$date <- as.Date(data$date)
# data1: total steps per day. sum() is called without na.rm, so a day that
# contains any NA step value gets an NA total.
data1 <- tapply(data$steps, data$date, sum)
# data2: the daily totals as a data frame, with the dates recovered from the
# names of the tapply() result.
data2 <- data.frame(date = as.Date(names(data1)), steps = data1)
# data3: mean steps per 5-minute interval, ignoring NAs. The interval labels
# are taken from the names of the same tapply() result so that every label is
# guaranteed to sit next to its own mean (unique(data$interval) follows the
# row order of `data`, while tapply() orders its output by sorted group).
interval_avg <- tapply(data$steps, data$interval, mean, na.rm = TRUE)
data3 <- data.frame(
  interval = type.convert(names(interval_avg), as.is = TRUE),
  steps = interval_avg
)
# 1. Plot the total number of steps taken each day (one bar per date)
library(ggplot2)
ggplot(data = data2, mapping = aes(x = date, y = steps)) +
  geom_bar(stat = "identity", colour = "blue", fill = "blue") +
  labs(title = "Daily Steps Taken") +
  theme(plot.title = element_text(lineheight = 0.8, face = "bold"))
# Mean (truncated to a whole number) and median of the daily step totals;
# days whose total is NA are left out of both figures.
mean_steps <- as.integer(mean(data2[["steps"]], na.rm = TRUE))
median_steps <- median(data2[["steps"]], na.rm = TRUE)
# Time series plot of the 5-minute interval (x-axis) against the average
# number of steps taken, averaged across all days (y-axis).
# Which 5-minute interval, on average across all the days in the dataset,
# contains the maximum number of steps?
# The peak is located once with which.max() and reused for the marker line
# and the label height. which.max() skips NA/NaN means and returns a single
# position, whereas `steps == max(steps)` yields NA when any mean is missing.
peak_row <- which.max(data3$steps)
max_steps_interval <- data3$interval[peak_row]
ggplot(data3, aes(x = interval, y = steps)) +
  geom_line() +
  geom_vline(
    xintercept = max_steps_interval,
    color = "red",
    linetype = "longdash"
  ) +
  ggtitle("Average Daily Activity Pattern by 5-minute Interval") +
  # The label is drawn at x = 1245, at the height of the peak.
  annotate(
    "text",
    x = 1245,
    y = data3$steps[peak_row],
    label = "Max Avg. Daily Activity"
  )
# 1. Count the missing values in the dataset: the number of rows whose
# `steps` entry is NA.
missing_values <- sum(is.na(data[["steps"]]))
# Create a new dataset (data4) equal to the original, but with every missing
# `steps` value replaced by the mean of the same 5-minute interval, rounded
# down to a whole number.
interval_mean <- tapply(data$steps, data$interval, mean, na.rm = TRUE)
data4 <- data
# Fill all missing rows in one vectorized step: interval_mean is named by
# interval, so indexing it with the interval labels of the NA rows returns the
# matching mean for each row. This also works when there are no rows at all,
# where a `1:nrow(data4)` loop would iterate over c(1, 0).
na_rows <- is.na(data4$steps)
data4$steps[na_rows] <- floor(
  unname(interval_mean[as.character(data4$interval[na_rows])])
)
# First six rows of the original dataset (steps still NA)
head(data, n = 6L)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
# First six rows of the new dataset, where each NA has been replaced by the
# mean steps of the same interval
head(data4, n = 6L)
## steps date interval
## 1 1 2012-10-01 0
## 2 0 2012-10-01 5
## 3 0 2012-10-01 10
## 4 0 2012-10-01 15
## 5 0 2012-10-01 20
## 6 2 2012-10-01 25
# Daily step charts for the original and the new (imputed) dataset, drawn
# side by side; the new mean & median follow below.
# library() stops immediately if gridExtra is not installed, whereas
# require() only returns FALSE and the script would fail later at
# grid.arrange().
library(gridExtra)
# Original dataset: one bar per day from the daily totals in data2.
original_plot <- ggplot(data2, aes(x = date, y = steps)) +
  geom_bar(stat = "identity", color = "blue", fill = "blue") +
  ggtitle("Daily Steps Taken (Original Dataset)") +
  theme(plot.title = element_text(lineheight = .8, face = "bold"))
# New dataset: data4 holds one row per interval, so the bars for a date are
# stacked into that day's total.
new_plot <- ggplot(data4, aes(x = date, y = steps)) +
  geom_bar(stat = "identity", color = "purple", fill = "purple") +
  ggtitle("Daily Steps Taken (New Dataset)") +
  theme(plot.title = element_text(lineheight = .8, face = "bold"))
# Drawing original_plot warns about the days whose total in data2 is NA.
grid.arrange(original_plot, new_plot, ncol = 2)
## Warning: Removed 8 rows containing missing values (position_stack).
# Mean (truncated to a whole number) and median of the total steps per day,
# for the original and for the new (imputed) dataset.
original_mean_steps <- as.integer(mean(data2$steps, na.rm = TRUE))
original_median_steps <- median(data2$steps, na.rm = TRUE)
# data4 has one row per 5-minute interval, so it must be summed per day first;
# taking mean()/median() of data4$steps directly would describe steps per
# interval and could not be compared with the daily figures above.
new_daily_steps <- tapply(data4$steps, data4$date, sum)
new_mean_steps <- as.integer(mean(new_daily_steps, na.rm = TRUE))
new_median_steps <- median(new_daily_steps, na.rm = TRUE)
# * Original dataset mean: 10766
# * New dataset mean: 10766
# * Original dataset median: 10765
# * New dataset median: 10765
# Create a factor with two levels, "weekday" and "weekend", indicating whether
# a given date is a weekday or weekend day, and add it to the new dataset as
# the column `weekdays`.
# The day of the week is read from POSIXlt's `wday` field (0 = Sunday ...
# 6 = Saturday), which does not depend on the locale. weekdays() returns
# localized day names, so comparing them with "Saturday"/"Sunday" would label
# every date "weekday" in a non-English locale.
weekdays <- factor(
  ifelse(as.POSIXlt(data4$date)$wday %in% c(0L, 6L), "weekend", "weekday"),
  levels = c("weekday", "weekend")
)
# Assigning the column (instead of cbind) replaces it on a re-run rather than
# appending a second `weekdays` column.
data4$weekdays <- weekdays
# Creating plots: one panel per level of `weekdays`, showing steps against the
# 5-minute interval.
# data4 contains one row per date for every interval, so the steps are first
# averaged per interval and day type; geom_line() then gets a single value per
# x position instead of connecting every day's raw observation at each
# interval.
avg_steps_by_daytype <- aggregate(
  steps ~ interval + weekdays,
  data = data4,
  FUN = mean
)
ggplot(avg_steps_by_daytype, aes(x = interval, y = steps)) +
  geom_line(color = "blue") +
  ylab("Number of steps") +
  facet_wrap(~weekdays, ncol = 1)