Synopsis

This analysis uses step-count data collected at 5-minute intervals throughout the day over a period of 2 months to infer user activity patterns for different subjects.

Loading and preprocessing the data

# Fetch the zipped activity data into a temporary file, read the CSV straight
# out of the archive, then delete the temporary file again.
temp <- tempfile()
download.file(
  url = "http://d396qusza40orc.cloudfront.net/repdata/data/activity.zip",
  destfile = temp
)
data <- read.csv(file = unz(description = temp, filename = "activity.csv"))
unlink(temp)

# data1: only the rows of data whose step count is not NA
data1 <- data[complete.cases(data[["steps"]]), ]

# dataSplit: list holding one data frame per value of the date variable
dataSplit <- split(data1, f = data1[["date"]])

# sum_vector: empty placeholder for the per-day step totals
sum_vector <- NULL

Mean total number of steps taken per day

# Total steps for every day in dataSplit, computed as mean * number of
# readings for that day. A day without any readings yields NaN (the mean of
# nothing multiplied by 0), which is filtered out just below.
sum_vector <- vapply(
  dataSplit,
  function(day) {
    readings <- day$steps
    mean(readings) * length(readings)
  },
  numeric(1),
  USE.NAMES = FALSE
)

# Keep only the days for which a total could be computed
sum_vector_without_NA <- sum_vector[!is.na(sum_vector)]

# Histogram of the total number of steps taken each day
hist(
  sum_vector_without_NA,
  main = "Histogram of total number of steps taken each day",
  xlab = "total number of steps taken each day",
  col = rainbow(7)
)

# Finding the mean and the median of the total number of steps taken per day

The mean of the total number of steps taken per day is $1.0766189 \times 10^{4}$ and the corresponding median is $1.0765 \times 10^{4}$.

Average daily activity pattern

# Positions of the days whose total in sum_vector is NA/NaN, i.e. days on
# which no activity was recorded. Kept for reference only; the filtering
# below uses a logical mask.
indices_with_NA <- which(is.na(sum_vector))

# Drop the no-activity days from dataSplit and store the result in a new list.
# A logical mask is used instead of dataSplit[-indices_with_NA]: when no day
# is NA (e.g. `date` was read as character, so split() created no empty
# groups) the negative index is empty and would select nothing, silently
# discarding every day.
dataSplit_without_NA <- dataSplit[!is.na(sum_vector)]

# One row per remaining day, one column per reading within the day, built
# with a single rbind instead of growing a data frame row by row.
steps_taken_data_frame <- as.data.frame(
  do.call(rbind, lapply(dataSplit_without_NA, function(day) day$steps))
)

# mean_col: mean number of steps for each 5-minute interval across all days
mean_col <- colMeans(steps_taken_data_frame)

# Zero-based position of each interval within the day, as a time series.
# seq_len() avoids the c(0, -1) that 0:(n - 1) would produce for n == 0.
no_of_intervals <- length(mean_col)
p <- ts(seq_len(no_of_intervals) - 1L)

# Time series plot (type = "l") of the 5-minute interval (x-axis) against the
# average number of steps taken, averaged across all days (y-axis)
plot(
  p,
  mean_col,
  main = "5-minute interval (x-axis) vs the average number of steps taken",
  xlab = "5-minute interval",
  ylab = "average number of steps taken",
  type = "l"
)

#The 5-minute interval, on average across all the days in the dataset, containing the maximum number of steps

The 5-minute interval that, on average across all the days in the dataset, contains the maximum number of steps is 835.

Imputing missing values

# Total number of missing values in the dataset (i.e. the number of rows whose
# step count is NA). is.na() gives a logical vector and sum() counts its TRUE
# entries, which replaces the element-by-element counting loop.
a <- is.na(data$steps)
count <- sum(a)

Total number of missing values in the dataset (i.e. the total number of rows with NAs) is 2304

# Replace every missing step count in the original data with the mean for
# that 5-minute interval.

# Label each interval mean with its interval identifier (taken from the first
# no_of_intervals rows of data) so that it can be looked up by name.
names(mean_col) <- as.character(data$interval[seq_len(no_of_intervals)])

# Look up the interval mean for each row with a missing step count and write
# it into that row.
missing_rows <- is.na(data$steps)
data$steps[missing_rows] <-
  mean_col[as.character(data$interval[missing_rows])]

# Same per-day totals and histogram as in the first section, this time on the
# data with the missing values filled in.
dataSplit_new <- split(data, data$date)
vector_new <- vapply(
  dataSplit_new,
  function(day) mean(day$steps) * length(day$steps),
  numeric(1),
  USE.NAMES = FALSE
)
hist(
  vector_new,
  main = "Histogram of total number of steps taken each day",
  xlab = "total number of steps taken each day",
  col = rainbow(7)
)

The new mean and median of the total number of steps taken per day are $1.0766189 \times 10^{4}$ and $1.0766189 \times 10^{4}$ respectively.

Differences in activity patterns between weekdays and weekends

# Label every observation as "weekday" or "weekend" and compare the average
# activity profile of the two groups.

new_data <- data

# Day of the week as a number (0 = Sunday ... 6 = Saturday). weekdays()
# returns day names in the session locale, so comparing them with
# "Saturday"/"Sunday" labels every day "weekday" on a non-English system;
# the numeric wday field does not depend on the locale.
day_of_week <- as.POSIXlt(as.Date(new_data$date))$wday
new_col <- factor(
  ifelse(day_of_week %in% c(0L, 6L), "weekend", "weekday"),
  levels = c("weekday", "weekend")
)

# adding the new factor variable to our data frame
data <- cbind(data, new_col)

new_dataSplit <- split(data, data$date)

# All rows of one date carry the same label, so the label of the first row
# decides which group the whole day's readings belong to.
day_type <- vapply(
  new_dataSplit,
  function(day) as.character(day$new_col[1]),
  character(1)
)
steps_by_day <- lapply(new_dataSplit, function(day) day$steps)

# One row per day, one column per reading within the day; built with a single
# rbind per group instead of growing two data frames inside nested loops.
# Frame 1 holds the weekday days, frame 2 the weekend days.
new_steps_taken_data_frame1 <- as.data.frame(
  do.call(rbind, steps_by_day[which(day_type == "weekday")])
)
new_steps_taken_data_frame2 <- as.data.frame(
  do.call(rbind, steps_by_day[which(day_type == "weekend")])
)

# mean_col1 and mean_col2 contain the mean number of steps per 5-minute
# interval for weekdays and weekends respectively
mean_col1 <- colMeans(new_steps_taken_data_frame1)
mean_col2 <- colMeans(new_steps_taken_data_frame2)

# Zero-based position of each interval within the day, as a time series
no_of_intervals <- length(mean_col1)
p <- ts(seq_len(no_of_intervals) - 1L)

# Two stacked time series plots (type = "l") of the 5-minute interval (x-axis)
# against the average number of steps taken (y-axis). The previous graphics
# layout is restored afterwards so later plots are not forced into two rows.
old_par <- par(mfrow = c(2, 1))
plot(
  p,
  mean_col2,
  main = "weekend",
  xlab = "5-minute interval",
  ylab = "average number of steps taken",
  type = "l"
)
plot(
  p,
  mean_col1,
  main = "weekday",
  xlab = "5-minute interval",
  ylab = "average number of steps taken",
  type = "l"
)
par(old_par)

The activity patterns during weekdays seem to hit peaks during certain intervals while the activity patterns during weekends appear to be fairly consistent