Project Assignment 1

Mean total steps taken per day

Reading and having a first look at data

# Load the activity measurements from the CSV file in the working directory.
activity <- read.csv(file = "activity.csv", header = TRUE)
# Column names of the loaded data frame.
colnames(activity)
## [1] "steps"    "date"     "interval"
# Column types and the leading values of each column.
str(object = activity)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : chr  "2012-10-01" "2012-10-01" "2012-10-01" "2012-10-01" ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...
# First six rows.
head(activity, n = 6L)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25

Total steps taken in a day

# Total steps per day. With the formula interface, rows whose `steps` is NA are
# dropped before summing, so days without any recorded data are absent from the
# result (the first row below is 2012-10-02, not 2012-10-01).
totalsteps <- aggregate(steps ~ date, data = activity, FUN = sum, na.rm = TRUE)
# Have a look at the aggregated data set
head(totalsteps)
##         date steps
## 1 2012-10-02   126
## 2 2012-10-03 11352
## 3 2012-10-04 12116
## 4 2012-10-05 13294
## 5 2012-10-06 15420
## 6 2012-10-07 11015
# Distribution of the daily totals
hist(
  totalsteps$steps,
  xlab = "Steps per day",
  main = "total number of steps per day",
  col = "red"
)

Mean and median steps

# Mean and median of the daily step totals.
meansteps <- with(totalsteps, mean(steps))
mediansteps <- with(totalsteps, median(steps))
print(meansteps)
## [1] 10766.19
print(mediansteps)
## [1] 10765

Average daily activity pattern

# Average number of steps for each 5-minute interval, taken across all days.
intervalsteps <- aggregate(steps ~ interval, data = activity, FUN = mean, na.rm = TRUE)
# Have a look at the aggregated data set
str(intervalsteps)
## 'data.frame':    288 obs. of  2 variables:
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...
##  $ steps   : num  1.717 0.3396 0.1321 0.1509 0.0755 ...
head(intervalsteps)
##   interval     steps
## 1        0 1.7169811
## 2        5 0.3396226
## 3       10 0.1320755
## 4       15 0.1509434
## 5       20 0.0754717
## 6       25 2.0943396
# Time-series line plot. The y values are per-interval means (FUN = mean above),
# so the axis is labelled as an average rather than a total.
plot(
  intervalsteps$interval,
  intervalsteps$steps,
  col = "green",
  xlab = "Intervals",
  ylab = "Average steps per interval",
  main = "Number of steps per interval",
  type = "l"
)

Finding the interval with the maximum average number of steps

# Locate the row holding the largest interval average once, then read both the
# value and its interval from that row. which.max() avoids comparing doubles
# with `==`; if several rows tie for the maximum it returns the first one.
peak_row <- which.max(intervalsteps$steps)
maxsteps <- intervalsteps$steps[peak_row]
# Maximum average number of steps taken in a 5-minute interval is:
maxsteps
## [1] 206.1698
maxinterval <- intervalsteps$interval[peak_row]
# Interval in which the maximum average number of steps is taken:
maxinterval
## [1] 835

Imputing missing values

# The number of missing values in our activity dataset, counted column by
# column and then added up:
sum(vapply(activity, function(column) sum(is.na(column)), integer(1)))
## [1] 2304

Strategy for filling the NA’s

# Inspect where the missing values fall, by interval and by date.
missingvalues <- subset(activity, is.na(steps))
# Stack the two histograms. par() returns the previous settings, which are
# restored once both plots are drawn so later plots are not affected.
old_par <- par(mfrow = c(2, 1), mar = c(3, 3, 2, 2))
hist(missingvalues$interval, main = "NA's per interval")
# Bin by calendar day on a Date axis and show counts. Binning as.POSIXct()
# values with a numeric break count produced the warning
# "NAs produced by integer overflow" from hist().
hist(
  as.Date(missingvalues$date),
  breaks = "days",
  freq = TRUE,
  main = "NA's per date"
)
par(old_par)

# The NAs are spread evenly over all intervals but occur on only 8 days, so each
# missing value will be replaced by the mean of its 5-minute interval taken
# across the whole data set.

Creating a new data set with the missing values filled in

# Mean steps for each 5-minute interval across all days; the result is named by
# interval, which is what the lookup below relies on.
meanstepinterval <- tapply(activity$steps, activity$interval, mean, na.rm = TRUE)
# Rows whose step count is missing.
missing_rows <- is.na(activity$steps)
# Copy the data and overwrite only the missing entries, looking up each row's
# interval mean by name and rounding it to a whole number of steps. Row order
# is left as in `activity`.
imputedactivity <- activity
imputedactivity$steps[missing_rows] <- as.integer(round(
  meanstepinterval[as.character(activity$interval[missing_rows])]
))
str(imputedactivity)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  2 0 0 0 0 2 1 1 0 1 ...
##  $ date    : chr  "2012-10-01" "2012-10-01" "2012-10-01" "2012-10-01" ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...
sum(is.na(imputedactivity))
## [1] 0
# Thus all the NAs have been filled in

Histogram of total number of steps taken in each day

# Draw the two daily-total histograms side by side. par() returns the previous
# settings, which are restored after both plots are drawn.
old_par <- par(mfrow = c(1, 2))
# Daily totals from the original data (rows with missing steps are dropped).
activitysteps <- aggregate(steps ~ date, data = activity, FUN = sum, na.rm = TRUE)
hist(
  activitysteps$steps,
  xlab = "Steps per day",
  main = "Total steps per day, no NA's",
  col = "blue"
)
# Daily totals from the imputed data.
impactivitysteps <- aggregate(steps ~ date, data = imputedactivity, FUN = sum)
hist(
  impactivitysteps$steps,
  xlab = "Steps per day",
  main = "NA's imputed, Total steps per day",
  col = "blue"
)
par(old_par)

imputedmean <- mean(impactivitysteps$steps)
imputedmedian <- median(impactivitysteps$steps)
imputedmean
## [1] 10765.64
imputedmedian
## [1] 10762
# Side-by-side comparison of mean and median before and after imputation.
# check.names = FALSE keeps the column titles with spaces exactly as written.
result_mm <- data.frame(
  "NA removed" = c(meansteps, mediansteps),
  "Imputed NA values" = c(imputedmean, imputedmedian),
  row.names = c("mean", "median"),
  check.names = FALSE
)
result_mm
##        NA removed Imputed NA values
## mean     10766.19          10765.64
## median   10765.00          10762.00
# Conclusion: imputing the NAs has no big effect on the mean and median

Difference between weekdays and weekends

# Label every observation as weekend or weekday. `%in%` tests each day name
# against both "Saturday" and "Sunday"; `==` would recycle the two names along
# the rows and compare each row against only one of them.
# NOTE(review): weekdays() returns locale-dependent names; this assumes an
# English locale - confirm.
imputedactivity$daytype <- ifelse(
  weekdays(as.Date(imputedactivity$date)) %in% c("Saturday", "Sunday"),
  "Weekend",
  "weekday"
)
head(imputedactivity)
##   steps       date interval daytype
## 1     2 2012-10-01        0 weekday
## 2     0 2012-10-01        5 weekday
## 3     0 2012-10-01       10 weekday
## 4     0 2012-10-01       15 weekday
## 5     0 2012-10-01       20 weekday
## 6     2 2012-10-01       25 weekday
imputedactivity$daytype <- factor(imputedactivity$daytype)
# Average steps per interval, computed separately for each day type.
stepsdaytype <- aggregate(steps ~ interval + daytype, data = imputedactivity, FUN = mean)
head(stepsdaytype)
# NOTE(review): the output below was produced with the earlier `==` comparison,
# which labelled part of the weekend rows as weekday; re-run to refresh it.
##   interval daytype     steps
## 1        0 weekday 1.9811321
## 2        5 weekday 0.3396226
## 3       10 weekday 0.1320755
## 4       15 weekday 0.1509434
## 5       20 weekday 0.0754717
## 6       25 weekday 1.3773585
library(ggplot2)
# One panel per day type, stacked vertically, sharing the interval axis.
g <- ggplot(stepsdaytype, aes(interval, steps))
g +
  geom_line(col = "green") +
  facet_grid(daytype ~ .) +
  labs(x = "Intervals", y = "Average Steps", title = "Activity Pattern")

Conclusion: The subject gets an earlier start on weekdays than on weekends.