Loading and preprocessing the data
1 Load the data (i.e. read.csv()).
2 Process/transform the data (if necessary) into a format suitable for your analysis.
# Read the raw activity data. The relative path is resolved against the
# current working directory, so run (or knit) this from the folder that holds
# activity.csv; a hard-coded, user-specific setwd() would fail on any other
# machine.
data <- read.csv("activity.csv")
All missing values in the dataset are ignored.
1 Make a histogram of the total number of steps taken each day
# Total the step counts within each date. sum() is called without na.rm, so a
# date whose counts include an NA gets an NA total.
Dailysteps <- with(data, tapply(steps, date, sum))
hist(Dailysteps)
2 Calculate and report the mean and median total number of steps taken per day
# Per-date mean and median of the individual 5-minute step counts.
# NOTE(review): the task text asks for the mean and median of the *total*
# steps per day; these are per-date statistics of the interval counts --
# confirm which one is intended.
Median <- tapply(X = data$steps, INDEX = as.character(data$date), FUN = median)
Mean <- tapply(X = data$steps, INDEX = data$date, FUN = mean)
Mean
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06
## NA 0.4375 39.4167 42.0694 46.1597 53.5417
## 2012-10-07 2012-10-08 2012-10-09 2012-10-10 2012-10-11 2012-10-12
## 38.2465 NA 44.4826 34.3750 35.7778 60.3542
## 2012-10-13 2012-10-14 2012-10-15 2012-10-16 2012-10-17 2012-10-18
## 43.1458 52.4236 35.2049 52.3750 46.7083 34.9167
## 2012-10-19 2012-10-20 2012-10-21 2012-10-22 2012-10-23 2012-10-24
## 41.0729 36.0938 30.6285 46.7361 30.9653 29.0104
## 2012-10-25 2012-10-26 2012-10-27 2012-10-28 2012-10-29 2012-10-30
## 8.6528 23.5347 35.1354 39.7847 17.4236 34.0938
## 2012-10-31 2012-11-01 2012-11-02 2012-11-03 2012-11-04 2012-11-05
## 53.5208 NA 36.8056 36.7049 NA 36.2465
## 2012-11-06 2012-11-07 2012-11-08 2012-11-09 2012-11-10 2012-11-11
## 28.9375 44.7326 11.1771 NA NA 43.7778
## 2012-11-12 2012-11-13 2012-11-14 2012-11-15 2012-11-16 2012-11-17
## 37.3785 25.4722 NA 0.1424 18.8924 49.7882
## 2012-11-18 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23
## 52.4653 30.6979 15.5278 44.3993 70.9271 73.5903
## 2012-11-24 2012-11-25 2012-11-26 2012-11-27 2012-11-28 2012-11-29
## 50.2708 41.0903 38.7569 47.3819 35.3576 24.4688
## 2012-11-30
## NA
# Show the per-date medians stored in Median.
Median
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06
## NA 0 0 0 0 0
## 2012-10-07 2012-10-08 2012-10-09 2012-10-10 2012-10-11 2012-10-12
## 0 NA 0 0 0 0
## 2012-10-13 2012-10-14 2012-10-15 2012-10-16 2012-10-17 2012-10-18
## 0 0 0 0 0 0
## 2012-10-19 2012-10-20 2012-10-21 2012-10-22 2012-10-23 2012-10-24
## 0 0 0 0 0 0
## 2012-10-25 2012-10-26 2012-10-27 2012-10-28 2012-10-29 2012-10-30
## 0 0 0 0 0 0
## 2012-10-31 2012-11-01 2012-11-02 2012-11-03 2012-11-04 2012-11-05
## 0 NA 0 0 NA 0
## 2012-11-06 2012-11-07 2012-11-08 2012-11-09 2012-11-10 2012-11-11
## 0 0 0 NA NA 0
## 2012-11-12 2012-11-13 2012-11-14 2012-11-15 2012-11-16 2012-11-17
## 0 0 NA 0 0 0
## 2012-11-18 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23
## 0 0 0 0 0 0
## 2012-11-24 2012-11-25 2012-11-26 2012-11-27 2012-11-28 2012-11-29
## 0 0 0 0 0 0
## 2012-11-30
## NA
1 Make a time series plot (i.e. type = "l") of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis)
# Average number of steps for each 5-minute interval across all days, using
# only rows without missing values.
data1 <- data[order(data$interval), ]
data2 <- na.omit(data1)

# data2 is sorted by interval, so unique() returns the intervals in ascending
# order -- the same order in which tapply() lays out its groups.
interval <- unique(data2$interval)

# One grouped call replaces the manual loop that grew two vectors element by
# element and shadowed base::sum with a global variable. as.vector() keeps
# interval_mean a plain, unnamed numeric vector.
interval_mean <- as.vector(tapply(data2$steps, data2$interval, mean))

plot(interval, interval_mean, type = "l")
2 Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?
# Position of the largest interval mean. which.max() skips NAs and returns a
# single index, unlike matching the number's text with grep(), where "." acts
# as a regex wildcard and substring hits can yield several positions.
maxno <- which.max(interval_mean)

# The 5-minute interval that has, on average, the most steps.
Max_5minuteInterval <- interval[[maxno]]
Max_5minuteInterval
## [1] 835
1 Calculate and report the total number of missing values in the dataset (i.e. the total number of rows with NAs)
# Rows with no missing value in any column, and the overall row count.
completedrow <- sum(complete.cases(data))
totalrow <- dim(data)[1]

# Total number of rows that contain at least one NA.
NAsrow <- totalrow - completedrow
NAsrow
## [1] 2304
2 Devise a strategy for filling in all of the missing values in the dataset. The strategy does not need to be sophisticated. For example, you could use the mean/median for that day, or the mean for that 5-minute interval, etc.
# Imputation strategy: replace every missing step count with the mean of its
# 5-minute interval (interval_mean, indexed in parallel with interval).
data_new <- data
missing_steps <- is.na(data_new$steps)

# Look each missing row's interval up by value instead of replicating the
# means a fixed 61 times; this does not depend on the number of days or on the
# row order of the data, and it is a no-op when nothing is missing.
data_new$steps[missing_steps] <-
  interval_mean[match(data_new$interval[missing_steps], interval)]

# Keep the rows ordered by date, as in the original dataset.
data_new <- data_new[order(data_new$date), ]
# Repeat the steps from the section "What is mean total number of steps taken
# per day?" on data_new: total the steps per date and draw the histogram.
Dailysteps_new <- tapply(X = data_new$steps, INDEX = data_new$date, FUN = sum)
hist(Dailysteps_new)
3 Calculate and report the mean and median total number of steps taken per day in the new dataset
# Per-date mean and median of the 5-minute step counts in data_new.
# NOTE(review): the task text asks for the mean and median of the *total*
# steps per day; these are per-date statistics of the interval counts --
# confirm which one is intended.
Median <- tapply(
  X = data_new$steps, INDEX = as.character(data_new$date), FUN = median
)
Mean <- tapply(X = data_new$steps, INDEX = data_new$date, FUN = mean)
Mean
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06
## 37.3826 0.4375 39.4167 42.0694 46.1597 53.5417
## 2012-10-07 2012-10-08 2012-10-09 2012-10-10 2012-10-11 2012-10-12
## 38.2465 37.3826 44.4826 34.3750 35.7778 60.3542
## 2012-10-13 2012-10-14 2012-10-15 2012-10-16 2012-10-17 2012-10-18
## 43.1458 52.4236 35.2049 52.3750 46.7083 34.9167
## 2012-10-19 2012-10-20 2012-10-21 2012-10-22 2012-10-23 2012-10-24
## 41.0729 36.0938 30.6285 46.7361 30.9653 29.0104
## 2012-10-25 2012-10-26 2012-10-27 2012-10-28 2012-10-29 2012-10-30
## 8.6528 23.5347 35.1354 39.7847 17.4236 34.0938
## 2012-10-31 2012-11-01 2012-11-02 2012-11-03 2012-11-04 2012-11-05
## 53.5208 37.3826 36.8056 36.7049 37.3826 36.2465
## 2012-11-06 2012-11-07 2012-11-08 2012-11-09 2012-11-10 2012-11-11
## 28.9375 44.7326 11.1771 37.3826 37.3826 43.7778
## 2012-11-12 2012-11-13 2012-11-14 2012-11-15 2012-11-16 2012-11-17
## 37.3785 25.4722 37.3826 0.1424 18.8924 49.7882
## 2012-11-18 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23
## 52.4653 30.6979 15.5278 44.3993 70.9271 73.5903
## 2012-11-24 2012-11-25 2012-11-26 2012-11-27 2012-11-28 2012-11-29
## 50.2708 41.0903 38.7569 47.3819 35.3576 24.4688
## 2012-11-30
## 37.3826
# Show the per-date medians stored in Median.
Median
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06
## 34.11 0.00 0.00 0.00 0.00 0.00
## 2012-10-07 2012-10-08 2012-10-09 2012-10-10 2012-10-11 2012-10-12
## 0.00 34.11 0.00 0.00 0.00 0.00
## 2012-10-13 2012-10-14 2012-10-15 2012-10-16 2012-10-17 2012-10-18
## 0.00 0.00 0.00 0.00 0.00 0.00
## 2012-10-19 2012-10-20 2012-10-21 2012-10-22 2012-10-23 2012-10-24
## 0.00 0.00 0.00 0.00 0.00 0.00
## 2012-10-25 2012-10-26 2012-10-27 2012-10-28 2012-10-29 2012-10-30
## 0.00 0.00 0.00 0.00 0.00 0.00
## 2012-10-31 2012-11-01 2012-11-02 2012-11-03 2012-11-04 2012-11-05
## 0.00 34.11 0.00 0.00 34.11 0.00
## 2012-11-06 2012-11-07 2012-11-08 2012-11-09 2012-11-10 2012-11-11
## 0.00 0.00 0.00 34.11 34.11 0.00
## 2012-11-12 2012-11-13 2012-11-14 2012-11-15 2012-11-16 2012-11-17
## 0.00 0.00 34.11 0.00 0.00 0.00
## 2012-11-18 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23
## 0.00 0.00 0.00 0.00 0.00 0.00
## 2012-11-24 2012-11-25 2012-11-26 2012-11-27 2012-11-28 2012-11-29
## 0.00 0.00 0.00 0.00 0.00 0.00
## 2012-11-30
## 34.11
1 Create a new factor variable in the dataset with two levels – “weekday” and “weekend” indicating whether a given date is a weekday or weekend day.
# Parse the dates as Date rather than the list-based POSIXlt that strptime()
# returns, which is awkward to keep as a data-frame column.
data_new$date <- as.Date(as.character(data_new$date), format = "%Y-%m-%d")
data_new$weekday <- weekdays(data_new$date)

# Classify each row as "Weekday" or "Weekend". The ISO day number ("%u":
# 1 = Monday ... 7 = Sunday) is used instead of the day name, so the test does
# not depend on the locale or on spelling (the previous "Satuday" literal never
# matched, so Saturdays were counted as weekdays). The result is stored as a
# two-level factor, as the task requires.
is_weekend <- format(data_new$date, "%u") %in% c("6", "7")
data_new$week <- factor(
  ifelse(is_weekend, "Weekend", "Weekday"),
  levels = c("Weekday", "Weekend")
)
2 Make a panel plot containing a time series plot (i.e. type = "l") of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis).
# Mean steps per 5-minute interval for one day type ("Weekday" or "Weekend").
# Returns a data frame with one row per interval (columns: interval, mean).
# The means are computed within the day type; the earlier code recycled the
# overall interval means across every row, so values were neither aligned
# with their interval nor specific to weekdays or weekends.
interval_means_for <- function(day_type) {
  rows <- data_new$week == day_type
  means <- tapply(data_new$steps[rows], data_new$interval[rows], mean,
                  na.rm = TRUE)
  data.frame(interval = as.numeric(names(means)), mean = as.vector(means))
}

data_weekday <- interval_means_for("Weekday")
data_weekend <- interval_means_for("Weekend")

# Two stacked panels: weekend on top, weekday below. The previous graphics
# settings are saved and restored afterwards.
old_par <- par(mar = c(4, 4, 0.3, 0.3), mfcol = c(2, 1))
plot(data_weekend$interval, data_weekend$mean, type = "l",
     xlab = "Interval", ylab = "Number of steps")
title(main = "Weekend", line = -1)  # negative line: label inside the panel
plot(data_weekday$interval, data_weekday$mean, type = "l",
     xlab = "Interval", ylab = "Number of steps")
title(main = "Weekday", line = -1)
par(old_par)