This project is the Coursera Getting and Cleaning Data course project. It transforms the raw variable names of the dataset into tidy, descriptive names that are easier to read and understand.
Download and load the datasets
# Fetch the UCI HAR archive and unpack it, then expose the dataset root as
# `dataPath` for the reading steps below.
#
# Every path is derived from `dataDir`, so the directory that gets created is
# the same one the archive is downloaded into and extracted in. `dataDir` is
# relative to the current working directory rather than tied to one user's
# home folder, which keeps the script runnable on other machines.
dataDir <- "Courseradata"
fileUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"
zipFile <- file.path(dataDir, "dataset.zip")
dataPath <- file.path(dataDir, "UCI HAR Dataset")

if (!dir.exists(dataDir)) {
  dir.create(dataDir, recursive = TRUE)
}

# Only touch the network / disk when the extracted dataset is missing, and
# reuse an archive that is already on disk instead of downloading it again.
if (!dir.exists(dataPath)) {
  if (!file.exists(zipFile)) {
    # mode = "wb" keeps the zip intact on platforms that distinguish text and
    # binary writes; the default download method is used so no external curl
    # binary is required.
    download.file(fileUrl, destfile = zipFile, mode = "wb")
  }
  unzip(zipfile = zipFile, exdir = dataDir)
}
Read the datasets
# Small reader: joins the given path components onto `dataPath` and parses
# the file with read.table() defaults.
readHarTable <- function(...) {
  read.table(file.path(dataPath, ...))
}

# Lookup tables shipped at the dataset root.
activityLabels <- readHarTable("activity_labels.txt")
features <- readHarTable("features.txt")
# Feature names are used as column names later, so keep them as plain strings.
features[["V2"]] <- as.character(features[["V2"]])

# Training partition: subject ids, measurements, activity codes.
subjectTrain <- readHarTable("train", "subject_train.txt")
featuresTrain <- readHarTable("train", "X_train.txt")
activityTrain <- readHarTable("train", "y_train.txt")

# Test partition, same three files.
subjectTest <- readHarTable("test", "subject_test.txt")
featuresTest <- readHarTable("test", "X_test.txt")
activityTest <- readHarTable("test", "y_test.txt")
Merge the training and the test sets to create one data set
# Assemble each partition column-wise in the order subject | features | activity,
# then stack the training rows on top of the test rows.
trainData <- do.call(cbind, list(subjectTrain, featuresTrain, activityTrain))
testData <- do.call(cbind, list(subjectTest, featuresTest, activityTest))
mergeData <- do.call(rbind, list(trainData, testData))

# Label the columns to match the assembly order above; the middle names come
# from the second column of features.txt.
names(mergeData) <- c("subject", features$V2, "activity")
# names(mergeData)  # check
Subset the measurements whose names include the mean or the standard deviation
# Keep the two identifier columns plus every measurement whose name contains
# the lowercase tokens "mean" or "std" (this also keeps the meanFreq() columns).
keepPattern <- "subject|mean|std|activity"
selectedColumns <- grepl(keepPattern, names(mergeData))
mergeData <- mergeData[, selectedColumns]
colnames(mergeData) # check
## [1] "subject" "tBodyAcc-mean()-X"
## [3] "tBodyAcc-mean()-Y" "tBodyAcc-mean()-Z"
## [5] "tBodyAcc-std()-X" "tBodyAcc-std()-Y"
## [7] "tBodyAcc-std()-Z" "tGravityAcc-mean()-X"
## [9] "tGravityAcc-mean()-Y" "tGravityAcc-mean()-Z"
## [11] "tGravityAcc-std()-X" "tGravityAcc-std()-Y"
## [13] "tGravityAcc-std()-Z" "tBodyAccJerk-mean()-X"
## [15] "tBodyAccJerk-mean()-Y" "tBodyAccJerk-mean()-Z"
## [17] "tBodyAccJerk-std()-X" "tBodyAccJerk-std()-Y"
## [19] "tBodyAccJerk-std()-Z" "tBodyGyro-mean()-X"
## [21] "tBodyGyro-mean()-Y" "tBodyGyro-mean()-Z"
## [23] "tBodyGyro-std()-X" "tBodyGyro-std()-Y"
## [25] "tBodyGyro-std()-Z" "tBodyGyroJerk-mean()-X"
## [27] "tBodyGyroJerk-mean()-Y" "tBodyGyroJerk-mean()-Z"
## [29] "tBodyGyroJerk-std()-X" "tBodyGyroJerk-std()-Y"
## [31] "tBodyGyroJerk-std()-Z" "tBodyAccMag-mean()"
## [33] "tBodyAccMag-std()" "tGravityAccMag-mean()"
## [35] "tGravityAccMag-std()" "tBodyAccJerkMag-mean()"
## [37] "tBodyAccJerkMag-std()" "tBodyGyroMag-mean()"
## [39] "tBodyGyroMag-std()" "tBodyGyroJerkMag-mean()"
## [41] "tBodyGyroJerkMag-std()" "fBodyAcc-mean()-X"
## [43] "fBodyAcc-mean()-Y" "fBodyAcc-mean()-Z"
## [45] "fBodyAcc-std()-X" "fBodyAcc-std()-Y"
## [47] "fBodyAcc-std()-Z" "fBodyAcc-meanFreq()-X"
## [49] "fBodyAcc-meanFreq()-Y" "fBodyAcc-meanFreq()-Z"
## [51] "fBodyAccJerk-mean()-X" "fBodyAccJerk-mean()-Y"
## [53] "fBodyAccJerk-mean()-Z" "fBodyAccJerk-std()-X"
## [55] "fBodyAccJerk-std()-Y" "fBodyAccJerk-std()-Z"
## [57] "fBodyAccJerk-meanFreq()-X" "fBodyAccJerk-meanFreq()-Y"
## [59] "fBodyAccJerk-meanFreq()-Z" "fBodyGyro-mean()-X"
## [61] "fBodyGyro-mean()-Y" "fBodyGyro-mean()-Z"
## [63] "fBodyGyro-std()-X" "fBodyGyro-std()-Y"
## [65] "fBodyGyro-std()-Z" "fBodyGyro-meanFreq()-X"
## [67] "fBodyGyro-meanFreq()-Y" "fBodyGyro-meanFreq()-Z"
## [69] "fBodyAccMag-mean()" "fBodyAccMag-std()"
## [71] "fBodyAccMag-meanFreq()" "fBodyBodyAccJerkMag-mean()"
## [73] "fBodyBodyAccJerkMag-std()" "fBodyBodyAccJerkMag-meanFreq()"
## [75] "fBodyBodyGyroMag-mean()" "fBodyBodyGyroMag-std()"
## [77] "fBodyBodyGyroMag-meanFreq()" "fBodyBodyGyroJerkMag-mean()"
## [79] "fBodyBodyGyroJerkMag-std()" "fBodyBodyGyroJerkMag-meanFreq()"
## [81] "activity"
Name the activities with descriptive names
# Turn the numeric activity codes into a factor: the first column of
# activityLabels supplies the codes (levels), the second the readable labels.
mergeData[["activity"]] <- factor(
  mergeData[["activity"]],
  levels = activityLabels[[1]],
  labels = activityLabels[[2]]
)
# Subject ids are identifiers, not quantities, so store them as a factor too.
mergeData[["subject"]] <- as.factor(mergeData[["subject"]])
head(mergeData[["activity"]]) # check
## [1] STANDING STANDING STANDING STANDING STANDING STANDING
## 6 Levels: WALKING WALKING_UPSTAIRS WALKING_DOWNSTAIRS ... LAYING
Clean the names to appropriate descriptive variable names
# Ordered (pattern, replacement) pairs used to expand the abbreviated feature
# names. They are applied one after another in exactly this order: punctuation
# is stripped first, then the leading t/f domain prefix is spelled out, then
# the sensor/statistic abbreviations are expanded, and finally the duplicated
# "BodyBody" token is collapsed.
nameRules <- list(
  c('[-()]', ''),
  c("^t", "time"),
  c("^f", "frequency"),
  c("Acc", "Accelerometer"),
  c("Gyro", "Gyroscope"),
  c("Mag", "Magnitude"),
  c("mean", "Mean"),
  c("std", "Std"),
  c("BodyBody", "Body")
)

cleanNames <- colnames(mergeData)
for (rule in nameRules) {
  cleanNames <- gsub(rule[1], rule[2], cleanNames)
}
colnames(mergeData) <- cleanNames
names(mergeData) # check
## [1] "subject"
## [2] "timeBodyAccelerometerMeanX"
## [3] "timeBodyAccelerometerMeanY"
## [4] "timeBodyAccelerometerMeanZ"
## [5] "timeBodyAccelerometerStdX"
## [6] "timeBodyAccelerometerStdY"
## [7] "timeBodyAccelerometerStdZ"
## [8] "timeGravityAccelerometerMeanX"
## [9] "timeGravityAccelerometerMeanY"
## [10] "timeGravityAccelerometerMeanZ"
## [11] "timeGravityAccelerometerStdX"
## [12] "timeGravityAccelerometerStdY"
## [13] "timeGravityAccelerometerStdZ"
## [14] "timeBodyAccelerometerJerkMeanX"
## [15] "timeBodyAccelerometerJerkMeanY"
## [16] "timeBodyAccelerometerJerkMeanZ"
## [17] "timeBodyAccelerometerJerkStdX"
## [18] "timeBodyAccelerometerJerkStdY"
## [19] "timeBodyAccelerometerJerkStdZ"
## [20] "timeBodyGyroscopeMeanX"
## [21] "timeBodyGyroscopeMeanY"
## [22] "timeBodyGyroscopeMeanZ"
## [23] "timeBodyGyroscopeStdX"
## [24] "timeBodyGyroscopeStdY"
## [25] "timeBodyGyroscopeStdZ"
## [26] "timeBodyGyroscopeJerkMeanX"
## [27] "timeBodyGyroscopeJerkMeanY"
## [28] "timeBodyGyroscopeJerkMeanZ"
## [29] "timeBodyGyroscopeJerkStdX"
## [30] "timeBodyGyroscopeJerkStdY"
## [31] "timeBodyGyroscopeJerkStdZ"
## [32] "timeBodyAccelerometerMagnitudeMean"
## [33] "timeBodyAccelerometerMagnitudeStd"
## [34] "timeGravityAccelerometerMagnitudeMean"
## [35] "timeGravityAccelerometerMagnitudeStd"
## [36] "timeBodyAccelerometerJerkMagnitudeMean"
## [37] "timeBodyAccelerometerJerkMagnitudeStd"
## [38] "timeBodyGyroscopeMagnitudeMean"
## [39] "timeBodyGyroscopeMagnitudeStd"
## [40] "timeBodyGyroscopeJerkMagnitudeMean"
## [41] "timeBodyGyroscopeJerkMagnitudeStd"
## [42] "frequencyBodyAccelerometerMeanX"
## [43] "frequencyBodyAccelerometerMeanY"
## [44] "frequencyBodyAccelerometerMeanZ"
## [45] "frequencyBodyAccelerometerStdX"
## [46] "frequencyBodyAccelerometerStdY"
## [47] "frequencyBodyAccelerometerStdZ"
## [48] "frequencyBodyAccelerometerMeanFreqX"
## [49] "frequencyBodyAccelerometerMeanFreqY"
## [50] "frequencyBodyAccelerometerMeanFreqZ"
## [51] "frequencyBodyAccelerometerJerkMeanX"
## [52] "frequencyBodyAccelerometerJerkMeanY"
## [53] "frequencyBodyAccelerometerJerkMeanZ"
## [54] "frequencyBodyAccelerometerJerkStdX"
## [55] "frequencyBodyAccelerometerJerkStdY"
## [56] "frequencyBodyAccelerometerJerkStdZ"
## [57] "frequencyBodyAccelerometerJerkMeanFreqX"
## [58] "frequencyBodyAccelerometerJerkMeanFreqY"
## [59] "frequencyBodyAccelerometerJerkMeanFreqZ"
## [60] "frequencyBodyGyroscopeMeanX"
## [61] "frequencyBodyGyroscopeMeanY"
## [62] "frequencyBodyGyroscopeMeanZ"
## [63] "frequencyBodyGyroscopeStdX"
## [64] "frequencyBodyGyroscopeStdY"
## [65] "frequencyBodyGyroscopeStdZ"
## [66] "frequencyBodyGyroscopeMeanFreqX"
## [67] "frequencyBodyGyroscopeMeanFreqY"
## [68] "frequencyBodyGyroscopeMeanFreqZ"
## [69] "frequencyBodyAccelerometerMagnitudeMean"
## [70] "frequencyBodyAccelerometerMagnitudeStd"
## [71] "frequencyBodyAccelerometerMagnitudeMeanFreq"
## [72] "frequencyBodyAccelerometerJerkMagnitudeMean"
## [73] "frequencyBodyAccelerometerJerkMagnitudeStd"
## [74] "frequencyBodyAccelerometerJerkMagnitudeMeanFreq"
## [75] "frequencyBodyGyroscopeMagnitudeMean"
## [76] "frequencyBodyGyroscopeMagnitudeStd"
## [77] "frequencyBodyGyroscopeMagnitudeMeanFreq"
## [78] "frequencyBodyGyroscopeJerkMagnitudeMean"
## [79] "frequencyBodyGyroscopeJerkMagnitudeStd"
## [80] "frequencyBodyGyroscopeJerkMagnitudeMeanFreq"
## [81] "activity"
Create a second independent tidy data set with the average of each variable for each activity and each subject
library(dplyr)

# Average every measurement column within each (subject, activity) pair.
# across(everything(), mean) replaces the deprecated summarise_each(funs(mean));
# inside a grouped summarise() it covers all non-grouping columns.
# .groups = "drop" returns an ungrouped result so no grouping leaks into any
# later step; the rows and columns written below are unaffected by it.
secondData <- mergeData %>%
  group_by(subject, activity) %>%
  summarise(across(everything(), mean), .groups = "drop")

# Write the tidy summary as plain text, without row names or quoting.
write.table(secondData, "tidydata.txt", row.names = FALSE, quote = FALSE)