The purpose of this project is to demonstrate your ability to collect, work with, and clean a data set. The goal is to prepare tidy data that can be used for later analysis.
The data come from an experiment carried out with a group of 30 volunteers aged 19-48 years. Each person performed six activities (WALKING, WALKING_UPSTAIRS, WALKING_DOWNSTAIRS, SITTING, STANDING, LAYING) while wearing a smartphone (Samsung Galaxy S II) on the waist.
For each record, the dataset provides: - Triaxial acceleration from the accelerometer (total acceleration) and the estimated body acceleration. - Triaxial angular velocity from the gyroscope. - A 561-feature vector with time and frequency domain variables. - Its activity label. - An identifier of the subject who carried out the experiment.
# This walkthrough was run in the RStudio IDE.
# Optionally switch to a dedicated work folder first, e.g.:
# setwd("path/to/work/folder")
# Print the current working directory; the relative paths used below are
# resolved against it.
getwd()
## [1] "/home/pc/GettingCleaning"
# Fetch and unpack the UCI HAR data set unless it is already available.
# The archive name is defined once so that the existence check and the
# download target cannot drift apart.
file.zip <- "getdata_projectfiles_UCI HAR Dataset.zip"
url.file <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"

if (!file.exists(file.zip)) {
  # mode = "wb" writes the archive as binary; without it the zip can be
  # corrupted on Windows, where the default transfer mode is text.
  download.file(url.file, file.zip, mode = "wb")
}

# Unpack whenever the extracted folder is missing, not only right after a
# fresh download: the archive may already be present but never extracted.
if (!file.exists("UCI HAR Dataset")) {
  unzip(file.zip)
}
# List the contents of the current working directory.
list.files(path = ".")
## [1] "activityLabels.png"
## [2] "CodeBook.md"
## [3] "dataActivity.png"
## [4] "dataActivityTest.png"
## [5] "dataActivityTrain.png"
## [6] "dataFeaturesNames.png"
## [7] "dataFeatures.png"
## [8] "dataFeaturesTest.png"
## [9] "dataFeaturesTrain.png"
## [10] "dataSubject.png"
## [11] "dataSubjectTest.png"
## [12] "dataSubjectTrain.png"
## [13] "extractedData$activity.png"
## [14] "extractedData.png"
## [15] "getdata_projectfiles_UCI HAR Dataset.zip"
## [16] "Getting_and_Cleaning_Data_Course_Project.html"
## [17] "Getting_and_Cleaning_Data_Course_Project.Rmd"
## [18] "magz.png"
## [19] "mergeColData.png"
## [20] "newdata_temp.txt"
## [21] "newdata.txt"
## [22] "README.md"
## [23] "rsconnect"
## [24] "run_analysis.R"
## [25] "UCI HAR Dataset"
# List the top-level contents of the extracted data set folder.
list.files(path = "UCI HAR Dataset/")
## [1] "activity_labels.txt" "features_info.txt" "features.txt"
## [4] "README.txt" "test" "train"
# Load the raw tables. Every file is read without a header row, so
# read.table() assigns the default column names V1, V2, ...

# Training partition: subject ids, activity codes, feature vectors.
dataSubjectTrain <- read.table(
  file = "UCI HAR Dataset/train/subject_train.txt",
  header = FALSE
)
dataActivityTrain <- read.table(
  file = "UCI HAR Dataset/train/y_train.txt",
  header = FALSE
)
dataFeaturesTrain <- read.table(
  file = "UCI HAR Dataset/train/X_train.txt",
  header = FALSE
)

# Test partition: same three tables.
dataSubjectTest <- read.table(
  file = "UCI HAR Dataset/test/subject_test.txt",
  header = FALSE
)
dataActivityTest <- read.table(
  file = "UCI HAR Dataset/test/y_test.txt",
  header = FALSE
)
dataFeaturesTest <- read.table(
  file = "UCI HAR Dataset/test/X_test.txt",
  header = FALSE
)
# Preview the first six subject ids of the training partition.
head(dataSubjectTrain, n = 6L)
## V1
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
# Preview the first six activity codes of the training partition.
head(dataActivityTrain, n = 6L)
## V1
## 1 5
## 2 5
## 3 5
## 4 5
## 5 5
## 6 5
# Preview the first five rows and first seven columns of the training features.
dataFeaturesTrain[1:5, 1:7]
## V1 V2 V3 V4 V5 V6
## 1 0.2885845 -0.02029417 -0.1329051 -0.9952786 -0.9831106 -0.9135264
## 2 0.2784188 -0.01641057 -0.1235202 -0.9982453 -0.9753002 -0.9603220
## 3 0.2796531 -0.01946716 -0.1134617 -0.9953796 -0.9671870 -0.9789440
## 4 0.2791739 -0.02620065 -0.1232826 -0.9960915 -0.9834027 -0.9906751
## 5 0.2766288 -0.01656965 -0.1153619 -0.9981386 -0.9808173 -0.9904816
## V7
## 1 -0.9951121
## 2 -0.9988072
## 3 -0.9965199
## 4 -0.9970995
## 5 -0.9983211
# 1. Merge the training and the test sets to create one data set ----
# Training rows are stacked above test rows. The same order is used for all
# three tables so that row i refers to the same observation in each of them.
dataSubject <- rbind(dataSubjectTrain, dataSubjectTest)
dataActivity <- rbind(dataActivityTrain, dataActivityTest)
dataFeatures <- rbind(dataFeaturesTrain, dataFeaturesTest)

# Add column headers.
colnames(dataSubject) <- "subject"
colnames(dataActivity) <- "activity"

# Column V2 of features.txt supplies the feature names used as headers.
# `header` is spelled out in full rather than relying on partial argument
# matching of `head`.
dataFeaturesNames <- read.table("UCI HAR Dataset/features.txt", header = FALSE)
# Fail early if the name list does not line up with the feature columns.
stopifnot(nrow(dataFeaturesNames) == ncol(dataFeatures))
colnames(dataFeatures) <- as.character(dataFeaturesNames$V2)

# Combining tables side by side is only meaningful when the three tables
# describe the same observations, so check the row counts first.
stopifnot(
  nrow(dataFeatures) == nrow(dataActivity),
  nrow(dataFeatures) == nrow(dataSubject)
)
mergeColData <- cbind(dataFeatures, dataActivity, dataSubject)
# 2. Extract only the measurements on the mean and standard deviation ----
# Keep every column whose name contains "mean" or "std" (case-insensitive).
# This also selects the meanFreq() and angle(...Mean...) features.
colWithMeanSTD <- grep(".*mean.*|.*std.*", names(mergeColData), ignore.case=TRUE)

# Locate the identifier columns by name instead of by hard-coded position, so
# the selection stays correct if the number or order of columns changes.
idColumns <- match(c("activity", "subject"), names(mergeColData))
stopifnot(!anyNA(idColumns))

requiredColumns <- c(colWithMeanSTD, idColumns)
extractedData <- mergeColData[, requiredColumns]
#dim(extractedData)
#dim(extractedData)
# 3. Use descriptive activity names to name the activities in the data set ----
# Replace each numeric value in the "activity" column with its description,
# for example 5 = STANDING.
activityLabels <- read.table("UCI HAR Dataset/activity_labels.txt",header = FALSE)

# Look the label up by activity code rather than by row position, so the result
# does not depend on the order of rows in activity_labels.txt.
# Assumes V1 holds the numeric code and V2 the matching label.
labelRow <- match(extractedData$activity, activityLabels$V1)
if (anyNA(labelRow)) {
  # An unmatched code would otherwise become NA and silently drop out of the
  # later aggregation.
  stop("Found activity codes that are missing from activity_labels.txt",
       call. = FALSE)
}
extractedData$activity <- activityLabels$V2[labelRow]
# 4. Appropriately label the data set with descriptive variable names ----
# Each abbreviation in the column names is expanded to its full description,
# for example a leading "t" becomes "time". The rules are applied one after
# another in the order listed here.
renameRules <- c(
  "^t" = "time",
  "^f" = "frequency",
  "Acc" = "Accelerometer",
  "Gyro" = "Gyroscope",
  "Mag" = "Magnitude",
  "BodyBody" = "Body"
)
for (rulePattern in names(renameRules)) {
  names(extractedData) <- gsub(
    rulePattern, renameRules[[rulePattern]], names(extractedData)
  )
}
# Show the resulting column names.
names(extractedData)
## [1] "timeBodyAccelerometer-mean()-X"
## [2] "timeBodyAccelerometer-mean()-Y"
## [3] "timeBodyAccelerometer-mean()-Z"
## [4] "timeBodyAccelerometer-std()-X"
## [5] "timeBodyAccelerometer-std()-Y"
## [6] "timeBodyAccelerometer-std()-Z"
## [7] "timeGravityAccelerometer-mean()-X"
## [8] "timeGravityAccelerometer-mean()-Y"
## [9] "timeGravityAccelerometer-mean()-Z"
## [10] "timeGravityAccelerometer-std()-X"
## [11] "timeGravityAccelerometer-std()-Y"
## [12] "timeGravityAccelerometer-std()-Z"
## [13] "timeBodyAccelerometerJerk-mean()-X"
## [14] "timeBodyAccelerometerJerk-mean()-Y"
## [15] "timeBodyAccelerometerJerk-mean()-Z"
## [16] "timeBodyAccelerometerJerk-std()-X"
## [17] "timeBodyAccelerometerJerk-std()-Y"
## [18] "timeBodyAccelerometerJerk-std()-Z"
## [19] "timeBodyGyroscope-mean()-X"
## [20] "timeBodyGyroscope-mean()-Y"
## [21] "timeBodyGyroscope-mean()-Z"
## [22] "timeBodyGyroscope-std()-X"
## [23] "timeBodyGyroscope-std()-Y"
## [24] "timeBodyGyroscope-std()-Z"
## [25] "timeBodyGyroscopeJerk-mean()-X"
## [26] "timeBodyGyroscopeJerk-mean()-Y"
## [27] "timeBodyGyroscopeJerk-mean()-Z"
## [28] "timeBodyGyroscopeJerk-std()-X"
## [29] "timeBodyGyroscopeJerk-std()-Y"
## [30] "timeBodyGyroscopeJerk-std()-Z"
## [31] "timeBodyAccelerometerMagnitude-mean()"
## [32] "timeBodyAccelerometerMagnitude-std()"
## [33] "timeGravityAccelerometerMagnitude-mean()"
## [34] "timeGravityAccelerometerMagnitude-std()"
## [35] "timeBodyAccelerometerJerkMagnitude-mean()"
## [36] "timeBodyAccelerometerJerkMagnitude-std()"
## [37] "timeBodyGyroscopeMagnitude-mean()"
## [38] "timeBodyGyroscopeMagnitude-std()"
## [39] "timeBodyGyroscopeJerkMagnitude-mean()"
## [40] "timeBodyGyroscopeJerkMagnitude-std()"
## [41] "frequencyBodyAccelerometer-mean()-X"
## [42] "frequencyBodyAccelerometer-mean()-Y"
## [43] "frequencyBodyAccelerometer-mean()-Z"
## [44] "frequencyBodyAccelerometer-std()-X"
## [45] "frequencyBodyAccelerometer-std()-Y"
## [46] "frequencyBodyAccelerometer-std()-Z"
## [47] "frequencyBodyAccelerometer-meanFreq()-X"
## [48] "frequencyBodyAccelerometer-meanFreq()-Y"
## [49] "frequencyBodyAccelerometer-meanFreq()-Z"
## [50] "frequencyBodyAccelerometerJerk-mean()-X"
## [51] "frequencyBodyAccelerometerJerk-mean()-Y"
## [52] "frequencyBodyAccelerometerJerk-mean()-Z"
## [53] "frequencyBodyAccelerometerJerk-std()-X"
## [54] "frequencyBodyAccelerometerJerk-std()-Y"
## [55] "frequencyBodyAccelerometerJerk-std()-Z"
## [56] "frequencyBodyAccelerometerJerk-meanFreq()-X"
## [57] "frequencyBodyAccelerometerJerk-meanFreq()-Y"
## [58] "frequencyBodyAccelerometerJerk-meanFreq()-Z"
## [59] "frequencyBodyGyroscope-mean()-X"
## [60] "frequencyBodyGyroscope-mean()-Y"
## [61] "frequencyBodyGyroscope-mean()-Z"
## [62] "frequencyBodyGyroscope-std()-X"
## [63] "frequencyBodyGyroscope-std()-Y"
## [64] "frequencyBodyGyroscope-std()-Z"
## [65] "frequencyBodyGyroscope-meanFreq()-X"
## [66] "frequencyBodyGyroscope-meanFreq()-Y"
## [67] "frequencyBodyGyroscope-meanFreq()-Z"
## [68] "frequencyBodyAccelerometerMagnitude-mean()"
## [69] "frequencyBodyAccelerometerMagnitude-std()"
## [70] "frequencyBodyAccelerometerMagnitude-meanFreq()"
## [71] "frequencyBodyAccelerometerJerkMagnitude-mean()"
## [72] "frequencyBodyAccelerometerJerkMagnitude-std()"
## [73] "frequencyBodyAccelerometerJerkMagnitude-meanFreq()"
## [74] "frequencyBodyGyroscopeMagnitude-mean()"
## [75] "frequencyBodyGyroscopeMagnitude-std()"
## [76] "frequencyBodyGyroscopeMagnitude-meanFreq()"
## [77] "frequencyBodyGyroscopeJerkMagnitude-mean()"
## [78] "frequencyBodyGyroscopeJerkMagnitude-std()"
## [79] "frequencyBodyGyroscopeJerkMagnitude-meanFreq()"
## [80] "angle(tBodyAccelerometerMean,gravity)"
## [81] "angle(tBodyAccelerometerJerkMean),gravityMean)"
## [82] "angle(tBodyGyroscopeMean,gravityMean)"
## [83] "angle(tBodyGyroscopeJerkMean,gravityMean)"
## [84] "angle(X,gravityMean)"
## [85] "angle(Y,gravityMean)"
## [86] "angle(Z,gravityMean)"
## [87] "activity"
## [88] "subject"
# 5. From the data set in step 4, create a second, independent tidy data set
#    with the average of each variable for each activity and each subject ----
# `.` on the left-hand side stands for every column that is not a grouping
# variable; each one is averaged within every subject/activity pair.
Data <- aggregate(. ~ subject + activity, extractedData, mean)

# Sort by subject first, then by activity within each subject.
Data <- Data[order(Data$subject, Data$activity), ]

# Write the tidy data set without row names. `row.names` is spelled out in full
# rather than relying on partial argument matching of `row.name`.
write.table(Data, file = "newdata_temp.txt", row.names = FALSE)
source code: https://github.com/magzupao/GettingCleaning/