# Coursera - Johns Hopkins Data Science Specialization
# Course 3 - Getting and Cleaning Data - Week 4 - Assignment
# https://github.com/gangxu79/Getting-and-Cleaning-Data-Week-4-Assignment/blob/master/run_analysis.R

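# Before running, set the working directory to the folder that contains the
# "UCI HAR Dataset" directory: every file path in this script is relative to it.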

# Load dplyr library

library(dplyr)
## 
## (console output printed when dplyr is attached, translated from French)
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the three tables of the test split. Later steps treat X as the
# measurement table, y as the activity codes and subject as the subject ids.

subject_test <- read.table(file = "UCI HAR Dataset/test/subject_test.txt", header = FALSE)
y_test <- read.table(file = "UCI HAR Dataset/test/y_test.txt", header = FALSE)
x_test <- read.table(file = "UCI HAR Dataset/test/X_test.txt", header = FALSE)

# Load the matching three tables of the training split

subject_train <- read.table(file = "UCI HAR Dataset/train/subject_train.txt", header = FALSE)
y_train <- read.table(file = "UCI HAR Dataset/train/y_train.txt", header = FALSE)
x_train <- read.table(file = "UCI HAR Dataset/train/X_train.txt", header = FALSE)

# Stack each pair of tables row-wise, training rows first and test rows after,
# so that row i of x_data, y_data and subject_data come from the same source row

subject_data <- rbind(subject_train, subject_test)
y_data <- rbind(y_train, y_test)
x_data <- rbind(x_train, x_test)

# Read the features file. Its first column is used below as the column
# position inside x_data and its second column as the feature name.

features <- read.table("UCI HAR Dataset/features.txt")

# Extract only the measurements on the mean and standard deviation for each measurement

# The pattern matches the literal text "mean()" or "std()" (parentheses
# included), so a name that merely contains "mean" -- e.g. "meanFreq()" --
# is not selected.

features_extracted <- features[grep("mean\\(\\)|std\\(\\)", features[, 2]), ]

# drop = FALSE keeps x_data a data frame even if only one feature matches;
# without it a single selected column would collapse to a plain vector.
x_data <- x_data[, features_extracted[, 1], drop = FALSE]

# Read activity labels. The first column is used as the activity code and
# the second column as the descriptive name for that code.

activity_labels <- read.table("UCI HAR Dataset/activity_labels.txt")

# Use descriptive activity names to name the activities in the data set

colnames(y_data) <- "Activity"

# Fail early, with a readable message, if the data contains a code that the
# labels file does not describe (NA codes are left alone and stay NA).
unknown_codes <- unique(
  y_data$Activity[!is.na(y_data$Activity) &
                    !(y_data$Activity %in% activity_labels[, 1])]
)
if (length(unknown_codes) > 0) {
  stop(
    "Activity codes missing from activity_labels.txt: ",
    paste(unknown_codes, collapse = ", "),
    call. = FALSE
  )
}

# Pass the codes explicitly as `levels` so each label is paired with its own
# code. Without `levels`, factor() pairs the labels with the sorted distinct
# codes found in the data, which mislabels activities when the labels file is
# not in code order and fails when some code never occurs in the data.
y_data$Activity <- factor(
  y_data$Activity,
  levels = activity_labels[, 1],
  labels = as.character(activity_labels[, 2])
)

# Give every measurement column a descriptive name: the feature names kept in
# the second column of features_extracted replace the default column names

colnames(x_data) <- features_extracted[[2]]

# Create a second, independent tidy data set with the average of each variable for each activity and each subject

# Name the subject column

colnames(subject_data) <- "Subject"

# Combine the columns of subjects, activities and extracted measurements
# (the three tables are bound side by side, so they must have matching rows)
data_combined <- cbind(subject_data, y_data, x_data)

# Calculate the mean of every measurement column for each Activity/Subject pair.
# summarize_all() only removes the last grouping level, so the result would
# still be grouped by Activity; ungroup() returns a plain, ungrouped table.
data_mean <- data_combined %>%
  group_by(Activity, Subject) %>%
  summarize_all(mean) %>%
  ungroup()

# Output result to text file (argument spelled out in full as `row.names`
# instead of relying on partial argument matching)
write.table(data_mean, "tidy_set.txt", row.names = FALSE)