We were given a short list of choices that are all capable of executing the app suite’s functions, and we were asked to analyze the positive and negative attitudes toward these smart phones online in order to narrow this list down to one device in the end. An extensive web sentiment analysis will be performed in this scenario to gain insights into the attitudes toward these devices.
For the second part of the project, we will try various feature selection and feature engineering methods to build the best-performing predictive models, and then we will apply these models to the Large Matrix.csv file to complete our sentiment analysis of both the iPhone and the Samsung Galaxy.
# Call libraries and set seed
library(caret)
library(readr)
library(plotly)
library(dplyr)
library(tidyr)
library(corrplot)
library(ggplot2)
# Seed the random number generator
set.seed(520)

# ---- Parallel processing ----
library(doParallel)
# Report how many cores this machine has
detectCores() # result [8]
# Start a cluster with 4 workers and register it as the parallel backend
cl <- makeCluster(4)
registerDoParallel(cl)
# Confirm how many workers are now registered
getDoParWorkers() # result [4]
## Read the iPhone small matrix from disk
iphoneDF <- read_csv(
  "C:/Dev/Data Analysis/Course 4/Task 3/iphone_smallmatrix_labeled_8d.csv"
)
# Inspect the structure and the per-column summaries
str(iphoneDF)
summary(iphoneDF)
# List every attribute name in iphoneDF
names(iphoneDF)
## [1] "iphone" "samsunggalaxy" "sonyxperia" "nokialumina"
## [5] "htcphone" "ios" "googleandroid" "iphonecampos"
## [9] "samsungcampos" "sonycampos" "nokiacampos" "htccampos"
## [13] "iphonecamneg" "samsungcamneg" "sonycamneg" "nokiacamneg"
## [17] "htccamneg" "iphonecamunc" "samsungcamunc" "sonycamunc"
## [21] "nokiacamunc" "htccamunc" "iphonedispos" "samsungdispos"
## [25] "sonydispos" "nokiadispos" "htcdispos" "iphonedisneg"
## [29] "samsungdisneg" "sonydisneg" "nokiadisneg" "htcdisneg"
## [33] "iphonedisunc" "samsungdisunc" "sonydisunc" "nokiadisunc"
## [37] "htcdisunc" "iphoneperpos" "samsungperpos" "sonyperpos"
## [41] "nokiaperpos" "htcperpos" "iphoneperneg" "samsungperneg"
## [45] "sonyperneg" "nokiaperneg" "htcperneg" "iphoneperunc"
## [49] "samsungperunc" "sonyperunc" "nokiaperunc" "htcperunc"
## [53] "iosperpos" "googleperpos" "iosperneg" "googleperneg"
## [57] "iosperunc" "googleperunc" "iphonesentiment"
# Histogram of the iPhone sentiment score. The column is referenced through
# the formula against the supplied data frame, so the axis is labelled with
# the bare column name rather than "iphoneDF$iphonesentiment".
plot_ly(iphoneDF, x = ~iphonesentiment, type = "histogram")
* The histogram above shows the distribution of the iPhone sentiment score, which is strongly positive (skewed toward 5).
## Check for missing values
sum(is.na(iphoneDF))
# nearZeroVar() with saveMetrics = FALSE returns a vector of column positions
nzv <- nearZeroVar(iphoneDF, saveMetrics = FALSE)
# Create a new data set without the near-zero-variance features.
# Guard the empty case: negative indexing with a zero-length vector selects
# zero columns instead of keeping all of them.
iphoneNZV <- if (length(nzv) > 0) iphoneDF[, -nzv] else iphoneDF
str(iphoneNZV)
# Sample 1000 rows (without replacement) before running RFE
sampleRows <- sample(seq_len(nrow(iphoneDF)), 1000, replace = FALSE)
iphoneSample <- iphoneDF[sampleRows, ]
# RFE control: random-forest helper functions, repeated cross-validation
# with 5 repeats, and no progress updates
ctrl <- rfeControl(
  functions = rfFuncs,
  method = "repeatedcv",
  repeats = 5,
  verbose = FALSE
)
# Run RFE on columns 1-58; column 59 (iphonesentiment) is the response
rfeResults <- rfe(
  iphoneSample[, 1:58],
  iphoneSample$iphonesentiment,
  sizes = 1:58,
  rfeControl = ctrl
)
# Show the RFE summary
rfeResults
## Plot the RFE results (grid plus overplotted points and lines)
plot(rfeResults, type = c("g", "o"))
## Build a data set holding only the features RFE recommends ...
iphoneRFE <- iphoneDF[, predictors(rfeResults)]
## ... and attach the dependent variable to it
iphoneRFE[["iphonesentiment"]] <- iphoneDF[["iphonesentiment"]]
# Convert the dependent variable to a factor in all three data sets
iphoneDF[["iphonesentiment"]] <- as.factor(iphoneDF[["iphonesentiment"]])
iphoneNZV[["iphonesentiment"]] <- as.factor(iphoneNZV[["iphonesentiment"]])
iphoneRFE[["iphonesentiment"]] <- as.factor(iphoneRFE[["iphonesentiment"]])
# Check the resulting factor
str(iphoneDF[["iphonesentiment"]])
## Factor w/ 6 levels "0","1","2","3",..: 1 1 1 1 1 5 5 1 1 1 ...
# 70%/30% train/test partition of iphoneDF, built on the response column
inTraining <- createDataPartition(
  iphoneDF$iphonesentiment,
  p = 0.70,
  list = FALSE
)
training <- iphoneDF[inTraining, ]
testing <- iphoneDF[-inTraining, ]
# Resampling scheme: 10-fold cross-validation
fitControl <- trainControl(method = "cv", number = 10)
# ---- C5.0 ----
C50 <- train(
  iphonesentiment ~ .,
  data = training,
  method = "C5.0",
  trControl = fitControl
)
# Predict on the hold-out set
prediction_C50 <- predict(C50, testing)

# ---- Random forest (10-fold cross validation) ----
rf <- train(
  iphonesentiment ~ .,
  data = training,
  method = "rf",
  trControl = fitControl
)
# Predict on the hold-out set
prediction_rf <- predict(rf, testing)

# ---- Linear SVM (10-fold cross validation) ----
svm <- train(
  iphonesentiment ~ .,
  data = training,
  method = "svmLinear",
  trControl = fitControl
)
# Predict on the hold-out set
prediction_svm <- predict(svm, testing)

# ---- KKNN (10-fold cross validation) ----
kknn <- train(
  iphonesentiment ~ .,
  data = training,
  method = "kknn",
  trControl = fitControl
)
# Predict on the hold-out set
prediction_kknn <- predict(kknn, testing)
# Compare each model's hold-out predictions with the observed sentiment
# C5.0
postResample(pred = prediction_C50, obs = testing$iphonesentiment)
# Random forest
postResample(pred = prediction_rf, obs = testing$iphonesentiment)
# SVM
postResample(pred = prediction_svm, obs = testing$iphonesentiment)
# KKNN
postResample(pred = prediction_kknn, obs = testing$iphonesentiment)
# Define a 70%/30% train/test split of iphoneNZV.
# Subset with this data set's own partition index (inTraining_iphoneNZV),
# not with the index that was created for iphoneDF.
inTraining_iphoneNZV <- createDataPartition(
  iphoneNZV$iphonesentiment,
  p = .70,
  list = FALSE
)
training_NZV <- iphoneNZV[inTraining_iphoneNZV, ]
testing_NZV <- iphoneNZV[-inTraining_iphoneNZV, ]
# Apply RandomForest with 10-fold cross validation on iphoneNZV
rf_NZV <- train(
  iphonesentiment ~ .,
  data = training_NZV,
  method = "rf",
  trControl = fitControl
)
# Predict on the NZV hold-out set
prediction_rf_NZV <- predict(rf_NZV, testing_NZV)
# Define a 70%/30% train/test split of iphoneRFE.
# Subset with this data set's own partition index (inTraining_iphoneRFE),
# not with the index that was created for iphoneDF.
inTraining_iphoneRFE <- createDataPartition(
  iphoneRFE$iphonesentiment,
  p = .70,
  list = FALSE
)
training_RFE <- iphoneRFE[inTraining_iphoneRFE, ]
testing_RFE <- iphoneRFE[-inTraining_iphoneRFE, ]
# Apply RandomForest with 10-fold cross validation on iphoneRFE
rf_RFE <- train(
  iphonesentiment ~ .,
  data = training_RFE,
  method = "rf",
  trControl = fitControl
)
# Predict on the RFE hold-out set
prediction_rf_RFE <- predict(rf_RFE, testing_RFE)
# Hold-out performance of the random forest on the NZV feature set
postResample(pred = prediction_rf_NZV, obs = testing_NZV$iphonesentiment)
# Hold-out performance of the random forest on the RFE feature set
postResample(pred = prediction_rf_RFE, obs = testing_RFE$iphonesentiment)
NZV and RFE Comparison
# Input: the training split of iphoneDF (no feature selection applied).
# Column 59 (the dependent variable) is left out; the PCA threshold is 0.95.
preprocessParams <- preProcess(
  training[, -59],
  method = c("center", "scale", "pca"),
  thresh = 0.95
)
print(preprocessParams)
## Created from 9083 samples and 58 variables
##
## Pre-processing:
## - centered (58)
## - ignored (0)
## - principal component signal extraction (58)
## - scaled (58)
##
## PCA needed 26 components to capture 95 percent of the variance
# Apply the fitted preprocessing to the training predictors (column 59 excluded)
train.pca <- predict(preprocessParams, training[, -59])
# Re-attach the dependent variable
train.pca[["iphonesentiment"]] <- training[["iphonesentiment"]]
# Apply the same preprocessing to the testing predictors (column 59 excluded)
test.pca <- predict(preprocessParams, testing[, -59])
# Re-attach the dependent variable
test.pca[["iphonesentiment"]] <- testing[["iphonesentiment"]]
# Resampling scheme: 10-fold cross validation
fitControl <- trainControl(method = "cv", number = 10)
# Random forest trained on the PCA-transformed training set
rf_pca <- train(
  iphonesentiment ~ .,
  data = train.pca,
  method = "rf",
  trControl = fitControl
)
# Predict on the PCA-transformed testing set
prediction_rf_pca <- predict(rf_pca, test.pca)
# Compare the predictions with the observed sentiment
postResample(pred = prediction_rf_pca, obs = test.pca$iphonesentiment)
## Accuracy Kappa
## 0.7606684 0.5382464
# Work on a copy so iphoneDF keeps its original sentiment levels
iphoneRC <- iphoneDF
# Merge levels 0 & 1 into 1 and levels 4 & 5 into 4; 2 and 3 are unchanged
iphoneRC$iphonesentiment <- recode(
  iphoneRC$iphonesentiment,
  "0" = 1, "1" = 1, "2" = 2, "3" = 3, "4" = 4, "5" = 4
)
# Store the recoded sentiment as a factor
iphoneRC$iphonesentiment <- as.factor(iphoneRC$iphonesentiment)
# Examine the structure of the recoded column
str(iphoneRC$iphonesentiment)
## Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 4 4 1 1 1 ...
# Define a 70%/30% train/test split of iphoneRC.
# Subset with the partition built on the recoded sentiment (inTrainingRC),
# not with the index that was created for iphoneDF.
inTrainingRC <- createDataPartition(
  iphoneRC$iphonesentiment,
  p = .70,
  list = FALSE
)
training_RC <- iphoneRC[inTrainingRC, ]
testing_RC <- iphoneRC[-inTrainingRC, ]
# Random forest with 10-fold cross validation on the recoded dependent variable
rf_RC <- train(
  iphonesentiment ~ .,
  data = training_RC,
  method = "rf",
  trControl = fitControl
)
# Predict on the recoded hold-out set
prediction_rf_RC <- predict(rf_RC, testing_RC)
# Evaluate the model
postResample(prediction_rf_RC, testing_RC$iphonesentiment)
## Accuracy Kappa
## 0.8514139 0.6315268
# Read the Samsung Galaxy small matrix from disk
samsungDF <- read_csv(
  "C:/Dev/Data Analysis/Course 4/Task 3/galaxy_smallmatrix_labeled_9d.csv"
)
# Work on a copy that will hold the recoded sentiment
samsungRC <- samsungDF
# Merge values 0 & 1 into 1 and values 4 & 5 into 4; 2 and 3 are unchanged
samsungRC$galaxysentiment <- recode(
  samsungRC$galaxysentiment,
  "0" = 1, "1" = 1, "2" = 2, "3" = 3, "4" = 4, "5" = 4
)
# Store galaxysentiment as a factor
samsungRC$galaxysentiment <- as.factor(samsungRC$galaxysentiment)
# 70%/30% train/test partition of samsungRC, built on the response column
inTrainingRC_samsung <- createDataPartition(
  samsungRC$galaxysentiment,
  p = 0.70,
  list = FALSE
)
training_RC_samsung <- samsungRC[inTrainingRC_samsung, ]
testing_RC_samsung <- samsungRC[-inTrainingRC_samsung, ]
# Resampling scheme: 10-fold cross validation
fitControl <- trainControl(method = "cv", number = 10)
# Random forest on the recoded Samsung training set
rf_RC_samsung <- train(
  galaxysentiment ~ .,
  data = training_RC_samsung,
  method = "rf",
  trControl = fitControl
)
# Predict on the Samsung hold-out set
prediction_rf_RC_samsung <- predict(rf_RC_samsung, testing_RC_samsung)
# Compare the predictions with the observed sentiment
postResample(
  pred = prediction_rf_RC_samsung,
  obs = testing_RC_samsung$galaxysentiment
)
## Accuracy Kappa
## 0.843750 0.599699
# Score the iPhone Large Matrix (22461 Observations) with the recoded RF model
iphoneLargeMatrix <- read_csv(
  "C:/Dev/Data Analysis/Course 4/Task 3/iphoneLargeMatrix.csv"
)
# Drop the id column from iphoneLargeMatrix
iphoneLargeMatrix[["id"]] <- NULL
# Predict the iPhone sentiment class and summarise the predictions
finalPred_iphone <- predict(rf_RC, iphoneLargeMatrix)
summary(finalPred_iphone)
library(plotly)
# Sentiment counts shown in the pie chart.
# NOTE(review): these values are hard-coded, presumably transcribed from
# summary(finalPred_iphone); they sum to 22278. Re-check after any re-run.
pieData <- data.frame(
  COM = c("negative", "somewhat negative", "somewhat positive", "positive"),
  values = c(9467, 614, 1407, 10790)
)
# Explicit slice palette. Without a definition, the name `colors` resolves
# to the grDevices::colors() function, which is not a vector of colours.
pieColors <- c(
  "rgb(211,94,96)", "rgb(128,133,133)",
  "rgb(144,103,167)", "rgb(114,147,203)"
)
# Create pie chart
plot_ly(pieData, labels = ~COM, values = ~values, type = "pie",
        textposition = "inside",
        textinfo = "label+percent",
        insidetextfont = list(color = "#FFFFFF"),
        hoverinfo = "text",
        text = ~paste(values),
        marker = list(colors = pieColors,
                      line = list(color = "#FFFFFF", width = 1)),
        showlegend = FALSE) %>%
  layout(title = "iPhone Sentiment on Large Matrix",
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
# Create two pie charts (side by side)
# summary(iphoneRC) # get last column of iphonesentiment count and put it in below values vector
pieData_iphoneRC <- data.frame(
  COM = c("negative", "somewhat negative", "somewhat positive", "positive"),
  values = c(2352, 454, 1188, 8979)
)
# summary(samsungRC) # get last column of galaxysentiment count and put it in below values vector
pieData_samsungRC <- data.frame(
  COM = c("negative", "somewhat negative", "somewhat positive", "positive"),
  values = c(2078, 450, 1175, 9208)
)
# Each pie gets its own horizontal domain. Domains are plot fractions and
# must stay within [0, 1], so the two pies use equal widths with a small gap.
plot_ly(pieData_iphoneRC, labels = ~COM, values = ~values, type = "pie",
        title = "iPhone Sentiment",
        domain = list(x = c(0, 0.48), y = c(0, 1))) %>%
  add_trace(data = pieData_samsungRC, labels = ~COM, values = ~values,
            type = "pie", title = "Samsung Sentiment",
            domain = list(x = c(0.52, 1), y = c(0, 1)))
# Stop Cluster
stopCluster(cl)
# Fall back to the sequential foreach backend so that later %dopar% work in
# this session does not try to reach the stopped cluster
foreach::registerDoSEQ()