Helio is working with a government health agency to create a suite of smart phone medical apps for use by aid workers in developing countries. This suite of apps will enable the aid workers to manage local health conditions by facilitating communication with medical professionals located elsewhere (one of the apps, for example, enables specialists in communicable diseases to diagnose conditions by examining images and other patient data uploaded by local aid workers). The government agency requires that the app suite be bundled with one model of smart phone. Helio is in the process of evaluating potential handset models to determine which one to bundle their software with. After completing an initial investigation, Helio has created a short list of five devices that are all capable of executing the app suite’s functions. To help Helio narrow their list down to one device, they have asked us to examine the prevalence of positive and negative attitudes toward these devices on the web.
The objective is to investigate predictive models using machine learning methods. These models will be applied to the Large Matrix file to complete the analysis of overall sentiment toward both iPhone and Samsung Galaxy. In this task machine learning methods will be used to predict the overall sentiment toward Samsung Galaxy handsets.
galaxy_smallmatrix_labeled_8d.csv is the data matrix that was used in this task to develop the models to predict the overall sentiment toward Samsung Galaxy. It includes the counts of relevant words (sentiment lexicons) for about 12,000 instances (web pages). The values in the device sentiment column (the last column in the matrix) represent the overall sentiment toward the device on a scale of 0-5. The overall sentiment value was manually input by a team of coworkers who read each webpage and rated the sentiment.
The scale is as follows:
Finally, the models will be used on the large matrix created from the AWS in the previous task to predict sentiment.
library(doParallel)
library(readxl)
library(dplyr)
library(tidyverse)
library(tidyr)
library(plotly)
library(corrplot)
library(caret)
library(e1071)
library(kknn)
library(readxl)
library(rmarkdown)
Since we are dealing with a large dataset, parallel processing is performed to reduce computing time.
# Report how many cores this machine exposes
detectCores() # Result = 8
# Start a two-worker cluster and make it the active foreach backend
cl <- makeCluster(spec = 2)
registerDoParallel(cl = cl)
# Verify the number of workers now available to R / RStudio
getDoParWorkers() # Result = 2
# Load the labelled small matrix (word counts plus the galaxysentiment label)
galaxyDF <- read.csv(
  file = "C:/Users/Y.S. Kim/Desktop/Ubiqum/Sentiment Analysis/Dataset and csv/galaxy_smallmatrix_labeled_8d.csv"
)
## [1] "iphone" "samsunggalaxy" "sonyxperia" "nokialumina"
## [5] "htcphone" "ios" "googleandroid" "iphonecampos"
## [9] "samsungcampos" "sonycampos" "nokiacampos" "htccampos"
## [13] "iphonecamneg" "samsungcamneg" "sonycamneg" "nokiacamneg"
## [17] "htccamneg" "iphonecamunc" "samsungcamunc" "sonycamunc"
## [21] "nokiacamunc" "htccamunc" "iphonedispos" "samsungdispos"
## [25] "sonydispos" "nokiadispos" "htcdispos" "iphonedisneg"
## [29] "samsungdisneg" "sonydisneg" "nokiadisneg" "htcdisneg"
## [33] "iphonedisunc" "samsungdisunc" "sonydisunc" "nokiadisunc"
## [37] "htcdisunc" "iphoneperpos" "samsungperpos" "sonyperpos"
## [41] "nokiaperpos" "htcperpos" "iphoneperneg" "samsungperneg"
## [45] "sonyperneg" "nokiaperneg" "htcperneg" "iphoneperunc"
## [49] "samsungperunc" "sonyperunc" "nokiaperunc" "htcperunc"
## [53] "iosperpos" "googleperpos" "iosperneg" "googleperneg"
## [57] "iosperunc" "googleperunc" "galaxysentiment"
The attribute we want to predict:
## [1] "integer"
## [1] FALSE
Select only Samsung-related columns (Samsung, galaxy, google)
# Keep only the Samsung- and Google-prefixed features plus the label column
galaxy_relevant_columns <- select(
  galaxyDF,
  starts_with("samsung"),
  starts_with("google"),
  galaxysentiment
)
Checking for collinearity
# Pairwise correlations between candidate collinear features
with(galaxy_relevant_columns, cor(samsungdisunc, samsungdispos)) # correlation coefficient is 0.9098321
## [1] 0.9098321
with(galaxy_relevant_columns, cor(samsungperneg, samsungdisneg)) # correlation coefficient is 0.9394673
## [1] 0.9394673
with(galaxy_relevant_columns, cor(samsungperunc, samsungdisunc)) # correlation coefficient is 0.9403043
## [1] 0.9403043
with(galaxy_relevant_columns, cor(googleperneg, googleperpos)) # correlation coefficient is 0.9574098
## [1] 0.9574098
# Drop one feature from each highly correlated pair identified above
galaxy_nocol <- select(
  galaxy_relevant_columns,
  -samsungdisunc,
  -samsungdisneg,
  -googleperneg
)
Corrplot after removal of attributes due to collinearity
# Create a new dataset from the collinearity-filtered columns; galaxyCOR is
# the object the near-zero-variance checks below operate on
galaxyCOR <- galaxy_nocol
Removal of attributes with (near) zero variance
# Feature-variance diagnostics: with saveMetrics = TRUE, nearZeroVar() returns
# a table with one row per column holding the frequency ratio, percentage of
# unique values, and the zero-variance / near-zero-variance flags.
# The outer parentheses print the table as well as storing it.
(galaxynzvMetrics <- nearZeroVar(x = galaxyCOR, saveMetrics = TRUE))
## freqRatio percentUnique zeroVar nzv
## samsunggalaxy 14.127336 0.05395822 FALSE FALSE
## samsungcampos 93.625000 0.08479149 FALSE TRUE
## samsungcamneg 100.132812 0.06937486 FALSE TRUE
## samsungcamunc 74.308140 0.06937486 FALSE TRUE
## samsungdispos 97.061069 0.13104139 FALSE TRUE
## samsungperpos 94.200000 0.10791644 FALSE TRUE
## samsungperneg 101.650794 0.10020812 FALSE TRUE
## samsungperunc 86.500000 0.09249981 FALSE TRUE
## googleandroid 61.247573 0.04624990 FALSE TRUE
## googleperpos 98.592308 0.06937486 FALSE TRUE
## googleperunc 96.443609 0.07708317 FALSE TRUE
## galaxysentiment 4.579565 0.04624990 FALSE FALSE
# With saveMetrics = FALSE, nearZeroVar() returns a vector of the column
# positions flagged as (near) zero variance
gnzv <- nearZeroVar(x = galaxyCOR, saveMetrics = FALSE)
## [1] 2 3 4 5 6 7 8 9 10 11
nearZeroVar columns: 2 3 4 5 6 7 8 9 10 11
# Create a new data set without the near-zero-variance features.
# Guard against an empty gnzv: negative indexing with integer(0)
# (i.e. df[, -integer(0)]) selects NO columns instead of all of them, so the
# subset is only applied when at least one column was flagged.
# drop = FALSE keeps the result a data frame even if one column survives.
galaxyNZV <- if (length(gnzv) > 0) {
  galaxyCOR[, -gnzv, drop = FALSE]
} else {
  galaxyCOR
}
However, we can see that this dataset only contains 1 independent attribute: samsunggalaxy
Recursive Feature Elimination
# Sample the data (original dataset) before using RFE.
# Fix the seed so the same rows are drawn on every run.
set.seed(123)
# Draw up to 1000 distinct rows without replacement. sample.int() avoids the
# 1:nrow() idiom, and capping the size at nrow() prevents an error when the
# data has fewer than 1000 rows. For larger data the draw is the same as
# sample(1:nrow(galaxyDF), 1000, replace = FALSE) under the same seed.
galaxySample <- galaxyDF[
  sample.int(nrow(galaxyDF), size = min(1000, nrow(galaxyDF)), replace = FALSE),
]
# Set up rfeControl with random forest, repeated cross validation and no updates
ctrl <- rfeControl(
  functions = rfFuncs,
  method = "repeatedcv",
  repeats = 5,
  verbose = FALSE
)
# Identify the predictors by name instead of the hard-coded positions 1:58, so
# the call keeps working if columns are added, removed or reordered. For the
# 59-column matrix with galaxysentiment last this selects the same columns.
rfe_predictors <- setdiff(names(galaxySample), "galaxysentiment")
# Use rfe and omit the response variable (attribute sentiment); try every
# subset size from 1 up to the number of predictors.
# NOTE(review): galaxysentiment is still integer at this point, so rfe will
# presumably treat this as a regression problem -- confirm that is intended.
rfeResults_galaxy <- rfe(
  galaxySample[, rfe_predictors, drop = FALSE],
  galaxySample$galaxysentiment,
  sizes = seq_along(rfe_predictors),
  rfeControl = ctrl
)
# Create a new data set with the rfe recommended features.
# drop = FALSE keeps a data frame even when RFE selects a single predictor;
# without it the subset collapses to a plain vector and adding the
# galaxysentiment column below would no longer yield a data frame.
galaxyRFE <- galaxyDF[, predictors(rfeResults_galaxy), drop = FALSE]
# add the dependent variable to galaxyRFE
galaxyRFE$galaxysentiment <- galaxyDF$galaxysentiment
# review outcome
glimpse(galaxyRFE)
# variable importance
varImp(rfeResults_galaxy)
# Overall
# iphone 71.171831
# googleandroid 37.439517
# iphonedispos 30.710355
# iphonedisneg 27.831920
# samsunggalaxy 26.544268
# iphonedisunc 24.024327
Recoding
# Working copy used for recoding, so galaxyDF keeps the original 0-5 values
galaxyRC <- galaxyDF
# Collapse the six sentiment values into four: 0 and 1 both become 1,
# 4 and 5 both become 4, while 2 and 3 are left as they are
galaxyRC$galaxysentiment <- recode(
  galaxyRC$galaxysentiment,
  `0` = 1, `1` = 1,
  `2` = 2, `3` = 3,
  `4` = 4, `5` = 4
)
Principal Component Analysis
# data = training and testing from galaxyDF (no feature selection)
# Select the predictors by name rather than by the magic index -59, so the
# dependent variable is excluded wherever it sits in the data frame.
pca_predictors <- setdiff(names(trainsetDF), "galaxysentiment")
# create object containing centered, scaled PCA components from training set
# (dependent variable excluded, variance threshold set to .95)
preprocessParams <- preProcess(
  trainsetDF[, pca_predictors, drop = FALSE],
  method = c("center", "scale", "pca"),
  thresh = 0.95
)
# use predict to apply pca parameters, create training, exclude dependent
train.pca <- predict(preprocessParams, trainsetDF[, pca_predictors, drop = FALSE])
# add the dependent to training
train.pca$galaxysentiment <- trainsetDF$galaxysentiment
# use predict to apply the SAME (training-derived) pca parameters to the
# test set, exclude dependent
test.pca <- predict(preprocessParams, testsetDF[, pca_predictors, drop = FALSE])
# add the dependent to testing
test.pca$galaxysentiment <- testsetDF$galaxysentiment
PCA needed 25 components to capture 95 percent of the variance
Predicting galaxy sentiment using C5.0
## Negative Positive Somewhat Negative Somewhat Positive
## 5502 3240 578 123
Predicting galaxy sentiment using Random Forest
## Negative Positive Somewhat Negative Somewhat Positive
## 5554 3272 584 33
# Stop cluster. After performing task, stop cluster
stopCluster(cl)
# Re-register the sequential backend: the stopped cluster would otherwise stay
# registered with foreach, and any later %dopar% / caret call in this session
# would try to use its closed connections.
foreach::registerDoSEQ()