The data is on Canvas in the data folder. You will be working with the data in the file called wweCalls. Do not unzip it through your computer’s GUI; instead, write the code to do it:
file.list <- list.files(path = "/Users/LiamBrothers_1/Downloads/wweCalls", pattern ='parsed', full.names = TRUE)
data_list <- lapply(file.list, function(x) read.csv(x))
merged_data <- data.table::rbindlist(data_list, fill = TRUE)
# Sentiment analysis.
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(purrr)
library(tibble)
library(sentimentr)
library(lexicon)
##
## Attaching package: 'lexicon'
## The following object is masked from 'package:sentimentr':
##
## available_data
library(magrittr)
## Warning: package 'magrittr' was built under R version 4.1.2
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.2
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
library(dplyr)
text_file <- as.data.frame(merged_data[,6])
#Simple Sentiment
text_file_rows <- text_file %>%
mutate_all(as.character) %>%
unnest_tokens(word, text)
library(tidyr)
text_file_rows %>%
inner_join(get_sentiments("bing")) %>%
count(sentiment) %>%
spread(sentiment, n, fill <- 0) %>%
mutate(sentiment <- positive - negative)
## Joining, by = "word"
## negative positive sentiment <- positive - negative
## 1 1624 6863 5239
text_file_rows %>%
inner_join(get_sentiments("afinn")) %>%
summarize(n = nrow(.), sentSum = sum(value)) %>%
mutate(sentiment = sentSum / n)
## Joining, by = "word"
## n sentSum sentiment
## 1 9595 12214 1.272955
#Attempt 3
library(syuzhet)
##
## Attaching package: 'syuzhet'
## The following object is masked from 'package:sentimentr':
##
## get_sentences
text_as_character <- as.character(text_file$text)
sentiment <- get_nrc_sentiment(text_as_character)
## Warning: `spread_()` was deprecated in tidyr 1.2.0.
## Please use `spread()` instead.
text_sentiment <- cbind(text_file$text,sentiment)
barplot(colSums(sentiment), col = rainbow(10), ylab = 'Count', xlab = 'Emotions', main = 'Sentiment Scores')
# MSBA ML CHALLENGE!
# You have the text for each call and a variety of variables (person, company, predicted gender, etc.). Use the text to generate predictions for any variable that you would like. Briefly discuss how your model performed.
#Whoever has the highest accuracy on a test set wins the belt. For the brave, you are welcome to use those raw call transcripts in the original zip file -- would certainly offer more training data, but they need parsed first!
#Combine and organize dataframes.
library(dplyr)
scores_only <- select(text_sentiment, anger, anticipation, disgust, fear, joy, sadness, surprise, trust, negative, positive)
data <- cbind(merged_data, scores_only)
data <- data %>%
relocate(text, .before = anger)
#Add binary yes/no column for male <- 1, female <- 0.
data$gender_YN <- 0
data$gender_YN <- ifelse(data$gender=="male", 1, 0)
# Omit NA's (Operator)
data <- data %>%
filter(name != "operator")
# CLEANING THE DATA
# Organization Column Cleaning
data$organization <- gsub(",","Marathon Partners", data$organization)
data$organization <- gsub(".*B. Riley.*","B. Riley and Company", data$organization)
data$organization <- gsub(".*Eaglerock.*","EagleRock Capital", data$organization)
data$organization <- gsub(".*Gabelli.*","Gabelli and Company", data$organization)
data$organization <- gsub(".*Sachs.*","Goldman Sachs Equity", data$organization)
data$organization <- gsub(".*Hudson.*","Hudson Square Research", data$organization)
data$organization <- gsub(".*Jefferies.*","Jefferies and Company", data$organization)
data$organization <- gsub(".*Marathan.*","Marathon Partners", data$organization)
data$organization <- gsub(".*Alpine.*","MacAlpine and Associates", data$organization)
data$organization <- gsub(".*Nat.*","Natexis", data$organization)
data$organization <- gsub(".*Noble.*","Noble Financial", data$organization)
data$organization <- gsub(".*Research Associates.*","Research Associates", data$organization)
data$organization <- gsub(".*Roth.*","Roth Capital Partners", data$organization)
data$organization <- gsub(".*Sido.*","Sidoti and Company", data$organization)
data$organization <- gsub(".*Soleil.*","Soleil Resarch Associates", data$organization)
data$organization <- gsub(".*Sterne.*","Stern Agee and Leach", data$organization)
data$organization <- gsub(".*Susqueh.*","Susquehanna Financial Group", data$organization)
data$organization <- gsub(".*Terrier.*","Terrier Partners", data$organization)
data$organization <- gsub(".*Wrestling.*","WWE", data$organization)
data$organization <- gsub(".*Zimmer.*","Zimmer Lucas", data$organization)
# Job Title Column Cleaning
data$title <- gsub(".*CEO.*","CEO", data$title)
data$title <- gsub(".*CFO.*","CFO", data$title)
data$title <- gsub(".*Executive.*","CEO", data$title)
data$title <- gsub(".*Fiancial.*","CFO", data$title)
data$title <- gsub(".*Chief Financial Officer.*","CFO", data$title)
data$title <- gsub(".*Analysis.*","Director - Planning and Analysis", data$title)
data$title <- gsub(".*Analyst.*","Analyst", data$title)
data$title <- gsub(".*IR.*","VP - Planning and Investor Relations", data$title)
data$title <- gsub(".*Investor.*","VP - Planning and Investor Relations", data$title)
data$title <- gsub(".*Accounting.*","CAO", data$title)
# Employee Name Cleaning
data$name <- gsub("ali migharabi","ali mogharabi", data$name)
data$name <- gsub(".*kilmery.*","dan kilmary", data$name)
data$name <- gsub(".*alpine.*","dennis macalpine", data$name)
data$name <- gsub(".*eduardo.*","eduardo bush", data$name)
data$name <- gsub(".*frank ser.*","frank serp", data$name)
data$name <- gsub(".*james clinic.*","james clinit", data$name)
data$name <- gsub(".*jared schr*","jared schramm", data$name)
data$name <- gsub(".*mario.*","mario cibelli", data$name)
data$name <- gsub(".*goldstein.*","michele goldstein", data$name)
data$name <- gsub(".*mike si.*","michael sileck", data$name)
data$name <- gsub(".*mike ke.*","michael kelman", data$name)
data$name <- gsub(".*nayar.*","nader tavakoli", data$name)
data$name <- gsub(".*phil.*","philip livingston", data$name)
data$name <- gsub(".*ingrassia.*","richard ingrassia", data$name)
data$name <- gsub(".*rouse.*","robert routh", data$name)
data <- data %>%
filter(name != "as a reminder") %>%
filter(name != "india is the other up and coming for us. we have a wonderful tv deal with zee tv. it used to be -- i think it was -- its taj") %>%
filter(name != "thomson reuters media")
# Likely Race Column Cleaning
table(data$likelyRace)
##
## meanAPI meanHispanic meanwhite
## 19 171 1700
data$likelyRace <- gsub(".*meanwhite","white",data$likelyRace)
data$likelyRace <- gsub(".*meanHispanic","hispanic",data$likelyRace)
data$likelyRace <- gsub(".*meanAPI","asian",data$likelyRace)
#Take Out Unwanted Columns
data1 <- subset(data, select = -c(name, firstName, firstLast, gender,likelyRaceProb,ticker,date,quarter,text))
#Transform to categorical to dummy variables.
library(fastDummies)
library(dplyr)
dummy_ <- fastDummies::dummy_cols(data1)
#Take out unwanted/weird columns
dummy <- subset(dummy_, select = -c(organization,title,likelyRace,organization_NA,title_NA,likelyRace_NA))
# Move DV to the end
dummy <- dummy %>% relocate(gender_YN, .after = likelyRace_white)
#Remove NA's
dummy <- na.omit(dummy)
#Data Partition
set.seed(95608)
# Data partition / Sample splitting
total_obs <- dim(dummy)[1]
train_data_indices <- sample(1:total_obs, 0.8*total_obs)
train_data <- dummy[train_data_indices,]
test_data <- dummy[-train_data_indices,]
# Record the size of training data and test data
train_obs <- dim(train_data)[1]
test_obs <- dim(test_data)[2]
#Tree Model
library(rpart)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(splitstackshape)
dummy$gender_YN <- as.factor(dummy$gender_YN)
tree_model <- rpart(gender_YN~.,data = dummy, method = "class")
tree_preds <- predict(tree_model, dummy)
tree_preds <- as.data.frame(tree_preds)
tree_preds$outcome <- 0
View(tree_preds)
colnames(tree_preds) <- c("non", "wwe", "outcome")
tree_preds$outcome <- ifelse(tree_preds$wwe > .5, 1, 0)
table <- table(tree_preds$outcome, dummy$gender_YN)
confusionMatrix(table, positive <- "1")
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 524 92
## 1 99 1146
##
## Accuracy : 0.8974
## 95% CI : (0.8827, 0.9108)
## No Information Rate : 0.6652
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7689
##
## Mcnemar's Test P-Value : 0.6642
##
## Sensitivity : 0.9257
## Specificity : 0.8411
## Pos Pred Value : 0.9205
## Neg Pred Value : 0.8506
## Prevalence : 0.6652
## Detection Rate : 0.6158
## Detection Prevalence : 0.6690
## Balanced Accuracy : 0.8834
##
## 'Positive' Class : 1
##
# Accuracy : 0.8974
#Tree Model With Names
#Take Out Unwanted Columns
View(data)
data_w_names <- subset(data, select = -c(firstName, firstLast, gender,likelyRaceProb,ticker,date,quarter,text))
View(data_w_names)
#Convert with names data to dummy variables
dummy_w_names <- fastDummies::dummy_cols(data_w_names)
View(dummy_w_names)
#Take out unwanted/weird columns
dummy_w_names <- subset(dummy_w_names, select = -c(name,organization,title,likelyRace,organization_NA,title_NA,likelyRace_NA))
View(dummy_w_names)
# Move DV to the end
dummy_w_names <- dummy_w_names %>% relocate(gender_YN, .after = likelyRace_white)
View(dummy_w_names)
#Remove NA's
dummy_w_names <- na.omit(dummy_w_names)
View(dummy_w_names)
#Data Partition
set.seed(95608)
# Data partition / Sample splitting
total_obs_names <- dim(dummy_w_names)[1]
train_data_indices_names <- sample(1:total_obs, 0.8*total_obs)
train_data_names <- dummy_w_names[train_data_indices,]
test_data_names <- dummy_w_names[-train_data_indices,]
# Record the size of training data and test data
train_obs_names <- dim(train_data)[1]
test_obs_names <- dim(test_data)[2]
#With Names
dummy_w_names$gender_YN <- as.factor(dummy_w_names$gender_YN)
tree_model_names <- rpart(gender_YN~.,data = dummy_w_names, method = "class")
tree_preds_names <- predict(tree_model_names, dummy_w_names)
tree_preds_names <- as.data.frame(tree_preds_names)
tree_preds_names$outcome <- 0
colnames(tree_preds_names) <- c("non", "wwe", "outcome")
tree_preds_names$outcome <- ifelse(tree_preds_names$wwe > .5, 1, 0)
table_names <- table(tree_preds_names$outcome, dummy_w_names$gender_YN)
confusionMatrix(table_names, positive <- "1")
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 618 0
## 1 5 1238
##
## Accuracy : 0.9973
## 95% CI : (0.9937, 0.9991)
## No Information Rate : 0.6652
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.994
##
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 1.0000
## Specificity : 0.9920
## Pos Pred Value : 0.9960
## Neg Pred Value : 1.0000
## Prevalence : 0.6652
## Detection Rate : 0.6652
## Detection Prevalence : 0.6679
## Balanced Accuracy : 0.9960
##
## 'Positive' Class : 1
##
# Accuracy: .9973
# XGBOOST Model / No names.
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(caret)
library(OptimalCutpoints)
library(ggplot2)
library(xgboostExplainer)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(SHAPforxgboost)
set.seed(95608)
test_index <- sample(1:nrow(dummy), size = nrow(dummy) * 0.2, replace = FALSE)
# Create training data
dtrain <- xgb.DMatrix(data = as.matrix(dummy[-test_index, 1:61]), label = as.numeric(dummy$gender_YN[-test_index])-1)
# Create test matrix
dtest <- xgb.DMatrix(data = as.matrix(dummy[test_index, 1:61]),label = as.numeric(dummy$gender_YN[test_index]))
# Fit XGBoost model
xg_model <- xgboost(data = dtrain, # Set training data
nrounds = 100, # Set number of rounds
verbose = 1, # 1 - Prints out fit
print_every_n = 20, # Prints out result every 20th iteration
objective = "binary:logistic", # Set objective
eval_metric = "auc",
eval_metric = "error") # Set evaluation metric to use
## [1] train-auc:0.953750 train-error:0.085292
## [21] train-auc:0.983199 train-error:0.068502
## [41] train-auc:0.989152 train-error:0.049026
## [61] train-auc:0.991157 train-error:0.044997
## [81] train-auc:0.992018 train-error:0.043653
## [100] train-auc:0.992862 train-error:0.040967
boost_preds <- predict(xg_model, dtrain) # Create predictions for xgboost model
pred_dat <- cbind.data.frame(boost_preds, as.numeric(dummy$gender_YN[-test_index]) -1)
names(pred_dat) <- c("predictions", "response")
oc<- optimal.cutpoints(X = "predictions",
status = "response",
tag.healthy = 0,
data = pred_dat,
methods = "MaxEfficiency")
boost_preds_1 <- predict(xg_model, dtest) # Create predictions for xgboost model
pred_dat <- cbind.data.frame(boost_preds_1, as.numeric(dummy$gender_YN[test_index]) -1)
# Convert predictions to classes, using optimal cut-off
boost_pred_class <- rep(0, length(boost_preds_1))
boost_pred_class[boost_preds_1 >= oc$MaxEfficiency$Global$optimal.cutoff$cutoff[1]] <- 1
t <- table(boost_pred_class, as.numeric(dummy$gender_YN[test_index]) -1) # Create table
confusionMatrix(t, positive = "1") # Produce confusion matrix
## Confusion Matrix and Statistics
##
##
## boost_pred_class 0 1
## 0 99 32
## 1 16 225
##
## Accuracy : 0.871
## 95% CI : (0.8326, 0.9033)
## No Information Rate : 0.6909
## P-Value [Acc > NIR] : 3.773e-16
##
## Kappa : 0.7091
##
## Mcnemar's Test P-Value : 0.03038
##
## Sensitivity : 0.8755
## Specificity : 0.8609
## Pos Pred Value : 0.9336
## Neg Pred Value : 0.7557
## Prevalence : 0.6909
## Detection Rate : 0.6048
## Detection Prevalence : 0.6478
## Balanced Accuracy : 0.8682
##
## 'Positive' Class : 1
##
#.871. just for practice, i'll try to cheat.
set.seed(123456)
# Generate random seeds
seeds <- sample(100000:999999, size = 100)
# Create AUC vector to store results
auc_vec <- rep(NA, length(seeds))
# Loop through seeds
for(i in 1:length(seeds)){
# Set as seed i
set.seed(seeds[i])
# Generate test index
test_index_cheat <- sample(1:nrow(dummy), size = nrow(dummy) * 0.2, replace = FALSE)
# Create training data
dtrain_cheat <- xgb.DMatrix(data = as.matrix(dummy[-test_index_cheat, 1:61]), label = as.numeric(dummy$gender_YN[-test_index_cheat])-1)
# Create test matrix
dtest_cheat <- xgb.DMatrix(data = as.matrix(dummy[test_index_cheat, 1:61]),label = as.numeric(dummy$gender_YN[test_index_cheat]))
# Fit XGBoost model
xg_model_cheat <- xgboost(data = dtrain_cheat, # Set training data
nrounds = 100, # Set number of rounds
verbose = 0, # 1 - Prints out fit
print_every_n = 20, # Prints out result every 20th iteration
objective = "binary:logistic", # Set objective
eval_metric = "auc",
eval_metric = "error") # Set evaluation metric to use
# Fit XGBoost predictions
xg_preds_test_cheat <- predict(xg_model_cheat, dtest_cheat) # Create predictions for xgboost model
# Calculate ROC curve
roc_hack <- roc(dummy$gender_YN[test_index_cheat], as.numeric(as.factor(xg_preds_test_cheat)), quiet = TRUE)
# Store AUC
auc_vec[i] <- roc_hack$auc
}
# Join seeds and AUC
temp <- cbind.data.frame(seeds, auc_vec)
g_6 <- ggplot(temp, aes(x = auc_vec)) +
geom_density(alpha = 0.8, fill = "blue") +
theme(panel.grid.major = element_blank(), # Remove grid
panel.grid.minor = element_blank(), # Remove grid
panel.border = element_blank(), # Remove grid
panel.background = element_blank()) +
labs(x = "AUC", title = "AUC for Different Seeds")
g_6
which.max(temp$auc_vec)
## [1] 81
temp[which.max(temp$auc_vec),]
## seeds auc_vec
## 81 843406 0.9706071
set.seed(843406)
test_index_newseed <- sample(1:nrow(dummy), size = nrow(dummy) * 0.2, replace = FALSE)
# Create training data
dtrain_newseed <- xgb.DMatrix(data = as.matrix(dummy[-test_index_newseed, 1:61]), label = as.numeric(dummy$gender_YN[-test_index_newseed])-1)
# Create test matrix
dtest_newseed <- xgb.DMatrix(data = as.matrix(dummy[test_index_newseed, 1:61]),label = as.numeric(dummy$gender_YN[test_index_newseed]))
# Fit XGBoost model
xg_model_newseed <- xgboost(data = dtrain_newseed, # Set training data
nrounds = 100, # Set number of rounds
verbose = 1, # 1 - Prints out fit
print_every_n = 20, # Prints out result every 20th iteration
objective = "binary:logistic", # Set objective
eval_metric = "auc",
eval_metric = "error") # Set evaluation metric to use
## [1] train-auc:0.957550 train-error:0.096709
## [21] train-auc:0.980035 train-error:0.073203
## [41] train-auc:0.985765 train-error:0.061115
## [61] train-auc:0.989442 train-error:0.051713
## [81] train-auc:0.990761 train-error:0.049698
## [100] train-auc:0.991464 train-error:0.046340
boost_preds_newseed <- predict(xg_model_newseed, dtrain_newseed) # Create predictions for xgboost model
pred_dat_newseed <- cbind.data.frame(boost_preds_newseed, as.numeric(dummy$gender_YN[-test_index_newseed]) -1)
names(pred_dat_newseed) <- c("predictions", "response")
oc_newseed<- optimal.cutpoints(X = "predictions",
status = "response",
tag.healthy = 0,
data = pred_dat_newseed,
methods = "MaxEfficiency")
boost_preds_1_newseed <- predict(xg_model_newseed, dtest_newseed) # Create predictions for xgboost model
pred_dat_newseed <- cbind.data.frame(boost_preds_1_newseed, as.numeric(dummy$gender_YN[test_index_newseed]) -1)
# Convert predictions to classes, using optimal cut-off
boost_pred_class_newseed <- rep(0, length(boost_preds_1_newseed))
boost_pred_class_newseed[boost_preds_1_newseed >= oc$MaxEfficiency$Global$optimal.cutoff$cutoff[1]] <- 1
t_newseed <- table(boost_pred_class_newseed, as.numeric(dummy$gender_YN[test_index_newseed]) -1) # Create table
confusionMatrix(t_newseed, positive = "1")
## Confusion Matrix and Statistics
##
##
## boost_pred_class_newseed 0 1
## 0 111 15
## 1 17 229
##
## Accuracy : 0.914
## 95% CI : (0.8807, 0.9404)
## No Information Rate : 0.6559
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8087
##
## Mcnemar's Test P-Value : 0.8597
##
## Sensitivity : 0.9385
## Specificity : 0.8672
## Pos Pred Value : 0.9309
## Neg Pred Value : 0.8810
## Prevalence : 0.6559
## Detection Rate : 0.6156
## Detection Prevalence : 0.6613
## Balanced Accuracy : 0.9029
##
## 'Positive' Class : 1
##
#Still only .914.
#Model Discussion: The first tree model performed well, but once names were introduced as a variable, the model was almost perfectly accurate, with a specificity of 1.0.
# The non-cheating XGBoost model had an accuracy of .871, which is lower than the original tree model's accuracy of .8974.
#Trying to find the optimal p-value was not fairly effective and only increased the accuracy to .914.