The main objective of this project is to identify the most significant variables from a list of candidates in order to classify Twitter users by gender based on their tweet activity.
Next, we load the required libraries, load the dataset, and remove rows with missing values.
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.2
## -- Attaching packages ---------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 1.4.2 v stringr 1.4.0
## v tidyr 0.8.1 v forcats 0.4.0
## Warning: package 'stringr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
## -- Conflicts ------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
twitter_classification <- read_csv("twitter-gender-classification.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## `_unit_id` = col_integer(),
## `_golden` = col_logical(),
## `_trusted_judgments` = col_integer(),
## `gender:confidence` = col_double(),
## `profile_yn:confidence` = col_double(),
## fav_number = col_integer(),
## retweet_count = col_integer(),
## tweet_count = col_integer(),
## tweet_id = col_double()
## )
## See spec(...) for full column specifications.
twitter_classification <- twitter_classification[!is.na(twitter_classification$gender),]
twitter_classification
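Before dropping columns, it can help to check how many rows remain and how the gender labels are distributed. A quick check along these lines (not part of the original output) could be:
# Rows remaining after removing missing gender values
nrow(twitter_classification)
# Distribution of the gender labels
table(twitter_classification$gender)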
We now remove the columns that are considered not to influence the prediction.
twitter_classification <- twitter_classification[,-c(1,2,3,4,5,7,8,9,10,11,13,14,15,16,17,19,20,21,23,24,25,26)]
twitter_classification
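Dropping columns by numeric position works but is fragile if the file layout changes. An equivalent, name-based version (assuming the four retained columns are gender, fav_number, retweet_count and tweet_count, as the later output of the column names confirms) would be:
# Keep only the response and the three candidate predictors, selected by name.
# dplyr::select is written explicitly because MASS masks select().
twitter_classification <- dplyr::select(twitter_classification,
                                        gender, fav_number, retweet_count, tweet_count)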
We create the k-folds; we will work with 5 groups.
Folds <- createFolds(twitter_classification$gender, k = 5, list = TRUE, returnTrain = FALSE)
createFolds
## function (y, k = 10, list = TRUE, returnTrain = FALSE)
## {
## if (class(y)[1] == "Surv")
## y <- y[, "time"]
## if (is.numeric(y)) {
## cuts <- floor(length(y)/k)
## if (cuts < 2)
## cuts <- 2
## if (cuts > 5)
## cuts <- 5
## breaks <- unique(quantile(y, probs = seq(0, 1, length = cuts)))
## y <- cut(y, breaks, include.lowest = TRUE)
## }
## if (k < length(y)) {
## y <- factor(as.character(y))
## numInClass <- table(y)
## foldVector <- vector(mode = "integer", length(y))
## for (i in 1:length(numInClass)) {
## min_reps <- numInClass[i]%/%k
## if (min_reps > 0) {
## spares <- numInClass[i]%%k
## seqVector <- rep(1:k, min_reps)
## if (spares > 0)
## seqVector <- c(seqVector, sample(1:k, spares))
## foldVector[which(y == names(numInClass)[i])] <- sample(seqVector)
## }
## else {
## foldVector[which(y == names(numInClass)[i])] <- sample(1:k,
## size = numInClass[i])
## }
## }
## }
## else foldVector <- seq(along = y)
## if (list) {
## out <- split(seq(along = y), foldVector)
## names(out) <- paste("Fold", gsub(" ", "0", format(seq(along = out))),
## sep = "")
## if (returnTrain)
## out <- lapply(out, function(data, y) y[-data], y = seq(along = y))
## }
## else out <- foldVector
## out
## }
## <bytecode: 0x000000001b70c038>
## <environment: namespace:caret>
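createFolds stratifies on the class label, so each fold should contain roughly the same proportion of each gender. A quick sanity check along these lines (not part of the original output) could be:
# Size of each fold
sapply(Folds, length)
# Class balance inside the first fold (default fold names are Fold1 ... Fold5)
table(twitter_classification$gender[Folds$Fold1])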
Here we generate every possible combination of the candidate features; from these combinations, the most significant model will be selected.
feats_twitter_classification <- colnames(twitter_classification)
feats_twitter_classification
## [1] "gender" "fav_number" "retweet_count" "tweet_count"
feats_twitter_classification <- setdiff(feats_twitter_classification, "gender")
feats_twitter_classification
## [1] "fav_number" "retweet_count" "tweet_count"
models_list <- c()
for (i in 1:length(feats_twitter_classification)){
models_list <- c(models_list, combn(feats_twitter_classification, i, simplify = FALSE))
}
models_list
## [[1]]
## [1] "fav_number"
##
## [[2]]
## [1] "retweet_count"
##
## [[3]]
## [1] "tweet_count"
##
## [[4]]
## [1] "fav_number" "retweet_count"
##
## [[5]]
## [1] "fav_number" "tweet_count"
##
## [[6]]
## [1] "retweet_count" "tweet_count"
##
## [[7]]
## [1] "fav_number" "retweet_count" "tweet_count"
Here, tweet_count is the number of tweets the user has published. With three candidate features there are 2^3 - 1 = 7 non-empty combinations, which matches the seven models listed above.
We compute the confusion matrices and, based on accuracy, choose the best model.
get_metric <- function(kfolds, dataset, label, specific_features, index_fold) {
  # The selected fold is the test set; the remaining folds form the training set.
  index <- kfolds[[index_fold]]
  test <- dataset[index, ]
  train <- dataset[-index, ]
  # Build the model formula, e.g. "gender~fav_number+retweet_count".
  formula <- paste0(specific_features, collapse = "+")
  formula <- paste0(label, "~", formula)
  # Fit LDA on the training folds and predict the held-out fold.
  fit_lda <- lda(as.formula(formula), data = train)
  test_pred <- predict(fit_lda, test)
  conf_matrix <- table(test_pred$class, test$gender)
  print(conf_matrix)
  # Accuracy on the held-out fold: correct predictions over the fold size.
  exactitud <- (conf_matrix[1, 1] + conf_matrix[2, 2]) / sum(conf_matrix)
  return(exactitud)
}
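As a usage example, the accuracy of the single-feature model gender ~ fav_number on the first fold could be obtained with a call like:
# Fit and evaluate gender ~ fav_number on fold 1
get_metric(Folds,
           dataset = twitter_classification,
           label = "gender",
           specific_features = c("fav_number"),
           index_fold = 1)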
We now iterate over the different feature combinations and average the accuracies across folds to determine the best model.
k_de_folds <- 5
mean_exactitud_model <- c()
for (j in 1:length(models_list)) {
  # Collect the accuracy of model j on each of the k folds;
  # the vector is reset so the mean covers only this model.
  models_folds_metric <- c()
  for (i in 1:k_de_folds) {
    metrica <- get_metric(Folds,
                          dataset = twitter_classification,
                          label = "gender",
                          specific_features = models_list[[j]],
                          index_fold = i)
    models_folds_metric <- c(models_folds_metric, metrica)
  }
  # Average accuracy of model j across its folds.
  mean_exactitud_model <- c(mean_exactitud_model, mean(models_folds_metric))
}
##
## female male
## female 1340 1239
## male 0 0
##
## female male
## female 1340 1239
## male 0 0
##
## female male
## female 1340 1239
## male 0 0
##
## female male
## female 1340 1239
## male 0 0
##
## female male
## female 1340 1238
## male 0 0
##
## female male
## female 1338 1235
## male 2 4
##
## female male
## female 1337 1237
## male 3 2
##
## female male
## female 1334 1231
## male 6 8
##
## female male
## female 1339 1238
## male 1 1
##
## female male
## female 1340 1234
## male 0 4
##
## female male
## female 1262 1161
## male 78 78
##
## female male
## female 1264 1161
## male 76 78
##
## female male
## female 1295 1173
## male 45 66
##
## female male
## female 1295 1180
## male 45 59
##
## female male
## female 1284 1160
## male 56 78
##
## female male
## female 1338 1234
## male 2 5
##
## female male
## female 1339 1238
## male 1 1
##
## female male
## female 1320 1209
## male 20 30
##
## female male
## female 1339 1238
## male 1 1
##
## female male
## female 1340 1236
## male 0 2
##
## female male
## female 1241 1143
## male 99 96
##
## female male
## female 1227 1137
## male 113 102
##
## female male
## female 1246 1147
## male 94 92
##
## female male
## female 1258 1146
## male 82 93
##
## female male
## female 1257 1120
## male 83 118
##
## female male
## female 1262 1158
## male 78 81
##
## female male
## female 1264 1159
## male 76 80
##
## female male
## female 1288 1164
## male 52 75
##
## female male
## female 1294 1178
## male 46 61
##
## female male
## female 1282 1155
## male 58 83
##
## female male
## female 1237 1141
## male 103 98
##
## female male
## female 1231 1136
## male 109 103
##
## female male
## female 1236 1134
## male 104 105
##
## female male
## female 1257 1144
## male 83 95
##
## female male
## female 1256 1118
## male 84 120
mean_exactitud_model
## [1] 0.1039243 0.1039786 0.1042655 0.1042384 0.1042687 0.1043922 0.1044096
max_exactitud_model <- which.max(mean_exactitud_model)
max_exactitud_model
## [1] 7
models_list[max_exactitud_model]
## [[1]]
## [1] "fav_number" "retweet_count" "tweet_count"