Proyecto Final - Clasificacion De Genero De Un Usuario de Twitter

Desarrollo

A continuacion cargaremos las librerias a utilizar, cargaremos el dataset y eliminaremos valores nulos.

library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.5.2

## -- Attaching packages ---------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.1.0     v purrr   0.2.5
## v tibble  1.4.2     v stringr 1.4.0
## v tidyr   0.8.1     v forcats 0.4.0

## Warning: package 'stringr' was built under R version 3.5.2

## Warning: package 'forcats' was built under R version 3.5.2

## -- Conflicts ------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(caret)

## Warning: package 'caret' was built under R version 3.5.3

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

twitter_classification <- read_csv("twitter-gender-classification.csv")

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `_unit_id` = col_integer(),
##   `_golden` = col_logical(),
##   `_trusted_judgments` = col_integer(),
##   `gender:confidence` = col_double(),
##   `profile_yn:confidence` = col_double(),
##   fav_number = col_integer(),
##   retweet_count = col_integer(),
##   tweet_count = col_integer(),
##   tweet_id = col_double()
## )

## See spec(...) for full column specifications.

twitter_classification <- twitter_classification[!is.na(twitter_classification$gender),]
twitter_classification

Eliminaremos las Columnas que se considera no influyen en la predicción.

twitter_classification <- twitter_classification[,-c(1,2,3,4,5,7,8,9,10,11,13,14,15,16,17,19,20,21,23,24,25,26)]
twitter_classification

Se crean los k-Folds y se trabajará con 5 grupos

Folds <- 
createFolds(twitter_classification$gender, k = 5, list = TRUE, returnTrain = FALSE)
createFolds

## function (y, k = 10, list = TRUE, returnTrain = FALSE) 
## {
##     if (class(y)[1] == "Surv") 
##         y <- y[, "time"]
##     if (is.numeric(y)) {
##         cuts <- floor(length(y)/k)
##         if (cuts < 2) 
##             cuts <- 2
##         if (cuts > 5) 
##             cuts <- 5
##         breaks <- unique(quantile(y, probs = seq(0, 1, length = cuts)))
##         y <- cut(y, breaks, include.lowest = TRUE)
##     }
##     if (k < length(y)) {
##         y <- factor(as.character(y))
##         numInClass <- table(y)
##         foldVector <- vector(mode = "integer", length(y))
##         for (i in 1:length(numInClass)) {
##             min_reps <- numInClass[i]%/%k
##             if (min_reps > 0) {
##                 spares <- numInClass[i]%%k
##                 seqVector <- rep(1:k, min_reps)
##                 if (spares > 0) 
##                   seqVector <- c(seqVector, sample(1:k, spares))
##                 foldVector[which(y == names(numInClass)[i])] <- sample(seqVector)
##             }
##             else {
##                 foldVector[which(y == names(numInClass)[i])] <- sample(1:k, 
##                   size = numInClass[i])
##             }
##         }
##     }
##     else foldVector <- seq(along = y)
##     if (list) {
##         out <- split(seq(along = y), foldVector)
##         names(out) <- paste("Fold", gsub(" ", "0", format(seq(along = out))), 
##             sep = "")
##         if (returnTrain) 
##             out <- lapply(out, function(data, y) y[-data], y = seq(along = y))
##     }
##     else out <- foldVector
##     out
## }
## <bytecode: 0x000000001b70c038>
## <environment: namespace:caret>

Aca obtenemos todas las combinaciones posibles de los modelos y de las cuales saldra la combinación mas significativa.

feats_twitter_classification <- colnames(twitter_classification)
feats_twitter_classification

## [1] "gender"        "fav_number"    "retweet_count" "tweet_count"

feats_twitter_classification <- setdiff(feats_twitter_classification, "gender")
feats_twitter_classification

## [1] "fav_number"    "retweet_count" "tweet_count"

models_list <- c()
for (i in 1:length(feats_twitter_classification)){
  models_list <- c(models_list, combn(feats_twitter_classification, i, simplify = FALSE))
}
models_list

## [[1]]
## [1] "fav_number"
## 
## [[2]]
## [1] "retweet_count"
## 
## [[3]]
## [1] "tweet_count"
## 
## [[4]]
## [1] "fav_number"    "retweet_count"
## 
## [[5]]
## [1] "fav_number"  "tweet_count"
## 
## [[6]]
## [1] "retweet_count" "tweet_count"  
## 
## [[7]]
## [1] "fav_number"    "retweet_count" "tweet_count"

fav_number: número de Tweets que el usuario a seleccionado como favorito.
retweet_count: número de veces que el usuario ha retuiteado
tweet_count: número de tweets que el usuario ha publicado
Se crea el train y el test.
Se crea el modelo de regresión logistica y se utiliza LDA para clasificar
Obtenemos las matrices de confusiones y en base a la exactitud elegiremos el mejor modelo.

get_metric <- function(kfolds, dataset, label, specific_features, index_fold){
  index <- kfolds[[index_fold]]
  test <- dataset[index,]
  train <- dataset[-index,]
  formula <- paste0(specific_features, collapse = "+")
  formula <- paste0(label, "~", formula,
                    collapse = "")
  fit_lda <- lda(as.formula(formula), data = train)
  test_pred <- predict(fit_lda, test)
  conf_matrix <- table(test_pred$class, test$gender)
  print(conf_matrix) 
  exactitud <- (conf_matrix[2,2] + conf_matrix[1,1]) / nrow(dataset)
  return(exactitud)
}

Realizaremos iteraciones con las distintas combinaciones y promediaremos las exactitudes para conocer el mejor modelo.

k_de_folds <- 5
models_folds_metric <- c()
mean_exactitud_model <- c()
for(j in 1:length(models_list)){
  for(i in 1:k_de_folds){
    metrica <-
      get_metric(Folds,
                 dataset = twitter_classification,
                 label = "gender",
                 specific_features = models_list[[j]],
                 index_fold = i)
    models_folds_metric <- c(models_folds_metric, metrica)
  }
  mean_exactitud_model <- c(mean_exactitud_model, mean(models_folds_metric))
}

##         
##          female male
##   female   1340 1239
##   male        0    0
##         
##          female male
##   female   1340 1239
##   male        0    0
##         
##          female male
##   female   1340 1239
##   male        0    0
##         
##          female male
##   female   1340 1239
##   male        0    0
##         
##          female male
##   female   1340 1238
##   male        0    0
##         
##          female male
##   female   1338 1235
##   male        2    4
##         
##          female male
##   female   1337 1237
##   male        3    2
##         
##          female male
##   female   1334 1231
##   male        6    8
##         
##          female male
##   female   1339 1238
##   male        1    1
##         
##          female male
##   female   1340 1234
##   male        0    4
##         
##          female male
##   female   1262 1161
##   male       78   78
##         
##          female male
##   female   1264 1161
##   male       76   78
##         
##          female male
##   female   1295 1173
##   male       45   66
##         
##          female male
##   female   1295 1180
##   male       45   59
##         
##          female male
##   female   1284 1160
##   male       56   78
##         
##          female male
##   female   1338 1234
##   male        2    5
##         
##          female male
##   female   1339 1238
##   male        1    1
##         
##          female male
##   female   1320 1209
##   male       20   30
##         
##          female male
##   female   1339 1238
##   male        1    1
##         
##          female male
##   female   1340 1236
##   male        0    2
##         
##          female male
##   female   1241 1143
##   male       99   96
##         
##          female male
##   female   1227 1137
##   male      113  102
##         
##          female male
##   female   1246 1147
##   male       94   92
##         
##          female male
##   female   1258 1146
##   male       82   93
##         
##          female male
##   female   1257 1120
##   male       83  118
##         
##          female male
##   female   1262 1158
##   male       78   81
##         
##          female male
##   female   1264 1159
##   male       76   80
##         
##          female male
##   female   1288 1164
##   male       52   75
##         
##          female male
##   female   1294 1178
##   male       46   61
##         
##          female male
##   female   1282 1155
##   male       58   83
##         
##          female male
##   female   1237 1141
##   male      103   98
##         
##          female male
##   female   1231 1136
##   male      109  103
##         
##          female male
##   female   1236 1134
##   male      104  105
##         
##          female male
##   female   1257 1144
##   male       83   95
##         
##          female male
##   female   1256 1118
##   male       84  120

mean_exactitud_model

## [1] 0.1039243 0.1039786 0.1042655 0.1042384 0.1042687 0.1043922 0.1044096

max_exactitud_model <- which.max(mean_exactitud_model)
max_exactitud_model

## [1] 7

models_list[max_exactitud_model]

## [[1]]
## [1] "fav_number"    "retweet_count" "tweet_count"

Proyecto Final - Clasificacion De Genero De Un Usuario de Twitter

Cesar Tinoco Alvarez - 13003387

Descripción Del proyecto

Desarrollo

Muchas Gracias