Introducción

El objetivo del proyecto es la creación de un modelo de aprendizaje automático que determine el sexo de un usuario de Twitter (mujer, hombre o marca) a través de un tweet elegido de forma aleatoria y de datos generales de su perfil de Twitter.

Este informe describe los pasos realizados para la carga, limpieza y exploración de los datos de entrada que se utilizarán para entrenar y evaluar los modelos.

Descripción de los datos

Los datos utilizados son:

Conjunto de datos principal que contiene el género de un número limitado de usuarios de Twitter. Los datos se describen y están disponibles en Kaggle.
Base de datos de ciudades del mundo que incluye su zona horaria, así como la lista de zonas horarias y abreviaturas más comunes para poder interpretar, en la medida de lo posible, la zona horaria del usuario incluida en los datos.

###############################################################################
# User-defined functions
###############################################################################

partsOfTheDay <- function(h) {
  # Classifies hours of the day into named parts of the day.
  #
  # Args:
  #   h: Numeric vector of hours (0-23). May contain NA.
  #
  # Returns:
  #   Character vector of the same length as h:
  #     "Early_morning"    6 to 9
  #     "Late_morning"     10 to 12
  #     "Early_afternoon"  13 to 15
  #     "Late_afternoon"   16 to 18
  #     "Early_evening"    19 to 21
  #     "Late_evening"     22 onwards, and 0 (midnight)
  #     "Night"            everything else (1 to 5)
  #   NA hours yield NA.
  partday <- rep("Night", length(h))
  # which() drops NA comparisons, so missing hours never select an element.
  partday[which(h >= 6 & h <= 9)] <- "Early_morning"
  partday[which(h >= 10 & h <= 12)] <- "Late_morning"
  partday[which(h >= 13 & h <= 15)] <- "Early_afternoon"
  partday[which(h >= 16 & h <= 18)] <- "Late_afternoon"
  partday[which(h >= 19 & h <= 21)] <- "Early_evening"
  partday[which(h >= 22 | h == 0)] <- "Late_evening"
  # A missing hour has no part of the day; do not report it as "Night".
  partday[is.na(h)] <- NA_character_
  partday
}

# myReadImage <- function(f){
#   require(bmp)
#   require(jpeg)
#   require(caTools)
#   require(png)
#   ext <- tolower(tail(unlist(strsplit(f, ".", fixed= TRUE)),1)) # Extension
#   result <- tryCatch({
#     if (ext == "png"){
#       readPNG(f)
#     }else if (ext == "gif"){
#       read.gif(f)
#     }else if (ext == "bmp"){
#       read.bmp(f)
#     }else{ #Try JPG
#       readJPEG(f)
#     }
#   }, error = function(e){
#     99
#   })
#   close(f)
#   result
# }

wordProb <- function(s, pDT) {
  # Looks up the probability row of a single term in a term table.
  #
  # Args:
  #   s:   Term to look up (a single string). It is compared literally, so
  #        characters such as "." or "(" have no special meaning.
  #   pDT: data.frame / data.table with a Word column plus the frequency and
  #        probability columns filled in below.
  #
  # Returns:
  #   The row(s) of pDT whose Word equals s. When the term is not in the
  #   table, a one-row placeholder built from the first row of pDT with
  #   Word = "unknown", all frequencies 0 and all probabilities 1, so that an
  #   unseen term is neutral when probabilities are multiplied together.
  hit <- which(as.character(pDT$Word) == s)
  if (length(hit) > 0) { # Observed
    return(pDT[hit, ])
  }

  # Not observed: reuse the first row only as a template for the columns.
  r <- pDT[1, ]
  r$Word <- "unknown"
  r$Frequency.b <- 0
  r$Frequency.f <- 0
  r$Frequency.m <- 0
  r$TotalFrequency <- 0
  r$WordProb <- 1
  r$probBrand <- 1
  r$probFemale <- 1
  r$probMale <- 1
  r
}

genderProbability <- function(text, prob, pF, pM, pB) {
  # Scores a sentence against each gender with a naive-Bayes style product.
  #
  # Stop words are removed from the sentence and the remaining terms are
  # stemmed. Each term is looked up with wordProb() and, per gender, the term
  # probabilities are multiplied together, scaled by the gender multiplier
  # (pF, pM, pB) and by 100.
  #
  # Only this scaled numerator is returned. The full expression would also
  # divide by the product of the terms' WordProb values, e.g. for terms 1, 5:
  #   pF * probFemale[1] * probFemale[5] / (WordProb[1] * WordProb[5])
  # That denominator is the same for the three genders and is not applied
  # here.
  #
  # Args:
  #   text: Sentence to score.
  #   prob: Term table handed to wordProb() as pDT.
  #   pF, pM, pB: Multipliers for female, male and brand.
  #
  # Returns:
  #   One-row data.table with columns pFemale, pMale and pBrand.
  library(tm) # unlike require(), stops immediately when tm is not installed

  result <- data.table(pFemale = 0, pMale = 0, pBrand = 0)
  doc <- Corpus(VectorSource(text))
  # Eliminating stopwords:
  doc <- tm_map(doc, removeWords, stopwords("english"))
  # Reducing words to their stems:
  doc <- tm_map(doc, stemDocument, language = "english")
  w <- unlist(str_split(doc[[1]][1], " "))
  if (length(w) > 0) { # There are words
    rows <- lapply(w, wordProb, pDT = prob)
    # Product of one probability column over every looked-up term
    termProduct <- function(column) {
      prod(unlist(lapply(rows, function(r) r[[column]])))
    }
    result$pFemale <- 100 * pF * termProduct("probFemale")
    result$pMale <- 100 * pM * termProduct("probMale")
    result$pBrand <- 100 * pB * termProduct("probBrand")
  }
  result
}

A continuación se muestran distintos aspectos del archivo que contiene el conjunto de datos principal:

# Overview of the input file: size on disk and basic text statistics
fSize <- file.size(f) / 1024^2 # MB
# Character count of every tweet text, kept as a list
twitter <- lapply(enUStwitter$text, nchar)
minLength <- twitter[which.min(twitter)]
maxLength <- twitter[which.max(twitter)] # Greater than 140/280 due to emojis
nbrLines <- length(twitter)
# Words are the whitespace-separated pieces of each tweet text
nbrWords <- sum(lengths(strsplit(enUStwitter$text, "\\s+")))
source <- "Twitter"
att <- c("Fuente", "Tamaño (MB)", "Nr. palabras", "Nr. líneas", 
         "Mín. tamaño del texto", "Máx. tamaño del texto")
fileDescription <- setNames(
  data.frame(source, fSize, nbrWords, nbrLines,
             unlist(minLength), unlist(maxLength)),
  att
)
fileDescription

##    Fuente Tamaño (MB) Nr. palabras Nr. líneas Mín. tamaño del texto
## 1 Twitter    7.797946       310838      20050                     6
##   Máx. tamaño del texto
## 1                   425

A modo de resumen, el contenido del conjunto de datos inicial es:

###############################################################################
# Data overview
###############################################################################

# Per-column summary of the initial data set
summary(enUStwitter)

##     _unit_id          _golden        _unit_state        _trusted_judgments
##  Min.   :815719226   Mode :logical   Length:20050       Min.   :  3.000   
##  1st Qu.:815724318   FALSE:20000     Class :character   1st Qu.:  3.000   
##  Median :815729384   TRUE :50        Mode  :character   Median :  3.000   
##  Mean   :815729449   NA's :0                            Mean   :  3.616   
##  3rd Qu.:815734514                                      3rd Qu.:  3.000   
##  Max.   :815757985                                      Max.   :274.000   
##                                                                           
##  _last_judgment_at     gender          gender:confidence
##  Length:20050       Length:20050       Min.   :0.0000   
##  Class :character   Class :character   1st Qu.:0.6778   
##  Mode  :character   Mode  :character   Median :1.0000   
##                                        Mean   :0.8828   
##                                        3rd Qu.:1.0000   
##                                        Max.   :1.0000   
##                                        NA's   :26       
##   profile_yn        profile_yn:confidence   created         
##  Length:20050       Min.   :0.6272        Length:20050      
##  Class :character   1st Qu.:1.0000        Class :character  
##  Mode  :character   Median :1.0000        Mode  :character  
##                     Mean   :0.9932                          
##                     3rd Qu.:1.0000                          
##                     Max.   :1.0000                          
##                                                             
##  description          fav_number     gender_gold         link_color       
##  Length:20050       Min.   :     0   Length:20050       Length:20050      
##  Class :character   1st Qu.:    11   Class :character   Class :character  
##  Mode  :character   Median :   456   Mode  :character   Mode  :character  
##                     Mean   :  4382                                        
##                     3rd Qu.:  3316                                        
##                     Max.   :341621                                        
##                                                                           
##      name           profile_yn_gold    profileimage      
##  Length:20050       Length:20050       Length:20050      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  retweet_count      sidebar_color          text          
##  Min.   :  0.0000   Length:20050       Length:20050      
##  1st Qu.:  0.0000   Class :character   Class :character  
##  Median :  0.0000   Mode  :character   Mode  :character  
##  Mean   :  0.0794                                        
##  3rd Qu.:  0.0000                                        
##  Max.   :330.0000                                        
##                                                          
##  tweet_coord         tweet_count      tweet_created     
##  Length:20050       Min.   :      1   Length:20050      
##  Class :character   1st Qu.:   2398   Class :character  
##  Mode  :character   Median :  11442   Mode  :character  
##                     Mean   :  38925                     
##                     3rd Qu.:  40028                     
##                     Max.   :2680199                     
##                                                         
##     tweet_id         tweet_location     user_timezone     
##  Min.   :6.587e+17   Length:20050       Length:20050      
##  1st Qu.:6.587e+17   Class :character   Class :character  
##  Median :6.587e+17   Mode  :character   Mode  :character  
##  Mean   :6.587e+17                                        
##  3rd Qu.:6.587e+17                                        
##  Max.   :6.587e+17                                        
##

Como ejemplo, se muestran las 6 primeras filas:

# First rows of the initial data set
head(enUStwitter)

##    _unit_id _golden _unit_state _trusted_judgments _last_judgment_at
## 1 815719226   FALSE   finalized                  3    10/26/15 23:24
## 2 815719227   FALSE   finalized                  3    10/26/15 23:30
## 3 815719228   FALSE   finalized                  3    10/26/15 23:33
## 4 815719229   FALSE   finalized                  3    10/26/15 23:10
## 5 815719230   FALSE   finalized                  3     10/27/15 1:15
## 6 815719231   FALSE   finalized                  3     10/27/15 1:47
##   gender gender:confidence profile_yn profile_yn:confidence        created
## 1   male            1.0000        yes                     1   12/5/13 1:48
## 2   male            1.0000        yes                     1  10/1/12 13:51
## 3   male            0.6625        yes                     1 11/28/14 11:30
## 4   male            1.0000        yes                     1  6/11/09 22:39
## 5 female            1.0000        yes                     1  4/16/14 13:23
## 6 female            1.0000        yes                     1  3/11/10 18:14
##                                                                                                                                                        description
## 1                                                                                                                                            i sing my own rhythm.
## 2                                                                                                   I'm the author of novels filled with family drama and romance.
## 3                                                                                                                              louis whining and squealing and all
## 4               Mobile guy.  49ers, Shazam, Google, Kleiner Perkins, Yahoo!, Sprint PCS, AirTouch, Air Force.  Stanford GSB, UVa.  Dad, Husband, Brother.  Golfer.
## 5 Ricky Wilson The Best FRONTMAN/Kaiser Chiefs The Best BAND Xxxx Thank you Kaiser Chiefs for an incredible year of gigs and memories to cherish always :) Xxxxxxx
## 6                                                                                                                                               you don't know me.
##   fav_number gender_gold link_color           name profile_yn_gold
## 1          0                 08C2C2        sheezy0                
## 2         68                 0084B4    DavdBurnett                
## 3       7696                 ABB8C2 lwtprettylaugh                
## 4        202                 0084B4    douggarland                
## 5      37318                 3B94D9   WilfordGemma                
## 6       3901                 F5ABB5  monroevicious                
##                                                                   profileimage
## 1  https://pbs.twimg.com/profile_images/414342229096808449/fYvzqXN7_normal.png
## 2 https://pbs.twimg.com/profile_images/539604221532700673/WW16tBbU_normal.jpeg
## 3  https://pbs.twimg.com/profile_images/657330418249658368/SBLCXdF7_normal.png
## 4           https://pbs.twimg.com/profile_images/259703936/IMG_8444_normal.JPG
## 5 https://pbs.twimg.com/profile_images/564094871032446976/AOfpk-mr_normal.jpeg
## 6  https://pbs.twimg.com/profile_images/656336865033850880/ougQS3q7_normal.jpg
##   retweet_count sidebar_color
## 1             0        FFFFFF
## 2             0        C0DEED
## 3             1        C0DEED
## 4             0        C0DEED
## 5             0             0
## 6             0             0
##                                                                                                                                          text
## 1                               Robbie E Responds To Critics After Win Against Eddie Edwards In The #WorldTitleSeries https://t.co/NSybBmVjKZ
## 2 ÛÏIt felt like they were my friends and I was living the story with themÛ https://t.co/arngE0YHNO #retired #IAN1 https://t.co/CIzCANPQFz
## 3                                                            i absolutely adore when louis starts the songs it hits me hard but it feels good
## 4  Hi @JordanSpieth - Looking at the url - do you use @IFTTT?!  Don't typically see an advanced user on the @PGATOUR! https://t.co/H68ou5PE9L
## 5                                             Watching Neighbours on Sky+ catching up with the Neighbs!! Xxx _Ù÷Ä_Ù÷Ä_Ù÷Ä_ÙÔÎ_ÙÈ_ÙÔ_ÙÈ Xxx
## 6                                                           Ive seen people on the train with lamps, chairs, tvs etc  https://t.co/w6zf4pVM4I
##   tweet_coord tweet_count  tweet_created   tweet_id  tweet_location
## 1                  110964 10/26/15 12:40 6.5873e+17 main; @Kan1shk3
## 2                    7471 10/26/15 12:40 6.5873e+17                
## 3                    5617 10/26/15 12:40 6.5873e+17          clcncl
## 4                    1693 10/26/15 12:40 6.5873e+17   Palo Alto, CA
## 5                   31462 10/26/15 12:40 6.5873e+17                
## 6                   20036 10/26/15 12:40 6.5873e+17 New York Gritty
##                user_timezone
## 1                    Chennai
## 2 Eastern Time (US & Canada)
## 3                   Belgrade
## 4 Pacific Time (US & Canada)
## 5                           
## 6 Central Time (US & Canada)

Se observa que hay textos de tweets que superan el máximo número de caracteres permitidos en Twitter. En concreto, el texto de tweet más corto del conjunto de datos es:

# Shortest tweet text (`twitter` holds the character count of each tweet):
enUStwitter$text[which.min(twitter)]

## [1] "#NAME?"

Y el más largo, cuya longitud se debe a la decodificación de emojis, es:

# Longest tweet text (`twitter` holds the character count of each tweet):
enUStwitter$text[which.max(twitter)]

## [1] "When You A Trap Wife and Everybody wants you _Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷__Ù÷_"

Limpieza y transformación de los datos

El conjunto de datos principal va a comenzar el proceso de limpieza y transformación para dar lugar al conjunto de datos que se utilizará para nuestro análisis. Grosso modo, el proceso consistirá en:

Utilización de la zona horaria del usuario para referir todas las fechas aportadas en el conjunto de datos a la misma zona horaria (GMT), para así poder operar, minimizando el error cometido, y calcular nuevos features como el número de tweets enviados por día desde la fecha en que se creó el perfil de Twitter hasta el momento en el que se capturaron los datos (aproximado).
Limpiar tanto el texto del tweet como la descripción del perfil del usuario. El proceso de limpieza consta de los siguientes pasos:
- Decodificar correctamente posibles caracteres HTML.
- Escribir correctamente abreviaturas.
- Extraer caracteres no ASCII que suelen corresponder a emojis.
- Eliminar del texto las cadenas de caracteres creadas por ser un retweet. Si un tweet es un retweet, el texto del tweet no sería característico del usuario, sólo el hecho de “retweetear” en principio. Por tanto, aunque en este informe no se ha hecho, al utilizarse el texto del tweet como feature, estos datos deberían eliminarse.
- Extraer referencias a enlaces web.
- Extraer referencias a emails.
- Extraer tags, bien típicos de Twitter u otros.
- Extraer hashtags.
- Eliminar números del texto.
- Evitar contracciones del inglés sustituyéndolas por su versión completa, en la medida de lo posible.
- El texto que contiene sólo palabras tras este proceso de limpieza (también elimina signos de puntuación por ejemplo) se encontraría en otro feature en el data set, y sería convertido a minúsculas.
Crear un feature que indique si se dispone o no de las coordenadas del tweet.
Crear otros features como el número de emails en el texto del tweet o en la descripción del perfil que puedan ayudar a caracterizar el comportamiento del usuario de Twitter según su género.

Los únicos elementos que no se consideran en este momento son la imagen de perfil y los emojis del texto.

Se muestra a modo ilustrativo el ejemplo para el usuario con id 815719375. En el caso de dicho usuario, el texto del tweet antes de ser procesado era:

# Example of tweet before the cleaning process (raw text for one user id):
enUStwitter$text[enUStwitter$`_unit_id`=="815719375"]

## [1] "ÛÏ@thacelebritea Seen on the Scene: Spotted #RavenSymone out and about earlier this week [r.p @theybfdaily ] https://t.co/sjNH93a0L3Û ÷¼ü÷¼ü÷¼ü"

Tras el proceso de limpieza, el texto es como sigue:

# Example of tweet after the cleaning process (text column of tidytwitter):
tidytwitter$text[tidytwitter$`_unit_id`=="815719375"]

## [1] "@ Seen on the Scene: Spotted out and about earlier this week [r.p @ ]"

Y si se elimina todo lo que no sean palabras, quedaría:

# Same tweet reduced to words only (textWords column of tidytwitter):
tidytwitter$textWords[tidytwitter$`_unit_id`=="815719375"]

## [1] "seen on the scene spotted out and about earlier this week r p"

El nuevo conjunto de datos tras este proceso de transformación para este usuario concreto es:

# Full row of the transformed data set for the same user id:
tidytwitter[tidytwitter$`_unit_id`=="815719375",]

##      _unit_id _golden _unit_state _trusted_judgments _last_judgment_at
## 150 815719375   FALSE   finalized                  3     10/27/15 1:43
##     gender gender:confidence profile_yn profile_yn:confidence      created
## 150 female                 1        yes                     1 3/1/15 22:10
##                                                                                                                                                 description
## 150 24/CreoleMixed/BornInNewOrleans,RaisedInTexas/Love God,Family,Friends,Education,$,Sleep,Text,Food,Movies,Music,Clothes, Makeup,Nature,Animals & Colors.
##     fav_number gender_gold link_color        name profile_yn_gold
## 150          3                 D02B55 msrainerain                
##                                                                    profileimage
## 150 https://pbs.twimg.com/profile_images/649838232021004289/qG3uu_gw_normal.jpg
##     retweet_count sidebar_color
## 150             0        829D5E
##                                                                      text
## 150 @ Seen on the Scene: Spotted out and about earlier this week [r.p @ ]
##     tweet_coord tweet_count  tweet_created   tweet_id tweet_location
## 150                   45295 10/26/15 12:40 6.5873e+17     Houston,Tx
##     user_timezone    profile_creation      tweet_creation
## 150               2015-03-01 21:10:00 2015-10-26 11:40:00
##     part_day_creation last_judgement_date   profileDays tweetsPerDay
## 150      Late_morning 2015-10-27 01:43:00 239.0181 days     189.5045
##     retweetsPerDay
## 150              0
##                                                               nonASCIIText
## 150 ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?
##     nonASCIIDesc RetweetOrigin                URLsText URLsDesc EmailsText
## 150           NA            NA https://t.co/sjNH93a0L3       NA         NA
##     EmailsDesc              TwitterTagsText TwitterTagsDesc OtherTagsText
## 150         NA @thacelebritea, @theybfdaily              NA          @, @
##     OtherTagsDesc HashTagsText HashTagsDesc
## 150            NA #RavenSymone           NA
##                                                         textWords
## 150 seen on the scene spotted out and about earlier this week r p
##                                                                                                                                   descriptionWords
## 150 creolemixed borninneworleans raisedintexas love god family friends education sleep text food movies music clothes makeup nature animals colors
##     hasCoordinates nbrnonASCIIText nbrnonASCIIDesc nbrRetweetOrigin
## 150          FALSE              24               0                0
##     nbrURLsText nbrURLsDesc nbrEmailsText nbrEmailsDesc nbrTwitterTagsText
## 150           1           0             0             0                  2
##     nbrTwitterTagsDesc nbrOtherTagsText nbrOtherTagsDesc nbrTotalTagsText
## 150                  0                2                0                4
##     nbrTotalTagsDesc nbrHashTagsText nbrHashTagsDesc
## 150                0               1               0

Finalmente, extraemos la parte del conjunto de datos para el benchmarking final. Seguiremos la clasificación realizada por el equipo que creó este conjunto de datos, aunque, tras el proceso de limpieza, para el testeo sólo se cuenta con 48 muestras de las casi 20000 muestras válidas, una muestra insuficiente, pero que nos va a ayudar a la comparación de resultados.

Las dimensiones del conjunto de datos para entrenamiento del algoritmo son:

# Dimensions (rows, columns) of tidytwitterData
dim(tidytwitterData)

## [1] 18786    54

Mientras que las dimensiones del conjunto de datos para la evaluación final son:

# Dimensions (rows, columns) of tidytwitterGold
dim(tidytwitterGold)

## [1] 48 54

Exploración de los datos

En esta etapa se realizará la exploración de los datos que comenzará observando las palabras que utilizan los usuarios del mismo género en el texto del tweet. Para realizar un análisis más significativo del texto, se eliminarán las stopwords, se considerarán las raíces de las palabras (stemming) y sólo aquellas con una longitud mayor a un carácter.

###############################################################################
###############################################################################
# Exploratory analysis
###############################################################################
###############################################################################

# Quanteda is not available for this R version
# Going ahead with tm package
library(tm)
library(wordcloud)

###############################################################################
# Text analysis
###############################################################################

# Build one cleaned corpus per gender plus one for all tweets.
# gender:confidence and profile_yn:confidence are not used at this stage.

# Word-only tweet text of the rows labelled with the given gender
wordsByGender <- function(label) {
  tidytwitterData$textWords[tidytwitterData$gender == label]
}

vsMales <- VectorSource(wordsByGender("male"))
vsFemales <- VectorSource(wordsByGender("female"))
vsBrand <- VectorSource(wordsByGender("brand"))
vsAll <- VectorSource(tidytwitterData$textWords)

# Turns a source into a corpus, removes English stop words and stems the rest
stemmedCorpus <- function(src) {
  corpus <- tm_map(Corpus(src), removeWords, stopwords("english"))
  tm_map(corpus, stemDocument, language = "english")
}

docsMales <- stemmedCorpus(vsMales)
docsFemales <- stemmedCorpus(vsFemales)
docsBrands <- stemmedCorpus(vsBrand)
docsAll <- stemmedCorpus(vsAll)

# # Step discarded... awkward results plus 
# # https://www.r-bloggers.com/
# # help-stemming-and-stem-completion-with-package-tm-in-r/
# # Creating a dictionary to be used for stemcompletion:
# docsAll <- tm_map(docsAll, removeWords, stopwords("english"))
# # Completing radicals as words
# docsMales <- tm_map(docsMales, 
#                     stemCompletion, 
#                     dictionary=docsAll, 
#                     type = "shortest")
# docsFemales <- tm_map(docsFemales, 
#                       stemCompletion, 
#                       dictionary=docsAll, 
#                       type = "shortest")
# docsBrands <- tm_map(docsBrands, 
#                      stemCompletion, 
#                      dictionary=docsAll, 
#                      type = "shortest")

# Term-document matrices: the starting point for the frequency counts below
# and for tasks such as clustering, classification or association analysis.
# NOTE(review): `minWordLength` is passed inside `bounds`, which is not where
# tm documents a minimum term length (that is `wordLengths`), so it is
# presumably ignored and tm's default length filter applies - confirm.

# Per-gender matrices: terms present in at least 1 document
genderControl <- list(bounds = list(minWordLength = 1, global = c(1, Inf)))
malesTDM <- TermDocumentMatrix(docsMales, control = genderControl)
femalesTDM <- TermDocumentMatrix(docsFemales, control = genderControl)
brandsTDM <- TermDocumentMatrix(docsBrands, control = genderControl)

# Overall matrix: terms present in at least 2 documents
allControl <- list(bounds = list(minWordLength = 1, global = c(2, Inf)))
allTDM <- TermDocumentMatrix(docsAll, control = allControl)

# findAssocs(____TDM, "term", correlation_limit) is good to find how terms
# are related. For instance:
# > findAssocs(brandsTDM, "weather", 0.3)
# $weather
# updat channel     get 
# 0.98    0.98    0.88 
# 
# > ?findAssocs
# > findAssocs(malesTDM, "weather", 0.3)
# $weather
# numeric(0)
# 
# > findAssocs(femalesTDM, "weather", 0.3)
# $weather
# channel 
# 0.45 
# > findAssocs(brandsTDM, c("weather","updat","channel"), 0.3)
# $weather
# get 
# 0.88 
# 
# $updat
# get 
# 0.86 
# 
# $channel
# get 
# 0.86 
# > findAssocs(brandsTDM, c("weather","updat","channel","love"), 0.3)
# $weather
# get 
# 0.88 
# 
# $updat
# get 
# 0.86 
# 
# $channel
# get 
# 0.86 
# 
# $love
# fool 
# 0.39 
# findAssocs(allTDM,"male", 0.01)
# findAssocs(allTDM, "female",0.001)
# findAssocs(allTDM, "brand",0.01)

# findFreqTerms(___TDM, lowfreq=10) also lists frequent terms, but it does
# not order them by how often they appear. Instead, total the counts of each
# term over all documents and sort them from most to least frequent.

# Frequency table whose Word factor levels follow increasing frequency
orderedWordTable <- function(freq) {
  tbl <- data.frame(Word = names(freq), Frequency = freq)
  tbl$Word <- factor(tbl$Word, levels = tbl$Word[order(tbl$Frequency)])
  tbl
}

males <- as.matrix(malesTDM)
vmales <- sort(rowSums(males), decreasing = TRUE)
dmales <- orderedWordTable(vmales)

females <- as.matrix(femalesTDM)
vfemales <- sort(rowSums(females), decreasing = TRUE)
dfemales <- orderedWordTable(vfemales)

brands <- as.matrix(brandsTDM)
vbrands <- sort(rowSums(brands), decreasing = TRUE)
dbrands <- orderedWordTable(vbrands)

A partir de la frecuencia con la que los usuarios de Twitter de los distintos géneros utilizan cada uno de los términos, se determinará la probabilidad de pertenecer a un cierto género dada la frase escrita en el tweet o la descripción de su perfil.

# Combine the three per-gender frequency tables into one table keyed by Word
m <- as.data.table(dmales)
f <- as.data.table(dfemales)
b <- as.data.table(dbrands)
termProb <- merge(m, f, by = "Word", all = TRUE, suffixes = c(".m", ".f"))
termProb <- merge(termProb, b, by = "Word", all = TRUE)
# The brand counts arrive as the 4th column, still named "Frequency"
setnames(termProb, 4, "Frequency.b")

# A term missing for a gender comes out of the merge as NA: count it as 0
for (col in names(termProb)) {
  naRows <- which(is.na(termProb[[col]]))
  set(termProb, i = naRows, j = col, value = 0)
}
termProb$TotalFrequency <- with(termProb,
                                Frequency.b + Frequency.m + Frequency.f)

# Deleting terms that are said only once:
termProb <- termProb[TotalFrequency > 1]

# Share of each entry in the total of its column
share <- function(x) {
  x / sum(x)
}
termProb$WordProb <- share(termProb$TotalFrequency)
termProb$probMale <- share(termProb$Frequency.m)
termProb$probFemale <- share(termProb$Frequency.f)
termProb$probBrand <- share(termProb$Frequency.b)

# Share of all term occurrences that belongs to each gender
pFemale <- sum(termProb$Frequency.f) / sum(termProb$TotalFrequency)
pMale <- sum(termProb$Frequency.m) / sum(termProb$TotalFrequency)
pBrand <- sum(termProb$Frequency.b) / sum(termProb$TotalFrequency)

head(termProb)

##       Word Frequency.m Frequency.f Frequency.b TotalFrequency     WordProb
## 1:   typic           1           3           1              5 3.822922e-05
## 2:  lmfaoo           1           1           0              2 1.529169e-05
## 3:    hath           1           0           1              2 1.529169e-05
## 4: nowaday           1           1           0              2 1.529169e-05
## 5:   holla           1           1           0              2 1.529169e-05
## 6:   audit           1           0           1              2 1.529169e-05
##        probMale   probFemale    probBrand
## 1: 2.251644e-05 6.545074e-05 2.466578e-05
## 2: 2.251644e-05 2.181691e-05 0.000000e+00
## 3: 2.251644e-05 0.000000e+00 2.466578e-05
## 4: 2.251644e-05 2.181691e-05 0.000000e+00
## 5: 2.251644e-05 2.181691e-05 0.000000e+00
## 6: 2.251644e-05 0.000000e+00 2.466578e-05

Como ejemplo, la probabilidad de ser mujer dado que se han escrito los términos 1 y 5 de dicha tabla es:

# As an example, the probability of being a woman given that the terms
# used in the text are rows 1 and 5 of termProb:
#   pFemale * probFemale[1] * probFemale[5] / (WordProb[1] * WordProb[5])
(pFemale * termProb$probFemale[1] * termProb$probFemale[5]/
    (termProb$WordProb[1]*termProb$WordProb[5]))

## [1] 0.8560302

Esta solución supone, aunque sabemos que no es cierto, que el uso de cada uno de los términos en el texto o la descripción del perfil es un suceso independiente. Es decir, palabras como “amor” y “pareja” apareciendo en la misma frase son sucesos aleatorios independientes, por lo que la probabilidad de escribirlos ambos es el producto de las probabilidades de que cada uno de ellos haya sido escrito.

En el caso de los hombres, los términos más frecuentes aparecen reflejados en el siguiente wordcloud y gráfica:

###############################################################################
# Males
###############################################################################

# Wordcloud of the terms used by males, limited to terms with a frequency
# of at least 40 and coloured with the last five shades of the 9-colour
# "BuGn" palette.
pal <- brewer.pal(9, "BuGn")[5:9]
wordcloud(words = dmales$Word, freq = dmales$Frequency,
          min.freq = 40, colors = pal)

## Warning in wordcloud(dmales$Word, dmales$Frequency, min.freq = 40, colors =
## pal): like could not be fit on page. It will not be plotted.

# Horizontal bar chart of the first 20 rows of dmales (most frequent terms)
g <- ggplot(dmales[1:20, ], aes(x = Word, y = Frequency)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Las 20 palabras más frecuentes usadas por hombres")
g

# Number of words needed to cover 50% and 90% of the word instances for males.
# Of a total of 8743 words, 50% of the instances in text would be achieved
# with 425 words while 90% with 4007 words.
cum <- cumsum(dmales$Frequency)
cut50 <- 0.5 * sum(dmales$Frequency)
# Position of the first cumulative count that reaches the threshold. Counting
# the entries strictly below it keeps a cumulative count that lands exactly
# on the threshold from being reported as needing one extra word.
words50 <- sum(cum < cut50) + 1
cut90 <- 0.9 * sum(dmales$Frequency)
words90 <- sum(cum < cut90) + 1

Teniendo en cuenta que el conjunto de textos de los tweets tiene un total de 8743 palabras, el 50% de los términos usados en el texto se alcanzarían con 425 palabras, mientras que el 90% se alcanza con 4007.

Repetimos el análisis para el caso de las mujeres, observando los términos más frecuentes:

###############################################################################
# Females
###############################################################################

# Wordcloud of the terms that appear at least 40 times (reuses palette `pal`)
wordcloud(dfemales$Word, dfemales$Frequency, min.freq = 40, colors = pal)

# Bar chart of the 20 most common unigrams. head() is used instead of
# [1:20, ] so that a table with fewer than 20 terms does not yield NA rows.
g <- ggplot(head(dfemales, 20), aes(x = Word, y = Frequency)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Las 20 palabras más frecuentes usadas por mujeres")
g

# Number of distinct words needed to cover 50% and 90% of all term instances
# in the females' tweets (the resulting figures are quoted in the report
# text; the previous comment still carried the males' numbers).
# Assumes dfemales is sorted by decreasing Frequency -- TODO confirm where it
# is built. The position of the first cumulative total that reaches the cutoff
# is taken, so a total landing exactly on the cutoff counts as reaching it.
cum <- cumsum(dfemales$Frequency)
cut50 <- 0.5 * sum(dfemales$Frequency)
words50 <- match(TRUE, cum >= cut50)
cut90 <- 0.9 * sum(dfemales$Frequency)
words90 <- match(TRUE, cum >= cut90)

En este caso, el conjunto de textos de los tweets tienen un total de 8289 palabras, el 50% de los términos usados en el texto se alcanzarían con 320 palabras, mientras que el 90% se alcanza con 3631.

Por último, los tweets de marcas o empresas utilizan los siguientes términos:

###############################################################################
# Brands
###############################################################################

# Wordcloud of the terms that appear at least 40 times (reuses palette `pal`)
wordcloud(dbrands$Word, dbrands$Frequency, min.freq = 40, colors = pal)

# Bar chart of the 20 most common unigrams. head() is used instead of
# [1:20, ] so that a table with fewer than 20 terms does not yield NA rows.
g <- ggplot(head(dbrands, 20), aes(x = Word, y = Frequency)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Las 20 palabras más frecuentes usadas por marcas")
g

# Number of distinct words needed to cover 50% and 90% of all term instances
# in the brands' tweets (the resulting figures are quoted in the report text;
# the previous comment still carried the males' numbers).
# Assumes dbrands is sorted by decreasing Frequency -- TODO confirm where it
# is built. The position of the first cumulative total that reaches the cutoff
# is taken, so a total landing exactly on the cutoff counts as reaching it.
cum <- cumsum(dbrands$Frequency)
cut50 <- 0.5 * sum(dbrands$Frequency)
words50 <- match(TRUE, cum >= cut50)
cut90 <- 0.9 * sum(dbrands$Frequency)
words90 <- match(TRUE, cum >= cut90)

El conjunto de textos de los tweets tienen un total de 8366 palabras, el 50% de los términos usados en el texto se alcanzarían con 382 palabras, mientras que el 90% se alcanza con 4040.

El mismo análisis realizado para el texto del tweet se puede repetir para la descripción del perfil de Twitter de los usuarios, creando la tabla de probabilidades que nos permitirá calcular la probabilidad de ser hombre, mujer o marca dados los términos usados en la descripción del perfil de Twitter.

###############################################################################
# Description analysis
###############################################################################

# Text sources: the `descriptionWords` column for every profile, and the same
# column restricted to each gender label.
vsAll <- VectorSource(tidytwitterData$descriptionWords)
vsMales <- VectorSource(
  with(tidytwitterData, descriptionWords[gender == "male"]))
vsFemales <- VectorSource(
  with(tidytwitterData, descriptionWords[gender == "female"]))
vsBrand <- VectorSource(
  with(tidytwitterData, descriptionWords[gender == "brand"]))

# Each corpus goes through the same two steps, applied inside-out:
#   1. remove English stopwords
#   2. stem every remaining word (reduce it to its radical)
docsMales <- tm_map(
  tm_map(Corpus(vsMales), removeWords, stopwords("english")),
  stemDocument, language = "english"
)
docsFemales <- tm_map(
  tm_map(Corpus(vsFemales), removeWords, stopwords("english")),
  stemDocument, language = "english"
)
docsBrands <- tm_map(
  tm_map(Corpus(vsBrand), removeWords, stopwords("english")),
  stemDocument, language = "english"
)
docsAll <- tm_map(
  tm_map(Corpus(vsAll), removeWords, stopwords("english")),
  stemDocument, language = "english"
)

# Building the term-document matrices for the profile descriptions.
# Many data mining tasks can be based on these matrices, for example
# clustering, classification and association analysis.
#
# NOTE(review): the original control list nested `minWordLength = 1` inside
# `bounds`. tm only reads `local`/`global` from `bounds`, so that entry was
# ignored and the default minimum term length of 3 characters applied. The
# effective setting is spelled out below as `wordLengths = c(3, Inf)` so the
# results stay unchanged. If terms of 1-2 characters were really wanted, use
# `wordLengths = c(1, Inf)` -- that changes every figure quoted in the report.
#
# `global = c(1, Inf)` keeps every term that appears in at least one document.
malesTDM <- TermDocumentMatrix(
  docsMales,
  control = list(wordLengths = c(3, Inf), bounds = list(global = c(1, Inf)))
)
femalesTDM <- TermDocumentMatrix(
  docsFemales,
  control = list(wordLengths = c(3, Inf), bounds = list(global = c(1, Inf)))
)
brandsTDM <- TermDocumentMatrix(
  docsBrands,
  control = list(wordLengths = c(3, Inf), bounds = list(global = c(1, Inf)))
)

# For the matrix with all descriptions only terms present in at least two
# documents are kept (`global = c(2, Inf)`).
allTDMDesc <- TermDocumentMatrix(
  docsAll,
  control = list(wordLengths = c(3, Inf), bounds = list(global = c(2, Inf)))
)



# Exploring data: per-gender term frequency tables for the descriptions.

# Dense matrix copies of the three term-document matrices
males <- as.matrix(malesTDM)
females <- as.matrix(femalesTDM)
brands <- as.matrix(brandsTDM)

# Row totals (one per term), most frequent term first
vmales <- sort(rowSums(males), decreasing = TRUE)
vfemales <- sort(rowSums(females), decreasing = TRUE)
vbrands <- sort(rowSums(brands), decreasing = TRUE)

# One row per term: the term itself and its total count
dmales <- data.frame(Word = names(vmales), Frequency = vmales)
dfemales <- data.frame(Word = names(vfemales), Frequency = vfemales)
dbrands <- data.frame(Word = names(vbrands), Frequency = vbrands)

# Turn Word into a factor whose levels follow ascending frequency
dmales$Word <- with(dmales, factor(Word, levels = Word[order(Frequency)]))
dfemales$Word <- with(dfemales, factor(Word, levels = Word[order(Frequency)]))
dbrands$Word <- with(dbrands, factor(Word, levels = Word[order(Frequency)]))

# Merging the three per-gender frequency tables to compute the probabilities
# required by the Naive Bayes style calculation.
m <- as.data.table(dmales)
f <- as.data.table(dfemales)
b <- as.data.table(dbrands)

# Full outer joins on Word, so a term used by only one gender is still kept.
termDProb <- merge(m, f, all = TRUE, by = "Word", suffixes = c(".m", ".f"))
termDProb <- merge(termDProb, b, all = TRUE, by = "Word")
# The brands column arrives unsuffixed; rename it by name (in place) instead
# of by position so a change in column order cannot rename the wrong column.
setnames(termDProb, "Frequency", "Frequency.b")

# A term missing for one gender comes out of the join as NA: count it as 0.
for (freqCol in c("Frequency.m", "Frequency.f", "Frequency.b")) {
  set(termDProb, which(is.na(termDProb[[freqCol]])), freqCol, 0)
}
termDProb$TotalFrequency <- termDProb$Frequency.b + termDProb$Frequency.m +
  termDProb$Frequency.f

# Deleting terms that are said only once:
termDProb <- termDProb[termDProb$TotalFrequency > 1, ]

# Share of each term over all remaining term instances, and within each gender
termDProb$WordProb <- termDProb$TotalFrequency / sum(termDProb$TotalFrequency)
termDProb$probMale <- termDProb$Frequency.m / sum(termDProb$Frequency.m)
termDProb$probFemale <- termDProb$Frequency.f / sum(termDProb$Frequency.f)
termDProb$probBrand <- termDProb$Frequency.b / sum(termDProb$Frequency.b)

# Priors: share of all remaining term instances that belongs to each gender
pDFemale <- sum(termDProb$Frequency.f) / sum(termDProb$TotalFrequency)
pDMale <- sum(termDProb$Frequency.m) / sum(termDProb$TotalFrequency)
pDBrand <- sum(termDProb$Frequency.b) / sum(termDProb$TotalFrequency)

head(termDProb)

##      Word Frequency.m Frequency.f Frequency.b TotalFrequency     WordProb
## 1: rhythm           1           0           1              2 1.824917e-05
## 2:    uva           1           1           0              2 1.824917e-05
## 3:  yahoo           1           0           3              4 3.649835e-05
## 4:   nude           1           2           6              9 8.212128e-05
## 5: abound           1           0           1              2 1.824917e-05
## 6:   liar           1           1           0              2 1.824917e-05
##        probMale   probFemale    probBrand
## 1: 2.583646e-05 0.000000e+00 2.775003e-05
## 2: 2.583646e-05 2.869193e-05 0.000000e+00
## 3: 2.583646e-05 0.000000e+00 8.325008e-05
## 4: 2.583646e-05 5.738387e-05 1.665002e-04
## 5: 2.583646e-05 0.000000e+00 2.775003e-05
## 6: 2.583646e-05 2.869193e-05 0.000000e+00

El cálculo se realizaría de la misma manera. A modo de ejemplo, se proporciona la probabilidad de ser mujer dado que en la descripción del perfil de Twitter se han usado los términos 1 y 5 de la tabla de probabilidades:

# Worked example: probability of being a woman given that terms 1 and 5 of
# the description probability table appear in the profile description.
# A term never used by women has probFemale == 0, which forces the whole
# product (and therefore the result) to 0.
with(
  termDProb,
  pDFemale * probFemale[1] * probFemale[5] / (WordProb[1] * WordProb[5])
)

## [1] 0

En el caso de los hombres, las palabras más usadas para la descripción de su perfil son:

###############################################################################
# Males
###############################################################################

# Palette: the five darkest shades of the 9-colour "BuGn" Brewer scale
pal <- brewer.pal(9, "BuGn")[5:9]

# Wordcloud of the description terms that appear at least 40 times
wordcloud(dmales$Word, dmales$Frequency, min.freq = 40, colors = pal)

# Bar chart of the 20 most common unigrams. head() is used instead of
# [1:20, ] so that a table with fewer than 20 terms does not yield NA rows.
g <- ggplot(head(dmales, 20), aes(x = Word, y = Frequency)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Las 20 palabras más frecuentes usadas por hombres")
g

# Number of distinct words needed to cover 50% and 90% of all term instances
# in the males' profile descriptions (the resulting figures are quoted in the
# report text; the previous comment still carried the tweet-text numbers).
# dmales is sorted by decreasing Frequency, so the position of the first
# cumulative total that reaches the cutoff is the number of words needed. A
# total landing exactly on the cutoff counts as reaching it (the previous
# `length(cum) - sum(cum > cut) + 1` reported one word too many then).
cum <- cumsum(dmales$Frequency)
cut50 <- 0.5 * sum(dmales$Frequency)
words50 <- match(TRUE, cum >= cut50)
cut90 <- 0.9 * sum(dmales$Frequency)
words90 <- match(TRUE, cum >= cut90)

El conjunto de textos de las descripciones de perfil de los hombres tienen un total de 10211 palabras, el 50% de los términos usados en el texto se alcanzarían con 533 palabras, mientras que el 90% se alcanza con 5917.

El mismo ejercicio se realiza para las mujeres:

###############################################################################
# Females
###############################################################################

# Wordcloud of the description terms that appear at least 40 times
wordcloud(dfemales$Word, dfemales$Frequency, min.freq = 40, colors = pal)

# Bar chart of the 20 most common unigrams. head() is used instead of
# [1:20, ] so that a table with fewer than 20 terms does not yield NA rows.
g <- ggplot(head(dfemales, 20), aes(x = Word, y = Frequency)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Las 20 palabras más frecuentes usadas por mujeres")
g

# Number of distinct words needed to cover 50% and 90% of all term instances
# in the females' profile descriptions (the resulting figures are quoted in
# the report text; the previous comment still carried the males' tweet-text
# numbers).
# dfemales is sorted by decreasing Frequency, so the position of the first
# cumulative total that reaches the cutoff is the number of words needed. A
# total landing exactly on the cutoff counts as reaching it.
cum <- cumsum(dfemales$Frequency)
cut50 <- 0.5 * sum(dfemales$Frequency)
words50 <- match(TRUE, cum >= cut50)
cut90 <- 0.9 * sum(dfemales$Frequency)
words90 <- match(TRUE, cum >= cut90)

El conjunto de textos de las descripciones de perfil de las mujeres tienen un total de 9665 palabras, el 50% de los términos usados en el texto se alcanzarían con 521 palabras, mientras que el 90% se alcanza con 5789.

Para finalizar, en el caso de las marcas:

###############################################################################
# Brands
###############################################################################

# Wordcloud of the description terms that appear at least 40 times
wordcloud(dbrands$Word, dbrands$Frequency, min.freq = 40, colors = pal)

# Bar chart of the 20 most common unigrams. head() is used instead of
# [1:20, ] so that a table with fewer than 20 terms does not yield NA rows.
g <- ggplot(head(dbrands, 20), aes(x = Word, y = Frequency)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Las 20 palabras más frecuentes usadas por marcas")
g

# Number of distinct words needed to cover 50% and 90% of all term instances
# in the brands' profile descriptions (the resulting figures are quoted in
# the report text; the previous comment still carried the males' tweet-text
# numbers).
# dbrands is sorted by decreasing Frequency, so the position of the first
# cumulative total that reaches the cutoff is the number of words needed. A
# total landing exactly on the cutoff counts as reaching it.
cum <- cumsum(dbrands$Frequency)
cut50 <- 0.5 * sum(dbrands$Frequency)
words50 <- match(TRUE, cum >= cut50)
cut90 <- 0.9 * sum(dbrands$Frequency)
words90 <- match(TRUE, cum >= cut90)

El conjunto de textos de las descripciones de perfil de las marcas tienen un total de 7727 palabras, el 50% de los términos usados en el texto se alcanzarían con 378 palabras, mientras que el 90% se alcanza con 3863.

A continuación, se calculan las probabilidades de ser hombre, mujer o marca según el texto del tweet y la descripción del perfil, lo cual nos dará seis características adicionales que se añadirán al conjunto de datos. De esta forma, los campos de caracteres se han transformado, usando los mismos principios que las redes Bayesianas, en valores numéricos; al considerar que los términos son independientes entre sí, la red Bayesiana sería del tipo Naive Bayes.

Como fase final de la exploración de los datos, se realizarán varias gráficas interactivas donde se podrán observar los valores de cada uno de los features del conjunto de datos para cada uno de los géneros bajo estudio.

###############################################################################
# Plots
###############################################################################

# Interactive count plot: favourited tweets (x) against retweets (y),
# coloured by gender.
g <- ggplot(
  tidytwitterData,
  aes(x = fav_number, y = retweet_count, color = gender)
) +
  geom_count() +
  labs(
    title = "Valores de los features por género",
    x = "Número de tweets favoritos",
    y = "Número de retweets"
  )
ggplotly(g)

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

# Interactive bar chart of the profile link colours, one facet per gender.
# The gender mapping now lives inside aes(): the original passed
# `color = gender` as a stray argument to ggplot(), where it is ignored, so
# the bars were never coloured. `fill` is used because it paints the whole
# bar, while `color` would only draw its outline.
g <- ggplot(tidytwitterData, aes(link_color, fill = gender))
g <- g + geom_bar() + coord_flip() + facet_grid(. ~ gender)
g <- g + ggtitle("Valores de los features por género")
g <- g + xlab("Color del enlace de perfil (hexadecimal)")
ggplotly(g)

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

# Interactive bar chart of the profile sidebar colours, one facet per gender.
# The gender mapping now lives inside aes(): the original passed
# `color = gender` as a stray argument to ggplot(), where it is ignored, so
# the bars were never coloured. `fill` is used because it paints the whole
# bar, while `color` would only draw its outline.
g <- ggplot(tidytwitterData, aes(sidebar_color, fill = gender))
g <- g + geom_bar() + coord_flip() + facet_grid(. ~ gender)
g <- g + ggtitle("Valores de los features por género")
g <- g + xlab("Color de la barra lateral del perfil (hexadecimal)")
ggplotly(g)

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

# Interactive count plot: tweetsPerDay (x) against nbrRetweetOrigin (y),
# coloured by gender.
g <- ggplot(
  tidytwitterData,
  aes(x = tweetsPerDay, y = nbrRetweetOrigin, color = gender)
) +
  geom_count() +
  labs(
    title = "Valores de los features por género",
    x = "Número de tweets al día",
    y = "Número de retweets del tweet"
  )
ggplotly(g)

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

# Interactive count plot: non-ASCII characters in the description (x) against
# non-ASCII characters in the tweet (y), coloured by gender.
g <- ggplot(
  tidytwitterData,
  aes(x = nbrnonASCIIDesc, y = nbrnonASCIIText, color = gender)
) +
  geom_count() +
  labs(
    title = "Valores de los features por género",
    x = "Caracteres no ASCII en la descripción",
    y = "Caracteres no ASCII en el tweet"
  )
ggplotly(g)

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

# Interactive count plot: URLs in the description (x) against URLs in the
# tweet (y), coloured by gender.
g <- ggplot(
  tidytwitterData,
  aes(x = nbrURLsDesc, y = nbrURLsText, color = gender)
)
g <- g + geom_count()
g <- g + labs(
  title = "Valores de los features por género",
  x = "URLs en la descripción",
  y = "URLs en el tweet"
)
ggplotly(g)

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

# Interactive count plot: e-mail addresses in the description (x) against
# e-mail addresses in the tweet (y), coloured by gender.
g <- ggplot(
  tidytwitterData,
  aes(x = nbrEmailsDesc, y = nbrEmailsText, color = gender)
)
g <- g + geom_count()
g <- g + labs(
  title = "Valores de los features por género",
  x = "Número de emails en la descripción",
  y = "Número de emails en el tweet"
)
ggplotly(g)

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

# Interactive count plot: Twitter tags in the description (x) against Twitter
# tags in the tweet (y), coloured by gender.
g <- ggplot(
  tidytwitterData,
  aes(x = nbrTwitterTagsDesc, y = nbrTwitterTagsText, color = gender)
)
g <- g + geom_count()
g <- g + labs(
  title = "Valores de los features por género",
  x = "Número de Twitter tags en la descripción",
  y = "Número de Twitter tags en el tweet"
)
ggplotly(g)

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

# Interactive count plot: other tags in the description (x) against other
# tags in the tweet (y), coloured by gender.
g <- ggplot(
  tidytwitterData,
  aes(x = nbrOtherTagsDesc, y = nbrOtherTagsText, color = gender)
)
g <- g + geom_count()
g <- g + labs(
  title = "Valores de los features por género",
  x = "Número de otros tags en la descripción",
  y = "Número de otros tags en el tweet"
)
ggplotly(g)

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

# Interactive count plot: total tags in the description (x) against total
# tags in the tweet (y), coloured by gender.
g <- ggplot(
  tidytwitterData,
  aes(x = nbrTotalTagsDesc, y = nbrTotalTagsText, color = gender)
)
g <- g + geom_count()
g <- g + labs(
  title = "Valores de los features por género",
  x = "Número total de tags en la descripción",
  y = "Número total de tags en el tweet"
)
ggplotly(g)

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

# Interactive count plot: hashtags in the description (x) against hashtags in
# the tweet (y), coloured by gender.
g <- ggplot(
  tidytwitterData,
  aes(x = nbrHashTagsDesc, y = nbrHashTagsText, color = gender)
)
g <- g + geom_count()
g <- g + labs(
  title = "Valores de los features por género",
  x = "Número de hashtags en la descripción",
  y = "Número de hashtags en el tweet"
)
ggplotly(g)

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

# Interactive scatter plot: descFromFemale (x) against textFromFemale (y),
# coloured by the labelled gender.
g <- ggplot(
  tidytwitterData,
  aes(x = descFromFemale, y = textFromFemale, color = gender)
)
g <- g + geom_point()
g <- g + labs(
  title = "Valores de los features por género",
  x = "Probabilidad de ser mujer dada la descripción",
  y = "Probabilidad de ser mujer dado el texto del tweet"
)
ggplotly(g)

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

# Interactive scatter plot: descFromMale (x) against textFromMale (y),
# coloured by the labelled gender.
# The y aesthetic was `textFromFemale` (copy-paste from the previous plot)
# although the axis label describes the male probability; it now uses
# `textFromMale`.
# NOTE(review): assumes the column is named `textFromMale`, following the
# naming of the sibling columns -- confirm where the features are created.
g <- ggplot(tidytwitterData, aes(x = descFromMale, y = textFromMale,
                                 color = gender)) + geom_point()
g <- g + ggtitle("Valores de los features por género")
g <- g + xlab("Probabilidad de ser hombre dada la descripción")
g <- g + ylab("Probabilidad de ser hombre dado el texto del tweet")
ggplotly(g)

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

# Interactive scatter plot: descFromBrand (x) against textFromBrand (y),
# coloured by the labelled gender.
g <- ggplot(
  tidytwitterData,
  aes(x = descFromBrand, y = textFromBrand, color = gender)
)
g <- g + geom_point()
g <- g + labs(
  title = "Valores de los features por género",
  x = "Probabilidad de ser una marca dada la descripción",
  y = "Probabilidad de ser una marca dado el texto del tweet"
)
ggplotly(g)

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

Métodos de aprendizaje no supervisado

Los términos utilizados tanto en el texto de los tweets como en la descripción pueden analizarse a través de modelos de aprendizaje no supervisados. En nuestro caso, pese a que lo usual en este tipo de análisis es discernir el número de “topics” que se encuentran en el texto, utilizaremos kmeans para intentar crear tres clústers, uno por género.

En este caso, estamos asumiendo que habrá “topics” que sean característicos de los géneros y que se puedan agrupar entre sí.

###############################################################################
# Clustering: K-means
###############################################################################

# There are three groups (male, female, brand), so k-means is asked for three
# clusters of terms, once for the tweet text and once for the descriptions.
# k-means starts from randomly chosen centres, so the cluster numbering is
# arbitrary; the seed is fixed to make the partition reproducible between
# runs, because the comments and the report text refer to clusters by number.
set.seed(1234)
kmeansTDM <- kmeans(allTDM, centers = 3)
kmeansTDMdesc <- kmeans(allTDMDesc, centers = 3)

# Terms assigned to each tweet-text cluster.
# NOTE(review): in the run described in the report, cluster 1 held the
# brand-related terms, cluster 2 the words common to both genders and
# cluster 3 the large remainder. Re-check that reading against the seeded
# run, since the numbering may differ.
c1 <- names(kmeansTDM$cluster[kmeansTDM$cluster == 1])
c2 <- names(kmeansTDM$cluster[kmeansTDM$cluster == 2])
c3 <- names(kmeansTDM$cluster[kmeansTDM$cluster == 3])

Tras analizar los términos utilizados en los textos de los tweets, se observa que el primer clúster está formado sólo por palabras relacionadas con los perfiles de Twitter de “marcas”:

# Wordcloud of the terms in tweet-text cluster 1. Every term gets the same
# weight (1), so the cloud shows membership only, not frequency.
pal <- brewer.pal(9, "BuGn")[5:9]
wordcloud(
  words = c1,
  freq = rep(1, length(c1)),
  min.freq = 1,
  colors = pal
)

El segundo clúster contiene palabras frecuentes tanto en mujeres como en hombres:

# Wordcloud of the terms in tweet-text cluster 2, all with equal weight (1)
wordcloud(
  words = c2,
  freq = rep(1, length(c2)),
  min.freq = 1,
  colors = pal
)

Mientras que el tercero contendrá el resto de palabras.

# Terms assigned to each of the three k-means clusters built from the profile
# descriptions (cluster numbering is the one returned by kmeans()).
c1des <- names(which(kmeansTDMdesc$cluster == 1))
c2des <- names(which(kmeansTDMdesc$cluster == 2))
c3des <- names(which(kmeansTDMdesc$cluster == 3))

En el caso de la descripción del perfil de Twitter, el segundo clúster contiene una única palabra, frecuente en mujeres y en hombres (aunque más en mujeres):

# Wordcloud of the terms in description cluster 2, all with equal weight (1)
wordcloud(
  words = c2des,
  freq = rep(1, length(c2des)),
  min.freq = 1,
  colors = pal
)

Conclusiones y acciones futuras

En resumen:

Los datos son escasos y a priori parecen insuficientes para determinar el género de un usuario de Twitter a excepción de casos muy claros como el de las empresas que utilizan el mismo conjunto de palabras para sus tweets.
Eliminar los retweets de los datos.
Nos queda observar si un método de aprendizaje supervisado descubre patrones en los datos no visibles hasta ahora y evaluar el modelo para ver los resultados que se obtienen.
Sería interesante poder filtrar por confidence tanto en el género como en el perfil y ver cómo afecta al modelo.

Clasificación del género de usuarios de Twitter

MJ