The goal of this project is to demonstrate progress in handling the data and to present an initial exploratory analysis that will inform the design of a prediction algorithm and a Shiny application.
The analysis uses the SwiftKey dataset, which contains English-language text from blogs, news articles, and Twitter.
# Official URL of the SwiftKey dataset
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# Local destination file for the zip
destfile <- "Coursera-SwiftKey.zip"
# Download the archive only if it is not already present
if (!file.exists(destfile)) {
  download.file(url, destfile, mode = "wb")
}
# Unzip into a local folder only if it has not been extracted yet
if (!dir.exists("Coursera-SwiftKey")) {
  unzip(destfile, exdir = "Coursera-SwiftKey")
}
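As a quick sanity check, the extracted files can be listed before loading them; a minimal sketch, assuming the folder layout created by the unzip step above:

# Sanity check: list the extracted English-language files
list.files("Coursera-SwiftKey/final/en_US")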
# Path to the English-language files
path <- "Coursera-SwiftKey/final/en_US/"
# Load the three corpora (skipNul = TRUE drops embedded NUL characters)
blogs <- readLines(paste0(path, "en_US.blogs.txt"), encoding = "UTF-8", skipNul = TRUE)
news <- readLines(paste0(path, "en_US.news.txt"), encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(paste0(path, "en_US.twitter.txt"), encoding = "UTF-8", skipNul = TRUE)
# Show the first lines of each corpus
head(blogs, 3)
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”."
## [2] "We love you Mr. Brown."
## [3] "Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been busy together playing Skylander on the XBox together, after Kyan cashed in his $$$ from his piggy bank. He wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it (he never taps into that thing either, that is how we know he wanted it so bad). We made him count all of his money to make sure that he had enough! It was very cute to watch his reaction when he realized he did! He also does a very good job of letting Lola feel like she is playing too, by letting her switch out the characters! She loves it almost as much as him."
head(news, 3)
## [1] "He wasn't home alone, apparently."
## [2] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."
## [3] "WSU's plans quickly became a hot topic on local online sites. Though most people applauded plans for the new biomedical center, many deplored the potential loss of the building."
head(twitter, 3)
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
## [3] "they've decided its more fun if I don't."
# Number of lines in each corpus
length(blogs)
## [1] 899288
length(news)
## [1] 77259
length(twitter)
## [1] 2360148
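The line counts can be complemented with on-disk file sizes. A minimal sketch using base R's file.size(), assuming the path defined earlier:

# Approximate file sizes in megabytes (file names assume `path` from above)
files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
round(setNames(file.size(paste0(path, files)) / 1024^2, files), 1)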
# Approximate word counts per dataset (via stringi's LaTeX-style statistics)
library(stringi)
stri_stats_latex(blogs)
## CharsWord CharsCmdEnvir CharsWhite Words Cmds
## 162464653 9 42636700 37570839 3
## Envirs
## 0
stri_stats_latex(news)
## CharsWord CharsCmdEnvir CharsWhite Words Cmds
## 12476453 0 3096618 2651432 0
## Envirs
## 0
stri_stats_latex(twitter)
## CharsWord CharsCmdEnvir CharsWhite Words Cmds
## 125570778 3032 35958529 30451170 963
## Envirs
## 0
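Note that stri_stats_latex() treats the text as LaTeX source; for plain character and line statistics, stringi also provides stri_stats_general(). A minimal sketch, shown for the blogs corpus only:

# General statistics: Lines, LinesNEmpty, Chars, CharsNWhite
stri_stats_general(blogs)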
# Line and word counts combined in one table
lines_count <- c(blogs = length(blogs), news = length(news), twitter = length(twitter))
words_count <- c(blogs = sum(stri_count_words(blogs)),
                 news = sum(stri_count_words(news)),
                 twitter = sum(stri_count_words(twitter)))
data.frame(lines_count, words_count)
## lines_count words_count
## blogs 899288 37546806
## news 77259 2674561
## twitter 2360148 30096690
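A derived ratio helps characterize the three sources: tweets should average far fewer words per line than blog posts. The words_per_line column below is an illustrative addition, not part of the original summary:

# Average words per line for each source (illustrative derived metric)
summary_df <- data.frame(lines_count, words_count)
summary_df$words_per_line <- round(summary_df$words_count / summary_df$lines_count, 1)
summary_df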
library(ggplot2)
library(dplyr)
# Take a small (1%) random sample of each corpus for exploration
set.seed(123)
sample_data <- c(sample(blogs, floor(length(blogs) * 0.01)),
                 sample(news, floor(length(news) * 0.01)),
                 sample(twitter, floor(length(twitter) * 0.01)))
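To keep later steps independent of the full corpora, the sample could be cached on disk; a minimal sketch (the file name sample_en_US.txt is an assumption):

# Optionally cache the sample so it can be reloaded without the full corpora
if (!file.exists("sample_en_US.txt")) {
  writeLines(sample_data, "sample_en_US.txt")
}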
# Basic tokenization with tidytext
library(tidytext)
library(tibble)
tokens <- tibble(text = sample_data) %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)
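Raw counts like these are typically dominated by function words ("the", "to", "and"). For a content-word view, tidytext ships a stop_words table that can be removed with a dplyr anti-join; a minimal sketch (the plot below still uses the unfiltered counts):

# Optional: drop English stop words before counting
tokens_clean <- tibble(text = sample_data) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)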
# Top 20 most frequent words
top20 <- tokens[1:20, ]
ggplot(top20, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(title = "Top 20 most frequent words", x = "Word", y = "Frequency")
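Since the stated end goal is a next-word prediction algorithm, n-gram frequencies over the same sample are the natural next step. A minimal sketch using unnest_tokens' built-in "ngrams" tokenizer:

# Bigram frequencies over the same 1% sample
bigrams <- tibble(text = sample_data) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  count(bigram, sort = TRUE)
head(bigrams, 10)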