Teste Hostgator

Sobre a fonte de dados

Os dados utilizados neste trabalho são fictícios e foram fornecidos pelo pessoal responsável pelo processo seletivo da Hostgator para a realização de teste de mineração e análise.

Os EMAILS foram utilizados como chave para vincular os dados de ambos os arquivos.

Objetivo

Construir um gráfico com o índice de satisfação (%) por PLANO
Para análise, incluir apenas os PLANOS: P, Dedicado e M
Considerar como % de satisfação a quantidade de “positivos” sobre o valor total das respostas de cada plano. Ex: se no Plano P tivemos 10 respostas sobre satisfação e 5 delas foram “positivo”, logo a % de satisfação do Plano P é 50%.

Leitura dos dados

# Load both report files. With header = FALSE every row is read as data and
# the columns get the automatic names V1 and V2.
# NOTE(review): the summaries show a single "PLANO" / "Satisfação" value, so
# the files appear to contain a header row that is being read as data; confirm.
product <- read.csv(file = "./report_produt.csv", header = FALSE)
satisf_email <- read.csv(file = "./report_satisf_email.csv", header = FALSE)

Análise Exploratória

# Per-column summary of the product table as read from disk.
summary(product)

##                     V1               V2      
##  a.a.srva@hemar.net  :    2   Dedicado: 980  
##  a.straube@hemar.net :    2   M       :8500  
##  aaaffp@hemar.net    :    2   P       :6000  
##  aafrbr@yahaa.net.br :    2   PLANO   :   1  
##  aagama@yahaa.net.br :    2   Revenda : 661  
##  aarthur@hemar.net.br:    2                  
##  (Other)             :16130

# Per-column summary of the satisfaction table as read from disk.
summary(satisf_email)

##                             V1                 V2       
##  cotaudoaototuca@yahaa.net.br:    2   negativo  : 4990  
##  thoagarhardware@hemar.net   :    2   positivo  :10490  
##  01hast@uaot.net.br          :    1   Satisfação:    1  
##  100parcentafaotoa@otar.net  :    1                     
##  100parcentaonfa@otar.net    :    1                     
##  1337otucas@otar.net         :    1                     
##  (Other)                     :15473

Nomeação dos campos

# Give both tables descriptive column names; EMAIL is the shared key.
colnames(product) <- c("EMAIL", "PLANOS")
colnames(satisf_email) <- c("EMAIL", "SATISFAÇÃO")
# Check the product table under its new column names.
summary(product)

##                   EMAIL            PLANOS    
##  a.a.srva@hemar.net  :    2   Dedicado: 980  
##  a.straube@hemar.net :    2   M       :8500  
##  aaaffp@hemar.net    :    2   P       :6000  
##  aafrbr@yahaa.net.br :    2   PLANO   :   1  
##  aagama@yahaa.net.br :    2   Revenda : 661  
##  aarthur@hemar.net.br:    2                  
##  (Other)             :16130

# Per-column summary of the satisfaction table.
summary(satisf_email)

##                           EMAIL            SATISFAÇÃO   
##  cotaudoaototuca@yahaa.net.br:    2   negativo  : 4990  
##  thoagarhardware@hemar.net   :    2   positivo  :10490  
##  01hast@uaot.net.br          :    1   Satisfação:    1  
##  100parcentafaotoa@otar.net  :    1                     
##  100parcentaonfa@otar.net    :    1                     
##  1337otucas@otar.net         :    1                     
##  (Other)                     :15473

Vinculação das duas fontes de dados pelo campo EMAIL

# Full outer join on EMAIL: all = TRUE also keeps rows that have no match in
# the other table, filling the missing side with NA.
# NOTE(review): an e-mail that appears more than once in both tables is
# cross-multiplied by merge() (the summary shows two e-mails with 4 rows
# each), which raises the row count; confirm whether these should be
# de-duplicated first.
mergedData <- merge(product, satisf_email, by = "EMAIL", all = TRUE)
summary(mergedData)

##                           EMAIL            PLANOS          SATISFAÇÃO   
##  cotaudoaototuca@yahaa.net.br:    4   Dedicado: 980   negativo  : 5025  
##  thoagarhardware@hemar.net   :    4   M       :8504   positivo  :10554  
##  a.a.srva@hemar.net          :    2   P       :6000   Satisfação:    1  
##  a.straube@hemar.net         :    2   PLANO   :   1   NA's      :  567  
##  aaaffp@hemar.net            :    2   Revenda : 661                     
##  aafrbr@yahaa.net.br         :    2   NA's    :   1                     
##  (Other)                     :16131

Filtragem dos dados relevantes

Incluir apenas os PLANOS: P, Dedicado e M

# Keep only the rows for plans P, Dedicado and M, and every column but EMAIL.
subsetedMergedData <- mergedData[
  mergedData$PLANOS %in% c("P", "Dedicado", "M"),
  setdiff(names(mergedData), "EMAIL"),
  drop = FALSE
]
# Discard factor levels that no longer occur after the filter.
subsetedMergedData <- droplevels(subsetedMergedData)
# Show a summary of the filtered data.
summary(subsetedMergedData)

##       PLANOS        SATISFAÇÃO   
##  Dedicado: 980   negativo: 4990  
##  M       :8504   positivo:10494  
##  P       :6000

# First rows of the filtered data.
head(subsetedMergedData)

##   PLANOS SATISFAÇÃO
## 1      P   positivo
## 2      P   negativo
## 3      P   negativo
## 4      P   positivo
## 5      P   positivo
## 6      P   positivo

# Last rows of the filtered data.
tail(subsetedMergedData)

##       PLANOS SATISFAÇÃO
## 16141      M   positivo
## 16142      M   positivo
## 16143      M   positivo
## 16144      P   negativo
## 16145      P   positivo
## 16146      P   positivo

Gráfico: Índice de Satisfação (%) por Plano

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.3.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Count the answers for every plan/satisfaction pair, then express each count
# as a share of all answers within the same plan.
d <- subsetedMergedData %>%
  count(PLANOS, SATISFAÇÃO) %>%
  group_by(PLANOS) %>%
  mutate(freq = n / sum(n))
# Print the resulting frequency table.
d

## Source: local data frame [6 x 4]
## Groups: PLANOS [3]
## 
##     PLANOS SATISFAÇÃO     n      freq
##     <fctr>     <fctr> <int>     <dbl>
## 1 Dedicado   negativo   300 0.3061224
## 2 Dedicado   positivo   680 0.6938776
## 3        M   negativo  3100 0.3645343
## 4        M   positivo  5404 0.6354657
## 5        P   negativo  1590 0.2650000
## 6        P   positivo  4410 0.7350000

library(ggplot2)
library(scales)

## Warning: package 'scales' was built under R version 3.3.3

library(ggthemes)

## Warning: package 'ggthemes' was built under R version 3.3.3

# Plot template built on an empty data frame: aesthetics, layers, scales and
# labels are declared here and the data is attached afterwards.
plotOutput <- ggplot(
  data.frame(),
  mapping = aes(x = PLANOS, y = freq, fill = SATISFAÇÃO)
) +
  # One bar per satisfaction value, placed side by side within each plan.
  geom_bar(stat = "identity", position = "dodge", alpha = 2 / 3) +
  # Show the y axis as percentages.
  scale_y_continuous(labels = percent) +
  scale_fill_few("medium", drop = FALSE) +
  labs(
    title = "Satisfação (%) por PLANO",
    x = "PLANOS",
    y = NULL,
    fill = "SATISFAÇÃO"
  )

# Attach the frequency table `d` to the template and draw the chart.
plotOutput %+% d

# Switch the date/time locale category to English.
# NOTE(review): "English" looks like a Windows-style locale name (the recorded
# result is "English_United States.1252"); confirm it is accepted on other
# platforms.
Sys.setlocale(category = "LC_TIME", locale = "English")

## [1] "English_United States.1252"

# Current date-time as text: date, clock time and time-zone abbreviation.
time <- format(x = Sys.time(), format = "%F %T %Z")
# Time zone reported by the system.
tz <- Sys.timezone()

All content was generated on 2017-03-08 13:54:37 BRT in America/Sao_Paulo. This work is a skill test for a job opportunity.