This dataset is a collection of SPAM and good ("ham") comments from 5 YouTube videos; the data was collected between 2013 and 2015.
We start by loading the packages we need:
suppressWarnings(library(tidyverse))
suppressWarnings(library(stringr))
suppressWarnings(library(data.table))
library(plyr)
library(XML)
library(crayon)
library(lubridate)
library(rjson)
library(readxl)
Here we load all the files from the project directory into the local variables fdataMaster, fdataEminem, fdataKatyPerry, fdataLMFAO, fdataPsy, and fdataShakira.
# Read the Data folder called "project" in the current working directory, which holds FIVE csv files and ONE Excel master file.
workDir <- getwd()
filePath <- paste0(workDir,"/project")
fileName <- list.files(path=filePath)
SpamData <- data.frame(NA)
# Start of code to read data from folder
for (n in fileName){
# Get the file name and key
tempName <- str_split(n,"-",n = 2,simplify = TRUE)[1,2]
key <- str_split(tempName,"(.xlsx)|(.csv)",n=2,simplify = TRUE)[1,1]
if(str_detect(n,".xlsx$")){
# Using read_excel from the readxl package
# We know it's the Master file
assign(paste0("fdata",key),read_excel(paste0(filePath,"/",n)))
} else{
# We know it's the comment data file for one of the videos in the Master file
assign(paste0("fdata",key),read_csv(paste0(filePath,"/",n)))
}
print(paste0("fdata",key))
}
## Multiple files in zip: reading 'Youtube01-Psy.csv'
## Parsed with column specification:
## cols(
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double()
## )
## [1] "fdata"
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double(),
## key = col_character(),
## YID = col_character(),
## YEAR = col_double(),
## DAY = col_double(),
## WDAY = col_character(),
## HOUR = col_double(),
## TIME = col_time(format = ""),
## ConDupcount = col_double(),
## AuthDupcount = col_double()
## )
## [1] "fdata"
## readxl works best with a newer version of the tibble package.
## You currently have tibble v1.4.2.
## Falling back to column name repair from tibble <= v1.4.2.
## Message displays once per session.
## [1] "fdataMaster"
## Parsed with column specification:
## cols(
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double()
## )
## [1] "fdataPsy"
## Parsed with column specification:
## cols(
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double()
## )
## [1] "fdataKatyPerry"
## Parsed with column specification:
## cols(
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double()
## )
## [1] "fdataLMFAO"
## Parsed with column specification:
## cols(
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double()
## )
## [1] "fdataEminem"
## Parsed with column specification:
## cols(
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double()
## )
## [1] "fdataShakira"
# End of loading data
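As an aside, the five comment files could also be read into one tibble in a single pass with purrr instead of assign(); a minimal sketch under the same Youtube*-Name.csv naming convention (csvFiles and allComments are hypothetical names):

library(tidyverse)
csvFiles <- list.files(filePath, pattern = "\\.csv$", full.names = TRUE)
allComments <- map_dfr(csvFiles, function(f) {
# Derive the key ("Psy", "KatyPerry", ...) from the file name
tempName <- str_split(basename(f), "-", n = 2, simplify = TRUE)[1, 2]
read_csv(f) %>% mutate(key = str_remove(tempName, "\\.csv$"))
})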
# Function to show a glimpse of the data
showGlimpse <- function(x = NULL){
if(!is.null(x)){
# sprintf (not paste0) substitutes x into the %s placeholder
print(sprintf("--------------Start of %s-----------------------", x))
# Look the object up by name, then show its head and structure
print(head(eval(parse(text = x))))
glimpse(eval(parse(text = x)))
print(sprintf("--------------End of %s-----------------------", x))
cat("\n")
return(invisible(NULL))}
print("Data--------------Start of fdataMaster-----------------------")
print(head(fdataMaster))
print("Data--------------End of fdataMaster-----------------------")
cat("\n")
print("Data--------------Start of fdataEminem-----------------------")
glimpse(fdataEminem)
print("Data--------------End of fdataEminem-----------------------")
cat("\n")
print("Data--------------Start of fdataShakira-----------------------")
glimpse(fdataShakira)
print("Data--------------End of fdataShakira-----------------------")
cat("\n")
print("Data--------------Start of fdataKatyPerry-----------------------")
glimpse(fdataKatyPerry)
print("Data--------------End of fdataKatyPerry-----------------------")
cat("\n")
print("Data--------------Start of fdataLMFAO-----------------------")
glimpse(fdataLMFAO)
print("Data--------------End of fdataLMFAO-----------------------")
cat("\n")
print("Data--------------Start of fdataPsy-----------------------")
glimpse(fdataPsy)
print("Data--------------End of fdataPsy-----------------------")
cat("\n")
print("Data--------------Start of SpamData-----------------------")
glimpse(SpamData)
print("Data--------------End of SpamData-----------------------")
cat("\n")
}
showGlimpse()
## [1] "Data--------------Start of fdataMaster-----------------------"
## # A tibble: 5 x 5
## Dataset `YouTube ID` Spam Ham Total
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Psy 9bZkp7q19f0 175 175 350
## 2 KatyPerry CevxZvSJLk8 175 175 350
## 3 LMFAO KQ6zr6kCPj8 236 202 438
## 4 Eminem uelHwf8o7_U 245 203 448
## 5 Shakira pRpeEdMmmQ0 174 196 370
## [1] "Data--------------End of fdataMaster-----------------------"
##
## [1] "Data--------------Start of fdataEminem-----------------------"
## Observations: 448
## Variables: 5
## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwnyue...
## $ AUTHOR <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dakot...
## $ DATE <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07, ...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I alw...
## $ CLASS <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...
## [1] "Data--------------End of fdataEminem-----------------------"
##
## [1] "Data--------------Start of fdataShakira-----------------------"
## Observations: 370
## Variables: 5
## $ COMMENT_ID <chr> "z13lgffb5w3ddx1ul22qy1wxspy5cpkz504", "z123dbgb0mq...
## $ AUTHOR <chr> "dharma pal", "Tiza Arellano", "Prìñçess Âlis Løvê ...
## $ DATE <dttm> 2015-05-29 02:30:18, 2015-05-29 00:14:48, 2015-05-...
## $ CONTENT <chr> "Nice song<U+FEFF>", "I love song <U+FEFF>", "I love song <U+FEFF>", "86...
## $ CLASS <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## [1] "Data--------------End of fdataShakira-----------------------"
##
## [1] "Data--------------Start of fdataKatyPerry-----------------------"
## Observations: 350
## Variables: 5
## $ COMMENT_ID <chr> "z12pgdhovmrktzm3i23es5d5junftft3f", "z13yx345uxepe...
## $ AUTHOR <chr> "lekanaVEVO1", "Pyunghee", "Erica Ross", "Aviel Hai...
## $ DATE <dttm> 2014-07-22 15:27:50, 2014-07-27 01:57:16, 2014-07-...
## $ CONTENT <chr> "i love this so much. AND also I Generate Free Lead...
## $ CLASS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## [1] "Data--------------End of fdataKatyPerry-----------------------"
##
## [1] "Data--------------Start of fdataLMFAO-----------------------"
## Observations: 438
## Variables: 5
## $ COMMENT_ID <chr> "z13uwn2heqndtr5g304ccv5j5kqqzxjadmc0k", "z124jvcza...
## $ AUTHOR <chr> "Corey Wilson", "Epic Gaming", "LaS Music", "Cheryl...
## $ DATE <dttm> 2015-05-28 21:39:52, 2015-05-28 20:07:20, 2015-05-...
## $ CONTENT <chr> "<a href=\"http://www.youtube.com/watch?v=KQ6zr6kCP...
## $ CLASS <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## [1] "Data--------------End of fdataLMFAO-----------------------"
##
## [1] "Data--------------Start of fdataPsy-----------------------"
## Observations: 350
## Variables: 5
## $ COMMENT_ID <chr> "LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU", "LZQ...
## $ AUTHOR <chr> "Julius NM", "adam riyati", "Evgeny Murashkin", "El...
## $ DATE <dttm> 2013-11-07 06:20:48, 2013-11-07 12:37:15, 2013-11-...
## $ CONTENT <chr> "Huh, anyway check out this you[tube] channel: koby...
## $ CLASS <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
## [1] "Data--------------End of fdataPsy-----------------------"
##
## [1] "Data--------------Start of SpamData-----------------------"
## Observations: 1
## Variables: 1
## $ NA. <lgl> NA
## [1] "Data--------------End of SpamData-----------------------"
# Add key and name as columns to each dataset, so that we can combine all the data in one file.
# Here I load the data and check whether it is listed in the Master file.
# If yes, add two columns at the beginning of the dataset: "key" for the
# dataset name and "YID" for the YouTube id.
fdataKatyPerry <- mutate(fdataKatyPerry,key= "KatyPerry", YID=as.character(fdataMaster[which(fdataMaster$Dataset == "KatyPerry"),][1,2]))
fdataEminem <- mutate(fdataEminem,key= "Eminem", YID=as.character(fdataMaster[which(fdataMaster$Dataset == "Eminem"),][1,2]))
fdataLMFAO <- mutate(fdataLMFAO,key= "LMFAO", YID=as.character(fdataMaster[which(fdataMaster$Dataset == "LMFAO"),][1,2]))
fdataPsy <- mutate(fdataPsy,key= "Psy", YID=as.character(fdataMaster[which(fdataMaster$Dataset == "Psy"),][1,2]))
fdataShakira <- mutate(fdataShakira,key= "Shakira", YID=as.character(fdataMaster[which(fdataMaster$Dataset == "Shakira"),][1,2]))
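A less repetitive alternative is to look the YouTube id up from the master table inside a small helper; a minimal sketch assuming the fdataMaster columns shown above (addKeyYID is a hypothetical helper name):

# Tag one comment tibble with its key and YouTube id from the master table
addKeyYID <- function(df, keyName) {
yid <- fdataMaster$`YouTube ID`[fdataMaster$Dataset == keyName]
mutate(df, key = keyName, YID = as.character(yid))
}
# e.g. fdataPsy <- addKeyYID(fdataPsy, "Psy"), and likewise for the other four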
showGlimpse() # Check tibbles to see if they have all the new columns
## [1] "Data--------------Start of fdataMaster-----------------------"
## # A tibble: 5 x 5
## Dataset `YouTube ID` Spam Ham Total
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Psy 9bZkp7q19f0 175 175 350
## 2 KatyPerry CevxZvSJLk8 175 175 350
## 3 LMFAO KQ6zr6kCPj8 236 202 438
## 4 Eminem uelHwf8o7_U 245 203 448
## 5 Shakira pRpeEdMmmQ0 174 196 370
## [1] "Data--------------End of fdataMaster-----------------------"
##
## [1] "Data--------------Start of fdataEminem-----------------------"
## Observations: 448
## Variables: 7
## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwnyue...
## $ AUTHOR <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dakot...
## $ DATE <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07, ...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I alw...
## $ CLASS <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...
## $ key <chr> "Eminem", "Eminem", "Eminem", "Eminem", "Eminem", "...
## $ YID <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uelHw...
## [1] "Data--------------End of fdataEminem-----------------------"
##
## [1] "Data--------------Start of fdataShakira-----------------------"
## Observations: 370
## Variables: 7
## $ COMMENT_ID <chr> "z13lgffb5w3ddx1ul22qy1wxspy5cpkz504", "z123dbgb0mq...
## $ AUTHOR <chr> "dharma pal", "Tiza Arellano", "Prìñçess Âlis Løvê ...
## $ DATE <dttm> 2015-05-29 02:30:18, 2015-05-29 00:14:48, 2015-05-...
## $ CONTENT <chr> "Nice song<U+FEFF>", "I love song <U+FEFF>", "I love song <U+FEFF>", "86...
## $ CLASS <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ key <chr> "Shakira", "Shakira", "Shakira", "Shakira", "Shakir...
## $ YID <chr> "pRpeEdMmmQ0", "pRpeEdMmmQ0", "pRpeEdMmmQ0", "pRpeE...
## [1] "Data--------------End of fdataShakira-----------------------"
##
## [1] "Data--------------Start of fdataKatyPerry-----------------------"
## Observations: 350
## Variables: 7
## $ COMMENT_ID <chr> "z12pgdhovmrktzm3i23es5d5junftft3f", "z13yx345uxepe...
## $ AUTHOR <chr> "lekanaVEVO1", "Pyunghee", "Erica Ross", "Aviel Hai...
## $ DATE <dttm> 2014-07-22 15:27:50, 2014-07-27 01:57:16, 2014-07-...
## $ CONTENT <chr> "i love this so much. AND also I Generate Free Lead...
## $ CLASS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ key <chr> "KatyPerry", "KatyPerry", "KatyPerry", "KatyPerry",...
## $ YID <chr> "CevxZvSJLk8", "CevxZvSJLk8", "CevxZvSJLk8", "CevxZ...
## [1] "Data--------------End of fdataKatyPerry-----------------------"
##
## [1] "Data--------------Start of fdataLMFAO-----------------------"
## Observations: 438
## Variables: 7
## $ COMMENT_ID <chr> "z13uwn2heqndtr5g304ccv5j5kqqzxjadmc0k", "z124jvcza...
## $ AUTHOR <chr> "Corey Wilson", "Epic Gaming", "LaS Music", "Cheryl...
## $ DATE <dttm> 2015-05-28 21:39:52, 2015-05-28 20:07:20, 2015-05-...
## $ CONTENT <chr> "<a href=\"http://www.youtube.com/watch?v=KQ6zr6kCP...
## $ CLASS <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ key <chr> "LMFAO", "LMFAO", "LMFAO", "LMFAO", "LMFAO", "LMFAO...
## $ YID <chr> "KQ6zr6kCPj8", "KQ6zr6kCPj8", "KQ6zr6kCPj8", "KQ6zr...
## [1] "Data--------------End of fdataLMFAO-----------------------"
##
## [1] "Data--------------Start of fdataPsy-----------------------"
## Observations: 350
## Variables: 7
## $ COMMENT_ID <chr> "LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU", "LZQ...
## $ AUTHOR <chr> "Julius NM", "adam riyati", "Evgeny Murashkin", "El...
## $ DATE <dttm> 2013-11-07 06:20:48, 2013-11-07 12:37:15, 2013-11-...
## $ CONTENT <chr> "Huh, anyway check out this you[tube] channel: koby...
## $ CLASS <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
## $ key <chr> "Psy", "Psy", "Psy", "Psy", "Psy", "Psy", "Psy", "P...
## $ YID <chr> "9bZkp7q19f0", "9bZkp7q19f0", "9bZkp7q19f0", "9bZkp...
## [1] "Data--------------End of fdataPsy-----------------------"
##
## [1] "Data--------------Start of SpamData-----------------------"
## Observations: 1
## Variables: 1
## $ NA. <lgl> NA
## [1] "Data--------------End of SpamData-----------------------"
# Now we use rbind to combine the five files into one data frame, "SpamData"
SpamData <- rbind(fdataEminem,fdataKatyPerry,fdataLMFAO,fdataPsy,fdataShakira)
head(SpamData)
showGlimpse("SpamData")
## [1] "--------------Start of SpamData-----------------------"
## # A tibble: 2,044 x 7
## COMMENT_ID AUTHOR DATE CONTENT CLASS key YID
## <chr> <chr> <dttm> <chr> <dbl> <chr> <chr>
## 1 z12rwfnyyrbs~ Lisa W~ NA +4479354541~ 1 Emin~ uelH~
## 2 z130wpnwwnyu~ jason ~ 2015-05-29 02:26:10 I always en~ 0 Emin~ uelH~
## 3 z13vsfqirtav~ Ajkal ~ NA "my sister ~ 1 Emin~ uelH~
## 4 z12wjzc4eprn~ Dakota~ 2015-05-29 02:13:07 Cool<U+FEFF> 0 Emin~ uelH~
## 5 z13xjfr42z3u~ Jihad ~ NA Hello I'~ 1 Emin~ uelH~
## 6 z133yfmjdur4~ Darrio~ 2015-05-29 01:27:30 Wow this vi~ 0 Emin~ uelH~
## 7 z12zgrw5furd~ kyeman~ NA Go check ou~ 1 Emin~ uelH~
## 8 z12vxdzzds2k~ Damax 2015-05-29 00:41:22 Almost 1 bi~ 0 Emin~ uelH~
## 9 z12gxdortqzw~ Muhamm~ NA Aslamu Lyku~ 1 Emin~ uelH~
## 10 z132wd4ywmic~ JuanPa~ 2015-05-28 23:23:41 Eminem is i~ 0 Emin~ uelH~
## # ... with 2,034 more rows
## chr "SpamData"
## [1] "--------------End of %s-----------------------SpamData"
## NULL
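Note that dplyr's bind_rows() is an equivalent and somewhat safer way to stack the five tibbles, since it matches columns by name rather than by position; a minimal sketch:

# Equivalent to the rbind() above
SpamData <- bind_rows(fdataEminem, fdataKatyPerry, fdataLMFAO, fdataPsy, fdataShakira)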
# Let's check the structure of the columns
glimpse(SpamData)
## Observations: 2,044
## Variables: 7
## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwnyue...
## $ AUTHOR <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dakot...
## $ DATE <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07, ...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I alw...
## $ CLASS <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...
## $ key <chr> "Eminem", "Eminem", "Eminem", "Eminem", "Eminem", "...
## $ YID <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uelHw...
# Converting key to a factor
SpamData$key = as.factor(SpamData$key)
glimpse(SpamData)
## Observations: 2,044
## Variables: 7
## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwnyue...
## $ AUTHOR <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dakot...
## $ DATE <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07, ...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I alw...
## $ CLASS <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...
## $ key <fct> Eminem, Eminem, Eminem, Eminem, Eminem, Eminem, Emi...
## $ YID <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uelHw...
# Check unique values of key; this confirms we have data for all five videos in one dataset
unique(SpamData$key)
## [1] Eminem KatyPerry LMFAO Psy Shakira
## Levels: Eminem KatyPerry LMFAO Psy Shakira
# Check the dataset's time span
unique(year(SpamData$DATE))
## [1] NA 2015 2014 2013
# Add YEAR, month (stored in the DAY column), and weekday columns derived from DATE
SpamData$YEAR <- as.factor(year(SpamData$DATE))
SpamData$DAY <- as.character(month(SpamData$DATE))
SpamData$WDAY <- as.factor(weekdays(SpamData$DATE))
# See the hours of the day at which comments were posted
unique(hour(SpamData$DATE))
## [1] NA 2 1 0 23 22 21 20 19 18 17 16 15 11 7 6 3 9 4 12 10 8 5
## [24] 14 13
# Add HOUR and TIME columns derived from the DATE column
SpamData$HOUR <- hour(SpamData$DATE)
SpamData$TIME <- str_sub(SpamData$DATE,12,end = -1L) # the time component starts at the 12th character of the date string
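The fixed-position substring works because POSIXct values print in a fixed "YYYY-MM-DD HH:MM:SS" layout; format() states the same intent more directly. An equivalent sketch:

# Equivalent extraction of the time-of-day component
SpamData$TIME <- format(SpamData$DATE, "%H:%M:%S")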
# Now let's work with the Author data. We don't expect any duplicates, but we check anyway
dups_Author <- which(duplicated(SpamData$AUTHOR))
# It looks like some identical comments were posted on multiple videos (not checking yet whether by the same user)
dup_comments <- which(duplicated(SpamData$CONTENT))
#################################### Adding a count of identical comments
SpamData_ccount <- count(SpamData$CONTENT)
SpamData$ConDupcount <- mapply(function(x)(SpamData_ccount[which(SpamData_ccount$x==x),][1,2]),SpamData$CONTENT)
#################################### Adding a count of repeated authors
SpamData_tcount <- count(SpamData$AUTHOR)
SpamData$AuthDupcount <- mapply(function(x)(SpamData_tcount[which(SpamData_tcount$x==x),][1,2]),SpamData$AUTHOR)
####################################
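The same duplicate counts can be attached with dplyr's add_count(), avoiding the row-by-row mapply() lookups; a minimal sketch (assuming a dplyr version that supports the name argument):

SpamData <- SpamData %>%
add_count(CONTENT, name = "ConDupcount") %>%  # how often each comment text appears
add_count(AUTHOR, name = "AuthDupcount")      # how often each author appears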
# So we have a few rows with NA dates; they may not add much value to our analysis. Let's park this data in another dataset.
# Create TWO new datasets: validSPAM with only rows that have a valid date, and NASPAM with rows where the date is NA.
suppressWarnings(rm(validSPAM)) # remove old variable
suppressWarnings(rm(NASPAM))    # remove old variable
# Creating new datasets with all the columns from the MAIN data
validSPAM <- SpamData[which(SpamData$DAY != 'NA'),]
NASPAM <- SpamData[-which(SpamData$DAY != 'NA'),]
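The same split can also be expressed with is.na() on DATE directly, which is more robust than comparing the derived DAY column against the string 'NA'; an equivalent sketch:

validSPAM <- SpamData[!is.na(SpamData$DATE), ]
NASPAM <- SpamData[is.na(SpamData$DATE), ]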
glimpse(SpamData)
## Observations: 2,044
## Variables: 14
## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwny...
## $ AUTHOR <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dak...
## $ DATE <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I a...
## $ CLASS <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1...
## $ key <fct> Eminem, Eminem, Eminem, Eminem, Eminem, Eminem, E...
## $ YID <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uel...
## $ YEAR <fct> NA, 2015, NA, 2015, NA, 2015, NA, 2015, NA, 2015,...
## $ DAY <chr> NA, "5", NA, "5", NA, "5", NA, "5", NA, "5", NA, ...
## $ WDAY <fct> NA, Friday, NA, Friday, NA, Friday, NA, Friday, N...
## $ HOUR <int> NA, 2, NA, 2, NA, 1, NA, 0, NA, 23, NA, 23, NA, 2...
## $ TIME <chr> NA, "02:26:10", NA, "02:13:07", NA, "01:27:30", N...
## $ ConDupcount <int> 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3...
## $ AuthDupcount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
glimpse(validSPAM)
## Observations: 1,799
## Variables: 14
## $ COMMENT_ID <chr> "z130wpnwwnyuetxcn23xf5k5ynmkdpjrj04", "z12wjzc4e...
## $ AUTHOR <chr> "jason graham", "Dakota Taylor", "Darrion Johnson...
## $ DATE <dttm> 2015-05-29 02:26:10, 2015-05-29 02:13:07, 2015-0...
## $ CONTENT <chr> "I always end up coming back to this song<br /><U+FEFF>"...
## $ CLASS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ key <fct> Eminem, Eminem, Eminem, Eminem, Eminem, Eminem, E...
## $ YID <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uel...
## $ YEAR <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2...
## $ DAY <chr> "5", "5", "5", "5", "5", "5", "5", "5", "5", "5",...
## $ WDAY <fct> Friday, Friday, Friday, Friday, Thursday, Thursda...
## $ HOUR <int> 2, 2, 1, 0, 23, 23, 22, 22, 21, 21, 20, 20, 20, 1...
## $ TIME <chr> "02:26:10", "02:13:07", "01:27:30", "00:41:22", "...
## $ ConDupcount <int> 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ AuthDupcount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
glimpse(NASPAM)
## Observations: 245
## Variables: 14
## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z13vsfqirta...
## $ AUTHOR <chr> "Lisa Wellas", "Ajkal Khan", "Jihad Naser", "kyem...
## $ DATE <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "my ...
## $ CLASS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ key <fct> Eminem, Eminem, Eminem, Eminem, Eminem, Eminem, E...
## $ YID <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uel...
## $ YEAR <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ DAY <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ WDAY <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ HOUR <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ TIME <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ ConDupcount <int> 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 36, 1, 1, 1, 36, 1,...
## $ AuthDupcount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
# Convert 0 and 1 to the text labels "NO-SPAM" and "SPAM" and create a new dataset
SpamData_text <- mutate(SpamData,CLASS= ifelse(CLASS == 0, "NO-SPAM", "SPAM"))
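A type-safe alternative is to recode CLASS as a labelled factor, which keeps the 0/1 ordering explicit; a small sketch (classAsFactor is a hypothetical name):

classAsFactor <- factor(SpamData$CLASS, levels = c(0, 1), labels = c("NO-SPAM", "SPAM"))
table(classAsFactor)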
head(SpamData_text[,c("CLASS","CONTENT")])

# SPAM data by year and type
SpamData_Content <- SpamData_text %>% select("CLASS","CONTENT","YEAR","DAY")

Read a JSON file from the web to perform str_detect against the collection of bad words.
# Get the bad-words JSON file from GitHub for the string comparison
#install.packages("rjson")
badwordURL <- "https://raw.githubusercontent.com/web-mech/badwords/master/lib/lang.json"
#library(rjson)
suppressWarnings(rm(dontSay))
raw <- read_file(badwordURL)
dontSay <- fromJSON(raw)
class(dontSay)
## [1] "list"
dontSay <- as.data.frame(dontSay$words)
colnames(dontSay) <- c("words")
dontSay <- as.character(dontSay$words)
# Escape all regex metacharacters in the bad words ( . \ | ( ) [ ] { } ^ $ * + ? )
# in a single pass, so backslashes added by earlier substitutions are not escaped twice
dontSay <- gsub("([][{}()+*^$|\\\\?.])", "\\\\\\1", dontSay)
# Since str_detect needs a single pattern to match against, collapse the vector with paste
checkword <- paste(dontSay, collapse = '|')
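For intuition, the collapsed pattern is one big alternation, so a single str_detect() call checks a comment against every bad word at once; a toy example with made-up words:

toyPattern <- paste(c("foo", "bar"), collapse = "|")  # "foo|bar"
str_detect("some BAR comment", regex(toyPattern, ignore_case = TRUE))  # TRUE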
dontSay_data <- SpamData_Content[which(str_detect(SpamData_Content$CONTENT,pattern =regex(checkword,ignore_case = TRUE)) ),]
dontSay_data$CLASS <- as.factor(dontSay_data$CLASS)
summary(dontSay_data)
## CLASS CONTENT YEAR DAY
## NO-SPAM: 72 Length:190 2013:25 Length:190
## SPAM :118 Class :character 2014:51 Class :character
## Mode :character 2015:78 Mode :character
## NA's:36
## Messages marked as SPAM that contain bad words
glimpse(dontSay_data[which(dontSay_data$CLASS=="SPAM"),])
## Observations: 118
## Variables: 4
## $ CLASS <fct> SPAM, SPAM, SPAM, SPAM, SPAM, SPAM, SPAM, SPAM, SPAM, ...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "my siste...
## $ YEAR <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ DAY <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Add a "bad" column to the new dataset indicating whether the comment contains bad words
SpamData_text <- mutate(SpamData_text, bad = str_detect(SpamData_text$CONTENT, pattern = checkword))
# summary(SpamData_text)
# plyr::count(SpamData_text ,vars=c("CLASS","bad")) %>%
#   plyr::rename( c("CLASS"="TYPE","bad"="Bad words used","freq"="Number of Comments"))

# Write "SpamData" to the local directory
write.csv(SpamData, file = "project/SpamData.csv")
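As an aside, readr's write_csv() would skip the row-name column that write.csv() adds; that extra column is what later comes back as the unnamed X1 column when the file is re-read. A sketch:

# Equivalent write without the row-name column
readr::write_csv(SpamData, "project/SpamData.csv")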
Data Exploration: This should include summary statistics, means, medians, quartiles, or any other relevant information about the data set. Please include some conclusions in the R Markdown text.

Here we perform string operations, create new datasets, and calculate the mean, median, and other summary statistics.
glimpse(SpamData)
## Observations: 2,044
## Variables: 14
## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwny...
## $ AUTHOR <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dak...
## $ DATE <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I a...
## $ CLASS <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1...
## $ key <fct> Eminem, Eminem, Eminem, Eminem, Eminem, Eminem, E...
## $ YID <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uel...
## $ YEAR <fct> NA, 2015, NA, 2015, NA, 2015, NA, 2015, NA, 2015,...
## $ DAY <chr> NA, "5", NA, "5", NA, "5", NA, "5", NA, "5", NA, ...
## $ WDAY <fct> NA, Friday, NA, Friday, NA, Friday, NA, Friday, N...
## $ HOUR <int> NA, 2, NA, 2, NA, 1, NA, 0, NA, 23, NA, 23, NA, 2...
## $ TIME <chr> NA, "02:26:10", NA, "02:13:07", NA, "01:27:30", N...
## $ ConDupcount <int> 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3...
## $ AuthDupcount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
head(fdataMaster)
summary(SpamData)
## COMMENT_ID AUTHOR DATE
## Length:2044 Length:2044 Min. :2013-07-12 22:33:27
## Class :character Class :character 1st Qu.:2014-09-25 21:39:20
## Mode :character Mode :character Median :2015-03-29 07:12:48
## Mean :2014-12-25 18:08:03
## 3rd Qu.:2015-05-22 16:27:17
## Max. :2015-06-05 20:01:23
## NA's :245
## CONTENT CLASS key YID
## Length:2044 Min. :0.0000 Eminem :448 Length:2044
## Class :character 1st Qu.:0.0000 KatyPerry:350 Class :character
## Mode :character Median :1.0000 LMFAO :438 Mode :character
## Mean :0.5215 Psy :438
## 3rd Qu.:1.0000 Shakira :370
## Max. :1.0000
##
## YEAR DAY WDAY HOUR
## 2013: 200 Length:2044 Tuesday :309 Min. : 0.00
## 2014: 517 Class :character Saturday :290 1st Qu.: 6.00
## 2015:1082 Mode :character Thursday :272 Median :14.00
## NA's: 245 Wednesday:269 Mean :12.33
## Sunday :242 3rd Qu.:18.00
## (Other) :417 Max. :23.00
## NA's :245 NA's :245
## TIME ConDupcount AuthDupcount
## Length:2044 Min. : 1.00 Min. :1.00
## Class :character 1st Qu.: 1.00 1st Qu.:1.00
## Mode :character Median : 1.00 Median :2.00
## Mean : 16.33 Mean :1.77
## 3rd Qu.: 2.00 3rd Qu.:2.00
## Max. :171.00 Max. :8.00
##
ddply(SpamData, "key",summarise, duration = max(as.numeric(YEAR)) - min(as.numeric(YEAR)))# count the total number of SAPM by key column
# The output of ddply is just the give column as we have user Summarise fucnation, if we use funcation tranform
# SpamDataT <- ddply(SpamData, "key", transform, Spam = sum( CLASS ) ) it would result full data set for use to user
# Data Of Key Video and SPAM Count
# Sine CALSS Is logical and 1 is for SPAM and 0 for NO SPAM , Sum of CLASS would give sum of SAPM content
# Rename the column by doing pipe %>% of the data to rename of plys pacakge
ddply(SpamData, "key", summarise, Spam = sum( CLASS ) ) %>%
rename( c("key"="Video Name","Spam"= " SPAM Count")) # Spam over days of week
ddply(SpamData, "WDAY", summarise, Spam = sum( CLASS ) ) %>%
rename( c("WDAY"="Day of week","Spam"= " SPAM Count")) # Count of each value of "Key" in the first spam data
SpamData_keyCount <- count(SpamData, vars = "key")
# Calculate % of SPAM by video
SpamData_percent <- ddply(SpamData, "key", summarise, Spam = sum( CLASS ) ) %>%
mutate( SPAM_PERCENT = as.numeric(Spam)* 100/SpamData_keyCount$freq[which(SpamData_keyCount$key== key )] ,
TOTAL_COMMENTS = SpamData_keyCount$freq[which(SpamData_keyCount$key== key )])
SpamData_percent[order(SpamData_percent$TOTAL_COMMENTS,decreasing = TRUE),]

# Mean of repeat-author counts and of the hour of day
sapply( list( "Repeat Author" = SpamData$AuthDupcount,"Hour of Day" = SpamData$HOUR ), FUN =mean, na.rm=TRUE)
## Repeat Author Hour of Day
## 1.770059 12.327404
# Mean of SPAM % and mean of total comments across videos
sapply(list("Spam % Mean" = SpamData_percent$SPAM_PERCENT, "| Total Comment Mean"=SpamData_percent$TOTAL_COMMENTS),FUN = mean)
## Spam % Mean | Total Comment Mean
## 51.89542 408.80000
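For reference, the same per-video spam percentages can be computed in a single dplyr pipeline, avoiding the lookup into SpamData_keyCount; a minimal sketch (dplyr::summarise is written explicitly because plyr is also loaded):

SpamData %>%
group_by(key) %>%
dplyr::summarise(Spam = sum(CLASS),
                 TOTAL_COMMENTS = length(CLASS),
                 SPAM_PERCENT = 100 * Spam / TOTAL_COMMENTS) %>%
arrange(desc(TOTAL_COMMENTS))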
# Mean of hour of day and day of the month
sapply( list( "Hour_Mean" = SpamData$HOUR,"Day_Mean" = day(SpamData$DATE) ), mean, na.rm=TRUE)
## Hour_Mean Day_Mean
## 12.32740 17.98221
# Aggregate data by video key and weekday to see how many SPAM comments were reported on each day
aggregate(list("SPAM_COUNT" = SpamData$CLASS ),by = list("ID" = SpamData$key, "Day_Of_Week" = SpamData$WDAY ) , FUN= sum,na.action=FALSE )

library(dplyr)
# Total comments by day of the week
SpamData_total_spam_weekday <- plyr::count(SpamData, vars = c("key","WDAY"))
SpamData_total_spam_weekday

# Count of comments by year
SpamData_total_spam_year <- plyr::count(SpamData, vars = c("key","YEAR"))
summary(SpamData_total_spam_year)
## key YEAR freq
## Eminem :2 2013:1 Min. : 15.0
## KatyPerry:2 2014:3 1st Qu.:110.8
## LMFAO :2 2015:5 Median :201.5
## Psy :2 NA's:1 Mean :204.4
## Shakira :2 3rd Qu.:312.5
## Max. :347.0
# It looks like we have 0 for Eminem on every day of the week. Let's check it against some sample SPAM rows in the main dataset
SpamData[which(SpamData$CLASS == 1 & SpamData$key == "Eminem"),c("key", "WDAY","CLASS","DATE")]

# We see NA for WDAY, which is correct since DATE is NA for those rows. To validate further, let's check the non-SPAM data for Eminem
SpamData[which(SpamData$CLASS == 0 & SpamData$key == "Eminem"),c("key", "WDAY","CLASS","DATE")]

# Median of hour of day and day of the month
sapply( list( "Hour_Median" = SpamData$HOUR,"Day_Median" = day(SpamData$DATE) ), median, na.rm=TRUE)## Hour_Median Day_Median
## 14 20
qunat_SPAM_byDay <- quantile(aggregate(list("SPAM_COUNT" = SpamData$CLASS), by= list("Day" = SpamData$WDAY),sum)$SPAM_COUNT)
sprintf("The first, second and third quartiles of the 'SPAM Count' by weekday are %s, %s and %s spams respectively.",qunat_SPAM_byDay[2],qunat_SPAM_byDay[3],qunat_SPAM_byDay[4])
## [1] "The first, second and third quartiles of the 'SPAM Count' by weekday are 103, 116 and 124 spams respectively."
# Using colored formatting from the crayon package
cat("The first, second and third " %+% blue$underline$bold ('quartiles') %+% " of the " %+% red("SPAM Count") %+% " by weekday are ", blue(qunat_SPAM_byDay[2]),",",blue(qunat_SPAM_byDay[3]), " and " , blue(qunat_SPAM_byDay[4]) , " spams respectively.")
## The first, second and third quartiles of the SPAM Count by weekday are 103 , 116 and 124 spams respectively.
| Spam % Mean | Total Comment Mean |
|---|---|
| 51.89542 | 408.80000 |
5 SPAM Data by Weekday: the first, second and third quartiles of the SPAM count by weekday are 103, 116 and 124 spams respectively. The full quantile vector qunat_SPAM_byDay:
| 0% | 25% | 50% | 75% | 100% |
|---|---|---|---|---|
| 95 | 103 | 116 | 124 | 156 |
Graphics: Please make sure to display at least one scatter plot, box plot and histogram. Don’t be limited to this. Please explore the many other options in R packages such as ggplot2.
Creating different types of datasets and graphs that can support our final data presentation later.

## Graphs {.tabset .tabset-fade .tabset}
library(plyr)
head(SpamData_total_spam_year)

# Comments over the years on the videos
ggplot(SpamData_total_spam_year,mapping=aes(x=key,y=YEAR,fill= YEAR)) +
geom_col()+
ggtitle("Comments Over Year on the Videos")############################ Data by only Key year and Class using SPREAD to move ROW TO COLUMN
SpamData_kcy <- SpamData_text%>%select(key,CLASS,YEAR)
SpamData_kcy_f<- plyr::count(SpamData_kcy ,vars =c("key","YEAR","CLASS")) %>%
spread(CLASS, freq)
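In current tidyr, pivot_wider() supersedes spread(); an equivalent sketch, assuming tidyr >= 1.0.0:

SpamData_kcy_f <- plyr::count(SpamData_kcy, vars = c("key","YEAR","CLASS")) %>%
tidyr::pivot_wider(names_from = CLASS, values_from = freq)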
## Graph of SPAM only, per video per year. It is very clear that 2014 and 2015 recorded an increase in comments, and hence in SPAM
ggplot(SpamData_kcy_f,mapping=aes(x=key, y=SPAM, fill = YEAR)) +
geom_col(na.rm = FALSE) +facet_wrap(~YEAR) +
labs(title="SPAM over Year on Videos",
x ="Video ", y = "SPAM Count")+
ggtitle(" Impact of SPAM on Number of Videos by year") +
theme(plot.title = element_text(color="gray", size=14, face="bold" ))## Warning: Removed 1 rows containing missing values (position_stack).
# Impact of SPAM on Number of Videos by year
ggplot(SpamData_total_spam_year,mapping=aes(x=YEAR,fill= freq)) +
geom_bar() +
theme(plot.background = element_rect(color = "orange")) +
theme(panel.background = element_blank()) +
theme(panel.grid.major = element_line(color="blue")) +
theme(panel.grid.minor = element_blank())+
theme(panel.grid.major.x = element_blank())+
ylab( "No Of Videos Impacted") +
xlab( "Year" ) +
ggtitle(" Impact of SPAM on Number of Videos by year") +
theme(plot.title = element_text(color="red", size=14, face="bold.italic" ))## using box plt to show that data was very high in 2014 and 2015
ggplot(SpamData_kcy_f,mapping=aes(x=YEAR, y= SPAM)) +
geom_boxplot()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
# We can clearly see that spamming is not a regular process; it happens in chunks, with very few exceptions.
ggplot(SpamData_text,mapping=aes(x=key, y=DATE,color = CLASS)) +
geom_boxplot(na.rm = TRUE)+
ylab( "Date in Year") +
xlab( "Video Key" ) +
theme(panel.background = element_blank(),
legend.key = element_blank()) + #Gray color behind the actual legend
theme(panel.grid.major = element_line(color="blue")) +
theme(panel.grid.minor = element_blank())+
theme(panel.grid.major.y = element_blank())+
scale_color_discrete(name="Comment Type")+
ggtitle(" Type of comments on Videos by year") +
theme(plot.title = element_text(color="red", size=12, face="bold" ))#SPAM and NO SPAM BY YEAR
SpamData_byYear <- ddply(SpamData, "YEAR", summarise, Spam = sum(CLASS), TOTAL_COMMENTS = length(CLASS)) %>%
mutate(SPAM_PERCENT = 100 * Spam / TOTAL_COMMENTS)
# Get data for one video over the period of the dataset
SpamData_text[order(SpamData_text$DAY),] %>%
subset(key== "LMFAO") %>%
ggplot(mapping=aes(x=DAY, y=HOUR,color= CLASS)) +
geom_point()+
xlab("Month")# Run the same check on FUll Data Set now
# Its very clear here that during 1st half 2015 spam were higher than the regular comments in the months.
SpamData_text[order(SpamData_text$DAY),] %>%
ggplot(mapping=aes(x=DATE, y=HOUR,color= CLASS,na.rm = FALSE),na.rm = TRUE) +
geom_point(na.rm = TRUE,alpha= 1/3) +
geom_smooth(na.rm = TRUE,span = 0.1) +
labs(title =" Type of comments on Videos by Date and \n Hour of the day ", x= "DATE" , y ="Hour") +
theme(plot.title = element_text(color="skyblue", size=12, face="bold" ))## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# From Above char its clear how Spaming has been progressing and figting with good content . Its Also strange to NOTE that more than 50% of data is SPAM.# store the mean over year
myear<- mean(summary(as.factor(replace_na(as.character(SpamData_text$YEAR),replace = "9999"))))
mYear2 <- mean(summary(as.factor(replace_na(as.character(SpamData_text$YEAR),replace = "9999")))[c("2014","2013","9999")])
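The summary(as.factor(...)) idiom above just counts comments per year; table() expresses that more transparently. An equivalent sketch:

yearCounts <- table(replace_na(as.character(SpamData_text$YEAR), "9999"))
myear <- mean(yearCounts)                             # mean comments per year, NA bucket ("9999") included
mYear2 <- mean(yearCounts[c("2014", "2013", "9999")]) # mean excluding the large 2015 bucket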
################################## Histogram showing the same trend of increasing SPAM, plotted against the mean of all comments per month
ggplot(SpamData_text,mapping = aes(x=DATE, fill = CLASS)) +
# boundary = 0 starts the bins at the 0 coordinate
# bins = 10 creates 10 groups of data (by default ggplot picks 30 bins)
geom_histogram(na.rm= TRUE,boundary = 0,
bins = 10
) +
geom_hline(linetype = 5,
color="blue",
yintercept = myear) +
# Adding Annotation
annotate("text",label = sprintf("mean of comments over year %s",myear), x= as.POSIXct("2014-01-25 18:08:03"), y= 535 ) +
geom_hline(linetype = 2,
color="red",
yintercept = mYear2) +
annotate("text",label = sprintf("mean of comments till 2014, %.2f",mYear2), x= as.POSIXct("2014-01-25 18:08:03"), y= mYear2+20 )+
labs(title =" Type of comments on Videos by \n Year ", x= "DATE" , y ="Comments Count") +
theme(plot.title = element_text(color="Red", size=12, face="bold" ))library(plyr)
summary(SpamData_text[SpamData_text$YEAR == "2015",]$YEAR)## 2013 2014 2015 NA's
## 0 0 1082 245
#................................ The data points indicate that people comment more during NIGHT hours, but a little less during early working hours.
# The mean time of comment is 12:33 during the day.
xm <- as.numeric(summary(SpamData_text$HOUR)[4])
plyr::count(SpamData_text,c("HOUR","CLASS")) %>%
ggplot(mapping = aes(x=HOUR,y= freq, fill=CLASS)) +
geom_col(na.rm = TRUE) +
## # 0 = blank, 1 = solid, 2 = dashed, 3 = dotted, 4 = dotdash, 5 = longdash, 6 = twodash
geom_vline(na.rm = TRUE,linetype = 5,
color="blue",
xintercept =as.numeric(summary(SpamData_text$HOUR)[4])) +
theme(panel.background = element_blank(),
legend.key = element_blank()) +
# Adding Annotation
annotate("text",label = sprintf("mean of Time %.2f",as.numeric(summary(SpamData_text$HOUR)[4])), x= as.numeric(summary(SpamData_text$HOUR)[4]), y= 100 ) +
labs(title =" Type of Comments made during by Hours of day ", x= "Hour of Day" , y ="Comments Count") +
theme(plot.title = element_text(color="Red", size=12, face="bold" ))## Warning: Removed 1 rows containing missing values (position_stack).
xm <- as.numeric(summary(SpamData_text$HOUR)[4])
plyr::count(SpamData_text,c("HOUR","CLASS")) %>%
ggplot(mapping = aes(x=HOUR,y= freq, color=CLASS)) +
geom_point(na.rm = TRUE) +
geom_smooth(na.rm = TRUE,span = 0.8)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Indicating that more comments are created from evening to midnight.

Our analysis based on the data is as follows:

Both good comments and bad comments are increasing over the years (point 1)
People spend more time commenting from evening to midnight, and less in the morning (point 2)
Spammers target different videos at different times of the month; the same videos are not spammed every day or month (point 2)
Bad words are present in both good and spam comments, but not in very large numbers (point 3)

Spam has been present since the data was first collected and has increased over time. We can clearly see that SPAM volume is very close to that of good content for some videos in 2014 and for almost all videos in 2015.
Find SPAM / comment patterns over the years and months and during the hours of the day.

We can see that as time moves on, the SPAM box is also growing. It is worth noting that even good comments increased during 2015.
We can clearly see that spamming is not a regular process for a video; it happens in certain months, with very few exceptions.
In contrast, across multiple videos the data points for 2015 indicate that most videos were spammed that year; almost every month some videos were spammed.
The histogram shows the same increasing SPAM trend relative to the mean of all comments per month. Since the 2015 data is large, we also plotted the mean against all data except 2015. It is clear that 2015 added more good comments and also more SPAM.
The data points indicate that people commented more during night hours, and a little less during early working hours. Fewer comments in the morning? We can say yes.
It is very clear that during the first halves of 2014 and 2015, spam exceeded regular comments in several months.
The bulk of 2015 is visible again.
The table below makes it clear that bad words occur more often in SPAM, but strangely even non-spam comments contain bad words.
plyr::count(SpamData_text ,vars=c("CLASS","bad")) %>%
plyr::rename( c("CLASS"="TYPE","bad"="Bad words used","freq"="Number of Comments"))

BONUS - place the original .csv in a GitHub file and have R read from the link.
The SpamData master file created by this project is uploaded to GitHub and read back below as a raw CSV.
gitRawFile = "https://raw.githubusercontent.com/Rajwantmishra/msds/master/SpamData.csv"
#require(XML)
read.csv.url <- read.csv( url(gitRawFile))
head(read.csv.url)#require(read.table)
read.csv.Data <- read.csv(gitRawFile,header=T)
head(read.csv.Data )#library(data.table)
dataTableCSV <- fread(gitRawFile)
head(dataTableCSV)library(tidyverse)
tidyDataCSV <- read_csv(gitRawFile)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double(),
## key = col_character(),
## YID = col_character()
## )
str(tidyDataCSV)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 2044 obs. of 8 variables:
## $ X1 : num 1 2 3 4 5 6 7 8 9 10 ...
## $ COMMENT_ID: chr "z12rwfnyyrbsefonb232i5ehdxzkjzjs2" "z130wpnwwnyuetxcn23xf5k5ynmkdpjrj04" "z13vsfqirtavjvu0t22ezrgzyorwxhpf3" "z12wjzc4eprnvja4304cgbbizuved35wxcs" ...
## $ AUTHOR : chr "Lisa Wellas" "jason graham" "Ajkal Khan" "Dakota Taylor" ...
## $ DATE : POSIXct, format: NA "2015-05-29 02:26:10" ...
## $ CONTENT : chr "+447935454150 lovely girl talk to me xxx<U+FEFF>" "I always end up coming back to this song<br /><U+FEFF>" "my sister just received over 6,500 new <a rel=\"nofollow\" class=\"ot-hashtag\" href=\"https://plus.google.com/"| __truncated__ "Cool<U+FEFF>" ...
## $ CLASS : num 1 0 1 0 1 0 1 0 1 0 ...
## $ key : chr "Eminem" "Eminem" "Eminem" "Eminem" ...
## $ YID : chr "uelHwf8o7_U" "uelHwf8o7_U" "uelHwf8o7_U" "uelHwf8o7_U" ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. COMMENT_ID = col_character(),
## .. AUTHOR = col_character(),
## .. DATE = col_datetime(format = ""),
## .. CONTENT = col_character(),
## .. CLASS = col_double(),
## .. key = col_character(),
## .. YID = col_character()
## .. )
head(tidyDataCSV)