R Bridge Course Final Project

Problem Statement

This dataset is a collection of SPAM and legitimate ("ham") comments on 5 YouTube videos, collected between 2013 and 2015. We will try to answer:

  • How spamming is spread across the videos and its comment rate.
  • What SPAM/comment patterns appear over the years, months, and hours of the day.
  • How often bad words have surfaced in the comments.

About DataSet

This dataset is a collection of SPAM and legitimate ("ham") comments on 5 YouTube videos (Psy, KatyPerry, LMFAO, Eminem, Shakira), collected between 2013 and 2015.

Packages Used

  • tidyverse
  • stringr
  • XML
  • data.table
  • plyr, crayon, lubridate, rjson, readxl

  • Loading Packages

    
    suppressWarnings(library(tidyverse))
    suppressWarnings(library(stringr))
    suppressWarnings(library(data.table))
    library(plyr)
    library(XML)
    library(crayon)
    library(lubridate)
    library(rjson)
    library(readxl)
    

    Question 2. Data Wrangling

    Read

    Here we read every file in the Project directory into the local variables fdataMaster, fdataEminem, fdataKatyPerry, fdataLMFAO, fdataPsy, and fdataShakira.

    # Read the Data folder called Project in the current working directory,
    # which holds the FIVE csv files and the ONE excel master file.
    workDir <- getwd()
    filePath <- paste0(workDir,"/project")
    fileName <- list.files(path=filePath)
    SpamData <- data.frame(NA)
    
    
    # Start of code to read data from folder 
    
    for (n in  fileName){
      #Get the file name and extract the key from it
      tempName <- str_split(n,"-",n = 2,simplify = TRUE)[1,2]
      key  <- str_split(tempName,"(.xlsx)|(.csv)",n=2,simplify = TRUE)[1,1]
     
      if(str_detect(n,".xlsx$")){
        #Using read_excel from the readxl package
        # We know it's the Master file
        assign(paste0("fdata",key),read_excel(paste0(filePath,"/",n)))
        
      } else{
        # We know it's the record data file for an item in the Master file
        
        assign(paste0("fdata",key),read_csv(paste0(filePath,"/",n))) 
          }
         print(paste0("fdata",key))
         }
    ## Multiple files in zip: reading 'Youtube01-Psy.csv'
    ## Parsed with column specification:
    ## cols(
    ##   COMMENT_ID = col_character(),
    ##   AUTHOR = col_character(),
    ##   DATE = col_datetime(format = ""),
    ##   CONTENT = col_character(),
    ##   CLASS = col_double()
    ## )
    ## [1] "fdata"
    ## Warning: Missing column names filled in: 'X1' [1]
    ## Parsed with column specification:
    ## cols(
    ##   X1 = col_double(),
    ##   COMMENT_ID = col_character(),
    ##   AUTHOR = col_character(),
    ##   DATE = col_datetime(format = ""),
    ##   CONTENT = col_character(),
    ##   CLASS = col_double(),
    ##   key = col_character(),
    ##   YID = col_character(),
    ##   YEAR = col_double(),
    ##   DAY = col_double(),
    ##   WDAY = col_character(),
    ##   HOUR = col_double(),
    ##   TIME = col_time(format = ""),
    ##   ConDupcount = col_double(),
    ##   AuthDupcount = col_double()
    ## )
    ## [1] "fdata"
    ## readxl works best with a newer version of the tibble package.
    ## You currently have tibble v1.4.2.
    ## Falling back to column name repair from tibble <= v1.4.2.
    ## Message displays once per session.
    ## [1] "fdataMaster"
    ## Parsed with column specification:
    ## cols(
    ##   COMMENT_ID = col_character(),
    ##   AUTHOR = col_character(),
    ##   DATE = col_datetime(format = ""),
    ##   CONTENT = col_character(),
    ##   CLASS = col_double()
    ## )
    ## [1] "fdataPsy"
    ## Parsed with column specification:
    ## cols(
    ##   COMMENT_ID = col_character(),
    ##   AUTHOR = col_character(),
    ##   DATE = col_datetime(format = ""),
    ##   CONTENT = col_character(),
    ##   CLASS = col_double()
    ## )
    ## [1] "fdataKatyPerry"
    ## Parsed with column specification:
    ## cols(
    ##   COMMENT_ID = col_character(),
    ##   AUTHOR = col_character(),
    ##   DATE = col_datetime(format = ""),
    ##   CONTENT = col_character(),
    ##   CLASS = col_double()
    ## )
    ## [1] "fdataLMFAO"
    ## Parsed with column specification:
    ## cols(
    ##   COMMENT_ID = col_character(),
    ##   AUTHOR = col_character(),
    ##   DATE = col_datetime(format = ""),
    ##   CONTENT = col_character(),
    ##   CLASS = col_double()
    ## )
    ## [1] "fdataEminem"
    ## Parsed with column specification:
    ## cols(
    ##   COMMENT_ID = col_character(),
    ##   AUTHOR = col_character(),
    ##   DATE = col_datetime(format = ""),
    ##   CONTENT = col_character(),
    ##   CLASS = col_double()
    ## )
    ## [1] "fdataShakira"
    # End of loading data 

    Show Data

    # Function to show a glimpse of the data
    showGlimpse <- function(x = NULL){
      if(!is.null(x)){
        # Look up the object by its name, then print it
        obj <- get(x)
        print(sprintf("--------------Start of %s-----------------------", x))
        print(obj)
        print(sprintf("--------------End of %s-----------------------", x))
      cat("\n")
        return(invisible(NULL))}
      
      print("Data--------------Start of fdataMaster-----------------------")
      print(head(fdataMaster))
      print("Data--------------End of fdataMaster-----------------------")
      cat("\n")
    
      print("Data--------------Start of fdataEminem-----------------------")
      glimpse(fdataEminem)
      print("Data--------------End of fdataEminem-----------------------")
      cat("\n") 
      
      print("Data--------------Start of fdataShakira-----------------------")
      glimpse(fdataShakira)
      print("Data--------------End of fdataShakira-----------------------")
      cat("\n") 
      
      print("Data--------------Start of fdataKatyPerry-----------------------")
      glimpse(fdataKatyPerry)
      print("Data--------------End of fdataKatyPerry-----------------------")
      cat("\n") 
      
      print("Data--------------Start of fdataLMFAO-----------------------")
      glimpse(fdataLMFAO)
      print("Data--------------End of fdataLMFAO-----------------------")
      cat("\n") 
    
      print("Data--------------Start of fdataPsy-----------------------")
      glimpse(fdataPsy)
      print("Data--------------End of fdataPsy-----------------------")
      cat("\n") 
    
      
      print("Data--------------Start of SpamData-----------------------")
      glimpse(SpamData)
      print("Data--------------End of SpamData-----------------------")
      cat("\n") 
    }
    showGlimpse()
    ## [1] "Data--------------Start of fdataMaster-----------------------"
    ## # A tibble: 5 x 5
    ##   Dataset   `YouTube ID`  Spam   Ham Total
    ##   <chr>     <chr>        <dbl> <dbl> <dbl>
    ## 1 Psy       9bZkp7q19f0    175   175   350
    ## 2 KatyPerry CevxZvSJLk8    175   175   350
    ## 3 LMFAO     KQ6zr6kCPj8    236   202   438
    ## 4 Eminem    uelHwf8o7_U    245   203   448
    ## 5 Shakira   pRpeEdMmmQ0    174   196   370
    ## [1] "Data--------------End of fdataMaster-----------------------"
    ## 
    ## [1] "Data--------------Start of fdataEminem-----------------------"
    ## Observations: 448
    ## Variables: 5
    ## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwnyue...
    ## $ AUTHOR     <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dakot...
    ## $ DATE       <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07, ...
    ## $ CONTENT    <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I alw...
    ## $ CLASS      <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...
    ## [1] "Data--------------End of fdataEminem-----------------------"
    ## 
    ## [1] "Data--------------Start of fdataShakira-----------------------"
    ## Observations: 370
    ## Variables: 5
    ## $ COMMENT_ID <chr> "z13lgffb5w3ddx1ul22qy1wxspy5cpkz504", "z123dbgb0mq...
    ## $ AUTHOR     <chr> "dharma pal", "Tiza Arellano", "Prìñçess Âlis Løvê ...
    ## $ DATE       <dttm> 2015-05-29 02:30:18, 2015-05-29 00:14:48, 2015-05-...
    ## $ CONTENT    <chr> "Nice song<U+FEFF>", "I love song <U+FEFF>", "I love song <U+FEFF>", "86...
    ## $ CLASS      <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
    ## [1] "Data--------------End of fdataShakira-----------------------"
    ## 
    ## [1] "Data--------------Start of fdataKatyPerry-----------------------"
    ## Observations: 350
    ## Variables: 5
    ## $ COMMENT_ID <chr> "z12pgdhovmrktzm3i23es5d5junftft3f", "z13yx345uxepe...
    ## $ AUTHOR     <chr> "lekanaVEVO1", "Pyunghee", "Erica Ross", "Aviel Hai...
    ## $ DATE       <dttm> 2014-07-22 15:27:50, 2014-07-27 01:57:16, 2014-07-...
    ## $ CONTENT    <chr> "i love this so much. AND also I Generate Free Lead...
    ## $ CLASS      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
    ## [1] "Data--------------End of fdataKatyPerry-----------------------"
    ## 
    ## [1] "Data--------------Start of fdataLMFAO-----------------------"
    ## Observations: 438
    ## Variables: 5
    ## $ COMMENT_ID <chr> "z13uwn2heqndtr5g304ccv5j5kqqzxjadmc0k", "z124jvcza...
    ## $ AUTHOR     <chr> "Corey Wilson", "Epic Gaming", "LaS Music", "Cheryl...
    ## $ DATE       <dttm> 2015-05-28 21:39:52, 2015-05-28 20:07:20, 2015-05-...
    ## $ CONTENT    <chr> "<a href=\"http://www.youtube.com/watch?v=KQ6zr6kCP...
    ## $ CLASS      <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
    ## [1] "Data--------------End of fdataLMFAO-----------------------"
    ## 
    ## [1] "Data--------------Start of fdataPsy-----------------------"
    ## Observations: 350
    ## Variables: 5
    ## $ COMMENT_ID <chr> "LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU", "LZQ...
    ## $ AUTHOR     <chr> "Julius NM", "adam riyati", "Evgeny Murashkin", "El...
    ## $ DATE       <dttm> 2013-11-07 06:20:48, 2013-11-07 12:37:15, 2013-11-...
    ## $ CONTENT    <chr> "Huh, anyway check out this you[tube] channel: koby...
    ## $ CLASS      <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
    ## [1] "Data--------------End of fdataPsy-----------------------"
    ## 
    ## [1] "Data--------------Start of SpamData-----------------------"
    ## Observations: 1
    ## Variables: 1
    ## $ NA. <lgl> NA
    ## [1] "Data--------------End of SpamData-----------------------"

    Data Wrangling

    Add Columns / Combine Data

    # Add key and name as columns to each dataset, so that we can put all the data in one file.
    # Here I load the data and then check whether it is available in the Master file.
    # If yes, add two columns at the beginning of the dataset: "key" for the
    # dataset name and "YID" for the YouTube id.
        
    fdataKatyPerry <- mutate(fdataKatyPerry,key= "KatyPerry", YID=as.character(fdataMaster[which(fdataMaster$Dataset == "KatyPerry"),][1,2]))
    
    fdataEminem <- mutate(fdataEminem,key= "Eminem", YID=as.character(fdataMaster[which(fdataMaster$Dataset == "Eminem"),][1,2]))
    
    fdataLMFAO <- mutate(fdataLMFAO,key= "LMFAO", YID=as.character(fdataMaster[which(fdataMaster$Dataset == "LMFAO"),][1,2]))
    fdataPsy <- mutate(fdataPsy,key= "Psy", YID=as.character(fdataMaster[which(fdataMaster$Dataset == "Psy"),][1,2]))
    fdataShakira <- mutate(fdataShakira,key= "Shakira", YID=as.character(fdataMaster[which(fdataMaster$Dataset == "Shakira"),][1,2]))
    
    showGlimpse()  # Check tibbles to see if they have all the new columns
    ## [1] "Data--------------Start of fdataMaster-----------------------"
    ## # A tibble: 5 x 5
    ##   Dataset   `YouTube ID`  Spam   Ham Total
    ##   <chr>     <chr>        <dbl> <dbl> <dbl>
    ## 1 Psy       9bZkp7q19f0    175   175   350
    ## 2 KatyPerry CevxZvSJLk8    175   175   350
    ## 3 LMFAO     KQ6zr6kCPj8    236   202   438
    ## 4 Eminem    uelHwf8o7_U    245   203   448
    ## 5 Shakira   pRpeEdMmmQ0    174   196   370
    ## [1] "Data--------------End of fdataMaster-----------------------"
    ## 
    ## [1] "Data--------------Start of fdataEminem-----------------------"
    ## Observations: 448
    ## Variables: 7
    ## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwnyue...
    ## $ AUTHOR     <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dakot...
    ## $ DATE       <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07, ...
    ## $ CONTENT    <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I alw...
    ## $ CLASS      <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...
    ## $ key        <chr> "Eminem", "Eminem", "Eminem", "Eminem", "Eminem", "...
    ## $ YID        <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uelHw...
    ## [1] "Data--------------End of fdataEminem-----------------------"
    ## 
    ## [1] "Data--------------Start of fdataShakira-----------------------"
    ## Observations: 370
    ## Variables: 7
    ## $ COMMENT_ID <chr> "z13lgffb5w3ddx1ul22qy1wxspy5cpkz504", "z123dbgb0mq...
    ## $ AUTHOR     <chr> "dharma pal", "Tiza Arellano", "Prìñçess Âlis Løvê ...
    ## $ DATE       <dttm> 2015-05-29 02:30:18, 2015-05-29 00:14:48, 2015-05-...
    ## $ CONTENT    <chr> "Nice song<U+FEFF>", "I love song <U+FEFF>", "I love song <U+FEFF>", "86...
    ## $ CLASS      <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
    ## $ key        <chr> "Shakira", "Shakira", "Shakira", "Shakira", "Shakir...
    ## $ YID        <chr> "pRpeEdMmmQ0", "pRpeEdMmmQ0", "pRpeEdMmmQ0", "pRpeE...
    ## [1] "Data--------------End of fdataShakira-----------------------"
    ## 
    ## [1] "Data--------------Start of fdataKatyPerry-----------------------"
    ## Observations: 350
    ## Variables: 7
    ## $ COMMENT_ID <chr> "z12pgdhovmrktzm3i23es5d5junftft3f", "z13yx345uxepe...
    ## $ AUTHOR     <chr> "lekanaVEVO1", "Pyunghee", "Erica Ross", "Aviel Hai...
    ## $ DATE       <dttm> 2014-07-22 15:27:50, 2014-07-27 01:57:16, 2014-07-...
    ## $ CONTENT    <chr> "i love this so much. AND also I Generate Free Lead...
    ## $ CLASS      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
    ## $ key        <chr> "KatyPerry", "KatyPerry", "KatyPerry", "KatyPerry",...
    ## $ YID        <chr> "CevxZvSJLk8", "CevxZvSJLk8", "CevxZvSJLk8", "CevxZ...
    ## [1] "Data--------------End of fdataKatyPerry-----------------------"
    ## 
    ## [1] "Data--------------Start of fdataLMFAO-----------------------"
    ## Observations: 438
    ## Variables: 7
    ## $ COMMENT_ID <chr> "z13uwn2heqndtr5g304ccv5j5kqqzxjadmc0k", "z124jvcza...
    ## $ AUTHOR     <chr> "Corey Wilson", "Epic Gaming", "LaS Music", "Cheryl...
    ## $ DATE       <dttm> 2015-05-28 21:39:52, 2015-05-28 20:07:20, 2015-05-...
    ## $ CONTENT    <chr> "<a href=\"http://www.youtube.com/watch?v=KQ6zr6kCP...
    ## $ CLASS      <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
    ## $ key        <chr> "LMFAO", "LMFAO", "LMFAO", "LMFAO", "LMFAO", "LMFAO...
    ## $ YID        <chr> "KQ6zr6kCPj8", "KQ6zr6kCPj8", "KQ6zr6kCPj8", "KQ6zr...
    ## [1] "Data--------------End of fdataLMFAO-----------------------"
    ## 
    ## [1] "Data--------------Start of fdataPsy-----------------------"
    ## Observations: 438
    ## Variables: 7
    ## $ COMMENT_ID <chr> "z13uwn2heqndtr5g304ccv5j5kqqzxjadmc0k", "z124jvcza...
    ## $ AUTHOR     <chr> "Corey Wilson", "Epic Gaming", "LaS Music", "Cheryl...
    ## $ DATE       <dttm> 2015-05-28 21:39:52, 2015-05-28 20:07:20, 2015-05-...
    ## $ CONTENT    <chr> "<a href=\"http://www.youtube.com/watch?v=KQ6zr6kCP...
    ## $ CLASS      <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
    ## $ key        <chr> "Psy", "Psy", "Psy", "Psy", "Psy", "Psy", "Psy", "P...
    ## $ YID        <chr> "9bZkp7q19f0", "9bZkp7q19f0", "9bZkp7q19f0", "9bZkp...
    ## [1] "Data--------------End of fdataPsy-----------------------"
    ## 
    ## [1] "Data--------------Start of SpamData-----------------------"
    ## Observations: 1
    ## Variables: 1
    ## $ NA. <lgl> NA
    ## [1] "Data--------------End of SpamData-----------------------"
    # Now we will use rbind to put the files into one dataframe "SpamData"
    SpamData <- rbind(fdataEminem,fdataKatyPerry,fdataLMFAO,fdataPsy,fdataShakira)
    head(SpamData)
    showGlimpse("SpamData")
    ## [1] "--------------Start of %s-----------------------SpamData"
    ## # A tibble: 2,044 x 7
    ##    COMMENT_ID    AUTHOR  DATE                CONTENT      CLASS key   YID  
    ##    <chr>         <chr>   <dttm>              <chr>        <dbl> <chr> <chr>
    ##  1 z12rwfnyyrbs~ Lisa W~ NA                  +4479354541~     1 Emin~ uelH~
    ##  2 z130wpnwwnyu~ jason ~ 2015-05-29 02:26:10 I always en~     0 Emin~ uelH~
    ##  3 z13vsfqirtav~ Ajkal ~ NA                  "my sister ~     1 Emin~ uelH~
    ##  4 z12wjzc4eprn~ Dakota~ 2015-05-29 02:13:07 Cool<U+FEFF>             0 Emin~ uelH~
    ##  5 z13xjfr42z3u~ Jihad ~ NA                  Hello I&#39~     1 Emin~ uelH~
    ##  6 z133yfmjdur4~ Darrio~ 2015-05-29 01:27:30 Wow this vi~     0 Emin~ uelH~
    ##  7 z12zgrw5furd~ kyeman~ NA                  Go check ou~     1 Emin~ uelH~
    ##  8 z12vxdzzds2k~ Damax   2015-05-29 00:41:22 Almost 1 bi~     0 Emin~ uelH~
    ##  9 z12gxdortqzw~ Muhamm~ NA                  Aslamu Lyku~     1 Emin~ uelH~
    ## 10 z132wd4ywmic~ JuanPa~ 2015-05-28 23:23:41 Eminem is i~     0 Emin~ uelH~
    ## # ... with 2,034 more rows
    ## [1] "--------------End of SpamData-----------------------"
    #Let's check the structure of the columns
    glimpse(SpamData)
    ## Observations: 2,044
    ## Variables: 7
    ## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwnyue...
    ## $ AUTHOR     <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dakot...
    ## $ DATE       <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07, ...
    ## $ CONTENT    <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I alw...
    ## $ CLASS      <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...
    ## $ key        <chr> "Eminem", "Eminem", "Eminem", "Eminem", "Eminem", "...
    ## $ YID        <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uelHw...

    Change Structure / Add Columns

    #Converting key to a factor
    SpamData$key = as.factor(SpamData$key)
    glimpse(SpamData)
    ## Observations: 2,044
    ## Variables: 7
    ## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwnyue...
    ## $ AUTHOR     <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dakot...
    ## $ DATE       <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07, ...
    ## $ CONTENT    <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I alw...
    ## $ CLASS      <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...
    ## $ key        <fct> Eminem, Eminem, Eminem, Eminem, Eminem, Eminem, Emi...
    ## $ YID        <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uelHw...
    # Check unique video keys; this indicates we have data for all the videos in one dataset
    unique(SpamData$key)
    ## [1] Eminem    KatyPerry LMFAO     Psy       Shakira  
    ## Levels: Eminem KatyPerry LMFAO Psy Shakira
    #Check the Dataset duration 
    unique(year(SpamData$DATE))
    ## [1]   NA 2015 2014 2013
    #Add month column (note: it is named DAY but stores the month number; downstream code relies on that name)
    SpamData$YEAR <- as.factor(year(SpamData$DATE))
    SpamData$DAY <- as.character(month(SpamData$DATE))
    SpamData$WDAY <- as.factor(weekdays(SpamData$DATE))
    
    
    #See the time of Day
    unique(hour(SpamData$DATE))
    ##  [1] NA  2  1  0 23 22 21 20 19 18 17 16 15 11  7  6  3  9  4 12 10  8  5
    ## [24] 14 13
    # Add HOUR and TIME from the DATE column
    SpamData$HOUR <- hour(SpamData$DATE)
    SpamData$TIME <- str_sub(SpamData$DATE,12,end = -1L) # the time starts at the 12th character of "YYYY-MM-DD HH:MM:SS"

    Subset

    #Let's start working with the author data now. We don't expect any duplicates, but checking
    dups_Author <- which(duplicated(SpamData$AUTHOR))
    
    # This looks like some identical comments were posted on multiple videos (not yet checking whether by the same user; see the sketch after this section's output)
    dup_comments <- which(duplicated(SpamData$CONTENT))
    
    #################################### Adding a count of identical comments
    SpamData_ccount <- count(SpamData$CONTENT)  # plyr::count on a vector returns columns x and freq
    SpamData$ConDupcount <-  mapply(function(x)(SpamData_ccount[which(SpamData_ccount$x==x),][1,2]),SpamData$CONTENT)
    
    
    #################################### Adding a count of repeated authors
    
    SpamData_tcount <- count(SpamData$AUTHOR)
    SpamData$AuthDupcount <-  mapply(function(x)(SpamData_tcount[which(SpamData_tcount$x==x),][1,2]),SpamData$AUTHOR)
    
    #################################### 
    
    #So we have a few rows with NA dates; they may not add much value to our analysis. Let's park them in another dataset.
    #Create TWO new datasets: validSPAM with only valid dates, and NASPAM where the date is NA
    
    
    suppressWarnings(rm(validSPAM))  # Remove old variable
     suppressWarnings(rm(NASPAM))     # Remove old variable
     
     # Creating new datasets with all the columns from the MAIN data
     validSPAM <- SpamData[which(!is.na(SpamData$DAY)),]
     NASPAM <- SpamData[which(is.na(SpamData$DAY)),]
     glimpse(SpamData)
    ## Observations: 2,044
    ## Variables: 14
    ## $ COMMENT_ID   <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwny...
    ## $ AUTHOR       <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dak...
    ## $ DATE         <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07...
    ## $ CONTENT      <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I a...
    ## $ CLASS        <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1...
    ## $ key          <fct> Eminem, Eminem, Eminem, Eminem, Eminem, Eminem, E...
    ## $ YID          <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uel...
    ## $ YEAR         <fct> NA, 2015, NA, 2015, NA, 2015, NA, 2015, NA, 2015,...
    ## $ DAY          <chr> NA, "5", NA, "5", NA, "5", NA, "5", NA, "5", NA, ...
    ## $ WDAY         <fct> NA, Friday, NA, Friday, NA, Friday, NA, Friday, N...
    ## $ HOUR         <int> NA, 2, NA, 2, NA, 1, NA, 0, NA, 23, NA, 23, NA, 2...
    ## $ TIME         <chr> NA, "02:26:10", NA, "02:13:07", NA, "01:27:30", N...
    ## $ ConDupcount  <int> 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3...
    ## $ AuthDupcount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
     glimpse(validSPAM)
    ## Observations: 1,799
    ## Variables: 14
    ## $ COMMENT_ID   <chr> "z130wpnwwnyuetxcn23xf5k5ynmkdpjrj04", "z12wjzc4e...
    ## $ AUTHOR       <chr> "jason graham", "Dakota Taylor", "Darrion Johnson...
    ## $ DATE         <dttm> 2015-05-29 02:26:10, 2015-05-29 02:13:07, 2015-0...
    ## $ CONTENT      <chr> "I always end up coming back to this song<br /><U+FEFF>"...
    ## $ CLASS        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
    ## $ key          <fct> Eminem, Eminem, Eminem, Eminem, Eminem, Eminem, E...
    ## $ YID          <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uel...
    ## $ YEAR         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2...
    ## $ DAY          <chr> "5", "5", "5", "5", "5", "5", "5", "5", "5", "5",...
    ## $ WDAY         <fct> Friday, Friday, Friday, Friday, Thursday, Thursda...
    ## $ HOUR         <int> 2, 2, 1, 0, 23, 23, 22, 22, 21, 21, 20, 20, 20, 1...
    ## $ TIME         <chr> "02:26:10", "02:13:07", "01:27:30", "00:41:22", "...
    ## $ ConDupcount  <int> 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
    ## $ AuthDupcount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
     glimpse(NASPAM)
    ## Observations: 245
    ## Variables: 14
    ## $ COMMENT_ID   <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z13vsfqirta...
    ## $ AUTHOR       <chr> "Lisa Wellas", "Ajkal Khan", "Jihad Naser", "kyem...
    ## $ DATE         <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
    ## $ CONTENT      <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "my ...
    ## $ CLASS        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
    ## $ key          <fct> Eminem, Eminem, Eminem, Eminem, Eminem, Eminem, E...
    ## $ YID          <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uel...
    ## $ YEAR         <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
    ## $ DAY          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
    ## $ WDAY         <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
    ## $ HOUR         <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
    ## $ TIME         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
    ## $ ConDupcount  <int> 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 36, 1, 1, 1, 36, 1,...
    ## $ AuthDupcount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...

    String Operation

    Convert/ Add Column

    # Converting 0 and 1 to the text "NO-SPAM"/"SPAM" and creating a new dataset
    SpamData_text <- mutate(SpamData,CLASS= ifelse(CLASS == 0, "NO-SPAM", "SPAM")) 
    head(SpamData_text[,c("CLASS","CONTENT")])
    # SPAM DATA By  Year and Type 
    SpamData_Content <- SpamData_text %>% select("CLASS","CONTENT","YEAR","DAY")

    Reading JSON / Detect/ gsub

    Read a JSON file from the web to perform str_detect against a collection of bad words.

    # Get the bad-word JSON file from GitHub to do our string comparison
    #install.packages("rjson")
    badwordURL <- "https://raw.githubusercontent.com/web-mech/badwords/master/lib/lang.json"
    #library(rjson)
    suppressWarnings(rm(dontSay))
    
    raw <- read_file(badwordURL)
    dontSay <- fromJSON(raw)
    class(dontSay)
    ## [1] "list"
    dontSay <- as.data.frame(dontSay$words)
    colnames(dontSay) <- c("words")
    dontSay <- as.character(dontSay$words)
    # Escape all regex special characters in the bad words:  . \ | ( ) [ ] { } ^ $ * + ? #
    dontSay <- gsub("([.\\\\|()\\[\\]{}^$*+?#])", "\\\\\\1", dontSay)
    
    #str_detect needs a single pattern, so collapse the words into one regex with '|' via paste
     checkword <- paste(dontSay, collapse = '|')
     dontSay_data <- SpamData_Content[which(str_detect(SpamData_Content$CONTENT,pattern =regex(checkword,ignore_case = TRUE)) ),]
     
     dontSay_data$CLASS <- as.factor(dontSay_data$CLASS)
    summary(dontSay_data)
    ##      CLASS       CONTENT            YEAR        DAY           
    ##  NO-SPAM: 72   Length:190         2013:25   Length:190        
    ##  SPAM   :118   Class :character   2014:51   Class :character  
    ##                Mode  :character   2015:78   Mode  :character  
    ##                                   NA's:36
    ## Messages marked as SPAM that contain bad words.
     glimpse(dontSay_data[which(dontSay_data$CLASS=="SPAM"),])
    ## Observations: 118
    ## Variables: 4
    ## $ CLASS   <fct> SPAM, SPAM, SPAM, SPAM, SPAM, SPAM, SPAM, SPAM, SPAM, ...
    ## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "my siste...
    ## $ YEAR    <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
    ## $ DAY     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
     ## Adding a 'bad' column to the new dataset to show whether the comment contained a bad word
     SpamData_text<- mutate(SpamData_text, bad = str_detect(SpamData_text$CONTENT,pattern =checkword ) )
     
     
     # summary(SpamData_text)
     # plyr::count(SpamData_text ,vars=c("CLASS","bad")) %>% 
     #  plyr:: rename( c("CLASS"="TYPE","bad"="Bad words used","freq"="Number of Comments"))

    Write

    Write "SpamData" to the local directory.

    # write  "SpamData" it to local directory 
    write.csv(SpamData,file = "project/SpamData.csv")

    Question 1. Data Exploration

    Data Exploration: This should include summary statistics, means, medians, quartiles, or any other relevant information about the data set. Please include some conclusions in the R Markdown text.

    Objective

    Doing string operations and creating new datasets.
    Calculating the mean, median, and other summary information.
    

    summary

    statistics

    glimpse(SpamData)
    ## Observations: 2,044
    ## Variables: 14
    ## $ COMMENT_ID   <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwny...
    ## $ AUTHOR       <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dak...
    ## $ DATE         <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07...
    ## $ CONTENT      <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I a...
    ## $ CLASS        <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1...
    ## $ key          <fct> Eminem, Eminem, Eminem, Eminem, Eminem, Eminem, E...
    ## $ YID          <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uel...
    ## $ YEAR         <fct> NA, 2015, NA, 2015, NA, 2015, NA, 2015, NA, 2015,...
    ## $ DAY          <chr> NA, "5", NA, "5", NA, "5", NA, "5", NA, "5", NA, ...
    ## $ WDAY         <fct> NA, Friday, NA, Friday, NA, Friday, NA, Friday, N...
    ## $ HOUR         <int> NA, 2, NA, 2, NA, 1, NA, 0, NA, 23, NA, 23, NA, 2...
    ## $ TIME         <chr> NA, "02:26:10", NA, "02:13:07", NA, "01:27:30", N...
    ## $ ConDupcount  <int> 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3...
    ## $ AuthDupcount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
    head(fdataMaster)
    summary(SpamData)
    ##   COMMENT_ID           AUTHOR               DATE                    
    ##  Length:2044        Length:2044        Min.   :2013-07-12 22:33:27  
    ##  Class :character   Class :character   1st Qu.:2014-09-25 21:39:20  
    ##  Mode  :character   Mode  :character   Median :2015-03-29 07:12:48  
    ##                                        Mean   :2014-12-25 18:08:03  
    ##                                        3rd Qu.:2015-05-22 16:27:17  
    ##                                        Max.   :2015-06-05 20:01:23  
    ##                                        NA's   :245                  
    ##    CONTENT              CLASS               key          YID           
    ##  Length:2044        Min.   :0.0000   Eminem   :448   Length:2044       
    ##  Class :character   1st Qu.:0.0000   KatyPerry:350   Class :character  
    ##  Mode  :character   Median :1.0000   LMFAO    :438   Mode  :character  
    ##                     Mean   :0.5215   Psy      :438                     
    ##                     3rd Qu.:1.0000   Shakira  :370                     
    ##                     Max.   :1.0000                                     
    ##                                                                        
    ##    YEAR          DAY                   WDAY          HOUR      
    ##  2013: 200   Length:2044        Tuesday  :309   Min.   : 0.00  
    ##  2014: 517   Class :character   Saturday :290   1st Qu.: 6.00  
    ##  2015:1082   Mode  :character   Thursday :272   Median :14.00  
    ##  NA's: 245                      Wednesday:269   Mean   :12.33  
    ##                                 Sunday   :242   3rd Qu.:18.00  
    ##                                 (Other)  :417   Max.   :23.00  
    ##                                 NA's     :245   NA's   :245    
    ##      TIME            ConDupcount      AuthDupcount 
    ##  Length:2044        Min.   :  1.00   Min.   :1.00  
    ##  Class :character   1st Qu.:  1.00   1st Qu.:1.00  
    ##  Mode  :character   Median :  1.00   Median :2.00  
    ##                     Mean   : 16.33   Mean   :1.77  
    ##                     3rd Qu.:  2.00   3rd Qu.:2.00  
    ##                     Max.   :171.00   Max.   :8.00  
    ## 
    ddply(SpamData, "key",summarise, duration = max(as.numeric(YEAR)) - min(as.numeric(YEAR)))
    # count the total number of SAPM by key column
    # The output of ddply is just the give column as we have user Summarise fucnation, if we use funcation  tranform
    # SpamDataT <- ddply(SpamData, "key", transform, Spam = sum( CLASS ) )  it would result full data set for use to user 
    
    # Data of video key and SPAM count
    # Since CLASS is 1 for SPAM and 0 for NO-SPAM, the sum of CLASS gives the SPAM count
    # Rename the columns by piping %>% the data to rename from the plyr package
    ddply(SpamData, "key", summarise, Spam = sum( CLASS ) ) %>%
    rename( c("key"="Video Name","Spam"= " SPAM Count")) 
    # Spam over days of week
    ddply(SpamData, "WDAY", summarise, Spam = sum( CLASS ) ) %>%
    rename( c("WDAY"="Day of week","Spam"= " SPAM Count")) 

    Calculate %

    # Count of each value of "key" in the spam data
    SpamData_keyCount <- count(SpamData, vars = "key")
    
    # Calculate % of SPAM by video
    
    SpamData_percent <- ddply(SpamData, "key", summarise, Spam = sum( CLASS ) ) %>%
    mutate(  SPAM_PERCENT =  as.numeric(Spam)* 100/SpamData_keyCount$freq[which(SpamData_keyCount$key== key )] , 
             TOTAL_COMMENTS = SpamData_keyCount$freq[which(SpamData_keyCount$key== key )]) 
    
    SpamData_percent[order(SpamData_percent$TOTAL_COMMENTS,decreasing = TRUE),]

    means

    # Mean of the repeated-author count and of the hour of day
    
    sapply( list( "Repeat Author" = SpamData$AuthDupcount,"Hour of Day" = SpamData$HOUR ), FUN =mean, na.rm=TRUE) 
    ## Repeat Author   Hour of Day 
    ##      1.770059     12.327404
    # Mean of SPAM % and of total comments across videos
    sapply(list("Spam % Mean" = SpamData_percent$SPAM_PERCENT, "| Total Comment Mean"=SpamData_percent$TOTAL_COMMENTS),FUN = mean)
    ##          Spam % Mean | Total Comment Mean 
    ##             51.89542            408.80000
    # Mean of Hours and Day of the Month
     sapply( list( "Hour_Mean" = SpamData$HOUR,"Day_Mean" = day(SpamData$DATE) ), mean, na.rm=TRUE) 
    ## Hour_Mean  Day_Mean 
    ##  12.32740  17.98221

    Aggregate

    # Aggregate data by video key and weekday to see how many SPAMs were reported on each day
    aggregate(list("SPAM_COUNT" = SpamData$CLASS ),by = list("ID" = SpamData$key, "Day_Of_Week" = SpamData$WDAY ) , FUN = sum )
    library(dplyr)  
    
    # Total comments by Week Days
    SpamData_total_spam_weekday <- plyr::count(SpamData, vars = c("key","WDAY"))
    SpamData_total_spam_weekday
    # Count of comments by year
    SpamData_total_spam_year <- plyr:: count(SpamData, vars = c("key","YEAR"))
    summary(SpamData_total_spam_year)
    ##         key      YEAR        freq      
    ##  Eminem   :2   2013:1   Min.   : 15.0  
    ##  KatyPerry:2   2014:3   1st Qu.:110.8  
    ##  LMFAO    :2   2015:5   Median :201.5  
    ##  Psy      :2   NA's:1   Mean   :204.4  
    ##  Shakira  :2            3rd Qu.:312.5  
    ##                         Max.   :347.0
    #It looks like we have 0 for Eminem on every day of the week. Let's check it with some sample SPAM data from the main dataset
    SpamData[which(SpamData$CLASS == 1 & SpamData$key == "Eminem"),c("key", "WDAY","CLASS","DATE")]
    # We see NA for WDAY, which is right since DATE is NA. To validate further, let's check the non-SPAM data for Eminem
    SpamData[which(SpamData$CLASS == 0 & SpamData$key == "Eminem"),c("key", "WDAY","CLASS","DATE")]

    medians

    # Median of Hours and Day of the Month
    sapply( list( "Hour_Median" = SpamData$HOUR,"Day_Median" = day(SpamData$DATE) ), median, na.rm=TRUE)
    ## Hour_Median  Day_Median 
    ##          14          20

    quartiles

    qunat_SPAM_byDay <- quantile(aggregate(list("SPAM_COUNT" = SpamData$CLASS), by= list("Day" = SpamData$WDAY),sum)$SPAM_COUNT)
    
    sprintf("The first, second and third quartiles of the 'SPAM Count' by weekday are %s, %s and %s spams respectively.",qunat_SPAM_byDay[2],qunat_SPAM_byDay[3],qunat_SPAM_byDay[4])
    ## [1] "The first, second and third quartiles of the 'SPAM Count' by weekday are 103, 116 and 124 spams respectively."
    # Using colored formatting with crayon
    
    
    cat("The first, second and third " %+% blue$underline$bold ('quartiles') %+% " of the " %+% red("SPAM Count") %+%  " by weekday are ", blue(qunat_SPAM_byDay[2]),",",blue(qunat_SPAM_byDay[3]), " and " , blue(qunat_SPAM_byDay[4]) , " spams respectively.")
    ## The first, second and third quartiles of the SPAM Count by weekday are  103 , 116  and  124  spams respectively.

    conclusions

    1. We have 245 observations recorded with no date, and hence no day or month is available for them.
    2. The SPAM % is directly proportional to the total comments, as shown in the SpamData_percent table above.
    3. In 2014 and 2015 we see an increasing trend: comments are increasing and so is SPAM, per summary(SpamData_total_spam_year)
    ##         key      YEAR        freq      
    ##  Eminem   :2   2013:1   Min.   : 15.0  
    ##  KatyPerry:2   2014:3   1st Qu.:110.8  
    ##  LMFAO    :2   2015:5   Median :201.5  
    ##  Psy      :2   NA's:1   Mean   :204.4  
    ##  Shakira  :2            3rd Qu.:312.5  
    ##                         Max.   :347.0
    4. The mean data indicates that SPAM is more than 50% of total comments:

       Spam % Mean   Total Comment Mean
          51.89542            408.80000

    5. SPAM data by weekday: the first, second and third quartiles of the SPAM count by weekday are 103, 116 and 124 spams respectively (qunat_SPAM_byDay):

       0%  25%  50%  75%  100%
       95  103  116  124  156

    Question 3. Graphics

    Graphics: Please make sure to display at least one scatter plot, box plot and histogram. Don’t be limited to this. Please explore the many other options in R packages such as ggplot2.

    Objective

    Creating different types of datasets and graphs that can support our final data presentation.

    Graphs

    BAR Graph

    library(plyr)
    head(SpamData_total_spam_year)
    # Comments Over Year on the Videos
    ggplot(SpamData_total_spam_year,mapping=aes(x=key,y=freq,fill= YEAR)) +
      geom_col()+
      ggtitle("Comments Over Year on the Videos")

    ############################ Data by key, YEAR, and CLASS, using spread() to move rows to columns
    SpamData_kcy <- SpamData_text%>%select(key,CLASS,YEAR)
    SpamData_kcy_f<- plyr::count(SpamData_kcy ,vars =c("key","YEAR","CLASS")) %>%
    spread(CLASS, freq) 
      
    ## Graph of SPAM only, per video per year. It's very clear that 2014 and 2015 recorded an increase in comments and hence in SPAM
    ggplot(SpamData_kcy_f,mapping=aes(x=key, y=SPAM, fill = YEAR)) +
    geom_col(na.rm = FALSE) +facet_wrap(~YEAR) +
      labs(title="SPAM over Year on Videos",
            x ="Video ", y = "SPAM Count")+
      ggtitle(" Impact of SPAM on Number of Videos by year") +
      theme(plot.title = element_text(color="gray", size=14, face="bold" ))
    ## Warning: Removed 1 rows containing missing values (position_stack).

    # Impact of SPAM on Number of Videos by year 
    
    ggplot(SpamData_total_spam_year,mapping=aes(x=YEAR,fill= freq)) +
      geom_bar() +
      theme(plot.background = element_rect(color = "orange")) +
      theme(panel.background = element_blank()) +
      theme(panel.grid.major = element_line(color="blue")) +
     theme(panel.grid.minor = element_blank())+
      theme(panel.grid.major.x = element_blank())+
      ylab( "No Of Videos Impacted") +
      xlab( "Year" ) +
      ggtitle(" Impact of SPAM on Number of Videos by year") +
      theme(plot.title = element_text(color="red", size=14, face="bold.italic" ))

    box plot

    ## Using a box plot to show that the volume was very high in 2014 and 2015
    ggplot(SpamData_kcy_f,mapping=aes(x=YEAR, y= SPAM)) +
    geom_boxplot()
    ## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

    # We can clearly see that spamming is not a regular process; it happens in chunks, with very few exceptions.
    ggplot(SpamData_text,mapping=aes(x=key, y=DATE,color = CLASS)) +
     geom_boxplot(na.rm = TRUE)+
      ylab( "Date in Year") +
      xlab( "Video Key" ) +
    theme(panel.background = element_blank(),
            legend.key = element_blank()) +  #Gray color behind the actual legend 
    theme(panel.grid.major = element_line(color="blue")) +
     theme(panel.grid.minor = element_blank())+
      theme(panel.grid.major.y = element_blank())+
      scale_color_discrete(name="Comment Type")+
    ggtitle(" Type of comments on Videos by year") +
    theme(plot.title = element_text(color="red", size=12, face="bold" ))

    scatter plot

    #SPAM and NO-SPAM by YEAR (summarise both counts per year, then compute the percentage)
    SpamData_byYear <- ddply(SpamData, "YEAR", summarise, Spam = sum( CLASS ), TOTAL_COMMENTS = length(CLASS)) %>%
    mutate( SPAM_PERCENT = Spam * 100 / TOTAL_COMMENTS )
    
    
    # Get data for one video over the period of the dataset
    SpamData_text[order(SpamData_text$DAY),] %>%
    subset(key== "LMFAO") %>%
      ggplot(mapping=aes(x=DAY, y=HOUR,color= CLASS)) +
      geom_point()+
      xlab("Month")

    # Run the same check on the full dataset now
    # It's very clear here that during the 1st half of 2015 spam was higher than the regular comments in those months.
    SpamData_text[order(SpamData_text$DAY),] %>%
      ggplot(mapping=aes(x=DATE, y=HOUR,color= CLASS)) +
      geom_point(na.rm = TRUE,alpha= 1/3)  +
      geom_smooth(na.rm = TRUE,span = 0.1) +
      labs(title =" Type of comments on Videos by Date and \n Hour of the day ", x= "DATE" , y ="Hour") +
      theme(plot.title = element_text(color="skyblue", size=12, face="bold" ))
    ## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

    # From the above chart it's clear how spamming has been progressing and fighting with good content. It's also strange to note that more than 50% of the data is SPAM.

    histogram

    # store the mean over year 
    myear<- mean(summary(as.factor(replace_na(as.character(SpamData_text$YEAR),replace = "9999"))))
    
    mYear2 <- mean(summary(as.factor(replace_na(as.character(SpamData_text$YEAR),replace = "9999")))[c("2014","2013","9999")])
    
     ################################## Histogram showing the same trend of more SPAM, increasing toward the mean of all comments over the months
        ggplot(SpamData_text,mapping = aes(x=DATE, fill = CLASS)) +
        # boundary = 0 starts the bins at the 0 coordinate
        # bins = 10 creates 10 groups of data (by default R would pick 30 bins)
         geom_histogram(na.rm= TRUE,boundary = 0,  
                        bins = 10
                        )  +
          geom_hline(linetype = 5,  
                     color="blue",
                     yintercept = myear) +
          
          # Adding Annotation 
          annotate("text",label = sprintf("mean of comments over year %s",myear), x= as.POSIXct("2014-01-25 18:08:03"), y= 535 ) +
          geom_hline(linetype = 2,  
                     color="red",
                     yintercept = mYear2) +
        
        annotate("text",label = sprintf("mean of comments till 2014, %.2f",mYear2), x= as.POSIXct("2014-01-25 18:08:03"), y= mYear2+20 )+
           labs(title =" Type of comments on Videos by  \n Year ", x= "DATE" , y ="Comments Count") +
      theme(plot.title = element_text(color="Red", size=12, face="bold" ))

    library(plyr)
    summary(SpamData_text[SpamData_text$YEAR == "2015",]$YEAR)
    ## 2013 2014 2015 NA's 
    ##    0    0 1082  245
     #................................Data points indicate that people commented more during NIGHT times, but it is a little slower during the early hours of the working day.
    
    # The mean time of commenting is 12:33 during the day.
    
        xm <- as.numeric(summary(SpamData_text$HOUR)[4])
        plyr::count(SpamData_text,c("HOUR","CLASS")) %>% 
        ggplot(mapping = aes(x=HOUR,y= freq, fill=CLASS)) +
        geom_col(na.rm = TRUE) +
          ## # 0 = blank, 1 = solid, 2 = dashed, 3 = dotted, 4 = dotdash, 5 = longdash, 6 = twodash
          geom_vline(na.rm = TRUE,linetype = 5,  
                     color="blue",
                     xintercept =as.numeric(summary(SpamData_text$HOUR)[4])) +
          theme(panel.background = element_blank(),
            legend.key = element_blank()) +
          # Adding Annotation 
          annotate("text",label = sprintf("mean of Time %.2f",as.numeric(summary(SpamData_text$HOUR)[4])), x= as.numeric(summary(SpamData_text$HOUR)[4]), y= 100 ) +
        labs(title =" Type of Comments made during by Hours of day ", x= "Hour of Day" , y ="Comments Count") +
      theme(plot.title = element_text(color="Red", size=12, face="bold" ))
    ## Warning: Removed 1 rows containing missing values (position_stack).

    GEOM SMOOTH

    xm <- as.numeric(summary(SpamData_text$HOUR)[4])
        plyr::count(SpamData_text,c("HOUR","CLASS")) %>% 
        ggplot(mapping = aes(x=HOUR,y= freq, color=CLASS)) +
          geom_point(na.rm = TRUE) +
        geom_smooth(na.rm = TRUE,span = 0.8)
    ## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

        #Indicates that more comments are created from evening to midnight.

    Question 4. Conclusions

    Analysis

    Our analysis based on the data is as follows:

    Both good comments and spam comments are increasing over the years (point 1)

    People spend more time commenting from evening to midnight, but less during the morning (point 2)

    Spammers target different videos at different times of the month; the same videos are not spammed every day or month. (point 2)

    Bad words are present in both good and spam comments, though more often in spam, and are not very numerous overall. (point 3)

    Point 1

    1. How spamming is spread across the videos and its comment rate.

    As we can see, spam has been present since data collection began and has increased over time. We can clearly see that SPAM volume is very close to that of good content for some videos in 2014 and for almost all videos in 2015.

    ## Warning: Removed 1 rows containing missing values (position_stack).
    ## Warning: Removed 1 rows containing missing values (geom_point).

    point 2

    SPAM/comment patterns over year, month, and hour of the day

    SPAM / Year

    We can see that as we move forward in time, the SPAM box also grows. It is worth noting that even good comments increased during 2015.

    Who is getting SPAM?

    We can clearly see that spamming is not a regular process for a video; it happens in certain months, with very few exceptions.

    In contrast, across multiple videos, the data points for 2015 indicate that most videos were spammed that year; almost every month some videos were spammed.

    When did people comment?

    The data points indicate that people commented more during NIGHT times, but it is a little slower during the early hours of the working day. > Fewer comments during the morning? We can say yes.

    ## Warning: Removed 1 rows containing missing values (position_stack).

    When SPAM won

    *It's very clear here that during the 1st half of 2014 and of 2015 spam was higher than the regular comments in those months.

    • From the chart below it's clear how spamming has been progressing and fighting with good content. It's also strange to note that more than 50% of the data is SPAM.

    The bulk of 2015 is visible again.

    SpamData_text[order(SpamData_text$DAY),] %>%
      ggplot(mapping=aes(x=DATE, y=HOUR,color= CLASS)) +
      geom_point(na.rm = TRUE,alpha= 1/3)  +
      geom_smooth(na.rm = TRUE,span = 0.1) +
      labs(title =" Type of comments on Videos by Date and \n Hour of the day ", x= "DATE" , y ="Hour") +
      theme(plot.title = element_text(color="skyblue", size=12, face="bold" ))
    ## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

    # From the above chart it's clear how spamming has been progressing and fighting with good content. It's also strange to note that more than 50% of the data is SPAM.

    Point 3

    The table below makes it clear that bad words appear more often in SPAM, but strangely even non-spam comments contain bad words.

      plyr::count(SpamData_text ,vars=c("CLASS","bad")) %>% 
      plyr:: rename( c("CLASS"="TYPE","bad"="Bad words used","freq"="Number of Comments"))

    Question 5.

    BONUS - place the original .csv in a github file and have R read from the link.

    The SpamData master file created by this project is uploaded to GitHub; below, R reads the raw CSV directly from the GitHub link.

    gitRawFile <- "https://raw.githubusercontent.com/Rajwantmishra/msds/master/SpamData.csv"
    
    # base read.csv via a url() connection
    read.csv.url <- read.csv( url(gitRawFile))
    head(read.csv.url)
    # base read.csv reading the URL directly
    read.csv.Data <- read.csv(gitRawFile,header=T)
    head(read.csv.Data )
    #library(data.table)
    dataTableCSV <- fread(gitRawFile)
    head(dataTableCSV)
    library(tidyverse)
    
    tidyDataCSV <- read_csv(gitRawFile)
    ## Warning: Missing column names filled in: 'X1' [1]
    ## Parsed with column specification:
    ## cols(
    ##   X1 = col_double(),
    ##   COMMENT_ID = col_character(),
    ##   AUTHOR = col_character(),
    ##   DATE = col_datetime(format = ""),
    ##   CONTENT = col_character(),
    ##   CLASS = col_double(),
    ##   key = col_character(),
    ##   YID = col_character()
    ## )
    str(tidyDataCSV)
    ## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 2044 obs. of  8 variables:
    ##  $ X1        : num  1 2 3 4 5 6 7 8 9 10 ...
    ##  $ COMMENT_ID: chr  "z12rwfnyyrbsefonb232i5ehdxzkjzjs2" "z130wpnwwnyuetxcn23xf5k5ynmkdpjrj04" "z13vsfqirtavjvu0t22ezrgzyorwxhpf3" "z12wjzc4eprnvja4304cgbbizuved35wxcs" ...
    ##  $ AUTHOR    : chr  "Lisa Wellas" "jason graham" "Ajkal Khan" "Dakota Taylor" ...
    ##  $ DATE      : POSIXct, format: NA "2015-05-29 02:26:10" ...
    ##  $ CONTENT   : chr  "+447935454150 lovely girl talk to me xxx<U+FEFF>" "I always end up coming back to this song<br /><U+FEFF>" "my sister just received over 6,500 new <a rel=\"nofollow\" class=\"ot-hashtag\" href=\"https://plus.google.com/"| __truncated__ "Cool<U+FEFF>" ...
    ##  $ CLASS     : num  1 0 1 0 1 0 1 0 1 0 ...
    ##  $ key       : chr  "Eminem" "Eminem" "Eminem" "Eminem" ...
    ##  $ YID       : chr  "uelHwf8o7_U" "uelHwf8o7_U" "uelHwf8o7_U" "uelHwf8o7_U" ...
    ##  - attr(*, "spec")=
    ##   .. cols(
    ##   ..   X1 = col_double(),
    ##   ..   COMMENT_ID = col_character(),
    ##   ..   AUTHOR = col_character(),
    ##   ..   DATE = col_datetime(format = ""),
    ##   ..   CONTENT = col_character(),
    ##   ..   CLASS = col_double(),
    ##   ..   key = col_character(),
    ##   ..   YID = col_character()
    ##   .. )
    head(tidyDataCSV)